Enhance word splitting strategy
commit 7f9680f0a0
parent 358aa337ea
milli/src/search/query_tree.rs | 79 lines changed (mode: normal file → executable file)
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::cmp::max;
-use std::{cmp, fmt, mem};
+use std::{fmt, mem};
 
 use charabia::classifier::ClassifiedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
@@ -10,7 +10,7 @@ use slice_group_by::GroupBy;
 
 use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
 use crate::search::TermsMatchingStrategy;
-use crate::{Index, MatchingWords, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -146,6 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -156,6 +157,12 @@ trait Context {
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
 }
 
 /// The query tree builder is the interface to build a query tree.
@@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
+    }
+
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
@@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
         self.exact_words.as_ref()
     }
+
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        let key = (left_word, right_word, proximity);
+        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -274,12 +290,10 @@ fn split_best_frequency<'a>(
     for (i, _) in chars {
         let (left, right) = word.split_at(i);
 
-        let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
-        let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
+        let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);
 
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
+        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
+            best = Some((pair_freq, left, right));
         }
     }
 
@@ -824,6 +838,11 @@ mod test {
            Ok(self.postings.get(word).cloned())
        }
 
+        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
+            Ok(bitmap.cloned())
+        }
+
        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
            Ok(self.synonyms.get(&words).cloned())
@@ -881,19 +900,22 @@ mod test {
                    ],
                },
                postings: hashmap! {
                    String::from("hello") => random_postings(rng, 1500),
                    String::from("hi") => random_postings(rng, 4000),
                    String::from("word") => random_postings(rng, 2500),
                    String::from("split") => random_postings(rng, 400),
                    String::from("ngrams") => random_postings(rng, 1400),
                    String::from("world") => random_postings(rng, 15_000),
                    String::from("earth") => random_postings(rng, 8000),
                    String::from("2021") => random_postings(rng, 100),
                    String::from("2020") => random_postings(rng, 500),
                    String::from("is") => random_postings(rng, 50_000),
                    String::from("this") => random_postings(rng, 50_000),
                    String::from("good") => random_postings(rng, 1250),
                    String::from("morning") => random_postings(rng, 125),
+                   String::from("word split") => random_postings(rng, 5000),
+                   String::from("quick brownfox") => random_postings(rng, 7000),
+                   String::from("quickbrown fox") => random_postings(rng, 8000),
                },
                exact_words,
            }
@@ -1041,6 +1063,23 @@ mod test {
        "###);
    }
 
+    #[test]
+    fn word_split_choose_pair_with_max_freq() {
+        let query = "quickbrownfox";
+        let tokens = query.tokenize();
+
+        let (query_tree, _) = TestContext::default()
+            .build(TermsMatchingStrategy::All, true, None, tokens)
+            .unwrap()
+            .unwrap();
+
+        insta::assert_debug_snapshot!(query_tree, @r###"
+        OR
+          PHRASE ["quickbrown", "fox"]
+          PrefixTolerant { word: "quickbrownfox", max typo: 2 }
+        "###);
+    }
+
    #[test]
    fn phrase() {
        let query = "\"hey friends\" \" \" \"wooop";
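The gist of the change: split_best_frequency no longer ranks a candidate split by the minimum of the two halves' individual document counts; it now ranks it by how often the two halves occur together as an adjacent pair (proximity 1), via the new Context::word_pair_frequency. Below is a minimal, self-contained sketch of that selection logic, not the milli code itself: an in-memory HashMap<String, u64> and a hypothetical pair_frequency helper stand in for the index-backed word_pair_frequency (which counts documents in word_pair_proximity_docids), and the example frequencies mirror the "quick brownfox" / "quickbrown fox" postings added to the test context above.

use std::collections::HashMap;

/// Hypothetical stand-in for `Context::word_pair_frequency`: how many documents
/// contain `left` immediately followed by `right` (proximity 1).
fn pair_frequency(pairs: &HashMap<String, u64>, left: &str, right: &str) -> u64 {
    pairs.get(&format!("{} {}", left, right)).copied().unwrap_or(0)
}

/// Sketch of the new strategy: try every split point of `word` and keep the
/// split whose two halves co-occur as an adjacent pair in the most documents.
fn split_best_frequency<'a>(
    pairs: &HashMap<String, u64>,
    word: &'a str,
) -> Option<(&'a str, &'a str)> {
    // Byte offsets of char boundaries, skipping 0 so both halves are non-empty.
    let chars = word.char_indices().skip(1);
    let mut best = None;

    for (i, _) in chars {
        let (left, right) = word.split_at(i);
        let pair_freq = pair_frequency(pairs, left, right);

        // Keep only splits that actually occur, preferring the most frequent pair.
        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
            best = Some((pair_freq, left, right));
        }
    }

    best.map(|(_, left, right)| (left, right))
}

fn main() {
    // Frequencies taken from the test postings in the diff above.
    let pairs = HashMap::from([
        ("quick brownfox".to_string(), 7_000),
        ("quickbrown fox".to_string(), 8_000),
    ]);

    // The more frequent pair wins, so the split lands between "quickbrown" and "fox",
    // matching the PHRASE ["quickbrown", "fox"] snapshot in the new test.
    assert_eq!(
        split_best_frequency(&pairs, "quickbrownfox"),
        Some(("quickbrown", "fox"))
    );
    println!("ok");
}

The old behaviour could prefer a split whose halves are individually common but never appear side by side; ranking by pair frequency only keeps splits that exist as real phrases in the index.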