mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
First probably working phrase query doc filtering
This commit is contained in:
parent
22b19c0d93
commit
9c03bb3428
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -952,6 +952,7 @@ dependencies = [
|
|||||||
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
@ -17,6 +17,7 @@ env_logger = "0.7.0"
|
|||||||
fst = { version = "0.3.5", default-features = false }
|
fst = { version = "0.3.5", default-features = false }
|
||||||
hashbrown = { version = "0.6.0", features = ["serde"] }
|
hashbrown = { version = "0.6.0", features = ["serde"] }
|
||||||
heed = "0.6.1"
|
heed = "0.6.1"
|
||||||
|
itertools = "0.8.2" # kill me please
|
||||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||||
log = "0.4.8"
|
log = "0.4.8"
|
||||||
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
|
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
|
||||||
|
@ -59,11 +59,9 @@ pub fn bucket_sort<'c>(
|
|||||||
let before_raw_documents_building = Instant::now();
|
let before_raw_documents_building = Instant::now();
|
||||||
let mut raw_documents = Vec::new();
|
let mut raw_documents = Vec::new();
|
||||||
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
raw_documents.push(RawDocument {
|
if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
|
||||||
raw_matches,
|
raw_documents.push(raw_document);
|
||||||
processed_matches: Vec::new(),
|
}
|
||||||
processed_distances: Vec::new(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
debug!("creating {} candidates documents took {:.02?}",
|
debug!("creating {} candidates documents took {:.02?}",
|
||||||
raw_documents.len(),
|
raw_documents.len(),
|
||||||
@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> {
|
|||||||
pub processed_distances: Vec<Option<u8>>,
|
pub processed_distances: Vec<Option<u8>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
||||||
|
fn new<'txn>(
|
||||||
|
raw_matches: &'a mut [BareMatch<'tag>],
|
||||||
|
automatons: &[QueryWordAutomaton],
|
||||||
|
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
|
) -> Option<RawDocument<'a, 'tag>>
|
||||||
|
{
|
||||||
|
raw_matches.sort_unstable_by_key(|m| m.query_index);
|
||||||
|
|
||||||
|
// debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches);
|
||||||
|
|
||||||
|
let mut previous_word = None;
|
||||||
|
for i in 0..raw_matches.len() {
|
||||||
|
let a = &raw_matches[i];
|
||||||
|
let auta = &automatons[a.query_index as usize];
|
||||||
|
|
||||||
|
match auta.phrase_query {
|
||||||
|
Some((0, _)) => {
|
||||||
|
previous_word = Some(a.query_index);
|
||||||
|
let b = raw_matches.get(i + 1)?;
|
||||||
|
if a.query_index + 1 != b.query_index {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let pla = &postings_lists[a.postings_list];
|
||||||
|
let plb = &postings_lists[b.postings_list];
|
||||||
|
|
||||||
|
let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
|
||||||
|
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
||||||
|
});
|
||||||
|
|
||||||
|
if !iter.any(|eb| eb.is_both()) { return None }
|
||||||
|
},
|
||||||
|
Some((1, _)) => {
|
||||||
|
if previous_word.take() != Some(a.query_index - 1) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Some((_, _)) => unreachable!(),
|
||||||
|
None => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(RawDocument {
|
||||||
|
raw_matches,
|
||||||
|
processed_matches: Vec::new(),
|
||||||
|
processed_distances: Vec::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct BareMatch<'tag> {
|
pub struct BareMatch<'tag> {
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub query_index: u16,
|
pub query_index: u16,
|
||||||
@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> {
|
|||||||
len: usize,
|
len: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for PostingsListView<'_> {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
f.debug_struct("PostingsListView")
|
||||||
|
.field("input", &std::str::from_utf8(&self.input).unwrap())
|
||||||
|
.field("postings_list", &self.as_ref())
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'txn> PostingsListView<'txn> {
|
impl<'txn> PostingsListView<'txn> {
|
||||||
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
|
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
|
||||||
let len = postings_list.len();
|
let len = postings_list.len();
|
||||||
@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>(
|
|||||||
let input = Rc::from(input);
|
let input = Rc::from(input);
|
||||||
let postings_list = Rc::new(postings_list);
|
let postings_list = Rc::new(postings_list);
|
||||||
let postings_list_view = PostingsListView::new(input, postings_list);
|
let postings_list_view = PostingsListView::new(input, postings_list);
|
||||||
|
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
||||||
|
|
||||||
@ -442,7 +501,7 @@ fn construct_automatons2(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if false && n == 1 {
|
if true && n == 1 {
|
||||||
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
|
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
|
||||||
let mut left_automaton = QueryWordAutomaton::exact(left);
|
let mut left_automaton = QueryWordAutomaton::exact(left);
|
||||||
left_automaton.phrase_query = Some((0, 2));
|
left_automaton.phrase_query = Some((0, 2));
|
||||||
|
@ -43,16 +43,42 @@ pub trait Criterion {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn prepare_query_distances(
|
fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||||
documents: &mut [RawDocument],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_enhancer: &QueryEnhancer,
|
||||||
automatons: &[QueryWordAutomaton],
|
automatons: &[QueryWordAutomaton],
|
||||||
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
||||||
) {
|
) {
|
||||||
for document in documents {
|
for document in documents {
|
||||||
if !document.processed_distances.is_empty() { continue }
|
if !document.processed_distances.is_empty() { continue }
|
||||||
|
|
||||||
|
// debug!("{:?}", document.raw_matches[0].document_id);
|
||||||
|
|
||||||
let mut processed = Vec::new();
|
let mut processed = Vec::new();
|
||||||
for m in document.raw_matches.iter() {
|
let mut raw_matches = document.raw_matches.iter().peekable();
|
||||||
|
while let Some(m) = raw_matches.next() {
|
||||||
|
|
||||||
|
// let automaton = &automatons[m.query_index as usize];
|
||||||
|
|
||||||
|
// debug!("{:?} {:?}", m, automaton);
|
||||||
|
// debug!("{:?}", &postings_lists[m.postings_list]);
|
||||||
|
|
||||||
|
// match automaton.phrase_query {
|
||||||
|
// Some((0, len)) => {
|
||||||
|
// match raw_matches.peek() {
|
||||||
|
// Some(BareMatch { query_index, .. }) => {
|
||||||
|
// if *query_index != m.query_index + 1 {
|
||||||
|
// raw_matches.next();
|
||||||
|
// continue
|
||||||
|
// }
|
||||||
|
// },
|
||||||
|
// None => continue,
|
||||||
|
// }
|
||||||
|
// },
|
||||||
|
// Some((_, _)) => continue,
|
||||||
|
// None => (),
|
||||||
|
// }
|
||||||
|
|
||||||
// FIXME we really need to take splitted words into account
|
// FIXME we really need to take splitted words into account
|
||||||
// those must be seen at the same level as the non-splitteds
|
// those must be seen at the same level as the non-splitteds
|
||||||
// if automatons[m.query_index as usize].phrase_query.is_some() {
|
// if automatons[m.query_index as usize].phrase_query.is_some() {
|
||||||
@ -73,6 +99,8 @@ fn prepare_query_distances(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// debug!("{:?}", processed);
|
||||||
|
|
||||||
document.processed_distances = processed;
|
document.processed_distances = processed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -82,14 +110,14 @@ pub struct Typo;
|
|||||||
impl Criterion for Typo {
|
impl Criterion for Typo {
|
||||||
fn name(&self) -> &str { "typo" }
|
fn name(&self) -> &str { "typo" }
|
||||||
|
|
||||||
fn prepare(
|
fn prepare<'a, 'tag, 'txn>(
|
||||||
&self,
|
&self,
|
||||||
documents: &mut [RawDocument],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
postings_lists: &mut PostingsListsArena,
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
||||||
query_enhancer: &QueryEnhancer,
|
query_enhancer: &QueryEnhancer,
|
||||||
automatons: &[QueryWordAutomaton],
|
automatons: &[QueryWordAutomaton],
|
||||||
) {
|
) {
|
||||||
prepare_query_distances(documents, query_enhancer, automatons);
|
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn evaluate(
|
fn evaluate(
|
||||||
@ -140,14 +168,14 @@ pub struct Words;
|
|||||||
impl Criterion for Words {
|
impl Criterion for Words {
|
||||||
fn name(&self) -> &str { "words" }
|
fn name(&self) -> &str { "words" }
|
||||||
|
|
||||||
fn prepare(
|
fn prepare<'a, 'tag, 'txn>(
|
||||||
&self,
|
&self,
|
||||||
documents: &mut [RawDocument],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
postings_lists: &mut PostingsListsArena,
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
||||||
query_enhancer: &QueryEnhancer,
|
query_enhancer: &QueryEnhancer,
|
||||||
automatons: &[QueryWordAutomaton],
|
automatons: &[QueryWordAutomaton],
|
||||||
) {
|
) {
|
||||||
prepare_query_distances(documents, query_enhancer, automatons);
|
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn evaluate(
|
fn evaluate(
|
||||||
|
Loading…
Reference in New Issue
Block a user