Add new tests and fix construction of query graph from paths

Loïc Lecrenier 2023-04-05 14:10:01 +02:00
parent 6e50f23896
commit b5691802a3
5 changed files with 73 additions and 60 deletions

View File

@@ -176,6 +176,9 @@ impl<T> Interner<T> {
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
pub fn freeze(self) -> FixedSizeInterner<T> {
FixedSizeInterner { stable_store: self.stable_store }
}
}
/// A store of values of type `T`, each linked to a value of type `From`
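
For context, here is a minimal standalone sketch of the push-then-freeze pattern this hunk introduces. The types below are simplified stand-ins, not milli's actual `Interner`/`FixedSizeInterner`, which carry more machinery:

```rust
// A minimal sketch of the push-then-freeze pattern, with simplified
// stand-in types (the real milli interner carries more machinery).
#[derive(Clone, Copy)]
struct Interned(u16); // raw handle into the store

struct Interner<T> {
    stable_store: Vec<T>,
}

impl<T> Interner<T> {
    fn new() -> Self {
        Interner { stable_store: Vec::new() }
    }
    // Append a value and hand back a stable index to it.
    fn push(&mut self, value: T) -> Interned {
        self.stable_store.push(value);
        Interned((self.stable_store.len() - 1) as u16)
    }
    // Once everything has been pushed, freeze the store: the length is now
    // final, so fixed-size structures (e.g. bitmaps) can be sized from it.
    fn freeze(self) -> FixedSizeInterner<T> {
        FixedSizeInterner { stable_store: self.stable_store }
    }
}

struct FixedSizeInterner<T> {
    stable_store: Vec<T>,
}

impl<T> FixedSizeInterner<T> {
    fn get(&self, id: Interned) -> &T {
        &self.stable_store[id.0 as usize]
    }
    fn len(&self) -> u16 {
        self.stable_store.len() as u16
    }
}

fn main() {
    let mut interner = Interner::new();
    let start = interner.push("Start");
    let end = interner.push("End");
    let frozen = interner.freeze();
    assert_eq!(frozen.len(), 2);
    assert_eq!(*frozen.get(start), "Start");
    assert_eq!(*frozen.get(end), "End");
}
```

Freezing only once every node has been pushed means the final node count is known, which is what lets `build_from_paths` size its `SmallBitmap`s from `nodes_data.len()` in the hunks below.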

View File

@@ -4,7 +4,7 @@ use super::query_term::{
};
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::search::new::interner::DedupInterner;
use crate::search::new::interner::Interner;
use crate::Result;
use std::cmp::Ordering;
use std::collections::BTreeMap;
@@ -345,29 +345,13 @@ impl QueryGraph {
Build a query graph from a list of paths
The paths are composed of source and dest terms.
If the source term is `None`, then the last dest term is used
as the predecessor of the dest term. If the source is `Some(_)`,
then an edge is built between the last dest term and the source,
and between the source and the new dest term.
Note that the resulting graph will not correspond to a perfect
representation of the set of paths.
For example, consider the following paths:
```txt
PATH 1 : a -> b1 -> c1 -> d -> e1
PATH 2 : a -> b2 -> c2 -> d -> e2
```
Then the resulting graph will be:
```txt
      b1 ── c1          e1
    ╱          ╲       ╱
  a              d
    ╲          ╱       ╲
      b2 ── c2          e2
```
which is different from the fully correct representation:
```txt
b1 c1 d e1
@@ -383,21 +367,51 @@ impl QueryGraph {
pub fn build_from_paths(
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
) -> Self {
let mut node_data = DedupInterner::default();
let root_node = node_data.insert(QueryNodeData::Start);
let end_node = node_data.insert(QueryNodeData::End);
let mut node_data = Interner::default();
let root_node = node_data.push(QueryNodeData::Start);
let end_node = node_data.push(QueryNodeData::End);
let mut paths_with_single_terms = vec![];
for path in paths {
let mut processed_path = vec![];
let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
for (start_term, dest_term) in path {
if let Some(prev_dest_term) = prev_dest_term.take() {
if let Some(mut start_term) = start_term {
if start_term.term_ids == prev_dest_term.term_ids {
start_term.term_subset.intersect(&prev_dest_term.term_subset);
processed_path.push(start_term);
} else {
processed_path.push(prev_dest_term);
processed_path.push(start_term);
}
} else {
processed_path.push(prev_dest_term);
}
} else if let Some(start_term) = start_term {
processed_path.push(start_term);
}
prev_dest_term = Some(dest_term);
}
if let Some(prev_dest_term) = prev_dest_term {
processed_path.push(prev_dest_term);
}
paths_with_single_terms.push(processed_path);
}
// TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes
let mut paths_with_ids = vec![];
for path in paths {
for path in paths_with_single_terms {
let mut path_with_ids = vec![];
for node in path {
let (start_term, end_term) = node;
let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x)));
let dest_node_id = node_data.insert(QueryNodeData::Term(end_term));
path_with_ids.push((src_node_id, dest_node_id));
for term in path {
let id = node_data.push(QueryNodeData::Term(term));
path_with_ids.push(Interned::from_raw(id.into_raw()));
}
paths_with_ids.push(path_with_ids);
}
let nodes_data = node_data.freeze();
let nodes_data_len = nodes_data.len();
let mut nodes = nodes_data.map_move(|n| QueryNode {
@@ -406,31 +420,22 @@
successors: SmallBitmap::new(nodes_data_len),
});
let root_node = Interned::from_raw(root_node.into_raw());
let end_node = Interned::from_raw(end_node.into_raw());
let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
for path in paths_with_ids {
let mut prev_node = root_node;
for node in path {
let (start_term, dest_term) = node;
let end_term = Interned::from_raw(dest_term.into_raw());
let src = if let Some(start_term) = start_term {
// TODO: this is incorrect! should take the intersection
// between the prev node and the start term if they refer to the same
// original query term!
let start_term = Interned::from_raw(start_term.into_raw());
nodes.get_mut(prev_node).successors.insert(start_term);
nodes.get_mut(start_term).predecessors.insert(prev_node);
start_term
} else {
prev_node
};
nodes.get_mut(src).successors.insert(end_term);
nodes.get_mut(end_term).predecessors.insert(src);
prev_node = end_term;
let mut prev_node_id = root_node;
for node_id in path {
let prev_node = nodes.get_mut(prev_node_id);
prev_node.successors.insert(node_id);
let node = nodes.get_mut(node_id);
node.predecessors.insert(prev_node_id);
prev_node_id = node_id;
}
nodes.get_mut(prev_node).successors.insert(end_node);
nodes.get_mut(end_node).predecessors.insert(prev_node);
let prev_node = nodes.get_mut(prev_node_id);
prev_node.successors.insert(end_node);
let node = nodes.get_mut(end_node);
node.predecessors.insert(prev_node_id);
}
QueryGraph { root_node, end_node, nodes }
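
To illustrate the fix above: each path of `(source, dest)` pairs is first collapsed into a flat list of terms, and when a source refers to the same original query term as the previous dest, their subsets are intersected instead of producing two separate nodes. Below is a minimal sketch of that merging step, using simplified stand-in types (a plain term id plus a set of allowed word derivations) rather than the real `LocatedQueryTermSubset`:

```rust
use std::collections::HashSet;

// Simplified stand-in for `LocatedQueryTermSubset`: an id identifying the
// original query term plus the set of word derivations still allowed for it.
#[derive(Clone, Debug)]
struct TermSubset {
    term_id: usize,
    allowed: HashSet<&'static str>,
}

// Collapse a path of `(source, dest)` pairs into a flat list of terms.
// When a source refers to the same original term as the previous dest,
// keep a single term whose subset is the intersection of both, mirroring
// the loop in `build_from_paths` above.
fn flatten_path(path: Vec<(Option<TermSubset>, TermSubset)>) -> Vec<TermSubset> {
    let mut flat = Vec::new();
    let mut prev_dest: Option<TermSubset> = None;
    for (source, dest) in path {
        if let Some(prev) = prev_dest.take() {
            if let Some(mut source) = source {
                if source.term_id == prev.term_id {
                    // Same original term: keep only derivations allowed by both.
                    let merged: HashSet<&'static str> =
                        source.allowed.intersection(&prev.allowed).copied().collect();
                    source.allowed = merged;
                    flat.push(source);
                } else {
                    flat.push(prev);
                    flat.push(source);
                }
            } else {
                flat.push(prev);
            }
        } else if let Some(source) = source {
            flat.push(source);
        }
        prev_dest = Some(dest);
    }
    if let Some(prev) = prev_dest {
        flat.push(prev);
    }
    flat
}

fn main() {
    let term = |id, words: &[&'static str]| TermSubset {
        term_id: id,
        allowed: words.iter().copied().collect(),
    };
    // The first edge ends on term 1 allowing {"holiday", "holidty"}; the next
    // edge starts on term 1 allowing only {"holiday"}. The flattened path
    // keeps a single node for term 1 with the intersection {"holiday"}.
    let path = vec![
        (Some(term(0, &["summer"])), term(1, &["holiday", "holidty"])),
        (Some(term(1, &["holiday"])), term(2, &["beach"])),
    ];
    let flat = flatten_path(path);
    assert_eq!(flat.len(), 3);
    assert_eq!(flat[1].allowed, HashSet::from(["holiday"]));
}
```

Only after this flattening are the terms interned as nodes and chained together through their `predecessors`/`successors` bitmaps, as in the second loop of the hunk above.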

View File

@@ -3,6 +3,8 @@ This module tests the interactions between the proximity and typo ranking rules.
The proximity ranking rule should transform the query graph such that it
only contains the word pairs that it used to compute its bucket.
TODO: This is not currently implemented.
*/
use crate::{
@@ -61,8 +63,13 @@ fn test_trap_basic() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("summer holiday");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: this is incorrect, 1 should come before 0
insta::assert_debug_snapshot!(texts, @r###"
[
"\"summer. holiday. sommer holidty\"",
"\"summer. holiday. sommer holiday\"",
]
"###);
}

View File

@@ -271,13 +271,13 @@ fn test_sort() {
"false",
"true",
"true",
"__does_not_exist___",
"__does_not_exist__",
"null",
"[null,null,\"\"]",
"\"\"",
"{\"sub\":0}",
"__does_not_exist___",
"__does_not_exist___",
"__does_not_exist__",
"__does_not_exist__",
]
"###);
@@ -304,13 +304,13 @@ fn test_sort() {
"false",
"\"1\"",
"\"0\"",
"__does_not_exist___",
"__does_not_exist__",
"null",
"[null,null,\"\"]",
"\"\"",
"{\"sub\":0}",
"__does_not_exist___",
"__does_not_exist___",
"__does_not_exist__",
"__does_not_exist__",
]
"###);
}

View File

@@ -59,8 +59,7 @@ fn create_index() -> TempIndex {
"id": 3,
"text": "beautiful sommer"
},
// The next two documents lay out an even more complex trap, which the current implementation
// fails to handle properly.
// The next two documents lay out an even more complex trap.
// With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
// - id 4: delicitous + sweet + dessert
// - id 5: beautiful + sweet + desgert
@@ -114,13 +113,12 @@ fn test_trap_complex2() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("delicious sweet dessert");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: this is incorrect. 5 should appear before 4
insta::assert_debug_snapshot!(texts, @r###"
[
"\"delicitous. sweet. dessert. delicitous sweet desgert\"",
"\"delicious. sweet desgert. delicious sweet desgert\"",
"\"delicitous. sweet. dessert. delicitous sweet desgert\"",
]
"###);
}