601: Introduce snapshot tests r=Kerollmops a=loiclec

# Pull Request
## What does this PR do?
Introduce snapshot tests into milli, by using the `insta` crate. This implements the idea described in #597.

See: [insta.rs](https://insta.rs)

## Design
There is now a new file, `snapshot_tests.rs`, which is compiled only under `#[cfg(test)]`. It exposes the `db_snap!` macro, which is used to snapshot the content of a database.

When running `cargo test`, `insta` will check that the value of the current snapshot is the same as the previous one (on the file system). If they are the same, the test passes. If they are different, the test fails and you are asked to review the new snapshot to approve or reject it.

We don't want to save very large snapshots to the file system, because it will pollute the git repository and increase its size too much. Instead, we only save their `md5` hashes under the name `<snapshot_name>.hash.snap`. There is a new environment variable called `MILLI_TEST_FULL_SNAPS` which can be set to `true` in order to *also* save the full content of the snapshot under the name `<snapshot_name>.full.snap`. However, snapshots with the extension `.full.snap` are never saved to the git repository.

## Example
```rust
// In e.g. facets.rs
#[test]
fn my_test() {
    // create an index
    let index = TempIndex::new();
    index.add_documents(...);
    index.update_settings(|settings| ...);
    
    // then snapshot the content of one of its databases
    // the snapshot will be saved in the current folder under facets.rs/my_test/facet_id_string_docids.snap
    db_snap!(index, facet_id_string_docids);

    index.add_documents(...);   

    // we can also name the snapshot to ensure there is no conflict
    // this snapshot will be saved at facets.rs/my_test/updated/facet_id_string_docids.snap
    db_snap!(index, facet_id_string_docids, "updated");
    
    // and we can also use "inline" snapshots, which insert their content in the given string literal
    db_snap!(index, field_distributions, @"");
    // once the snapshot is approved, it will automatically get transformed to, e.g.:
    // db_snap!(index, field_distributions, @"
    // my_facet        21
    // other_field     3
    // ");
    
    // now let's add **many** documents
    index.add_documents(...);
    
    // because the snapshot is too big, its hash is saved instead
    // if the MILLI_TEST_FULL_SNAPS env variable is set to true, then the full snapshot will also be saved
    // at facets.rs/my_test/large/facet_id_string_docids.full.snap
    db_snap!(index, facet_id_string_docids, "large", @"5348bbc46b5384455b6a900666d2a502");
}
```

Co-authored-by: Loïc Lecrenier <loic@meilisearch.com>
This commit is contained in:
bors[bot] 2022-08-16 11:57:09 +00:00 committed by GitHub
commit 293a246af8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 1234 additions and 661 deletions

7
.gitignore vendored
View File

@ -6,3 +6,10 @@
*.csv
*.mmdb
*.svg
# Snapshots
## ... large
*.full.snap
# ... unreviewed
*.snap.new

View File

@ -51,7 +51,9 @@ csv = "1.1.6"
[dev-dependencies]
big_s = "1.0.2"
insta = "1.18.1"
maplit = "1.0.2"
md5 = "0.7.0"
rand = "0.8.5"
[features]

View File

@ -99,7 +99,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
)]
InvalidDocumentId { document_id: Value },
#[error("Invalid facet distribution, the fields `{}` are not set as filterable.",
.invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ")
.invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)]
InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
#[error(transparent)]
@ -111,7 +111,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
match .valid_fields.is_empty() {
true => "This index does not have configured sortable attributes.".to_string(),
false => format!("Available sortable attributes are: `{}`.",
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ")
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
),
}
)]

View File

@ -1183,13 +1183,12 @@ pub(crate) mod tests {
use big_s::S;
use heed::{EnvOpenOptions, RwTxn};
use maplit::btreemap;
use tempfile::TempDir;
use crate::documents::DocumentsBatchReader;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use crate::Index;
use crate::{db_snap, Index};
pub(crate) struct TempIndex {
pub inner: Index,
@ -1288,17 +1287,30 @@ pub(crate) mod tests {
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let field_distribution = index.field_distribution(&rtxn).unwrap();
assert_eq!(
field_distribution,
btreemap! {
"id".to_string() => 2,
"name".to_string() => 2,
"age".to_string() => 1,
}
db_snap!(index, field_distribution, 1);
db_snap!(index, word_docids,
@r###"
1 [0, ]
2 [1, ]
20 [1, ]
bob [1, ]
kevin [0, ]
"###
);
db_snap!(index, field_distribution);
db_snap!(index, field_distribution,
@"
age 1
id 2
name 2
"
);
// snapshot_index!(&index, "1", include: "^field_distribution$");
// we add all the documents a second time. we are supposed to get the same
// field_distribution in the end
index
@ -1309,16 +1321,12 @@ pub(crate) mod tests {
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let field_distribution = index.field_distribution(&rtxn).unwrap();
assert_eq!(
field_distribution,
btreemap! {
"id".to_string() => 2,
"name".to_string() => 2,
"age".to_string() => 1,
}
db_snap!(index, field_distribution,
@r###"
age 1
id 2
name 2
"###
);
// then we update a document by removing one field and another by adding one field
@ -1329,16 +1337,12 @@ pub(crate) mod tests {
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let field_distribution = index.field_distribution(&rtxn).unwrap();
assert_eq!(
field_distribution,
btreemap! {
"id".to_string() => 2,
"name".to_string() => 2,
"has_dog".to_string() => 1,
}
db_snap!(index, field_distribution,
@r###"
has_dog 1
id 2
name 2
"###
);
}

View File

@ -13,6 +13,10 @@ pub mod proximity;
mod search;
pub mod update;
#[cfg(test)]
#[macro_use]
pub mod snapshot_tests;
use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault;

View File

@ -632,25 +632,59 @@ mod tests {
]),
],
);
let expected = vec![
vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }],
],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
],
];
let result = flatten_query_tree(&query_tree);
assert_eq!(expected, result);
insta::assert_debug_snapshot!(result, @r###"
[
[
[
Exact {
word: "manythefish",
},
],
],
[
[
Exact {
word: "manythe",
},
],
[
Exact {
word: "fish",
},
],
],
[
[
Exact {
word: "many",
},
],
[
Exact {
word: "thefish",
},
],
],
[
[
Exact {
word: "many",
},
],
[
Exact {
word: "the",
},
],
[
Exact {
word: "fish",
},
],
],
]
"###);
}
}

View File

@ -349,22 +349,33 @@ mod test {
use super::super::test::TestContext;
use super::*;
/// Renders every result produced by `criteria` into a single string.
///
/// `criteria.next` is called repeatedly with `parameters` until it yields
/// `None`; each yielded value is written with its `Debug` representation
/// followed by a blank line. The outcome of every `next` call is unwrapped,
/// so a failed call panics.
fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String {
    let mut rendered = String::new();
    loop {
        match criteria.next(&mut parameters).unwrap() {
            Some(criterion) => rendered.push_str(format!("{criterion:?}\n\n").as_str()),
            // Nothing left to yield: hand back everything accumulated so far.
            None => break rendered,
        }
    }
}
#[test]
fn initial_placeholder_no_facets() {
let context = TestContext::default();
let query_tree = None;
let facet_candidates = None;
let mut criterion_parameters = CriterionParameters {
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(query_tree, facet_candidates);
let mut criteria = Typo::new(&context, Box::new(parent));
let criteria = Typo::new(&context, Box::new(parent));
assert!(criteria.next(&mut criterion_parameters).unwrap().unwrap().candidates.is_none());
assert!(criteria.next(&mut criterion_parameters).unwrap().is_none());
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None }
"###);
}
#[test]
@ -390,78 +401,32 @@ mod test {
let facet_candidates = None;
let mut criterion_parameters = CriterionParameters {
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(Some(query_tree), facet_candidates);
let mut criteria = Typo::new(&context, Box::new(parent));
let criteria = Typo::new(&context, Box::new(parent));
let candidates_1 = context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(candidates_1.clone()),
bucket_candidates: Some(candidates_1),
filtered_candidates: None,
};
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) }
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
OR
Exact { word: "word" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) }
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap())
- context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(candidates_2.clone()),
bucket_candidates: Some(candidates_2),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));
"###);
}
#[test]
@ -470,25 +435,18 @@ mod test {
let query_tree = None;
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut criterion_parameters = CriterionParameters {
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(query_tree, Some(facet_candidates.clone()));
let mut criteria = Typo::new(&context, Box::new(parent));
let criteria = Typo::new(&context, Box::new(parent));
let expected = CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates: None,
filtered_candidates: Some(facet_candidates.clone()),
};
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), bucket_candidates: None }
// first iteration, returns the facet candidates
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected));
// second iteration, returns None because there is no more things to do
assert!(criteria.next(&mut criterion_parameters).unwrap().is_none());
"###);
}
#[test]
@ -514,77 +472,31 @@ mod test {
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut criterion_parameters = CriterionParameters {
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone()));
let mut criteria = Typo::new(&context, Box::new(parent));
let criteria = Typo::new(&context, Box::new(parent));
let candidates_1 = context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: Some(&candidates_1 & &facet_candidates),
filtered_candidates: None,
};
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) }
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
OR
Exact { word: "word" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) }
let candidates_2 = (context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap())
- context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: Some(&candidates_2 & &facet_candidates),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));
"###);
}
}

View File

@ -45,7 +45,7 @@ impl<'a> Display for FilterError<'a> {
attribute,
)
} else {
let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(" ");
let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" ");
write!(
f,

View File

@ -573,15 +573,18 @@ mod tests {
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text);
// no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
insta::assert_snapshot!(
matcher.format(format_options),
@"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text);
// no crop should return complete text with highlighted matches.
assert_eq!(
&matcher.format(format_options),
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
insta::assert_snapshot!(
matcher.format(format_options),
@"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
);
}
@ -602,19 +605,28 @@ mod tests {
let text = "Ŵôřlḑôle";
let mut matcher = builder.build(text);
// no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle");
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>ôle"
);
// Text containing unicode match.
let text = "Ŵôřlḑ";
let mut matcher = builder.build(text);
// no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>");
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>"
);
// Text containing unicode match.
let text = "Westfália";
let mut matcher = builder.build(text);
// no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a");
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Westfáli</em>a"
);
}
#[test]
@ -628,83 +640,89 @@ mod tests {
// empty text.
let text = "";
let mut matcher = builder.build(text);
assert_eq!(&matcher.format(format_options), "");
insta::assert_snapshot!(
matcher.format(format_options),
@""
);
// text containing only separators.
let text = ":-)";
let mut matcher = builder.build(text);
assert_eq!(&matcher.format(format_options), ":-)");
insta::assert_snapshot!(
matcher.format(format_options),
@":-)"
);
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text);
// no highlight should return 10 first words with a marker at the end.
assert_eq!(
&matcher.format(format_options),
"A quick brown fox can not jump 32 feet, right…"
insta::assert_snapshot!(
matcher.format(format_options),
@"A quick brown fox can not jump 32 feet, right…"
);
// Text without any match starting by a separator.
let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
let mut matcher = builder.build(text);
// no highlight should return 10 first words with a marker at the end.
assert_eq!(
&matcher.format(format_options),
"(A quick brown fox can not jump 32 feet, right…"
insta::assert_snapshot!(
matcher.format(format_options),
@"(A quick brown fox can not jump 32 feet, right…"
);
// Test phrase propagation
let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
let mut matcher = builder.build(text);
// should crop the phrase instead of cropping around the match.
assert_eq!(
&matcher.format(format_options),
"… Split The World is a book written by Emily Henry…",
insta::assert_snapshot!(
matcher.format(format_options),
@"… Split The World is a book written by Emily Henry…"
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text);
// no highlight should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…future to build a world with the boy she loves…"
insta::assert_snapshot!(
matcher.format(format_options),
@"…future to build a world with the boy she loves…"
);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text);
// no highlight should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…she loves. Emily Henry: The Love That Split The World."
insta::assert_snapshot!(
matcher.format(format_options),
@"…she loves. Emily Henry: The Love That Split The World."
);
// Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…void void void void void split the world void void"
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
// Text containing matches with different density.
let text = "split void the void void world void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…void void void void void split the world void void"
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
// Text containing matches with same word.
let text = "split split split split split split void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…void void void void void split the world void void"
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
}
@ -719,44 +737,53 @@ mod tests {
// empty text.
let text = "";
let mut matcher = builder.build(text);
assert_eq!(&matcher.format(format_options), "");
insta::assert_snapshot!(
matcher.format(format_options),
@""
);
// text containing only separators.
let text = ":-)";
let mut matcher = builder.build(text);
assert_eq!(&matcher.format(format_options), ":-)");
insta::assert_snapshot!(
matcher.format(format_options),
@":-)"
);
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text);
// both should return 10 first words with a marker at the end.
assert_eq!(
&matcher.format(format_options),
"A quick brown fox can not jump 32 feet, right…"
insta::assert_snapshot!(
matcher.format(format_options),
@"A quick brown fox can not jump 32 feet, right…"
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text);
// both should return 10 last words with a marker at the start and highlighted matches.
assert_eq!(
&matcher.format(format_options),
"…future to build a <em>world</em> with <em>the</em> boy she loves…"
insta::assert_snapshot!(
matcher.format(format_options),
@"…future to build a <em>world</em> with <em>the</em> boy she loves…"
);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text);
// both should return 10 last words with a marker at the start and highlighted matches.
assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
insta::assert_snapshot!(
matcher.format(format_options),
@"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
);
// Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.
assert_eq!(
&matcher.format(format_options),
"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
);
}
@ -773,19 +800,28 @@ mod tests {
let format_options = FormatOptions { highlight: false, crop: Some(2) };
let mut matcher = builder.build(text);
// because crop size < query size, partially format matches.
assert_eq!(&matcher.format(format_options), "…split the…");
insta::assert_snapshot!(
matcher.format(format_options),
@"…split the…"
);
// set a smaller crop size
let format_options = FormatOptions { highlight: false, crop: Some(1) };
let mut matcher = builder.build(text);
// because crop size < query size, partially format matches.
assert_eq!(&matcher.format(format_options), "…split…");
insta::assert_snapshot!(
matcher.format(format_options),
@"…split…"
);
// set crop size to 0
let format_options = FormatOptions { highlight: false, crop: Some(0) };
let mut matcher = builder.build(text);
// because crop size is 0, crop is ignored.
assert_eq!(&matcher.format(format_options), "void void split the world void void.");
insta::assert_snapshot!(
matcher.format(format_options),
@"void void split the world void void."
);
}
#[test]
@ -820,11 +856,9 @@ mod tests {
let text = "the do or die can't be he do and or isn't he";
let mut matcher = builder.build(text);
assert_eq!(
&matcher.format(format_options),
"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_",
"matches: {:?}",
&matcher.matches
insta::assert_snapshot!(
matcher.format(format_options),
@"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
);
}
}

View File

@ -99,11 +99,6 @@ impl QueryKind {
QueryKind::Exact { original_typo: 0, word }
}
/// Test-only constructor: builds the `Exact` variant with an explicit
/// `original_typo` value for the given `word`.
#[cfg(test)]
pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
    Self::Exact { original_typo, word }
}
pub fn tolerant(typo: u8, word: String) -> Self {
QueryKind::Tolerant { typo, word }
}
@ -857,30 +852,16 @@ mod test {
let query = "hey friends";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "hey" }
PrefixTolerant { word: "friends", max typo: 1 }
PrefixTolerant { word: "heyfriends", max typo: 1 }
"###);
}
#[test]
@ -888,30 +869,16 @@ mod test {
let query = "hey friends ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "hey" }
Tolerant { word: "friends", max typo: 1 }
Tolerant { word: "heyfriends", max typo: 1 }
"###);
}
#[test]
@ -919,62 +886,24 @@ mod test {
let query = "hello world ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hi".to_string()),
}),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("morning".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "hello".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("earth".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nature".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
],
),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "helloworld".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
OR
Exact { word: "hi" }
AND
Exact { word: "good" }
Exact { word: "morning" }
Tolerant { word: "hello", max typo: 1 }
OR
Exact { word: "earth" }
Exact { word: "nature" }
Tolerant { word: "world", max typo: 1 }
Tolerant { word: "helloworld", max typo: 1 }
"###);
}
#[test]
@ -982,97 +911,34 @@ mod test {
let query = "new york city ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "yorkcity".to_string()),
}),
],
),
]),
Operation::And(vec![
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "newyork".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "newyorkcity".to_string()),
}),
],
),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "new" }
OR
AND
Exact { word: "york" }
Exact { word: "city" }
Tolerant { word: "yorkcity", max typo: 1 }
AND
OR
Exact { word: "nyc" }
AND
Exact { word: "new" }
Exact { word: "york" }
Exact { word: "city" }
Tolerant { word: "newyork", max typo: 1 }
Exact { word: "city" }
OR
Exact { word: "nyc" }
AND
Exact { word: "new" }
Exact { word: "york" }
Tolerant { word: "newyorkcity", max typo: 1 }
"###);
}
#[test]
@ -1080,30 +946,16 @@ mod test {
let query = "n grams ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("n".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "grams".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "ngrams".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "n" }
Tolerant { word: "grams", max typo: 1 }
Tolerant { word: "ngrams", max typo: 1 }
"###);
}
#[test]
@ -1111,36 +963,18 @@ mod test {
let query = "wordsplit fish ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Or(
false,
vec![
Operation::Phrase(vec!["word".to_string(), "split".to_string()]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "wordsplit".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("fish".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "wordsplitfish".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
OR
PHRASE ["word", "split"]
Tolerant { word: "wordsplit", max typo: 2 }
Exact { word: "fish" }
Tolerant { word: "wordsplitfish", max typo: 1 }
"###);
}
#[test]
@ -1148,15 +982,14 @@ mod test {
let query = "\"hey friends\" \" \" \"wooop";
let tokens = query.tokenize();
let expected = Operation::And(vec![
Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "friends"]
Exact { word: "wooop" }
"###);
}
#[test]
@ -1164,15 +997,14 @@ mod test {
let query = "\"hey friends. wooop wooop\"";
let tokens = query.tokenize();
let expected = Operation::And(vec![
Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]),
]);
let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "friends"]
PHRASE ["wooop", "wooop"]
"###);
}
#[test]
@ -1180,82 +1012,30 @@ mod test {
let query = "hey my friend ";
let tokens = query.tokenize();
let expected = Operation::Or(
true,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "myfriend".to_string()),
}),
],
),
]),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "heymyfriend".to_string()),
}),
],
),
],
);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR(WORD)
Exact { word: "hey" }
OR
AND
Exact { word: "hey" }
Exact { word: "my" }
Tolerant { word: "heymy", max typo: 1 }
OR
AND
Exact { word: "hey" }
OR
AND
Exact { word: "my" }
Tolerant { word: "friend", max typo: 1 }
Tolerant { word: "myfriend", max typo: 1 }
AND
Tolerant { word: "heymy", max typo: 1 }
Tolerant { word: "friend", max typo: 1 }
Tolerant { word: "heymyfriend", max typo: 1 }
"###);
}
#[test]
@ -1263,11 +1043,12 @@ mod test {
let query = "\"hey my\"";
let tokens = query.tokenize();
let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
PHRASE ["hey", "my"]
"###);
}
#[test]
@ -1275,68 +1056,27 @@ mod test {
let query = r#""hey" my good "friend""#;
let tokens = query.tokenize();
let expected = Operation::Or(
true,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "mygood".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]),
],
);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR(WORD)
AND
Exact { word: "hey" }
Exact { word: "friend" }
AND
Exact { word: "hey" }
Exact { word: "my" }
Exact { word: "friend" }
AND
Exact { word: "hey" }
OR
AND
Exact { word: "my" }
Exact { word: "good" }
Tolerant { word: "mygood", max typo: 1 }
Exact { word: "friend" }
"###);
}
#[test]
@ -1344,29 +1084,16 @@ mod test {
let query = "hey friends ";
let tokens = query.tokenize();
let expected = Operation::Or(
false,
vec![
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friends".to_string()),
}),
]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("heyfriends".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, false, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "hey" }
Exact { word: "friends" }
Exact { word: "heyfriends" }
"###);
}
#[test]
@ -1374,15 +1101,14 @@ mod test {
let query = "\"hey my\" good friend";
let tokens = query.tokenize();
let expected = Operation::And(vec![
Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]);
let (query_tree, _) =
TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();
assert_eq!(expected, query_tree);
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "my"]
Exact { word: "good" }
"###);
}
#[test]

527
milli/src/snapshot_tests.rs Normal file
View File

@ -0,0 +1,527 @@
use std::borrow::Cow;
use std::fmt::Write;
use std::path::Path;
use heed::types::ByteSlice;
use heed::BytesDecode;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::{
FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
FacetStringZeroBoundsValueCodec,
};
use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index};
/// Builds the `insta` settings used by the database snapshot macros.
///
/// The snapshot path is `snapshots/<caller file name>/<test name>`, with an
/// extra `<name>` component appended when `name` is `Some`. The caller file is
/// taken from the `#[track_caller]` location and the test name is the last
/// `::`-separated segment of the current thread's name.
///
/// # Panics
///
/// Panics if the caller's file name is missing or not valid UTF-8, or if the
/// current thread has no name.
#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings {
    // File in which the caller lives, e.g. `facets.rs`.
    let caller_file = Path::new(std::panic::Location::caller().file());
    let filename = caller_file.file_name().unwrap().to_str().unwrap();

    // Keep the thread handle alive so the test name can be borrowed from it.
    let current_thread = std::thread::current();
    let test_name = current_thread.name().unwrap().rsplit("::").next().unwrap();

    let mut snapshot_dir = Path::new("snapshots").join(filename).join(test_name);
    if let Some(sub_dir) = name {
        snapshot_dir.push(sub_dir);
    }

    let mut settings = insta::Settings::clone_current();
    settings.set_prepend_module_to_snapshot(false);
    settings.set_omit_expression(true);
    settings.set_snapshot_path(snapshot_dir);
    settings
}
/**
Create a snapshot test of the given database.

## Arguments
1. The identifier for the `Index`
2. The content of the index to snapshot. Available options are:
- `settings`
- `word_docids`
- `exact_word_docids`
- `word_prefix_docids`
- `exact_word_prefix_docids`
- `docid_word_positions`
- `word_pair_proximity_docids`
- `word_prefix_pair_proximity_docids`
- `word_position_docids`
- `field_id_word_count_docids`
- `word_prefix_position_docids`
- `facet_id_f64_docids`
- `facet_id_string_docids`
- `documents_ids`
- `stop_words`
- `soft_deleted_documents_ids`
- `field_distribution`
- `fields_ids_map`
- `geo_faceted_documents_ids`
- `external_documents_ids`
- `number_faceted_documents_ids`
- `string_faceted_documents_ids`
- `words_fst`
- `words_prefixes_fst`
3. The identifier for the snapshot test (optional). Without an inline snapshot this can be
any expression that can be formatted with `{}`; combined with an inline snapshot it must
be a literal.
4. `@""` to write the snapshot inline (optional)

## Behaviour
The content of the database will be printed either inline or to the file system
at `test_directory/test_file.rs/test_name/db_name.snap`. When a snapshot identifier is
given, it is added as an extra directory after `test_name`.

If the database is too large (2048 bytes or more, or 256 bytes or more for an inline
snapshot), then only the md5 hash of the database will be saved, with
the name `db_name.hash.snap`. To *also* save the full content of the database anyway,
set the `MILLI_TEST_FULL_SNAPS` environment variable to `true`. The full snapshot will
be saved with the name `db_name.full.snap` but will not be saved to the git repository.
For inline snapshots, the hash is written inline while the full snapshot is still written
to the file system.

Running `cargo test` will check whether the old snapshot is identical to the
current one. If they are equal, the test passes. Otherwise, the test fails.
Use the command line `cargo insta` to approve or reject new snapshots.

## Example
```ignore
let index = TempIndex::new();
// basic usages
db_snap!(index, word_docids);
// named snapshot to avoid conflicts
db_snap!(index, word_docids, "some_identifier");
// write the snapshot inline
db_snap!(index, word_docids, @""); // will be autocompleted by running `cargo insta review`
// give a name to the inline snapshot
db_snap!(index, word_docids, "some_identifier", @"");
```
*/
#[macro_export]
macro_rules! db_snap {
// Named snapshot stored on the file system.
($index:ident, $db_name:ident, $name:expr) => {
let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(
&format!("{}", $name),
));
settings.bind(|| {
let snap = $crate::full_snap_of_db!($index, $db_name);
let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false);
for (name, snap) in snaps {
insta::assert_snapshot!(name, snap);
}
});
};
// Unnamed snapshot stored on the file system.
($index:ident, $db_name:ident) => {
let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None);
settings.bind(|| {
let snap = $crate::full_snap_of_db!($index, $db_name);
let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false);
for (name, snap) in snaps {
insta::assert_snapshot!(name, snap);
}
});
};
// Unnamed inline snapshot.
($index:ident, $db_name:ident, @$inline:literal) => {
let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None);
settings.bind(|| {
let snap = $crate::full_snap_of_db!($index, $db_name);
let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true);
for (name, snap) in snaps {
// Only the `.full` variant is written to a file; everything else is compared inline.
if !name.ends_with(".full") {
insta::assert_snapshot!(snap, @$inline);
} else {
insta::assert_snapshot!(name, snap);
}
}
});
};
// Named inline snapshot.
($index:ident, $db_name:ident, $name:literal, @$inline:literal) => {
let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name)));
settings.bind(|| {
let snap = $crate::full_snap_of_db!($index, $db_name);
let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true);
for (name, snap) in snaps {
// Only the `.full` variant is written to a file; everything else is compared inline.
if !name.ends_with(".full") {
insta::assert_snapshot!(snap, @$inline);
} else {
insta::assert_snapshot!(name, snap);
}
}
});
};
}
/// Renders the `word_docids` database, one entry per line: the key
/// left-aligned on 16 columns, followed by the document ids.
pub fn snap_word_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, word_docids, |(s, docids)| {
        &format!("{s:<16} {}", display_bitmap(&docids))
    })
}
/// Renders the `exact_word_docids` database, one entry per line: the key
/// left-aligned on 16 columns, followed by the document ids.
pub fn snap_exact_word_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, exact_word_docids, |(s, docids)| {
        &format!("{s:<16} {}", display_bitmap(&docids))
    })
}
/// Renders the `word_prefix_docids` database, one entry per line: the key
/// left-aligned on 16 columns, followed by the document ids.
pub fn snap_word_prefix_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, word_prefix_docids, |(s, docids)| {
        &format!("{s:<16} {}", display_bitmap(&docids))
    })
}
/// Renders the `exact_word_prefix_docids` database, one entry per line: the
/// key left-aligned on 16 columns, followed by the document ids.
pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, exact_word_prefix_docids, |(s, docids)| {
        &format!("{s:<16} {}", display_bitmap(&docids))
    })
}
/// Renders the `docid_word_positions` database, one entry per line: both key
/// components in padded columns, followed by the stored bitmap.
pub fn snap_docid_word_positions(index: &Index) -> String {
    make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), bitmap)| {
        &format!("{idx:<6} {s:<16} {}", display_bitmap(&bitmap))
    })
}
/// Renders the `word_pair_proximity_docids` database, one entry per line:
/// the three key components in padded columns, followed by the document ids.
pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
    make_db_snap_from_iter!(
        index,
        word_pair_proximity_docids,
        |((word1, word2, proximity), docids)| {
            &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&docids))
        }
    )
}
/// Renders the `word_prefix_pair_proximity_docids` database, one entry per
/// line: the three key components in padded columns, followed by the
/// document ids.
pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
    make_db_snap_from_iter!(
        index,
        word_prefix_pair_proximity_docids,
        |((word1, prefix, proximity), docids)| {
            &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&docids))
        }
    )
}
/// Renders the `word_position_docids` database, one entry per line: both key
/// components in padded columns, followed by the document ids.
pub fn snap_word_position_docids(index: &Index) -> String {
    make_db_snap_from_iter!(index, word_position_docids, |((word, position), docids)| {
        &format!("{word:<16} {position:<6} {}", display_bitmap(&docids))
    })
}
/// Renders the `field_id_word_count_docids` database, one entry per line:
/// both key components in padded columns, followed by the document ids.
pub fn snap_field_id_word_count_docids(index: &Index) -> String {
    make_db_snap_from_iter!(
        index,
        field_id_word_count_docids,
        |((field_id, word_count), docids)| {
            &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&docids))
        }
    )
}
/// Renders the `word_prefix_position_docids` database, one entry per line:
/// both key components in padded columns, followed by the document ids.
pub fn snap_word_prefix_position_docids(index: &Index) -> String {
    make_db_snap_from_iter!(
        index,
        word_prefix_position_docids,
        |((word_prefix, position), docids)| {
            &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&docids))
        }
    )
}
/// Renders the `facet_id_f64_docids` database, one entry per line: the four
/// key components in padded columns, followed by the document ids.
pub fn snap_facet_id_f64_docids(index: &Index) -> String {
    make_db_snap_from_iter!(
        index,
        facet_id_f64_docids,
        |((facet_id, level, left, right), docids)| {
            &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&docids))
        }
    )
}
pub fn snap_facet_id_string_docids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let bytes_db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
let iter = bytes_db.iter(&rtxn).unwrap();
let mut snap = String::new();
for x in iter {
let (key, value) = x.unwrap();
if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) {
let (orig_string, docids) =
FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap();
snap.push_str(&format!(
"{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n",
display_bitmap(&docids)
));
} else if let Some((field_id, level, left, right)) =
FacetLevelValueU32Codec::bytes_decode(key)
{
snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} "));
let (bounds, docids) =
FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_decode(value)
.unwrap();
if let Some((left, right)) = bounds {
snap.push_str(&format!("{left:<8} {right:<8} "));
}
snap.push_str(&display_bitmap(&docids));
snap.push('\n');
} else {
panic!();
}
}
snap
}
/// Renders the bitmap returned by `Index::documents_ids` with `display_bitmap`.
pub fn snap_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let all_ids = index.documents_ids(&rtxn).unwrap();
    display_bitmap(&all_ids)
}
/// Renders the value returned by `Index::stop_words` using its `Debug` output.
pub fn snap_stop_words(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let stop_words = index.stop_words(&rtxn).unwrap();
    format!("{stop_words:?}")
}
/// Renders the bitmap returned by `Index::soft_deleted_documents_ids` with
/// `display_bitmap`.
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let soft_deleted = index.soft_deleted_documents_ids(&rtxn).unwrap();
    display_bitmap(&soft_deleted)
}
/// Renders the field distribution, one `field count` pair per line, with the
/// field padded to 16 columns and the count to 6.
pub fn snap_field_distributions(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let distribution = index.field_distribution(&rtxn).unwrap();
    let mut snap = String::new();
    distribution.into_iter().for_each(|(field, count)| {
        writeln!(&mut snap, "{field:<16} {count:<6}").unwrap();
    });
    snap
}
/// Renders the fields-ids map, one `id name` pair per line, in the order
/// yielded by `ids()`.
pub fn snap_fields_ids_map(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut snap = String::new();
    fields_ids_map.ids().for_each(|field_id| {
        // Every id yielded by `ids()` is expected to have a name.
        let name = fields_ids_map.name(field_id).unwrap();
        writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap();
    });
    snap
}
/// Renders the bitmap returned by `Index::geo_faceted_documents_ids` with
/// `display_bitmap`.
pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let geo_ids = index.geo_faceted_documents_ids(&rtxn).unwrap();
    display_bitmap(&geo_ids)
}
/// Renders the raw bytes of the `soft` and `hard` external-documents-ids FSTs
/// as two lines, `soft: <hex>` and `hard: <hex>`.
///
/// NOTE(review): bytes are printed with `{:x}`, i.e. without zero padding, so
/// a byte below `0x10` yields a single digit. The output is stable but cannot
/// be decoded back into bytes unambiguously.
pub fn snap_external_documents_ids(index: &Index) -> String {
    /// Concatenates the lowercase, unpadded hex form of every byte.
    fn unpadded_hex(bytes: &[u8]) -> String {
        let mut hex = String::new();
        for byte in bytes {
            write!(&mut hex, "{:x}", byte).unwrap();
        }
        hex
    }

    let rtxn = index.read_txn().unwrap();
    let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();

    let soft_bytes = soft.into_fst().as_bytes().to_owned();
    let hard_bytes = hard.into_fst().as_bytes().to_owned();
    let hex_soft = unpadded_hex(&soft_bytes);
    let hex_hard = unpadded_hex(&hard_bytes);

    let mut snap = String::new();
    writeln!(&mut snap, "soft: {hex_soft}").unwrap();
    writeln!(&mut snap, "hard: {hex_hard}").unwrap();
    snap
}
/// Renders, for every field id of the fields-ids map, one line holding the
/// field id followed by its number-faceted document ids.
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut snap = String::new();
    fields_ids_map.ids().for_each(|field_id| {
        let docids = index.number_faceted_documents_ids(&rtxn, field_id).unwrap();
        writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&docids)).unwrap();
    });
    snap
}
/// Renders, for every field id of the fields-ids map, one line holding the
/// field id followed by its string-faceted document ids.
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut snap = String::new();
    fields_ids_map.ids().for_each(|field_id| {
        let docids = index.string_faceted_documents_ids(&rtxn, field_id).unwrap();
        writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&docids)).unwrap();
    });
    snap
}
/// Renders the raw bytes of the words FST as a lowercase hex string.
///
/// NOTE(review): bytes are printed with `{:x}`, i.e. without zero padding, so
/// a byte below `0x10` yields a single digit.
pub fn snap_words_fst(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let fst_bytes = index.words_fst(&rtxn).unwrap().into_fst().as_bytes().to_owned();
    let mut snap = String::new();
    fst_bytes.iter().for_each(|byte| write!(&mut snap, "{:x}", byte).unwrap());
    snap
}
/// Renders the raw bytes of the words-prefixes FST as a lowercase hex string.
///
/// NOTE(review): bytes are printed with `{:x}`, i.e. without zero padding, so
/// a byte below `0x10` yields a single digit.
pub fn snap_words_prefixes_fst(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let fst_bytes = index.words_prefixes_fst(&rtxn).unwrap().into_fst().as_bytes().to_owned();
    let mut snap = String::new();
    fst_bytes.iter().for_each(|byte| write!(&mut snap, "{:x}", byte).unwrap());
    snap
}
/// Renders a fixed list of index settings, one `name: value` line per setting.
///
/// Each setting is read through the `Index` getter of the same name and the
/// value is printed with its `Debug` output.
pub fn snap_settings(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let mut snap = String::new();

    // Emits one line per listed getter, in the order given.
    macro_rules! write_settings_to_snap {
        ($($setting:ident),+ $(,)?) => {
            $(
                writeln!(
                    &mut snap,
                    "{}: {:?}",
                    stringify!($setting),
                    index.$setting(&rtxn).unwrap()
                )
                .unwrap();
            )+
        };
    }

    write_settings_to_snap!(
        primary_key,
        criteria,
        displayed_fields,
        distinct_field,
        filterable_fields,
        sortable_fields,
        synonyms,
        authorize_typos,
        min_word_len_one_typo,
        min_word_len_two_typos,
        exact_words,
        exact_attributes,
        max_values_per_facet,
        pagination_max_total_hits,
        searchable_fields,
        user_defined_searchable_fields,
    );

    snap
}
/// Expands to the full, un-hashed snapshot `String` of one database of an index.
///
/// The first argument is the identifier of the index and the second is the name
/// of the content to snapshot. Each name is dispatched to a `snap_*` function of
/// `snapshot_tests`, which receives `&$index`. Names without a matching arm fail
/// to compile.
///
/// Note that `field_distribution` is served by `snap_field_distributions`
/// (plural); every other name maps to `snap_<name>`.
#[macro_export]
macro_rules! full_snap_of_db {
($index:ident, settings) => {{
$crate::snapshot_tests::snap_settings(&$index)
}};
($index:ident, word_docids) => {{
$crate::snapshot_tests::snap_word_docids(&$index)
}};
($index:ident, exact_word_docids) => {{
$crate::snapshot_tests::snap_exact_word_docids(&$index)
}};
($index:ident, word_prefix_docids) => {{
$crate::snapshot_tests::snap_word_prefix_docids(&$index)
}};
($index:ident, exact_word_prefix_docids) => {{
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
}};
($index:ident, docid_word_positions) => {{
$crate::snapshot_tests::snap_docid_word_positions(&$index)
}};
($index:ident, word_pair_proximity_docids) => {{
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
}};
($index:ident, word_prefix_pair_proximity_docids) => {{
$crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
}};
($index:ident, word_position_docids) => {{
$crate::snapshot_tests::snap_word_position_docids(&$index)
}};
($index:ident, field_id_word_count_docids) => {{
$crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
}};
($index:ident, word_prefix_position_docids) => {{
$crate::snapshot_tests::snap_word_prefix_position_docids(&$index)
}};
($index:ident, facet_id_f64_docids) => {{
$crate::snapshot_tests::snap_facet_id_f64_docids(&$index)
}};
($index:ident, facet_id_string_docids) => {{
$crate::snapshot_tests::snap_facet_id_string_docids(&$index)
}};
($index:ident, documents_ids) => {{
$crate::snapshot_tests::snap_documents_ids(&$index)
}};
($index:ident, stop_words) => {{
$crate::snapshot_tests::snap_stop_words(&$index)
}};
($index:ident, soft_deleted_documents_ids) => {{
$crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
}};
// The function name is plural while the macro argument is singular.
($index:ident, field_distribution) => {{
$crate::snapshot_tests::snap_field_distributions(&$index)
}};
($index:ident, fields_ids_map) => {{
$crate::snapshot_tests::snap_fields_ids_map(&$index)
}};
($index:ident, geo_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index)
}};
($index:ident, external_documents_ids) => {{
$crate::snapshot_tests::snap_external_documents_ids(&$index)
}};
($index:ident, number_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
}};
($index:ident, string_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
}};
($index:ident, words_fst) => {{
$crate::snapshot_tests::snap_words_fst(&$index)
}};
($index:ident, words_prefixes_fst) => {{
$crate::snapshot_tests::snap_words_prefixes_fst(&$index)
}};
}
/// Decides which snapshots to store for a database: the content itself, or
/// its md5 hash when the content is too large.
///
/// # Arguments
///
/// * `name` - base name of the snapshot (the database name).
/// * `snap` - full textual content of the database.
/// * `inline` - whether the snapshot is written inline; inline snapshots are
///   hashed from 256 bytes, file snapshots from 2048 bytes.
///
/// # Returns
///
/// A list of `(snapshot name, snapshot content)` pairs:
/// * `[(name, snap)]` when `snap` is below the size limit;
/// * otherwise `[("{name}.hash", <md5 hex digest>)]`, preceded by
///   `("{name}.full", snap)` when the `MILLI_TEST_FULL_SNAPS` environment
///   variable is set to `true`.
///
/// # Panics
///
/// Panics if `MILLI_TEST_FULL_SNAPS` is set to something other than `true` or
/// `false`. The variable is checked even when the snapshot is small, so a
/// misconfiguration is reported regardless of the snapshot size.
pub fn convert_snap_to_hash_if_needed<'snap>(
    name: &str,
    snap: &'snap str,
    inline: bool,
) -> Vec<(String, Cow<'snap, str>)> {
    // An unset (or non-unicode) variable means "do not store full snapshots".
    let store_whole_snapshot = match std::env::var("MILLI_TEST_FULL_SNAPS") {
        Ok(value) => value.parse::<bool>().unwrap_or_else(|_| {
            panic!("MILLI_TEST_FULL_SNAPS must be `true` or `false`, got {value:?}")
        }),
        Err(_) => false,
    };

    let max_len = if inline { 256 } else { 2048 };
    if snap.len() < max_len {
        return vec![(name.to_owned(), Cow::Borrowed(snap))];
    }

    let mut snaps = Vec::with_capacity(2);
    if store_whole_snapshot {
        snaps.push((format!("{name}.full"), Cow::Borrowed(snap)));
    }
    let hash = md5::compute(snap.as_bytes());
    snaps.push((format!("{name}.hash"), Cow::Owned(format!("{hash:x}"))));
    snaps
}
/// Builds the textual snapshot of a database by iterating over all its entries.
///
/// Arguments:
/// 1. `$index`: identifier of the index; a read transaction is opened on it.
/// 2. `$name`: name of the database field of the index to iterate over.
/// 3. `|$vars| $push`: a closure-like syntax where `$vars` is the pattern each
///    entry is destructured into and `$push` is a block evaluating to the
///    `&str` to append for that entry.
///
/// The macro evaluates to a `String` containing one line per entry, each
/// terminated by `\n`. It panics if the transaction cannot be opened, the
/// iterator cannot be created, or an entry cannot be read.
#[macro_export]
macro_rules! make_db_snap_from_iter {
($index:ident, $name:ident, |$vars:pat| $push:block) => {{
let rtxn = $index.read_txn().unwrap();
let iter = $index.$name.iter(&rtxn).unwrap();
let mut snap = String::new();
for x in iter {
// Bind the entry to the caller-provided pattern so that `$push` can use its variables.
let $vars = x.unwrap();
snap.push_str($push);
snap.push('\n');
}
snap
}};
}
/// Formats a bitmap as `[a, b, c, ]`: every value is followed by `", "`,
/// including the last one, and an empty bitmap is rendered as `[]`.
pub fn display_bitmap(b: &RoaringBitmap) -> String {
    let mut rendered = String::from("[");
    for x in b.iter() {
        write!(&mut rendered, "{x}, ").unwrap();
    }
    rendered.push(']');
    rendered
}

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1
id 2
name 2

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1
id 2
name 2

View File

@ -342,3 +342,93 @@ fn write_string_entry(
writer.insert(&key, &data)?;
Ok(())
}
#[cfg(test)]
mod tests {
    use std::num::NonZeroUsize;

    use crate::db_snap;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;

    /// Creates a temporary index with auto-generated document ids, the given
    /// facet level configuration, and `facet`/`facet2` as filterable fields.
    fn index_with_facet_config(
        group_size: Option<NonZeroUsize>,
        min_level_size: Option<NonZeroUsize>,
    ) -> TempIndex {
        let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB
        index.index_documents_config.autogenerate_docids = true;
        index.index_documents_config.facet_level_group_size = group_size;
        index.index_documents_config.facet_min_level_size = min_level_size;
        index
            .update_settings(|settings| {
                let filterable = ["facet", "facet2"].iter().map(|field| field.to_string());
                settings.set_filterable_fields(filterable.collect());
            })
            .unwrap();
        index
    }

    /// Snapshots the number facet database for several level configurations.
    #[test]
    fn test_facets_number() {
        let cases = [
            ("default", None, None),
            ("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)),
            ("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)),
            ("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)),
            ("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)),
            ("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)),
        ];

        for &(name, group_size, min_level_size) in cases.iter() {
            let index = index_with_facet_config(group_size, min_level_size);

            // 1000 documents with a `facet` number, then 100 with a `facet2` number.
            let documents: Vec<_> = (0..1_000)
                .map(|i| serde_json::json!({ "facet": i }))
                .chain((0..100).map(|i| serde_json::json!({ "facet2": i })))
                .map(|document| document.as_object().unwrap().clone())
                .collect();

            let documents = documents_batch_reader_from_objects(documents);
            index.add_documents(documents).unwrap();

            db_snap!(index, facet_id_f64_docids, name);
        }
    }

    /// Snapshots the string facet database for several level configurations.
    #[test]
    fn test_facets_string() {
        let cases = [
            ("default", None, None),
            ("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)),
        ];

        for &(name, group_size, min_level_size) in cases.iter() {
            let index = index_with_facet_config(group_size, min_level_size);

            // 100 documents with a `facet` string, then 10 with a `facet2` string.
            let documents: Vec<_> = (0..100)
                .map(|i| serde_json::json!({ "facet": format!("s{i:X}") }))
                .chain((0..10).map(|i| serde_json::json!({ "facet2": format!("s{i:X}") })))
                .map(|document| document.as_object().unwrap().clone())
                .collect();

            let documents = documents_batch_reader_from_objects(documents);
            index.add_documents(documents).unwrap();

            db_snap!(index, facet_id_string_docids, name);
        }
    }
}

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
587899707db2848da3f18399e14ed4d0

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
02bbf2ca1663cccea0e4c06d5ad06a45

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
e68ea591e1af3e53e544dff9a1648e88

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
12a4bb0f5b95d7629c2b9a915150c0cf

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
6438e94bc7fada13022e0efccdf294e0

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
5348bbc46b5384455b6a900666d2a502

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
faddef9eae5f2efacfec51f20f2e8cd6

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facets.rs
---
ddb8fc987c5dc892337682595043858e

View File

@ -0,0 +1,46 @@
---
source: milli/src/update/word_prefix_pair_proximity_docids.rs
---
5 a 1 [101, ]
5 a 2 [101, ]
5 b 4 [101, ]
5 be 4 [101, ]
am a 3 [101, ]
amazing a 1 [100, ]
amazing a 2 [100, ]
amazing a 3 [100, ]
amazing b 2 [100, ]
amazing be 2 [100, ]
an a 1 [100, ]
an a 2 [100, ]
an b 3 [100, ]
an be 3 [100, ]
and a 2 [100, ]
and a 3 [100, ]
and a 4 [100, ]
and b 1 [100, ]
and be 1 [100, ]
at a 1 [100, ]
at a 2 [100, 101, ]
at a 3 [100, ]
at b 3 [101, ]
at b 4 [100, ]
at be 3 [101, ]
at be 4 [100, ]
beautiful a 2 [100, ]
beautiful a 3 [100, ]
beautiful a 4 [100, ]
bell a 2 [101, ]
bell a 4 [101, ]
house a 3 [100, ]
house a 4 [100, ]
house b 2 [100, ]
house be 2 [100, ]
rings a 1 [101, ]
rings a 3 [101, ]
rings b 2 [101, ]
rings be 2 [101, ]
the a 3 [101, ]
the b 1 [101, ]
the be 1 [101, ]

View File

@ -0,0 +1,56 @@
---
source: milli/src/update/word_prefix_pair_proximity_docids.rs
---
5 a 1 [101, ]
5 a 2 [101, ]
5 am 1 [101, ]
5 b 4 [101, ]
5 be 4 [101, ]
am a 3 [101, ]
amazing a 1 [100, ]
amazing a 2 [100, ]
amazing a 3 [100, ]
amazing b 2 [100, ]
amazing be 2 [100, ]
an a 1 [100, ]
an a 2 [100, 202, ]
an am 1 [100, ]
an b 3 [100, ]
an be 3 [100, ]
and a 2 [100, ]
and a 3 [100, ]
and a 4 [100, ]
and am 2 [100, ]
and b 1 [100, ]
and be 1 [100, ]
at a 1 [100, 202, ]
at a 2 [100, 101, ]
at a 3 [100, ]
at am 2 [100, 101, ]
at b 3 [101, ]
at b 4 [100, ]
at be 3 [101, ]
at be 4 [100, ]
beautiful a 2 [100, ]
beautiful a 3 [100, ]
beautiful a 4 [100, ]
beautiful am 3 [100, ]
bell a 2 [101, ]
bell a 4 [101, ]
bell am 4 [101, ]
extraordinary a 2 [202, ]
extraordinary a 3 [202, ]
house a 3 [100, 202, ]
house a 4 [100, 202, ]
house am 4 [100, ]
house b 2 [100, ]
house be 2 [100, ]
rings a 1 [101, ]
rings a 3 [101, ]
rings am 3 [101, ]
rings b 2 [101, ]
rings be 2 [101, ]
the a 3 [101, ]
the b 1 [101, ]
the be 1 [101, ]

View File

@ -244,3 +244,88 @@ fn insert_current_prefix_data_in_sorter<'a>(
Ok(())
}
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;

    /// Builds a document made of a single `text` field.
    fn text_document(text: &str) -> crate::Object {
        serde_json::json!({ "text": text }).as_object().unwrap().clone()
    }

    /// For each prefix, generates 50 documents whose text is the prefix
    /// followed by a distinct lowercase hex number.
    fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
        prefixes
            .iter()
            .flat_map(|prefix| (0..50).map(move |i| text_document(&format!("{prefix}{i:x}"))))
            .collect()
    }

    /// Snapshots `word_prefix_pair_proximity_docids` after an initial indexing
    /// and again after a second batch of documents.
    #[test]
    fn test_update() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let batch_reader_from_documents = |objects| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in objects {
                builder.append_json_object(&object).unwrap();
            }
            let batch = builder.into_inner().unwrap();
            DocumentsBatchReader::from_reader(Cursor::new(batch)).unwrap()
        };

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(text_document("At an amazing and beautiful house"));
        documents.push(text_document("The bell rings at 5 am"));

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");

        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
        documents.push(text_document("At an extraordinary house"));

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "update");
    }
}