Mirror of https://github.com/meilisearch/MeiliSearch, synced 2024-11-11 23:48:56 +01:00

Merge #478
478: Disable typo on attribute r=Kerollmops a=MarinPostma

Disables typo tolerance on a configurable set of "exact" attributes: the commit adds an `exact-attributes` setting plus two LMDB databases (`exact-word-docids` and `exact-word-prefix-docids`), routes indexing, deletion, and clearing through them, and merges their postings into search results only when a query word carries no typo.

Co-authored-by: ad hoc <postma.marin@protonmail.com>

This commit is contained in: aadb0c58c9
@@ -99,8 +99,10 @@ impl Settings {
             })
             .collect();

+        let exact_attributes = index.exact_attributes(&txn)?;
+
         println!(
-            "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n",
+            "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n",
             displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"),
             searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"),
             filterable_attributes.join("\n\t"),

@@ -109,6 +111,7 @@ impl Settings {
             stop_words.join("\n\t"),
             distinct_field.unwrap_or_default(),
             synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::<String>(),
+            exact_attributes.join("\n\t"),
         );
         Ok(())
     }

@@ -463,6 +466,8 @@ struct SettingsUpdate {
     filterable_attributes: Option<Vec<String>>,
     #[structopt(long)]
     criteria: Option<Vec<String>>,
+    #[structopt(long)]
+    exact_attributes: Option<Vec<String>>,
 }

 impl Performer for SettingsUpdate {

@@ -489,6 +494,14 @@ impl Performer for SettingsUpdate {
             }
         }

+        if let Some(exact_attributes) = self.exact_attributes {
+            if !exact_attributes.is_empty() {
+                update.set_exact_attributes(exact_attributes.into_iter().collect());
+            } else {
+                update.reset_exact_attributes();
+            }
+        }
+
         let mut bars = Vec::new();
         let progesses = MultiProgress::new();
         for _ in 0..4 {
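
Aside (not part of the diff): the new CLI arm follows the structopt convention where passing `--exact-attributes` with no values resets the setting rather than storing an empty list. A minimal, self-contained sketch of that convention, with hypothetical types rather than the milli API:

    use std::collections::HashSet;

    #[derive(Default)]
    struct SettingsUpdate {
        exact_attributes: Option<HashSet<String>>, // None = leave unchanged
        reset_exact_attributes: bool,
    }

    fn apply_exact_attributes(update: &mut SettingsUpdate, cli_value: Option<Vec<String>>) {
        if let Some(attrs) = cli_value {
            if !attrs.is_empty() {
                // set: replace the stored attribute list
                update.exact_attributes = Some(attrs.into_iter().collect());
            } else {
                // reset: an explicitly empty list clears the setting
                update.reset_exact_attributes = true;
            }
        }
    }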
@@ -29,6 +29,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[
     FACET_ID_STRING_DOCIDS,
     FIELD_ID_DOCID_FACET_F64S,
     FIELD_ID_DOCID_FACET_STRINGS,
+    EXACT_WORD_DOCIDS,
+    EXACT_WORD_PREFIX_DOCIDS,
     DOCUMENTS,
 ];

@@ -384,6 +386,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
         field_id_word_count_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
+        exact_word_docids,
+        exact_word_prefix_docids,
         field_id_docid_facet_f64s: _,
         field_id_docid_facet_strings: _,
         documents,

@@ -436,6 +440,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
            }
        }

+    for result in exact_word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let (word, value) = result?;
+        heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
+        if heap.len() > limit {
+            heap.pop();
+        }
+    }
+
     for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
         let (word, value) = result?;
         heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));

@@ -444,6 +456,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
            }
        }

+    for result in exact_word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let (word, value) = result?;
+        heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
+        if heap.len() > limit {
+            heap.pop();
+        }
+    }
+
     for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
         let ((docid, word), value) = result?;
         let key = format!("{} {}", docid, word);

@@ -967,6 +987,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
         facet_id_string_docids,
         field_id_docid_facet_f64s,
         field_id_docid_facet_strings,
+        exact_word_prefix_docids,
+        exact_word_docids,
         documents,
     } = index;

@@ -991,6 +1013,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
             FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
             FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
             FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
+            EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
+            EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(),

             DOCUMENTS => documents.as_polymorph(),
             unknown => anyhow::bail!("unknown database {:?}", unknown),
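
Aside (not part of the diff): the two added loops reuse the file's existing top-N idiom, a min-heap of `Reverse` entries capped at `limit` so that only the largest values survive. Standalone sketch of the pattern:

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    fn top_n_largest(entries: impl Iterator<Item = (usize, String)>, limit: usize) -> Vec<(usize, String)> {
        let mut heap = BinaryHeap::new();
        for entry in entries {
            heap.push(Reverse(entry));
            if heap.len() > limit {
                heap.pop(); // evicts the current smallest, keeping the `limit` largest
            }
        }
        heap.into_sorted_vec().into_iter().map(|Reverse(e)| e).collect()
    }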
@@ -53,12 +53,15 @@ pub mod main_key {
     pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
     pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
     pub const EXACT_WORDS: &str = "exact-words";
+    pub const EXACT_ATTRIBUTES: &str = "exact-attributes";
 }

 pub mod db_name {
     pub const MAIN: &str = "main";
     pub const WORD_DOCIDS: &str = "word-docids";
+    pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
+    pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";

@@ -82,9 +85,16 @@ pub struct Index {

     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, RoaringBitmapCodec>,

+    /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
+    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
+
     /// A prefix of word and all the documents ids containing this prefix.
     pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,

+    /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
+    pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+
     /// Maps a word and a document id (u32) to all the positions where the given word appears.
     pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,

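Aside (not part of the diff): both new columns have the same shape as their non-exact counterparts, a word key mapped to a roaring bitmap of document ids. A tiny illustration of the value type and of the `|=` union used later at search time (roaring crate):

    use roaring::RoaringBitmap;

    fn main() {
        let tolerant: RoaringBitmap = (0..5).collect(); // docids from word_docids
        let exact: RoaringBitmap = (3..8).collect();    // docids from exact_word_docids
        let mut candidates = tolerant;
        candidates |= &exact; // union of the two posting lists
        assert_eq!(candidates.len(), 8);
    }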
@@ -118,13 +128,15 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;

-        options.max_dbs(14);
+        options.max_dbs(16);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };

         let env = options.open(path)?;
         let main = env.create_poly_database(Some(MAIN))?;
         let word_docids = env.create_database(Some(WORD_DOCIDS))?;
+        let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
+        let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_prefix_pair_proximity_docids =

@@ -145,7 +157,9 @@ impl Index {
             env,
             main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
+            exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,

@@ -949,6 +963,33 @@ impl Index {
         )?;
         Ok(())
     }

+    /// Returns the exact attributes: attributes for which typo is disallowed.
+    pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result<Vec<&'t str>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<Vec<&str>>>(txn, main_key::EXACT_ATTRIBUTES)?
+            .unwrap_or_default())
+    }
+
+    /// Returns the list of exact attributes field ids.
+    pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result<HashSet<FieldId>> {
+        let attrs = self.exact_attributes(txn)?;
+        let fid_map = self.fields_ids_map(txn)?;
+        Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect())
+    }
+
+    /// Writes the exact attributes to the database.
+    pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> {
+        self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?;
+        Ok(())
+    }
+
+    /// Clears the exact attributes from the store.
+    pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> {
+        self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?;
+        Ok(())
+    }
 }

 #[cfg(test)]
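
Aside (not part of the diff): the four new methods store the whole setting as a single bincode-encoded list under the fixed `exact-attributes` key of the `main` database. A standalone model of that round trip, with LMDB mocked by a `HashMap` and assuming the bincode 1.x crate:

    use std::collections::HashMap;

    fn put_exact_attributes(main: &mut HashMap<&'static str, Vec<u8>>, attrs: &[&str]) {
        main.insert("exact-attributes", bincode::serialize(&attrs).unwrap());
    }

    fn exact_attributes(main: &HashMap<&'static str, Vec<u8>>) -> Vec<String> {
        main.get("exact-attributes")
            .map(|bytes| bincode::deserialize::<Vec<String>>(bytes).unwrap())
            .unwrap_or_default() // a missing key means "no exact attributes"
    }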
@@ -68,7 +68,9 @@ impl Default for Candidates {
 pub trait Context<'c> {
     fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn word_pair_proximity_docids(
         &self,
         left: &str,

@@ -118,10 +120,18 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
         self.index.word_docids.get(self.rtxn, &word)
     }

+    fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.exact_word_docids.get(self.rtxn, &word)
+    }
+
     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
         self.index.word_prefix_docids.get(self.rtxn, &word)
     }

+    fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.exact_word_prefix_docids.get(self.rtxn, &word)
+    }
+
     fn word_pair_proximity_docids(
         &self,
         left: &str,

@@ -392,26 +402,42 @@ fn query_docids(
     wdcache: &mut WordDerivationsCache,
 ) -> Result<RoaringBitmap> {
     match &query.kind {
-        QueryKind::Exact { word, .. } => {
+        QueryKind::Exact { word, original_typo } => {
             if query.prefix && ctx.in_prefix_cache(&word) {
-                Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default())
+                let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default();
+                // only add the exact docids if the word hasn't been derived
+                if *original_typo == 0 {
+                    docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default();
+                }
+                Ok(docids)
             } else if query.prefix {
                 let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?;
                 let mut docids = RoaringBitmap::new();
                 for (word, _typo) in words {
-                    let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
-                    docids |= current_docids;
+                    docids |= ctx.word_docids(&word)?.unwrap_or_default();
+                    // only add the exact docids if the word hasn't been derived
+                    if *original_typo == 0 {
+                        docids |= ctx.exact_word_docids(&word)?.unwrap_or_default();
+                    }
                 }
                 Ok(docids)
             } else {
-                Ok(ctx.word_docids(&word)?.unwrap_or_default())
+                let mut docids = ctx.word_docids(&word)?.unwrap_or_default();
+                // only add the exact docids if the word hasn't been derived
+                if *original_typo == 0 {
+                    docids |= ctx.exact_word_docids(&word)?.unwrap_or_default();
+                }
+                Ok(docids)
             }
         }
         QueryKind::Tolerant { typo, word } => {
             let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?;
             let mut docids = RoaringBitmap::new();
-            for (word, _typo) in words {
-                let current_docids = ctx.word_docids(&word)?.unwrap_or_default();
+            for (word, typo) in words {
+                let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default();
+                if *typo == 0 {
+                    current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default()
+                }
                 docids |= current_docids;
             }
             Ok(docids)
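
Aside (not part of the diff): the rule implemented in all the branches above is the same: exact postings are only unioned in when the query word carries no typo (`original_typo == 0`, resp. `typo == 0`), so a word coming from an exact attribute can never be matched through a typo derivation. Distilled:

    use roaring::RoaringBitmap;

    fn merge_candidates(
        tolerant: Option<RoaringBitmap>,
        exact: Option<RoaringBitmap>,
        typos_used: u8,
    ) -> RoaringBitmap {
        let mut docids = tolerant.unwrap_or_default();
        if typos_used == 0 {
            // the word was not derived: documents indexed in the exact
            // databases are legitimate matches too
            docids |= exact.unwrap_or_default();
        }
        docids
    }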
@@ -512,7 +538,9 @@ pub mod test {
    pub struct TestContext<'t> {
        words_fst: fst::Set<Cow<'t, [u8]>>,
        word_docids: HashMap<String, RoaringBitmap>,
+        exact_word_docids: HashMap<String, RoaringBitmap>,
        word_prefix_docids: HashMap<String, RoaringBitmap>,
+        exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
        word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
        word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
        docid_words: HashMap<u32, Vec<String>>,

@@ -527,10 +555,18 @@ pub mod test {
            Ok(self.word_docids.get(&word.to_string()).cloned())
        }

+        fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+            Ok(self.exact_word_docids.get(&word.to_string()).cloned())
+        }
+
        fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
            Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
        }

+        fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+            Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned())
+        }
+
        fn word_pair_proximity_docids(
            &self,
            left: &str,

@@ -643,6 +679,8 @@ pub mod test {
                s("morning") => random_postings(rng, 125),
            };

+            let exact_word_docids = HashMap::new();
+
            let mut docid_words = HashMap::new();
            for (word, docids) in word_docids.iter() {
                for docid in docids {

@@ -657,6 +695,8 @@ pub mod test {
                s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
            };

+            let exact_word_prefix_docids = HashMap::new();
+
            let mut word_pair_proximity_docids = HashMap::new();
            let mut word_prefix_pair_proximity_docids = HashMap::new();
            for (lword, lcandidates) in &word_docids {

@@ -712,7 +752,9 @@ pub mod test {
            TestContext {
                words_fst,
                word_docids,
+                exact_word_docids,
                word_prefix_docids,
+                exact_word_prefix_docids,
                word_pair_proximity_docids,
                word_prefix_pair_proximity_docids,
                docid_words,

@@ -1267,6 +1267,7 @@ mod test {
            QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
        );
    }
+
    #[test]
    fn disable_typo_on_word() {
        let query = "goodbye";
@@ -19,7 +19,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            env: _env,
            main: _main,
            word_docids,
+            exact_word_docids,
            word_prefix_docids,
+            exact_word_prefix_docids,
            docid_word_positions,
            word_pair_proximity_docids,
            word_prefix_pair_proximity_docids,

@@ -55,7 +57,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {

        // Clear the other databases.
        word_docids.clear(self.wtxn)?;
+        exact_word_docids.clear(self.wtxn)?;
        word_prefix_docids.clear(self.wtxn)?;
+        exact_word_prefix_docids.clear(self.wtxn)?;
        docid_word_positions.clear(self.wtxn)?;
        word_pair_proximity_docids.clear(self.wtxn)?;
        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
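
Aside (not part of the diff): these call sites destructure `Index` exhaustively, with no `..` catch-all. That is what forces every clear/delete site to be revisited when a field such as `exact_word_docids` is added: forgetting one becomes a compile error rather than a silently stale database. Sketch:

    struct Db;
    struct Index {
        word_docids: Db,
        exact_word_docids: Db,
    }

    fn clear(index: &Index) {
        // No `..`: adding a field to `Index` breaks this line until it is handled.
        let Index { word_docids, exact_word_docids } = index;
        let _ = (word_docids, exact_word_docids);
    }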
@@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
 use std::collections::HashMap;

 use fst::IntoStreamer;
-use heed::types::ByteSlice;
-use heed::{BytesDecode, BytesEncode};
+use heed::types::{ByteSlice, Str};
+use heed::{BytesDecode, BytesEncode, Database};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;

@@ -16,7 +16,10 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
-use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
+use crate::{
+    DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
+    BEU32,
+};

 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,

@@ -108,7 +111,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            env: _env,
            main: _main,
            word_docids,
+            exact_word_docids,
            word_prefix_docids,
+            exact_word_prefix_docids,
            docid_word_positions,
            word_pair_proximity_docids,
            field_id_word_count_docids,

@@ -204,25 +209,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        // We iterate over the words and delete the documents ids
        // from the word docids database.
        for (word, must_remove) in &mut words {
-            // We create an iterator to be able to get the content and delete the word docids.
-            // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
-            // the LMDB B-Tree two times but only once.
-            let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
-            if let Some((key, mut docids)) = iter.next().transpose()? {
-                if key == word.as_str() {
-                    let previous_len = docids.len();
-                    docids -= &self.documents_ids;
-                    if docids.is_empty() {
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.del_current()? };
-                        *must_remove = true;
-                    } else if docids.len() != previous_len {
-                        let key = key.to_owned();
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.put_current(&key, &docids)? };
-                    }
-                }
-            }
+            remove_from_word_docids(
+                self.wtxn,
+                word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
+
+            remove_from_word_docids(
+                self.wtxn,
+                exact_word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
        }

        // We construct an FST set that contains the words to delete from the words FST.

@@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        // We write the new words FST into the main database.
        self.index.put_words_fst(self.wtxn, &new_words_fst)?;

-        // We iterate over the word prefix docids database and remove the deleted documents ids
-        // from every docids lists. We register the empty prefixes in an fst Set for futur deletion.
-        let mut prefixes_to_delete = fst::SetBuilder::memory();
-        let mut iter = word_prefix_docids.iter_mut(self.wtxn)?;
-        while let Some(result) = iter.next() {
-            let (prefix, mut docids) = result?;
-            let prefix = prefix.to_owned();
-            let previous_len = docids.len();
-            docids -= &self.documents_ids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-                prefixes_to_delete.insert(prefix)?;
-            } else if docids.len() != previous_len {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&prefix, &docids)? };
-            }
-        }
-
-        drop(iter);
+        let prefixes_to_delete =
+            remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?;
+
+        let exact_prefix_to_delete = remove_from_word_prefix_docids(
+            self.wtxn,
+            exact_word_prefix_docids,
+            &self.documents_ids,
+        )?;
+
+        let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();

        // We compute the new prefix FST and write it only if there is a change.
-        let prefixes_to_delete = prefixes_to_delete.into_set();
-        if !prefixes_to_delete.is_empty() {
+        if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() {
            let new_words_prefixes_fst = {
                // We retrieve the current words prefixes FST from the database.
                let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
-                let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference();
+                let difference =
+                    words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference();

                // We stream the new external ids that does no more contains the to-delete external ids.
                let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();

@@ -457,6 +448,64 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
    }
 }

+fn remove_from_word_prefix_docids(
+    txn: &mut heed::RwTxn,
+    db: &Database<Str, RoaringBitmapCodec>,
+    to_remove: &RoaringBitmap,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut prefixes_to_delete = fst::SetBuilder::memory();
+
+    // We iterate over the word prefix docids database and remove the deleted documents ids
+    // from every docids lists. We register the empty prefixes in an fst Set for futur deletion.
+    let mut iter = db.iter_mut(txn)?;
+    while let Some(result) = iter.next() {
+        let (prefix, mut docids) = result?;
+        let prefix = prefix.to_owned();
+        let previous_len = docids.len();
+        docids -= to_remove;
+        if docids.is_empty() {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.del_current()? };
+            prefixes_to_delete.insert(prefix)?;
+        } else if docids.len() != previous_len {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.put_current(&prefix, &docids)? };
+        }
+    }
+
+    Ok(prefixes_to_delete.into_set())
+}
+
+fn remove_from_word_docids(
+    txn: &mut heed::RwTxn,
+    db: &heed::Database<Str, RoaringBitmapCodec>,
+    word: &str,
+    must_remove: &mut bool,
+    to_remove: &RoaringBitmap,
+) -> Result<()> {
+    // We create an iterator to be able to get the content and delete the word docids.
+    // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
+    // the LMDB B-Tree two times but only once.
+    let mut iter = db.prefix_iter_mut(txn, &word)?;
+    if let Some((key, mut docids)) = iter.next().transpose()? {
+        if key == word {
+            let previous_len = docids.len();
+            docids -= to_remove;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+                *must_remove = true;
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+    }
+
+    Ok(())
+}
+
 fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, DC>,
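
Aside (not part of the diff): both extracted helpers implement the same subtract-and-prune contract per entry: remove the deleted ids from the posting list, delete the entry when it becomes empty, and rewrite it only when it actually shrank. An in-memory model of that contract, with a `BTreeMap` standing in for the LMDB cursor:

    use std::collections::BTreeMap;
    use roaring::RoaringBitmap;

    /// Returns true when the word disappeared entirely (it must then also be
    /// removed from the words FST, as the surrounding code does).
    fn remove_docids(
        postings: &mut BTreeMap<String, RoaringBitmap>,
        word: &str,
        to_remove: &RoaringBitmap,
    ) -> bool {
        if let Some(docids) = postings.get_mut(word) {
            let previous_len = docids.len();
            *docids -= to_remove;
            if docids.is_empty() {
                postings.remove(word);
                return true;
            }
            // when docids.len() == previous_len nothing changed,
            // so the real code skips the LMDB write entirely
            let _ = previous_len;
        }
        false
    }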
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::fs::File;
 use std::io;
 use std::iter::FromIterator;

@@ -10,17 +11,22 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::Result;
+use crate::update::index_documents::helpers::read_u32_ne_bytes;
+use crate::{relative_from_absolute_position, FieldId, Result};

 /// Extracts the word and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words and
 /// documents ids from the given chunk of docid word positions.
+///
+/// The first returned reader is the one for normal word_docids, and the second one is for
+/// exact_word_docids
 #[logging_timer::time]
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+    exact_attributes: &HashSet<FieldId>,
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
     let max_memory = indexer.max_memory_by_thread();

     let mut word_docids_sorter = create_sorter(

@@ -28,20 +34,53 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory,
+        max_memory.map(|x| x / 2),
+    );
+
+    let mut exact_word_docids_sorter = create_sorter(
+        merge_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|x| x / 2),
     );

     let mut value_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, _value)) = cursor.move_on_next()? {
+    while let Some((key, positions)) = cursor.move_on_next()? {
         let (document_id_bytes, word_bytes) = try_split_array_at(key)
             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);

         let bitmap = RoaringBitmap::from_iter(Some(document_id));
         serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+
+        // If there are no exact attributes, we do not need to iterate over positions.
+        if exact_attributes.is_empty() {
             word_docids_sorter.insert(word_bytes, &value_buffer)?;
+        } else {
+            let mut added_to_exact = false;
+            let mut added_to_word_docids = false;
+            for position in read_u32_ne_bytes(positions) {
+                // as soon as we know that this word had been to both readers, we don't need to
+                // iterate over the positions.
+                if added_to_exact && added_to_word_docids {
+                    break;
+                }
+                let (fid, _) = relative_from_absolute_position(position);
+                if exact_attributes.contains(&fid) && !added_to_exact {
+                    exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
+                    added_to_exact = true;
+                } else if !added_to_word_docids {
+                    word_docids_sorter.insert(word_bytes, &value_buffer)?;
+                    added_to_word_docids = true;
+                }
+            }
+        }
     }

-    sorter_into_reader(word_docids_sorter, indexer)
+    Ok((
+        sorter_into_reader(word_docids_sorter, indexer)?,
+        sorter_into_reader(exact_word_docids_sorter, indexer)?,
+    ))
 }
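
Aside (not part of the diff): the routing above depends on milli packing a field id into every encoded word position, which `relative_from_absolute_position` recovers. Assuming, purely for illustration, that the field id sits in the high 16 bits, the per-word decision reduces to:

    use std::collections::HashSet;

    /// Returns (goes to the exact sorter, goes to the tolerant sorter).
    fn split_destinations(positions: &[u32], exact_fields: &HashSet<u16>) -> (bool, bool) {
        let (mut to_exact, mut to_tolerant) = (false, false);
        for &position in positions {
            let field_id = (position >> 16) as u16; // assumed encoding, see lead-in
            if exact_fields.contains(&field_id) {
                to_exact = true;
            } else {
                to_tolerant = true;
            }
            if to_exact && to_tolerant {
                break; // both destinations decided, stop early like the diff does
            }
        }
        (to_exact, to_tolerant)
    }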
@@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
     as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
-    merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
+    merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader,
 };
 use super::{helpers, TypedChunk};
 use crate::{FieldId, Result};

@@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents(
     geo_field_id: Option<FieldId>,
     stop_words: Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
+    exact_attributes: HashSet<FieldId>,
 ) -> Result<()> {
     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
         .par_bridge()

@@ -66,7 +67,7 @@ pub(crate) fn data_from_obkv_documents(
         (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
     ) = result?;

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),

@@ -76,7 +77,7 @@ pub(crate) fn data_from_obkv_documents(
         "word-pair-proximity-docids",
     );

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),

@@ -86,17 +87,20 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-wordcount-docids",
     );

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
-        extract_word_docids,
+        move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
         merge_roaring_bitmaps,
-        TypedChunk::WordDocids,
+        |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
+            word_docids_reader,
+            exact_word_docids_reader,
+        },
         "word-docids",
     );

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),

@@ -106,7 +110,7 @@ pub(crate) fn data_from_obkv_documents(
         "word-position-docids",
     );

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_fid_facet_strings_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),

@@ -116,7 +120,7 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-facet-string-docids",
     );

-    spawn_extraction_task(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_fid_facet_numbers_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),

@@ -133,7 +137,7 @@ pub(crate) fn data_from_obkv_documents(
 /// Generated grenad chunks are merged using the merge_fn.
 /// The result of merged chunks is serialized as TypedChunk using the serialize_fn
 /// and sent into lmdb_writer_sx.
-fn spawn_extraction_task<FE, FS>(
+fn spawn_extraction_task<FE, FS, M>(
     chunks: Vec<grenad::Reader<CursorClonableMmap>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,

@@ -142,19 +146,21 @@ fn spawn_extraction_task<FE, FS>(
     serialize_fn: FS,
     name: &'static str,
 ) where
-    FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>>
+    FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
         + Sync
         + Send
         + 'static,
-    FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static,
+    FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
+    M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
+    M::Output: Send,
 {
     rayon::spawn(move || {
-        let chunks: Result<Vec<_>> =
+        let chunks: Result<M> =
             chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
         rayon::spawn(move || match chunks {
             Ok(chunks) => {
                 debug!("merge {} database", name);
-                let reader = merge_readers(chunks, merge_fn, indexer);
+                let reader = chunks.merge(merge_fn, &indexer);
                 let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
             }
             Err(e) => {
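
Aside (not part of the diff): the new `M` parameter works because `collect()` over an iterator of `Result` short-circuits on the first error and otherwise assembles the target collection, whether that is `Vec<Reader>` or `Vec<(Reader, Reader)>`; rayon's `FromParallelIterator` follows the same contract. Plain-iterator sketch of the idea:

    fn collect_pairs(chunks: Vec<u32>) -> Result<Vec<(u32, u32)>, String> {
        chunks
            .into_iter()
            .map(|c| if c == 0 { Err("empty chunk".to_string()) } else { Ok((c, c * 2)) })
            .collect() // Result<Vec<_>, _> out of an iterator of Result<_, _>
    }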
@@ -78,26 +78,63 @@ pub unsafe fn as_cloneable_grenad(
     Ok(reader)
 }

-pub fn merge_readers<R: io::Read + io::Seek>(
-    readers: Vec<grenad::Reader<R>>,
-    merge_fn: MergeFn,
-    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
-    let mut merger_builder = grenad::MergerBuilder::new(merge_fn);
-    for reader in readers {
-        merger_builder.push(reader.into_cursor()?);
-    }
-
-    let merger = merger_builder.build();
+pub trait MergeableReader
+where
+    Self: Sized,
+{
+    type Output;
+
+    fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
+}
+
+impl MergeableReader for Vec<grenad::Reader<File>> {
+    type Output = grenad::Reader<File>;
+
+    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
+        let mut merger = MergerBuilder::new(merge_fn);
+        self.into_iter().try_for_each(|r| merger.push(r))?;
+        merger.finish(params)
+    }
+}
+
+impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
+    type Output = (grenad::Reader<File>, grenad::Reader<File>);
+
+    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
+        let mut m1 = MergerBuilder::new(merge_fn);
+        let mut m2 = MergerBuilder::new(merge_fn);
+        for (r1, r2) in self.into_iter() {
+            m1.push(r1)?;
+            m2.push(r2)?;
+        }
+        Ok((m1.finish(params)?, m2.finish(params)?))
+    }
+}
+
+struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
+
+impl<R: io::Read + io::Seek> MergerBuilder<R> {
+    fn new(merge_fn: MergeFn) -> Self {
+        Self(grenad::MergerBuilder::new(merge_fn))
+    }
+
+    fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
+        self.0.push(reader.into_cursor()?);
+        Ok(())
+    }
+
+    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
+        let merger = self.0.build();
         let mut writer = create_writer(
-            indexer.chunk_compression_type,
-            indexer.chunk_compression_level,
+            params.chunk_compression_type,
+            params.chunk_compression_level,
             tempfile::tempfile()?,
         );
         merger.write_into_stream_writer(&mut writer)?;

         Ok(writer_into_reader(writer)?)
     }
+}

 #[derive(Debug, Clone, Copy)]
 pub struct GrenadParameters {

@@ -240,3 +277,8 @@ pub fn sorter_into_lmdb_database(
     debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
+
+/// Used when trying to merge readers, but you don't actually care about the values.
+pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    Ok(Cow::Owned(Vec::new()))
+}
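
Aside (not part of the diff): the shape of `MergeableReader` is "output follows input": merging a `Vec<T>` yields one `T`, merging a `Vec<(T, T)>` yields a pair, so `spawn_extraction_task` picks the shape with a single type parameter. Toy model with integers standing in for grenad readers:

    trait Mergeable: Sized {
        type Output;
        fn merge_all(self) -> Self::Output;
    }

    impl Mergeable for Vec<u32> {
        type Output = u32;
        fn merge_all(self) -> u32 {
            self.into_iter().sum() // stand-in for the real grenad merge
        }
    }

    impl Mergeable for Vec<(u32, u32)> {
        type Output = (u32, u32);
        fn merge_all(self) -> (u32, u32) {
            self.into_iter().fold((0, 0), |(a, b), (x, y)| (a + x, b + y))
        }
    }

    fn main() {
        assert_eq!(vec![1, 2, 3].merge_all(), 6);
        assert_eq!(vec![(1, 10), (2, 20)].merge_all(), (3, 30));
    }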
@@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto};
 pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers,
-    sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
-    GrenadParameters,
+    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
+    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database,
+    writer_into_reader, GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,

@@ -4,11 +4,13 @@ mod transform;
 mod typed_chunk;

 use std::collections::HashSet;
-use std::io::{Read, Seek};
+use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};

 use crossbeam_channel::{Receiver, Sender};
+use heed::types::Str;
+use heed::Database;
 use log::debug;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};

@@ -28,7 +30,7 @@ use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result};
+use crate::{Index, Result, RoaringBitmapCodec};

 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;

@@ -226,6 +228,7 @@ where
        };

        let stop_words = self.index.stop_words(self.wtxn)?;
+        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;

        // Run extraction pipeline in parallel.
        pool.install(|| {

@@ -255,6 +258,7 @@ where
                geo_field_id,
                stop_words,
                self.indexer_config.max_positions_per_attributes,
+                exact_attributes,
            )
        });

@@ -282,6 +286,7 @@ where
        let mut word_pair_proximity_docids = None;
        let mut word_position_docids = None;
        let mut word_docids = None;
+        let mut exact_word_docids = None;

        let mut databases_seen = 0;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {

@@ -291,10 +296,13 @@ where

        for result in lmdb_writer_rx {
            let typed_chunk = match result? {
-                TypedChunk::WordDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
                    word_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids(chunk)
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
                }
                TypedChunk::WordPairProximityDocids(chunk) => {
                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };

@@ -346,6 +354,7 @@ where

        self.execute_prefix_databases(
            word_docids,
+            exact_word_docids,
            word_pair_proximity_docids,
            word_position_docids,
        )?;

@@ -357,6 +366,7 @@ where
    pub fn execute_prefix_databases(
        self,
        word_docids: Option<grenad::Reader<CursorClonableMmap>>,
+        exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
        word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
        word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
    ) -> Result<()>

@@ -425,14 +435,25 @@ where
        });

        if let Some(word_docids) = word_docids {
-            // Run the word prefix docids update operation.
-            let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
-            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
-            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
-            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
-            builder.max_memory = self.indexer_config.max_memory;
-            builder.execute(
+            execute_word_prefix_docids(
+                self.wtxn,
                word_docids,
+                self.index.word_docids.clone(),
+                self.index.word_prefix_docids.clone(),
+                &self.indexer_config,
+                &new_prefix_fst_words,
+                &common_prefix_fst_words,
+                &del_prefix_fst_words,
+            )?;
+        }
+
+        if let Some(exact_word_docids) = exact_word_docids {
+            execute_word_prefix_docids(
+                self.wtxn,
+                exact_word_docids,
+                self.index.exact_word_docids.clone(),
+                self.index.exact_word_prefix_docids.clone(),
+                &self.indexer_config,
                &new_prefix_fst_words,
                &common_prefix_fst_words,
                &del_prefix_fst_words,

@@ -497,6 +518,32 @@ where
    }
 }

+/// Run the word prefix docids update operation.
+fn execute_word_prefix_docids(
+    txn: &mut heed::RwTxn,
+    reader: grenad::Reader<Cursor<ClonableMmap>>,
+    word_docids_db: Database<Str, RoaringBitmapCodec>,
+    word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
+    indexer_config: &IndexerConfig,
+    new_prefix_fst_words: &[String],
+    common_prefix_fst_words: &[&[String]],
+    del_prefix_fst_words: &HashSet<Vec<u8>>,
+) -> Result<()> {
+    let cursor = reader.into_cursor()?;
+    let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
+    builder.chunk_compression_type = indexer_config.chunk_compression_type;
+    builder.chunk_compression_level = indexer_config.chunk_compression_level;
+    builder.max_nb_chunks = indexer_config.max_nb_chunks;
+    builder.max_memory = indexer_config.max_memory;
+    builder.execute(
+        cursor,
+        &new_prefix_fst_words,
+        &common_prefix_fst_words,
+        &del_prefix_fst_words,
+    )?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
     use std::io::Cursor;
@ -3,14 +3,16 @@ use std::convert::TryInto;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::{BytesDecode, RwTxn};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
|
self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap,
|
||||||
CursorClonableMmap,
|
valid_lmdb_key, CursorClonableMmap,
|
||||||
};
|
};
|
||||||
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::{
|
use crate::{
|
||||||
@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
|
|||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdWordcountDocids(grenad::Reader<File>),
|
FieldIdWordcountDocids(grenad::Reader<File>),
|
||||||
NewDocumentsIds(RoaringBitmap),
|
NewDocumentsIds(RoaringBitmap),
|
||||||
WordDocids(grenad::Reader<File>),
|
WordDocids {
|
||||||
|
word_docids_reader: grenad::Reader<File>,
|
||||||
|
exact_word_docids_reader: grenad::Reader<File>,
|
||||||
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<File>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
@@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
         TypedChunk::NewDocumentsIds(documents_ids) => {
             return Ok((documents_ids, is_merged_database))
         }
-        TypedChunk::WordDocids(word_docids_iter) => {
-            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
+        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
             append_entries_into_database(
                 word_docids_iter.clone(),
                 &index.word_docids,

@@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 merge_roaring_bitmaps,
             )?;

+            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
+            append_entries_into_database(
+                exact_word_docids_iter.clone(),
+                &index.exact_word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
             // create fst from word docids
-            let mut builder = fst::SetBuilder::memory();
-            let mut cursor = word_docids_iter.into_cursor()?;
-            while let Some((word, _value)) = cursor.move_on_next()? {
-                // This is a lexicographically ordered word position
-                // we use the key to construct the words fst.
-                builder.insert(word)?;
-            }
-            let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
+            let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
             let db_fst = index.words_fst(wtxn)?;

             // merge new fst with database fst

@@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok((RoaringBitmap::new(), is_merged_database))
 }

+fn merge_word_docids_reader_into_fst(
+    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
+    merger_builder.push(word_docids_iter.into_cursor()?);
+    merger_builder.push(exact_word_docids_iter.into_cursor()?);
+    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+    let mut builder = fst::SetBuilder::memory();
+
+    while let Some((k, _)) = iter.next()? {
+        builder.insert(k)?;
+    }
+
+    Ok(builder.into_set())
+}
+
 fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
     let new_value = RoaringBitmap::deserialize_from(new_value)?;
     let db_value = RoaringBitmap::deserialize_from(db_value)?;
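With exact words stored in their own database, the words FST can no longer be built from `word_docids` alone; `merge_word_docids_reader_into_fst` streams both readers through a merger so every indexed word, exact or not, lands in the FST. A rough, self-contained illustration of that union step using the `fst` crate directly, assuming two already-sorted word lists in place of the grenad readers:

use fst::Streamer;

// Union two sorted word lists into a single fst::Set, mirroring the idea of
// merging the regular and exact word keys into one words FST.
fn words_fst(words: &[&str], exact_words: &[&str]) -> Result<fst::Set<Vec<u8>>, fst::Error> {
    // `from_iter` requires each input to be lexicographically sorted.
    let set = fst::Set::from_iter(words.iter().copied())?;
    let exact_set = fst::Set::from_iter(exact_words.iter().copied())?;

    let mut builder = fst::SetBuilder::memory();
    // The union stream yields every key from both sets once, in sorted order.
    let mut union = set.op().add(&exact_set).union();
    while let Some(key) = union.next() {
        builder.insert(key)?;
    }
    Ok(builder.into_set())
}

fn main() {
    let fst = words_fst(&["hello", "world"], &["goodbye", "hello"]).unwrap();
    assert_eq!(fst.len(), 3); // "goodbye", "hello", "world"
}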
@@ -93,6 +93,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
     min_word_len_two_typos: Setting<u8>,
     min_word_len_one_typo: Setting<u8>,
     exact_words: Setting<BTreeSet<String>>,
+    /// Attributes on which typo tolerance is disabled.
+    exact_attributes: Setting<HashSet<String>>,
 }

 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {

@@ -117,6 +119,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             exact_words: Setting::NotSet,
             min_word_len_two_typos: Setting::Reset,
             min_word_len_one_typo: Setting::Reset,
+            exact_attributes: Setting::Reset,
             indexer_config,
         }
     }

@@ -226,6 +229,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.exact_words = Setting::Reset;
     }

+    pub fn set_exact_attributes(&mut self, attrs: HashSet<String>) {
+        self.exact_attributes = Setting::Set(attrs);
+    }
+
+    pub fn reset_exact_attributes(&mut self) {
+        self.exact_attributes = Setting::Reset;
+    }
+
     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
     where
         F: Fn(UpdateIndexingStep) + Sync,

@@ -411,6 +422,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         }
     }

+    fn update_exact_attributes(&mut self) -> Result<bool> {
+        match self.exact_attributes {
+            Setting::Set(ref attrs) => {
+                let attrs = attrs.iter().map(String::as_str).collect::<Vec<_>>();
+                self.index.put_exact_attributes(&mut self.wtxn, &attrs)?;
+                Ok(true)
+            }
+            Setting::Reset => {
+                self.index.delete_exact_attributes(&mut self.wtxn)?;
+                Ok(true)
+            }
+            Setting::NotSet => Ok(false),
+        }
+    }
+
     fn update_filterable(&mut self) -> Result<()> {
         match self.filterable_fields {
             Setting::Set(ref fields) => {

@@ -579,8 +605,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         let stop_words_updated = self.update_stop_words()?;
         let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
+        let exact_attributes_updated = self.update_exact_attributes()?;

-        if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated {
+        if stop_words_updated
+            || faceted_updated
+            || synonyms_updated
+            || searchable_updated
+            || exact_attributes_updated
+        {
             self.reindex(&progress_callback, old_fields_ids_map)?;
         }
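`update_exact_attributes` follows the same three-way protocol as the other settings: `Set` persists the new attribute list, `Reset` deletes the stored one, and `NotSet` leaves the index untouched and reports that no reindex is needed. A simplified, self-contained sketch of that dispatch, with hypothetical stand-ins for the real `Setting` enum and index storage:

enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

// Returns `true` when the stored value changed and a reindex is required.
fn apply(setting: Setting<Vec<String>>, stored: &mut Option<Vec<String>>) -> bool {
    match setting {
        Setting::Set(attrs) => {
            *stored = Some(attrs);
            true
        }
        Setting::Reset => {
            *stored = None;
            true
        }
        Setting::NotSet => false,
    }
}

fn main() {
    let mut stored = None;
    assert!(apply(Setting::Set(vec!["description".to_string()]), &mut stored));
    assert!(!apply(Setting::NotSet, &mut stored));
    assert_eq!(stored, Some(vec!["description".to_string()]));
}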
@@ -1,16 +1,18 @@
 use std::collections::{HashMap, HashSet};

 use grenad::CompressionType;
-use heed::types::ByteSlice;
+use heed::types::{ByteSlice, Str};
+use heed::Database;

 use crate::update::index_documents::{
     create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn,
 };
-use crate::{Index, Result};
+use crate::{Result, RoaringBitmapCodec};

 pub struct WordPrefixDocids<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
+    word_docids: Database<Str, RoaringBitmapCodec>,
+    word_prefix_docids: Database<Str, RoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,

@@ -20,11 +22,13 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
 impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
+        word_docids: Database<Str, RoaringBitmapCodec>,
+        word_prefix_docids: Database<Str, RoaringBitmapCodec>,
     ) -> WordPrefixDocids<'t, 'u, 'i> {
         WordPrefixDocids {
             wtxn,
-            index,
+            word_docids,
+            word_prefix_docids,
             chunk_compression_type: CompressionType::None,
             chunk_compression_level: None,
             max_nb_chunks: None,

@@ -35,7 +39,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     #[logging_timer::time("WordPrefixDocids::{}")]
     pub fn execute(
         self,
-        new_word_docids: grenad::Reader<CursorClonableMmap>,
+        mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,

@@ -51,7 +55,6 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         );

         if !common_prefix_fst_words.is_empty() {
-            let mut new_word_docids_iter = new_word_docids.into_cursor()?;
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
             while let Some((word, data)) = new_word_docids_iter.move_on_next()? {

@@ -84,7 +87,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         }

         // We fetch the docids associated to the newly added word prefix fst only.
-        let db = self.index.word_docids.remap_data_type::<ByteSlice>();
+        let db = self.word_docids.remap_data_type::<ByteSlice>();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {

@@ -94,7 +97,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         }

         // We remove all the entries that are no more required in this word prefix docids database.
-        let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data();
+        let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data();
         while let Some((prefix, _)) = iter.next().transpose()? {
             if del_prefix_fst_words.contains(prefix.as_bytes()) {
                 unsafe { iter.del_current()? };

@@ -106,7 +109,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // We finally write the word prefix docids into the LMDB database.
         sorter_into_lmdb_database(
             self.wtxn,
-            *self.index.word_prefix_docids.as_polymorph(),
+            *self.word_prefix_docids.as_polymorph(),
             prefix_docids_sorter,
             merge_roaring_bitmaps,
         )?;
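The `WordPrefixDocids` changes narrow the struct's dependencies from the whole `Index` to the two database handles it actually reads and writes, which is what lets `execute_word_prefix_docids` run the same logic against either the regular or the exact pair. A simplified sketch of that dependency-narrowing, with hypothetical in-memory maps standing in for the LMDB databases:

use std::collections::HashMap;

// The updater only knows about the pair of maps it reads and writes,
// not about the index that owns them.
struct PrefixUpdater<'a> {
    source: &'a HashMap<String, u64>,
    target: &'a mut HashMap<String, u64>,
}

impl<'a> PrefixUpdater<'a> {
    fn new(source: &'a HashMap<String, u64>, target: &'a mut HashMap<String, u64>) -> Self {
        PrefixUpdater { source, target }
    }

    // Accumulate the docid counts of every word sharing a 3-byte prefix.
    fn execute(self) {
        for (word, count) in self.source {
            let prefix = word.chars().take(3).collect::<String>();
            *self.target.entry(prefix).or_insert(0) += count;
        }
    }
}

fn main() {
    let word_docids = HashMap::from([("hello".to_string(), 2), ("help".to_string(), 1)]);
    let mut word_prefix_docids = HashMap::new();

    // Callers pick which pair of databases the updater operates on.
    PrefixUpdater::new(&word_docids, &mut word_prefix_docids).execute();
    assert_eq!(word_prefix_docids["hel"], 3);
}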
@@ -373,7 +373,7 @@ fn criteria_mixup() {
 fn criteria_ascdesc() {
     let path = tempfile::tempdir().unwrap();
     let mut options = EnvOpenOptions::new();
-    options.map_size(10 * 1024 * 1024); // 10 MB
+    options.map_size(12 * 1024 * 1024); // 12 MB
     let index = Index::new(options, &path).unwrap();

     let mut wtxn = index.write_txn().unwrap();

@@ -170,3 +170,41 @@ fn test_typo_disabled_on_word() {
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
 }
+
+#[test]
+fn test_disable_typo_on_attribute() {
+    let criteria = [Typo];
+    let index = super::setup_search_index_with_criteria(&criteria);
+
+    // basic typo search with default typo settings
+    {
+        let txn = index.read_txn().unwrap();
+
+        let mut search = Search::new(&txn, &index);
+        // typo in `antebel(l)um`
+        search.query("antebelum");
+        search.limit(10);
+        search.authorize_typos(true);
+        search.optional_words(true);
+
+        let result = search.execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+    }
+
+    let mut txn = index.write_txn().unwrap();
+
+    let config = IndexerConfig::default();
+    let mut builder = Settings::new(&mut txn, &index, &config);
+    // disable typos on `description`
+    builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect());
+    builder.execute(|_| ()).unwrap();
+
+    let mut search = Search::new(&txn, &index);
+    search.query("antebelum");
+    search.limit(10);
+    search.authorize_typos(true);
+    search.optional_words(true);
+
+    let result = search.execute().unwrap();
+    assert_eq!(result.documents_ids.len(), 0);
+}