Improve document deletion by iterating over all the word pair positions

This commit is contained in:
Kerollmops 2020-10-27 18:50:09 +01:00
parent 3889d956d9
commit d6338af766
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -2,7 +2,6 @@ use std::borrow::Cow;
use std::convert::TryFrom; use std::convert::TryFrom;
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use itertools::Itertools;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{Index, BEU32}; use crate::{Index, BEU32};
@ -168,21 +167,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// We write the new words FST into the main database. // We write the new words FST into the main database.
self.index.put_words_fst(self.wtxn, &new_words_fst)?; self.index.put_words_fst(self.wtxn, &new_words_fst)?;
// We delete the documents ids that are under the pairs of words we found. // We delete the documents ids that are under the pairs of words,
// TODO We can maybe improve this by using the `compute_words_pair_proximities` // it is faster and uses no memory to iterate over all the word pairs than
// function instead of iterating over all the possible word pairs. // to compute the cartesian product of every word of the deleted documents.
for ((w1, _), (w2, _)) in words.iter().cartesian_product(&words) { let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?;
let start = &(w1.as_str(), w2.as_str(), 0); while let Some(result) = iter.next() {
let end = &(w1.as_str(), w2.as_str(), 7); let ((w1, w2, prox), mut docids) = result?;
let mut iter = word_pair_proximity_docids.range_mut(self.wtxn, &(start..=end))?; docids.difference_with(&documents_ids);
while let Some(result) = iter.next() { if docids.is_empty() {
let ((w1, w2, prox), mut docids) = result?; iter.del_current()?;
docids.difference_with(&documents_ids); } else {
if docids.is_empty() { iter.put_current(&(w1, w2, prox), &docids)?;
iter.del_current()?;
} else {
iter.put_current(&(w1, w2, prox), &docids)?;
}
} }
} }