mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-25 20:57:35 +01:00
Move the words pairs proximities computation into the write_document function
This commit is contained in:
parent
ed05999f63
commit
25b2853b70
@ -130,6 +130,14 @@ fn create_writer(type_: CompressionType, level: Option<u32>, file: File) -> Writ
|
|||||||
builder.build(file)
|
builder.build(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Outputs a list of all pairs of words with the proximities between 1 and 7 inclusive.
|
||||||
|
///
|
||||||
|
/// This list is used by the engine to calculate the documents containing words that are
|
||||||
|
/// close to each other.
|
||||||
|
//
|
||||||
|
// TODO we currently store both words pairs (a,b) and (b,a) but we can maybe optimize
|
||||||
|
// that by only storing the lexicographically ordered pair and incrementing by one the pair
|
||||||
|
// that is not in the right order. This way we would avoid storing pairs in both orders.
|
||||||
fn compute_words_pair_proximities(
|
fn compute_words_pair_proximities(
|
||||||
word_positions: &HashMap<String, RoaringBitmap>,
|
word_positions: &HashMap<String, RoaringBitmap>,
|
||||||
) -> HashMap<(&str, &str), RoaringBitmap>
|
) -> HashMap<(&str, &str), RoaringBitmap>
|
||||||
@ -240,6 +248,10 @@ impl Store {
|
|||||||
record: &StringRecord,
|
record: &StringRecord,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
{
|
{
|
||||||
|
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
|
||||||
|
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
|
||||||
|
Self::write_words_pairs_proximities(&mut self.sorter, document_id, &words_pair_proximities)?;
|
||||||
|
|
||||||
// We store document_id associated with all the words the record contains.
|
// We store document_id associated with all the words the record contains.
|
||||||
for (word, _) in words_positions {
|
for (word, _) in words_positions {
|
||||||
self.insert_word_docid(word, document_id)?;
|
self.insert_word_docid(word, document_id)?;
|
||||||
@ -255,9 +267,9 @@ impl Store {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME We must store those pairs in an ArcCache to reduce the number of I/O operations,
|
// FIXME We must store those pairs in a cache to reduce the number of I/O operations,
|
||||||
// We must store the documents ids associated with the words pairs and proximities.
|
// We must store the documents ids associated with the words pairs and proximities.
|
||||||
fn write_words_proximities(
|
fn write_words_pairs_proximities(
|
||||||
sorter: &mut Sorter<MergeFn>,
|
sorter: &mut Sorter<MergeFn>,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>,
|
words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>,
|
||||||
@ -365,7 +377,7 @@ impl Store {
|
|||||||
let mut before = Instant::now();
|
let mut before = Instant::now();
|
||||||
let mut document_id: usize = 0;
|
let mut document_id: usize = 0;
|
||||||
let mut document = csv::StringRecord::new();
|
let mut document = csv::StringRecord::new();
|
||||||
let mut word_positions = HashMap::new();
|
let mut words_positions = HashMap::new();
|
||||||
|
|
||||||
while rdr.read_record(&mut document)? {
|
while rdr.read_record(&mut document)? {
|
||||||
// We skip documents that must not be indexed by this thread.
|
// We skip documents that must not be indexed by this thread.
|
||||||
@ -381,16 +393,13 @@ impl Store {
|
|||||||
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
||||||
let word = token.to_lowercase();
|
let word = token.to_lowercase();
|
||||||
let position = (attr * MAX_POSITION + pos) as u32;
|
let position = (attr * MAX_POSITION + pos) as u32;
|
||||||
word_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
|
words_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let words_pair_proximities = compute_words_pair_proximities(&word_positions);
|
|
||||||
Self::write_words_proximities(&mut self.sorter, document_id, &words_pair_proximities)?;
|
|
||||||
|
|
||||||
// We write the document in the documents store.
|
// We write the document in the documents store.
|
||||||
self.write_document(document_id, &word_positions, &document)?;
|
self.write_document(document_id, &words_positions, &document)?;
|
||||||
word_positions.clear();
|
words_positions.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute the document id of the next document.
|
// Compute the document id of the next document.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user