mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-05 04:28:55 +01:00
Simplify the indexer record loop
This commit is contained in:
parent
e5adfaade0
commit
15208c7d3d
@ -193,21 +193,31 @@ impl Store {
|
|||||||
|
|
||||||
fn write_document(
|
fn write_document(
|
||||||
&mut self,
|
&mut self,
|
||||||
id: DocumentId,
|
document_id: DocumentId,
|
||||||
iter: impl IntoIterator<Item=(String, RoaringBitmap)>,
|
words_positions: &HashMap<String, RoaringBitmap>,
|
||||||
record: &StringRecord,
|
record: &StringRecord,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
{
|
{
|
||||||
|
// We store document_id associated with all the words the record contains.
|
||||||
|
for (word, _) in words_positions {
|
||||||
|
self.insert_word_docid(word, document_id)?;
|
||||||
|
}
|
||||||
|
|
||||||
let record = CsvStringRecordCodec::bytes_encode(record)
|
let record = CsvStringRecordCodec::bytes_encode(record)
|
||||||
.with_context(|| format!("could not encode csv record"))?;
|
.with_context(|| format!("could not encode CSV record"))?;
|
||||||
self.documents_ids.insert(id);
|
|
||||||
self.documents_sorter.insert(id.to_be_bytes(), record)?;
|
self.documents_ids.insert(document_id);
|
||||||
Self::write_docid_word_positions(&mut self.sorter, id, iter)?;
|
self.documents_sorter.insert(document_id.to_be_bytes(), record)?;
|
||||||
|
Self::write_docid_word_positions(&mut self.sorter, document_id, words_positions)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_docid_word_positions<I>(sorter: &mut Sorter<MergeFn>, id: DocumentId, iter: I) -> anyhow::Result<()>
|
fn write_docid_word_positions(
|
||||||
where I: IntoIterator<Item=(String, RoaringBitmap)>
|
sorter: &mut Sorter<MergeFn>,
|
||||||
|
id: DocumentId,
|
||||||
|
words_positions: &HashMap<String, RoaringBitmap>,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
{
|
{
|
||||||
// postings positions ids keys are all prefixed
|
// postings positions ids keys are all prefixed
|
||||||
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
||||||
@ -216,7 +226,7 @@ impl Store {
|
|||||||
key.extend_from_slice(&id.to_be_bytes());
|
key.extend_from_slice(&id.to_be_bytes());
|
||||||
let base_size = key.len();
|
let base_size = key.len();
|
||||||
|
|
||||||
for (word, positions) in iter {
|
for (word, positions) in words_positions {
|
||||||
key.truncate(base_size);
|
key.truncate(base_size);
|
||||||
key.extend_from_slice(word.as_bytes());
|
key.extend_from_slice(word.as_bytes());
|
||||||
// We serialize the positions into a buffer.
|
// We serialize the positions into a buffer.
|
||||||
@ -278,8 +288,8 @@ impl Store {
|
|||||||
let mut document_id: usize = 0;
|
let mut document_id: usize = 0;
|
||||||
let mut document = csv::StringRecord::new();
|
let mut document = csv::StringRecord::new();
|
||||||
let mut word_positions = HashMap::new();
|
let mut word_positions = HashMap::new();
|
||||||
while rdr.read_record(&mut document)? {
|
|
||||||
|
|
||||||
|
while rdr.read_record(&mut document)? {
|
||||||
// We skip documents that must not be indexed by this thread.
|
// We skip documents that must not be indexed by this thread.
|
||||||
if document_id % num_threads == thread_index {
|
if document_id % num_threads == thread_index {
|
||||||
if document_id % ONE_MILLION == 0 {
|
if document_id % ONE_MILLION == 0 {
|
||||||
@ -293,13 +303,13 @@ impl Store {
|
|||||||
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
|
||||||
let word = token.to_lowercase();
|
let word = token.to_lowercase();
|
||||||
let position = (attr * MAX_POSITION + pos) as u32;
|
let position = (attr * MAX_POSITION + pos) as u32;
|
||||||
self.insert_word_docid(&word, document_id)?;
|
|
||||||
word_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
|
word_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We write the document in the documents store.
|
// We write the document in the documents store.
|
||||||
self.write_document(document_id, word_positions.drain(), &document)?;
|
self.write_document(document_id, &word_positions, &document)?;
|
||||||
|
word_positions.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute the document id of the next document.
|
// Compute the document id of the next document.
|
||||||
|
Loading…
Reference in New Issue
Block a user