Simplify the indexer record loop

Clément Renault 2020-09-22 10:33:25 +02:00
parent e5adfaade0
commit 15208c7d3d


@@ -193,21 +193,31 @@ impl Store {
     fn write_document(
         &mut self,
-        id: DocumentId,
-        iter: impl IntoIterator<Item=(String, RoaringBitmap)>,
+        document_id: DocumentId,
+        words_positions: &HashMap<String, RoaringBitmap>,
         record: &StringRecord,
     ) -> anyhow::Result<()>
     {
+        // We store document_id associated with all the words the record contains.
+        for (word, _) in words_positions {
+            self.insert_word_docid(word, document_id)?;
+        }
+
         let record = CsvStringRecordCodec::bytes_encode(record)
-            .with_context(|| format!("could not encode csv record"))?;
-        self.documents_ids.insert(id);
-        self.documents_sorter.insert(id.to_be_bytes(), record)?;
-        Self::write_docid_word_positions(&mut self.sorter, id, iter)?;
+            .with_context(|| format!("could not encode CSV record"))?;
+
+        self.documents_ids.insert(document_id);
+        self.documents_sorter.insert(document_id.to_be_bytes(), record)?;
+        Self::write_docid_word_positions(&mut self.sorter, document_id, words_positions)?;
+
         Ok(())
     }

-    fn write_docid_word_positions<I>(sorter: &mut Sorter<MergeFn>, id: DocumentId, iter: I) -> anyhow::Result<()>
-    where I: IntoIterator<Item=(String, RoaringBitmap)>
+    fn write_docid_word_positions(
+        sorter: &mut Sorter<MergeFn>,
+        id: DocumentId,
+        words_positions: &HashMap<String, RoaringBitmap>,
+    ) -> anyhow::Result<()>
     {
         // postings positions ids keys are all prefixed
         let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
@@ -216,7 +226,7 @@ impl Store {
         key.extend_from_slice(&id.to_be_bytes());
         let base_size = key.len();

-        for (word, positions) in iter {
+        for (word, positions) in words_positions {
             key.truncate(base_size);
             key.extend_from_slice(word.as_bytes());
             // We serialize the positions into a buffer.
@@ -278,8 +288,8 @@ impl Store {
         let mut document_id: usize = 0;
         let mut document = csv::StringRecord::new();
         let mut word_positions = HashMap::new();
+
         while rdr.read_record(&mut document)? {
-
             // We skip documents that must not be indexed by this thread.
             if document_id % num_threads == thread_index {
                 if document_id % ONE_MILLION == 0 {
@@ -293,13 +303,13 @@ impl Store {
                     for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
                         let word = token.to_lowercase();
                         let position = (attr * MAX_POSITION + pos) as u32;
-                        self.insert_word_docid(&word, document_id)?;
                         word_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
                     }
                 }

                 // We write the document in the documents store.
-                self.write_document(document_id, word_positions.drain(), &document)?;
+                self.write_document(document_id, &word_positions, &document)?;
+                word_positions.clear();
             }

             // Compute the document id of the next document.
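The loop changes mirror the new signatures: `insert_word_docid` moved out of the per-token loop into `write_document`, where it runs once per distinct word, and the `word_positions` map is handed over by reference and reset with `clear()` rather than emptied through a one-shot `drain()` iterator, which could not have been walked once for the word/docid pairs and again for the positions. A self-contained sketch of the reuse pattern, assuming the `roaring` crate; `records` and the `println!` body are placeholders for the CSV reader and the sorter writes:

use std::collections::HashMap;

use roaring::RoaringBitmap;

// Stand-in for `Store::write_document`: because the map is only borrowed,
// it can be walked here (once per distinct word) and reused by the caller.
fn write_document(document_id: u32, words_positions: &HashMap<String, RoaringBitmap>) {
    for (word, positions) in words_positions {
        let positions: Vec<u32> = positions.iter().collect();
        println!("doc {document_id}: {word:?} at {positions:?}");
    }
}

fn main() {
    // Placeholder records; the real code reads CSV documents in a loop.
    let records = [vec!["hello", "world"], vec!["hello", "rust"]];

    // One map reused across all records: `clear()` resets it after each
    // document, while the old `drain()` consumed the entries in the call.
    let mut word_positions: HashMap<String, RoaringBitmap> = HashMap::new();

    for (document_id, words) in records.iter().enumerate() {
        for (position, word) in words.iter().enumerate() {
            word_positions
                .entry(word.to_string())
                .or_insert_with(RoaringBitmap::new)
                .insert(position as u32);
        }

        write_document(document_id as u32, &word_positions);
        word_positions.clear();
    }
}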