From 71f59749dca59bec6119da76cef5d984864b43fb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 9 Dec 2024 15:44:06 +0100 Subject: [PATCH 1/2] Reduce union impact in merging --- crates/milli/src/update/new/merger.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 9728f99d6..9e87388a2 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -235,8 +235,12 @@ fn merge_cbo_bitmaps( (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), (Some(current), Some(del), add) => { + debug_assert!( + del.is_subset(&current), + "del is not a subset of current, which must be impossible." + ); let output = match add { - Some(add) => (&current - del) | add, + Some(add) => (&current - (&del - &add)) | (add - del), None => &current - del, }; if output.is_empty() { From 07f42e805712fde3087829d9400e767384de7a7f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 9 Dec 2024 15:45:12 +0100 Subject: [PATCH 2/2] Do not index a field count when no word is counted --- .../extract/searchable/extract_word_docids.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 06fb747c6..5e85eb1c8 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> { exact_word_docids: BalancedCaches<'extractor>, word_position_docids: BalancedCaches<'extractor>, fid_word_count_docids: BalancedCaches<'extractor>, - fid_word_count: HashMap<FieldId, (usize, usize)>, + fid_word_count: HashMap<FieldId, (Option<usize>, Option<usize>)>, current_docid: Option<DocumentId>, } @@ -85,8 +85,8 @@ impl<'extractor> 
WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(_current_count, new_count)| *new_count += 1) - .or_insert((0, 1)); + .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1) + .or_insert((None, Some(1))); self.current_docid = Some(docid); Ok(()) @@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(current_count, _new_count)| *current_count += 1) - .or_insert((1, 0)); + .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1) + .or_insert((Some(1), None)); self.current_docid = Some(docid); @@ -141,14 +141,18 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { - if current_count <= MAX_COUNTED_WORDS { + if let Some(current_count) = + current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(current_count as u8); self.fid_word_count_docids .insert_del_u32(buffer, self.current_docid.unwrap())?; } - if new_count <= MAX_COUNTED_WORDS { + if let Some(new_count) = + new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(new_count as u8);