diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 3f4751185..3376cebb2 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -11,9 +11,7 @@ use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; -use crate::{ - CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, -}; +use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext}; /// A cache storing pointers to values in the LMDB databases. /// diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c3b2cf1a3..1fef922cd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -16,9 +16,7 @@ use crate::facet::FacetType; use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::Hnsw; -use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32, -}; +use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index a45d488e4..0c7c5cf46 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -12,9 +12,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; use crate::update::index_documents::MergeFn; -use crate::{ - absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, -}; +use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index d9fb72cc2..3df962585 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -5,11 +5,10 @@ use std::iter::FromIterator; use heed::BytesDecode; use obkv::KvReaderU16; -use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap, - sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader, + try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; @@ -47,7 +46,6 @@ pub fn extract_word_docids( max_memory.map(|x| x / 3), ); let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut words = BTreeSet::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -66,7 +64,6 @@ pub fn extract_word_docids( document_id, fid, &mut key_buffer, - &mut value_buffer, &mut words, &mut word_fid_docids_sorter, )?; @@ -124,7 +121,6 @@ fn words_into_sorter( document_id: DocumentId, fid: FieldId, key_buffer: &mut Vec, - value_buffer: &mut Vec, words: &mut BTreeSet>, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs deleted file mode 100644 index dd4d42431..000000000 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::error::SerializationError; -use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{relative_from_absolute_position, DocumentId, Result}; - -/// Extracts the word, field id, and the documents ids where this word appear at this field id. -#[logging_timer::time] -pub fn extract_word_fid_docids( - docid_word_positions: grenad::Reader, - indexer: GrenadParameters, -) -> Result>> { - puffin::profile_function!(); - - todo!("remove me"); - - let max_memory = indexer.max_memory_by_thread(); - - let mut word_fid_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut key_buffer = Vec::new(); - let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = DocumentId::from_be_bytes(document_id_bytes); - - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (fid, _) = relative_from_absolute_position(position); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } - } - - let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; - - Ok(word_fid_docids_reader) -} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 32ec6fe5c..164f95452 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -6,7 +6,6 @@ mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_vector_points; mod extract_word_docids; -mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_vector_points::extract_vector_points; use self::extract_word_docids::extract_word_docids; -use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, - GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -206,15 +204,6 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::WordPositionDocids, "word-position-docids", ); - // spawn_extraction_task::<_, _, Vec>>>( - // docid_word_positions_chunks, - // indexer, - // lmdb_writer_sx.clone(), - // extract_word_fid_docids, - // merge_cbo_roaring_bitmaps, - // TypedChunk::WordFidDocids, - // "word-fid-docids", - // ); spawn_extraction_task::<_, _, Vec>>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 5d111067a..90cfa0f60 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -11,6 +11,7 @@ use crate::Result; pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; +#[allow(unused)] pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { if values.len() == 1 { Ok(values[0].clone()) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index d59a3bc08..3dc9f8172 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -44,6 +44,7 @@ where Some((head, tail)) } +#[allow(unused)] pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 22e42937f..e4385de70 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -38,7 +38,7 @@ use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -434,11 +434,6 @@ where word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } - TypedChunk::WordFidDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordFidDocids(chunk) - } otherwise => otherwise, }; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index cf3194255..a94bcf581 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,7 +35,6 @@ pub(crate) enum TypedChunk { word_fid_docids_reader: grenad::Reader>, }, WordPositionDocids(grenad::Reader>), - WordFidDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), FieldIdFacetStringDocids(grenad::Reader>), FieldIdFacetNumberDocids(grenad::Reader>), @@ -78,9 +77,6 @@ impl TypedChunk { TypedChunk::WordPositionDocids(grenad) => { format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::WordFidDocids(grenad) => { - format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) - } TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } @@ -202,17 +198,6 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::WordFidDocids(word_fid_docids_iter) => { - append_entries_into_database( - word_fid_docids_iter, - &index.word_fid_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; - is_merged_database = true; - } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 980bab01a..8220aa777 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -8,7 +8,7 @@ use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Result}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>,