diff --git a/infos/src/main.rs b/infos/src/main.rs
index ee2060d38..d6aa1f854 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
 const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
 const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
 const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
+const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids";
 const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids";
 const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids";
 const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s";
@@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[
     WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
     WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
     WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
+    FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME,
     FACET_ID_F64_DOCIDS_DB_NAME,
     FACET_ID_STRING_DOCIDS_DB_NAME,
     FIELD_ID_DOCID_FACET_F64S_DB_NAME,
@@ -155,6 +157,17 @@ enum Command {
         prefixes: Vec<String>,
     },
 
+    /// Outputs a CSV with the documents ids along with
+    /// the field id and the word count where they appear.
+    FieldIdWordCountDocids {
+        /// Display the whole documents ids in detail.
+        #[structopt(long)]
+        full_display: bool,
+
+        /// The field name in the document.
+        field_name: String,
+    },
+
     /// Outputs a CSV with the documents ids, words and the positions where this word appears.
     DocidsWordsPositions {
         /// Display the whole positions in detail.
@@ -271,6 +284,9 @@ fn main() -> anyhow::Result<()> {
         WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
             word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
         },
+        FieldIdWordCountDocids { full_display, field_name } => {
+            field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
+        },
         DocidsWordsPositions { full_display, internal_documents_ids } => {
             docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
         },
@@ -357,6 +373,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
         word_prefix_pair_proximity_docids,
         word_level_position_docids,
         word_prefix_level_position_docids,
+        field_id_word_count_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
         field_id_docid_facet_f64s: _,
@@ -372,6 +389,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
     let word_pair_proximity_docids_name = "word_pair_proximity_docids";
     let word_level_position_docids_name = "word_level_position_docids";
     let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
+    let field_id_word_count_docids_name = "field_id_word_count_docids";
     let facet_id_f64_docids_name = "facet_id_f64_docids";
     let facet_id_string_docids_name = "facet_id_string_docids";
     let documents_name = "documents";
@@ -443,6 +461,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
             if heap.len() > limit { heap.pop(); }
         }
 
+        for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+            let ((field_id, word_count), docids) = result?;
+            let key = format!("{} {}", field_id, word_count);
+            heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
+            if heap.len() > limit { heap.pop(); }
+        }
+
         let faceted_fields = index.faceted_fields_ids(rtxn)?;
         let fields_ids_map = index.fields_ids_map(rtxn)?;
 
@@ -676,6 +701,39 @@ fn word_prefixes_level_positions_docids(
     Ok(wtr.flush()?)
 }
 
+fn field_id_word_count_docids(
+    index: &Index,
+    rtxn: &heed::RoTxn,
+    debug: bool,
+    field_name: String
+) -> anyhow::Result<()>
+{
+    let stdout = io::stdout();
+    let mut wtr = csv::Writer::from_writer(stdout.lock());
+    wtr.write_record(&["field_name", "word_count", "docids"])?;
+
+    let field_id = index.fields_ids_map(rtxn)?
+        .id(&field_name)
+        .with_context(|| format!("unknown field name: {}", &field_name))?;
+
+    let left = (field_id, 0);
+    let right = (field_id, u8::max_value());
+    let iter = index.field_id_word_count_docids
+        .range(rtxn, &(left..=right))?;
+
+    for result in iter {
+        let ((_, word_count), docids) = result?;
+        let docids = if debug {
+            format!("{:?}", docids)
+        } else {
+            format!("{:?}", docids.iter().collect::<Vec<_>>())
+        };
+        wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?;
+    }
+
+    Ok(wtr.flush()?)
+}
+
 fn docids_words_positions(
     index: &Index,
     rtxn: &heed::RoTxn,
     debug: bool,
@@ -870,6 +928,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
         word_prefix_pair_proximity_docids,
         word_level_position_docids,
         word_prefix_level_position_docids,
+        field_id_word_count_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
         field_id_docid_facet_f64s,
@@ -893,6 +952,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
             WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
             WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
             WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
+            FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(),
             FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(),
             FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(),
             FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(),
@@ -999,6 +1059,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
             let db = index.word_prefix_pair_proximity_docids.as_polymorph();
             compute_stats::<StrStrU8Codec>(*db, rtxn, name)
         },
+        FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => {
+            let db = index.field_id_word_count_docids.as_polymorph();
+            compute_stats::<FieldIdWordCountCodec>(*db, rtxn, name)
+        },
         unknown => anyhow::bail!("unknown database {:?}", unknown),
     }
 }
diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs
new file mode 100644
index 000000000..5796e5020
--- /dev/null
+++ b/milli/src/heed_codec/field_id_word_count_codec.rs
@@ -0,0 +1,22 @@
+use std::{borrow::Cow, convert::TryInto};
+
+use crate::FieldId;
+
+pub struct FieldIdWordCountCodec;
+
+impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
+    type DItem = (FieldId, u8);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
+        Some((field_id, word_count))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
+    type EItem = (FieldId, u8);
+
+    fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
+        Some(Cow::Owned(vec![*field_id, *word_count]))
+    }
+}
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index cc73cdc65..65a06573e 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -4,6 +4,7 @@ mod roaring_bitmap;
 mod roaring_bitmap_length;
 mod str_level_position_codec;
 mod str_str_u8_codec;
+mod field_id_word_count_codec;
 pub mod facet;
 
 pub use self::beu32_str_codec::BEU32StrCodec;
@@ -12,3 +13,4 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
 pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
 pub use self::str_level_position_codec::StrLevelPositionCodec;
 pub use self::str_str_u8_codec::StrStrU8Codec;
+pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 14b153a2e..bd057a02a 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -13,6 +13,7 @@ use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
 use crate::{
     BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec,
     RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
+    FieldIdWordCountCodec,
 };
 use crate::heed_codec::facet::{
     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
@@ -60,9 +61,11 @@ pub struct Index {
     pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
     /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
     pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
-
+    /// Maps the word, level and position range with the docids that corresponds to it.
     pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
+    /// Maps the field id and the word count with the docids that corresponds to it.
+    pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
     /// Maps the level positions of a word prefix with all the docids where this prefix appears.
     pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
@@ -82,7 +85,7 @@ pub struct Index {
 
 impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
-        options.max_dbs(13);
+        options.max_dbs(14);
 
         let env = options.open(path)?;
         let main = env.create_poly_database(Some("main"))?;
@@ -92,6 +95,7 @@ impl Index {
         let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
         let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
         let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
+        let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?;
         let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
         let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
         let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
@@ -111,6 +115,7 @@ impl Index {
             word_prefix_pair_proximity_docids,
             word_level_position_docids,
             word_prefix_level_position_docids,
+            field_id_word_count_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
             field_id_docid_facet_f64s,
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 03169bce7..e4b58765e 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -23,7 +23,7 @@ use serde_json::{Map, Value};
 pub use self::criterion::{Criterion, default_criteria};
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
-pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
+pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
 pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs
index b1026ccc2..4d9e54f6e 100644
--- a/milli/src/search/criteria/exactness.rs
+++ b/milli/src/search/criteria/exactness.rs
@@ -1,9 +1,10 @@
+use std::convert::TryFrom;
 use std::mem::take;
+use std::ops::BitOr;
 
 use log::debug;
 use roaring::RoaringBitmap;
 use itertools::Itertools;
-use std::ops::BitOr;
 
 use crate::search::query_tree::{Operation, PrimitiveQueryPart};
 use crate::search::criteria::{
@@ -162,23 +163,24 @@ fn resolve_state(
     use State::*;
     match state {
         ExactAttribute(mut allowed_candidates) => {
-            let query_len = query.len() as u32;
             let mut candidates = RoaringBitmap::new();
-            let attributes_ids = ctx.searchable_fields_ids()?;
-            for id in attributes_ids {
-                if let Some(attribute_allowed_docids) = ctx.field_id_len_docids(id, query_len)? {
-                    let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
-                    attribute_candidates_array.push(attribute_allowed_docids);
-                    candidates |= intersection_of(attribute_candidates_array.iter().collect());
+            if let Ok(query_len) = u8::try_from(query.len()) {
+                let attributes_ids = ctx.searchable_fields_ids()?;
+                for id in attributes_ids {
+                    if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? {
+                        let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
+                        attribute_candidates_array.push(attribute_allowed_docids);
+                        candidates |= intersection_of(attribute_candidates_array.iter().collect());
+                    }
                 }
+
+                // only keep allowed candidates
+                candidates &= &allowed_candidates;
+                // remove current candidates from allowed candidates
+                allowed_candidates -= &candidates;
             }
-            // only keep allowed candidates
-            candidates &= &allowed_candidates;
-            // remove current candidates from allowed candidates
-            allowed_candidates -= &candidates;
 
             Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
-
         },
         AttributeStartsWith(mut allowed_candidates) => {
             let mut candidates = RoaringBitmap::new();
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index 99e4a4209..456d16e1a 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -78,7 +78,7 @@ pub trait Context<'c> {
     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn searchable_fields_ids(&self) -> heed::Result<Vec<FieldId>>;
-    fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result<Option<RoaringBitmap>>;
+    fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>;
     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error>;
 }
 pub struct CriteriaBuilder<'t> {
@@ -181,8 +181,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
         }
     }
 
-    fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> {
-        Ok(None)
+    fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
+        let key = (field_id, word_count);
+        self.index.field_id_word_count_docids.get(self.rtxn, &key)
     }
 
     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error> {
@@ -488,7 +489,7 @@ pub mod test {
             todo!()
         }
 
-        fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> {
+        fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
             todo!()
         }
     }
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index e4c1d35f8..f4c13e8f8 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,
             word_level_position_docids,
+            field_id_word_count_docids,
             word_prefix_level_position_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
@@ -62,6 +63,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_pair_proximity_docids.clear(self.wtxn)?;
         word_prefix_pair_proximity_docids.clear(self.wtxn)?;
         word_level_position_docids.clear(self.wtxn)?;
+        field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_level_position_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
@@ -117,6 +119,7 @@ mod tests {
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
         assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
+        assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index e93ff9a0a..f0f4788fb 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -86,6 +86,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
+            field_id_word_count_docids,
             word_prefix_pair_proximity_docids,
             word_level_position_docids,
             word_prefix_level_position_docids,
@@ -316,6 +317,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 
         drop(iter);
 
+        // Remove the documents ids from the field id word count database.
+        let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
+        while let Some((key, mut docids)) = iter.next().transpose()? {
+            let previous_len = docids.len();
+            docids.difference_with(&self.documents_ids);
+            if docids.is_empty() {
+                iter.del_current()?;
+            } else if docids.len() != previous_len {
+                iter.put_current(&key, &docids)?;
+            }
+        }
+
+        drop(iter);
+
         // We delete the documents ids that are under the facet field id values.
         remove_docids_from_facet_field_id_value_docids(
             self.wtxn,
diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs
index a6d008513..230116e99 100644
--- a/milli/src/update/index_documents/merge_function.rs
+++ b/milli/src/update/index_documents/merge_function.rs
@@ -60,6 +60,10 @@ pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> an
     cbo_roaring_bitmap_merge(values)
 }
 
+pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
+    cbo_roaring_bitmap_merge(values)
+}
+
 pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     cbo_roaring_bitmap_merge(values)
 }
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 064f4e6fd..71f281e98 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -29,6 +29,7 @@ pub use self::merge_function::{
     docid_word_positions_merge, documents_merge,
     word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
     facet_field_value_docids_merge, field_id_docid_facet_values_merge,
+    field_id_word_count_docids_merge,
 };
 pub use self::transform::{Transform, TransformOutput};
 
@@ -412,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             Main,
             WordDocids,
             WordLevel0PositionDocids,
+            FieldIdWordCountDocids,
             FacetLevel0NumbersDocids,
         }
 
@@ -476,6 +478,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
         let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
         let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
+        let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len());
        let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
         let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
         let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
@@ -488,6 +491,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 docid_word_positions,
                 words_pairs_proximities_docids,
                 word_level_position_docids,
+                field_id_word_count_docids,
                 facet_field_numbers_docids,
                 facet_field_strings_docids,
                 field_id_docid_facet_numbers,
@@ -499,6 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             docid_word_positions_readers.push(docid_word_positions);
             words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
             word_level_position_docids_readers.push(word_level_position_docids);
+            field_id_word_count_docids_readers.push(field_id_word_count_docids);
             facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
             facet_field_strings_docids_readers.push(facet_field_strings_docids);
             field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
@@ -536,6 +541,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 word_level_position_docids_readers,
                 word_level_position_docids_merge,
             ),
+            (
+                DatabaseType::FieldIdWordCountDocids,
+                field_id_word_count_docids_readers,
+                field_id_word_count_docids_merge,
+            ),
         ]
         .into_par_iter()
         .for_each(|(dbtype, readers, merge)| {
@@ -595,7 +605,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         self.index.put_documents_ids(self.wtxn, &documents_ids)?;
 
         let mut database_count = 0;
-        let total_databases = 10;
+        let total_databases = 11;
 
         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
             databases_seen: 0,
@@ -727,6 +737,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                         write_method,
                     )?;
                 },
+                DatabaseType::FieldIdWordCountDocids => {
+                    debug!("Writing the field id word count docids into LMDB on disk...");
+                    let db = *self.index.field_id_word_count_docids.as_polymorph();
+                    write_into_lmdb_database(
+                        self.wtxn,
+                        db,
+                        content,
+                        field_id_word_count_docids_merge,
+                        write_method,
+                    )?;
+                },
                 DatabaseType::WordLevel0PositionDocids => {
                     debug!("Writing the word level 0 positions docids into LMDB on disk...");
                     let db = *self.index.word_level_position_docids.as_polymorph();
diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 4f65d77e1..08050092e 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -29,7 +29,7 @@ use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
 use super::merge_function::{
     main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
     word_level_position_docids_merge, facet_field_value_docids_merge,
-    field_id_docid_facet_values_merge,
+    field_id_docid_facet_values_merge, field_id_word_count_docids_merge,
 };
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
@@ -44,6 +44,7 @@ pub struct Readers {
     pub docid_word_positions: Reader<FileFuse>,
     pub words_pairs_proximities_docids: Reader<FileFuse>,
     pub word_level_position_docids: Reader<FileFuse>,
+    pub field_id_word_count_docids: Reader<FileFuse>,
     pub facet_field_numbers_docids: Reader<FileFuse>,
     pub facet_field_strings_docids: Reader<FileFuse>,
     pub field_id_docid_facet_numbers: Reader<FileFuse>,
@@ -58,6 +59,7 @@ pub struct Store<'s, A> {
     // Caches
     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
     word_docids_limit: usize,
+    field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
     words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
     words_pairs_proximities_docids_limit: usize,
     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
@@ -72,6 +74,7 @@ pub struct Store<'s, A> {
     word_docids_sorter: Sorter<MergeFn>,
     words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
     word_level_position_docids_sorter: Sorter<MergeFn>,
+    field_id_word_count_docids_sorter: Sorter<MergeFn>,
     facet_field_numbers_docids_sorter: Sorter<MergeFn>,
     facet_field_strings_docids_sorter: Sorter<MergeFn>,
     field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
@@ -132,6 +135,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             max_nb_chunks,
             max_memory,
         );
+        let field_id_word_count_docids_sorter = create_sorter(
+            field_id_word_count_docids_merge,
+            chunk_compression_type,
+            chunk_compression_level,
+            chunk_fusing_shrink_size,
+            max_nb_chunks,
+            max_memory,
+        );
         let facet_field_numbers_docids_sorter = create_sorter(
             facet_field_value_docids_merge,
             chunk_compression_type,
@@ -184,6 +195,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             faceted_fields,
             // Caches
             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
+            field_id_word_count_docids: HashMap::new(),
             word_docids_limit: linked_hash_map_size,
             words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
             words_pairs_proximities_docids_limit: linked_hash_map_size,
@@ -199,6 +211,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             word_docids_sorter,
             words_pairs_proximities_docids_sorter,
             word_level_position_docids_sorter,
+            field_id_word_count_docids_sorter,
             facet_field_numbers_docids_sorter,
             facet_field_strings_docids_sorter,
             field_id_docid_facet_numbers_sorter,
@@ -620,10 +633,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
                 let analyzed = self.analyzer.analyze(&content);
                 let tokens = process_tokens(analyzed.tokens());
 
+                let mut last_pos = None;
                 for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
+                    last_pos = Some(pos);
                     let position = (attr as usize * MAX_POSITION + pos) as u32;
                     words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
                 }
+
+                if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
+                    let key = (attr, last_pos as u8 + 1);
+                    self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
+                }
             }
         }
     }
@@ -683,6 +703,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             word_docids_wtr.insert(word, val)?;
         }
 
+        let mut docids_buffer = Vec::new();
+        for ((fid, count), docids) in self.field_id_word_count_docids {
+            docids_buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?;
+            self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
+        }
+
         let fst = builder.into_set();
         self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
 
@@ -695,6 +722,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
 
+        let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
+        self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
+
         let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
 
@@ -711,6 +741,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
         let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
+        let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
         let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
         let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
         let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
@@ -724,6 +755,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             docid_word_positions,
             words_pairs_proximities_docids,
             word_level_position_docids,
+            field_id_word_count_docids,
             facet_field_numbers_docids,
             facet_field_strings_docids,
             field_id_docid_facet_numbers,
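
Note on the new database (a standalone sketch, not part of the patch): field-id-word-count-docids maps a (field id, word count) pair to the documents whose field holds exactly that many words. Per this diff, the store only records an entry when the last token position of a field is at most 10, and the exactness criterion converts the query length to a u8 with a checked try_from before looking it up, so attributes that match the whole query exactly can be found cheaply. The sketch below assumes FieldId is a u8, as it is at this point in the codebase, and mirrors the two-byte key layout of FieldIdWordCountCodec without the heed traits so it compiles and runs on its own.

// Sketch of the FieldIdWordCountCodec key layout: byte 0 is the field id,
// byte 1 is the word count of that field in a given document.
use std::convert::TryInto;

type FieldId = u8; // assumption: milli's FieldId is a u8 at this point

fn encode(field_id: FieldId, word_count: u8) -> Vec<u8> {
    // Same layout as the codec's bytes_encode: [field_id, word_count].
    vec![field_id, word_count]
}

fn decode(bytes: &[u8]) -> Option<(FieldId, u8)> {
    // Exactly two bytes are expected; anything else fails to decode.
    let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
    Some((field_id, word_count))
}

fn main() {
    // Key for documents whose field 2 contains exactly 7 words.
    let key = encode(2, 7);
    assert_eq!(decode(&key), Some((2, 7)));
    assert_eq!(decode(&[0xFF]), None);
    println!("round-trips: {:?}", decode(&key));
}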