mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
add field id word count database
This commit is contained in:
parent
2f5e61bacb
commit
4ddf008be2
22
milli/src/heed_codec/field_id_word_count_codec.rs
Normal file
22
milli/src/heed_codec/field_id_word_count_codec.rs
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
use std::{borrow::Cow, convert::TryInto};
|
||||||
|
|
||||||
|
use crate::FieldId;
|
||||||
|
|
||||||
|
/// Heed codec for `(FieldId, u8)` keys, i.e. a field id paired with a word count.
///
/// The pair is stored as exactly two bytes: the field id followed by the word count.
pub struct FieldIdWordCountCodec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
|
||||||
|
type DItem = (FieldId, u8);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
|
||||||
|
Some((field_id, word_count))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
|
||||||
|
type EItem = (FieldId, u8);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
Some(Cow::Owned(vec![*field_id, *word_count]))
|
||||||
|
}
|
||||||
|
}
|
@ -4,6 +4,7 @@ mod roaring_bitmap;
|
|||||||
mod roaring_bitmap_length;
|
mod roaring_bitmap_length;
|
||||||
mod str_level_position_codec;
|
mod str_level_position_codec;
|
||||||
mod str_str_u8_codec;
|
mod str_str_u8_codec;
|
||||||
|
mod field_id_word_count_codec;
|
||||||
pub mod facet;
|
pub mod facet;
|
||||||
|
|
||||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
@ -12,3 +13,4 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
|
|||||||
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||||
pub use self::str_level_position_codec::StrLevelPositionCodec;
|
pub use self::str_level_position_codec::StrLevelPositionCodec;
|
||||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||||
|
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||||
|
@ -13,6 +13,7 @@ use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
|
|||||||
use crate::{
|
use crate::{
|
||||||
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||||
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
|
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
|
||||||
|
FieldIdWordCountCodec,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||||
@ -63,6 +64,8 @@ pub struct Index {
|
|||||||
|
|
||||||
/// Maps the word, level and position range with the docids that corresponds to it.
|
/// Maps the word, level and position range with the docids that corresponds to it.
|
||||||
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the field id and the word count with the docids that correspond to it.
|
||||||
|
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
|
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
|
||||||
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
@ -82,7 +85,7 @@ pub struct Index {
|
|||||||
|
|
||||||
impl Index {
|
impl Index {
|
||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
||||||
options.max_dbs(13);
|
options.max_dbs(14);
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
let main = env.create_poly_database(Some("main"))?;
|
let main = env.create_poly_database(Some("main"))?;
|
||||||
@ -92,6 +95,7 @@ impl Index {
|
|||||||
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
||||||
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
|
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
|
||||||
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
|
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
|
||||||
|
let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?;
|
||||||
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
|
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
|
||||||
let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
|
let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
|
||||||
let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
|
let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
|
||||||
@ -111,6 +115,7 @@ impl Index {
|
|||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
word_prefix_level_position_docids,
|
word_prefix_level_position_docids,
|
||||||
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
|
@ -23,7 +23,7 @@ use serde_json::{Map, Value};
|
|||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||||
pub use self::fields_ids_map::FieldsIdsMap;
|
pub use self::fields_ids_map::FieldsIdsMap;
|
||||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
|
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
|
||||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||||
pub use self::index::Index;
|
pub use self::index::Index;
|
||||||
|
@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
field_id_word_count_docids,
|
||||||
word_prefix_level_position_docids,
|
word_prefix_level_position_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
@ -62,6 +63,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_level_position_docids.clear(self.wtxn)?;
|
word_level_position_docids.clear(self.wtxn)?;
|
||||||
|
field_id_word_count_docids.clear(self.wtxn)?;
|
||||||
word_prefix_level_position_docids.clear(self.wtxn)?;
|
word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||||
facet_id_f64_docids.clear(self.wtxn)?;
|
facet_id_f64_docids.clear(self.wtxn)?;
|
||||||
facet_id_string_docids.clear(self.wtxn)?;
|
facet_id_string_docids.clear(self.wtxn)?;
|
||||||
@ -117,6 +119,7 @@ mod tests {
|
|||||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
|
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||||
|
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
||||||
|
@ -86,6 +86,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
|
field_id_word_count_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
word_prefix_level_position_docids,
|
word_prefix_level_position_docids,
|
||||||
@ -316,6 +317,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
drop(iter);
|
drop(iter);
|
||||||
|
|
||||||
|
// Remove the documents ids from field id word count database.
|
||||||
|
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
|
||||||
|
while let Some((key, mut docids)) = iter.next().transpose()? {
|
||||||
|
let previous_len = docids.len();
|
||||||
|
docids.difference_with(&self.documents_ids);
|
||||||
|
if docids.is_empty() {
|
||||||
|
iter.del_current()?;
|
||||||
|
} else if docids.len() != previous_len {
|
||||||
|
iter.put_current(&key, &docids)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(iter);
|
||||||
|
|
||||||
// We delete the documents ids that are under the facet field id values.
|
// We delete the documents ids that are under the facet field id values.
|
||||||
remove_docids_from_facet_field_id_value_docids(
|
remove_docids_from_facet_field_id_value_docids(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
|
@ -60,6 +60,10 @@ pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> an
|
|||||||
cbo_roaring_bitmap_merge(values)
|
cbo_roaring_bitmap_merge(values)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
|
cbo_roaring_bitmap_merge(values)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
cbo_roaring_bitmap_merge(values)
|
cbo_roaring_bitmap_merge(values)
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ pub use self::merge_function::{
|
|||||||
docid_word_positions_merge, documents_merge,
|
docid_word_positions_merge, documents_merge,
|
||||||
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
|
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
|
||||||
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
||||||
|
field_id_word_count_docids_merge,
|
||||||
};
|
};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
|
|
||||||
@ -412,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
Main,
|
Main,
|
||||||
WordDocids,
|
WordDocids,
|
||||||
WordLevel0PositionDocids,
|
WordLevel0PositionDocids,
|
||||||
|
FieldIdWordCountDocids,
|
||||||
FacetLevel0NumbersDocids,
|
FacetLevel0NumbersDocids,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -476,6 +478,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
||||||
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
||||||
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
|
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
|
||||||
|
let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len());
|
||||||
let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
|
let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
|
||||||
let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
|
let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
|
||||||
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
|
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
|
||||||
@ -488,6 +491,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
words_pairs_proximities_docids,
|
words_pairs_proximities_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
field_id_word_count_docids,
|
||||||
facet_field_numbers_docids,
|
facet_field_numbers_docids,
|
||||||
facet_field_strings_docids,
|
facet_field_strings_docids,
|
||||||
field_id_docid_facet_numbers,
|
field_id_docid_facet_numbers,
|
||||||
@ -499,6 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions_readers.push(docid_word_positions);
|
docid_word_positions_readers.push(docid_word_positions);
|
||||||
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
||||||
word_level_position_docids_readers.push(word_level_position_docids);
|
word_level_position_docids_readers.push(word_level_position_docids);
|
||||||
|
field_id_word_count_docids_readers.push(field_id_word_count_docids);
|
||||||
facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
|
facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
|
||||||
facet_field_strings_docids_readers.push(facet_field_strings_docids);
|
facet_field_strings_docids_readers.push(facet_field_strings_docids);
|
||||||
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
|
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
|
||||||
@ -536,6 +541,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
word_level_position_docids_readers,
|
word_level_position_docids_readers,
|
||||||
word_level_position_docids_merge,
|
word_level_position_docids_merge,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
DatabaseType::FieldIdWordCountDocids,
|
||||||
|
field_id_word_count_docids_readers,
|
||||||
|
field_id_word_count_docids_merge,
|
||||||
|
),
|
||||||
]
|
]
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.for_each(|(dbtype, readers, merge)| {
|
.for_each(|(dbtype, readers, merge)| {
|
||||||
@ -595,7 +605,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
||||||
|
|
||||||
let mut database_count = 0;
|
let mut database_count = 0;
|
||||||
let total_databases = 10;
|
let total_databases = 11;
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
databases_seen: 0,
|
databases_seen: 0,
|
||||||
@ -727,6 +737,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
write_method,
|
write_method,
|
||||||
)?;
|
)?;
|
||||||
},
|
},
|
||||||
|
DatabaseType::FieldIdWordCountDocids => {
|
||||||
|
debug!("Writing the field id word count docids into LMDB on disk...");
|
||||||
|
let db = *self.index.field_id_word_count_docids.as_polymorph();
|
||||||
|
write_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
db,
|
||||||
|
content,
|
||||||
|
field_id_word_count_docids_merge,
|
||||||
|
write_method,
|
||||||
|
)?;
|
||||||
|
},
|
||||||
DatabaseType::WordLevel0PositionDocids => {
|
DatabaseType::WordLevel0PositionDocids => {
|
||||||
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
||||||
let db = *self.index.word_level_position_docids.as_polymorph();
|
let db = *self.index.word_level_position_docids.as_polymorph();
|
||||||
|
@ -29,7 +29,7 @@ use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
|||||||
use super::merge_function::{
|
use super::merge_function::{
|
||||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||||
word_level_position_docids_merge, facet_field_value_docids_merge,
|
word_level_position_docids_merge, facet_field_value_docids_merge,
|
||||||
field_id_docid_facet_values_merge,
|
field_id_docid_facet_values_merge, field_id_word_count_docids_merge,
|
||||||
};
|
};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||||
@ -44,6 +44,7 @@ pub struct Readers {
|
|||||||
pub docid_word_positions: Reader<FileFuse>,
|
pub docid_word_positions: Reader<FileFuse>,
|
||||||
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
||||||
pub word_level_position_docids: Reader<FileFuse>,
|
pub word_level_position_docids: Reader<FileFuse>,
|
||||||
|
pub field_id_word_count_docids: Reader<FileFuse>,
|
||||||
pub facet_field_numbers_docids: Reader<FileFuse>,
|
pub facet_field_numbers_docids: Reader<FileFuse>,
|
||||||
pub facet_field_strings_docids: Reader<FileFuse>,
|
pub facet_field_strings_docids: Reader<FileFuse>,
|
||||||
pub field_id_docid_facet_numbers: Reader<FileFuse>,
|
pub field_id_docid_facet_numbers: Reader<FileFuse>,
|
||||||
@ -58,6 +59,7 @@ pub struct Store<'s, A> {
|
|||||||
// Caches
|
// Caches
|
||||||
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||||
word_docids_limit: usize,
|
word_docids_limit: usize,
|
||||||
|
field_id_word_count_docids: HashMap<(u8, u8), RoaringBitmap>,
|
||||||
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||||
words_pairs_proximities_docids_limit: usize,
|
words_pairs_proximities_docids_limit: usize,
|
||||||
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
||||||
@ -72,6 +74,7 @@ pub struct Store<'s, A> {
|
|||||||
word_docids_sorter: Sorter<MergeFn>,
|
word_docids_sorter: Sorter<MergeFn>,
|
||||||
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
||||||
word_level_position_docids_sorter: Sorter<MergeFn>,
|
word_level_position_docids_sorter: Sorter<MergeFn>,
|
||||||
|
field_id_word_count_docids_sorter: Sorter<MergeFn>,
|
||||||
facet_field_numbers_docids_sorter: Sorter<MergeFn>,
|
facet_field_numbers_docids_sorter: Sorter<MergeFn>,
|
||||||
facet_field_strings_docids_sorter: Sorter<MergeFn>,
|
facet_field_strings_docids_sorter: Sorter<MergeFn>,
|
||||||
field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
|
field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
|
||||||
@ -132,6 +135,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
max_nb_chunks,
|
max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
let field_id_word_count_docids_sorter = create_sorter(
|
||||||
|
field_id_word_count_docids_merge,
|
||||||
|
chunk_compression_type,
|
||||||
|
chunk_compression_level,
|
||||||
|
chunk_fusing_shrink_size,
|
||||||
|
max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
);
|
||||||
let facet_field_numbers_docids_sorter = create_sorter(
|
let facet_field_numbers_docids_sorter = create_sorter(
|
||||||
facet_field_value_docids_merge,
|
facet_field_value_docids_merge,
|
||||||
chunk_compression_type,
|
chunk_compression_type,
|
||||||
@ -184,6 +195,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
faceted_fields,
|
faceted_fields,
|
||||||
// Caches
|
// Caches
|
||||||
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||||
|
field_id_word_count_docids: HashMap::new(),
|
||||||
word_docids_limit: linked_hash_map_size,
|
word_docids_limit: linked_hash_map_size,
|
||||||
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||||
words_pairs_proximities_docids_limit: linked_hash_map_size,
|
words_pairs_proximities_docids_limit: linked_hash_map_size,
|
||||||
@ -199,6 +211,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
word_docids_sorter,
|
word_docids_sorter,
|
||||||
words_pairs_proximities_docids_sorter,
|
words_pairs_proximities_docids_sorter,
|
||||||
word_level_position_docids_sorter,
|
word_level_position_docids_sorter,
|
||||||
|
field_id_word_count_docids_sorter,
|
||||||
facet_field_numbers_docids_sorter,
|
facet_field_numbers_docids_sorter,
|
||||||
facet_field_strings_docids_sorter,
|
facet_field_strings_docids_sorter,
|
||||||
field_id_docid_facet_numbers_sorter,
|
field_id_docid_facet_numbers_sorter,
|
||||||
@ -620,10 +633,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
let analyzed = self.analyzer.analyze(&content);
|
let analyzed = self.analyzer.analyze(&content);
|
||||||
let tokens = process_tokens(analyzed.tokens());
|
let tokens = process_tokens(analyzed.tokens());
|
||||||
|
|
||||||
|
let mut last_pos = None;
|
||||||
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
||||||
|
last_pos = Some(pos);
|
||||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||||
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
|
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
|
||||||
|
let key = (attr, last_pos as u8 + 1);
|
||||||
|
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -683,6 +703,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
word_docids_wtr.insert(word, val)?;
|
word_docids_wtr.insert(word, val)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut docids_buffer = Vec::new();
|
||||||
|
for ((fid, count), docids) in self.field_id_word_count_docids {
|
||||||
|
docids_buffer.clear();
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?;
|
||||||
|
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
let fst = builder.into_set();
|
let fst = builder.into_set();
|
||||||
self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
|
self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
|
||||||
|
|
||||||
@ -695,6 +722,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
||||||
|
|
||||||
|
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
|
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
|
||||||
|
|
||||||
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
|
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
|
||||||
|
|
||||||
@ -711,6 +741,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||||
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||||
|
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||||
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||||
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||||
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||||
@ -724,6 +755,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
words_pairs_proximities_docids,
|
words_pairs_proximities_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
field_id_word_count_docids,
|
||||||
facet_field_numbers_docids,
|
facet_field_numbers_docids,
|
||||||
facet_field_strings_docids,
|
facet_field_strings_docids,
|
||||||
field_id_docid_facet_numbers,
|
field_id_docid_facet_numbers,
|
||||||
|
Loading…
Reference in New Issue
Block a user