202: Add field id word count docids database r=Kerollmops a=LegendreM

This PR introduces a new database, `field_id_word_count_docids`, that maps a field id and a word count to the list of document ids in which that attribute contains exactly that number of words. The relation is only stored for attributes that contain fewer than 11 words.
This database is used by the exactness criterion to determine whether a document has an attribute that contains exactly the query, with no additional words.
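As a rough illustration of the key layout this database relies on (a minimal standalone sketch with illustrative names, assuming the one-byte `FieldId` used by the codec added in this PR), each entry is keyed by a two-byte `(field_id, word_count)` pair whose value is the set of matching document ids:

```rust
// Sketch only: the real database pairs a FieldIdWordCountCodec key with a
// CboRoaringBitmapCodec value; this just shows the two-byte key shape.
type FieldId = u8;

fn encode_key(field_id: FieldId, word_count: u8) -> [u8; 2] {
    // field id first, then the exact number of words in the attribute (1..=10)
    [field_id, word_count]
}

fn decode_key(bytes: &[u8]) -> Option<(FieldId, u8)> {
    match bytes {
        &[field_id, word_count] => Some((field_id, word_count)),
        _ => None,
    }
}

fn main() {
    let key = encode_key(3, 7); // field 3, attributes made of exactly 7 words
    assert_eq!(decode_key(&key), Some((3, 7)));
}
```

Since indexed attributes longer than 10 words are skipped, the word count always fits in that single byte.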

Fix #165 
Fix #196
Related to [specifications:#36](https://github.com/meilisearch/specifications/pull/36)

Co-authored-by: many <maxime@meilisearch.com>
Co-authored-by: Many <legendre.maxime.isn@gmail.com>
bors[bot] 2021-06-01 16:09:48 +00:00 committed by GitHub
commit 270da98c46
12 changed files with 193 additions and 22 deletions

View File

@@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids";
const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids";
const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids";
const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s";
@@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME,
FACET_ID_F64_DOCIDS_DB_NAME,
FACET_ID_STRING_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_F64S_DB_NAME,
@@ -155,6 +157,17 @@ enum Command {
prefixes: Vec<String>,
},
/// Outputs a CSV with the documents ids along with
/// the field id and the word count where it appears.
FieldIdWordCountDocids {
/// Display the whole documents ids in detail.
#[structopt(long)]
full_display: bool,
/// The field name in the document.
field_name: String,
},
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
DocidsWordsPositions {
/// Display the whole positions in detail.
@@ -271,6 +284,9 @@ fn main() -> anyhow::Result<()> {
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
},
FieldIdWordCountDocids { full_display, field_name } => {
field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
},
DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
},
@@ -357,6 +373,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s: _,
@@ -372,6 +389,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids";
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
let field_id_word_count_docids_name = "field_id_word_count_docids";
let facet_id_f64_docids_name = "facet_id_f64_docids";
let facet_id_string_docids_name = "facet_id_string_docids";
let documents_name = "documents";
@@ -443,6 +461,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); }
}
for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((field_id, word_count), docids) = result?;
let key = format!("{} {}", field_id, word_count);
heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
if heap.len() > limit { heap.pop(); }
}
let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
@@ -676,6 +701,39 @@ fn word_prefixes_level_positions_docids(
Ok(wtr.flush()?)
}
fn field_id_word_count_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
field_name: String
) -> anyhow::Result<()>
{
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["field_name", "word_count", "docids"])?;
let field_id = index.fields_ids_map(rtxn)?
.id(&field_name)
.with_context(|| format!("unknown field name: {}", &field_name))?;
let left = (field_id, 0);
let right = (field_id, u8::max_value());
let iter = index.field_id_word_count_docids
.range(rtxn, &(left..=right))?;
for result in iter {
let ((_, word_count), docids) = result?;
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?;
}
Ok(wtr.flush()?)
}
fn docids_words_positions(
index: &Index,
rtxn: &heed::RoTxn,
@@ -870,6 +928,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
@@ -893,6 +952,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(),
FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(),
FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(),
@@ -999,6 +1059,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
let db = index.word_prefix_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => {
let db = index.field_id_word_count_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
unknown => anyhow::bail!("unknown database {:?}", unknown),
}
}

View File

@@ -0,0 +1,22 @@
use std::{borrow::Cow, convert::TryInto};
use crate::FieldId;
pub struct FieldIdWordCountCodec;
impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
type DItem = (FieldId, u8);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?;
Some((field_id, word_count))
}
}
impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
type EItem = (FieldId, u8);
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Owned(vec![*field_id, *word_count]))
}
}

View File

@@ -4,6 +4,7 @@ mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_level_position_codec;
mod str_str_u8_codec;
mod field_id_word_count_codec;
pub mod facet;
pub use self::beu32_str_codec::BEU32StrCodec;
@@ -12,3 +13,4 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;

View File

@@ -13,6 +13,7 @@ use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
FieldIdWordCountCodec,
};
use crate::heed_codec::facet::{
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
@@ -60,9 +61,11 @@ pub struct Index {
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that corresponds to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the field id and the word count with the docids that corresponds to it.
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
@@ -82,7 +85,7 @@ pub struct Index {
impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
-options.max_dbs(13);
+options.max_dbs(14);
let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?;
@@ -92,6 +95,7 @@ impl Index {
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?;
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
@@ -111,6 +115,7 @@ impl Index {
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,

View File

@@ -23,7 +23,7 @@ use serde_json::{Map, Value};
pub use self::criterion::{Criterion, default_criteria};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
-pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
+pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index;

View File

@@ -1,9 +1,10 @@
+use std::convert::TryFrom;
use std::mem::take;
+use std::ops::BitOr;
use log::debug;
use roaring::RoaringBitmap;
use itertools::Itertools;
-use std::ops::BitOr;
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::search::criteria::{
@@ -162,23 +163,24 @@ fn resolve_state(
use State::*;
match state {
ExactAttribute(mut allowed_candidates) => {
-let query_len = query.len() as u32;
let mut candidates = RoaringBitmap::new();
-let attributes_ids = ctx.searchable_fields_ids()?;
-for id in attributes_ids {
-if let Some(attribute_allowed_docids) = ctx.field_id_len_docids(id, query_len)? {
-let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
-attribute_candidates_array.push(attribute_allowed_docids);
-candidates |= intersection_of(attribute_candidates_array.iter().collect());
+if let Ok(query_len) = u8::try_from(query.len()) {
+let attributes_ids = ctx.searchable_fields_ids()?;
+for id in attributes_ids {
+if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? {
+let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
+attribute_candidates_array.push(attribute_allowed_docids);
+candidates |= intersection_of(attribute_candidates_array.iter().collect());
+}
}
+// only keep allowed candidates
+candidates &= &allowed_candidates;
+// remove current candidates from allowed candidates
+allowed_candidates -= &candidates;
}
-// only keep allowed candidates
-candidates &= &allowed_candidates;
-// remove current candidates from allowed candidates
-allowed_candidates -= &candidates;
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
},
AttributeStartsWith(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();

View File

@@ -78,7 +78,7 @@ pub trait Context<'c> {
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
fn searchable_fields_ids(&self) -> heed::Result<Vec<FieldId>>;
-fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result<Option<RoaringBitmap>>;
+fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>;
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error>;
}
pub struct CriteriaBuilder<'t> {
@@ -181,8 +181,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
}
}
-fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> {
-Ok(None)
+fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
+let key = (field_id, word_count);
+self.index.field_id_word_count_docids.get(self.rtxn, &key)
}
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error> {
@@ -488,7 +489,7 @@ pub mod test {
todo!()
}
-fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> {
+fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> {
todo!()
}
}

View File

@@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
field_id_word_count_docids,
word_prefix_level_position_docids,
facet_id_f64_docids,
facet_id_string_docids,
@@ -62,6 +63,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
word_level_position_docids.clear(self.wtxn)?;
field_id_word_count_docids.clear(self.wtxn)?;
word_prefix_level_position_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?;
facet_id_string_docids.clear(self.wtxn)?;
@@ -117,6 +119,7 @@ mod tests {
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());

View File

@@ -86,6 +86,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids,
field_id_word_count_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
@@ -316,6 +317,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// Remove the documents ids from the field id word count database.
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
while let Some((key, mut docids)) = iter.next().transpose()? {
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(&key, &docids)?;
}
}
drop(iter);
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_field_id_value_docids(
self.wtxn,

View File

@@ -60,6 +60,10 @@ pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> an
cbo_roaring_bitmap_merge(values)
}
pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}

View File

@@ -29,6 +29,7 @@ pub use self::merge_function::{
docid_word_positions_merge, documents_merge,
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
field_id_word_count_docids_merge,
};
pub use self::transform::{Transform, TransformOutput};
@@ -412,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
Main,
WordDocids,
WordLevel0PositionDocids,
FieldIdWordCountDocids,
FacetLevel0NumbersDocids,
}
@@ -476,6 +478,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
@@ -488,6 +491,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
field_id_word_count_docids,
facet_field_numbers_docids,
facet_field_strings_docids,
field_id_docid_facet_numbers,
@@ -499,6 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions_readers.push(docid_word_positions);
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
word_level_position_docids_readers.push(word_level_position_docids);
field_id_word_count_docids_readers.push(field_id_word_count_docids);
facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
facet_field_strings_docids_readers.push(facet_field_strings_docids);
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
@@ -536,6 +541,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
word_level_position_docids_readers,
word_level_position_docids_merge,
),
(
DatabaseType::FieldIdWordCountDocids,
field_id_word_count_docids_readers,
field_id_word_count_docids_merge,
),
]
.into_par_iter()
.for_each(|(dbtype, readers, merge)| {
@@ -595,7 +605,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
let mut database_count = 0;
-let total_databases = 10;
+let total_databases = 11;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: 0,
@@ -727,6 +737,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method,
)?;
},
DatabaseType::FieldIdWordCountDocids => {
debug!("Writing the field id word count docids into LMDB on disk...");
let db = *self.index.field_id_word_count_docids.as_polymorph();
write_into_lmdb_database(
self.wtxn,
db,
content,
field_id_word_count_docids_merge,
write_method,
)?;
},
DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph();

View File

@@ -29,7 +29,7 @@ use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
word_level_position_docids_merge, facet_field_value_docids_merge,
-field_id_docid_facet_values_merge,
+field_id_docid_facet_values_merge, field_id_word_count_docids_merge,
};
const LMDB_MAX_KEY_LENGTH: usize = 511;
@@ -44,6 +44,7 @@ pub struct Readers {
pub docid_word_positions: Reader<FileFuse>,
pub words_pairs_proximities_docids: Reader<FileFuse>,
pub word_level_position_docids: Reader<FileFuse>,
pub field_id_word_count_docids: Reader<FileFuse>,
pub facet_field_numbers_docids: Reader<FileFuse>,
pub facet_field_strings_docids: Reader<FileFuse>,
pub field_id_docid_facet_numbers: Reader<FileFuse>,
@@ -58,6 +59,7 @@ pub struct Store<'s, A> {
// Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
@@ -72,6 +74,7 @@ pub struct Store<'s, A> {
word_docids_sorter: Sorter<MergeFn>,
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
word_level_position_docids_sorter: Sorter<MergeFn>,
field_id_word_count_docids_sorter: Sorter<MergeFn>,
facet_field_numbers_docids_sorter: Sorter<MergeFn>,
facet_field_strings_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
@@ -132,6 +135,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks,
max_memory,
);
let field_id_word_count_docids_sorter = create_sorter(
field_id_word_count_docids_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
max_memory,
);
let facet_field_numbers_docids_sorter = create_sorter(
facet_field_value_docids_merge,
chunk_compression_type,
@@ -184,6 +195,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
faceted_fields,
// Caches
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
field_id_word_count_docids: HashMap::new(),
word_docids_limit: linked_hash_map_size,
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
words_pairs_proximities_docids_limit: linked_hash_map_size,
@@ -199,6 +211,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_sorter,
words_pairs_proximities_docids_sorter,
word_level_position_docids_sorter,
field_id_word_count_docids_sorter,
facet_field_numbers_docids_sorter,
facet_field_strings_docids_sorter,
field_id_docid_facet_numbers_sorter,
@@ -620,10 +633,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let analyzed = self.analyzer.analyze(&content);
let tokens = process_tokens(analyzed.tokens());
let mut last_pos = None;
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
last_pos = Some(pos);
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
}
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
let key = (attr, last_pos as u8 + 1);
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
}
}
}
}
@@ -683,6 +703,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_wtr.insert(word, val)?;
}
let mut docids_buffer = Vec::new();
for ((fid, count), docids) in self.field_id_word_count_docids {
docids_buffer.clear();
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?;
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?;
}
let fst = builder.into_set();
self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
@@ -695,6 +722,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
@@ -711,6 +741,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
@@ -724,6 +755,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
field_id_word_count_docids,
facet_field_numbers_docids,
facet_field_strings_docids,
field_id_docid_facet_numbers,