Use the word pair proximity and fid word count docids extractors

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
Clément Renault 2024-09-05 10:56:22 +02:00
parent 0fc02f7351
commit 73ce67862d
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
9 changed files with 205 additions and 157 deletions

View File

@ -112,23 +112,27 @@ pub struct WriterOperation {
}
pub enum Database {
WordDocids,
ExactWordDocids,
WordFidDocids,
WordPositionDocids,
Documents,
ExactWordDocids,
FidWordCountDocids,
Main,
WordDocids,
WordFidDocids,
WordPairProximityDocids,
WordPositionDocids,
}
impl WriterOperation {
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
match self.database {
Database::Main => index.main.remap_types(),
Database::Documents => index.documents.remap_types(),
Database::WordDocids => index.word_docids.remap_types(),
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
Database::Main => index.main.remap_types(),
Database::WordDocids => index.word_docids.remap_types(),
Database::WordFidDocids => index.word_fid_docids.remap_types(),
Database::WordPositionDocids => index.word_position_docids.remap_types(),
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
}
}
@ -198,9 +202,11 @@ impl MainSender<'_> {
}
}
pub enum WordDocids {}
pub enum ExactWordDocids {}
pub enum FidWordCountDocids {}
pub enum WordDocids {}
pub enum WordFidDocids {}
pub enum WordPairProximityDocids {}
pub enum WordPositionDocids {}
pub trait DatabaseType {
@ -209,14 +215,6 @@ pub trait DatabaseType {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation;
}
impl DatabaseType for WordDocids {
const DATABASE: Database = Database::WordDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordDocidsMerger(merger)
}
}
impl DatabaseType for ExactWordDocids {
const DATABASE: Database = Database::ExactWordDocids;
@ -225,6 +223,22 @@ impl DatabaseType for ExactWordDocids {
}
}
impl DatabaseType for FidWordCountDocids {
const DATABASE: Database = Database::FidWordCountDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::FidWordCountDocidsMerger(merger)
}
}
impl DatabaseType for WordDocids {
const DATABASE: Database = Database::WordDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordDocidsMerger(merger)
}
}
impl DatabaseType for WordFidDocids {
const DATABASE: Database = Database::WordFidDocids;
@ -233,6 +247,14 @@ impl DatabaseType for WordFidDocids {
}
}
impl DatabaseType for WordPairProximityDocids {
const DATABASE: Database = Database::WordPairProximityDocids;
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
MergerOperation::WordPairProximityDocidsMerger(merger)
}
}
impl DatabaseType for WordPositionDocids {
const DATABASE: Database = Database::WordPositionDocids;
@ -293,12 +315,14 @@ impl DocumentsSender<'_> {
}
pub enum MergerOperation {
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
FidWordCountDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordPairProximityDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
WordPositionDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
DeleteDocument { docid: DocumentId },
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
}
pub struct MergerReceiver(Receiver<MergerOperation>);

View File

@ -1,5 +1,6 @@
use serde_json::Value;
use crate::update::new::extract::perm_json_p;
use crate::update::new::KvReaderFieldId;
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};

View File

@ -2,6 +2,7 @@ use std::collections::HashSet;
use std::fmt::Debug;
use std::fs::File;
pub use extract_facets::*;
use grenad::{MergeFunction, Merger};
use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelIterator};

View File

@ -2,16 +2,8 @@ mod cache;
mod faceted;
mod searchable;
pub use faceted::modname::{
FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor,
FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor,
FieldIdFacetStringDocidsExtractor,
};
pub use faceted::FacetedExtractor;
pub use searchable::{
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
WordPositionDocidsExtractor,
};
pub use faceted::*;
pub use searchable::*;
/// TODO move in permissive json pointer
pub mod perm_json_p {

View File

@ -1,15 +1,14 @@
use std::{borrow::Cow, collections::HashMap};
use std::borrow::Cow;
use std::collections::HashMap;
use heed::RoTxn;
use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
use crate::{
update::{
new::{extract::cache::CboCachedSorter, DocumentChange},
MergeDeladdCboRoaringBitmaps,
},
FieldId, GlobalFieldsIdsMap, Index, Result,
};
use super::tokenize_document::DocumentTokenizer;
use super::SearchableExtractor;
use crate::update::new::extract::cache::CboCachedSorter;
use crate::update::new::DocumentChange;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
const MAX_COUNTED_WORDS: usize = 30;
@ -22,12 +21,13 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
}
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
Ok(vec![])
}
/// This case is unreachable because extract_document_change has been reimplemented to not call this function.
fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
/// TODO remove this
unreachable!()
}
@ -45,7 +45,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
match document_change {
DocumentChange::Deletion(inner) => {
let mut fid_word_count = HashMap::new();
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
Ok(())
};
@ -66,10 +66,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
}
DocumentChange::Update(inner) => {
let mut fid_word_count = HashMap::new();
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
fid_word_count
.entry(fid)
.and_modify(|(current_count, new_count)| *current_count += 1)
.and_modify(|(current_count, _new_count)| *current_count += 1)
.or_insert((1, 0));
Ok(())
};
@ -79,10 +79,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
&mut token_fn,
)?;
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
fid_word_count
.entry(fid)
.and_modify(|(current_count, new_count)| *new_count += 1)
.and_modify(|(_current_count, new_count)| *new_count += 1)
.or_insert((0, 1));
Ok(())
};
@ -106,7 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
}
DocumentChange::Insertion(inner) => {
let mut fid_word_count = HashMap::new();
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
Ok(())
};

View File

@ -20,7 +20,7 @@ impl SearchableExtractor for WordDocidsExtractor {
}
/// TODO write in an external Vec buffer
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
Cow::Borrowed(word.as_bytes())
}
}
@ -49,7 +49,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor {
Ok(vec![])
}
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
Cow::Borrowed(word.as_bytes())
}
}
@ -67,7 +67,7 @@ impl SearchableExtractor for WordFidDocidsExtractor {
Ok(vec![])
}
fn build_key<'a>(field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
let mut key = Vec::new();
key.extend_from_slice(word.as_bytes());
key.push(0);
@ -89,7 +89,7 @@ impl SearchableExtractor for WordPositionDocidsExtractor {
Ok(vec![])
}
fn build_key<'a>(_field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]> {
fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> {
// position must be bucketed to reduce the number of keys in the DB.
let position = bucketed_position(position);
let mut key = Vec::new();

View File

@ -1,21 +1,17 @@
use std::{
borrow::Cow,
collections::{BTreeMap, VecDeque},
};
use std::borrow::Cow;
use std::collections::{BTreeMap, VecDeque};
use heed::RoTxn;
use itertools::merge_join_by;
use obkv::KvReader;
use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
use crate::{
proximity::{index_proximity, MAX_DISTANCE},
update::{
new::{extract::cache::CboCachedSorter, DocumentChange},
MergeDeladdCboRoaringBitmaps,
},
FieldId, GlobalFieldsIdsMap, Index, Result,
};
use super::tokenize_document::DocumentTokenizer;
use super::SearchableExtractor;
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::new::extract::cache::CboCachedSorter;
use crate::update::new::DocumentChange;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
pub struct WordPairProximityDocidsExtractor;
impl SearchableExtractor for WordPairProximityDocidsExtractor {
@ -26,12 +22,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
}
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
Ok(vec![])
}
/// This case is unreachable because extract_document_change has been reimplemented to not call this function.
fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
/// TODO remove this
unreachable!()
}
@ -159,7 +156,7 @@ fn process_document_tokens(
word_positions: &mut VecDeque<(String, u16)>,
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
) -> Result<()> {
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
let mut token_fn = |_fid: FieldId, pos: u16, word: &str| {
// drain the proximity window until the head word is considered close to the word we are inserting.
while word_positions
.front()

View File

@ -11,15 +11,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::ThreadPool;
pub use update_by_function::UpdateByFunction;
use super::channel::{
extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids,
WordFidDocids, WordPositionDocids,
};
use super::channel::*;
use super::document_change::DocumentChange;
use super::extract::{
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
WordPositionDocidsExtractor,
};
use super::extract::*;
use super::merger::merge_grenad_entries;
use super::StdResult;
use crate::documents::{
@ -71,79 +65,98 @@ where
// TODO manage the errors correctly
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.in_place_scope(|_s| {
let document_changes = document_changes.into_par_iter();
let document_changes = document_changes.into_par_iter();
// document but we need to create a function that collects and compresses documents.
document_changes.clone().into_par_iter().try_for_each(|result| {
match result? {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
extractor_sender.document_delete(docid).unwrap();
// document but we need to create a function that collects and compresses documents.
document_changes.clone().into_par_iter().try_for_each(|result| {
match result? {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
extractor_sender.document_delete(docid).unwrap();
}
DocumentChange::Update(update) => {
let docid = update.docid();
let content = update.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
}
}
DocumentChange::Update(update) => {
let docid = update.docid();
let content = update.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
}
}
Ok(()) as Result<_>
})?;
extract_and_send_docids::<WordDocidsExtractor, WordDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<WordFidDocidsExtractor, WordFidDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<
WordPairProximityDocidsExtractor,
WordPairProximityDocids,
>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
// TODO THIS IS TOO MUCH
// Extract fieldid docid facet number
// Extract fieldid docid facet string
// Extract facetid string fst
// Extract facetid normalized string strings
// TODO Inverted Indexes again
// Extract fieldid facet isempty docids
// Extract fieldid facet isnull docids
// Extract fieldid facet exists docids
// TODO This is the normal system
// Extract fieldid facet number docids
// Extract fieldid facet string docids
Ok(()) as Result<_>
})?;
extract_and_send_docids::<WordDocidsExtractor, WordDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<WordFidDocidsExtractor, WordFidDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
// TODO THIS IS TOO MUCH
// Extract fieldid docid facet number
// Extract fieldid docid facet string
// Extract facetid string fst
// Extract facetid normalized string strings
// TODO Inverted Indexes again
// Extract fieldid facet isempty docids
// Extract fieldid facet isnull docids
// Extract fieldid facet exists docids
// TODO This is the normal system
// Extract fieldid facet number docids
// Extract fieldid facet string docids
Ok(()) as Result<_>
})
})
})?;
// TODO manage the errors correctly

View File

@ -8,10 +8,7 @@ use memmap2::Mmap;
use roaring::RoaringBitmap;
use tempfile::tempfile;
use super::channel::{
DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids,
WordFidDocids, WordPositionDocids,
};
use super::channel::*;
use super::KvReaderDelAdd;
use crate::update::del_add::DelAdd;
use crate::update::new::channel::MergerOperation;
@ -30,6 +27,29 @@ pub fn merge_grenad_entries(
for merger_operation in receiver {
match merger_operation {
MergerOperation::ExactWordDocidsMerger(merger) => {
merge_and_send_docids(
merger,
/// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
index.exact_word_docids.remap_types(),
rtxn,
&mut buffer,
sender.docids::<ExactWordDocids>(),
|_key| Ok(()),
|_key| Ok(()),
)?;
}
MergerOperation::FidWordCountDocidsMerger(merger) => {
merge_and_send_docids(
merger,
index.field_id_word_count_docids.remap_types(),
rtxn,
&mut buffer,
sender.docids::<FidWordCountDocids>(),
|_key| Ok(()),
|_key| Ok(()),
)?;
}
MergerOperation::WordDocidsMerger(merger) => {
let mut add_words_fst = SetBuilder::new(tempfile()?)?;
let mut del_words_fst = SetBuilder::new(tempfile()?)?;
@ -49,17 +69,6 @@ pub fn merge_grenad_entries(
let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?;
sender.main().write_words_fst(mmap).unwrap();
}
MergerOperation::ExactWordDocidsMerger(merger) => {
merge_and_send_docids(
merger,
index.exact_word_docids.remap_types(),
rtxn,
&mut buffer,
sender.docids::<ExactWordDocids>(),
|_key| Ok(()),
|_key| Ok(()),
)?;
}
MergerOperation::WordFidDocidsMerger(merger) => {
merge_and_send_docids(
merger,
@ -71,6 +80,17 @@ pub fn merge_grenad_entries(
|_key| Ok(()),
)?;
}
MergerOperation::WordPairProximityDocidsMerger(merger) => {
merge_and_send_docids(
merger,
index.word_pair_proximity_docids.remap_types(),
rtxn,
&mut buffer,
sender.docids::<WordPairProximityDocids>(),
|_key| Ok(()),
|_key| Ok(()),
)?;
}
MergerOperation::WordPositionDocidsMerger(merger) => {
merge_and_send_docids(
merger,