mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-09 22:48:54 +01:00
Mutualize tokenization
This commit is contained in:
parent
3848adf5a2
commit
39b5990f64
@ -38,7 +38,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
|||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => {
|
DocumentChange::Deletion(inner) => {
|
||||||
let mut fid_word_count = HashMap::new();
|
let mut fid_word_count = HashMap::new();
|
||||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||||
Ok(())
|
Ok(())
|
||||||
};
|
};
|
||||||
@ -58,7 +58,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
|||||||
}
|
}
|
||||||
DocumentChange::Update(inner) => {
|
DocumentChange::Update(inner) => {
|
||||||
let mut fid_word_count = HashMap::new();
|
let mut fid_word_count = HashMap::new();
|
||||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||||
fid_word_count
|
fid_word_count
|
||||||
.entry(fid)
|
.entry(fid)
|
||||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||||
@ -71,7 +71,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
|||||||
&mut token_fn,
|
&mut token_fn,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||||
fid_word_count
|
fid_word_count
|
||||||
.entry(fid)
|
.entry(fid)
|
||||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||||
@ -96,7 +96,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
|||||||
}
|
}
|
||||||
DocumentChange::Insertion(inner) => {
|
DocumentChange::Insertion(inner) => {
|
||||||
let mut fid_word_count = HashMap::new();
|
let mut fid_word_count = HashMap::new();
|
||||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||||
Ok(())
|
Ok(())
|
||||||
};
|
};
|
||||||
|
@ -1,17 +1,30 @@
|
|||||||
use std::borrow::Cow;
|
use std::collections::HashMap;
|
||||||
|
use std::{borrow::Cow, fs::File, num::NonZero};
|
||||||
|
|
||||||
|
use grenad::Merger;
|
||||||
|
use grenad::MergerBuilder;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
|
use rayon::iter::IntoParallelIterator;
|
||||||
|
use rayon::iter::ParallelIterator;
|
||||||
|
|
||||||
use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
|
use super::{
|
||||||
|
tokenize_document::{tokenizer_builder, DocumentTokenizer},
|
||||||
|
SearchableExtractor,
|
||||||
|
};
|
||||||
|
use crate::update::new::extract::perm_json_p::contained_in;
|
||||||
|
use crate::DocumentId;
|
||||||
use crate::{
|
use crate::{
|
||||||
bucketed_position,
|
bucketed_position,
|
||||||
update::{
|
update::{
|
||||||
new::{extract::cache::CboCachedSorter, DocumentChange},
|
create_sorter,
|
||||||
MergeDeladdCboRoaringBitmaps,
|
new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool},
|
||||||
|
GrenadParameters, MergeDeladdCboRoaringBitmaps,
|
||||||
},
|
},
|
||||||
FieldId, GlobalFieldsIdsMap, Index, Result,
|
FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const MAX_COUNTED_WORDS: usize = 30;
|
||||||
|
|
||||||
trait ProtoWordDocidsExtractor {
|
trait ProtoWordDocidsExtractor {
|
||||||
fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
|
fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
|
||||||
fn attributes_to_extract<'a>(
|
fn attributes_to_extract<'a>(
|
||||||
@ -36,7 +49,7 @@ where
|
|||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => {
|
DocumentChange::Deletion(inner) => {
|
||||||
let mut token_fn = |fid, pos: u16, word: &str| {
|
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
|
cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -47,7 +60,7 @@ where
|
|||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
DocumentChange::Update(inner) => {
|
DocumentChange::Update(inner) => {
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
|
cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -57,14 +70,14 @@ where
|
|||||||
&mut token_fn,
|
&mut token_fn,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
|
cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
}
|
}
|
||||||
DocumentChange::Insertion(inner) => {
|
DocumentChange::Insertion(inner) => {
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
|
cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -181,3 +194,435 @@ impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor {
|
|||||||
Cow::Owned(key)
|
Cow::Owned(key)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// V2
|
||||||
|
|
||||||
|
struct WordDocidsCachedSorters {
|
||||||
|
word_fid_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
exact_word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
word_position_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
fid_word_count_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
fid_word_count: HashMap<FieldId, (usize, usize)>,
|
||||||
|
current_docid: Option<DocumentId>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WordDocidsCachedSorters {
|
||||||
|
pub fn new(
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
max_memory: Option<usize>,
|
||||||
|
capacity: NonZero<usize>,
|
||||||
|
) -> Self {
|
||||||
|
let max_memory = max_memory.map(|max_memory| max_memory / 4);
|
||||||
|
|
||||||
|
let word_fid_docids = CboCachedSorter::new(
|
||||||
|
capacity,
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
let word_docids = CboCachedSorter::new(
|
||||||
|
capacity,
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
let exact_word_docids = CboCachedSorter::new(
|
||||||
|
capacity,
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
let word_position_docids = CboCachedSorter::new(
|
||||||
|
capacity,
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
let fid_word_count_docids = CboCachedSorter::new(
|
||||||
|
capacity,
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
word_fid_docids,
|
||||||
|
word_docids,
|
||||||
|
exact_word_docids,
|
||||||
|
word_position_docids,
|
||||||
|
fid_word_count_docids,
|
||||||
|
fid_word_count: HashMap::new(),
|
||||||
|
current_docid: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn insert_add_u32(
|
||||||
|
&mut self,
|
||||||
|
field_id: FieldId,
|
||||||
|
position: u16,
|
||||||
|
word: &str,
|
||||||
|
exact: bool,
|
||||||
|
docid: u32,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let key = word.as_bytes();
|
||||||
|
if exact {
|
||||||
|
self.exact_word_docids.insert_add_u32(key, docid)?;
|
||||||
|
} else {
|
||||||
|
self.word_docids.insert_add_u32(key, docid)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(word.as_bytes());
|
||||||
|
buffer.push(0);
|
||||||
|
buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
self.word_fid_docids.insert_add_u32(key, docid)?;
|
||||||
|
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(word.as_bytes());
|
||||||
|
buffer.push(0);
|
||||||
|
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
self.word_position_docids.insert_add_u32(buffer, docid)?;
|
||||||
|
|
||||||
|
if self.current_docid.map_or(false, |id| docid != id) {
|
||||||
|
self.flush_fid_word_count(buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.fid_word_count
|
||||||
|
.entry(field_id)
|
||||||
|
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||||
|
.or_insert((0, 1));
|
||||||
|
self.current_docid = Some(docid);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn insert_del_u32(
|
||||||
|
&mut self,
|
||||||
|
field_id: FieldId,
|
||||||
|
position: u16,
|
||||||
|
word: &str,
|
||||||
|
exact: bool,
|
||||||
|
docid: u32,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let key = word.as_bytes();
|
||||||
|
if exact {
|
||||||
|
self.exact_word_docids.insert_del_u32(key, docid)?;
|
||||||
|
} else {
|
||||||
|
self.word_docids.insert_del_u32(key, docid)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(word.as_bytes());
|
||||||
|
buffer.push(0);
|
||||||
|
buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
self.word_fid_docids.insert_del_u32(key, docid)?;
|
||||||
|
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(word.as_bytes());
|
||||||
|
buffer.push(0);
|
||||||
|
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
self.word_position_docids.insert_del_u32(buffer, docid)?;
|
||||||
|
|
||||||
|
if self.current_docid.map_or(false, |id| docid != id) {
|
||||||
|
self.flush_fid_word_count(buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.fid_word_count
|
||||||
|
.entry(field_id)
|
||||||
|
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||||
|
.or_insert((1, 0));
|
||||||
|
self.current_docid = Some(docid);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) -> Result<()> {
|
||||||
|
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
||||||
|
if current_count != new_count {
|
||||||
|
if current_count <= MAX_COUNTED_WORDS {
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
buffer.push(current_count as u8);
|
||||||
|
self.fid_word_count_docids
|
||||||
|
.insert_del_u32(buffer, self.current_docid.unwrap())?;
|
||||||
|
}
|
||||||
|
if new_count <= MAX_COUNTED_WORDS {
|
||||||
|
buffer.clear();
|
||||||
|
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
buffer.push(new_count as u8);
|
||||||
|
self.fid_word_count_docids
|
||||||
|
.insert_add_u32(buffer, self.current_docid.unwrap())?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct WordDocidsMergerBuilders {
|
||||||
|
word_fid_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
word_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
exact_word_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
word_position_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
fid_word_count_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct WordDocidsMergers {
|
||||||
|
pub word_fid_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
pub word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
pub exact_word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
pub word_position_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
pub fid_word_count_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WordDocidsMergerBuilders {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
|
||||||
|
word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
|
||||||
|
exact_word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
|
||||||
|
word_position_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
|
||||||
|
fid_word_count_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> {
|
||||||
|
let WordDocidsCachedSorters {
|
||||||
|
word_fid_docids,
|
||||||
|
word_docids,
|
||||||
|
exact_word_docids,
|
||||||
|
word_position_docids,
|
||||||
|
fid_word_count_docids,
|
||||||
|
fid_word_count: _,
|
||||||
|
current_docid: _,
|
||||||
|
} = other;
|
||||||
|
|
||||||
|
let sorter = word_fid_docids.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
self.word_fid_docids.extend(readers);
|
||||||
|
let sorter = word_docids.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
self.word_docids.extend(readers);
|
||||||
|
let sorter = exact_word_docids.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
self.exact_word_docids.extend(readers);
|
||||||
|
let sorter = word_position_docids.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
self.word_position_docids.extend(readers);
|
||||||
|
let sorter = fid_word_count_docids.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
self.fid_word_count_docids.extend(readers);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build(self) -> WordDocidsMergers {
|
||||||
|
WordDocidsMergers {
|
||||||
|
word_fid_docids: self.word_fid_docids.build(),
|
||||||
|
word_docids: self.word_docids.build(),
|
||||||
|
exact_word_docids: self.exact_word_docids.build(),
|
||||||
|
word_position_docids: self.word_position_docids.build(),
|
||||||
|
fid_word_count_docids: self.fid_word_count_docids.build(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Data-less type whose associated functions (see `run_extraction`) feed all the
/// word-related sorters from a single tokenization of each document change.
pub struct WordDocidsExtractors;
|
||||||
|
|
||||||
|
impl WordDocidsExtractors {
|
||||||
|
pub fn run_extraction(
|
||||||
|
index: &Index,
|
||||||
|
fields_ids_map: &GlobalFieldsIdsMap,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||||
|
) -> Result<WordDocidsMergers> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn()?;
|
||||||
|
let stop_words = index.stop_words(&rtxn)?;
|
||||||
|
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||||
|
let allowed_separators: Option<Vec<_>> =
|
||||||
|
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
|
let dictionary = index.dictionary(&rtxn)?;
|
||||||
|
let dictionary: Option<Vec<_>> =
|
||||||
|
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
|
let builder = tokenizer_builder(
|
||||||
|
stop_words.as_ref(),
|
||||||
|
allowed_separators.as_deref(),
|
||||||
|
dictionary.as_deref(),
|
||||||
|
);
|
||||||
|
let tokenizer = builder.into_tokenizer();
|
||||||
|
|
||||||
|
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||||
|
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
||||||
|
let localized_attributes_rules =
|
||||||
|
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||||
|
|
||||||
|
let document_tokenizer = DocumentTokenizer {
|
||||||
|
tokenizer: &tokenizer,
|
||||||
|
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||||
|
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||||
|
localized_attributes_rules: &localized_attributes_rules,
|
||||||
|
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||||
|
};
|
||||||
|
|
||||||
|
let context_pool = ItemsPool::new(|| {
|
||||||
|
Ok((
|
||||||
|
index.read_txn()?,
|
||||||
|
&document_tokenizer,
|
||||||
|
fields_ids_map.clone(),
|
||||||
|
WordDocidsCachedSorters::new(
|
||||||
|
indexer,
|
||||||
|
max_memory,
|
||||||
|
// TODO use a better value
|
||||||
|
200_000.try_into().unwrap(),
|
||||||
|
),
|
||||||
|
))
|
||||||
|
});
|
||||||
|
|
||||||
|
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||||
|
context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
|
||||||
|
Self::extract_document_change(
|
||||||
|
&*rtxn,
|
||||||
|
index,
|
||||||
|
document_tokenizer,
|
||||||
|
fields_ids_map,
|
||||||
|
cached_sorter,
|
||||||
|
document_change?,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut builder = WordDocidsMergerBuilders::new();
|
||||||
|
for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() {
|
||||||
|
builder.add_sorters(cache)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(builder.build())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_document_change(
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
document_tokenizer: &DocumentTokenizer,
|
||||||
|
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||||
|
cached_sorter: &mut WordDocidsCachedSorters,
|
||||||
|
document_change: DocumentChange,
|
||||||
|
) -> Result<()> {
|
||||||
|
let exact_attributes = index.exact_attributes(&rtxn)?;
|
||||||
|
let is_exact_attribute =
|
||||||
|
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
match document_change {
|
||||||
|
DocumentChange::Deletion(inner) => {
|
||||||
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
|
cached_sorter
|
||||||
|
.insert_del_u32(
|
||||||
|
fid,
|
||||||
|
pos,
|
||||||
|
word,
|
||||||
|
is_exact_attribute(fname),
|
||||||
|
inner.docid(),
|
||||||
|
&mut buffer,
|
||||||
|
)
|
||||||
|
.map_err(crate::Error::from)
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(
|
||||||
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut token_fn,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
DocumentChange::Update(inner) => {
|
||||||
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
|
cached_sorter
|
||||||
|
.insert_del_u32(
|
||||||
|
fid,
|
||||||
|
pos,
|
||||||
|
word,
|
||||||
|
is_exact_attribute(fname),
|
||||||
|
inner.docid(),
|
||||||
|
&mut buffer,
|
||||||
|
)
|
||||||
|
.map_err(crate::Error::from)
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(
|
||||||
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut token_fn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
|
cached_sorter
|
||||||
|
.insert_add_u32(
|
||||||
|
fid,
|
||||||
|
pos,
|
||||||
|
word,
|
||||||
|
is_exact_attribute(fname),
|
||||||
|
inner.docid(),
|
||||||
|
&mut buffer,
|
||||||
|
)
|
||||||
|
.map_err(crate::Error::from)
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
|
}
|
||||||
|
DocumentChange::Insertion(inner) => {
|
||||||
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
|
cached_sorter
|
||||||
|
.insert_add_u32(
|
||||||
|
fid,
|
||||||
|
pos,
|
||||||
|
word,
|
||||||
|
is_exact_attribute(fname),
|
||||||
|
inner.docid(),
|
||||||
|
&mut buffer,
|
||||||
|
)
|
||||||
|
.map_err(crate::Error::from)
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn attributes_to_extract<'a>(
|
||||||
|
rtxn: &'a RoTxn,
|
||||||
|
index: &'a Index,
|
||||||
|
) -> Result<Option<Vec<&'a str>>> {
|
||||||
|
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||||
|
Ok(vec![])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -149,7 +149,7 @@ fn process_document_tokens(
|
|||||||
word_positions: &mut VecDeque<(String, u16)>,
|
word_positions: &mut VecDeque<(String, u16)>,
|
||||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut token_fn = |_fid: FieldId, pos: u16, word: &str| {
|
let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
|
||||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||||
while word_positions
|
while word_positions
|
||||||
.front()
|
.front()
|
||||||
|
@ -7,8 +7,8 @@ use std::fs::File;
|
|||||||
|
|
||||||
pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor;
|
pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor;
|
||||||
pub use extract_word_docids::{
|
pub use extract_word_docids::{
|
||||||
ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
ExactWordDocidsExtractor, WordDocidsExtractor, WordDocidsExtractors, WordDocidsMergers,
|
||||||
WordPositionDocidsExtractor,
|
WordFidDocidsExtractor, WordPositionDocidsExtractor,
|
||||||
};
|
};
|
||||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
|
@ -26,7 +26,7 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
&self,
|
&self,
|
||||||
obkv: &KvReaderFieldId,
|
obkv: &KvReaderFieldId,
|
||||||
field_id_map: &mut GlobalFieldsIdsMap,
|
field_id_map: &mut GlobalFieldsIdsMap,
|
||||||
token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>,
|
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut field_position = HashMap::new();
|
let mut field_position = HashMap::new();
|
||||||
let mut field_name = String::new();
|
let mut field_name = String::new();
|
||||||
@ -56,7 +56,7 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
Value::Number(n) => {
|
Value::Number(n) => {
|
||||||
let token = n.to_string();
|
let token = n.to_string();
|
||||||
if let Ok(position) = (*position).try_into() {
|
if let Ok(position) = (*position).try_into() {
|
||||||
token_fn(field_id, position, token.as_str())?;
|
token_fn(name, field_id, position, token.as_str())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -80,7 +80,7 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
*position = index;
|
*position = index;
|
||||||
if let Ok(position) = (*position).try_into() {
|
if let Ok(position) = (*position).try_into() {
|
||||||
token_fn(field_id, position, token)?;
|
token_fn(name, field_id, position, token)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -235,7 +235,7 @@ mod test {
|
|||||||
|
|
||||||
let mut words = std::collections::BTreeMap::new();
|
let mut words = std::collections::BTreeMap::new();
|
||||||
document_tokenizer
|
document_tokenizer
|
||||||
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| {
|
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| {
|
||||||
words.insert([fid, pos], word.to_string());
|
words.insert([fid, pos], word.to_string());
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
|
@ -58,7 +58,7 @@ where
|
|||||||
{
|
{
|
||||||
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
||||||
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
||||||
let (extractor_sender, merger_receiver) = extractors_merger_channels(0);
|
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
|
||||||
|
|
||||||
let fields_ids_map_lock = RwLock::new(fields_ids_map);
|
let fields_ids_map_lock = RwLock::new(fields_ids_map);
|
||||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||||
@ -103,62 +103,56 @@ where
|
|||||||
{
|
{
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
extract_and_send_docids::<WordDocidsExtractor, WordDocids>(
|
|
||||||
index,
|
let WordDocidsMergers {
|
||||||
&global_fields_ids_map,
|
word_fid_docids,
|
||||||
grenad_parameters,
|
word_docids,
|
||||||
document_changes.clone(),
|
exact_word_docids,
|
||||||
&extractor_sender,
|
word_position_docids,
|
||||||
)?;
|
fid_word_count_docids,
|
||||||
|
} = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?;
|
||||||
|
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
||||||
|
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
||||||
|
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
||||||
|
extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
|
||||||
|
extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
// {
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_fid_docids");
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
|
||||||
let _entered = span.enter();
|
// let _entered = span.enter();
|
||||||
extract_and_send_docids::<WordFidDocidsExtractor, WordFidDocids>(
|
// extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
|
||||||
index,
|
// index,
|
||||||
&global_fields_ids_map,
|
// &global_fields_ids_map,
|
||||||
grenad_parameters,
|
// grenad_parameters,
|
||||||
document_changes.clone(),
|
// document_changes.clone(),
|
||||||
&extractor_sender,
|
// &extractor_sender,
|
||||||
)?;
|
// )?;
|
||||||
}
|
// }
|
||||||
|
|
||||||
{
|
// {
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
|
||||||
let _entered = span.enter();
|
// let _entered = span.enter();
|
||||||
extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
|
// extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
|
||||||
index,
|
// index,
|
||||||
&global_fields_ids_map,
|
// &global_fields_ids_map,
|
||||||
grenad_parameters,
|
// grenad_parameters,
|
||||||
document_changes.clone(),
|
// document_changes.clone(),
|
||||||
&extractor_sender,
|
// &extractor_sender,
|
||||||
)?;
|
// )?;
|
||||||
}
|
// }
|
||||||
|
|
||||||
{
|
// {
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
|
||||||
let _entered = span.enter();
|
// let _entered = span.enter();
|
||||||
extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
|
// extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
|
||||||
index,
|
// index,
|
||||||
&global_fields_ids_map,
|
// &global_fields_ids_map,
|
||||||
grenad_parameters,
|
// GrenadParameters::default(),
|
||||||
document_changes.clone(),
|
// document_changes.clone(),
|
||||||
&extractor_sender,
|
// &extractor_sender,
|
||||||
)?;
|
// )?;
|
||||||
}
|
// }
|
||||||
|
|
||||||
{
|
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
|
|
||||||
let _entered = span.enter();
|
|
||||||
extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
|
|
||||||
index,
|
|
||||||
&global_fields_ids_map,
|
|
||||||
GrenadParameters::default(),
|
|
||||||
document_changes.clone(),
|
|
||||||
&extractor_sender,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||||
|
Loading…
Reference in New Issue
Block a user