mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-09 22:48:54 +01:00
Integrate first searchable exctrator
This commit is contained in:
parent
5369bf4a62
commit
c50d3edc4a
@ -1,10 +1,12 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
|
||||||
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
||||||
|
use grenad::Merger;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
|
|
||||||
use super::StdResult;
|
use super::StdResult;
|
||||||
use crate::update::new::KvReaderFieldId;
|
use crate::update::new::KvReaderFieldId;
|
||||||
|
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||||
use crate::{DocumentId, Index};
|
use crate::{DocumentId, Index};
|
||||||
|
|
||||||
/// The capacity of the channel is currently in number of messages.
|
/// The capacity of the channel is currently in number of messages.
|
||||||
@ -159,7 +161,7 @@ impl DocumentSender {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub enum MergerOperation {
|
pub enum MergerOperation {
|
||||||
WordDocidsCursors(Vec<grenad::ReaderCursor<File>>),
|
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct MergerReceiver(Receiver<MergerOperation>);
|
pub struct MergerReceiver(Receiver<MergerOperation>);
|
||||||
@ -175,3 +177,16 @@ impl IntoIterator for MergerReceiver {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>);
|
pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>);
|
||||||
|
|
||||||
|
impl DeladdCboRoaringBitmapSender {
|
||||||
|
pub fn word_docids(
|
||||||
|
&self,
|
||||||
|
merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
||||||
|
) -> StdResult<(), SendError<()>> {
|
||||||
|
let operation = MergerOperation::WordDocidsMerger(merger);
|
||||||
|
match self.0.send(operation) {
|
||||||
|
Ok(()) => Ok(()),
|
||||||
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -2,7 +2,7 @@ use heed::RoTxn;
|
|||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
|
|
||||||
use crate::update::new::KvReaderFieldId;
|
use crate::update::new::KvReaderFieldId;
|
||||||
use crate::{DocumentId, FieldId};
|
use crate::{DocumentId, FieldId, Index};
|
||||||
|
|
||||||
pub enum DocumentChange {
|
pub enum DocumentChange {
|
||||||
Deletion(Deletion),
|
Deletion(Deletion),
|
||||||
@ -12,14 +12,14 @@ pub enum DocumentChange {
|
|||||||
|
|
||||||
pub struct Deletion {
|
pub struct Deletion {
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: String, // ?
|
external_docid: String, // ?
|
||||||
current: Box<KvReaderFieldId>,
|
current: Box<KvReaderFieldId>, // ?
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Update {
|
pub struct Update {
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: String, // ?
|
external_docid: String, // ?
|
||||||
current: Box<KvReaderFieldId>,
|
current: Box<KvReaderFieldId>, // ?
|
||||||
new: Box<KvReaderFieldId>,
|
new: Box<KvReaderFieldId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ pub struct Insertion {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentChange {
|
impl DocumentChange {
|
||||||
fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
match &self {
|
match &self {
|
||||||
Self::Deletion(inner) => inner.docid(),
|
Self::Deletion(inner) => inner.docid(),
|
||||||
Self::Update(inner) => inner.docid(),
|
Self::Update(inner) => inner.docid(),
|
||||||
@ -48,11 +48,11 @@ impl Deletion {
|
|||||||
Self { docid, external_docid, current }
|
Self { docid, external_docid, current }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
self.docid
|
self.docid
|
||||||
}
|
}
|
||||||
|
|
||||||
fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> {
|
pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -62,11 +62,11 @@ impl Insertion {
|
|||||||
Insertion { docid, external_docid, new }
|
Insertion { docid, external_docid, new }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
self.docid
|
self.docid
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new(&self) -> &KvReader<FieldId> {
|
pub fn new(&self) -> &KvReader<FieldId> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -81,15 +81,15 @@ impl Update {
|
|||||||
Update { docid, external_docid, current, new }
|
Update { docid, external_docid, current, new }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn docid(&self) -> DocumentId {
|
pub fn docid(&self) -> DocumentId {
|
||||||
self.docid
|
self.docid
|
||||||
}
|
}
|
||||||
|
|
||||||
fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> {
|
pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new(&self) -> &KvReader<FieldId> {
|
pub fn new(&self) -> &KvReader<FieldId> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,12 +2,12 @@ use std::borrow::Cow;
|
|||||||
use std::num::NonZeroUsize;
|
use std::num::NonZeroUsize;
|
||||||
use std::{io, mem};
|
use std::{io, mem};
|
||||||
|
|
||||||
use grenad2::{MergeFunction, Sorter};
|
use grenad::{MergeFunction, Sorter};
|
||||||
use lru::LruCache;
|
use lru::LruCache;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use smallvec::SmallVec;
|
use smallvec::SmallVec;
|
||||||
|
|
||||||
use crate::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct CachedSorter<MF> {
|
pub struct CachedSorter<MF> {
|
||||||
|
@ -1,84 +1,180 @@
|
|||||||
pub fn extract_word_docids(
|
use std::fs::File;
|
||||||
document_change: DocumentChange,
|
|
||||||
_tokenizer: &Tokenizer,
|
use charabia::TokenizerBuilder;
|
||||||
output: &mut CachedSorter<DelAddRoaringBitmapMerger>,
|
use grenad::Merger;
|
||||||
) -> grenad::Result<(), io::Error> {
|
use grenad::ReaderCursor;
|
||||||
match document_change {
|
use heed::RoTxn;
|
||||||
DocumentChange::Deletion(inner) => {
|
use rayon::iter::IntoParallelIterator;
|
||||||
unimplemented!()
|
use rayon::iter::ParallelBridge;
|
||||||
}
|
use rayon::iter::ParallelIterator;
|
||||||
DocumentChange::Update(inner) => {
|
|
||||||
unimplemented!()
|
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||||
}
|
use crate::{
|
||||||
DocumentChange::Insertion(inner) => {
|
update::{
|
||||||
unimplemented!()
|
create_sorter,
|
||||||
|
new::{DocumentChange, ItemsPool},
|
||||||
|
GrenadParameters,
|
||||||
|
},
|
||||||
|
FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
cache::{CachedSorter, DelAddRoaringBitmapMerger},
|
||||||
|
tokenize_document::DocumentTokenizer,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub trait SearchableExtractor {
|
||||||
|
fn run_extraction(
|
||||||
|
index: &Index,
|
||||||
|
fields_ids_map: &FieldsIdsMap,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn()?;
|
||||||
|
let stop_words = index.stop_words(&rtxn)?;
|
||||||
|
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||||
|
let allowed_separators: Option<Vec<_>> =
|
||||||
|
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
|
let dictionary = index.dictionary(&rtxn)?;
|
||||||
|
let dictionary: Option<Vec<_>> =
|
||||||
|
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
|
let builder = tokenizer_builder(
|
||||||
|
stop_words.as_ref(),
|
||||||
|
allowed_separators.as_deref(),
|
||||||
|
dictionary.as_deref(),
|
||||||
|
);
|
||||||
|
let tokenizer = builder.into_tokenizer();
|
||||||
|
|
||||||
|
let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?;
|
||||||
|
let localized_attributes_rules =
|
||||||
|
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||||
|
|
||||||
|
let document_tokenizer = DocumentTokenizer {
|
||||||
|
tokenizer: &tokenizer,
|
||||||
|
searchable_attributes: user_defined_searchable_fields.as_deref(),
|
||||||
|
localized_attributes_rules: &localized_attributes_rules,
|
||||||
|
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||||
|
};
|
||||||
|
|
||||||
|
let context_pool = ItemsPool::new(|| {
|
||||||
|
Ok((
|
||||||
|
index.read_txn()?,
|
||||||
|
&document_tokenizer,
|
||||||
|
CachedSorter::new(
|
||||||
|
// TODO use a better value
|
||||||
|
100.try_into().unwrap(),
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
DelAddRoaringBitmapMerger,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
))
|
||||||
|
});
|
||||||
|
|
||||||
|
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||||
|
context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| {
|
||||||
|
Self::extract_document_change(
|
||||||
|
&*rtxn,
|
||||||
|
index,
|
||||||
|
document_tokenizer,
|
||||||
|
&fields_ids_map,
|
||||||
|
cached_sorter,
|
||||||
|
document_change?,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||||
|
for (_rtxn, _tokenizer, cache) in context_pool.into_items() {
|
||||||
|
let sorter = cache.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
builder.extend(readers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(builder.build())
|
||||||
}
|
}
|
||||||
|
|
||||||
let normalizer_options = NormalizerOption::default();
|
fn extract_document_change(
|
||||||
|
rtxn: &RoTxn,
|
||||||
if let Some(previous_doc) = previous_doc {
|
index: &Index,
|
||||||
for (_, v) in previous_doc.iter() {
|
document_tokenizer: &DocumentTokenizer,
|
||||||
// Only manage the direct JSON strings
|
fields_ids_map: &FieldsIdsMap,
|
||||||
// TODO manage the JSON strings correctly (escaped chars)
|
cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
|
||||||
if v.first().zip(v.last()) == Some((&b'"', &b'"')) {
|
document_change: DocumentChange,
|
||||||
let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap();
|
) -> Result<()>;
|
||||||
// for token in tokenizer.tokenize(s).filter(|t| t.is_word()) {
|
|
||||||
// let key = token.lemma().normalize(&normalizer_options);
|
|
||||||
for token in s.split_whitespace() {
|
|
||||||
let key = token.normalize(&normalizer_options);
|
|
||||||
output.insert_del_u32(key.as_bytes(), docid)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (_, v) in new_doc.iter() {
|
|
||||||
// Only manage the direct JSON strings
|
|
||||||
// TODO manage the JSON strings correctly (escaped chars)
|
|
||||||
if v.first().zip(v.last()) == Some((&b'"', &b'"')) {
|
|
||||||
let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap();
|
|
||||||
// for token in tokenizer.tokenize(s).filter(|t| t.is_word()) {
|
|
||||||
// let key = token.lemma().normalize(&normalizer_options);
|
|
||||||
for token in s.split_whitespace() {
|
|
||||||
let key = token.normalize(&normalizer_options);
|
|
||||||
output.insert_add_u32(key.as_bytes(), docid)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
pub struct WordDocidsExtractor;
|
||||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
impl SearchableExtractor for WordDocidsExtractor {
|
||||||
/// else we keep the standard proximity of 1 between words.
|
fn extract_document_change(
|
||||||
fn process_tokens<'a>(
|
rtxn: &RoTxn,
|
||||||
tokens: impl Iterator<Item = Token<'a>>,
|
index: &Index,
|
||||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
document_tokenizer: &DocumentTokenizer,
|
||||||
tokens
|
fields_ids_map: &FieldsIdsMap,
|
||||||
.skip_while(|token| token.is_separator())
|
// TODO: DelAddRoaringBitmapMerger should be CBO
|
||||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
|
||||||
match token.kind {
|
document_change: DocumentChange,
|
||||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
) -> crate::Result<()> {
|
||||||
*offset += match *prev_kind {
|
match document_change {
|
||||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
DocumentChange::Deletion(inner) => {
|
||||||
Some(_) => 1,
|
let mut token_fn = |_fid, _pos: u16, word: &str| {
|
||||||
None => 0,
|
cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
|
||||||
};
|
};
|
||||||
*prev_kind = Some(token.kind)
|
document_tokenizer.tokenize_document(
|
||||||
}
|
inner.current(rtxn, index),
|
||||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
fields_ids_map,
|
||||||
*prev_kind = Some(token.kind);
|
&mut token_fn,
|
||||||
}
|
)?;
|
||||||
TokenKind::Separator(SeparatorKind::Soft)
|
|
||||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
|
||||||
{
|
|
||||||
*prev_kind = Some(token.kind);
|
|
||||||
}
|
|
||||||
_ => token.kind = TokenKind::Unknown,
|
|
||||||
}
|
}
|
||||||
Some((*offset, token))
|
DocumentChange::Update(inner) => {
|
||||||
})
|
let mut token_fn = |_fid, _pos, word: &str| {
|
||||||
.filter(|(_, t)| t.is_word())
|
cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(
|
||||||
|
inner.current(rtxn, index),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut token_fn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let mut token_fn = |_fid, _pos, word: &str| {
|
||||||
|
cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
|
}
|
||||||
|
DocumentChange::Insertion(inner) => {
|
||||||
|
let mut token_fn = |_fid, _pos, word: &str| {
|
||||||
|
cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
|
||||||
|
};
|
||||||
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Factorize tokenizer building.
|
||||||
|
fn tokenizer_builder<'a>(
|
||||||
|
stop_words: Option<&'a fst::Set<&'a [u8]>>,
|
||||||
|
allowed_separators: Option<&'a [&str]>,
|
||||||
|
dictionary: Option<&'a [&str]>,
|
||||||
|
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||||
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
|
if let Some(stop_words) = stop_words {
|
||||||
|
tokenizer_builder.stop_words(stop_words);
|
||||||
|
}
|
||||||
|
if let Some(dictionary) = dictionary {
|
||||||
|
tokenizer_builder.words_dict(dictionary);
|
||||||
|
}
|
||||||
|
if let Some(separators) = allowed_separators {
|
||||||
|
tokenizer_builder.separators(separators);
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer_builder
|
||||||
}
|
}
|
||||||
|
@ -1,2 +1,6 @@
|
|||||||
mod cache;
|
mod cache;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
|
mod tokenize_document;
|
||||||
|
|
||||||
|
pub use extract_word_docids::SearchableExtractor;
|
||||||
|
pub use extract_word_docids::WordDocidsExtractor;
|
||||||
|
@ -1,56 +1,71 @@
|
|||||||
pub struct DocumentTokenizer {
|
use crate::{
|
||||||
tokenizer: &Tokenizer,
|
update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError,
|
||||||
searchable_attributes: Option<&[String]>,
|
LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
||||||
localized_attributes_rules: &[LocalizedAttributesRule],
|
};
|
||||||
max_positions_per_attributes: u32,
|
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
|
use heed::RoTxn;
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
pub struct DocumentTokenizer<'a> {
|
||||||
|
pub tokenizer: &'a Tokenizer<'a>,
|
||||||
|
pub searchable_attributes: Option<&'a [&'a str]>,
|
||||||
|
pub localized_attributes_rules: &'a [LocalizedAttributesRule],
|
||||||
|
pub max_positions_per_attributes: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentTokenizer {
|
impl<'a> DocumentTokenizer<'a> {
|
||||||
// pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self {
|
pub fn tokenize_document(
|
||||||
// Self { tokenizer, settings }
|
&self,
|
||||||
// }
|
obkv: &KvReaderFieldId,
|
||||||
|
|
||||||
pub fn tokenize_document<'a>(
|
|
||||||
obkv: &KvReader<'a, FieldId>,
|
|
||||||
field_id_map: &FieldsIdsMap,
|
field_id_map: &FieldsIdsMap,
|
||||||
token_fn: impl Fn(FieldId, u16, &str),
|
token_fn: &mut impl FnMut(FieldId, u16, &str),
|
||||||
) {
|
) -> Result<()> {
|
||||||
let mut field_position = Hashmap::new();
|
let mut field_position = HashMap::new();
|
||||||
for (field_id, field_bytes) in obkv {
|
for (field_id, field_bytes) in obkv {
|
||||||
let field_name = field_id_map.name(field_id);
|
let Some(field_name) = field_id_map.name(field_id) else {
|
||||||
|
unreachable!("field id not found in field id map");
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tokenize_field = |name: &str, value: &Value| {
|
||||||
|
let Some(field_id) = field_id_map.id(name) else {
|
||||||
|
unreachable!("field name not found in field id map");
|
||||||
|
};
|
||||||
|
|
||||||
|
let position =
|
||||||
|
field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
|
||||||
|
if *position as u32 >= self.max_positions_per_attributes {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let tokenize_field = |name, value| {
|
|
||||||
let field_id = field_id_map.id(name);
|
|
||||||
match value {
|
match value {
|
||||||
Number(n) => {
|
Value::Number(n) => {
|
||||||
let token = n.to_string();
|
let token = n.to_string();
|
||||||
let position = field_position
|
if let Ok(position) = (*position).try_into() {
|
||||||
.entry(field_id)
|
token_fn(field_id, position, token.as_str());
|
||||||
.and_modify(|counter| *counter += 8)
|
}
|
||||||
.or_insert(0);
|
|
||||||
token_fn(field_id, position, token.as_str());
|
|
||||||
}
|
}
|
||||||
String(text) => {
|
Value::String(text) => {
|
||||||
// create an iterator of token with their positions.
|
// create an iterator of token with their positions.
|
||||||
let locales = self
|
let locales = self
|
||||||
.localized_attributes_rules
|
.localized_attributes_rules
|
||||||
.iter()
|
.iter()
|
||||||
.first(|rule| rule.match_str(field_name))
|
.find(|rule| rule.match_str(field_name))
|
||||||
.map(|rule| rule.locales(field_id));
|
.map(|rule| rule.locales());
|
||||||
let tokens =
|
let tokens = process_tokens(
|
||||||
process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
*position,
|
||||||
.take_while(|(p, _)| {
|
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
|
||||||
(*p as u32) < self.max_positions_per_attributes
|
)
|
||||||
});
|
.take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
// keep a word only if it is not empty and fit in a LMDB key.
|
// keep a word only if it is not empty and fit in a LMDB key.
|
||||||
let token = token.lemma().trim();
|
let token = token.lemma().trim();
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
let position: u16 = index
|
*position = index;
|
||||||
.try_into()
|
if let Ok(position) = (*position).try_into() {
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
token_fn(field_id, position, token);
|
||||||
writer.insert(position, token.as_bytes())?;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -59,21 +74,28 @@ impl DocumentTokenizer {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// if the current field is searchable or contains a searchable attribute
|
// if the current field is searchable or contains a searchable attribute
|
||||||
if searchable_attributes.map_or(true, |attributes| {
|
if self.searchable_attributes.map_or(true, |attributes| {
|
||||||
attributes.iter().any(|name| contained_in(name, field_name))
|
attributes.iter().any(|name| perm_json_p::contained_in(name, field_name))
|
||||||
}) {
|
}) {
|
||||||
// parse json.
|
// parse json.
|
||||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
||||||
Value::Object(object) => {
|
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||||
seek_leaf_values_in_object(object, selectors, &field_name, tokenize_field)
|
&object,
|
||||||
}
|
self.searchable_attributes.as_deref(),
|
||||||
Value::Array(array) => {
|
&field_name,
|
||||||
seek_leaf_values_in_array(array, selectors, &field_name, tokenize_field)
|
&mut tokenize_field,
|
||||||
}
|
),
|
||||||
value => tokenize_field(&base_key, value),
|
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
||||||
|
&array,
|
||||||
|
self.searchable_attributes.as_deref(),
|
||||||
|
&field_name,
|
||||||
|
&mut tokenize_field,
|
||||||
|
),
|
||||||
|
value => tokenize_field(&field_name, &value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,11 +103,12 @@ impl DocumentTokenizer {
|
|||||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||||
/// else we keep the standard proximity of 1 between words.
|
/// else we keep the standard proximity of 1 between words.
|
||||||
fn process_tokens<'a>(
|
fn process_tokens<'a>(
|
||||||
|
start_offset: usize,
|
||||||
tokens: impl Iterator<Item = Token<'a>>,
|
tokens: impl Iterator<Item = Token<'a>>,
|
||||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||||
tokens
|
tokens
|
||||||
.skip_while(|token| token.is_separator())
|
.skip_while(|token| token.is_separator())
|
||||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
.scan((start_offset, None), |(offset, prev_kind), mut token| {
|
||||||
match token.kind {
|
match token.kind {
|
||||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||||
*offset += match *prev_kind {
|
*offset += match *prev_kind {
|
||||||
@ -110,42 +133,45 @@ fn process_tokens<'a>(
|
|||||||
.filter(|(_, t)| t.is_word())
|
.filter(|(_, t)| t.is_word())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if the `selector` match the `key`.
|
|
||||||
///
|
|
||||||
/// ```text
|
|
||||||
/// Example:
|
|
||||||
/// `animaux` match `animaux`
|
|
||||||
/// `animaux.chien` match `animaux`
|
|
||||||
/// `animaux.chien` match `animaux`
|
|
||||||
/// `animaux.chien.nom` match `animaux`
|
|
||||||
/// `animaux.chien.nom` match `animaux.chien`
|
|
||||||
/// -----------------------------------------
|
|
||||||
/// `animaux` doesn't match `animaux.chien`
|
|
||||||
/// `animaux.` doesn't match `animaux`
|
|
||||||
/// `animaux.ch` doesn't match `animaux.chien`
|
|
||||||
/// `animau` doesn't match `animaux`
|
|
||||||
/// ```
|
|
||||||
fn contained_in(selector: &str, key: &str) -> bool {
|
|
||||||
selector.starts_with(key)
|
|
||||||
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// TODO move in permissive json pointer
|
/// TODO move in permissive json pointer
|
||||||
mod perm_json_p {
|
mod perm_json_p {
|
||||||
|
use serde_json::{Map, Value};
|
||||||
|
const SPLIT_SYMBOL: char = '.';
|
||||||
|
|
||||||
|
/// Returns `true` if the `selector` match the `key`.
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// Example:
|
||||||
|
/// `animaux` match `animaux`
|
||||||
|
/// `animaux.chien` match `animaux`
|
||||||
|
/// `animaux.chien` match `animaux`
|
||||||
|
/// `animaux.chien.nom` match `animaux`
|
||||||
|
/// `animaux.chien.nom` match `animaux.chien`
|
||||||
|
/// -----------------------------------------
|
||||||
|
/// `animaux` doesn't match `animaux.chien`
|
||||||
|
/// `animaux.` doesn't match `animaux`
|
||||||
|
/// `animaux.ch` doesn't match `animaux.chien`
|
||||||
|
/// `animau` doesn't match `animaux`
|
||||||
|
/// ```
|
||||||
|
pub fn contained_in(selector: &str, key: &str) -> bool {
|
||||||
|
selector.starts_with(key)
|
||||||
|
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn seek_leaf_values<'a>(
|
pub fn seek_leaf_values<'a>(
|
||||||
value: &Map<String, Value>,
|
value: &Map<String, Value>,
|
||||||
selectors: impl IntoIterator<Item = &'a str>,
|
selectors: impl IntoIterator<Item = &'a str>,
|
||||||
seeker: impl Fn(&str, &Value),
|
seeker: &mut impl FnMut(&str, &Value),
|
||||||
) {
|
) {
|
||||||
let selectors: Vec<_> = selectors.into_iter().collect();
|
let selectors: Vec<_> = selectors.into_iter().collect();
|
||||||
seek_leaf_values_in_object(value, &selectors, "", &seeker);
|
seek_leaf_values_in_object(value, Some(&selectors), "", seeker);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn seek_leaf_values_in_object(
|
pub fn seek_leaf_values_in_object(
|
||||||
value: &Map<String, Value>,
|
value: &Map<String, Value>,
|
||||||
selectors: &[&str],
|
selectors: Option<&[&str]>,
|
||||||
base_key: &str,
|
base_key: &str,
|
||||||
seeker: &impl Fn(&str, &Value),
|
seeker: &mut impl FnMut(&str, &Value),
|
||||||
) {
|
) {
|
||||||
for (key, value) in value.iter() {
|
for (key, value) in value.iter() {
|
||||||
let base_key = if base_key.is_empty() {
|
let base_key = if base_key.is_empty() {
|
||||||
@ -156,8 +182,10 @@ mod perm_json_p {
|
|||||||
|
|
||||||
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
|
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
|
||||||
// so we check the contained_in on both side
|
// so we check the contained_in on both side
|
||||||
let should_continue = selectors.iter().any(|selector| {
|
let should_continue = selectors.map_or(true, |selectors| {
|
||||||
contained_in(selector, &base_key) || contained_in(&base_key, selector)
|
selectors.iter().any(|selector| {
|
||||||
|
contained_in(selector, &base_key) || contained_in(&base_key, selector)
|
||||||
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
if should_continue {
|
if should_continue {
|
||||||
@ -175,12 +203,12 @@ mod perm_json_p {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn seek_leaf_values_in_array(
|
pub fn seek_leaf_values_in_array(
|
||||||
values: &mut [Value],
|
values: &[Value],
|
||||||
selectors: &[&str],
|
selectors: Option<&[&str]>,
|
||||||
base_key: &str,
|
base_key: &str,
|
||||||
seeker: &impl Fn(&str, &Value),
|
seeker: &mut impl FnMut(&str, &Value),
|
||||||
) {
|
) {
|
||||||
for value in values.iter_mut() {
|
for value in values {
|
||||||
match value {
|
match value {
|
||||||
Value::Object(object) => {
|
Value::Object(object) => {
|
||||||
seek_leaf_values_in_object(object, selectors, base_key, seeker)
|
seek_leaf_values_in_object(object, selectors, base_key, seeker)
|
||||||
@ -193,3 +221,91 @@ mod perm_json_p {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use super::*;
|
||||||
|
use charabia::TokenizerBuilder;
|
||||||
|
use meili_snap::snapshot;
|
||||||
|
use obkv::KvReader;
|
||||||
|
use serde_json::json;
|
||||||
|
#[test]
|
||||||
|
fn test_tokenize_document() {
|
||||||
|
let mut fields_ids_map = FieldsIdsMap::new();
|
||||||
|
|
||||||
|
let field_1 = json!({
|
||||||
|
"name": "doggo",
|
||||||
|
"age": 10,
|
||||||
|
});
|
||||||
|
|
||||||
|
let field_2 = json!({
|
||||||
|
"catto": {
|
||||||
|
"name": "pesti",
|
||||||
|
"age": 23,
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let field_3 = json!(["doggo", "catto"]);
|
||||||
|
|
||||||
|
let mut obkv = obkv::KvWriter::memory();
|
||||||
|
let field_1_id = fields_ids_map.insert("doggo").unwrap();
|
||||||
|
let field_1 = serde_json::to_string(&field_1).unwrap();
|
||||||
|
obkv.insert(field_1_id, field_1.as_bytes()).unwrap();
|
||||||
|
let field_2_id = fields_ids_map.insert("catto").unwrap();
|
||||||
|
let field_2 = serde_json::to_string(&field_2).unwrap();
|
||||||
|
obkv.insert(field_2_id, field_2.as_bytes()).unwrap();
|
||||||
|
let field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
||||||
|
let field_3 = serde_json::to_string(&field_3).unwrap();
|
||||||
|
obkv.insert(field_3_id, field_3.as_bytes()).unwrap();
|
||||||
|
let value = obkv.into_inner().unwrap();
|
||||||
|
let obkv = KvReader::from_slice(value.as_slice());
|
||||||
|
|
||||||
|
fields_ids_map.insert("doggo.age");
|
||||||
|
fields_ids_map.insert("catto.catto.name");
|
||||||
|
fields_ids_map.insert("catto.catto.age");
|
||||||
|
|
||||||
|
let mut tb = TokenizerBuilder::default();
|
||||||
|
let document_tokenizer = DocumentTokenizer {
|
||||||
|
tokenizer: &tb.build(),
|
||||||
|
searchable_attributes: None,
|
||||||
|
localized_attributes_rules: &[],
|
||||||
|
max_positions_per_attributes: 1000,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut words = std::collections::BTreeMap::new();
|
||||||
|
document_tokenizer
|
||||||
|
.tokenize_document(obkv, &fields_ids_map, &mut |fid, pos, word| {
|
||||||
|
words.insert([fid, pos], word.to_string());
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
snapshot!(format!("{:#?}", words), @r###"
|
||||||
|
{
|
||||||
|
[
|
||||||
|
2,
|
||||||
|
0,
|
||||||
|
]: "doggo",
|
||||||
|
[
|
||||||
|
2,
|
||||||
|
8,
|
||||||
|
]: "doggo",
|
||||||
|
[
|
||||||
|
2,
|
||||||
|
16,
|
||||||
|
]: "catto",
|
||||||
|
[
|
||||||
|
3,
|
||||||
|
0,
|
||||||
|
]: "10",
|
||||||
|
[
|
||||||
|
4,
|
||||||
|
0,
|
||||||
|
]: "pesti",
|
||||||
|
[
|
||||||
|
5,
|
||||||
|
0,
|
||||||
|
]: "23",
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -15,11 +15,13 @@ use super::channel::{
|
|||||||
WriterOperation,
|
WriterOperation,
|
||||||
};
|
};
|
||||||
use super::document_change::DocumentChange;
|
use super::document_change::DocumentChange;
|
||||||
|
use super::extract::{SearchableExtractor, WordDocidsExtractor};
|
||||||
use super::merger::merge_grenad_entries;
|
use super::merger::merge_grenad_entries;
|
||||||
use super::StdResult;
|
use super::StdResult;
|
||||||
use crate::documents::{
|
use crate::documents::{
|
||||||
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||||
};
|
};
|
||||||
|
use crate::update::GrenadParameters;
|
||||||
use crate::{Index, Result, UserError};
|
use crate::{Index, Result, UserError};
|
||||||
|
|
||||||
mod document_deletion;
|
mod document_deletion;
|
||||||
@ -45,7 +47,7 @@ pub fn index<PI>(
|
|||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
pool: &ThreadPool,
|
pool: &ThreadPool,
|
||||||
_document_changes: PI,
|
document_changes: PI,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
|
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
|
||||||
@ -59,10 +61,18 @@ where
|
|||||||
// TODO manage the errors correctly
|
// TODO manage the errors correctly
|
||||||
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || {
|
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || {
|
||||||
pool.in_place_scope(|_s| {
|
pool.in_place_scope(|_s| {
|
||||||
|
let document_changes = document_changes.into_par_iter();
|
||||||
// word docids
|
// word docids
|
||||||
// document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>)
|
let merger = WordDocidsExtractor::run_extraction(
|
||||||
// let grenads = extractor_function(document_changes)?;
|
index,
|
||||||
// deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?;
|
todo!(),
|
||||||
|
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
||||||
|
GrenadParameters::default(),
|
||||||
|
document_changes.clone(),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
/// TODO: manage the errors correctly
|
||||||
|
deladd_cbo_roaring_bitmap_sender.word_docids(merger).unwrap();
|
||||||
|
|
||||||
Ok(()) as Result<_>
|
Ok(()) as Result<_>
|
||||||
})
|
})
|
||||||
|
@ -20,14 +20,12 @@ pub fn merge_grenad_entries(
|
|||||||
|
|
||||||
for merger_operation in receiver {
|
for merger_operation in receiver {
|
||||||
match merger_operation {
|
match merger_operation {
|
||||||
MergerOperation::WordDocidsCursors(cursors) => {
|
MergerOperation::WordDocidsMerger(merger) => {
|
||||||
let sender = sender.word_docids();
|
let sender = sender.word_docids();
|
||||||
let database = index.word_docids.remap_types::<Bytes, Bytes>();
|
let database = index.word_docids.remap_types::<Bytes, Bytes>();
|
||||||
|
|
||||||
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
|
||||||
builder.extend(cursors);
|
|
||||||
/// TODO manage the error correctly
|
/// TODO manage the error correctly
|
||||||
let mut merger_iter = builder.build().into_stream_merger_iter().unwrap();
|
let mut merger_iter = merger.into_stream_merger_iter().unwrap();
|
||||||
|
|
||||||
// TODO manage the error correctly
|
// TODO manage the error correctly
|
||||||
while let Some((key, deladd)) = merger_iter.next().unwrap() {
|
while let Some((key, deladd)) = merger_iter.next().unwrap() {
|
||||||
|
@ -4,13 +4,12 @@ pub use items_pool::ItemsPool;
|
|||||||
use super::del_add::DelAdd;
|
use super::del_add::DelAdd;
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
mod document_change;
|
|
||||||
mod merger;
|
|
||||||
// mod extract;
|
|
||||||
mod channel;
|
mod channel;
|
||||||
//mod global_fields_ids_map;
|
mod document_change;
|
||||||
|
mod extract;
|
||||||
pub mod indexer;
|
pub mod indexer;
|
||||||
mod items_pool;
|
mod items_pool;
|
||||||
|
mod merger;
|
||||||
|
|
||||||
/// TODO move them elsewhere
|
/// TODO move them elsewhere
|
||||||
pub type StdResult<T, E> = std::result::Result<T, E>;
|
pub type StdResult<T, E> = std::result::Result<T, E>;
|
||||||
|
Loading…
Reference in New Issue
Block a user