diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs
index b4eb4d1d2..aa37593c9 100644
--- a/milli/src/update/new/document_change.rs
+++ b/milli/src/update/new/document_change.rs
@@ -52,6 +52,7 @@ impl Deletion {
         self.docid
     }
 
+    // TODO shouldn't we use the one in self?
     pub fn current<'a>(
         &self,
         rtxn: &'a RoTxn,
diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs
index 684b67daa..5c3c4a735 100644
--- a/milli/src/update/new/extract/cache.rs
+++ b/milli/src/update/new/extract/cache.rs
@@ -1,5 +1,5 @@
-use std::num::NonZeroUsize;
 use std::mem;
+use std::num::NonZeroUsize;
 
 use grenad::{MergeFunction, Sorter};
 use lru::LruCache;
@@ -10,16 +10,16 @@ use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::CboRoaringBitmapCodec;
 
 #[derive(Debug)]
-pub struct CachedSorter<MF> {
+pub struct CboCachedSorter<MF> {
     cache: lru::LruCache, DelAddRoaringBitmap>,
     sorter: Sorter<MF>,
     deladd_buffer: Vec<u8>,
     cbo_buffer: Vec<u8>,
 }
 
-impl<MF> CachedSorter<MF> {
+impl<MF> CboCachedSorter<MF> {
     pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
-        CachedSorter {
+        CboCachedSorter {
             cache: lru::LruCache::new(cap),
             sorter,
             deladd_buffer: Vec::new(),
@@ -28,7 +28,7 @@ impl<MF> CachedSorter<MF> {
     }
 }
 
-impl<MF: MergeFunction> CachedSorter<MF> {
+impl<MF: MergeFunction> CboCachedSorter<MF> {
     pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
         match self.cache.get_mut(key) {
             Some(DelAddRoaringBitmap { del, add: _ }) => {
@@ -194,4 +194,4 @@ impl DelAddRoaringBitmap {
     fn new_add_u32(n: u32) -> Self {
         DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
     }
-}
\ No newline at end of file
+}
diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs
new file mode 100644
index 000000000..b230549c1
--- /dev/null
+++ b/milli/src/update/new/extract/faceted/mod.rs
@@ -0,0 +1,271 @@
+use std::collections::HashSet;
+use std::fs::File;
+
+use grenad::Merger;
+use heed::RoTxn;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use serde_json::Value;
+
+use super::cache::CboCachedSorter;
+use super::perm_json_p;
+use crate::facet::value_encoding::f64_into_bytes;
+use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId};
+use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::{
+    normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError,
+    MAX_FACET_VALUE_LENGTH,
+};
+
+pub trait FacetedExtractor {
+    fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
+        let max_memory = indexer.max_memory_by_thread();
+
+        let rtxn = index.read_txn()?;
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
+        let attributes_to_extract: Vec<_> =
+            attributes_to_extract.iter().map(|s| s.as_ref()).collect();
+
+        let context_pool = ItemsPool::new(|| {
+            Ok((
+                index.read_txn()?,
+                fields_ids_map.clone(),
+                Vec::new(),
+                CboCachedSorter::new(
+                    // TODO use a better value
+                    100.try_into().unwrap(),
+                    create_sorter(
+                        grenad::SortAlgorithm::Stable,
+                        MergeDeladdCboRoaringBitmaps,
+                        indexer.chunk_compression_type,
+                        indexer.chunk_compression_level,
+                        indexer.max_nb_chunks,
+                        max_memory,
+                    ),
+                ),
+            ))
+        });
+
+        document_changes.into_par_iter().try_for_each(|document_change| {
+            context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| {
+                Self::extract_document_change(
+                    &*rtxn,
+                    index,
+                    buffer,
+                    fields_ids_map,
&attributes_to_extract, + cached_sorter, + document_change?, + ) + }) + })?; + + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + for (_rtxn, _fields_ids_map, _buffer, cache) in context_pool.into_items() { + let sorter = cache.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + builder.extend(readers); + } + + Ok(builder.build()) + } + + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + buffer: &mut Vec, + fields_ids_map: &mut GlobalFieldsIdsMap, + attributes_to_extract: &[&str], + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut facet_del_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut facet_del_fn, + ) + } + DocumentChange::Update(inner) => { + let mut facet_del_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut facet_del_fn, + )?; + + let mut facet_add_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut facet_add_fn, + ) + } + DocumentChange::Insertion(inner) => { + let mut facet_add_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut facet_add_fn, + ) + } + } + } + + // TODO avoid owning the strings here. 
+ fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; + + fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec) + -> Option<&'b [u8]>; +} + +pub struct FieldIdFacetNumberDocidsExtractor; +impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let number = value.as_number()?; + let n = number.as_f64()?; + let ordered = f64_into_bytes(n)?; + + // fid - level - orderedf64 - orignalf64 + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(&ordered); + output.extend_from_slice(&n.to_be_bytes()); + + Some(&*output) + } +} + +/// TODO It doesn't keep the original string in the value +pub struct FieldIdFacetStringDocidsExtractor; +impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let string = value.as_str()?; + let normalize = normalize_facet(string); + let truncated = truncate_str(&normalize); + + // fid - level - normalized string + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(truncated.as_bytes()); + + Some(&*output) + } +} + +pub fn extract_document_facets( + attributes_to_extract: &[&str], + obkv: &KvReaderFieldId, + field_id_map: &mut GlobalFieldsIdsMap, + facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, +) -> Result<()> { + let mut field_name = String::new(); + for (field_id, field_bytes) in obkv { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { + Some(field_id) => facet_fn(field_id, value), + None => Err(UserError::AttributeLimitReached.into()), + }; + + // if the current field is searchable or contains a searchable attribute + if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + value => tokenize_field(field_name, &value)?, + } + } + } + + Ok(()) +} + +/// Truncates a string to the biggest valid LMDB key size. 
+fn truncate_str(s: &str) -> &str { + let index = s + .char_indices() + .map(|(idx, _)| idx) + .chain(std::iter::once(s.len())) + .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) + .last(); + + &s[..index.unwrap_or(0)] +} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 1964b88fc..fee4f42f6 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,7 +1,114 @@ mod cache; +mod faceted; mod searchable; +pub use faceted::FacetedExtractor; pub use searchable::{ ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, }; + +/// TODO move in permissive json pointer +pub mod perm_json_p { + use serde_json::{Map, Value}; + + use crate::Result; + const SPLIT_SYMBOL: char = '.'; + + /// Returns `true` if the `selector` match the `key`. + /// + /// ```text + /// Example: + /// `animaux` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien.nom` match `animaux` + /// `animaux.chien.nom` match `animaux.chien` + /// ----------------------------------------- + /// `animaux` doesn't match `animaux.chien` + /// `animaux.` doesn't match `animaux` + /// `animaux.ch` doesn't match `animaux.chien` + /// `animau` doesn't match `animaux` + /// ``` + pub fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) + } + + pub fn seek_leaf_values_in_object( + value: &Map, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value) -> Result<()>, + ) -> Result<()> { + for (key, value) in value.iter() { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, SPLIT_SYMBOL, key) + }; + + // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` + // so we check the contained_in on both side + let should_continue = select_field(&base_key, selectors, skip_selectors); + if should_continue { + match value { + Value::Object(object) => seek_leaf_values_in_object( + object, + selectors, + skip_selectors, + &base_key, + seeker, + ), + Value::Array(array) => seek_leaf_values_in_array( + array, + selectors, + skip_selectors, + &base_key, + seeker, + ), + value => seeker(&base_key, value), + }?; + } + } + + Ok(()) + } + + pub fn seek_leaf_values_in_array( + values: &[Value], + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value) -> Result<()>, + ) -> Result<()> { + for value in values { + match value { + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker) + } + value => seeker(base_key, value), + }?; + } + + Ok(()) + } + + pub fn select_field( + field_name: &str, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + ) -> bool { + selectors.map_or(true, |selectors| { + selectors.iter().any(|selector| { + contained_in(selector, &field_name) || contained_in(&field_name, selector) + }) + }) && !skip_selectors.iter().any(|skip_selector| { + contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) + }) + } +} diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs 
b/milli/src/update/new/extract/searchable/extract_word_docids.rs index f8b495538..70f9c4e47 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -19,6 +19,7 @@ impl SearchableExtractor for WordDocidsExtractor { index.exact_attributes(rtxn).map_err(Into::into) } + /// TODO write in an external Vec buffer fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { Cow::Borrowed(word.as_bytes()) } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 106455a7b..078d06150 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -1,22 +1,22 @@ mod extract_word_docids; mod tokenize_document; +use std::borrow::Cow; +use std::fs::File; + pub use extract_word_docids::{ ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, }; -use std::borrow::Cow; -use std::fs::File; - use grenad::Merger; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::cache::CachedSorter; +use super::cache::CboCachedSorter; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; -use tokenize_document::{tokenizer_builder, DocumentTokenizer}; pub trait SearchableExtractor { fn run_extraction( @@ -60,7 +60,7 @@ pub trait SearchableExtractor { index.read_txn()?, &document_tokenizer, fields_ids_map.clone(), - CachedSorter::new( + CboCachedSorter::new( // TODO use a better value 100.try_into().unwrap(), create_sorter( @@ -103,14 +103,16 @@ pub trait SearchableExtractor { index: &Index, document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CachedSorter, + cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { match document_change { DocumentChange::Deletion(inner) => { let mut token_fn = |fid, pos: u16, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document( inner.current(rtxn, index)?.unwrap(), @@ -121,7 +123,9 @@ pub trait SearchableExtractor { DocumentChange::Update(inner) => { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document( inner.current(rtxn, index)?.unwrap(), @@ -131,14 +135,18 @@ pub trait SearchableExtractor { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; } DocumentChange::Insertion(inner) => { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; } @@ -152,5 +160,5 @@ pub trait SearchableExtractor { fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) 
-> Result>; - fn build_key<'a>(field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]>; + fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; } diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index e20e52406..1d19354db 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -1,13 +1,15 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use heed::RoTxn; use serde_json::Value; +use crate::update::new::extract::perm_json_p::{ + seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, +}; use crate::update::new::KvReaderFieldId; use crate::{ - FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, - Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, + FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, + MAX_WORD_LENGTH, }; pub struct DocumentTokenizer<'a> { @@ -23,7 +25,7 @@ impl<'a> DocumentTokenizer<'a> { &self, obkv: &KvReaderFieldId, field_id_map: &mut GlobalFieldsIdsMap, - token_fn: &mut impl FnMut(FieldId, u16, &str), + token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); let mut field_name = String::new(); @@ -38,22 +40,23 @@ impl<'a> DocumentTokenizer<'a> { let mut tokenize_field = |name: &str, value: &Value| { let Some(field_id) = field_id_map.id_or_insert(name) else { - /// TODO: better error - panic!("it's over 9000"); + return Err(UserError::AttributeLimitReached.into()); }; let position = field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); if *position as u32 >= self.max_positions_per_attributes { - return; + return Ok(()); } match value { Value::Number(n) => { let token = n.to_string(); if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token.as_str()); + token_fn(field_id, position, token.as_str())?; } + + Ok(()) } Value::String(text) => { // create an iterator of token with their positions. @@ -74,41 +77,40 @@ impl<'a> DocumentTokenizer<'a> { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { *position = index; if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token); + token_fn(field_id, position, token)?; } } } + + Ok(()) } - _ => (), + _ => Ok(()), } }; // if the current field is searchable or contains a searchable attribute - if perm_json_p::select_field( - &field_name, - self.attribute_to_extract.as_deref(), - self.attribute_to_skip, - ) { + if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + Value::Object(object) => seek_leaf_values_in_object( &object, - self.attribute_to_extract.as_deref(), + self.attribute_to_extract, self.attribute_to_skip, &field_name, &mut tokenize_field, - ), - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + )?, + Value::Array(array) => seek_leaf_values_in_array( &array, - self.attribute_to_extract.as_deref(), + self.attribute_to_extract, self.attribute_to_skip, &field_name, &mut tokenize_field, - ), - value => tokenize_field(&field_name, &value), + )?, + value => tokenize_field(&field_name, &value)?, } } } + Ok(()) } } @@ -167,105 +169,6 @@ pub fn tokenizer_builder<'a>( tokenizer_builder } -/// TODO move in permissive json pointer -mod perm_json_p { - use serde_json::{Map, Value}; - const SPLIT_SYMBOL: char = '.'; - - /// Returns `true` if the `selector` match the `key`. - /// - /// ```text - /// Example: - /// `animaux` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien.nom` match `animaux` - /// `animaux.chien.nom` match `animaux.chien` - /// ----------------------------------------- - /// `animaux` doesn't match `animaux.chien` - /// `animaux.` doesn't match `animaux` - /// `animaux.ch` doesn't match `animaux.chien` - /// `animau` doesn't match `animaux` - /// ``` - pub fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) - } - - pub fn seek_leaf_values_in_object( - value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for (key, value) in value.iter() { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, SPLIT_SYMBOL, key) - }; - - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let should_continue = select_field(&base_key, selectors, skip_selectors); - if should_continue { - match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - seeker, - ), - value => seeker(&base_key, value), - } - } - } - } - - pub fn seek_leaf_values_in_array( - values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for value in values { - match value { - Value::Object(object) => { - seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker) - } - Value::Array(array) => { - seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker) - } - value => seeker(base_key, value), - } - } - } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> bool { - selectors.map_or(true, |selectors| { - selectors.iter().any(|selector| { - contained_in(selector, &field_name) || contained_in(&field_name, selector) - }) - }) && !skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) - }) - } -} - #[cfg(test)] mod test { use charabia::TokenizerBuilder; @@ -274,6 +177,8 @@ mod test { use serde_json::json; use super::*; + use crate::FieldsIdsMap; + #[test] fn test_tokenize_document() { 
         let mut fields_ids_map = FieldsIdsMap::new();
@@ -329,6 +234,7 @@
         document_tokenizer
             .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| {
                 words.insert([fid, pos], word.to_string());
+                Ok(())
             })
             .unwrap();
 
diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs
index 3b1fc97c5..21e28fc84 100644
--- a/milli/src/update/new/indexer/mod.rs
+++ b/milli/src/update/new/indexer/mod.rs
@@ -127,6 +127,19 @@ where
                     &extractor_sender,
                 )?;
 
+                // TODO THIS IS TOO MUCH
+                // Extract fieldid docid facet number
+                // Extract fieldid docid facet string
+                // Extract facetid string fst
+
+                // Extract fieldid facet isempty docids
+                // Extract fieldid facet isnull docids
+                // Extract fieldid facet exists docids
+
+                // TODO This is the normal system
+                // Extract fieldid facet number docids
+                // Extract fieldid facet string docids
+
                 Ok(()) as Result<_>
             })
         })?;
diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs
index 35449b475..25f09441c 100644
--- a/milli/src/update/new/merger.rs
+++ b/milli/src/update/new/merger.rs
@@ -1,7 +1,5 @@
 use std::fs::File;
-use std::io;
 
-use fst::set::OpBuilder;
 use fst::{Set, SetBuilder};
 use grenad::Merger;
 use heed::types::Bytes;
@@ -15,7 +13,6 @@ use super::channel::{
     WordFidDocids, WordPositionDocids,
 };
 use super::KvReaderDelAdd;
-use crate::index::main_key::WORDS_FST_KEY;
 use crate::update::del_add::DelAdd;
 use crate::update::new::channel::MergerOperation;
 use crate::update::MergeDeladdCboRoaringBitmaps;
@@ -210,7 +207,7 @@ fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec
 }
 
 /// TODO Return the slice directly from the serialize_into method
-fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) {
+fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) {
     buffer.clear();
     bitmap.serialize_into(buffer).unwrap();
     // buffer.as_slice()
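
Both `extract_document_facets` and `DocumentTokenizer::tokenize_document` decide which fields to visit through `perm_json_p::select_field`, which keeps a field when it lies on the path of a selector in either direction and is not covered by a skip selector. The standalone sketch below (not part of the patch) illustrates those rules; the expected values follow the `contained_in` doc comment and the bodies of `contained_in`/`select_field` above, and it assumes it is placed somewhere both functions are in scope, for example a child `#[cfg(test)]` module of `perm_json_p`.

    #[test]
    fn selector_semantics_sketch() {
        // A selector matches its own key and any parent key on its path.
        assert!(contained_in("animaux.chien.nom", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux.chien"));
        // It does not match deeper keys or prefixes that stop mid-segment.
        assert!(!contained_in("animaux", "animaux.chien"));
        assert!(!contained_in("animau", "animaux"));

        // `select_field` keeps a field related to a selector in either direction,
        let selectors: &[&str] = &["animaux.chien"];
        assert!(select_field("animaux", Some(selectors), &[]));
        assert!(select_field("animaux.chien.nom", Some(selectors), &[]));
        // drops a field covered by a skip selector,
        assert!(!select_field("animaux.chien.nom", Some(selectors), &["animaux.chien"]));
        // and keeps everything when no selector is given.
        assert!(select_field("anything", None, &[]));
    }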
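The string facet keys built by `FieldIdFacetStringDocidsExtractor` are passed through `truncate_str`, which keeps the value within the biggest valid LMDB key size (`MAX_FACET_VALUE_LENGTH`) without ever splitting a UTF-8 character. The snippet below is a small self-contained sketch of that truncation rule; `truncate_str_to` and its `max` parameter are illustrative stand-ins for the real function and constant so the example can use a tiny budget.

    // Keep the longest prefix that fits in `max` bytes and ends on a char boundary.
    fn truncate_str_to(s: &str, max: usize) -> &str {
        let index = s
            .char_indices()
            .map(|(idx, _)| idx)
            .chain(std::iter::once(s.len()))
            .take_while(|idx| *idx <= max)
            .last();
        &s[..index.unwrap_or(0)]
    }

    fn main() {
        // "é" is two bytes: a 3-byte budget keeps "aé" but cannot include the second "é".
        assert_eq!(truncate_str_to("aéé", 3), "aé");
        // A string that already fits is returned unchanged.
        assert_eq!(truncate_str_to("abc", 16), "abc");
    }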