From bc7b0a38fd4fa27829508f159f4c1696a8471ce2 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Fri, 22 May 2020 15:00:50 +0200
Subject: [PATCH] Use fst 0.4.4 in the project

fst 0.4 builds its sets and maps over any `AsRef<[u8]>` storage, so the
stores can now borrow FST bytes straight out of LMDB (`fst::Set::new` plus
`map_data(Cow::Borrowed)`) instead of copying them into an `Arc` through
`from_shared_bytes`, and the unsafe `static_words_fst` transmute disappears.
Builders now finish with `into_set`/`into_map`, and the store accessors
return an empty set instead of an `Option` when nothing is stored.
---
 meilisearch-core/src/bucket_sort.rs        | 16 +---
 meilisearch-core/src/lib.rs                |  8 +-
 meilisearch-core/src/query_builder.rs      | 29 +++----
 meilisearch-core/src/query_tree.rs         | 10 +--
 meilisearch-core/src/raw_indexer.rs        | 38 ++++----
 meilisearch-core/src/store/docs_words.rs   | 29 +++----
 meilisearch-core/src/store/main.rs         | 87 +++++++------------
 meilisearch-core/src/store/synonyms.rs     | 29 +++----
 .../src/update/documents_addition.rs       | 59 ++++++-------
 .../src/update/documents_deletion.rs       | 28 +++---
 meilisearch-core/src/update/helpers.rs     | 11 +--
 meilisearch-core/src/update/mod.rs         | 14 +--
 .../src/update/settings_update.rs          | 40 ++++-----
 meilisearch-http/src/routes/setting.rs     | 10 +--
 meilisearch-http/src/routes/stop_words.rs  |  2 +-
 meilisearch-http/src/routes/synonym.rs     |  9 +-
 16 files changed, 178 insertions(+), 241 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 4fd6b3080..b991745ba 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -79,12 +79,8 @@ where
     let mut result = SortResult::default();
 
-    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(SortResult::default()),
-    };
-
-    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
+    let words_set = main_store.words_fst(reader)?;
+    let stop_words = main_store.stop_words_fst(reader)?;
 
     let context = QTContext {
         words_set,
@@ -230,12 +226,8 @@ where
 {
     let mut result = SortResult::default();
 
-    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(SortResult::default()),
-    };
-
-    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
+    let words_set = main_store.words_fst(reader)?;
+    let stop_words = main_store.stop_words_fst(reader)?;
 
     let context = QTContext {
         words_set,
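A note on the pattern that replaces the two removed accessors above, sketched
standalone (assumes only fst 0.4; `load_set` is an illustrative helper, not
code from this patch):

    use std::borrow::Cow;

    // Zero-copy when the key exists: the set borrows the LMDB page bytes.
    // Otherwise an empty set owning its data is returned, so callers no
    // longer unwrap an Option or transmute bytes to a 'static lifetime.
    fn load_set(bytes: Option<&[u8]>) -> fst::Set<Cow<'_, [u8]>> {
        match bytes {
            Some(bytes) => fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap(),
            None => fst::Set::default().map_data(Cow::Owned).unwrap(),
        }
    }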
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 1a4b0d2e6..a9938bb73 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -38,16 +38,20 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
 pub use meilisearch_schema::Schema;
 pub use query_words_mapper::QueryWordsMapper;
 
-use std::convert::TryFrom;
-use std::collections::HashMap;
 use compact_arena::SmallArena;
 use log::{error, trace};
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::convert::TryFrom;
 
 use crate::bucket_sort::PostingsListView;
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::query_tree::{QueryId, QueryKind};
 use crate::reordered_attrs::ReorderedAttrs;
 
+pub type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;
+pub type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>;
+
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Document {
     pub id: DocumentId,
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index d1b35273d..c9ed1933e 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -186,7 +186,7 @@ mod tests {
     use std::collections::{BTreeSet, HashMap};
     use std::iter::FromIterator;
 
-    use fst::{IntoStreamer, Set};
+    use fst::IntoStreamer;
     use meilisearch_schema::IndexedPos;
     use sdset::SetBuf;
     use tempfile::TempDir;
@@ -199,21 +199,21 @@ mod tests {
     use crate::store::Index;
     use meilisearch_schema::Schema;
 
-    fn set_from_stream<'f, I, S>(stream: I) -> Set
+    fn set_from_stream<'f, I, S>(stream: I) -> fst::Set<Vec<u8>>
     where
         I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
         S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
     {
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(stream).unwrap();
-        builder.into_inner().and_then(Set::from_bytes).unwrap()
+        builder.into_set()
     }
 
-    fn insert_key(set: &Set, key: &[u8]) -> Set {
+    fn insert_key<A: AsRef<[u8]>>(set: &fst::Set<A>, key: &[u8]) -> fst::Set<Vec<u8>> {
         let unique_key = {
             let mut builder = fst::SetBuilder::memory();
             builder.insert(key).unwrap();
-            builder.into_inner().and_then(Set::from_bytes).unwrap()
+            builder.into_set()
         };
 
         let union_ = set.op().add(unique_key.into_stream()).r#union();
@@ -221,11 +221,11 @@ mod tests {
         set_from_stream(union_)
     }
 
-    fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
+    fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set<Vec<u8>> {
         let mut builder = fst::SetBuilder::memory();
         let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
         builder.extend_iter(set.into_iter()).unwrap();
-        builder.into_inner().and_then(Set::from_bytes).unwrap()
+        builder.into_set()
     }
 
     const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
@@ -265,15 +265,11 @@ mod tests {
 
                 let word = normalize_str(word);
 
-                let alternatives = match self
+                let alternatives = self
                     .index
                     .synonyms
                     .synonyms(&writer, word.as_bytes())
-                    .unwrap()
-                {
-                    Some(alternatives) => alternatives,
-                    None => fst::Set::default(),
-                };
+                    .unwrap();
 
                 let new = sdset_into_fstset(&new);
                 let new_alternatives =
@@ -283,10 +279,7 @@ mod tests {
                     .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
                     .unwrap();
 
-                let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
-                    Some(synonyms) => synonyms,
-                    None => fst::Set::default(),
-                };
+                let synonyms = self.index.main.synonyms_fst(&writer).unwrap();
 
                 let synonyms_fst = insert_key(&synonyms, word.as_bytes());
                 self.index
@@ -339,7 +332,7 @@ mod tests {
 
             index.main.put_schema(&mut writer, &schema).unwrap();
 
-            let words_fst = Set::from_iter(words_fst).unwrap();
+            let words_fst = fst::Set::from_iter(words_fst).unwrap();
 
             index.main.put_words_fst(&mut writer, &words_fst).unwrap();
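The builder change repeated throughout these tests, as a standalone sketch
(fst 0.4; the function is illustrative): the fst 0.3 round-trip through raw
bytes, `builder.into_inner().and_then(Set::from_bytes)`, becomes a direct
call to `SetBuilder::into_set`.

    // Keys must be inserted in lexicographic order, as with any fst builder.
    fn build_small_set() -> fst::Set<Vec<u8>> {
        let mut builder = fst::SetBuilder::memory();
        builder.insert("engine").unwrap();
        builder.insert("search").unwrap();
        builder.into_set()
    }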
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index 319b997a0..2687028a0 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -12,7 +12,7 @@ use sdset::{Set, SetBuf, SetOperation};
 use log::debug;
 
 use crate::database::MainT;
-use crate::{store, DocumentId, DocIndex, MResult};
+use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
 use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
 use crate::QueryWordsMapper;
 
@@ -112,9 +112,9 @@ pub struct PostingsList {
     matches: SetBuf<DocIndex>,
 }
 
-pub struct Context {
-    pub words_set: fst::Set,
-    pub stop_words: fst::Set,
+pub struct Context<'a> {
+    pub words_set: FstSetCow<'a>,
+    pub stop_words: FstSetCow<'a>,
     pub synonyms: store::Synonyms,
     pub postings_lists: store::PostingsLists,
     pub prefix_postings_lists: store::PrefixPostingsListsCache,
@@ -147,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
 fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
     let words = normalize_str(&words.join(" "));
 
-    let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
+    let set = ctx.synonyms.synonyms(reader, words.as_bytes())?;
 
     let mut strings = Vec::new();
     let mut stream = set.stream();
diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs
index 8ed5966bb..1cc06f441 100644
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -1,34 +1,37 @@
+use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 
-use crate::{DocIndex, DocumentId};
 use deunicode::deunicode_with_tofu;
 use meilisearch_schema::IndexedPos;
 use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
 use sdset::SetBuf;
 
+use crate::{DocIndex, DocumentId};
+use crate::FstSetCow;
+
 const WORD_LENGTH_LIMIT: usize = 80;
 
 type Word = Vec<u8>; // TODO make it be a SmallVec<u8>
 
-pub struct RawIndexer {
+pub struct RawIndexer<A> {
     word_limit: usize, // the maximum number of indexed words
-    stop_words: fst::Set,
+    stop_words: fst::Set<A>,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
 
-pub struct Indexed {
+pub struct Indexed<'a> {
     pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
-    pub docs_words: HashMap<DocumentId, fst::Set>,
+    pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
 }
 
-impl RawIndexer {
-    pub fn new(stop_words: fst::Set) -> RawIndexer {
+impl<A> RawIndexer<A> {
+    pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
         RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
         RawIndexer {
             word_limit: limit,
             stop_words,
@@ -36,7 +39,9 @@ impl RawIndexer {
             docs_words: HashMap::new(),
         }
     }
+}
 
+impl<A: AsRef<[u8]>> RawIndexer<A> {
     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
         let mut number_of_words = 0;
 
@@ -61,9 +66,9 @@ impl RawIndexer {
         number_of_words
     }
 
-    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
+    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
     where
-        I: IntoIterator<Item = &'a str>,
+        I: IntoIterator<Item = &'s str>,
     {
         let iter = iter.into_iter();
         for token in SeqTokenizer::new(iter) {
@@ -83,7 +88,7 @@ impl RawIndexer {
         }
     }
 
-    pub fn build(self) -> Indexed {
+    pub fn build(self) -> Indexed<'static> {
         let words_doc_indexes = self
             .words_doc_indexes
             .into_iter()
@@ -96,7 +101,8 @@ impl RawIndexer {
             .map(|(id, mut words)| {
                 words.sort_unstable();
                 words.dedup();
-                (id, fst::Set::from_iter(words).unwrap())
+                let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap();
+                (id, fst)
             })
             .collect();
 
@@ -107,15 +113,17 @@ impl RawIndexer {
     }
 }
 
-fn index_token(
+fn index_token<A>(
     token: Token,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
-    stop_words: &fst::Set,
+    stop_words: &fst::Set<A>,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-) -> bool {
+) -> bool
+where A: AsRef<[u8]>,
+{
     if token.word_index >= word_limit {
         return false;
     }
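`RawIndexer` is now generic over the storage behind its stop words. A
hypothetical usage sketch (not from the patch; assumes the crate exposes
`RawIndexer` publicly), showing that owned and borrowed sets both fit:

    use std::borrow::Cow;
    // use meilisearch_core::raw_indexer::RawIndexer; // path illustrative

    // An indexer over owned bytes, e.g. for tests: Set::default() is empty and owned.
    fn owned_indexer() -> RawIndexer<Vec<u8>> {
        RawIndexer::new(fst::Set::default())
    }

    // An indexer borrowing stop words read from LMDB, with no copy at all.
    fn borrowed_indexer<'a>(stop_words: fst::Set<Cow<'a, [u8]>>) -> RawIndexer<Cow<'a, [u8]>> {
        RawIndexer::new(stop_words)
    }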
diff --git a/meilisearch-core/src/store/docs_words.rs b/meilisearch-core/src/store/docs_words.rs
index 1e33a9bb0..11e81f673 100644
--- a/meilisearch-core/src/store/docs_words.rs
+++ b/meilisearch-core/src/store/docs_words.rs
@@ -1,9 +1,11 @@
-use super::BEU32;
-use crate::database::MainT;
-use crate::DocumentId;
-use heed::types::{ByteSlice, OwnedType};
+use std::borrow::Cow;
+
 use heed::Result as ZResult;
-use std::sync::Arc;
+use heed::types::{ByteSlice, OwnedType};
+
+use crate::database::MainT;
+use crate::{DocumentId, FstSetCow};
+use super::BEU32;
 
 #[derive(Copy, Clone)]
 pub struct DocsWords {
@@ -15,7 +17,7 @@ impl DocsWords {
         self,
         writer: &mut heed::RwTxn<MainT>,
         document_id: DocumentId,
-        words: &fst::Set,
+        words: &FstSetCow,
     ) -> ZResult<()> {
         let document_id = BEU32::new(document_id.0);
         let bytes = words.as_fst().as_bytes();
@@ -31,20 +33,11 @@ impl DocsWords {
         self.docs_words.clear(writer)
     }
 
-    pub fn doc_words(
-        self,
-        reader: &heed::RoTxn<MainT>,
-        document_id: DocumentId,
-    ) -> ZResult<Option<fst::Set>> {
+    pub fn doc_words(self, reader: &heed::RoTxn<MainT>, document_id: DocumentId) -> ZResult<FstSetCow> {
         let document_id = BEU32::new(document_id.0);
         match self.docs_words.get(reader, &document_id)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 }
diff --git a/meilisearch-core/src/store/main.rs b/meilisearch-core/src/store/main.rs
index d1f7a522c..ba3c259a3 100644
--- a/meilisearch-core/src/store/main.rs
+++ b/meilisearch-core/src/store/main.rs
@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-use std::sync::Arc;
 use std::collections::HashMap;
 
 use chrono::{DateTime, Utc};
@@ -12,6 +11,7 @@ use sdset::Set;
 use crate::database::MainT;
 use crate::RankedMap;
 use crate::settings::RankingRule;
+use crate::{FstSetCow, FstMapCow};
 use super::{CowSet, DocumentsIds};
 
 const ATTRIBUTES_FOR_FACETING_KEY: &str = "attributes-for-faceting";
@@ -103,11 +103,15 @@ impl Main {
         self.put_internal_docids(writer, &internal_docids)
     }
 
-    pub fn put_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
+    pub fn put_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes())
     }
 
-    pub fn merge_external_docids(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map) -> ZResult<()> {
+    pub fn merge_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         use fst::{Streamer, IntoStreamer};
 
         // Do an union of the old and the new set of external docids.
@@ -117,13 +121,15 @@ impl Main {
         while let Some((docid, values)) = op.next() {
             build.insert(docid, values[0].value).unwrap();
         }
-        let external_docids = build.into_inner().unwrap();
+        drop(op);
 
-        // TODO prefer using self.put_user_ids
-        self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
+        let external_docids = build.into_map();
+        self.put_external_docids(writer, &external_docids)
     }
 
-    pub fn remove_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
+    pub fn remove_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         use fst::{Streamer, IntoStreamer};
 
         // Do an union of the old and the new set of external docids.
@@ -133,21 +139,16 @@ impl Main {
         while let Some((docid, values)) = op.next() {
             build.insert(docid, values[0].value).unwrap();
         }
-        let external_docids = build.into_inner().unwrap();
+        drop(op);
 
-        // TODO prefer using self.put_external_docids
-        self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
+        let external_docids = build.into_map();
+        self.put_external_docids(writer, &external_docids)
     }
 
-    pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<fst::Map> {
+    pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstMapCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(fst::Map::from(fst))
-            },
-            None => Ok(fst::Map::default()),
+            Some(bytes) => Ok(fst::Map::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Map::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
@@ -156,30 +157,14 @@ impl Main {
         Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32)))
     }
 
-    pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes())
     }
 
-    pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
-            Some(bytes) => {
-                let bytes: &'static [u8] = std::mem::transmute(bytes);
-                let set = fst::Set::from_static_slice(bytes).unwrap();
-                Ok(Some(set))
-            },
-            None => Ok(None),
-        }
-    }
-
-    pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
-        match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            },
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
@@ -203,37 +188,27 @@ impl Main {
         self.main.get::<_, Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
     }
 
-    pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_synonyms_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         let bytes = fst.as_fst().as_bytes();
         self.main.put::<_, Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
     }
 
-    pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, SYNONYMS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
-    pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_stop_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         let bytes = fst.as_fst().as_bytes();
         self.main.put::<_, Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
     }
 
-    pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
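The merge now performed by `merge_external_docids`, as a self-contained
sketch (fst 0.4; `merge_maps` is illustrative, not patch code). The union
stream borrows its inputs, which is why the patch adds an explicit
`drop(op)` before reusing the write transaction:

    use fst::Streamer;

    fn merge_maps(old: &fst::Map<Vec<u8>>, new: &fst::Map<Vec<u8>>) -> fst::Map<Vec<u8>> {
        let mut op = fst::map::OpBuilder::new().add(old).add(new).r#union();

        let mut build = fst::MapBuilder::memory();
        // The union yields keys in sorted order; `values` holds one entry per
        // input map containing the key, and we keep the first, as the patch does.
        while let Some((key, values)) = op.next() {
            build.insert(key, values[0].value).unwrap();
        }
        drop(op); // ends the borrow of `old` and `new`

        build.into_map()
    }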
diff --git a/meilisearch-core/src/store/synonyms.rs b/meilisearch-core/src/store/synonyms.rs
index 75f7610eb..1ec8d313c 100644
--- a/meilisearch-core/src/store/synonyms.rs
+++ b/meilisearch-core/src/store/synonyms.rs
@@ -1,7 +1,10 @@
-use heed::types::ByteSlice;
-use crate::database::MainT;
+use std::borrow::Cow;
+
 use heed::Result as ZResult;
-use std::sync::Arc;
+use heed::types::ByteSlice;
+
+use crate::database::MainT;
+use crate::FstSetCow;
 
 #[derive(Copy, Clone)]
 pub struct Synonyms {
@@ -9,12 +12,9 @@ pub struct Synonyms {
 }
 
 impl Synonyms {
-    pub fn put_synonyms(
-        self,
-        writer: &mut heed::RwTxn<MainT>,
-        word: &[u8],
-        synonyms: &fst::Set,
-    ) -> ZResult<()> {
+    pub fn put_synonyms<A>(self, writer: &mut heed::RwTxn<MainT>, word: &[u8], synonyms: &fst::Set<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         let bytes = synonyms.as_fst().as_bytes();
         self.synonyms.put(writer, word, bytes)
     }
@@ -27,15 +27,10 @@ impl Synonyms {
         self.synonyms.clear(writer)
     }
 
-    pub fn synonyms(self, reader: &heed::RoTxn<MainT>, word: &[u8]) -> ZResult<Option<fst::Set>> {
+    pub fn synonyms<'txn>(self, reader: &'txn heed::RoTxn<MainT>, word: &[u8]) -> ZResult<FstSetCow<'txn>> {
         match self.synonyms.get(reader, word)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 }
diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs
index 6858f6d2c..668ce5613 100644
--- a/meilisearch-core/src/update/documents_addition.rs
+++ b/meilisearch-core/src/update/documents_addition.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::{HashMap, BTreeMap};
 
 use fst::{set::OpBuilder, SetBuilder};
@@ -108,17 +109,18 @@ pub fn push_documents_addition<D: serde::Serialize>(
     Ok(last_update_id)
 }
 
-fn index_document(
+fn index_document<A>(
     writer: &mut heed::RwTxn<MainT>,
     documents_fields: DocumentsFields,
     documents_fields_counts: DocumentsFieldsCounts,
     ranked_map: &mut RankedMap,
-    indexer: &mut RawIndexer,
+    indexer: &mut RawIndexer<A>,
     schema: &Schema,
     field_id: FieldId,
     document_id: DocumentId,
     value: &Value,
 ) -> MResult<()>
+where A: AsRef<[u8]>,
 {
     let serialized = serde_json::to_vec(value)?;
     documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@@ -208,10 +210,7 @@ pub fn apply_addition<'a, 'b>(
         None => RankedMap::default(),
     };
 
-    let stop_words = match index.main.stop_words_fst(writer)? {
-        Some(stop_words) => stop_words,
-        None => fst::Set::default(),
-    };
+    let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
 
     // 3. index the documents fields in the stores
     if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
@@ -297,10 +296,10 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
     index.postings_lists.clear(writer)?;
     index.docs_words.clear(writer)?;
 
-    let stop_words = match index.main.stop_words_fst(writer)? {
-        Some(stop_words) => stop_words,
-        None => fst::Set::default(),
-    };
+    let stop_words = index.main
+        .stop_words_fst(writer)?
+        .map_data(Cow::into_owned)
+        .unwrap();
 
     let number_of_inserted_documents = documents_ids_to_reindex.len();
     let mut indexer = RawIndexer::new(stop_words);
@@ -348,13 +347,15 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
     Ok(())
 }
 
-pub fn write_documents_addition_index(
+pub fn write_documents_addition_index<A>(
     writer: &mut heed::RwTxn<MainT>,
     index: &store::Index,
     ranked_map: &RankedMap,
     number_of_inserted_documents: usize,
-    indexer: RawIndexer,
-) -> MResult<()> {
+    indexer: RawIndexer<A>,
+) -> MResult<()>
+where A: AsRef<[u8]>,
+{
     let indexed = indexer.build();
     let mut delta_words_builder = SetBuilder::memory();
 
@@ -373,33 +374,27 @@ pub fn write_documents_addition_index(
         index.docs_words.put_doc_words(writer, id, &words)?;
     }
 
-    let delta_words = delta_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)
-        .unwrap();
+    let delta_words = delta_words_builder.into_set();
 
-    let words = match index.main.words_fst(writer)? {
-        Some(words) => {
-            let op = OpBuilder::new()
-                .add(words.stream())
-                .add(delta_words.stream())
-                .r#union();
+    let words_fst = index.main.words_fst(writer)?;
+    let words = if !words_fst.is_empty() {
+        let op = OpBuilder::new()
+            .add(words_fst.stream())
+            .add(delta_words.stream())
+            .r#union();
 
-            let mut words_builder = SetBuilder::memory();
-            words_builder.extend_stream(op).unwrap();
-            words_builder
-                .into_inner()
-                .and_then(fst::Set::from_bytes)
-                .unwrap()
-        }
-        None => delta_words,
+        let mut words_builder = SetBuilder::memory();
+        words_builder.extend_stream(op).unwrap();
+        words_builder.into_set()
+    } else {
+        delta_words
     };
 
     index.main.put_words_fst(writer, &words)?;
     index.main.put_ranked_map(writer, ranked_map)?;
     index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
 
-    compute_short_prefixes(writer, index)?;
+    compute_short_prefixes(writer, &words, index)?;
 
     Ok(())
 }
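`apply_addition` keeps its indexer alive while it writes, so the borrowed
stop-words set is first turned into an owning one via
`map_data(Cow::into_owned)`. A minimal sketch of that conversion
(standalone; fst 0.4):

    use std::borrow::Cow;

    // Cow::into_owned copies the bytes only when they were actually borrowed;
    // the resulting set no longer depends on the read transaction it came from.
    fn into_owned_set(set: fst::Set<Cow<'_, [u8]>>) -> fst::Set<Vec<u8>> {
        set.map_data(Cow::into_owned).unwrap()
    }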
diff --git a/meilisearch-core/src/update/documents_deletion.rs b/meilisearch-core/src/update/documents_deletion.rs
index a45021cae..8b31170ae 100644
--- a/meilisearch-core/src/update/documents_deletion.rs
+++ b/meilisearch-core/src/update/documents_deletion.rs
@@ -114,7 +114,8 @@ pub fn apply_documents_deletion(
             ranked_map.remove(id, *ranked_attr);
         }
 
-        if let Some(words) = index.docs_words.doc_words(writer, id)? {
+        let words = index.docs_words.doc_words(writer, id)?;
+        if !words.is_empty() {
             let mut stream = words.stream();
             while let Some(word) = stream.next() {
                 let word = word.to_vec();
@@ -157,21 +158,16 @@ pub fn apply_documents_deletion(
     }
 
     let removed_words = fst::Set::from_iter(removed_words).unwrap();
-    let words = match index.main.words_fst(writer)? {
-        Some(words_set) => {
-            let op = fst::set::OpBuilder::new()
-                .add(words_set.stream())
-                .add(removed_words.stream())
-                .difference();
+    let words = {
+        let words_set = index.main.words_fst(writer)?;
+        let op = fst::set::OpBuilder::new()
+            .add(words_set.stream())
+            .add(removed_words.stream())
+            .difference();
 
-            let mut words_builder = SetBuilder::memory();
-            words_builder.extend_stream(op).unwrap();
-            words_builder
-                .into_inner()
-                .and_then(fst::Set::from_bytes)
-                .unwrap()
-        }
-        None => fst::Set::default(),
+        let mut words_builder = SetBuilder::memory();
+        words_builder.extend_stream(op).unwrap();
+        words_builder.into_set()
     };
 
     index.main.put_words_fst(writer, &words)?;
@@ -182,7 +178,7 @@ pub fn apply_documents_deletion(
     index.main.remove_external_docids(writer, &external_docids)?;
     index.main.remove_internal_docids(writer, &internal_docids)?;
 
-    compute_short_prefixes(writer, index)?;
+    compute_short_prefixes(writer, &words, index)?;
 
     Ok(())
 }
diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs
index c7bd05cec..6e8902182 100644
--- a/meilisearch-core/src/update/helpers.rs
+++ b/meilisearch-core/src/update/helpers.rs
@@ -6,18 +6,19 @@ use meilisearch_types::DocumentId;
 use ordered_float::OrderedFloat;
 use serde_json::Value;
 
-use crate::Number;
+use crate::{Number, FstMapCow};
 use crate::raw_indexer::RawIndexer;
 use crate::serde::SerializerError;
 use crate::store::DiscoverIds;
 
 /// Returns the number of words indexed or `None` if the type is unindexable.
-pub fn index_value(
-    indexer: &mut RawIndexer,
+pub fn index_value<A>(
+    indexer: &mut RawIndexer<A>,
     document_id: DocumentId,
     indexed_pos: IndexedPos,
     value: &Value,
 ) -> Option<usize>
+where A: AsRef<[u8]>,
 {
     match value {
         Value::Null => None,
@@ -99,7 +100,7 @@ pub fn value_to_number(value: &Value) -> Option<Number> {
 /// the corresponding id or generate a new one, this is the way we produce documents ids.
 pub fn discover_document_id(
     docid: &str,
-    external_docids: &fst::Map,
+    external_docids: &FstMapCow,
     available_docids: &mut DiscoverIds<'_>,
 ) -> Result<DocumentId, SerializerError>
 {
@@ -120,7 +121,7 @@ pub fn discover_document_id(
 pub fn extract_document_id(
     primary_key: &str,
     document: &IndexMap<String, Value>,
-    external_docids: &fst::Map,
+    external_docids: &FstMapCow,
     available_docids: &mut DiscoverIds<'_>,
 ) -> Result<(DocumentId, String), SerializerError>
 {
diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs
index d2d771030..0a71f1b45 100644
--- a/meilisearch-core/src/update/mod.rs
+++ b/meilisearch-core/src/update/mod.rs
@@ -297,13 +297,13 @@ pub fn update_task<'a, 'b>(
     Ok(status)
 }
 
-fn compute_short_prefixes(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
-    // retrieve the words fst to compute all those prefixes
-    let words_fst = match index.main.words_fst(writer)? {
-        Some(fst) => fst,
-        None => return Ok(()),
-    };
-
+fn compute_short_prefixes<A>(
+    writer: &mut heed::RwTxn<MainT>,
+    words_fst: &fst::Set<A>,
+    index: &store::Index,
+) -> MResult<()>
+where A: AsRef<[u8]>,
+{
     // clear the prefixes
     let pplc_store = index.prefix_postings_lists_cache;
     pplc_store.clear(writer)?;
diff --git a/meilisearch-core/src/update/settings_update.rs b/meilisearch-core/src/update/settings_update.rs
index b4bf6c125..2717d7aa5 100644
--- a/meilisearch-core/src/update/settings_update.rs
+++ b/meilisearch-core/src/update/settings_update.rs
@@ -168,7 +168,6 @@ pub fn apply_stop_words_update(
 
     let old_stop_words: BTreeSet<String> = index.main
         .stop_words_fst(writer)?
-        .unwrap_or_default()
        .stream()
        .into_strs()?
        .into_iter()
        .collect();
@@ -186,7 +185,8 @@ pub fn apply_stop_words_update(
         apply_stop_words_deletion(writer, index, deletion)?;
     }
 
-    if let Some(words_fst) = index.main.words_fst(writer)? {
+    let words_fst = index.main.words_fst(writer)?;
+    if !words_fst.is_empty() {
         let stop_words = fst::Set::from_iter(stop_words)?;
         let op = OpBuilder::new()
             .add(&words_fst)
@@ -195,7 +195,7 @@ pub fn apply_stop_words_update(
 
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(op)?;
-        let words_fst = builder.into_inner().and_then(fst::Set::from_bytes)?;
+        let words_fst = builder.into_set();
 
         index.main.put_words_fst(writer, &words_fst)?;
         index.main.put_stop_words_fst(writer, &stop_words)?;
@@ -222,28 +222,25 @@ fn apply_stop_words_addition(
     }
 
     // create the new delta stop words fst
-    let delta_stop_words = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let delta_stop_words = stop_words_builder.into_set();
 
     // we also need to remove all the stop words from the main fst
-    if let Some(word_fst) = main_store.words_fst(writer)? {
+    let words_fst = main_store.words_fst(writer)?;
+    if !words_fst.is_empty() {
         let op = OpBuilder::new()
-            .add(&word_fst)
+            .add(&words_fst)
             .add(&delta_stop_words)
             .difference();
 
         let mut word_fst_builder = SetBuilder::memory();
         word_fst_builder.extend_stream(op)?;
-        let word_fst = word_fst_builder
-            .into_inner()
-            .and_then(fst::Set::from_bytes)?;
+        let word_fst = word_fst_builder.into_set();
 
         main_store.put_words_fst(writer, &word_fst)?;
     }
 
     // now we add all of these stop words from the main store
-    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+    let stop_words_fst = main_store.stop_words_fst(writer)?;
 
     let op = OpBuilder::new()
         .add(&stop_words_fst)
@@ -252,9 +249,7 @@ fn apply_stop_words_addition(
 
     let mut stop_words_builder = SetBuilder::memory();
     stop_words_builder.extend_stream(op)?;
-    let stop_words_fst = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let stop_words_fst = stop_words_builder.into_set();
 
     main_store.put_stop_words_fst(writer, &stop_words_fst)?;
 
@@ -274,12 +269,10 @@ fn apply_stop_words_deletion(
     }
 
     // create the new delta stop words fst
-    let delta_stop_words = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let delta_stop_words = stop_words_builder.into_set();
 
     // now we delete all of these stop words from the main store
-    let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
+    let stop_words_fst = index.main.stop_words_fst(writer)?;
 
     let op = OpBuilder::new()
         .add(&stop_words_fst)
@@ -288,7 +281,7 @@ fn apply_stop_words_deletion(
 
     let mut stop_words_builder = SetBuilder::memory();
     stop_words_builder.extend_stream(op)?;
-    let stop_words_fst = stop_words_builder.into_inner().and_then(fst::Set::from_bytes)?;
+    let stop_words_fst = stop_words_builder.into_set();
 
     Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
 }
@@ -311,16 +304,13 @@ pub fn apply_synonyms_update(
             let alternatives = SetBuf::from_dirty(alternatives);
             let mut alternatives_builder = SetBuilder::memory();
             alternatives_builder.extend_iter(alternatives)?;
-            let bytes = alternatives_builder.into_inner()?;
-            fst::Set::from_bytes(bytes)?
+            alternatives_builder.into_set()
         };
 
         synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
     }
 
-    let synonyms_set = synonyms_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let synonyms_set = synonyms_builder.into_set();
 
     main_store.put_synonyms_fst(writer, &synonyms_set)?;
diff --git a/meilisearch-http/src/routes/setting.rs b/meilisearch-http/src/routes/setting.rs
index 96e6935e7..04ed8ceed 100644
--- a/meilisearch-http/src/routes/setting.rs
+++ b/meilisearch-http/src/routes/setting.rs
@@ -63,20 +63,18 @@ async fn get_all(
     let reader = data.db.main_read_txn()?;
 
     let stop_words_fst = index.main.stop_words_fst(&reader)?;
-    let stop_words = stop_words_fst.unwrap_or_default().stream().into_strs()?;
+    let stop_words = stop_words_fst.stream().into_strs()?;
     let stop_words: BTreeSet<String> = stop_words.into_iter().collect();
 
-    let synonyms_fst = index.main.synonyms_fst(&reader)?.unwrap_or_default();
+    let synonyms_fst = index.main.synonyms_fst(&reader)?;
     let synonyms_list = synonyms_fst.stream().into_strs()?;
 
     let mut synonyms = BTreeMap::new();
     let index_synonyms = &index.synonyms;
     for synonym in synonyms_list {
         let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
-        if let Some(list) = alternative_list {
-            let list = list.stream().into_strs()?;
-            synonyms.insert(synonym, list);
-        }
+        let list = alternative_list.stream().into_strs()?;
+        synonyms.insert(synonym, list);
     }
 
     let ranking_rules = index
diff --git a/meilisearch-http/src/routes/stop_words.rs b/meilisearch-http/src/routes/stop_words.rs
index 21fc9281f..90814e423 100644
--- a/meilisearch-http/src/routes/stop_words.rs
+++ b/meilisearch-http/src/routes/stop_words.rs
@@ -26,7 +26,7 @@ async fn get(
         .ok_or(ResponseError::index_not_found(&path.index_uid))?;
     let reader = data.db.main_read_txn()?;
     let stop_words_fst = index.main.stop_words_fst(&reader)?;
-    let stop_words = stop_words_fst.unwrap_or_default().stream().into_strs()?;
+    let stop_words = stop_words_fst.stream().into_strs()?;
 
     Ok(HttpResponse::Ok().json(stop_words))
 }
diff --git a/meilisearch-http/src/routes/synonym.rs b/meilisearch-http/src/routes/synonym.rs
index 973f591ab..6c9e77f6d 100644
--- a/meilisearch-http/src/routes/synonym.rs
+++ b/meilisearch-http/src/routes/synonym.rs
@@ -29,18 +29,15 @@ async fn get(
 
     let reader = data.db.main_read_txn()?;
 
-    let synonyms_fst = index.main.synonyms_fst(&reader)?.unwrap_or_default();
+    let synonyms_fst = index.main.synonyms_fst(&reader)?;
     let synonyms_list = synonyms_fst.stream().into_strs()?;
 
     let mut synonyms = IndexMap::new();
     let index_synonyms = &index.synonyms;
     for synonym in synonyms_list {
         let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
-
-        if let Some(list) = alternative_list {
-            let list = list.stream().into_strs()?;
-            synonyms.insert(synonym, list);
-        }
+        let list = alternative_list.stream().into_strs()?;
+        synonyms.insert(synonym, list);
     }
 
     Ok(HttpResponse::Ok().json(synonyms))