From 106b88687344f2b9d9db7b6057bc21f376d598b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Dec 2019 12:27:24 +0100 Subject: [PATCH] Cache the prefix postings lists --- .../src/update/documents_addition.rs | 52 ++++++++++++++++-- meilisearch-core/src/update/mod.rs | 54 +------------------ 2 files changed, 50 insertions(+), 56 deletions(-) diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index d6f3ac00a..6a4733d01 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,8 +1,10 @@ use std::collections::HashMap; +use std::borrow::Cow; -use fst::{set::OpBuilder, SetBuilder}; -use sdset::{duo::Union, SetOperation}; +use fst::{set::OpBuilder, SetBuilder, IntoStreamer, Streamer}; +use sdset::{duo::Union, SetOperation, SetBuf}; use serde::{Deserialize, Serialize}; +use log::debug; use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; @@ -110,6 +112,7 @@ pub fn apply_documents_addition<'a, 'b>( postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, prefix_documents_cache_store: store::PrefixDocumentsCache, + prefix_postings_lists_cache_store: store::PrefixPostingsListsCache, addition: Vec>, ) -> MResult<()> { let mut documents_additions = HashMap::new(); @@ -180,7 +183,50 @@ pub fn apply_documents_addition<'a, 'b>( &ranked_map, number_of_inserted_documents, indexer, - ) + )?; + + + // retrieve the words fst to compute all those prefixes + let words_fst = match main_store.words_fst(writer)? { + Some(fst) => fst, + None => return Ok(()), + }; + + // clear the prefixes + let pplc_store = prefix_postings_lists_cache_store; + pplc_store.clear(writer)?; + + const MAX_PREFIX_LENGTH: usize = 1; + + // compute prefixes and store those in the PrefixPostingsListsCache. + let mut stream = words_fst.into_stream(); + while let Some(input) = stream.next() { + for i in 1..=MAX_PREFIX_LENGTH { + let prefix = &input[..i]; + if let Some(postings_list) = postings_lists_store.postings_list(writer, prefix)? { + if let (Ok(input), Ok(prefix)) = (std::str::from_utf8(input), std::str::from_utf8(prefix)) { + debug!("{:?} postings list (prefix {:?}) length {}", input, prefix, postings_list.len()); + } + + // compute the new prefix postings lists + let mut p = [0; 4]; + let len = std::cmp::min(4, prefix.len()); + p[..len].copy_from_slice(&prefix[..len]); + + let previous = match pplc_store.prefix_postings_list(writer, p)? { + Some(previous) => previous, + None => Cow::Owned(SetBuf::default()), + }; + + let new_postings_list = Union::new(&postings_list, &previous).into_set_buf(); + pplc_store.put_prefix_postings_list(writer, p, &new_postings_list)?; + + debug!("new length {}", new_postings_list.len()); + } + } + } + + Ok(()) } pub fn apply_documents_partial_addition<'a, 'b>( diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index 1c18ef5d8..265a6e193 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -309,62 +309,10 @@ pub fn update_task<'a, 'b>( index.postings_lists, index.docs_words, index.prefix_documents_cache, + index.prefix_postings_lists_cache, documents, ); - let words_fst = index.main.words_fst(writer)?.unwrap(); - let mut stream = words_fst.into_stream(); - let mut previous_char = None; - while let Some(input) = stream.next() { - let (s, c) = match std::str::from_utf8(input) { - Ok(s) => { - let c = s.chars().next().unwrap(); - (&s[..c.len_utf8()], c) - }, - Err(_) => continue, - }; - - match previous_char { - Some(pc) if pc != c => { - debug!("searching and caching {:?}", s); - - let documents = bucket_sort( - writer, - s, - 0..20, - None as Option bool>, - Criteria::default(), - None, - index.main, - index.postings_lists, - index.documents_fields_counts, - index.synonyms, - index.prefix_documents_cache, - ).unwrap(); - - let mut prefix = [0; 4]; - let len = cmp::min(4, s.len()); - prefix[..len].copy_from_slice(&s.as_bytes()[..len]); - - for (i, document) in documents.into_iter().enumerate() { - index.prefix_documents_cache.put_prefix_document( - writer, - prefix, - i, - document.id, - &document.highlights, - ).unwrap(); - } - - previous_char = Some(c) - }, - Some(_) => (), - None => previous_char = Some(c), - } - } - - // TODO we forget to do it for the last prefix char - (update_type, result, start.elapsed()) } UpdateData::DocumentsPartial(documents) => {