mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Fix many indexing and searching related bugs
This commit is contained in:
parent
2236ebbd42
commit
d8d0442d63
@ -36,6 +36,7 @@ impl AutomatonProducer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct Automaton {
|
pub struct Automaton {
|
||||||
pub index: usize,
|
pub index: usize,
|
||||||
pub ngram: usize,
|
pub ngram: usize,
|
||||||
@ -108,6 +109,7 @@ fn generate_automatons(
|
|||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
let synonyms = synonym_store.synonyms_fst(reader)?;
|
let synonyms = synonym_store.synonyms_fst(reader)?;
|
||||||
|
|
||||||
|
let mut automaton_index = 0;
|
||||||
let mut automatons = Vec::new();
|
let mut automatons = Vec::new();
|
||||||
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
||||||
|
|
||||||
@ -121,10 +123,11 @@ fn generate_automatons(
|
|||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||||
|
|
||||||
let automaton = if not_prefix_dfa {
|
let automaton = if not_prefix_dfa {
|
||||||
Automaton::exact(automatons.len(), 1, word)
|
Automaton::exact(automaton_index, 1, word)
|
||||||
} else {
|
} else {
|
||||||
Automaton::prefix_exact(automatons.len(), 1, word)
|
Automaton::prefix_exact(automaton_index, 1, word)
|
||||||
};
|
};
|
||||||
|
automaton_index += 1;
|
||||||
original_automatons.push(automaton);
|
original_automatons.push(automaton);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,15 +165,16 @@ fn generate_automatons(
|
|||||||
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
||||||
let nb_synonym_words = synonyms_words.len();
|
let nb_synonym_words = synonyms_words.len();
|
||||||
|
|
||||||
let real_query_index = automatons.len();
|
let real_query_index = automaton_index;
|
||||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
||||||
|
|
||||||
for synonym in synonyms_words {
|
for synonym in synonyms_words {
|
||||||
let automaton = if nb_synonym_words == 1 {
|
let automaton = if nb_synonym_words == 1 {
|
||||||
Automaton::exact(automatons.len(), n, synonym)
|
Automaton::exact(automaton_index, n, synonym)
|
||||||
} else {
|
} else {
|
||||||
Automaton::non_exact(automatons.len(), n, synonym)
|
Automaton::non_exact(automaton_index, n, synonym)
|
||||||
};
|
};
|
||||||
|
automaton_index += 1;
|
||||||
automatons.push(vec![automaton]);
|
automatons.push(vec![automaton]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -182,10 +186,11 @@ fn generate_automatons(
|
|||||||
let concat = ngram_slice.concat();
|
let concat = ngram_slice.concat();
|
||||||
let normalized = normalize_str(&concat);
|
let normalized = normalize_str(&concat);
|
||||||
|
|
||||||
let real_query_index = automatons.len();
|
let real_query_index = automaton_index;
|
||||||
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
||||||
|
|
||||||
let automaton = Automaton::exact(automatons.len(), n, &normalized);
|
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||||
|
automaton_index += 1;
|
||||||
automatons.push(vec![automaton]);
|
automatons.push(vec![automaton]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,7 +4,7 @@ use std::sync::{Arc, RwLock};
|
|||||||
use std::{fs, thread};
|
use std::{fs, thread};
|
||||||
|
|
||||||
use crossbeam_channel::Receiver;
|
use crossbeam_channel::Receiver;
|
||||||
use log::error;
|
use log::{debug, error};
|
||||||
|
|
||||||
use crate::{store, update, Index, MResult};
|
use crate::{store, update, Index, MResult};
|
||||||
|
|
||||||
@ -23,6 +23,7 @@ fn update_awaiter(receiver: Receiver<()>, rkv: Arc<RwLock<rkv::Rkv>>, index: Ind
|
|||||||
Ok(rkv) => rkv,
|
Ok(rkv) => rkv,
|
||||||
Err(e) => { error!("rkv RwLock read failed: {}", e); break }
|
Err(e) => { error!("rkv RwLock read failed: {}", e); break }
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut writer = match rkv.write() {
|
let mut writer = match rkv.write() {
|
||||||
Ok(writer) => writer,
|
Ok(writer) => writer,
|
||||||
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
|
Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
|
||||||
@ -31,7 +32,7 @@ fn update_awaiter(receiver: Receiver<()>, rkv: Arc<RwLock<rkv::Rkv>>, index: Ind
|
|||||||
match update::update_task(&mut writer, index.clone(), None as Option::<fn(_)>) {
|
match update::update_task(&mut writer, index.clone(), None as Option::<fn(_)>) {
|
||||||
Ok(true) => if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) },
|
Ok(true) => if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) },
|
||||||
// no more updates to handle for now
|
// no more updates to handle for now
|
||||||
Ok(false) => { writer.abort(); break },
|
Ok(false) => { debug!("no more updates"); writer.abort(); break },
|
||||||
Err(e) => { error!("update task failed: {}", e); writer.abort() },
|
Err(e) => { error!("update task failed: {}", e); writer.abort() },
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::{error, fmt, io};
|
use std::{error, fmt, io};
|
||||||
use crate::serde::SerializerError;
|
use crate::serde::{SerializerError, DeserializerError};
|
||||||
|
|
||||||
pub type MResult<T> = Result<T, Error>;
|
pub type MResult<T> = Result<T, Error>;
|
||||||
|
|
||||||
@ -16,6 +16,7 @@ pub enum Error {
|
|||||||
RmpEncode(rmp_serde::encode::Error),
|
RmpEncode(rmp_serde::encode::Error),
|
||||||
Bincode(bincode::Error),
|
Bincode(bincode::Error),
|
||||||
Serializer(SerializerError),
|
Serializer(SerializerError),
|
||||||
|
Deserializer(DeserializerError),
|
||||||
UnsupportedOperation(UnsupportedOperation),
|
UnsupportedOperation(UnsupportedOperation),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -61,6 +62,12 @@ impl From<SerializerError> for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<DeserializerError> for Error {
|
||||||
|
fn from(error: DeserializerError) -> Error {
|
||||||
|
Error::Deserializer(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<UnsupportedOperation> for Error {
|
impl From<UnsupportedOperation> for Error {
|
||||||
fn from(op: UnsupportedOperation) -> Error {
|
fn from(op: UnsupportedOperation) -> Error {
|
||||||
Error::UnsupportedOperation(op)
|
Error::UnsupportedOperation(op)
|
||||||
@ -82,6 +89,7 @@ impl fmt::Display for Error {
|
|||||||
RmpEncode(e) => write!(f, "rmp encode error; {}", e),
|
RmpEncode(e) => write!(f, "rmp encode error; {}", e),
|
||||||
Bincode(e) => write!(f, "bincode error; {}", e),
|
Bincode(e) => write!(f, "bincode error; {}", e),
|
||||||
Serializer(e) => write!(f, "serializer error; {}", e),
|
Serializer(e) => write!(f, "serializer error; {}", e),
|
||||||
|
Deserializer(e) => write!(f, "deserializer error; {}", e),
|
||||||
UnsupportedOperation(op) => write!(f, "unsupported operation; {}", op),
|
UnsupportedOperation(op) => write!(f, "unsupported operation; {}", op),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use rkv::Value;
|
use rkv::{Value, StoreError};
|
||||||
use crate::{DocumentId, MResult};
|
use crate::{DocumentId, MResult};
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
@ -24,10 +24,14 @@ impl DocsWords {
|
|||||||
&self,
|
&self,
|
||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
) -> Result<(), rkv::StoreError>
|
) -> Result<bool, rkv::StoreError>
|
||||||
{
|
{
|
||||||
let document_id_bytes = document_id.0.to_be_bytes();
|
let document_id_bytes = document_id.0.to_be_bytes();
|
||||||
self.docs_words.delete(writer, document_id_bytes)
|
match self.docs_words.delete(writer, document_id_bytes) {
|
||||||
|
Ok(()) => Ok(true),
|
||||||
|
Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false),
|
||||||
|
Err(e) => Err(e),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn doc_words<T: rkv::Readable>(
|
pub fn doc_words<T: rkv::Readable>(
|
||||||
|
@ -18,6 +18,20 @@ fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -
|
|||||||
key
|
key
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
|
||||||
|
let document_id = {
|
||||||
|
let array = TryFrom::try_from(&key[0..8]).unwrap();
|
||||||
|
DocumentId(u64::from_be_bytes(array))
|
||||||
|
};
|
||||||
|
|
||||||
|
let schema_attr = {
|
||||||
|
let array = TryFrom::try_from(&key[8..8+2]).unwrap();
|
||||||
|
SchemaAttr(u16::from_be_bytes(array))
|
||||||
|
};
|
||||||
|
|
||||||
|
(document_id, schema_attr)
|
||||||
|
}
|
||||||
|
|
||||||
impl DocumentsFields {
|
impl DocumentsFields {
|
||||||
pub fn put_document_field(
|
pub fn put_document_field(
|
||||||
&self,
|
&self,
|
||||||
@ -45,13 +59,10 @@ impl DocumentsFields {
|
|||||||
let iter = self.documents_fields.iter_from(writer, document_id_bytes)?;
|
let iter = self.documents_fields.iter_from(writer, document_id_bytes)?;
|
||||||
for result in iter {
|
for result in iter {
|
||||||
let (key, _) = result?;
|
let (key, _) = result?;
|
||||||
let current_document_id = {
|
let array = TryFrom::try_from(key).unwrap();
|
||||||
let bytes = key.get(0..8).unwrap();
|
let (current_document_id, _) = document_attribute_from_key(array);
|
||||||
let array = TryFrom::try_from(bytes).unwrap();
|
|
||||||
DocumentId(u64::from_be_bytes(array))
|
|
||||||
};
|
|
||||||
|
|
||||||
if current_document_id != document_id { break }
|
if current_document_id != document_id { break }
|
||||||
|
|
||||||
keys_to_delete.push(key.to_owned());
|
keys_to_delete.push(key.to_owned());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,10 +114,10 @@ impl<'r, T: rkv::Readable + 'r> Iterator for DocumentFieldsIter<'r, T> {
|
|||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
match self.iter.next() {
|
match self.iter.next() {
|
||||||
Some(Ok((key, Some(rkv::Value::Blob(bytes))))) => {
|
Some(Ok((key, Some(rkv::Value::Blob(bytes))))) => {
|
||||||
let key_bytes = key.get(8..8+2).unwrap();
|
let array = TryFrom::try_from(key).unwrap();
|
||||||
let array = TryFrom::try_from(key_bytes).unwrap();
|
let (current_document_id, attr) = document_attribute_from_key(array);
|
||||||
let attr = u16::from_be_bytes(array);
|
if current_document_id != self.document_id { return None; }
|
||||||
let attr = SchemaAttr::new(attr);
|
|
||||||
Some(Ok((attr, bytes)))
|
Some(Ok((attr, bytes)))
|
||||||
},
|
},
|
||||||
Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data),
|
Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data),
|
||||||
|
@ -14,8 +14,11 @@ pub use self::synonyms::Synonyms;
|
|||||||
pub use self::updates::Updates;
|
pub use self::updates::Updates;
|
||||||
pub use self::updates_results::UpdatesResults;
|
pub use self::updates_results::UpdatesResults;
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
use meilidb_schema::Schema;
|
use meilidb_schema::Schema;
|
||||||
use crate::{update, query_builder::QueryBuilder, MResult};
|
use serde::de;
|
||||||
|
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
|
||||||
|
use crate::serde::Deserializer;
|
||||||
|
|
||||||
fn aligned_to(bytes: &[u8], align: usize) -> bool {
|
fn aligned_to(bytes: &[u8], align: usize) -> bool {
|
||||||
(bytes as *const _ as *const () as usize) % align == 0
|
(bytes as *const _ as *const () as usize) % align == 0
|
||||||
@ -63,6 +66,34 @@ pub struct Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Index {
|
impl Index {
|
||||||
|
pub fn document<T: de::DeserializeOwned, R: rkv::Readable>(
|
||||||
|
&self,
|
||||||
|
reader: &R,
|
||||||
|
fields: Option<&HashSet<&str>>,
|
||||||
|
document_id: DocumentId,
|
||||||
|
) -> MResult<Option<T>>
|
||||||
|
{
|
||||||
|
let schema = self.main.schema(reader)?;
|
||||||
|
let schema = schema.ok_or(Error::SchemaMissing)?;
|
||||||
|
|
||||||
|
let fields = match fields {
|
||||||
|
Some(fields) => fields.into_iter().map(|name| schema.attribute(name)).collect(),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut deserializer = Deserializer {
|
||||||
|
document_id,
|
||||||
|
reader,
|
||||||
|
documents_fields: self.documents_fields,
|
||||||
|
schema: &schema,
|
||||||
|
fields: fields.as_ref(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: currently we return an error if all document fields are missing,
|
||||||
|
// returning None would have been better
|
||||||
|
Ok(T::deserialize(&mut deserializer).map(Some)?)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn schema_update(&self, mut writer: rkv::Writer, schema: Schema) -> MResult<()> {
|
pub fn schema_update(&self, mut writer: rkv::Writer, schema: Schema) -> MResult<()> {
|
||||||
update::push_schema_update(&mut writer, self.updates, self.updates_results, schema)?;
|
update::push_schema_update(&mut writer, self.updates, self.updates_results, schema)?;
|
||||||
writer.commit()?;
|
writer.commit()?;
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::{mem, ptr};
|
use std::{mem, ptr};
|
||||||
|
|
||||||
use zerocopy::{AsBytes, LayoutVerified};
|
use zerocopy::{AsBytes, LayoutVerified};
|
||||||
|
use rkv::StoreError;
|
||||||
|
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
use crate::store::aligned_to;
|
use crate::store::aligned_to;
|
||||||
@ -26,9 +28,13 @@ impl PostingsLists {
|
|||||||
&self,
|
&self,
|
||||||
writer: &mut rkv::Writer,
|
writer: &mut rkv::Writer,
|
||||||
word: &[u8],
|
word: &[u8],
|
||||||
) -> Result<(), rkv::StoreError>
|
) -> Result<bool, rkv::StoreError>
|
||||||
{
|
{
|
||||||
self.postings_lists.delete(writer, word)
|
match self.postings_lists.delete(writer, word) {
|
||||||
|
Ok(()) => Ok(true),
|
||||||
|
Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false),
|
||||||
|
Err(e) => Err(e),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn postings_list<'a>(
|
pub fn postings_list<'a>(
|
||||||
|
@ -130,10 +130,14 @@ pub fn apply_documents_deletion(
|
|||||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
||||||
deleted_documents.insert(id);
|
deleted_documents.insert(id);
|
||||||
}
|
}
|
||||||
docs_words_store.del_doc_words(writer, id)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let deleted_documents_len = deleted_documents.len() as u64;
|
||||||
|
for id in deleted_documents {
|
||||||
|
docs_words_store.del_doc_words(writer, id)?;
|
||||||
|
}
|
||||||
|
|
||||||
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||||
let words = match main_store.words_fst(writer)? {
|
let words = match main_store.words_fst(writer)? {
|
||||||
Some(words_set) => {
|
Some(words_set) => {
|
||||||
@ -155,7 +159,6 @@ pub fn apply_documents_deletion(
|
|||||||
main_store.put_words_fst(writer, &words)?;
|
main_store.put_words_fst(writer, &words)?;
|
||||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
main_store.put_ranked_map(writer, &ranked_map)?;
|
||||||
|
|
||||||
let deleted_documents_len = deleted_documents.len() as u64;
|
|
||||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -151,6 +151,8 @@ pub fn update_task(
|
|||||||
None => return Ok(false),
|
None => return Ok(false),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
debug!("Processing update number {}", update_id);
|
||||||
|
|
||||||
let (update_type, result, duration) = match update {
|
let (update_type, result, duration) = match update {
|
||||||
Update::SchemaUpdate(schema) => {
|
Update::SchemaUpdate(schema) => {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
Loading…
Reference in New Issue
Block a user