2020-06-27 15:10:39 +02:00
|
|
|
#![allow(clippy::type_complexity)]
|
|
|
|
|
2019-10-08 15:22:36 +02:00
|
|
|
#[cfg(test)]
|
2019-10-18 13:05:28 +02:00
|
|
|
#[macro_use]
|
|
|
|
extern crate assert_matches;
|
2020-04-06 20:05:02 +02:00
|
|
|
#[macro_use]
|
|
|
|
extern crate pest_derive;
|
2019-10-08 15:22:36 +02:00
|
|
|
|
2019-10-02 17:34:32 +02:00
|
|
|
mod automaton;
|
2019-12-13 11:49:56 +01:00
|
|
|
mod bucket_sort;
|
2019-10-04 16:49:17 +02:00
|
|
|
mod database;
|
2019-10-09 13:44:18 +02:00
|
|
|
mod distinct_map;
|
2019-10-03 15:04:11 +02:00
|
|
|
mod error;
|
2020-04-06 20:05:02 +02:00
|
|
|
mod filters;
|
2019-10-30 17:25:42 +01:00
|
|
|
mod levenshtein;
|
2019-10-03 11:49:13 +02:00
|
|
|
mod number;
|
2019-10-02 17:34:32 +02:00
|
|
|
mod query_builder;
|
2020-01-07 17:40:58 +01:00
|
|
|
mod query_tree;
|
2020-01-13 13:29:47 +01:00
|
|
|
mod query_words_mapper;
|
2019-10-03 15:04:11 +02:00
|
|
|
mod ranked_map;
|
2019-10-02 17:34:32 +02:00
|
|
|
mod raw_document;
|
2019-10-18 13:05:28 +02:00
|
|
|
mod reordered_attrs;
|
2019-12-13 11:49:56 +01:00
|
|
|
pub mod criterion;
|
2020-05-05 22:19:34 +02:00
|
|
|
pub mod facets;
|
2019-12-13 11:49:56 +01:00
|
|
|
pub mod raw_indexer;
|
2019-10-03 11:49:13 +02:00
|
|
|
pub mod serde;
|
2020-05-19 14:11:48 +02:00
|
|
|
pub mod settings;
|
2019-10-02 17:34:32 +02:00
|
|
|
pub mod store;
|
2020-05-19 14:11:48 +02:00
|
|
|
pub mod update;
|
2019-11-30 16:53:34 +01:00
|
|
|
|
2020-05-27 12:04:35 +02:00
|
|
|
pub use self::database::{BoxUpdateFn, Database, DatabaseOptions, MainT, UpdateT, MainWriter, MainReader, UpdateWriter, UpdateReader};
|
2020-05-05 22:19:34 +02:00
|
|
|
pub use self::error::{Error, HeedError, FstError, MResult, pest_error, FacetError};
|
2020-04-06 20:05:02 +02:00
|
|
|
pub use self::filters::Filter;
|
2019-10-04 13:26:33 +02:00
|
|
|
pub use self::number::{Number, ParseNumberError};
|
|
|
|
pub use self::ranked_map::RankedMap;
|
2019-10-04 16:49:17 +02:00
|
|
|
pub use self::raw_document::RawDocument;
|
2019-10-04 13:26:33 +02:00
|
|
|
pub use self::store::Index;
|
2019-10-31 11:13:37 +01:00
|
|
|
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
2019-12-13 11:46:53 +01:00
|
|
|
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
2020-02-02 22:59:19 +01:00
|
|
|
pub use meilisearch_schema::Schema;
|
2020-01-13 13:29:47 +01:00
|
|
|
pub use query_words_mapper::QueryWordsMapper;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2019-12-11 17:36:53 +01:00
|
|
|
use compact_arena::SmallArena;
|
2020-02-02 22:59:19 +01:00
|
|
|
use log::{error, trace};
|
2020-05-22 15:00:50 +02:00
|
|
|
use std::borrow::Cow;
|
|
|
|
use std::collections::HashMap;
|
|
|
|
use std::convert::TryFrom;
|
2020-01-16 14:56:16 +01:00
|
|
|
|
2020-01-16 14:24:45 +01:00
|
|
|
use crate::bucket_sort::PostingsListView;
|
2019-12-11 17:36:53 +01:00
|
|
|
use crate::levenshtein::prefix_damerau_levenshtein;
|
2020-01-16 14:56:16 +01:00
|
|
|
use crate::query_tree::{QueryId, QueryKind};
|
2019-12-13 13:22:54 +01:00
|
|
|
use crate::reordered_attrs::ReorderedAttrs;
|
2019-12-11 17:36:53 +01:00
|
|
|
|
2020-05-22 15:00:50 +02:00
|
|
|
/// Shorthand for an `fst::Set` whose backing bytes are a `Cow<'a, [u8]>`,
/// i.e. either borrowed for `'a` or owned.
type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;
|
|
|
|
/// Shorthand for an `fst::Map` whose backing bytes are a `Cow<'a, [u8]>`,
/// i.e. either borrowed for `'a` or owned.
type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>;
|
|
|
|
|
2019-12-13 11:14:12 +01:00
|
|
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
2019-10-02 17:34:32 +02:00
|
|
|
pub struct Document {
|
|
|
|
pub id: DocumentId,
|
|
|
|
pub highlights: Vec<Highlight>,
|
|
|
|
|
2019-12-13 11:14:12 +01:00
|
|
|
#[cfg(test)]
|
|
|
|
pub matches: Vec<crate::bucket_sort::SimpleMatch>,
|
2019-10-02 17:34:32 +02:00
|
|
|
}
|
|
|
|
|
2019-12-13 13:22:54 +01:00
|
|
|
fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
|
|
|
raw_document: &RawDocument<'a, 'tag>,
|
2020-01-16 14:56:16 +01:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-13 13:22:54 +01:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-13 19:34:49 +01:00
|
|
|
schema: &Schema,
|
2019-12-13 13:22:54 +01:00
|
|
|
) -> Vec<Highlight>
|
|
|
|
{
|
|
|
|
let mut highlights = Vec::new();
|
|
|
|
|
|
|
|
for bm in raw_document.bare_matches.iter() {
|
|
|
|
let postings_list = &arena[bm.postings_list];
|
|
|
|
let input = postings_list.input();
|
2020-01-16 14:56:16 +01:00
|
|
|
let kind = &queries_kinds.get(&bm.query_index);
|
2019-12-13 13:22:54 +01:00
|
|
|
|
|
|
|
for di in postings_list.iter() {
|
2020-01-16 14:56:16 +01:00
|
|
|
let covered_area = match kind {
|
2020-01-22 18:11:58 +01:00
|
|
|
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
|
2020-01-16 14:56:16 +01:00
|
|
|
let len = if query.len() > input.len() {
|
|
|
|
input.len()
|
|
|
|
} else {
|
|
|
|
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
|
|
|
};
|
|
|
|
u16::try_from(len).unwrap_or(u16::max_value())
|
|
|
|
},
|
|
|
|
_ => di.char_length,
|
|
|
|
};
|
2019-12-13 13:22:54 +01:00
|
|
|
|
|
|
|
let attribute = searchable_attrs
|
|
|
|
.and_then(|sa| sa.reverse(di.attribute))
|
|
|
|
.unwrap_or(di.attribute);
|
|
|
|
|
2020-01-13 19:34:49 +01:00
|
|
|
let attribute = match schema.indexed_pos_to_field_id(attribute) {
|
|
|
|
Some(field_id) => field_id.0,
|
|
|
|
None => {
|
|
|
|
error!("Cannot convert indexed_pos {} to field_id", attribute);
|
2020-01-29 18:30:21 +01:00
|
|
|
trace!("Schema is compromized; {:?}", schema);
|
2020-01-13 19:34:49 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-12-13 13:22:54 +01:00
|
|
|
let highlight = Highlight {
|
2020-02-02 22:59:19 +01:00
|
|
|
attribute,
|
2019-12-13 13:22:54 +01:00
|
|
|
char_index: di.char_index,
|
2020-01-16 14:56:16 +01:00
|
|
|
char_length: covered_area,
|
2019-12-13 13:22:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
highlights.push(highlight);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
highlights
|
|
|
|
}
|
|
|
|
|
2019-12-11 17:36:53 +01:00
|
|
|
impl Document {
|
2019-12-21 13:44:19 +01:00
|
|
|
#[cfg(not(test))]
|
|
|
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
|
|
|
Document { id, highlights: highlights.to_owned() }
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
|
|
|
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
|
|
|
|
}
|
|
|
|
|
2019-12-13 13:22:54 +01:00
|
|
|
#[cfg(not(test))]
|
2019-12-11 17:36:53 +01:00
|
|
|
pub fn from_raw<'a, 'tag, 'txn>(
|
|
|
|
raw_document: RawDocument<'a, 'tag>,
|
2020-01-16 14:56:16 +01:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-11 17:36:53 +01:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
2019-12-13 13:22:54 +01:00
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-13 19:34:49 +01:00
|
|
|
schema: &Schema,
|
2019-12-11 17:36:53 +01:00
|
|
|
) -> Document
|
|
|
|
{
|
2020-01-16 14:24:45 +01:00
|
|
|
let highlights = highlights_from_raw_document(
|
|
|
|
&raw_document,
|
2020-01-16 14:56:16 +01:00
|
|
|
queries_kinds,
|
2020-01-16 14:24:45 +01:00
|
|
|
arena,
|
|
|
|
searchable_attrs,
|
2020-01-13 19:34:49 +01:00
|
|
|
schema,
|
2020-01-16 14:24:45 +01:00
|
|
|
);
|
2019-12-13 13:22:54 +01:00
|
|
|
|
|
|
|
Document { id: raw_document.id, highlights }
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
pub fn from_raw<'a, 'tag, 'txn>(
|
|
|
|
raw_document: RawDocument<'a, 'tag>,
|
2020-01-16 14:56:16 +01:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-13 13:22:54 +01:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-13 19:34:49 +01:00
|
|
|
schema: &Schema,
|
2019-12-13 13:22:54 +01:00
|
|
|
) -> Document
|
|
|
|
{
|
|
|
|
use crate::bucket_sort::SimpleMatch;
|
2019-12-13 11:14:12 +01:00
|
|
|
|
2020-01-16 14:24:45 +01:00
|
|
|
let highlights = highlights_from_raw_document(
|
|
|
|
&raw_document,
|
2020-01-16 14:56:16 +01:00
|
|
|
queries_kinds,
|
2020-01-16 14:24:45 +01:00
|
|
|
arena,
|
|
|
|
searchable_attrs,
|
2020-01-13 19:34:49 +01:00
|
|
|
schema,
|
2020-01-16 14:24:45 +01:00
|
|
|
);
|
2019-12-13 13:22:54 +01:00
|
|
|
|
|
|
|
let mut matches = Vec::new();
|
|
|
|
for sm in raw_document.processed_matches {
|
|
|
|
let attribute = searchable_attrs
|
|
|
|
.and_then(|sa| sa.reverse(sm.attribute))
|
|
|
|
.unwrap_or(sm.attribute);
|
|
|
|
|
2020-01-13 19:34:49 +01:00
|
|
|
let attribute = match schema.indexed_pos_to_field_id(attribute) {
|
|
|
|
Some(field_id) => field_id.0,
|
|
|
|
None => {
|
|
|
|
error!("Cannot convert indexed_pos {} to field_id", attribute);
|
2020-01-29 18:30:21 +01:00
|
|
|
trace!("Schema is compromized; {:?}", schema);
|
2020-01-13 19:34:49 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-12-13 13:22:54 +01:00
|
|
|
matches.push(SimpleMatch { attribute, ..sm });
|
2019-12-13 11:14:12 +01:00
|
|
|
}
|
2019-12-13 13:22:54 +01:00
|
|
|
matches.sort_unstable();
|
|
|
|
|
|
|
|
Document { id: raw_document.id, highlights, matches }
|
2019-12-11 17:36:53 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-02 17:34:32 +02:00
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Pins the in-memory size of `DocIndex` to 12 bytes so that an
    /// accidental layout change is caught by the test suite.
    #[test]
    fn docindex_mem_size() {
        let actual = size_of::<DocIndex>();
        assert_eq!(actual, 12);
    }
}
|