Merge branch 'main' into indexer-edition-2024

commit 10feeb88f2
1122 changed files with 6265 additions and 5265 deletions
crates/milli/src/lib.rs (new file, 446 lines)
@@ -0,0 +1,446 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![allow(clippy::type_complexity)]

#[cfg(test)]
#[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

#[macro_use]
pub mod documents;

mod asc_desc;
mod criterion;
mod error;
mod external_documents_ids;
pub mod facet;
mod fields_ids_map;
pub mod heed_codec;
pub mod index;
mod localized_attributes_rules;
pub mod order_by_map;
pub mod prompt;
pub mod proximity;
pub mod score_details;
mod search;
mod thread_pool_no_abort;
pub mod update;
pub mod vector;

#[cfg(test)]
#[macro_use]
pub mod snapshot_tests;
mod fieldids_weights_map;

use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::fmt;
use std::hash::BuildHasherDefault;

use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
pub use search::new::{
    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
    SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed, rhai};

pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};
pub use self::error::{
    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
pub use self::heed_codec::{
    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
    UncheckedU8StrStrCodec,
};
pub use self::index::Index;
pub use self::localized_attributes_rules::LocalizedAttributesRule;
use self::localized_attributes_rules::LocalizedFieldIds;
pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
pub use self::search::similar::Similar;
pub use self::search::{
    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
    Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};

pub type Result<T> = std::result::Result<T, error::Error>;

pub type Attribute = u32;
pub type BEU16 = heed::types::U16<heed::byteorder::BE>;
pub type BEU32 = heed::types::U32<heed::byteorder::BE>;
pub type BEU64 = heed::types::U64<heed::byteorder::BE>;
pub type DocumentId = u32;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
pub type FieldDistribution = BTreeMap<String, u64>;
pub type FieldId = u16;
pub type Weight = u16;
pub type Object = serde_json::Map<String, serde_json::Value>;
pub type Position = u32;
pub type RelativePosition = u16;
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
pub type Prefix = smallstr::SmallString<[u8; 16]>;
pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;

/// A `GeoPoint` is a point in Cartesian space, called `xyz_point` in the code. Its metadata
/// is a tuple composed of 1. the `DocumentId` of the associated document and 2. the original point
/// expressed in terms of latitude and longitude.
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
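
// Added sketch (not in the original source): how a `GeoPoint` is typically
// assembled, pairing the xyz geometry with its (DocumentId, lat/lng) metadata.
// `lat_lng_to_xyz` is defined further down in this file.
#[cfg(test)]
#[test]
fn geo_point_construction_sketch() {
    let latlng = [48.8566, 2.3522]; // hypothetical coordinates
    let docid: DocumentId = 42;
    let point = GeoPoint::new(lat_lng_to_xyz(&latlng), (docid, latlng));
    assert_eq!(point.data.0, 42);
}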

/// The maximum length an LMDB key can be.
///
/// Note that the actual allowed length is a little bit higher, but
/// we keep a margin of safety.
const MAX_LMDB_KEY_LENGTH: usize = 500;

/// The maximum length a field value can be when inserted in an LMDB key.
///
/// This number is determined by the keys of the different facet databases,
/// plus a margin of safety.
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;

/// The maximum length a word can be.
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;

pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

#[derive(Clone)]
pub struct TimeBudget {
    started_at: std::time::Instant,
    budget: std::time::Duration,

    /// When testing the time budget, it can be useful to ensure that more than one
    /// iteration of the bucket sort was done. To avoid flakiness, the only option is to
    /// be able to stop after a specific number of calls instead of after a `Duration`.
    #[cfg(test)]
    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
}

impl fmt::Debug for TimeBudget {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("TimeBudget")
            .field("started_at", &self.started_at)
            .field("budget", &self.budget)
            .field("left", &(self.budget - self.started_at.elapsed()))
            .finish()
    }
}

impl Default for TimeBudget {
    fn default() -> Self {
        Self::new(std::time::Duration::from_millis(1500))
    }
}

impl TimeBudget {
    pub fn new(budget: std::time::Duration) -> Self {
        Self {
            started_at: std::time::Instant::now(),
            budget,

            #[cfg(test)]
            stop_after: None,
        }
    }

    pub fn max() -> Self {
        Self::new(std::time::Duration::from_secs(u64::MAX))
    }

    #[cfg(test)]
    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
        use std::sync::atomic::AtomicUsize;
        use std::sync::Arc;

        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
        self
    }

    pub fn exceeded(&self) -> bool {
        #[cfg(test)]
        if let Some((current, stop_after)) = &self.stop_after {
            let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            if current >= *stop_after {
                return true;
            } else {
                // If a call count has been specified, we ignore the time budget entirely.
                return false;
            }
        }

        self.started_at.elapsed() > self.budget
    }
}
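
// Added sketch (not in the original source): how callers typically use
// `TimeBudget`, checking it between bucket-sort iterations. The 150 ms value
// here is hypothetical.
#[cfg(test)]
#[test]
fn time_budget_sketch() {
    let budget = TimeBudget::new(std::time::Duration::from_millis(150));
    assert!(!budget.exceeded()); // freshly started, still within budget
    assert!(!TimeBudget::max().exceeded()); // `max()` effectively never runs out

    // In tests, `with_stop_after` trades the wall clock for a call counter.
    let budget = TimeBudget::default().with_stop_after(2);
    assert!(!budget.exceeded()); // first call
    assert!(!budget.exceeded()); // second call
    assert!(budget.exceeded()); // exceeded on the third call
}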

// Convert an absolute word position into a relative position.
// Returns the field id of the attribute related to the absolute position,
// and the relative position in the attribute.
pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
}

// Compute the absolute word position from the field id of the attribute and the relative position in the attribute.
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
    (field_id as u32) << 16 | (relative as u32)
}
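
// Added sketch (not in the original source): the two helpers above pack a
// field id into the high 16 bits of a `Position` and the relative position
// into the low 16 bits, so the conversions round-trip exactly.
#[cfg(test)]
#[test]
fn position_packing_sketch() {
    let absolute = absolute_from_relative_position(0x0002, 0x0005);
    assert_eq!(absolute, 0x0002_0005);
    assert_eq!(relative_from_absolute_position(absolute), (0x0002, 0x0005));
}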

// TODO: this is wrong, but will do for now
/// Compute the "bucketed" relative position from the relative position in the field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
pub fn bucketed_position(relative: u16) -> u16 {
    // The first few relative positions are kept intact.
    if relative < 16 {
        relative
    } else if relative < 24 {
        // Relative positions between 16 and 24 all become equal to 24.
        24
    } else {
        // Then, groups of positions that have the same base-2 logarithm are reduced to
        // the same relative position: the smallest power of 2 that is greater than or equal to them.
        (relative as f64).log2().ceil().exp2() as u16
    }
}
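
// Added sketch (not in the original source): how the bucketing degrades
// accuracy as positions grow: exact below 16, collapsed to 24 for 16..24,
// then snapped up to a power of two.
#[cfg(test)]
#[test]
fn bucketed_position_sketch() {
    assert_eq!(bucketed_position(5), 5); // kept intact
    assert_eq!(bucketed_position(20), 24); // 16..24 all map to 24
    assert_eq!(bucketed_position(25), 32); // 25..=32 all map to 32
    assert_eq!(bucketed_position(32), 32);
    assert_eq!(bucketed_position(33), 64); // 33..=64 all map to 64
}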

/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
    displayed_fields: &[FieldId],
    fields_ids_map: &FieldsIdsMap,
    obkv: &obkv::KvReaderU16,
) -> Result<Object> {
    displayed_fields
        .iter()
        .copied()
        .flat_map(|id| obkv.get(id).map(|value| (id, value)))
        .map(|(id, value)| {
            let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId {
                field_id: id,
                process: "obkv_to_json",
            })?;
            let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?;
            Ok((name.to_owned(), value))
        })
        .collect()
}

/// Transform every field of a raw obkv store into a JSON Object.
pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result<Object> {
    let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
    obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv)
}

/// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: &Value) -> Option<String> {
    fn inner(value: &Value, output: &mut String) -> bool {
        use std::fmt::Write;
        match value {
            Value::Null => false,
            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
            Value::Number(number) => write!(output, "{}", number).is_ok(),
            Value::String(string) => write!(output, "{}", string).is_ok(),
            Value::Array(array) => {
                let mut count = 0;
                for value in array {
                    if inner(value, output) {
                        output.push_str(". ");
                        count += 1;
                    }
                }
                // Check that at least one value was written.
                count != 0
            }
            Value::Object(object) => {
                let mut buffer = String::new();
                let mut count = 0;
                for (key, value) in object {
                    buffer.clear();
                    let _ = write!(&mut buffer, "{}: ", key);
                    if inner(value, &mut buffer) {
                        buffer.push_str(". ");
                        // We write the "key: value. " pair only when
                        // we are sure that the value can be written.
                        output.push_str(&buffer);
                        count += 1;
                    }
                }
                // Check that at least one value was written.
                count != 0
            }
        }
    }

    let mut string = String::new();
    if inner(value, &mut string) {
        Some(string)
    } else {
        None
    }
}

/// Divides one slice into two at an index; returns `None` if `mid` is out of bounds.
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    if mid <= slice.len() {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

/// Divides one slice into an array and the tail at an index;
/// returns `None` if `N` is out of bounds.
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    let head = head.try_into().ok()?;
    Some((head, tail))
}
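
// Added sketch (not in the original source): `try_split_array_at` is the
// checked, fixed-size counterpart of `split_at`; it returns `None` instead
// of panicking when the slice is shorter than `N`.
#[cfg(test)]
#[test]
fn try_split_sketch() {
    let bytes = [1u8, 2, 3, 4, 5];
    let (head, tail) = try_split_array_at::<_, 2>(&bytes).unwrap();
    assert_eq!(head, [1, 2]);
    assert_eq!(tail, &[3, 4, 5]);
    assert!(try_split_array_at::<u8, 6>(&bytes).is_none());
}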

/// Returns the distance between two points in meters. Each point is composed of two f64:
/// one latitude and one longitude.
pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
    let a = geoutils::Location::new(a[0], a[1]);
    let b = geoutils::Location::new(b[0], b[1]);

    a.haversine_distance_to(&b).meters()
}

/// Converts a point expressed in terms of latitude and longitude to a point in the
/// Cartesian coordinate system, expressed in terms of x, y and z.
pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
    let [lat, lng] = coord.map(|f| f.to_radians());
    let x = lat.cos() * lng.cos();
    let y = lat.cos() * lng.sin();
    let z = lat.sin();

    [x, y, z]
}
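
// Added sketch (not in the original source): the xyz conversion lands on the
// unit sphere, and the haversine distance between two approximate city
// coordinates comes out in meters.
#[cfg(test)]
#[test]
fn geo_helpers_sketch() {
    let paris = [48.8566, 2.3522];
    let lyon = [45.7640, 4.8357];

    let [x, y, z] = lat_lng_to_xyz(&paris);
    let norm = (x * x + y * y + z * z).sqrt();
    assert!((norm - 1.0).abs() < 1e-9);

    // Paris to Lyon is roughly 390 km as the crow flies.
    let meters = distance_between_two_points(&paris, &lyon);
    assert!((380_000.0..410_000.0).contains(&meters));
}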

/// Returns `true` if the field matches one of the faceted fields.
/// See the function [`is_faceted_by`] below to see what “matching” means.
pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
    faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref()))
}
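
// Added sketch (not in the original source): `is_faceted` simply lifts
// `is_faceted_by` over a collection of faceted field patterns.
#[cfg(test)]
#[test]
fn is_faceted_sketch() {
    let faceted_fields = ["animaux", "colors"];
    assert!(is_faceted("animaux.chien", faceted_fields));
    assert!(!is_faceted("plantes", faceted_fields));
}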

/// Returns `true` if the field matches the facet.
/// ```
/// use milli::is_faceted_by;
/// // -- the valid basics
/// assert!(is_faceted_by("animaux", "animaux"));
/// assert!(is_faceted_by("animaux.chien", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
///
/// // -- the wrong ones
/// assert!(!is_faceted_by("chien", "chat"));
/// assert!(!is_faceted_by("animaux", "animaux.chien"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
///
/// // -- the strange edge cases
/// assert!(!is_faceted_by("animaux.chien", "anima"));
/// assert!(!is_faceted_by("animaux.chien", "animau"));
/// assert!(!is_faceted_by("animaux.chien", "animaux."));
/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
}

pub fn normalize_facet(original: &str) -> String {
    CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
}
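
// Added sketch (not in the original source): facet normalization trims,
// applies Unicode compatibility decomposition, and lowercases, so visually
// equivalent facet strings compare equal.
#[cfg(test)]
#[test]
fn normalize_facet_sketch() {
    assert_eq!(normalize_facet("  Bouvier Bernois "), "bouvier bernois");
    // The superscript "²" (U+00B2) decomposes to a plain "2" under NFKD.
    assert_eq!(normalize_facet("N²"), "n2");
}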

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    #[test]
    fn json_to_string_object() {
        let value = json!({
            "name": "John Doe",
            "age": 43,
            "not_there": null,
        });

        let string = json_to_string(&value).unwrap();
        assert_eq!(string, "name: John Doe. age: 43. ");
    }

    #[test]
    fn json_to_string_array() {
        let value = json!([
            { "name": "John Doe" },
            43,
            "hello",
            [ "I", "am", "fine" ],
            null,
        ]);

        let string = json_to_string(&value).unwrap();
        // We don't care about having two dots (.) one after the other, as
        // the distance between hard separators is clamped to 8 anyway.
        assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
    }

    #[test]
    fn test_relative_position_conversion() {
        assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000));
        assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF));
        assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000));
        assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00));
        assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF));
        assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678));
        assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF));
    }

    #[test]
    fn test_absolute_position_conversion() {
        assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000));
        assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF));
        assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000));
        assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00));
        assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF));
        assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678));
        assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF));
    }

    #[test]
    fn test_all_obkv_to_json() {
        let mut fields_ids_map = FieldsIdsMap::new();
        let id1 = fields_ids_map.insert("field1").unwrap();
        let id2 = fields_ids_map.insert("field2").unwrap();

        let mut writer = obkv::KvWriterU16::memory();
        writer.insert(id1, b"1234").unwrap();
        writer.insert(id2, b"4321").unwrap();
        let contents = writer.into_inner().unwrap();
        let obkv = obkv::KvReaderU16::from_slice(&contents);

        let expected = json!({
            "field1": 1234,
            "field2": 4321,
        });
        let expected = expected.as_object().unwrap();
        let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap();

        assert_eq!(&actual, expected);
    }
}