mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 03:47:02 +02:00

Merge branch 'main' into tmp-release-v1.11.0
commit cf6ad1ae5e
1071 changed files with 263 additions and 106 deletions

crates/milli/Cargo.toml (Normal file, 144 lines)
@@ -0,0 +1,144 @@
[package]
name = "milli"
edition = "2021"
publish = false

version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
# edition.workspace = true
license.workspace = true

[dependencies]
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.9.1"
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.9.1", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.13"
deserr = "0.6.2"
either = { version = "1.13.0", features = ["serde"] }
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.7", default-features = false, features = [
    "rayon",
    "tempfile",
] }
heed = { version = "0.20.3", default-features = false, features = [
    "serde-json",
    "serde-bincode",
    "read-txn-no-tls",
] }
indexmap = { version = "2.2.6", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memchr = "2.5.0"
memmap2 = "0.9.4"
obkv = "0.2.2"
once_cell = "1.19.0"
ordered-float = "4.2.1"
rayon = "1.10.0"
roaring = { version = "0.10.6", features = ["serde"] }
rstar = { version = "0.12.0", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] }
serde_json = { version = "1.0.120", features = ["preserve_order"] }
slice-group-by = "0.3.1"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.13.2"
smartstring = "1.0.1"
tempfile = "3.10.1"
thiserror = "1.0.61"
time = { version = "0.3.36", features = [
    "serde-well-known",
    "formatting",
    "parsing",
    "macros",
] }
uuid = { version = "1.10.0", features = ["v4"] }

filter-parser = { path = "../filter-parser" }

# documents words self-join
itertools = "0.13.0"

csv = "1.3.0"
candle-core = { version = "0.6.0" }
candle-transformers = { version = "0.6.0" }
candle-nn = { version = "0.6.0" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
    "onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
    "online",
] }
tiktoken-rs = "0.5.9"
liquid = "0.26.6"
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
arroy = "0.5.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2"
rayon-par-bridge = "0.1.0"

[dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false }
big_s = "1.0.2"
insta = "1.39.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }

[features]
all-tokenizations = [
    "charabia/default",
]

# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
lmdb-posix-sem = ["heed/posix-sem"]

# allow chinese specialized tokenization
chinese = ["charabia/chinese"]
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]

# allow hebrew specialized tokenization
hebrew = ["charabia/hebrew"]

# allow japanese specialized tokenization
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]

# allow korean specialized tokenization
korean = ["charabia/korean"]

# allow thai specialized tokenization
thai = ["charabia/thai"]

# allow greek specialized tokenization
greek = ["charabia/greek"]

# allow khmer specialized tokenization
khmer = ["charabia/khmer"]

# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]

# allow german specialized tokenization
german = ["charabia/german-segmentation"]

# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]

# allow turkish specialized tokenization
turkish = ["charabia/turkish"]

# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]

crates/milli/README.md (Normal file, 16 lines)
@@ -0,0 +1,16 @@
<p align="center">
  <img alt="the milli logo" src="../assets/milli-logo.svg">
</p>

<p align="center">a concurrent indexer combined with fast and relevant search algorithms</p>

## Introduction

This crate contains the internal engine used by [Meilisearch].

It provides a library that manages one and only one index; Meilisearch
handles multi-index management itself. Milli also does not store updates:
that is the job of the layer above, which is why milli can only process
one update at a time.

[Meilisearch]: https://github.com/meilisearch/meilisearch
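
To make the single-index contract concrete, here is a minimal sketch of opening one milli index, using only the EnvOpenOptions/Index API exercised by the examples below (the path and map size are illustrative, not part of this commit):

use heed::EnvOpenOptions;
use milli::Index;

fn open_one_index() -> milli::Result<Index> {
    // One `Index` owns exactly one LMDB environment on disk; managing several
    // indexes, and queuing their updates, is the job of the layer above.
    std::fs::create_dir_all("books.ms").unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024 * 1024); // 10 GiB, illustrative
    Index::new(options, "books.ms")
}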

crates/milli/examples/index.rs (Normal file, 114 lines)
@@ -0,0 +1,114 @@
use std::error::Error;
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::path::Path;

use heed::EnvOpenOptions;
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{Index, Object};

fn usage(error: &str, program_name: &str) -> String {
    format!(
        "{}. Usage: {} <PATH-TO-INDEX> <PATH-TO-DATASET> [searchable_fields] [filterable_fields]",
        error, program_name
    )
}

fn main() -> Result<(), Box<dyn Error>> {
    let mut args = std::env::args();
    let program_name = args.next().expect("No program name");
    let index_path =
        args.next().unwrap_or_else(|| panic!("{}", usage("Missing path to index.", &program_name)));
    let dataset_path = args
        .next()
        .unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name)));
    // let primary_key = args.next().unwrap_or_else(|| "id".into());
    // "title overview"
    let searchable_fields: Vec<String> = args
        .next()
        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
        .unwrap_or_default();

    println!("{searchable_fields:?}");
    // "release_date genres"
    let filterable_fields: Vec<String> = args
        .next()
        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
        .unwrap_or_default();

    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    std::fs::create_dir_all(&index_path).unwrap();
    let index = Index::new(options, index_path).unwrap();
    let mut wtxn = index.write_txn().unwrap();

    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    // builder.set_primary_key(primary_key);
    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(filterable_fields);

    builder.execute(|_| (), || false).unwrap();

    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();

    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();

    let documents = documents_from(
        &dataset_path,
        Path::new(&dataset_path).extension().unwrap_or_default().to_str().unwrap_or_default(),
    );
    let (builder, user_error) = builder.add_documents(documents).unwrap();
    user_error.unwrap();
    builder.execute().unwrap();
    wtxn.commit().unwrap();

    index.prepare_for_closing().wait();
    Ok(())
}
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
    let reader = File::open(filename)
        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
    let reader = BufReader::new(reader);
    let documents = match filetype {
        "csv" => documents_from_csv(reader).unwrap(),
        "json" => documents_from_json(reader).unwrap(),
        "jsonl" => documents_from_jsonl(reader).unwrap(),
        otherwise => panic!("invalid update format {:?}", otherwise),
    };
    DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
}

fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let mut documents = DocumentsBatchBuilder::new(Vec::new());

    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
        let object = result.unwrap();
        documents.append_json_object(&object)?;
    }

    documents.into_inner().map_err(Into::into)
}

fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let mut documents = DocumentsBatchBuilder::new(Vec::new());

    documents.append_json_array(reader)?;

    documents.into_inner().map_err(Into::into)
}

fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let csv = csv::Reader::from_reader(reader);

    let mut documents = DocumentsBatchBuilder::new(Vec::new());
    documents.append_csv(csv)?;

    documents.into_inner().map_err(Into::into)
}

crates/milli/examples/search.rs (Normal file, 124 lines)
@@ -0,0 +1,124 @@
use std::error::Error;
use std::io::stdin;
use std::path::Path;
use std::time::Instant;

use heed::EnvOpenOptions;
use milli::{
    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
    SearchLogger, TermsMatchingStrategy, TimeBudget,
};

#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

fn main() -> Result<(), Box<dyn Error>> {
    let mut args = std::env::args();
    let program_name = args.next().expect("No program name");
    let dataset = args.next().unwrap_or_else(|| {
        panic!(
            "Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
            program_name
        )
    });
    let detailed_logger_dir = args.next();
    let print_documents: bool =
        if let Some(arg) = args.next() { arg == "print-documents" } else { false };

    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    let index = Index::new(options, dataset)?;
    let txn = index.read_txn()?;
    let mut query = String::new();
    while stdin().read_line(&mut query)? > 0 {
        for _ in 0..2 {
            let mut default_logger = DefaultSearchLogger;
            // FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible.
            // Worked around here by recreating the logger on each iteration of the loop.
            let mut detailed_logger = detailed_logger_dir
                .as_ref()
                .map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir));
            let logger: &mut dyn SearchLogger<_> =
                if let Some((detailed_logger, _)) = detailed_logger.as_mut() {
                    detailed_logger
                } else {
                    &mut default_logger
                };

            let start = Instant::now();

            let mut ctx = SearchContext::new(&index, &txn)?;
            let universe = filtered_universe(ctx.index, ctx.txn, &None)?;

            let docs = execute_search(
                &mut ctx,
                (!query.trim().is_empty()).then(|| query.trim()),
                TermsMatchingStrategy::Last,
                milli::score_details::ScoringStrategy::Skip,
                false,
                universe,
                &None,
                &None,
                GeoSortStrategy::default(),
                0,
                20,
                None,
                &mut DefaultSearchLogger,
                logger,
                TimeBudget::max(),
                None,
                None,
            )?;
            if let Some((logger, dir)) = detailed_logger {
                logger.finish(&mut ctx, Path::new(dir))?;
            }
            let elapsed = start.elapsed();
            println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
            if print_documents {
                let documents = index
                    .documents(&txn, docs.documents_ids.iter().copied())
                    .unwrap()
                    .into_iter()
                    .map(|(id, obkv)| {
                        let mut object = serde_json::Map::default();
                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
                            let value = obkv.get(fid).unwrap();
                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
                            object.insert(fid_name.to_owned(), value);
                        }
                        (id, serde_json::to_string_pretty(&object).unwrap())
                    })
                    .collect::<Vec<_>>();

                for (id, document) in documents {
                    println!("{id}:");
                    println!("{document}");
                }

                let documents = index
                    .documents(&txn, docs.documents_ids.iter().copied())
                    .unwrap()
                    .into_iter()
                    .map(|(id, obkv)| {
                        let mut object = serde_json::Map::default();
                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
                            let value = obkv.get(fid).unwrap();
                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
                            object.insert(fid_name.to_owned(), value);
                        }
                        (id, serde_json::to_string_pretty(&object).unwrap())
                    })
                    .collect::<Vec<_>>();
                println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
                for (id, document) in documents {
                    println!("{id}:");
                    println!("{document}");
                }
            }
        }
        query.clear();
    }

    Ok(())
}

crates/milli/examples/settings.rs (Normal file, 33 lines)
@@ -0,0 +1,33 @@
// use big_s::S;
use heed::EnvOpenOptions;
// use maplit::hashset;
use milli::{
    update::{IndexerConfig, Settings},
    Criterion, Index,
};

fn main() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    let index = Index::new(options, "data_movies.ms").unwrap();
    let mut wtxn = index.write_txn().unwrap();

    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);

    // builder.set_min_word_len_one_typo(5);
    // builder.set_min_word_len_two_typos(7);
    // builder.set_sortable_fields(hashset! { S("release_date") });
    builder.set_criteria(vec![
        Criterion::Words,
        Criterion::Typo,
        Criterion::Proximity,
        Criterion::Attribute,
        Criterion::Sort,
        Criterion::Exactness,
    ]);

    builder.execute(|_| (), || false).unwrap();
    wtxn.commit().unwrap();
}

crates/milli/fuzz/.gitignore (vendored, Normal file, 3 lines)
@@ -0,0 +1,3 @@
target
corpus
artifacts

crates/milli/src/asc_desc.rs (Normal file, 298 lines)
@@ -0,0 +1,298 @@
//! This module provides the `AscDesc` type and defines all the errors related to this type.

use std::fmt;
use std::str::FromStr;

use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::error::is_reserved_keyword;
use crate::search::facet::BadGeoError;
use crate::{CriterionError, Error, UserError};

/// This error type is never supposed to be shown to the end user.
/// You must always cast it to a sort error or a criterion error.
#[derive(Error, Debug)]
pub enum AscDescError {
    #[error(transparent)]
    GeoError(BadGeoError),
    #[error("Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")]
    InvalidSyntax { name: String },
    #[error("`{name}` is a reserved keyword and thus can't be used as an asc/desc rule.")]
    ReservedKeyword { name: String },
}

impl From<BadGeoError> for AscDescError {
    fn from(geo_error: BadGeoError) -> Self {
        AscDescError::GeoError(geo_error)
    }
}

impl From<AscDescError> for CriterionError {
    fn from(error: AscDescError) -> Self {
        match error {
            AscDescError::GeoError(_) => {
                CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }
            }
            AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name },
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => {
                CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }
            }
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
                CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() }
            }
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
                CriterionError::ReservedNameForFilter { name: "_geoBoundingBox".to_string() }
            }
            AscDescError::ReservedKeyword { name } => CriterionError::ReservedName { name },
        }
    }
}

#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub enum Member {
    Field(String),
    Geo([f64; 2]),
}

impl FromStr for Member {
    type Err = AscDescError;

    fn from_str(text: &str) -> Result<Member, Self::Err> {
        match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(')')) {
            Some(point) => {
                let (lat, lng) = point
                    .split_once(',')
                    .ok_or_else(|| AscDescError::ReservedKeyword { name: text.to_string() })
                    .and_then(|(lat, lng)| {
                        lat.trim()
                            .parse()
                            .and_then(|lat| lng.trim().parse().map(|lng| (lat, lng)))
                            .map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() })
                    })?;
                if !(-90.0..=90.0).contains(&lat) {
                    return Err(BadGeoError::Lat(lat))?;
                } else if !(-180.0..=180.0).contains(&lng) {
                    return Err(BadGeoError::Lng(lng))?;
                }
                Ok(Member::Geo([lat, lng]))
            }
            None => {
                if is_reserved_keyword(text)
                    || text.starts_with("_geoRadius(")
                    || text.starts_with("_geoBoundingBox(")
                    || text.starts_with("_geo(")
                    || text.starts_with("_geoDistance(")
                {
                    return Err(AscDescError::ReservedKeyword { name: text.to_string() })?;
                }
                Ok(Member::Field(text.to_string()))
            }
        }
    }
}

impl fmt::Display for Member {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Member::Field(name) => f.write_str(name),
            Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng),
        }
    }
}

impl Member {
    pub fn field(&self) -> Option<&str> {
        match self {
            Member::Field(field) => Some(field),
            Member::Geo(_) => None,
        }
    }

    pub fn geo_point(&self) -> Option<&[f64; 2]> {
        match self {
            Member::Geo(point) => Some(point),
            Member::Field(_) => None,
        }
    }
}

#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub enum AscDesc {
    Asc(Member),
    Desc(Member),
}

impl AscDesc {
    pub fn member(&self) -> &Member {
        match self {
            AscDesc::Asc(member) => member,
            AscDesc::Desc(member) => member,
        }
    }

    pub fn field(&self) -> Option<&str> {
        self.member().field()
    }
}

impl FromStr for AscDesc {
    type Err = AscDescError;

    fn from_str(text: &str) -> Result<AscDesc, Self::Err> {
        match text.rsplit_once(':') {
            Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)),
            Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)),
            _ => Err(AscDescError::InvalidSyntax { name: text.to_string() }),
        }
    }
}

#[derive(Error, Debug)]
pub enum SortError {
    #[error(transparent)]
    ParseGeoError { error: BadGeoError },
    #[error("Invalid syntax for the geo parameter: expected expression formatted like \
        `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{name}`.")]
    BadGeoPointUsage { name: String },
    #[error("Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")]
    InvalidName { name: String },
    #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression.")]
    ReservedName { name: String },
    #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. \
        Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")]
    ReservedNameForSettings { name: String },
    #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. \
        Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")]
    ReservedNameForFilter { name: String },
}

impl From<AscDescError> for SortError {
    fn from(error: AscDescError) -> Self {
        match error {
            AscDescError::GeoError(error) => SortError::ParseGeoError { error },
            AscDescError::InvalidSyntax { name } => SortError::InvalidName { name },
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => {
                SortError::BadGeoPointUsage { name }
            }
            AscDescError::ReservedKeyword { name } if &name == "_geo" => {
                SortError::ReservedNameForSettings { name }
            }
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
                SortError::ReservedNameForFilter { name: String::from("_geoRadius") }
            }
            AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
                SortError::ReservedNameForFilter { name: String::from("_geoBoundingBox") }
            }
            AscDescError::ReservedKeyword { name } => SortError::ReservedName { name },
        }
    }
}

impl From<SortError> for Error {
    fn from(error: SortError) -> Self {
        Self::UserError(UserError::SortError(error))
    }
}

#[cfg(test)]
mod tests {
    use big_s::S;
    use AscDesc::*;
    use AscDescError::*;
    use Member::*;

    use super::*;

    #[test]
    fn parse_asc_desc() {
        let valid_req = [
            ("truc:asc", Asc(Field(S("truc")))),
            ("bidule:desc", Desc(Field(S("bidule")))),
            ("a-b:desc", Desc(Field(S("a-b")))),
            ("a:b:desc", Desc(Field(S("a:b")))),
            ("a12:asc", Asc(Field(S("a12")))),
            ("42:asc", Asc(Field(S("42")))),
            ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))),
            ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))),
            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
            ("_geoPoint(90.000000000, 180):desc", Desc(Geo([90., 180.]))),
            ("_geoPoint(-90, -180.0000000000):asc", Asc(Geo([-90., -180.]))),
            ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))),
            ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))),
            ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))),
        ];

        for (req, expected) in valid_req {
            let res = req.parse::<AscDesc>();
            assert!(
                res.is_ok(),
                "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`",
                req,
                expected,
                res
            );
            assert_eq!(res.unwrap(), expected);
        }

        let invalid_req = [
            ("truc:machin", InvalidSyntax { name: S("truc:machin") }),
            ("truc:deesc", InvalidSyntax { name: S("truc:deesc") }),
            ("truc:asc:deesc", InvalidSyntax { name: S("truc:asc:deesc") }),
            ("42desc", InvalidSyntax { name: S("42desc") }),
            ("_geoPoint:asc", ReservedKeyword { name: S("_geoPoint") }),
            ("_geoDistance:asc", ReservedKeyword { name: S("_geoDistance") }),
            ("_geoPoint(42.12 , 59.598)", InvalidSyntax { name: S("_geoPoint(42.12 , 59.598)") }),
            (
                "_geoPoint(42.12 , 59.598):deesc",
                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):deesc") },
            ),
            (
                "_geoPoint(42.12 , 59.598):machin",
                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):machin") },
            ),
            (
                "_geoPoint(42.12 , 59.598):asc:aasc",
                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):asc:aasc") },
            ),
            (
                "_geoPoint(42,12 , 59,598):desc",
                ReservedKeyword { name: S("_geoPoint(42,12 , 59,598)") },
            ),
            ("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }),
            ("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }),
            ("_geoPoint(200, 200):asc", GeoError(BadGeoError::Lat(200.))),
            ("_geoPoint(90.000001, 0):asc", GeoError(BadGeoError::Lat(90.000001))),
            ("_geoPoint(0, -180.000001):desc", GeoError(BadGeoError::Lng(-180.000001))),
            ("_geoPoint(159.256, 130):asc", GeoError(BadGeoError::Lat(159.256))),
            ("_geoPoint(12, -2021):desc", GeoError(BadGeoError::Lng(-2021.))),
            ("_geo(12, -2021):asc", ReservedKeyword { name: S("_geo(12, -2021)") }),
            ("_geo(12, -2021):desc", ReservedKeyword { name: S("_geo(12, -2021)") }),
            ("_geoDistance(12, -2021):asc", ReservedKeyword { name: S("_geoDistance(12, -2021)") }),
            (
                "_geoDistance(12, -2021):desc",
                ReservedKeyword { name: S("_geoDistance(12, -2021)") },
            ),
        ];

        for (req, expected_error) in invalid_req {
            let res = req.parse::<AscDesc>();
            assert!(
                res.is_err(),
                "Should not be able to parse `{}`, was expecting an error but instead got: `{:?}`",
                req,
                res,
            );
            let res = res.unwrap_err();
            assert_eq!(
                res.to_string(),
                expected_error.to_string(),
                "Bad error for input {}: got `{:?}` instead of `{:?}`",
                req,
                res,
                expected_error
            );
        }
    }
}
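
The parser is driven through FromStr, as the tests above show; a minimal usage sketch, relying on the crate-root re-exports of AscDesc and Member that criterion.rs below also imports:

use milli::{AscDesc, Member};

fn demo_asc_desc() {
    // A plain field rule: everything before the final `:asc`/`:desc` is the field.
    let rule: AscDesc = "price:asc".parse().expect("a valid asc/desc rule");
    assert_eq!(rule.field(), Some("price"));

    // A `_geoPoint(lat, lng)` prefix becomes a geo member with validated coordinates.
    let geo: AscDesc = "_geoPoint(42, 59):asc".parse().unwrap();
    assert!(matches!(geo.member(), Member::Geo(_)));
}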

crates/milli/src/criterion.rs (Normal file, 188 lines)
@@ -0,0 +1,188 @@
use std::fmt;
use std::str::FromStr;

use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::{AscDesc, Member};

#[derive(Error, Debug)]
pub enum CriterionError {
    #[error("`{name}` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules.")]
    InvalidName { name: String },
    #[error("`{name}` is a reserved keyword and thus can't be used as a ranking rule")]
    ReservedName { name: String },
    #[error(
        "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \
        `{name}` can only be used for sorting at search time"
    )]
    ReservedNameForSort { name: String },
    #[error(
        "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \
        `{name}` can only be used for filtering at search time"
    )]
    ReservedNameForFilter { name: String },
}

#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub enum Criterion {
    /// Sorted by decreasing number of matched query terms.
    /// Query words at the front of an attribute are considered better than at the back.
    Words,
    /// Sorted by increasing number of typos.
    Typo,
    /// Sorted by increasing distance between matched query terms.
    Proximity,
    /// Documents with query words contained in more important
    /// attributes are considered better.
    Attribute,
    /// Dynamically sort the documents at query time. None, one or multiple Asc/Desc sortable
    /// attributes can be used in place of this criterion at query time.
    Sort,
    /// Sorted by the similarity of the matched words with the query words.
    Exactness,
    /// Sorted by the increasing value of the field specified.
    Asc(String),
    /// Sorted by the decreasing value of the field specified.
    Desc(String),
}

impl Criterion {
    /// Returns the field name parameter of this criterion.
    pub fn field_name(&self) -> Option<&str> {
        match self {
            Criterion::Asc(name) | Criterion::Desc(name) => Some(name),
            _otherwise => None,
        }
    }
}

impl FromStr for Criterion {
    type Err = CriterionError;

    fn from_str(text: &str) -> Result<Criterion, Self::Err> {
        match text {
            "words" => Ok(Criterion::Words),
            "typo" => Ok(Criterion::Typo),
            "proximity" => Ok(Criterion::Proximity),
            "attribute" => Ok(Criterion::Attribute),
            "sort" => Ok(Criterion::Sort),
            "exactness" => Ok(Criterion::Exactness),
            text => match AscDesc::from_str(text)? {
                AscDesc::Asc(Member::Field(field)) => Ok(Criterion::Asc(field)),
                AscDesc::Desc(Member::Field(field)) => Ok(Criterion::Desc(field)),
                AscDesc::Asc(Member::Geo(_)) | AscDesc::Desc(Member::Geo(_)) => {
                    Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })?
                }
            },
        }
    }
}

pub fn default_criteria() -> Vec<Criterion> {
    vec![
        Criterion::Words,
        Criterion::Typo,
        Criterion::Proximity,
        Criterion::Attribute,
        Criterion::Sort,
        Criterion::Exactness,
    ]
}

impl fmt::Display for Criterion {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use Criterion::*;

        match self {
            Words => f.write_str("words"),
            Typo => f.write_str("typo"),
            Proximity => f.write_str("proximity"),
            Attribute => f.write_str("attribute"),
            Sort => f.write_str("sort"),
            Exactness => f.write_str("exactness"),
            Asc(attr) => write!(f, "{}:asc", attr),
            Desc(attr) => write!(f, "{}:desc", attr),
        }
    }
}

#[cfg(test)]
mod tests {
    use big_s::S;
    use CriterionError::*;

    use super::*;

    #[test]
    fn parse_criterion() {
        let valid_criteria = [
            ("words", Criterion::Words),
            ("typo", Criterion::Typo),
            ("proximity", Criterion::Proximity),
            ("attribute", Criterion::Attribute),
            ("sort", Criterion::Sort),
            ("exactness", Criterion::Exactness),
            ("price:asc", Criterion::Asc(S("price"))),
            ("price:desc", Criterion::Desc(S("price"))),
            ("price:asc:desc", Criterion::Desc(S("price:asc"))),
            ("truc:machin:desc", Criterion::Desc(S("truc:machin"))),
            ("hello-world!:desc", Criterion::Desc(S("hello-world!"))),
            ("it's spacy over there:asc", Criterion::Asc(S("it's spacy over there"))),
        ];

        for (input, expected) in valid_criteria {
            let res = input.parse::<Criterion>();
            assert!(
                res.is_ok(),
                "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`",
                input,
                expected,
                res
            );
            assert_eq!(res.unwrap(), expected);
        }

        let invalid_criteria = [
            ("words suffix", InvalidName { name: S("words suffix") }),
            ("prefix typo", InvalidName { name: S("prefix typo") }),
            ("proximity attribute", InvalidName { name: S("proximity attribute") }),
            ("price", InvalidName { name: S("price") }),
            ("asc:price", InvalidName { name: S("asc:price") }),
            ("price:deesc", InvalidName { name: S("price:deesc") }),
            ("price:aasc", InvalidName { name: S("price:aasc") }),
            ("price:asc and desc", InvalidName { name: S("price:asc and desc") }),
            ("price:asc:truc", InvalidName { name: S("price:asc:truc") }),
            ("_geo:asc", ReservedName { name: S("_geo") }),
            ("_geoDistance:asc", ReservedName { name: S("_geoDistance") }),
            ("_geoPoint:asc", ReservedNameForSort { name: S("_geoPoint") }),
            ("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }),
            ("_geoRadius:asc", ReservedNameForFilter { name: S("_geoRadius") }),
            ("_geoRadius(42, 75, 59):asc", ReservedNameForFilter { name: S("_geoRadius") }),
            ("_geoBoundingBox:asc", ReservedNameForFilter { name: S("_geoBoundingBox") }),
            (
                "_geoBoundingBox([42, 75], [75, 59]):asc",
                ReservedNameForFilter { name: S("_geoBoundingBox") },
            ),
        ];

        for (input, expected) in invalid_criteria {
            let res = input.parse::<Criterion>();
            assert!(
                res.is_err(),
                "Should not be able to parse `{}`, was expecting an error but instead got: `{:?}`",
                input,
                res
            );
            let res = res.unwrap_err();
            assert_eq!(
                res.to_string(),
                expected.to_string(),
                "Bad error for input {}: got `{:?}` instead of `{:?}`",
                input,
                res,
                expected
            );
        }
    }
}
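
A short sketch of how these rules are used, assuming default_criteria is re-exported at the crate root alongside Criterion and CriterionError (CriterionError demonstrably is, since asc_desc.rs imports it from the crate root):

use milli::{default_criteria, Criterion, CriterionError};

fn demo_criterion() -> Result<(), CriterionError> {
    // A custom rule parses through the same AscDesc machinery and round-trips via Display.
    let rule: Criterion = "price:desc".parse()?;
    assert_eq!(rule.to_string(), "price:desc");
    assert_eq!(rule.field_name(), Some("price"));

    // The default ranking pipeline is the six built-in rules, in order.
    assert_eq!(default_criteria().len(), 6);
    Ok(())
}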

crates/milli/src/documents/builder.rs (Normal file, 600 lines)
@@ -0,0 +1,600 @@
use std::io::{self, Write};

use grenad::{CompressionType, WriterBuilder};
use serde::de::Deserializer;
use serde_json::{to_writer, Value};

use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
use crate::documents::serde_impl::DocumentVisitor;
use crate::Object;

/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
/// format used by milli.
///
/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
/// to iterate over the documents.
///
/// ## Example
/// ```
/// use serde_json::json;
/// use milli::documents::DocumentsBatchBuilder;
///
/// let json = json!({ "id": 1, "name": "foo" });
///
/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
/// builder.append_json_object(json.as_object().unwrap()).unwrap();
/// let _vector = builder.into_inner().unwrap();
/// ```
pub struct DocumentsBatchBuilder<W> {
    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
    writer: grenad::Writer<W>,
    /// A map that creates the relation between field ids and field names.
    fields_index: DocumentsBatchIndex,
    /// The number of documents that were added to this builder,
    /// it doesn't take the primary key of the documents into account at this point.
    documents_count: u32,

    /// A buffer to store a temporary obkv buffer and avoid reallocating.
    obkv_buffer: Vec<u8>,
    /// A buffer to serialize the values and avoid reallocating,
    /// serialized values are stored in an obkv.
    value_buffer: Vec<u8>,
}

impl<W: Write> DocumentsBatchBuilder<W> {
    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
        DocumentsBatchBuilder {
            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
            fields_index: DocumentsBatchIndex::default(),
            documents_count: 0,
            obkv_buffer: Vec::new(),
            value_buffer: Vec::new(),
        }
    }

    /// Returns the number of documents inserted into this builder.
    pub fn documents_count(&self) -> u32 {
        self.documents_count
    }

    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
    pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> {
        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
        let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(k)).collect();
        fields_ids.sort_unstable();

        self.obkv_buffer.clear();
        let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
        for field_id in fields_ids {
            let key = self.fields_index.name(field_id).unwrap();
            self.value_buffer.clear();
            to_writer(&mut self.value_buffer, &object[key])?;
            writer.insert(field_id, &self.value_buffer)?;
        }

        let internal_id = self.documents_count.to_be_bytes();
        let document_bytes = writer.into_inner()?;
        self.writer.insert(internal_id, &document_bytes)?;
        self.documents_count += 1;

        Ok(())
    }

    /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly.
    pub fn append_json_array<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
        let mut de = serde_json::Deserializer::from_reader(reader);
        let mut visitor = DocumentVisitor::new(self);
        de.deserialize_any(&mut visitor)?
    }

    /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
    pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
        let mut typed_fields_ids: Vec<_> = reader
            .headers()?
            .into_iter()
            .map(parse_csv_header)
            .map(|(k, t)| (self.fields_index.insert(k), t))
            .enumerate()
            .collect();
        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
        typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);

        let mut record = csv::StringRecord::new();
        let mut line = 0;
        while reader.read_record(&mut record)? {
            // We increment here and not at the end of the while loop to take
            // the header offset into account.
            line += 1;

            self.obkv_buffer.clear();
            let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);

            for (i, (field_id, type_)) in typed_fields_ids.iter() {
                self.value_buffer.clear();

                let value = &record[*i];
                let trimmed_value = value.trim();
                match type_ {
                    AllowedType::Number => {
                        if trimmed_value.is_empty() {
                            to_writer(&mut self.value_buffer, &Value::Null)?;
                        } else if let Ok(integer) = trimmed_value.parse::<i64>() {
                            to_writer(&mut self.value_buffer, &integer)?;
                        } else {
                            match trimmed_value.parse::<f64>() {
                                Ok(float) => {
                                    to_writer(&mut self.value_buffer, &float)?;
                                }
                                Err(error) => {
                                    return Err(Error::ParseFloat {
                                        error,
                                        line,
                                        value: value.to_string(),
                                    });
                                }
                            }
                        }
                    }
                    AllowedType::Boolean => {
                        if trimmed_value.is_empty() {
                            to_writer(&mut self.value_buffer, &Value::Null)?;
                        } else {
                            match trimmed_value.parse::<bool>() {
                                Ok(bool) => {
                                    to_writer(&mut self.value_buffer, &bool)?;
                                }
                                Err(error) => {
                                    return Err(Error::ParseBool {
                                        error,
                                        line,
                                        value: value.to_string(),
                                    });
                                }
                            }
                        }
                    }
                    AllowedType::String => {
                        if value.is_empty() {
                            to_writer(&mut self.value_buffer, &Value::Null)?;
                        } else {
                            to_writer(&mut self.value_buffer, value)?;
                        }
                    }
                }

                // We insert into the obkv writer the value buffer that has been filled just above.
                writer.insert(*field_id, &self.value_buffer)?;
            }

            let internal_id = self.documents_count.to_be_bytes();
            let document_bytes = writer.into_inner()?;
            self.writer.insert(internal_id, &document_bytes)?;
            self.documents_count += 1;
        }

        Ok(())
    }

    /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`.
    pub fn into_inner(self) -> io::Result<W> {
        let DocumentsBatchBuilder { mut writer, fields_index, mut value_buffer, .. } = self;

        // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer.
        value_buffer.clear();
        to_writer(&mut value_buffer, &fields_index)?;
        writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &value_buffer)?;

        writer.into_inner()
    }
}

#[derive(Debug)]
enum AllowedType {
    String,
    Boolean,
    Number,
}

fn parse_csv_header(header: &str) -> (&str, AllowedType) {
    // if there are several separators we only split on the last one.
    match header.rsplit_once(':') {
        Some((field_name, field_type)) => match field_type {
            "string" => (field_name, AllowedType::String),
            "boolean" => (field_name, AllowedType::Boolean),
            "number" => (field_name, AllowedType::Number),
            // if the pattern isn't recognized, we keep the whole field.
            _otherwise => (header, AllowedType::String),
        },
        None => (header, AllowedType::String),
    }
}

#[cfg(test)]
mod test {
    use std::io::Cursor;

    use serde_json::json;

    use super::*;
    use crate::documents::{obkv_to_object, DocumentsBatchReader};

    #[test]
    fn add_single_documents_json() {
        let json = serde_json::json!({
            "id": 1,
            "field": "hello!",
        });

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_json_object(json.as_object().unwrap()).unwrap();

        let json = serde_json::json!({
            "blabla": false,
            "field": "hello!",
            "id": 1,
        });

        builder.append_json_object(json.as_object().unwrap()).unwrap();

        assert_eq!(builder.documents_count(), 2);
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();
        assert_eq!(index.len(), 3);

        let document = cursor.next_document().unwrap().unwrap();
        assert_eq!(document.iter().count(), 2);

        let document = cursor.next_document().unwrap().unwrap();
        assert_eq!(document.iter().count(), 3);

        assert!(cursor.next_document().unwrap().is_none());
    }

    #[test]
    fn add_documents_csv() {
        let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        assert_eq!(builder.documents_count(), 2);
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();
        assert_eq!(index.len(), 2);

        let document = cursor.next_document().unwrap().unwrap();
        assert_eq!(document.iter().count(), 2);

        let document = cursor.next_document().unwrap().unwrap();
        assert_eq!(document.iter().count(), 2);

        assert!(cursor.next_document().unwrap().is_none());
    }

    #[test]
    fn simple_csv_document() {
        let csv_content = r#"city,country,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();
        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city": "Boston",
                "country": "United States",
                "pop": "4628910",
            })
        );

        assert!(cursor.next_document().unwrap().is_none());
    }

    #[test]
    fn comma_in_field() {
        let csv_content = r#"city,country,pop
"Boston","United, States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city": "Boston",
                "country": "United, States",
                "pop": "4628910",
            })
        );
    }

    #[test]
    fn quote_in_field() {
        let csv_content = r#"city,country,pop
"Boston","United"" States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city": "Boston",
                "country": "United\" States",
                "pop": "4628910",
            })
        );
    }

    #[test]
    fn integer_in_field() {
        let csv_content = r#"city,country,pop:number
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city": "Boston",
                "country": "United States",
                "pop": 4628910,
            })
        );
    }

    #[test]
    fn integer_as_id() {
        let csv_content = r#""id:number","title:string","comment:string"
"1239","Pride and Prejudice","A great book""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "id": 1239,
                "title": "Pride and Prejudice",
                "comment": "A great book",
            })
        );
    }

    #[test]
    fn float_in_field() {
        let csv_content = r#"city,country,pop:number
"Boston","United States","4628910.01""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city": "Boston",
                "country": "United States",
                "pop": 4628910.01,
            })
        );
    }

    #[test]
    fn several_colon_in_header() {
        let csv_content = r#"city:love:string,country:state,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city:love": "Boston",
                "country:state": "United States",
                "pop": "4628910",
            })
        );
    }

    #[test]
    fn ending_by_colon_in_header() {
        let csv_content = r#"city:,country,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city:": "Boston",
                "country": "United States",
                "pop": "4628910",
            })
        );
    }

    #[test]
    fn starting_by_colon_in_header() {
        let csv_content = r#":city,country,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                ":city": "Boston",
                "country": "United States",
                "pop": "4628910",
            })
        );
    }

    #[ignore]
    #[test]
    fn starting_by_colon_in_header2() {
        let csv_content = r#":string,country,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        assert!(cursor.next_document().is_err());
    }

    #[test]
    fn double_colon_in_header() {
        let csv_content = r#"city::string,country,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
            json!({
                "city:": "Boston",
                "country": "United States",
                "pop": "4628910",
            })
        );
    }

    #[test]
    fn bad_type_in_header() {
        let csv_content = r#"city,country:number,pop
"Boston","United States","4628910""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        assert!(builder.append_csv(csv).is_err());
    }

    #[test]
    fn bad_column_count1() {
        let csv_content = r#"city,country,pop
"Boston","United States","4628910", "too much
let csv = csv::Reader::from_reader(Cursor::new(csv_content"#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        assert!(builder.append_csv(csv).is_err());
    }

    #[test]
    fn bad_column_count2() {
        let csv_content = r#"city,country,pop
"Boston","United States""#;
        let csv = csv::Reader::from_reader(Cursor::new(csv_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        assert!(builder.append_csv(csv).is_err());
    }
}

crates/milli/src/documents/enriched.rs (Normal file, 110 lines)
@@ -0,0 +1,110 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};

use obkv::KvReader;

use super::{
    DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader,
    Error,
};
use crate::update::DocumentId;
use crate::FieldId;

/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have
/// been created with a `DocumentsBatchBuilder` and, for the enriched data,
/// a simple `grenad::Reader<File>`.
///
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
pub struct EnrichedDocumentsBatchReader<R> {
    documents: DocumentsBatchReader<R>,
    primary_key: String,
    external_ids: grenad::ReaderCursor<BufReader<File>>,
}

impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
    pub fn new(
        documents: DocumentsBatchReader<R>,
        primary_key: String,
        external_ids: grenad::Reader<BufReader<File>>,
    ) -> Result<Self, Error> {
        if documents.documents_count() as u64 == external_ids.len() {
            Ok(EnrichedDocumentsBatchReader {
                documents,
                primary_key,
                external_ids: external_ids.into_cursor()?,
            })
        } else {
            Err(Error::InvalidEnrichedData)
        }
    }

    pub fn documents_count(&self) -> u32 {
        self.documents.documents_count()
    }

    pub fn primary_key(&self) -> &str {
        &self.primary_key
    }

    pub fn is_empty(&self) -> bool {
        self.documents.is_empty()
    }

    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
        self.documents.documents_batch_index()
    }

    /// This method returns a forward cursor over the enriched documents.
    pub fn into_cursor_and_fields_index(
        self,
    ) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
        let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
        let (documents, fields_index) = documents.into_cursor_and_fields_index();
        external_ids.reset();
        (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
    }
}

#[derive(Debug, Clone)]
pub struct EnrichedDocument<'a> {
    pub document: KvReader<'a, FieldId>,
    pub document_id: DocumentId,
}

pub struct EnrichedDocumentsBatchCursor<R> {
    documents: DocumentsBatchCursor<R>,
    primary_key: String,
    external_ids: grenad::ReaderCursor<BufReader<File>>,
}

impl<R> EnrichedDocumentsBatchCursor<R> {
    pub fn primary_key(&self) -> &str {
        &self.primary_key
    }
    /// Resets the cursor to be able to read from the start again.
    pub fn reset(&mut self) {
        self.documents.reset();
        self.external_ids.reset();
    }
}

impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
    /// Returns the next document, starting from the first one. Subsequent calls to
    /// `next_enriched_document` advance the document reader until all the documents have been read.
    pub fn next_enriched_document(
        &mut self,
    ) -> Result<Option<EnrichedDocument<'_>>, DocumentsBatchCursorError> {
        let document = self.documents.next_document()?;
        let document_id = match self.external_ids.move_on_next()? {
            Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?,
            None => None,
        };

        match document.zip(document_id) {
            Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })),
            None => Ok(None),
        }
    }
}
273
crates/milli/src/documents/mod.rs
Normal file
@ -0,0 +1,273 @@
mod builder;
mod enriched;
mod primary_key;
mod reader;
mod serde_impl;

use std::fmt::Debug;
use std::io;
use std::str::Utf8Error;

use bimap::BiHashMap;
pub use builder::DocumentsBatchBuilder;
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
use obkv::KvReader;
pub use primary_key::{
    validate_document_id_value, DocumentIdExtractionError, FieldIdMapper, PrimaryKey,
    DEFAULT_PRIMARY_KEY,
};
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
use serde::{Deserialize, Serialize};

use crate::error::{FieldIdMapMissingEntry, InternalError};
use crate::{FieldId, Object, Result};

/// The key that is used to store the `DocumentsBatchIndex` datastructure,
/// it is the absolute last key of the list.
const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();

/// Helper function to convert an obkv reader into a JSON object.
pub fn obkv_to_object(obkv: &KvReader<'_, FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
    obkv.iter()
        .map(|(field_id, value)| {
            let field_name = index
                .name(field_id)
                .ok_or(FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" })?;
            let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
            Ok((field_name.to_string(), value))
        })
        .collect()
}

/// A bidirectional map that links field ids to their name in a document batch.
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);

impl DocumentsBatchIndex {
    /// Insert the field in the map, or return its field id if it already exists.
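    ///
    /// An illustrative sketch of the intended behavior:
    ///
    /// ```ignore
    /// let mut index = DocumentsBatchIndex::default();
    /// assert_eq!(index.insert("title"), 0);
    /// assert_eq!(index.insert("title"), 0); // already known: the same id comes back
    /// assert_eq!(index.insert("description"), 1);
    /// ```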
    pub fn insert(&mut self, field: &str) -> FieldId {
        match self.0.get_by_right(field) {
            Some(field_id) => *field_id,
            None => {
                let field_id = self.0.len() as FieldId;
                self.0.insert(field_id, field.to_string());
                field_id
            }
        }
    }

    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    pub fn len(&self) -> usize {
        self.0.len()
    }

    pub fn iter(&self) -> bimap::hash::Iter<'_, FieldId, String> {
        self.0.iter()
    }

    pub fn name(&self, id: FieldId) -> Option<&str> {
        self.0.get_by_left(&id).map(AsRef::as_ref)
    }

    pub fn id(&self, name: &str) -> Option<FieldId> {
        self.0.get_by_right(name).cloned()
    }

    pub fn recreate_json(&self, document: &obkv::KvReaderU16<'_>) -> Result<Object> {
        let mut map = Object::new();

        for (k, v) in document.iter() {
            // TODO: TAMO: update the error type
            let key =
                self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone();
            let value = serde_json::from_slice::<serde_json::Value>(v)
                .map_err(crate::error::InternalError::SerdeJson)?;
            map.insert(key, value);
        }

        Ok(map)
    }
}

impl FieldIdMapper for DocumentsBatchIndex {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.id(name)
    }
}

#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("Error parsing number {value:?} at line {line}: {error}")]
    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
    #[error("Error parsing boolean {value:?} at line {line}: {error}")]
    ParseBool { error: std::str::ParseBoolError, line: usize, value: String },
    #[error("Invalid document addition format, missing the documents batch index.")]
    InvalidDocumentFormat,
    #[error("Invalid enriched data.")]
    InvalidEnrichedData,
    #[error(transparent)]
    InvalidUtf8(#[from] Utf8Error),
    #[error(transparent)]
    Csv(#[from] csv::Error),
    #[error(transparent)]
    Json(#[from] serde_json::Error),
    #[error(transparent)]
    Serialize(serde_json::Error),
    #[error(transparent)]
    Grenad(#[from] grenad::Error),
    #[error(transparent)]
    Io(#[from] io::Error),
}

pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
    let documents = match json {
        object @ serde_json::Value::Object(_) => vec![object],
        serde_json::Value::Array(objects) => objects,
        invalid => {
            panic!("an array of objects must be specified, {:#?} is not an array", invalid)
        }
    };
    let mut objects = vec![];
    for document in documents {
        let object = match document {
            serde_json::Value::Object(object) => object,
            invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
        };
        objects.push(object);
    }
    objects
}

/// Macro used to generate documents, with the same syntax as `serde_json::json`
#[cfg(test)]
macro_rules! documents {
    ($data:tt) => {{
        let documents = serde_json::json!($data);
        let documents = $crate::documents::objects_from_json_value(documents);
        $crate::documents::documents_batch_reader_from_objects(documents)
    }};
}

pub fn documents_batch_reader_from_objects(
    objects: impl IntoIterator<Item = Object>,
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    for object in objects {
        builder.append_json_object(&object).unwrap();
    }
    let vector = builder.into_inner().unwrap();
    DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
}

#[cfg(test)]
mod test {
    use std::io::Cursor;

    use serde_json::{json, Value};

    use super::*;

    #[test]
    fn create_documents_no_errors() {
        let value = json!({
            "number": 1,
            "string": "this is a field",
            "array": ["an", "array"],
            "object": {
                "key": "value",
            },
            "bool": true
        });

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_json_object(value.as_object().unwrap()).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();

        assert_eq!(index.iter().count(), 5);
        let reader = documents.next_document().unwrap().unwrap();
        assert_eq!(reader.iter().count(), 5);
        assert!(documents.next_document().unwrap().is_none());
    }

    #[test]
    fn test_add_multiple_documents() {
        let doc1 = json!({
            "bool": true,
        });
        let doc2 = json!({
            "toto": false,
        });

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_json_object(doc1.as_object().unwrap()).unwrap();
        builder.append_json_object(doc2.as_object().unwrap()).unwrap();
        let vector = builder.into_inner().unwrap();

        let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector))
            .unwrap()
            .into_cursor_and_fields_index();
        assert_eq!(index.iter().count(), 2);
        let reader = documents.next_document().unwrap().unwrap();
        assert_eq!(reader.iter().count(), 1);
        assert!(documents.next_document().unwrap().is_some());
        assert!(documents.next_document().unwrap().is_none());
    }

    #[test]
    fn test_nested() {
        let docs_reader = documents!([{
            "hello": {
                "toto": ["hello"]
            }
        }]);

        let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
        let doc = cursor.next_document().unwrap().unwrap();
        let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
        assert_eq!(nested, json!({ "toto": ["hello"] }));
    }

    #[test]
    fn out_of_order_json_fields() {
        let _documents = documents!([
            {"id": 1,"b": 0},
            {"id": 2,"a": 0,"b": 0},
        ]);
    }

    #[test]
    fn csv_types_dont_panic() {
        let csv1_content =
            "id:number,b:boolean,c,d:number\n1,,,\n2,true,doggo,2\n3,false,the best doggo,-2\n4,,\"Hello, World!\",2.5";
        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv1).unwrap();
        let vector = builder.into_inner().unwrap();

        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
    }

    #[test]
    fn out_of_order_csv_fields() {
        let csv1_content = "id:number,b\n1,0";
        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));

        let csv2_content = "id:number,a,b\n2,0,0";
        let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));

        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        builder.append_csv(csv1).unwrap();
        builder.append_csv(csv2).unwrap();
        let vector = builder.into_inner().unwrap();

        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
    }
}
174
crates/milli/src/documents/primary_key.rs
Normal file
@ -0,0 +1,174 @@
use std::iter;
use std::result::Result as StdResult;

use serde_json::Value;

use crate::{FieldId, InternalError, Object, Result, UserError};

/// The symbol used to define levels in a nested primary key.
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

/// The default primary key that is used when none is specified.
pub const DEFAULT_PRIMARY_KEY: &str = "id";

/// Trait for objects that can map the name of a field to its [`FieldId`].
pub trait FieldIdMapper {
    /// Attempts to map the passed name to its [`FieldId`].
    ///
    /// `None` if the field with this name was not found.
    fn id(&self, name: &str) -> Option<FieldId>;
}

/// A type that represents the type of primary key that has been set
/// for this index, a classic flat one or a nested one.
#[derive(Debug, Clone, Copy)]
pub enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
}

pub enum DocumentIdExtractionError {
    InvalidDocumentId(UserError),
    MissingDocumentId,
    TooManyDocumentIds(usize),
}

impl<'a> PrimaryKey<'a> {
    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
            Self::Nested { name: path }
        } else {
            let field_id = fields.id(path)?;
            Self::Flat { name: path, field_id }
        })
    }

    pub fn name(&self) -> &str {
        match self {
            PrimaryKey::Flat { name, .. } => name,
            PrimaryKey::Nested { name } => name,
        }
    }

    pub fn document_id(
        &self,
        document: &obkv::KvReader<'_, FieldId>,
        fields: &impl FieldIdMapper,
    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
        match self {
            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
                Some(document_id_bytes) => {
                    let document_id = serde_json::from_slice(document_id_bytes)
                        .map_err(InternalError::SerdeJson)?;
                    match validate_document_id_value(document_id) {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    }
                }
                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
            },
            nested @ PrimaryKey::Nested { .. } => {
                let mut matching_documents_ids = Vec::new();
                for (first_level_name, right) in nested.possible_level_names() {
                    if let Some(field_id) = fields.id(first_level_name) {
                        if let Some(value_bytes) = document.get(field_id) {
                            let object = serde_json::from_slice(value_bytes)
                                .map_err(InternalError::SerdeJson)?;
                            fetch_matching_values(object, right, &mut matching_documents_ids);

                            if matching_documents_ids.len() >= 2 {
                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
                                    matching_documents_ids.len(),
                                )));
                            }
                        }
                    }
                }

                match matching_documents_ids.pop() {
                    Some(document_id) => match validate_document_id_value(document_id) {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    },
                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
                }
            }
        }
    }

    /// Returns an `Iterator` that gives all the possible field names the primary key
    /// can have, depending on the first-level name and the depth of the objects.
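    ///
    /// Illustrative: for a nested key `animal.dog.id`, this yields
    /// `("animal", "dog.id")`, `("animal.dog", "id")` and finally `("animal.dog.id", "")`.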
    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        let name = self.name();
        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
            .chain(iter::once((name, "")))
    }
}

fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
    match value {
        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
        otherwise => output.push(otherwise),
    }
}

fn fetch_matching_values_in_object(
    object: Object,
    selector: &str,
    base_key: &str,
    output: &mut Vec<Value>,
) {
    for (key, value) in object {
        let base_key = if base_key.is_empty() {
            key.to_string()
        } else {
            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
        };

        if starts_with(selector, &base_key) {
            match value {
                Value::Object(object) => {
                    fetch_matching_values_in_object(object, selector, &base_key, output)
                }
                value => output.push(value),
            }
        }
    }
}

fn starts_with(selector: &str, key: &str) -> bool {
    selector.strip_prefix(key).map_or(false, |tail| {
        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
    })
}

// FIXME: move to a DocumentId struct

fn validate_document_id(document_id: &str) -> Option<&str> {
    if document_id.is_empty()
        || document_id.len() > 512
        || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
    {
        None
    } else {
        Some(document_id)
    }
}
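
// Illustrative: `validate_document_id("doc-1_a")` is `Some("doc-1_a")`, while an
// empty string, an id longer than 512 bytes, or one containing spaces yields `None`.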

pub fn validate_document_id_value(document_id: Value) -> StdResult<String, UserError> {
    match document_id {
        Value::String(string) => match validate_document_id(&string) {
            Some(s) if s.len() == string.len() => Ok(string),
            Some(s) => Ok(s.to_string()),
            None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }),
        },
        // a `u64` or `i64` cannot be more than 512 bytes once converted to a string
        Value::Number(number) if !number.is_f64() => Ok(number.to_string()),
        content => Err(UserError::InvalidDocumentId { document_id: content }),
    }
}
117
crates/milli/src/documents/reader.rs
Normal file
@ -0,0 +1,117 @@
use std::convert::TryInto;
use std::{error, fmt, io};

use obkv::KvReader;

use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
use crate::FieldId;

/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
/// a `DocumentsBatchWriter`.
///
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
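///
/// A minimal sketch, marked `ignore` since it assumes `vector` holds a batch
/// previously produced by a `DocumentsBatchBuilder`:
///
/// ```ignore
/// let (mut cursor, fields_index) = DocumentsBatchReader::from_reader(Cursor::new(vector))?
///     .into_cursor_and_fields_index();
/// while let Some(document) = cursor.next_document()? {
///     let _object = obkv_to_object(&document, &fields_index)?;
/// }
/// ```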
pub struct DocumentsBatchReader<R> {
    cursor: grenad::ReaderCursor<R>,
    fields_index: DocumentsBatchIndex,
}

impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
    pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self {
        Self { cursor: cursor.cursor, fields_index }
    }

    /// Construct a `DocumentsBatchReader` from a reader.
    ///
    /// It first retrieves the index, then moves to the first document. Use the
    /// `into_cursor_and_fields_index` method to iterate over the documents, from the first to the last.
    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
    pub fn from_reader(reader: R) -> Result<Self, Error> {
        let reader = grenad::Reader::new(reader)?;
        let mut cursor = reader.into_cursor()?;

        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
            None => return Err(Error::InvalidDocumentFormat),
        };

        Ok(DocumentsBatchReader { cursor, fields_index })
    }

    pub fn documents_count(&self) -> u32 {
        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
    }

    pub fn is_empty(&self) -> bool {
        self.cursor.len().saturating_sub(1) == 0
    }

    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
        &self.fields_index
    }

    /// This method returns a forward cursor over the documents.
    pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
        let DocumentsBatchReader { cursor, fields_index } = self;
        let mut cursor = DocumentsBatchCursor { cursor };
        cursor.reset();
        (cursor, fields_index)
    }
}

/// A forward cursor over the documents in a `DocumentsBatchReader`.
pub struct DocumentsBatchCursor<R> {
    cursor: grenad::ReaderCursor<R>,
}

impl<R> DocumentsBatchCursor<R> {
    /// Resets the cursor to be able to read from the start again.
    pub fn reset(&mut self) {
        self.cursor.reset();
    }
}

impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
    /// Returns the next document, starting from the first one. Subsequent calls to
    /// `next_document` advance the document reader until all the documents have been read.
    pub fn next_document(
        &mut self,
    ) -> Result<Option<KvReader<'_, FieldId>>, DocumentsBatchCursorError> {
        match self.cursor.move_on_next()? {
            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
                Ok(Some(KvReader::new(value)))
            }
            _otherwise => Ok(None),
        }
    }
}

/// The possible errors thrown by the `DocumentsBatchCursor` when iterating on the documents.
#[derive(Debug)]
pub enum DocumentsBatchCursorError {
    Grenad(grenad::Error),
    SerdeJson(serde_json::Error),
}

impl From<grenad::Error> for DocumentsBatchCursorError {
    fn from(error: grenad::Error) -> DocumentsBatchCursorError {
        DocumentsBatchCursorError::Grenad(error)
    }
}

impl From<serde_json::Error> for DocumentsBatchCursorError {
    fn from(error: serde_json::Error) -> DocumentsBatchCursorError {
        DocumentsBatchCursorError::SerdeJson(error)
    }
}

impl error::Error for DocumentsBatchCursorError {}

impl fmt::Display for DocumentsBatchCursorError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            DocumentsBatchCursorError::Grenad(e) => e.fmt(f),
            DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f),
        }
    }
}
76
crates/milli/src/documents/serde_impl.rs
Normal file
@ -0,0 +1,76 @@
use std::fmt;
use std::io::Write;

use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};

use super::Error;
use crate::documents::DocumentsBatchBuilder;
use crate::Object;

macro_rules! tri {
    ($e:expr) => {
        match $e {
            Ok(r) => r,
            Err(e) => return Ok(Err(e.into())),
        }
    };
}
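
// Illustrative note: `tri!` plays the role of `?` here, except that the error is
// surfaced through the visitor's `Ok(Err(..))` value instead of serde's own error type.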

pub struct DocumentVisitor<'a, W> {
    inner: &'a mut DocumentsBatchBuilder<W>,
    object: Object,
}

impl<'a, W> DocumentVisitor<'a, W> {
    pub fn new(inner: &'a mut DocumentsBatchBuilder<W>) -> Self {
        DocumentVisitor { inner, object: Object::new() }
    }
}

impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
    /// This Visitor's value is nothing, since it writes the values to a file.
    type Value = Result<(), Error>;

    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
    where
        A: SeqAccess<'de>,
    {
        while let Some(v) = seq.next_element_seed(&mut *self)? {
            tri!(v)
        }

        Ok(Ok(()))
    }

    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
    where
        A: MapAccess<'de>,
    {
        self.object.clear();
        while let Some((key, value)) = map.next_entry()? {
            self.object.insert(key, value);
        }

        tri!(self.inner.append_json_object(&self.object));

        Ok(Ok(()))
    }

    fn expecting(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "a document, or a sequence of documents.")
    }
}

impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
where
    W: Write,
{
    type Value = Result<(), Error>;

    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_map(self)
    }
}
504
crates/milli/src/error.rs
Normal file
@ -0,0 +1,504 @@
use std::collections::BTreeSet;
use std::convert::Infallible;
use std::fmt::Write;
use std::{io, str};

use heed::{Error as HeedError, MdbError};
use rayon::ThreadPoolBuildError;
use rhai::EvalAltResult;
use serde_json::Value;
use thiserror::Error;

use crate::documents::{self, DocumentsBatchCursorError};
use crate::thread_pool_no_abort::PanicCatched;
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};

pub fn is_reserved_keyword(keyword: &str) -> bool {
    ["_geo", "_geoDistance", "_geoPoint", "_geoRadius", "_geoBoundingBox"].contains(&keyword)
}

#[derive(Error, Debug)]
pub enum Error {
    #[error("internal: {0}.")]
    InternalError(#[from] InternalError),
    #[error(transparent)]
    IoError(#[from] io::Error),
    #[error(transparent)]
    UserError(#[from] UserError),
}

#[derive(Error, Debug)]
pub enum InternalError {
    #[error("{}", HeedError::DatabaseClosing)]
    DatabaseClosing,
    #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
    DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
    #[error("Missing {key} in the fieldids weights mapping.")]
    FieldidsWeightsMapMissingEntry { key: FieldId },
    #[error(transparent)]
    FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry),
    #[error("Missing {key} in the field id mapping.")]
    FieldIdMappingMissingEntry { key: FieldId },
    #[error(transparent)]
    Fst(#[from] fst::Error),
    #[error(transparent)]
    DocumentsError(#[from] documents::Error),
    #[error("Invalid compression type has been specified to grenad")]
    GrenadInvalidCompressionType,
    #[error("Invalid grenad file with an invalid version format")]
    GrenadInvalidFormatVersion,
    #[error("Invalid merge while processing {process}")]
    IndexingMergingKeys { process: &'static str },
    #[error(transparent)]
    RayonThreadPool(#[from] ThreadPoolBuildError),
    #[error(transparent)]
    PanicInThreadPool(#[from] PanicCatched),
    #[error(transparent)]
    SerdeJson(#[from] serde_json::Error),
    #[error(transparent)]
    Serialization(#[from] SerializationError),
    #[error(transparent)]
    Store(#[from] MdbError),
    #[error(transparent)]
    Utf8(#[from] str::Utf8Error),
    #[error("An indexation process was explicitly aborted")]
    AbortedIndexation,
    #[error("The matching words list contains at least one invalid member")]
    InvalidMatchingWords,
    #[error(transparent)]
    ArroyError(#[from] arroy::Error),
    #[error(transparent)]
    VectorEmbeddingError(#[from] crate::vector::Error),
}

#[derive(Error, Debug)]
pub enum SerializationError {
    #[error("{}", match .db_name {
        Some(name) => format!("decoding from the {name} database failed"),
        None => "decoding failed".to_string(),
    })]
    Decoding { db_name: Option<&'static str> },
    #[error("{}", match .db_name {
        Some(name) => format!("encoding into the {name} database failed"),
        None => "encoding failed".to_string(),
    })]
    Encoding { db_name: Option<&'static str> },
    #[error("number is not a valid finite number")]
    InvalidNumberSerialization,
}

#[derive(Error, Debug)]
pub enum FieldIdMapMissingEntry {
    #[error("unknown field id {field_id} coming from the {process} process")]
    FieldId { field_id: FieldId, process: &'static str },
    #[error("unknown field name {field_name} coming from the {process} process")]
    FieldName { field_name: String, process: &'static str },
}

#[derive(Error, Debug)]
pub enum UserError {
    #[error("A document cannot contain more than 65,535 fields.")]
    AttributeLimitReached,
    #[error(transparent)]
    CriterionError(#[from] CriterionError),
    #[error("Maximum number of documents reached.")]
    DocumentLimitReached,
    #[error(
        "Document identifier `{}` is invalid. \
         A document identifier can be of type integer or string, \
         only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \
         and can not be more than 512 bytes.", .document_id.to_string()
    )]
    InvalidDocumentId { document_id: Value },
    #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
    InvalidFacetsDistribution {
        invalid_facets_name: BTreeSet<String>,
        valid_facets_name: BTreeSet<String>,
    },
    #[error(transparent)]
    InvalidGeoField(#[from] GeoError),
    #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
    InvalidVectorDimensions { expected: usize, found: usize },
    #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
    InvalidVectorsMapType { document_id: String, value: Value },
    #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
    InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
    #[error("{0}")]
    InvalidFilter(String),
    #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
    InvalidFilterExpression(&'static [&'static str], Value),
    #[error("Attribute `{}` is not sortable. {}",
        .field,
        match .valid_fields.is_empty() {
            true => "This index does not have configured sortable attributes.".to_string(),
            false => format!("Available sortable attributes are: `{}{}`.",
                valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
                .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
            ),
        }
    )]
    InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
    #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}",
        .field,
        match .valid_fields.is_empty() {
            true => "This index does not have configured filterable attributes.".to_string(),
            false => format!("Available filterable attributes are: `{}{}`.",
                valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
                .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
            ),
        }
    )]
    InvalidDistinctAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
    #[error("Attribute `{}` is not facet-searchable. {}",
        .field,
        match .valid_fields.is_empty() {
            true => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(),
            false => format!("Available facet-searchable attributes are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.",
                valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
                .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
            ),
        }
    )]
    InvalidFacetSearchFacetName {
        field: String,
        valid_fields: BTreeSet<String>,
        hidden_fields: bool,
    },
    #[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.",
        .field,
        .valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
        .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
    )]
    InvalidSearchableAttribute {
        field: String,
        valid_fields: BTreeSet<String>,
        hidden_fields: bool,
    },
    #[error("an environment is already opened with different options")]
    InvalidLmdbOpenOptions,
    #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
    SortRankingRuleMissing,
    #[error("The database file is in an invalid state.")]
    InvalidStoreFile,
    #[error("Maximum database size has been reached.")]
    MaxDatabaseSizeReached,
    #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
    MissingDocumentId { primary_key: String, document: Object },
    #[error("Document has too many matching `{}` attributes: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
    TooManyDocumentIds { primary_key: String, document: Object },
    #[error("The primary key inference failed as the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
    NoPrimaryKeyCandidateFound,
    #[error("The primary key inference failed as the engine found {} fields ending with `id` in their names: '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.first().unwrap(), .candidates.get(1).unwrap())]
    MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
    #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
    NoSpaceLeftOnDevice,
    #[error("Index already has a primary key: `{0}`.")]
    PrimaryKeyCannotBeChanged(String),
    #[error(transparent)]
    SerdeJson(serde_json::Error),
    #[error(transparent)]
    SortError(#[from] SortError),
    #[error("An unknown internal document id has been used: `{document_id}`.")]
    UnknownInternalDocumentId { document_id: DocumentId },
    #[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater than or equal to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")]
    InvalidMinTypoWordLenSetting(u8, u8),
    #[error(transparent)]
    VectorEmbeddingError(#[from] crate::vector::Error),
    #[error(transparent)]
    MissingDocumentField(#[from] crate::prompt::error::RenderPromptError),
    #[error(transparent)]
    InvalidPrompt(#[from] crate::prompt::error::NewPromptError),
    #[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")]
    InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
    #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")]
    TooManyEmbedders(usize),
    #[error("Cannot find embedder with name `{0}`.")]
    InvalidEmbedder(String),
    #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")]
    TooManyVectors(String, usize),
    #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}",
        allowed_sources_for_field
            .iter()
            .map(|accepted| format!("`{}`", accepted))
            .collect::<Vec<String>>()
            .join(", "),
        allowed_fields_for_source
            .iter()
            .map(|accepted| format!("`{}`", accepted))
            .collect::<Vec<String>>()
            .join(", ")
    )]
    InvalidFieldForSource {
        embedder_name: String,
        source_: crate::vector::settings::EmbedderSource,
        field: &'static str,
        allowed_fields_for_source: &'static [&'static str],
        allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource],
    },
    #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())]
    InvalidOpenAiModel { embedder_name: String, model: String },
    #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")]
    MissingFieldForSource {
        field: &'static str,
        source_: crate::vector::settings::EmbedderSource,
        embedder_name: String,
    },
    #[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its native dimensions of {expected_dimensions}. Found {dimensions}")]
    InvalidOpenAiModelDimensions {
        embedder_name: String,
        model: &'static str,
        dimensions: usize,
        expected_dimensions: usize,
    },
    #[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its dimensions to a value higher than {max_dimensions}. Found {dimensions}")]
    InvalidOpenAiModelDimensionsMax {
        embedder_name: String,
        model: &'static str,
        dimensions: usize,
        max_dimensions: usize,
    },
    #[error("`.embedders.{embedder_name}.dimensions`: `dimensions` cannot be zero")]
    InvalidSettingsDimensions { embedder_name: String },
    #[error(
        "`.embedders.{embedder_name}.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors."
    )]
    InvalidDisableBinaryQuantization { embedder_name: String },
    #[error("`.embedders.{embedder_name}.documentTemplateMaxBytes`: `documentTemplateMaxBytes` cannot be zero")]
    InvalidSettingsDocumentTemplateMaxBytes { embedder_name: String },
    #[error("`.embedders.{embedder_name}.url`: could not parse `{url}`: {inner_error}")]
    InvalidUrl { embedder_name: String, inner_error: url::ParseError, url: String },
    #[error("Document editions cannot modify a document's primary key")]
    DocumentEditionCannotModifyPrimaryKey,
    #[error("Document editions must keep documents as objects")]
    DocumentEditionDocumentMustBeObject,
    #[error("Document edition runtime error encountered while running the function: {0}")]
    DocumentEditionRuntimeError(Box<EvalAltResult>),
    #[error("Document edition runtime error encountered while compiling the function: {0}")]
    DocumentEditionCompilationError(rhai::ParseError),
    #[error("{0}")]
    DocumentEmbeddingError(String),
}

impl From<crate::vector::Error> for Error {
    fn from(value: crate::vector::Error) -> Self {
        match value.fault() {
            FaultSource::User => Error::UserError(value.into()),
            FaultSource::Runtime => Error::UserError(value.into()),
            FaultSource::Bug => Error::InternalError(value.into()),
            FaultSource::Undecided => Error::UserError(value.into()),
        }
    }
}

impl From<arroy::Error> for Error {
    fn from(value: arroy::Error) -> Self {
        match value {
            arroy::Error::Heed(heed) => heed.into(),
            arroy::Error::Io(io) => io.into(),
            arroy::Error::InvalidVecDimension { expected, received } => {
                Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
            }
            arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
            arroy::Error::DatabaseFull
            | arroy::Error::InvalidItemAppend
            | arroy::Error::UnmatchingDistance { .. }
            | arroy::Error::NeedBuild(_)
            | arroy::Error::MissingKey { .. }
            | arroy::Error::MissingMetadata(_) => {
                Error::InternalError(InternalError::ArroyError(value))
            }
        }
    }
}

#[derive(Error, Debug)]
pub enum GeoError {
    #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
    NotAnObject { document_id: Value, value: Value },
    #[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")]
    UnexpectedExtraFields { document_id: Value, value: Value },
    #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
    MissingLatitudeAndLongitude { document_id: Value },
    #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
    MissingLatitude { document_id: Value },
    #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")]
    MissingLongitude { document_id: Value },
    #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")]
    BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value },
    #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")]
    BadLatitude { document_id: Value, value: Value },
    #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")]
    BadLongitude { document_id: Value, value: Value },
}

fn format_invalid_filter_distribution(
    invalid_facets_name: &BTreeSet<String>,
    valid_facets_name: &BTreeSet<String>,
) -> String {
    if valid_facets_name.is_empty() {
        return "this index does not have configured filterable attributes.".into();
    }

    let mut result = String::new();

    match invalid_facets_name.len() {
        0 => (),
        1 => write!(
            result,
            "attribute `{}` is not filterable.",
            invalid_facets_name.first().unwrap()
        )
        .unwrap(),
        _ => write!(
            result,
            "attributes `{}` are not filterable.",
            invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
        )
        .unwrap(),
    };

    match valid_facets_name.len() {
        1 => write!(
            result,
            " The available filterable attribute is `{}`.",
            valid_facets_name.first().unwrap()
        )
        .unwrap(),
        _ => write!(
            result,
            " The available filterable attributes are `{}`.",
            valid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
        )
        .unwrap(),
    }

    result
}

/// A little macro helper to autogenerate From implementations that need two `Into`.
/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)`
/// the macro will create the following code:
/// ```ignore
/// impl From<FieldIdMapMissingEntry> for Error {
///     fn from(error: FieldIdMapMissingEntry) -> Error {
///         Error::from(InternalError::from(error))
///     }
/// }
/// ```
macro_rules! error_from_sub_error {
    () => {};
    ($sub:ty => $intermediate:ty) => {
        impl From<$sub> for Error {
            fn from(error: $sub) -> Error {
                Error::from(<$intermediate>::from(error))
            }
        }
    };
    ($($sub:ty => $intermediate:ty $(,)?),+) => {
        $(error_from_sub_error!($sub => $intermediate);)+
    };
}

error_from_sub_error! {
    FieldIdMapMissingEntry => InternalError,
    fst::Error => InternalError,
    documents::Error => InternalError,
    str::Utf8Error => InternalError,
    ThreadPoolBuildError => InternalError,
    SerializationError => InternalError,
    GeoError => UserError,
    CriterionError => UserError,
}

impl<E> From<grenad::Error<E>> for Error
where
    Error: From<E>,
{
    fn from(error: grenad::Error<E>) -> Error {
        match error {
            grenad::Error::Io(error) => Error::IoError(error),
            grenad::Error::Merge(error) => Error::from(error),
            grenad::Error::InvalidCompressionType => {
                Error::InternalError(InternalError::GrenadInvalidCompressionType)
            }
            grenad::Error::InvalidFormatVersion => {
                Error::InternalError(InternalError::GrenadInvalidFormatVersion)
            }
        }
    }
}

impl From<DocumentsBatchCursorError> for Error {
    fn from(error: DocumentsBatchCursorError) -> Error {
        match error {
            DocumentsBatchCursorError::Grenad(e) => Error::from(e),
            DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)),
        }
    }
}

impl From<Infallible> for Error {
    fn from(_error: Infallible) -> Error {
        unreachable!()
    }
}

impl From<HeedError> for Error {
    fn from(error: HeedError) -> Error {
        use self::Error::*;
        use self::InternalError::*;
        use self::SerializationError::*;
        use self::UserError::*;

        match error {
            HeedError::Io(error) => Error::from(error),
            HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached),
            HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile),
            HeedError::Mdb(error) => InternalError(Store(error)),
            // TODO use the encoding
            HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
            HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
            HeedError::DatabaseClosing => InternalError(DatabaseClosing),
            HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
        }
    }
}

#[derive(Debug, Clone, Copy)]
pub enum FaultSource {
    User,
    Runtime,
    Bug,
    Undecided,
}

impl std::fmt::Display for FaultSource {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let s = match self {
            FaultSource::User => "user error",
            FaultSource::Runtime => "runtime error",
            FaultSource::Bug => "coding error",
            FaultSource::Undecided => "error",
        };
        f.write_str(s)
    }
}

#[test]
fn conditionally_lookup_for_error_message() {
    let prefix = "Attribute `name` is not sortable.";
    let messages = vec![
        (BTreeSet::new(), "This index does not have configured sortable attributes."),
        (BTreeSet::from(["age".to_string()]), "Available sortable attributes are: `age`."),
    ];

    for (list, suffix) in messages {
        let err = UserError::InvalidSortableAttribute {
            field: "name".to_string(),
            valid_fields: list,
            hidden_fields: false,
        };

        assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
    }
}
83
crates/milli/src/external_documents_ids.rs
Normal file
@ -0,0 +1,83 @@
use std::collections::HashMap;

use heed::types::Str;
use heed::{Database, RoIter, RoTxn, RwTxn};

use crate::{DocumentId, BEU32};

pub enum DocumentOperationKind {
    Create,
    Delete,
}

pub struct DocumentOperation {
    pub external_id: String,
    pub internal_id: DocumentId,
    pub kind: DocumentOperationKind,
}

pub struct ExternalDocumentsIds(Database<Str, BEU32>);

impl ExternalDocumentsIds {
    pub fn new(db: Database<Str, BEU32>) -> ExternalDocumentsIds {
        ExternalDocumentsIds(db)
    }

    /// Returns `true` if the external documents ids map is empty.
    pub fn is_empty(&self, rtxn: &RoTxn<'_>) -> heed::Result<bool> {
        self.0.is_empty(rtxn).map_err(Into::into)
    }

    pub fn get<A: AsRef<str>>(
        &self,
        rtxn: &RoTxn<'_>,
        external_id: A,
    ) -> heed::Result<Option<u32>> {
        self.0.get(rtxn, external_id.as_ref())
    }

    /// A helper function to debug this type; it returns a `HashMap` of the
    /// external to internal document ids mapping.
    pub fn to_hash_map(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashMap<String, u32>> {
        let mut map = HashMap::default();
        for result in self.0.iter(rtxn)? {
            let (external, internal) = result?;
            map.insert(external.to_owned(), internal);
        }
        Ok(map)
    }

    /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
    ///
    /// If the list contains multiple operations on the same external id, then the result is unspecified.
    ///
    /// # Panics
    ///
    /// - If attempting to delete a document that doesn't exist
    /// - If attempting to create a document that already exists
    pub fn apply(
        &self,
        wtxn: &mut RwTxn<'_>,
        operations: Vec<DocumentOperation>,
    ) -> heed::Result<()> {
        for DocumentOperation { external_id, internal_id, kind } in operations {
            match kind {
                DocumentOperationKind::Create => {
                    self.0.put(wtxn, &external_id, &internal_id)?;
                }
                DocumentOperationKind::Delete => {
                    if !self.0.delete(wtxn, &external_id)? {
                        panic!("Attempting to delete a non-existing document")
                    }
                }
            }
        }

        Ok(())
    }
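
    // Illustrative usage sketch (assumes an open write txn and a fresh external id):
    //
    //     ids.apply(wtxn, vec![DocumentOperation {
    //         external_id: "42".to_string(),
    //         internal_id: 0,
    //         kind: DocumentOperationKind::Create,
    //     }])?;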

    /// Returns an iterator over all the external ids.
    pub fn iter<'t>(&self, rtxn: &'t RoTxn<'_>) -> heed::Result<RoIter<'t, Str, BEU32>> {
        self.0.iter(rtxn)
    }
}
45
crates/milli/src/facet/facet_type.rs
Normal file
@ -0,0 +1,45 @@
use std::error::Error;
use std::fmt;
use std::str::FromStr;

use serde::{Deserialize, Serialize};

#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FacetType {
    String,
    Number,
}

impl fmt::Display for FacetType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            FacetType::String => f.write_str("string"),
            FacetType::Number => f.write_str("number"),
        }
    }
}

impl FromStr for FacetType {
    type Err = InvalidFacetType;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.trim().eq_ignore_ascii_case("string") {
            Ok(FacetType::String)
        } else if s.trim().eq_ignore_ascii_case("number") {
            Ok(FacetType::Number)
        } else {
            Err(InvalidFacetType)
        }
    }
}
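
// Illustrative: parsing is whitespace- and case-insensitive, so
// `"  Number ".parse::<FacetType>()` is `Ok(FacetType::Number)` while
// `"bool".parse::<FacetType>()` is `Err(InvalidFacetType)`.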

#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct InvalidFacetType;

impl fmt::Display for InvalidFacetType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(r#"Invalid facet type, must be "string" or "number""#)
    }
}

impl Error for InvalidFacetType {}
56
crates/milli/src/facet/facet_value.rs
Normal file
@ -0,0 +1,56 @@
use ordered_float::OrderedFloat;
use serde::{Serialize, Serializer};

#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub enum FacetValue {
    String(String),
    Number(OrderedFloat<f64>),
}

impl From<String> for FacetValue {
    fn from(string: String) -> FacetValue {
        FacetValue::String(string)
    }
}

impl From<&str> for FacetValue {
    fn from(string: &str) -> FacetValue {
        FacetValue::String(string.to_owned())
    }
}

impl From<f64> for FacetValue {
    fn from(float: f64) -> FacetValue {
        FacetValue::Number(OrderedFloat(float))
    }
}

impl From<OrderedFloat<f64>> for FacetValue {
    fn from(float: OrderedFloat<f64>) -> FacetValue {
        FacetValue::Number(float)
    }
}

impl From<i64> for FacetValue {
    fn from(integer: i64) -> FacetValue {
        FacetValue::Number(OrderedFloat(integer as f64))
    }
}

/// We implement Serialize ourselves because we need to always serialize it as a string,
/// since JSON object keys must be strings, not numbers.
// TODO remove this impl and convert them into string, by hand, when required.
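// Illustrative: `FacetValue::from(1.5)` serializes to the JSON string "1.5"
// rather than the number 1.5.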
impl Serialize for FacetValue {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        match self {
            FacetValue::String(string) => serializer.serialize_str(string),
            FacetValue::Number(number) => {
                let string = number.to_string();
                serializer.serialize_str(&string)
            }
        }
    }
}
6
crates/milli/src/facet/mod.rs
Normal file
@ -0,0 +1,6 @@
mod facet_type;
mod facet_value;
pub mod value_encoding;

pub use self::facet_type::FacetType;
pub use self::facet_value::FacetValue;
49
crates/milli/src/facet/value_encoding.rs
Normal file
@ -0,0 +1,49 @@
// https://stackoverflow.com/a/43305015/1941280
#[inline]
pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> {
    if float.is_finite() {
        if float == 0.0 || float == -0.0 {
            return Some(xor_first_bit(0.0_f64.to_be_bytes()));
        } else if float.is_sign_negative() {
            return Some(xor_all_bits(float.to_be_bytes()));
        } else if float.is_sign_positive() {
            return Some(xor_first_bit(float.to_be_bytes()));
        }
    }
    None
}
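
// Illustrative: after this transform, the big-endian byte arrays of
// -13.0 < -10.0 < -0.0 < 1.0 < 43.0 compare in the same order as the floats
// themselves, because negative floats get all of their bits flipped while
// positive ones only get their sign bit flipped (see the test below).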

#[inline]
fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] {
    x[0] ^= 0x80;
    x
}

#[inline]
fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
    x.iter_mut().for_each(|b| *b ^= 0xff);
    x
}

#[cfg(test)]
mod tests {
    use std::cmp::Ordering::Less;

    use super::*;

    fn is_sorted<T: Ord>(x: &[T]) -> bool {
        x.windows(2).map(|x| x[0].cmp(&x[1])).all(|o| o == Less)
    }

    #[test]
    fn ordered_f64_bytes() {
        let a = -13_f64;
        let b = -10.0;
        let c = -0.0;
        let d = 1.0;
        let e = 43.0;

        let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }
}
55
crates/milli/src/fieldids_weights_map.rs
Normal file
@ -0,0 +1,55 @@
//! The fieldids weights map is in charge of linking the searchable fields with their weights.

use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::{FieldId, FieldsIdsMap, Weight};

#[derive(Debug, Default, Serialize, Deserialize)]
pub struct FieldidsWeightsMap {
    map: HashMap<FieldId, Weight>,
}

impl FieldidsWeightsMap {
    /// Insert a field id -> weight into the map.
    /// If the map did not have this key present, `None` is returned.
    /// If the map did have this key present, the value is updated, and the old value is returned.
    pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
        self.map.insert(fid, weight)
    }

    /// Create the map from the fields ids maps.
    /// Should only be called in the case there are NO searchable attributes.
    /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
    pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
        FieldidsWeightsMap {
            map: fid_map
                .iter()
                .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
                .map(|(fid, _name)| (fid, 0))
                .collect(),
        }
    }
|
||||
|
||||
/// Removes a field id from the map, returning the associated weight previously in the map.
|
||||
pub fn remove(&mut self, fid: FieldId) -> Option<Weight> {
|
||||
self.map.remove(&fid)
|
||||
}
|
||||
|
||||
/// Returns weight corresponding to the key.
|
||||
pub fn weight(&self, fid: FieldId) -> Option<Weight> {
|
||||
self.map.get(&fid).copied()
|
||||
}
|
||||
|
||||
/// Returns highest weight contained in the map if any.
|
||||
pub fn max_weight(&self) -> Option<Weight> {
|
||||
self.map.values().copied().max()
|
||||
}
|
||||
|
||||
/// Return an iterator visiting all field ids in arbitrary order.
|
||||
pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ {
|
||||
self.map.keys().copied()
|
||||
}
|
||||
}
|
167
crates/milli/src/fields_ids_map.rs
Normal file

@@ -0,0 +1,167 @@
use std::collections::BTreeMap;

use serde::{Deserialize, Serialize};

use crate::FieldId;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldsIdsMap {
    names_ids: BTreeMap<String, FieldId>,
    ids_names: BTreeMap<FieldId, String>,
    next_id: Option<FieldId>,
}

impl FieldsIdsMap {
    pub fn new() -> FieldsIdsMap {
        FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) }
    }

    /// Returns the number of fields ids in the map.
    pub fn len(&self) -> usize {
        self.names_ids.len()
    }

    /// Returns `true` if the map is empty.
    pub fn is_empty(&self) -> bool {
        self.names_ids.is_empty()
    }

    /// Returns the field id related to a field name; it will create a new field id if the
    /// name is not already known. Returns `None` if the maximum field id has been reached.
    pub fn insert(&mut self, name: &str) -> Option<FieldId> {
        match self.names_ids.get(name) {
            Some(id) => Some(*id),
            None => {
                let id = self.next_id?;
                self.next_id = id.checked_add(1);
                self.names_ids.insert(name.to_owned(), id);
                self.ids_names.insert(id, name.to_owned());
                Some(id)
            }
        }
    }

    /// Get the ids of a field and all its nested fields based on its name.
    pub fn nested_ids(&self, name: &str) -> Vec<FieldId> {
        self.names_ids
            .range(name.to_string()..)
            .take_while(|(key, _)| key.starts_with(name))
            .filter(|(key, _)| crate::is_faceted_by(key, name))
            .map(|(_name, id)| *id)
            .collect()
    }

    /// Get the id of a field based on its name.
    pub fn id(&self, name: &str) -> Option<FieldId> {
        self.names_ids.get(name).copied()
    }

    /// Get the name of a field based on its id.
    pub fn name(&self, id: FieldId) -> Option<&str> {
        self.ids_names.get(&id).map(String::as_str)
    }

    /// Remove a field name and id based on its name.
    pub fn remove(&mut self, name: &str) -> Option<FieldId> {
        match self.names_ids.remove(name) {
            Some(id) => self.ids_names.remove_entry(&id).map(|(id, _)| id),
            None => None,
        }
    }

    /// Iterate over the ids and names in the ids order.
    pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> {
        self.ids_names.iter().map(|(id, name)| (*id, name.as_str()))
    }

    /// Iterate over the ids in the order of the ids.
    pub fn ids(&'_ self) -> impl Iterator<Item = FieldId> + '_ {
        self.ids_names.keys().copied()
    }

    /// Iterate over the names in the order of the ids.
    pub fn names(&self) -> impl Iterator<Item = &str> {
        self.ids_names.values().map(AsRef::as_ref)
    }
}

impl Default for FieldsIdsMap {
    fn default() -> FieldsIdsMap {
        FieldsIdsMap::new()
    }
}

impl crate::documents::FieldIdMapper for FieldsIdsMap {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.id(name)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fields_ids_map() {
        let mut map = FieldsIdsMap::new();

        assert_eq!(map.insert("id"), Some(0));
        assert_eq!(map.insert("title"), Some(1));
        assert_eq!(map.insert("description"), Some(2));
        assert_eq!(map.insert("id"), Some(0));
        assert_eq!(map.insert("title"), Some(1));
        assert_eq!(map.insert("description"), Some(2));

        assert_eq!(map.id("id"), Some(0));
        assert_eq!(map.id("title"), Some(1));
        assert_eq!(map.id("description"), Some(2));
        assert_eq!(map.id("date"), None);

        assert_eq!(map.len(), 3);

        assert_eq!(map.name(0), Some("id"));
        assert_eq!(map.name(1), Some("title"));
        assert_eq!(map.name(2), Some("description"));
        assert_eq!(map.name(4), None);

        assert_eq!(map.remove("title"), Some(1));

        assert_eq!(map.id("title"), None);
        assert_eq!(map.insert("title"), Some(3));
        assert_eq!(map.len(), 3);

        let mut iter = map.iter();
        assert_eq!(iter.next(), Some((0, "id")));
        assert_eq!(iter.next(), Some((2, "description")));
        assert_eq!(iter.next(), Some((3, "title")));
        assert_eq!(iter.next(), None);
    }

    #[test]
    fn nested_fields() {
        let mut map = FieldsIdsMap::new();

        assert_eq!(map.insert("id"), Some(0));
        assert_eq!(map.insert("doggo"), Some(1));
        assert_eq!(map.insert("doggo.name"), Some(2));
        assert_eq!(map.insert("doggolution"), Some(3));
        assert_eq!(map.insert("doggo.breed.name"), Some(4));
        assert_eq!(map.insert("description"), Some(5));

        insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###"
        [
            1,
            4,
            2,
        ]
        "###);

        insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###"
        [
            4,
        ]
        "###);

        insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]");
    }
}
29
crates/milli/src/heed_codec/beu16_str_codec.rs
Normal file

@@ -0,0 +1,29 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;

use heed::BoxedError;

pub struct BEU16StrCodec;

impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
    type DItem = (u16, &'a str);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (n_bytes, str_bytes) = bytes.split_at(2);
        let n = n_bytes.try_into().map(u16::from_be_bytes)?;
        let s = str::from_utf8(str_bytes)?;
        Ok((n, s))
    }
}

impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
    type EItem = (u16, &'a str);

    fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(s.len() + 2);
        bytes.extend_from_slice(&n.to_be_bytes());
        bytes.extend_from_slice(s.as_bytes());
        Ok(Cow::Owned(bytes))
    }
}
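The big-endian prefix is what keeps LMDB iteration ordered by the integer first and the string second, since LMDB compares raw key bytes. A minimal round-trip sketch (not part of the diff, assuming `BEU16StrCodec` is in scope):

```rust
use heed::{BytesDecode, BytesEncode};

fn main() {
    let a = BEU16StrCodec::bytes_encode(&(1u16, "zebra")).unwrap();
    let b = BEU16StrCodec::bytes_encode(&(2u16, "apple")).unwrap();
    assert!(a < b); // the integer dominates the raw byte comparison

    let (n, s) = BEU16StrCodec::bytes_decode(&a).unwrap();
    assert_eq!((n, s), (1, "zebra"));
}
```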
29
crates/milli/src/heed_codec/beu32_str_codec.rs
Normal file

@@ -0,0 +1,29 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;

use heed::BoxedError;

pub struct BEU32StrCodec;

impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
    type DItem = (u32, &'a str);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (n_bytes, str_bytes) = bytes.split_at(4);
        let n = n_bytes.try_into().map(u32::from_be_bytes)?;
        let s = str::from_utf8(str_bytes)?;
        Ok((n, s))
    }
}

impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
    type EItem = (u32, &'a str);

    fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(s.len() + 4);
        bytes.extend_from_slice(&n.to_be_bytes());
        bytes.extend_from_slice(s.as_bytes());
        Ok(Cow::Owned(bytes))
    }
}
23
crates/milli/src/heed_codec/byte_slice_ref.rs
Normal file

@@ -0,0 +1,23 @@
use std::borrow::Cow;

use heed::{BoxedError, BytesDecode, BytesEncode};

/// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated
/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
pub struct BytesRefCodec;

impl<'a> BytesEncode<'a> for BytesRefCodec {
    type EItem = &'a [u8];

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        Ok(Cow::Borrowed(item))
    }
}

impl<'a> BytesDecode<'a> for BytesRefCodec {
    type DItem = &'a [u8];

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        Ok(bytes)
    }
}
47
crates/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs
Normal file

@@ -0,0 +1,47 @@
use std::borrow::Cow;
use std::marker::PhantomData;

use heed::{BoxedError, BytesDecode, BytesEncode};

use crate::heed_codec::SliceTooShortError;
use crate::{try_split_array_at, DocumentId, FieldId};

pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);

impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec<C>
where
    C: BytesDecode<'a>,
{
    type DItem = (FieldId, DocumentId, C::DItem);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
        let field_id = u16::from_be_bytes(field_id_bytes);

        let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
        let document_id = u32::from_be_bytes(document_id_bytes);

        let value = C::bytes_decode(bytes)?;

        Ok((field_id, document_id, value))
    }
}

impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec<C>
where
    C: BytesEncode<'a>,
{
    type EItem = (FieldId, DocumentId, C::EItem);

    fn bytes_encode(
        (field_id, document_id, value): &'a Self::EItem,
    ) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(32);
        bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes
        bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes
        let value_bytes = C::bytes_encode(value)?;
        // variable length: 16 bytes for an f64, potentially large for a string
        bytes.extend_from_slice(&value_bytes);
        Ok(Cow::Owned(bytes))
    }
}
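A minimal sketch (not part of the diff) of the key layout this codec produces: 2 big-endian bytes of field id, 4 big-endian bytes of document id, then the inner codec's value bytes. It assumes the `OrderedF64Codec` defined later in this diff:

```rust
use heed::BytesEncode;

fn main() {
    let key = (42u16, 7u32, 1.5f64);
    let bytes = FieldDocIdFacetCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
    assert_eq!(&bytes[..2], &42u16.to_be_bytes()); // field id
    assert_eq!(&bytes[2..6], &7u32.to_be_bytes()); // document id
    assert_eq!(bytes.len(), 2 + 4 + 16); // OrderedF64Codec values take 16 bytes
}
```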
123
crates/milli/src/heed_codec/facet/mod.rs
Normal file

@@ -0,0 +1,123 @@
mod field_doc_id_facet_codec;
mod ordered_f64_codec;

use std::borrow::Cow;
use std::convert::TryFrom;
use std::marker::PhantomData;

use heed::types::DecodeIgnore;
use heed::{BoxedError, BytesDecode, BytesEncode};
use roaring::RoaringBitmap;

pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
pub use self::ordered_f64_codec::OrderedF64Codec;
use super::StrRefCodec;
use crate::{CboRoaringBitmapCodec, BEU16};

pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;

pub type FieldIdCodec = BEU16;

/// Tries to split a slice in two at the given middle point,
/// returning `None` if the slice is too short.
pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    if slice.len() >= mid {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
/// databases.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord
pub struct FacetGroupKey<T> {
    pub field_id: u16,
    pub level: u8,
    pub left_bound: T,
}

/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
/// databases.
#[derive(Debug)]
pub struct FacetGroupValue {
    pub size: u8,
    pub bitmap: RoaringBitmap,
}

#[derive(Debug)]
pub struct FacetGroupLazyValue<'b> {
    pub size: u8,
    pub bitmap_bytes: &'b [u8],
}

pub struct FacetGroupKeyCodec<T> {
    _phantom: PhantomData<T>,
}

impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec<T>
where
    T: BytesEncode<'a>,
    T::EItem: Sized,
{
    type EItem = FacetGroupKey<T::EItem>;

    fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut v = vec![];
        v.extend_from_slice(&value.field_id.to_be_bytes());
        v.extend_from_slice(&[value.level]);

        let bound = T::bytes_encode(&value.left_bound)?;
        v.extend_from_slice(&bound);

        Ok(Cow::Owned(v))
    }
}

impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
where
    T: BytesDecode<'a>,
{
    type DItem = FacetGroupKey<T::DItem>;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?);
        let level = bytes[2];
        let bound = T::bytes_decode(&bytes[3..])?;
        Ok(FacetGroupKey { field_id: fid, level, left_bound: bound })
    }
}

pub struct FacetGroupValueCodec;

impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
    type EItem = FacetGroupValue;

    fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut v = vec![value.size];
        CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
        Ok(Cow::Owned(v))
    }
}

impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
    type DItem = FacetGroupValue;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let size = bytes[0];
        let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
        Ok(FacetGroupValue { size, bitmap })
    }
}

pub struct FacetGroupLazyValueCodec;

impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec {
    type DItem = FacetGroupLazyValue<'a>;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] })
    }
}
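A minimal sketch (not part of the diff, assuming `FacetGroupKey`, `FacetGroupKeyCodec`, and `StrRefCodec` are in scope) of the key layout: 2 big-endian bytes of field id, 1 byte of level, then the left bound. Raw byte order therefore groups keys by field, then level, then bound, which is what the facet level databases rely on:

```rust
use heed::BytesEncode;

fn main() {
    let key = FacetGroupKey { field_id: 3u16, level: 1u8, left_bound: "blue" };
    let bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
    assert_eq!(&bytes[..2], &3u16.to_be_bytes()); // field id
    assert_eq!(bytes[2], 1);                      // level
    assert_eq!(&bytes[3..], b"blue");             // left bound
}
```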
45
crates/milli/src/heed_codec/facet/ordered_f64_codec.rs
Normal file

@@ -0,0 +1,45 @@
use std::borrow::Cow;
use std::convert::TryInto;

use heed::{BoxedError, BytesDecode};
use thiserror::Error;

use crate::facet::value_encoding::f64_into_bytes;
use crate::heed_codec::SliceTooShortError;

pub struct OrderedF64Codec;

impl<'a> BytesDecode<'a> for OrderedF64Codec {
    type DItem = f64;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        if bytes.len() < 16 {
            Err(SliceTooShortError.into())
        } else {
            bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into)
        }
    }
}

impl heed::BytesEncode<'_> for OrderedF64Codec {
    type EItem = f64;

    fn bytes_encode(f: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let mut buffer = [0u8; 16];

        // write the globally ordered float
        let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?;
        buffer[..8].copy_from_slice(&bytes[..]);
        // then the raw f64 value, just to be able to read it back
        let bytes = f.to_be_bytes();
        buffer[8..16].copy_from_slice(&bytes[..]);

        Ok(Cow::Owned(buffer.to_vec()))
    }
}

#[derive(Error, Debug)]
#[error("the float {float} cannot be converted to a globally ordered representation")]
pub struct InvalidGloballyOrderedFloatError {
    float: f64,
}
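A minimal sketch (not part of the diff, assuming `OrderedF64Codec` is in scope) of the 16-byte layout: the first 8 bytes are the memcmp-orderable transform from `f64_into_bytes`, and the last 8 bytes are the plain big-endian `f64` used only to read the value back:

```rust
use heed::{BytesDecode, BytesEncode};

fn main() {
    let bytes = OrderedF64Codec::bytes_encode(&1.5).unwrap();
    assert_eq!(bytes.len(), 16);
    assert_eq!(&bytes[8..], &1.5f64.to_be_bytes()); // the raw value tail
    assert_eq!(OrderedF64Codec::bytes_decode(&bytes).unwrap(), 1.5);
}
```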
30
crates/milli/src/heed_codec/field_id_word_count_codec.rs
Normal file

@@ -0,0 +1,30 @@
use std::borrow::Cow;

use heed::BoxedError;

use super::SliceTooShortError;
use crate::{try_split_array_at, FieldId};

pub struct FieldIdWordCountCodec;

impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
    type DItem = (FieldId, u8);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
        let field_id = u16::from_be_bytes(field_id_bytes);
        let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
        Ok((field_id, word_count))
    }
}

impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
    type EItem = (FieldId, u8);

    fn bytes_encode((field_id, word_count): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(2 + 1);
        bytes.extend_from_slice(&field_id.to_be_bytes());
        bytes.push(*word_count);
        Ok(Cow::Owned(bytes))
    }
}
23
crates/milli/src/heed_codec/fst_set_codec.rs
Normal file

@@ -0,0 +1,23 @@
use std::borrow::Cow;

use fst::Set;
use heed::{BoxedError, BytesDecode, BytesEncode};

/// A codec for values of type `Set<&[u8]>`.
pub struct FstSetCodec;

impl<'a> BytesEncode<'a> for FstSetCodec {
    type EItem = Set<Vec<u8>>;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        Ok(Cow::Borrowed(item.as_fst().as_bytes()))
    }
}

impl<'a> BytesDecode<'a> for FstSetCodec {
    type DItem = Set<&'a [u8]>;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        Set::new(bytes).map_err(Into::into)
    }
}
39
crates/milli/src/heed_codec/mod.rs
Normal file

@@ -0,0 +1,39 @@
mod beu16_str_codec;
mod beu32_str_codec;
mod byte_slice_ref;
pub mod facet;
mod field_id_word_count_codec;
mod fst_set_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_beu32_codec;
mod str_ref;
mod str_str_u8_codec;

pub use byte_slice_ref::BytesRefCodec;
use heed::BoxedError;
pub use str_ref::StrRefCodec;
use thiserror::Error;

pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{
    BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
};
pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};

pub trait BytesDecodeOwned {
    type DItem;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError>;
}

#[derive(Error, Debug)]
#[error("the slice is too short")]
pub struct SliceTooShortError;
22
crates/milli/src/heed_codec/obkv_codec.rs
Normal file

@@ -0,0 +1,22 @@
use std::borrow::Cow;

use heed::BoxedError;
use obkv::{KvReaderU16, KvWriterU16};

pub struct ObkvCodec;

impl<'a> heed::BytesDecode<'a> for ObkvCodec {
    type DItem = KvReaderU16<'a>;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        Ok(KvReaderU16::new(bytes))
    }
}

impl heed::BytesEncode<'_> for ObkvCodec {
    type EItem = KvWriterU16<Vec<u8>>;

    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        item.clone().into_inner().map(Cow::Owned).map_err(Into::into)
    }
}
50
crates/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs
Normal file

@@ -0,0 +1,50 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::mem::size_of;

use heed::{BoxedError, BytesDecode};
use roaring::RoaringBitmap;

use crate::heed_codec::BytesDecodeOwned;

pub struct BoRoaringBitmapCodec;

impl BoRoaringBitmapCodec {
    pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec<u8>) {
        out.reserve(bitmap.len() as usize * size_of::<u32>());
        bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes));
    }
}

impl BytesDecode<'_> for BoRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        let mut bitmap = RoaringBitmap::new();

        for chunk in bytes.chunks(size_of::<u32>()) {
            let bytes = chunk.try_into()?;
            bitmap.push(u32::from_ne_bytes(bytes));
        }

        Ok(bitmap)
    }
}

impl BytesDecodeOwned for BoRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::bytes_decode(bytes)
    }
}

impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
    type EItem = RoaringBitmap;

    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let mut out = Vec::new();
        BoRoaringBitmapCodec::serialize_into(item, &mut out);
        Ok(Cow::Owned(out))
    }
}
247
crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
Normal file

@@ -0,0 +1,247 @@
use std::borrow::Cow;
use std::io::{self, Cursor};
use std::mem::size_of;

use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use heed::BoxedError;
use roaring::RoaringBitmap;

use crate::heed_codec::BytesDecodeOwned;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};

/// This is the limit where using a byteorder becomes less size-efficient
/// than using a direct roaring encoding; it is also the point where we are able
/// to determine the encoding used only from the length of the array of bytes.
pub const THRESHOLD: usize = 7;

/// A conditional codec that uses either the RoaringBitmap
/// or a lighter ByteOrder en/decoding method.
pub struct CboRoaringBitmapCodec;

impl CboRoaringBitmapCodec {
    pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
        if roaring.len() <= THRESHOLD as u64 {
            roaring.len() as usize * size_of::<u32>()
        } else {
            roaring.serialized_size()
        }
    }

    pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
        if roaring.len() <= THRESHOLD as u64 {
            // If the number of items (u32s) to encode is less than or equal to the threshold,
            // it means that it would weigh the same or less than the RoaringBitmap
            // header, so we directly encode them using ByteOrder instead.
            for integer in roaring {
                vec.write_u32::<NativeEndian>(integer).unwrap();
            }
        } else {
            // Otherwise, we use the classic RoaringBitmapCodec that writes a header.
            roaring.serialize_into(vec).unwrap();
        }
    }

    pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
        if bytes.len() <= THRESHOLD * size_of::<u32>() {
            // If at most the threshold number of integers fits into this array of
            // bytes, it means that we used the ByteOrder codec serializer.
            let mut bitmap = RoaringBitmap::new();
            while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
                bitmap.insert(integer);
            }
            Ok(bitmap)
        } else {
            // Otherwise, it means we used the classic RoaringBitmapCodec and
            // that the header alone takes more space than threshold integers would.
            RoaringBitmap::deserialize_unchecked_from(bytes)
        }
    }

    pub fn intersection_with_serialized(
        mut bytes: &[u8],
        other: &RoaringBitmap,
    ) -> io::Result<RoaringBitmap> {
        // See the `deserialize_from` method above for implementation details.
        if bytes.len() <= THRESHOLD * size_of::<u32>() {
            let mut bitmap = RoaringBitmap::new();
            while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
                if other.contains(integer) {
                    bitmap.insert(integer);
                }
            }
            Ok(bitmap)
        } else {
            other.intersection_with_serialized_unchecked(Cursor::new(bytes))
        }
    }

    /// Merge serialized CboRoaringBitmaps in a buffer.
    ///
    /// If the merged values' length is under the threshold, the values are directly
    /// serialized in the buffer; otherwise a RoaringBitmap is created from the
    /// values and is serialized in the buffer.
    pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
    where
        I: IntoIterator<Item = A>,
        A: AsRef<[u8]>,
    {
        let mut roaring = RoaringBitmap::new();
        let mut vec = Vec::new();

        for bytes in slices {
            if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
                let mut reader = bytes.as_ref();
                while let Ok(integer) = reader.read_u32::<NativeEndian>() {
                    vec.push(integer);
                }
            } else {
                roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
            }
        }

        if roaring.is_empty() {
            vec.sort_unstable();
            vec.dedup();

            if vec.len() <= THRESHOLD {
                for integer in vec {
                    buffer.extend_from_slice(&integer.to_ne_bytes());
                }
            } else {
                // We can unwrap safely because the vector was sorted and deduplicated above.
                let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
                roaring.serialize_into(buffer)?;
            }
        } else {
            roaring.extend(vec);
            roaring.serialize_into(buffer)?;
        }

        Ok(())
    }

    /// Merges a DelAdd delta into a CboRoaringBitmap.
    pub fn merge_deladd_into<'a>(
        deladd: KvReaderDelAdd<'_>,
        previous: &[u8],
        buffer: &'a mut Vec<u8>,
    ) -> io::Result<Option<&'a [u8]>> {
        // Deserialize the bitmap that is already there
        let mut previous = Self::deserialize_from(previous)?;

        // Remove the integers we no longer want from the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Deletion) {
            previous -= Self::deserialize_from(value)?;
        }

        // Insert the new integers we want into the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Addition) {
            previous |= Self::deserialize_from(value)?;
        }

        if previous.is_empty() {
            return Ok(None);
        }

        Self::serialize_into(&previous, buffer);
        Ok(Some(&buffer[..]))
    }
}

impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::deserialize_from(bytes).map_err(Into::into)
    }
}

impl BytesDecodeOwned for CboRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::deserialize_from(bytes).map_err(Into::into)
    }
}

impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
    type EItem = RoaringBitmap;

    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let mut vec = Vec::with_capacity(Self::serialized_size(item));
        Self::serialize_into(item, &mut vec);
        Ok(Cow::Owned(vec))
    }
}

#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use heed::{BytesDecode, BytesEncode};

    use super::*;

    #[test]
    fn verify_encoding_decoding() {
        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
        let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap();
        let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap();
        assert_eq!(input, output);
    }

    #[test]
    fn verify_threshold() {
        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);

        // use the roaring bitmap
        let mut bytes = Vec::new();
        input.serialize_into(&mut bytes).unwrap();
        let roaring_size = bytes.len();

        // use byteorder directly
        let mut bytes = Vec::new();
        for integer in input {
            bytes.write_u32::<NativeEndian>(integer).unwrap();
        }
        let bo_size = bytes.len();

        assert!(roaring_size > bo_size);
    }

    #[test]
    fn merge_cbo_roaring_bitmaps() {
        let mut buffer = Vec::new();

        let small_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..6).unwrap(),
            RoaringBitmap::from_sorted_iter(1..3).unwrap(),
        ];

        let small_data: Vec<_> =
            small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
        CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap();
        let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
        let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap();
        assert_eq!(bitmap, expected);

        let medium_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..8).unwrap(),
            RoaringBitmap::from_sorted_iter(0..3).unwrap(),
            RoaringBitmap::from_sorted_iter(7..23).unwrap(),
        ];

        let medium_data: Vec<_> =
            medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
        buffer.clear();
        CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();

        let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
        let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
        assert_eq!(bitmap, expected);
    }
}
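A worked sketch (not part of the diff, assuming `CboRoaringBitmapCodec` and `THRESHOLD` are in scope) of the threshold arithmetic: with `THRESHOLD = 7`, up to 7 integers are stored as raw native-endian u32s (at most 28 bytes), and from 8 integers on, the classic roaring serialization, header included, takes over:

```rust
use roaring::RoaringBitmap;

fn main() {
    // 7 integers: stored as 7 raw u32s, 28 bytes, no header.
    let small: RoaringBitmap = (0..7).collect();
    assert_eq!(CboRoaringBitmapCodec::serialized_size(&small), 28);

    // 8 integers: the classic roaring serialization (with header) is used.
    let bigger: RoaringBitmap = (0..8).collect();
    assert_eq!(CboRoaringBitmapCodec::serialized_size(&bigger), bigger.serialized_size());
}
```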
7
crates/milli/src/heed_codec/roaring_bitmap/mod.rs
Normal file

@@ -0,0 +1,7 @@
mod bo_roaring_bitmap_codec;
pub mod cbo_roaring_bitmap_codec;
mod roaring_bitmap_codec;

pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
34
crates/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs
Normal file

@@ -0,0 +1,34 @@
use std::borrow::Cow;

use heed::BoxedError;
use roaring::RoaringBitmap;

use crate::heed_codec::BytesDecodeOwned;

pub struct RoaringBitmapCodec;

impl heed::BytesDecode<'_> for RoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into)
    }
}

impl BytesDecodeOwned for RoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
    }
}

impl heed::BytesEncode<'_> for RoaringBitmapCodec {
    type EItem = RoaringBitmap;

    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(item.serialized_size());
        item.serialize_into(&mut bytes)?;
        Ok(Cow::Owned(bytes))
    }
}
23
crates/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
Normal file

@@ -0,0 +1,23 @@
use std::mem;

use heed::{BoxedError, BytesDecode};

use crate::heed_codec::BytesDecodeOwned;

pub struct BoRoaringBitmapLenCodec;

impl BytesDecode<'_> for BoRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Ok((bytes.len() / mem::size_of::<u32>()) as u64)
    }
}

impl BytesDecodeOwned for BoRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::bytes_decode(bytes)
    }
}
33
crates/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
Normal file

@@ -0,0 +1,33 @@
use std::mem;

use heed::{BoxedError, BytesDecode};

use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
use crate::heed_codec::BytesDecodeOwned;

pub struct CboRoaringBitmapLenCodec;

impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
            // If at most the threshold number of integers fits into this array of
            // bytes, it means that we used the ByteOrder codec serializer.
            BoRoaringBitmapLenCodec::bytes_decode(bytes)
        } else {
            // Otherwise, it means we used the classic RoaringBitmapCodec and
            // that the header alone takes more space than threshold integers would.
            RoaringBitmapLenCodec::bytes_decode(bytes)
        }
    }
}

impl BytesDecodeOwned for CboRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::bytes_decode(bytes)
    }
}
7
crates/milli/src/heed_codec/roaring_bitmap_length/mod.rs
Normal file

@@ -0,0 +1,7 @@
mod bo_roaring_bitmap_len_codec;
mod cbo_roaring_bitmap_len_codec;
mod roaring_bitmap_len_codec;

pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
@ -0,0 +1,88 @@
|
|||
use std::io::{self, BufRead, Read};
|
||||
use std::mem;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use heed::BoxedError;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
|
||||
const SERIAL_COOKIE: u16 = 12347;
|
||||
|
||||
pub struct RoaringBitmapLenCodec;
|
||||
|
||||
impl RoaringBitmapLenCodec {
|
||||
// FIXME should be exported in the RoaringBitmap crate
|
||||
fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
|
||||
let (size, has_offsets) = {
|
||||
let cookie = bytes.read_u32::<LittleEndian>()?;
|
||||
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
|
||||
(bytes.read_u32::<LittleEndian>()? as usize, true)
|
||||
} else if (cookie as u16) == SERIAL_COOKIE {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
|
||||
} else {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
|
||||
}
|
||||
};
|
||||
|
||||
if size > u16::MAX as usize + 1 {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
|
||||
}
|
||||
|
||||
let mut description_bytes = vec![0u8; size * 4];
|
||||
bytes.read_exact(&mut description_bytes)?;
|
||||
let description_bytes = &mut &description_bytes[..];
|
||||
|
||||
if has_offsets {
|
||||
bytes.consume(size * 4);
|
||||
}
|
||||
|
||||
let mut length = 0;
|
||||
for _ in 0..size {
|
||||
let _key = description_bytes.read_u16::<LittleEndian>()?;
|
||||
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
|
||||
length += len;
|
||||
|
||||
if len <= 4096 {
|
||||
bytes.consume(len as usize * mem::size_of::<u16>());
|
||||
} else {
|
||||
bytes.consume(1024 * mem::size_of::<u64>())
|
||||
}
|
||||
}
|
||||
|
||||
Ok(length)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::heed_codec::RoaringBitmapCodec;
|
||||
|
||||
#[test]
|
||||
fn deserialize_roaring_bitmap_length() {
|
||||
let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
|
||||
let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap();
|
||||
assert_eq!(bitmap.len(), len);
|
||||
}
|
||||
}
|
79
crates/milli/src/heed_codec/str_beu32_codec.rs
Normal file

@@ -0,0 +1,79 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::mem::size_of;
use std::str;

use heed::BoxedError;

use super::SliceTooShortError;

pub struct StrBEU32Codec;

impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
    type DItem = (&'a str, u32);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let footer_len = size_of::<u32>();

        if bytes.len() < footer_len {
            return Err(SliceTooShortError.into());
        }

        let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
        let word = str::from_utf8(word)?;
        let pos = bytes.try_into().map(u32::from_be_bytes)?;

        Ok((word, pos))
    }
}

impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
    type EItem = (&'a str, u32);

    fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let pos = pos.to_be_bytes();

        let mut bytes = Vec::with_capacity(word.len() + pos.len());
        bytes.extend_from_slice(word.as_bytes());
        bytes.extend_from_slice(&pos[..]);

        Ok(Cow::Owned(bytes))
    }
}

pub struct StrBEU16Codec;

impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
    type DItem = (&'a str, u16);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let footer_len = size_of::<u16>();

        if bytes.len() < footer_len + 1 {
            return Err(SliceTooShortError.into());
        }

        let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
        // unwrap: we just checked the footer + 1 above.
        let (_, word) = word_plus_nul_byte.split_last().unwrap();
        let word = str::from_utf8(word)?;
        let pos = bytes.try_into().map(u16::from_be_bytes)?;

        Ok((word, pos))
    }
}

impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
    type EItem = (&'a str, u16);

    fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let pos = pos.to_be_bytes();

        let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
        bytes.extend_from_slice(word.as_bytes());
        bytes.push(0);
        bytes.extend_from_slice(&pos[..]);

        Ok(Cow::Owned(bytes))
    }
}
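A minimal sketch (not part of the diff, assuming `StrBEU16Codec` is in scope) of the `StrBEU16Codec` layout: the word bytes, a single 0 separator byte, then the big-endian u16 footer:

```rust
use heed::{BytesDecode, BytesEncode};

fn main() {
    let bytes = StrBEU16Codec::bytes_encode(&("hello", 7u16)).unwrap();
    assert_eq!(&bytes[..], b"hello\x00\x00\x07"); // word + '\0' + BE u16
    let (word, pos) = StrBEU16Codec::bytes_decode(&bytes).unwrap();
    assert_eq!((word, pos), ("hello", 7));
}
```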
21
crates/milli/src/heed_codec/str_ref.rs
Normal file

@@ -0,0 +1,21 @@
use std::borrow::Cow;

use heed::{BoxedError, BytesDecode, BytesEncode};

/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated
/// types are equivalent (= `&'a str`) and these values can reside within another structure.
pub struct StrRefCodec;

impl<'a> BytesEncode<'a> for StrRefCodec {
    type EItem = &'a str;

    fn bytes_encode(item: &'a &'a str) -> Result<Cow<'a, [u8]>, BoxedError> {
        Ok(Cow::Borrowed(item.as_bytes()))
    }
}

impl<'a> BytesDecode<'a> for StrRefCodec {
    type DItem = &'a str;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        std::str::from_utf8(bytes).map_err(Into::into)
    }
}
62
crates/milli/src/heed_codec/str_str_u8_codec.rs
Normal file

@@ -0,0 +1,62 @@
use std::borrow::Cow;
use std::ffi::CStr;
use std::str;

use heed::BoxedError;

use super::SliceTooShortError;

pub struct U8StrStrCodec;

impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
    type DItem = (u8, &'a str, &'a str);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
        let cstr = CStr::from_bytes_until_nul(bytes)?;
        let s1 = cstr.to_str()?;
        // skip the '\0' byte between the two strings.
        let s2 = str::from_utf8(&bytes[s1.len() + 1..])?;
        Ok((*n, s1, s2))
    }
}

impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
    type EItem = (u8, &'a str, &'a str);

    fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
        bytes.push(*n);
        bytes.extend_from_slice(s1.as_bytes());
        bytes.push(0);
        bytes.extend_from_slice(s2.as_bytes());
        Ok(Cow::Owned(bytes))
    }
}

pub struct UncheckedU8StrStrCodec;

impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
    type DItem = (u8, &'a [u8], &'a [u8]);

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
        let cstr = CStr::from_bytes_until_nul(bytes)?;
        let s1_bytes = cstr.to_bytes();
        // skip the '\0' byte between the two strings.
        let s2_bytes = &bytes[s1_bytes.len() + 1..];
        Ok((*n, s1_bytes, s2_bytes))
    }
}

impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
    type EItem = (u8, &'a [u8], &'a [u8]);

    fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
        let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
        bytes.push(*n);
        bytes.extend_from_slice(s1);
        bytes.push(0);
        bytes.extend_from_slice(s2);
        Ok(Cow::Owned(bytes))
    }
}
2909
crates/milli/src/index.rs
Normal file
File diff suppressed because it is too large
448
crates/milli/src/lib.rs
Normal file

@@ -0,0 +1,448 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![allow(clippy::type_complexity)]

#[cfg(test)]
#[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

#[macro_use]
pub mod documents;

mod asc_desc;
mod criterion;
mod error;
mod external_documents_ids;
pub mod facet;
mod fields_ids_map;
pub mod heed_codec;
pub mod index;
mod localized_attributes_rules;
pub mod order_by_map;
pub mod prompt;
pub mod proximity;
pub mod score_details;
mod search;
mod thread_pool_no_abort;
pub mod update;
pub mod vector;

#[cfg(test)]
#[macro_use]
pub mod snapshot_tests;
mod fieldids_weights_map;

use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::fmt;
use std::hash::BuildHasherDefault;

use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
pub use search::new::{
    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext,
    SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed, rhai};

pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};
pub use self::error::{
    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{
    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
    UncheckedU8StrStrCodec,
};
pub use self::index::Index;
pub use self::localized_attributes_rules::LocalizedAttributesRule;
use self::localized_attributes_rules::LocalizedFieldIds;
pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
pub use self::search::similar::Similar;
pub use self::search::{
    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
    Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};

pub type Result<T> = std::result::Result<T, error::Error>;

pub type Attribute = u32;
pub type BEU16 = heed::types::U16<heed::byteorder::BE>;
pub type BEU32 = heed::types::U32<heed::byteorder::BE>;
pub type BEU64 = heed::types::U64<heed::byteorder::BE>;
pub type DocumentId = u32;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
pub type FieldDistribution = BTreeMap<String, u64>;
pub type FieldId = u16;
pub type Weight = u16;
pub type Object = serde_json::Map<String, serde_json::Value>;
pub type Position = u32;
pub type RelativePosition = u16;
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;

/// A GeoPoint is a point in the cartesian plane, called xyz_point in the code. Its metadata
/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point
/// expressed in terms of latitude and longitude.
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;

/// The maximum length a LMDB key can be.
///
/// Note that the actual allowed length is a little bit higher, but
/// we keep a margin of safety.
const MAX_LMDB_KEY_LENGTH: usize = 500;

/// The maximum length a field value can be when inserted in an LMDB key.
///
/// This number is determined by the keys of the different facet databases
/// and adding a margin of safety.
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;

/// The maximum length a word can be
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;

pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

#[derive(Clone)]
pub struct TimeBudget {
    started_at: std::time::Instant,
    budget: std::time::Duration,

    /// When testing the time budget, ensuring we did more than one iteration of the bucket sort can be useful.
    /// But to avoid being flaky, the only option is to add the ability to stop after a specific number of calls instead of a `Duration`.
    #[cfg(test)]
    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
}

impl fmt::Debug for TimeBudget {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("TimeBudget")
            .field("started_at", &self.started_at)
            .field("budget", &self.budget)
            .field("left", &(self.budget - self.started_at.elapsed()))
            .finish()
    }
}

impl Default for TimeBudget {
    fn default() -> Self {
        Self::new(std::time::Duration::from_millis(1500))
    }
}

impl TimeBudget {
    pub fn new(budget: std::time::Duration) -> Self {
        Self {
            started_at: std::time::Instant::now(),
            budget,

            #[cfg(test)]
            stop_after: None,
        }
    }

    pub fn max() -> Self {
        Self::new(std::time::Duration::from_secs(u64::MAX))
    }

    #[cfg(test)]
    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
        use std::sync::atomic::AtomicUsize;
        use std::sync::Arc;

        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
        self
    }

    pub fn exceeded(&self) -> bool {
        #[cfg(test)]
        if let Some((current, stop_after)) = &self.stop_after {
            let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            if current >= *stop_after {
                return true;
            } else {
                // if a number has been specified then we entirely ignore the time budget
                return false;
            }
        }

        self.started_at.elapsed() > self.budget
    }
}

// Convert an absolute word position into a relative position.
// Return the field id of the attribute related to the absolute position
// and the relative position in the attribute.
pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
}

// Compute the absolute word position with the field id of the attribute and relative position in the attribute.
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
    (field_id as u32) << 16 | (relative as u32)
}
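A worked example (not part of the diff): an absolute position packs the field id in the high 16 bits and the relative position in the low 16 bits, so the two functions above are exact inverses:

```rust
fn main() {
    let absolute = absolute_from_relative_position(2, 5);
    assert_eq!(absolute, (2u32 << 16) | 5); // 0x0002_0005
    assert_eq!(relative_from_absolute_position(absolute), (2, 5));
}
```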
// TODO: this is wrong, but will do for now
/// Compute the "bucketed" absolute position from the field id and relative position in the field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
pub fn bucketed_position(relative: u16) -> u16 {
    // The first few relative positions are kept intact.
    if relative < 16 {
        relative
    } else if relative < 24 {
        // Relative positions between 16 and 24 all become equal to 24
        24
    } else {
        // Then, groups of positions that have the same base-2 logarithm are reduced to
        // the same relative position: the smallest power of 2 that is greater than them
        (relative as f64).log2().ceil().exp2() as u16
    }
}
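A worked example (not part of the diff) of the exponential bucketing above: small positions stay exact, mid-range positions collapse to 24, and larger ones snap up to the next power of two:

```rust
fn main() {
    assert_eq!(bucketed_position(7), 7);    // < 16: kept intact
    assert_eq!(bucketed_position(20), 24);  // 16..24: collapsed to 24
    assert_eq!(bucketed_position(25), 32);  // 2^ceil(log2(25)) = 32
    assert_eq!(bucketed_position(100), 128);
}
```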
/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
    displayed_fields: &[FieldId],
    fields_ids_map: &FieldsIdsMap,
    obkv: obkv::KvReaderU16<'_>,
) -> Result<Object> {
    displayed_fields
        .iter()
        .copied()
        .flat_map(|id| obkv.get(id).map(|value| (id, value)))
        .map(|(id, value)| {
            let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId {
                field_id: id,
                process: "obkv_to_json",
            })?;
            let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?;
            Ok((name.to_owned(), value))
        })
        .collect()
}

/// Transform every field of a raw obkv store into a JSON Object.
pub fn all_obkv_to_json(
    obkv: obkv::KvReaderU16<'_>,
    fields_ids_map: &FieldsIdsMap,
) -> Result<Object> {
    let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
    obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv)
}
/// Transform a JSON value into a string that can be indexed.
|
||||
pub fn json_to_string(value: &Value) -> Option<String> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
Value::Object(object) => {
|
||||
let mut buffer = String::new();
|
||||
let mut count = 0;
|
||||
for (key, value) in object {
|
||||
buffer.clear();
|
||||
let _ = write!(&mut buffer, "{}: ", key);
|
||||
if inner(value, &mut buffer) {
|
||||
buffer.push_str(". ");
|
||||
// We write the "key: value. " pair only when
|
||||
// we are sure that the value can be written.
|
||||
output.push_str(&buffer);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut string = String::new();
|
||||
if inner(value, &mut string) {
|
||||
Some(string)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
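
// Editor's sketch (not in the original source): `json_to_string` skips nulls
// and joins the remaining values with ". " separators. The exact output below
// assumes serde_json's `preserve_order` feature (enabled for this crate),
// which keeps object keys in insertion order.
#[test]
fn json_to_string_example() {
    let value = serde_json::json!({ "title": "Shazam!", "tags": ["hero", null] });
    assert_eq!(json_to_string(&value).unwrap(), "title: Shazam!. tags: hero. . ");
}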

/// Divides one slice into two at an index; returns `None` if mid is out of bounds.
fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    if mid <= slice.len() {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

/// Divides one slice into an array and the tail at an index;
/// returns `None` if `N` is out of bounds.
fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    let head = head.try_into().ok()?;
    Some((head, tail))
}

/// Returns the distance between two points in meters. Each point is composed of two f64 values,
/// one latitude and one longitude.
pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
    let a = geoutils::Location::new(a[0], a[1]);
    let b = geoutils::Location::new(b[0], b[1]);

    a.haversine_distance_to(&b).meters()
}

/// Convert a point expressed in terms of latitude and longitude to a point in the
/// cartesian coordinate system expressed in terms of x, y and z.
pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
    let [lat, lng] = coord.map(|f| f.to_radians());
    let x = lat.cos() * lng.cos();
    let y = lat.cos() * lng.sin();
    let z = lat.sin();

    [x, y, z]
}
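
// Editor's sketch (not in the original source): sanity checks for the two geo
// helpers above.
#[test]
fn geo_helpers_examples() {
    // (0°, 0°) lies on the equator at the prime meridian: x = 1, y = z = 0.
    let [x, y, z] = lat_lng_to_xyz(&[0.0, 0.0]);
    assert!((x - 1.0).abs() < 1e-9 && y.abs() < 1e-9 && z.abs() < 1e-9);
    // The haversine distance from a point to itself is zero meters.
    assert!(distance_between_two_points(&[48.86, 2.35], &[48.86, 2.35]).abs() < 1e-9);
}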

/// Returns `true` if the field matches one of the faceted fields.
/// See the function [`is_faceted_by`] below for what “matching” means.
pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
    faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref()))
}

/// Returns `true` if the field matches the facet.
/// ```
/// use milli::is_faceted_by;
/// // -- the valid basics
/// assert!(is_faceted_by("animaux", "animaux"));
/// assert!(is_faceted_by("animaux.chien", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
///
/// // -- the wrongs
/// assert!(!is_faceted_by("chien", "chat"));
/// assert!(!is_faceted_by("animaux", "animaux.chien"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
///
/// // -- the strange edge cases
/// assert!(!is_faceted_by("animaux.chien", "anima"));
/// assert!(!is_faceted_by("animaux.chien", "animau"));
/// assert!(!is_faceted_by("animaux.chien", "animaux."));
/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
}

pub fn normalize_facet(original: &str) -> String {
    CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    #[test]
    fn json_to_string_object() {
        let value = json!({
            "name": "John Doe",
            "age": 43,
            "not_there": null,
        });

        let string = json_to_string(&value).unwrap();
        assert_eq!(string, "name: John Doe. age: 43. ");
    }

    #[test]
    fn json_to_string_array() {
        let value = json!([
            { "name": "John Doe" },
            43,
            "hello",
            [ "I", "am", "fine" ],
            null,
        ]);

        let string = json_to_string(&value).unwrap();
        // We don't care about having two dots (.) in a row, as
        // the distance of hard separators is clamped to 8 anyway.
        assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
    }

    #[test]
    fn test_relative_position_conversion() {
        assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000));
        assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF));
        assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000));
        assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00));
        assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF));
        assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678));
        assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF));
    }

    #[test]
    fn test_absolute_position_conversion() {
        assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000));
        assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF));
        assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000));
        assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00));
        assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF));
        assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678));
        assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF));
    }

    #[test]
    fn test_all_obkv_to_json() {
        let mut fields_ids_map = FieldsIdsMap::new();
        let id1 = fields_ids_map.insert("field1").unwrap();
        let id2 = fields_ids_map.insert("field2").unwrap();

        let mut writer = obkv::KvWriterU16::memory();
        writer.insert(id1, b"1234").unwrap();
        writer.insert(id2, b"4321").unwrap();
        let contents = writer.into_inner().unwrap();
        let obkv = obkv::KvReaderU16::new(&contents);

        let expected = json!({
            "field1": 1234,
            "field2": 4321,
        });
        let expected = expected.as_object().unwrap();
        let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap();

        assert_eq!(&actual, expected);
    }
}
129 crates/milli/src/localized_attributes_rules.rs Normal file
@@ -0,0 +1,129 @@
use std::collections::HashMap;

use charabia::Language;
use serde::{Deserialize, Serialize};

use crate::fields_ids_map::FieldsIdsMap;
use crate::FieldId;

/// A rule that defines which locales are supported for a given attribute.
///
/// The rule is a list of attribute patterns and a list of locales.
/// The attribute patterns are matched against the attribute name.
/// The pattern `*` matches any attribute name.
/// The pattern `attribute_name*` matches any attribute name that starts with `attribute_name`.
/// The pattern `*attribute_name` matches any attribute name that ends with `attribute_name`.
/// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LocalizedAttributesRule {
    pub attribute_patterns: Vec<String>,
    pub locales: Vec<Language>,
}

impl LocalizedAttributesRule {
    pub fn new(attribute_patterns: Vec<String>, locales: Vec<Language>) -> Self {
        Self { attribute_patterns, locales }
    }

    pub fn match_str(&self, str: &str) -> bool {
        self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str))
    }

    pub fn locales(&self) -> &[Language] {
        &self.locales
    }
}

fn match_pattern(pattern: &str, str: &str) -> bool {
    if pattern == "*" {
        true
    } else if pattern.starts_with('*') && pattern.ends_with('*') {
        str.contains(&pattern[1..pattern.len() - 1])
    } else if let Some(pattern) = pattern.strip_prefix('*') {
        str.ends_with(pattern)
    } else if let Some(pattern) = pattern.strip_suffix('*') {
        str.starts_with(pattern)
    } else {
        pattern == str
    }
}
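
// Editor's sketch (not in the original source): how a rule's patterns combine
// through `match_str`. The `Language::Jpn` variant is assumed from charabia's
// ISO 639-3 naming.
#[test]
fn rule_match_str_example() {
    let rule = LocalizedAttributesRule::new(
        vec!["title*".to_string(), "*_ja".to_string()],
        vec![Language::Jpn],
    );
    assert!(rule.match_str("title")); // prefix pattern `title*`
    assert!(rule.match_str("overview_ja")); // suffix pattern `*_ja`
    assert!(!rule.match_str("overview")); // matches neither pattern
}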

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocalizedFieldIds {
    field_id_to_locales: HashMap<FieldId, Vec<Language>>,
}

impl LocalizedFieldIds {
    pub fn new<I: Iterator<Item = FieldId>>(
        rules: &Option<Vec<LocalizedAttributesRule>>,
        fields_ids_map: &FieldsIdsMap,
        fields_ids: I,
    ) -> Self {
        let mut field_id_to_locales = HashMap::new();

        if let Some(rules) = rules {
            let fields = fields_ids.filter_map(|field_id| {
                fields_ids_map.name(field_id).map(|field_name| (field_id, field_name))
            });

            for (field_id, field_name) in fields {
                let mut locales = Vec::new();
                for rule in rules {
                    if rule.match_str(field_name) {
                        locales.extend(rule.locales.iter());
                        // Take the first rule that matches
                        break;
                    }
                }

                if !locales.is_empty() {
                    locales.sort();
                    locales.dedup();
                    field_id_to_locales.insert(field_id, locales);
                }
            }
        }

        Self { field_id_to_locales }
    }

    pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
        self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
    }

    pub fn all_locales(&self) -> Vec<Language> {
        let mut locales = Vec::new();
        for field_locales in self.field_id_to_locales.values() {
            if !field_locales.is_empty() {
                locales.extend(field_locales);
            } else {
                // If a field has no locales, we consider it as not localized
                return Vec::new();
            }
        }
        locales.sort();
        locales.dedup();
        locales
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_match_pattern() {
        assert!(match_pattern("*", "test"));
        assert!(match_pattern("test*", "test"));
        assert!(match_pattern("test*", "testa"));
        assert!(match_pattern("*test", "test"));
        assert!(match_pattern("*test", "atest"));
        assert!(match_pattern("*test*", "test"));
        assert!(match_pattern("*test*", "atesta"));
        assert!(match_pattern("*test*", "atest"));
        assert!(match_pattern("*test*", "testa"));
        assert!(!match_pattern("test*test", "test"));
        assert!(!match_pattern("*test", "testa"));
        assert!(!match_pattern("test*", "atest"));
    }
}
57 crates/milli/src/order_by_map.rs Normal file
@@ -0,0 +1,57 @@
use std::collections::{hash_map, HashMap};
use std::iter::FromIterator;

use serde::{Deserialize, Deserializer, Serialize};

use crate::OrderBy;

#[derive(Serialize)]
pub struct OrderByMap(HashMap<String, OrderBy>);

impl OrderByMap {
    pub fn get(&self, key: impl AsRef<str>) -> OrderBy {
        self.0
            .get(key.as_ref())
            .copied()
            .unwrap_or_else(|| self.0.get("*").copied().unwrap_or_default())
    }

    pub fn insert(&mut self, key: String, value: OrderBy) -> Option<OrderBy> {
        self.0.insert(key, value)
    }
}

impl Default for OrderByMap {
    fn default() -> Self {
        let mut map = HashMap::new();
        map.insert("*".to_string(), OrderBy::Lexicographic);
        OrderByMap(map)
    }
}

impl FromIterator<(String, OrderBy)> for OrderByMap {
    fn from_iter<T: IntoIterator<Item = (String, OrderBy)>>(iter: T) -> Self {
        OrderByMap(iter.into_iter().collect())
    }
}

impl IntoIterator for OrderByMap {
    type Item = (String, OrderBy);
    type IntoIter = hash_map::IntoIter<String, OrderBy>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

impl<'de> Deserialize<'de> for OrderByMap {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let mut map = Deserialize::deserialize(deserializer).map(OrderByMap)?;
        // Insert the default ordering if it is not already overwritten by the user.
        map.0.entry("*".to_string()).or_insert(OrderBy::default());
        Ok(map)
    }
}
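
// Editor's sketch (not in the original source): `get` falls back to the `*`
// entry for unknown keys. This assumes the crate's `OrderBy` enum exposes
// `Lexicographic` and `Count` variants, as it does elsewhere in milli.
#[test]
fn order_by_map_fallback_example() {
    let mut map = OrderByMap::default(); // `default` pre-fills `*` -> Lexicographic
    map.insert("count".to_string(), OrderBy::Count);
    assert!(matches!(map.get("count"), OrderBy::Count));
    assert!(matches!(map.get("any_other_field"), OrderBy::Lexicographic));
}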
97 crates/milli/src/prompt/context.rs Normal file
@@ -0,0 +1,97 @@
use liquid::model::{
    ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};

use super::document::Document;
use super::fields::Fields;
use super::FieldsIdsMapWithMetadata;

#[derive(Debug, Clone)]
pub struct Context<'a> {
    document: &'a Document<'a>,
    fields: Fields<'a>,
}

impl<'a> Context<'a> {
    pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
        Self { document, fields: Fields::new(document, field_id_map) }
    }
}

impl<'a> ObjectView for Context<'a> {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        2
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(
            std::iter::once(self.document.as_value())
                .chain(std::iter::once(self.fields.as_value())),
        )
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(self.keys().zip(self.values()))
    }

    fn contains_key(&self, index: &str) -> bool {
        index == "doc" || index == "fields"
    }

    fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
        match index {
            "doc" => Some(self.document.as_value()),
            "fields" => Some(self.fields.as_value()),
            _ => None,
        }
    }
}

impl<'a> ValueView for Context<'a> {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectRender::new(self)))
    }

    fn source(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectSource::new(self)))
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: liquid::model::State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue | State::Empty | State::Blank => false,
        }
    }

    fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
        let s = ObjectRender::new(self).to_string();
        KStringCow::from_string(s)
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Object(
            self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
        )
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}
131 crates/milli/src/prompt/document.rs Normal file
@@ -0,0 +1,131 @@
use std::cell::OnceCell;
use std::collections::BTreeMap;

use liquid::model::{
    DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};

use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::FieldsIdsMap;

#[derive(Debug, Clone)]
pub struct Document<'a>(BTreeMap<&'a str, (&'a [u8], ParsedValue)>);

#[derive(Debug, Clone)]
struct ParsedValue(std::cell::OnceCell<LiquidValue>);

impl ParsedValue {
    fn empty() -> ParsedValue {
        ParsedValue(OnceCell::new())
    }

    fn get(&self, raw: &[u8]) -> &LiquidValue {
        self.0.get_or_init(|| {
            let value: serde_json::Value = serde_json::from_slice(raw).unwrap();
            liquid::model::to_value(&value).unwrap()
        })
    }
}
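
// Editor's note (not in the original source): `ParsedValue` memoizes the JSON
// deserialization of a field's raw bytes in a `OnceCell`, so a field that a
// template references several times is parsed at most once.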

impl<'a> Document<'a> {
    pub fn new(
        data: obkv::KvReaderU16<'a>,
        side: DelAdd,
        inverted_field_map: &'a FieldsIdsMap,
    ) -> Self {
        let mut out_data = BTreeMap::new();
        for (fid, raw) in data {
            let obkv = KvReaderDelAdd::new(raw);
            let Some(raw) = obkv.get(side) else {
                continue;
            };
            let Some(name) = inverted_field_map.name(fid) else {
                continue;
            };
            out_data.insert(name, (raw, ParsedValue::empty()));
        }
        Self(out_data)
    }

    fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    fn len(&self) -> usize {
        self.0.len()
    }

    fn iter(&self) -> impl Iterator<Item = (KString, LiquidValue)> + '_ {
        self.0.iter().map(|(&k, (raw, data))| (k.to_owned().into(), data.get(raw).to_owned()))
    }
}

impl<'a> ObjectView for Document<'a> {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        self.len() as i64
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        let keys = BTreeMap::keys(&self.0).map(|&s| s.into());
        Box::new(keys)
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(self.0.values().map(|(raw, v)| v.get(raw) as &dyn ValueView))
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(self.0.iter().map(|(&k, (raw, data))| (k.into(), data.get(raw) as &dyn ValueView)))
    }

    fn contains_key(&self, index: &str) -> bool {
        self.0.contains_key(index)
    }

    fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
        self.0.get(index).map(|(raw, v)| v.get(raw) as &dyn ValueView)
    }
}

impl<'a> ValueView for Document<'a> {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectRender::new(self)))
    }

    fn source(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectSource::new(self)))
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: liquid::model::State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
        }
    }

    fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
        let s = ObjectRender::new(self).to_string();
        KStringCow::from_string(s)
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Object(self.iter().collect())
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}
56 crates/milli/src/prompt/error.rs Normal file
@@ -0,0 +1,56 @@
use crate::error::FaultSource;

#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct NewPromptError {
    pub kind: NewPromptErrorKind,
    pub fault: FaultSource,
}

impl From<NewPromptError> for crate::Error {
    fn from(value: NewPromptError) -> Self {
        crate::Error::UserError(crate::UserError::InvalidPrompt(value))
    }
}

impl NewPromptError {
    pub(crate) fn cannot_parse_template(inner: liquid::Error) -> NewPromptError {
        Self { kind: NewPromptErrorKind::CannotParseTemplate(inner), fault: FaultSource::User }
    }

    pub(crate) fn invalid_fields_in_template(inner: liquid::Error) -> NewPromptError {
        Self { kind: NewPromptErrorKind::InvalidFieldsInTemplate(inner), fault: FaultSource::User }
    }
}

#[derive(Debug, thiserror::Error)]
pub enum NewPromptErrorKind {
    #[error("cannot parse template: {0}")]
    CannotParseTemplate(liquid::Error),
    #[error("template contains invalid fields: {0}. Only `doc.*`, `fields[i].name`, `fields[i].value` are supported")]
    InvalidFieldsInTemplate(liquid::Error),
}

#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct RenderPromptError {
    pub kind: RenderPromptErrorKind,
    pub fault: FaultSource,
}

impl RenderPromptError {
    pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError {
        Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User }
    }
}

#[derive(Debug, thiserror::Error)]
pub enum RenderPromptErrorKind {
    #[error("missing field in document: {0}")]
    MissingContext(liquid::Error),
}

impl From<RenderPromptError> for crate::Error {
    fn from(value: RenderPromptError) -> Self {
        crate::Error::UserError(crate::UserError::MissingDocumentField(value))
    }
}
184 crates/milli/src/prompt/fields.rs Normal file
@@ -0,0 +1,184 @@
use liquid::model::{
    ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};

use super::document::Document;
use super::{FieldMetadata, FieldsIdsMapWithMetadata};

#[derive(Debug, Clone)]
pub struct Fields<'a>(Vec<FieldValue<'a>>);

impl<'a> Fields<'a> {
    pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
        Self(
            std::iter::repeat(document)
                .zip(field_id_map.iter())
                .map(|(document, (fid, name))| FieldValue {
                    document,
                    name,
                    metadata: field_id_map.metadata(fid).unwrap_or_default(),
                })
                .collect(),
        )
    }
}

#[derive(Debug, Clone, Copy)]
pub struct FieldValue<'a> {
    name: &'a str,
    document: &'a Document<'a>,
    metadata: FieldMetadata,
}

impl<'a> ValueView for FieldValue<'a> {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectRender::new(self)))
    }

    fn source(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectSource::new(self)))
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: liquid::model::State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue | State::Empty | State::Blank => self.is_empty(),
        }
    }

    fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
        let s = ObjectRender::new(self).to_string();
        KStringCow::from_string(s)
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Object(
            self.iter().map(|(k, v)| (k.to_string().into(), v.to_value())).collect(),
        )
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}

impl<'a> FieldValue<'a> {
    pub fn name(&self) -> &&'a str {
        &self.name
    }

    pub fn value(&self) -> &dyn ValueView {
        self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
    }

    pub fn is_searchable(&self) -> &bool {
        &self.metadata.searchable
    }

    pub fn is_empty(&self) -> bool {
        self.size() == 0
    }
}

impl<'a> ObjectView for FieldValue<'a> {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        2
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        Box::new(["name", "value", "is_searchable"].iter().map(|&x| KStringCow::from_static(x)))
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(
            std::iter::once(self.name() as &dyn ValueView)
                .chain(std::iter::once(self.value()))
                .chain(std::iter::once(self.is_searchable() as &dyn ValueView)),
        )
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(self.keys().zip(self.values()))
    }

    fn contains_key(&self, index: &str) -> bool {
        index == "name" || index == "value" || index == "is_searchable"
    }

    fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
        match index {
            "name" => Some(self.name()),
            "value" => Some(self.value()),
            "is_searchable" => Some(self.is_searchable()),
            _ => None,
        }
    }
}

impl<'a> ArrayView for Fields<'a> {
    fn as_value(&self) -> &dyn ValueView {
        self.0.as_value()
    }

    fn size(&self) -> i64 {
        self.0.len() as i64
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        self.0.values()
    }

    fn contains_key(&self, index: i64) -> bool {
        self.0.contains_key(index)
    }

    fn get(&self, index: i64) -> Option<&dyn ValueView> {
        ArrayView::get(&self.0, index)
    }
}

impl<'a> ValueView for Fields<'a> {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> liquid::model::DisplayCow<'_> {
        self.0.render()
    }

    fn source(&self) -> liquid::model::DisplayCow<'_> {
        self.0.source()
    }

    fn type_name(&self) -> &'static str {
        self.0.type_name()
    }

    fn query_state(&self, state: liquid::model::State) -> bool {
        self.0.query_state(state)
    }

    fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
        self.0.to_kstr()
    }

    fn to_value(&self) -> LiquidValue {
        self.0.to_value()
    }

    fn as_array(&self) -> Option<&dyn ArrayView> {
        Some(self)
    }
}
281 crates/milli/src/prompt/mod.rs Normal file
@@ -0,0 +1,281 @@
mod context;
mod document;
pub(crate) mod error;
mod fields;
mod template_checker;

use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::num::NonZeroUsize;
use std::ops::Deref;

use error::{NewPromptError, RenderPromptError};

use self::context::Context;
use self::document::Document;
use crate::update::del_add::DelAdd;
use crate::{FieldId, FieldsIdsMap};

pub struct Prompt {
    template: liquid::Template,
    template_text: String,
    max_bytes: Option<NonZeroUsize>,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PromptData {
    pub template: String,
    pub max_bytes: Option<NonZeroUsize>,
}

impl From<Prompt> for PromptData {
    fn from(value: Prompt) -> Self {
        Self { template: value.template_text, max_bytes: value.max_bytes }
    }
}

impl TryFrom<PromptData> for Prompt {
    type Error = NewPromptError;

    fn try_from(value: PromptData) -> Result<Self, Self::Error> {
        Prompt::new(value.template, value.max_bytes)
    }
}

impl Clone for Prompt {
    fn clone(&self) -> Self {
        let template_text = self.template_text.clone();
        Self {
            template: new_template(&template_text).unwrap(),
            template_text,
            max_bytes: self.max_bytes,
        }
    }
}

fn new_template(text: &str) -> Result<liquid::Template, liquid::Error> {
    liquid::ParserBuilder::with_stdlib().build().unwrap().parse(text)
}

fn default_template() -> liquid::Template {
    new_template(default_template_text()).unwrap()
}

fn default_template_text() -> &'static str {
    "{% for field in fields %}\
    {% if field.is_searchable and field.value != nil %}\
    {{ field.name }}: {{ field.value }}\n\
    {% endif %}\
    {% endfor %}"
}
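
// Editor's note (not in the original source): with this default template, a
// document such as `{"title": "Shazam!", "overview": "..."}` whose fields are
// searchable renders as one `name: value` line per non-nil field:
//
//     title: Shazam!
//     overview: ...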

pub fn default_max_bytes() -> NonZeroUsize {
    NonZeroUsize::new(400).unwrap()
}

impl Default for Prompt {
    fn default() -> Self {
        Self {
            template: default_template(),
            template_text: default_template_text().into(),
            max_bytes: Some(default_max_bytes()),
        }
    }
}

impl Default for PromptData {
    fn default() -> Self {
        Self { template: default_template_text().into(), max_bytes: Some(default_max_bytes()) }
    }
}

impl Prompt {
    pub fn new(template: String, max_bytes: Option<NonZeroUsize>) -> Result<Self, NewPromptError> {
        let this = Self {
            template: liquid::ParserBuilder::with_stdlib()
                .build()
                .unwrap()
                .parse(&template)
                .map_err(NewPromptError::cannot_parse_template)?,
            template_text: template,
            max_bytes,
        };

        // render template with special object that's OK with `doc.*` and `fields.*`
        this.template
            .render(&template_checker::TemplateChecker)
            .map_err(NewPromptError::invalid_fields_in_template)?;

        Ok(this)
    }

    pub fn render(
        &self,
        document: obkv::KvReaderU16<'_>,
        side: DelAdd,
        field_id_map: &FieldsIdsMapWithMetadata,
    ) -> Result<String, RenderPromptError> {
        let document = Document::new(document, side, field_id_map);
        let context = Context::new(&document, field_id_map);

        let mut rendered =
            self.template.render(&context).map_err(RenderPromptError::missing_context)?;
        if let Some(max_bytes) = self.max_bytes {
            truncate(&mut rendered, max_bytes.get());
        }
        Ok(rendered)
    }
}

fn truncate(s: &mut String, max_bytes: usize) {
    if max_bytes >= s.len() {
        return;
    }
    for i in (0..=max_bytes).rev() {
        if s.is_char_boundary(i) {
            s.truncate(i);
            break;
        }
    }
}
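
// Editor's note (not in the original source): scanning down from `max_bytes`
// to the nearest char boundary keeps the truncated string valid UTF-8; the
// `template_truncation` test below exercises this on multi-byte characters.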

pub struct FieldsIdsMapWithMetadata<'a> {
    fields_ids_map: &'a FieldsIdsMap,
    metadata: BTreeMap<FieldId, FieldMetadata>,
}

impl<'a> FieldsIdsMapWithMetadata<'a> {
    pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self {
        let mut metadata: BTreeMap<FieldId, FieldMetadata> =
            fields_ids_map.ids().map(|id| (id, Default::default())).collect();
        for searchable_field_id in searchable_fields_ids {
            let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue };
            metadata.searchable = true;
        }
        Self { fields_ids_map, metadata }
    }

    pub fn metadata(&self, field_id: FieldId) -> Option<FieldMetadata> {
        self.metadata.get(&field_id).copied()
    }
}

impl<'a> Deref for FieldsIdsMapWithMetadata<'a> {
    type Target = FieldsIdsMap;

    fn deref(&self) -> &Self::Target {
        self.fields_ids_map
    }
}

#[derive(Debug, Default, Clone, Copy)]
pub struct FieldMetadata {
    pub searchable: bool,
}

#[cfg(test)]
mod test {
    use super::Prompt;
    use crate::error::FaultSource;
    use crate::prompt::error::{NewPromptError, NewPromptErrorKind};
    use crate::prompt::truncate;

    #[test]
    fn default_template() {
        // does not panic
        Prompt::default();
    }

    #[test]
    fn empty_template() {
        Prompt::new("".into(), None).unwrap();
    }

    #[test]
    fn template_ok() {
        Prompt::new("{{doc.title}}: {{doc.overview}}".into(), None).unwrap();
    }

    #[test]
    fn template_syntax() {
        assert!(matches!(
            Prompt::new("{{doc.title: {{doc.overview}}".into(), None),
            Err(NewPromptError {
                kind: NewPromptErrorKind::CannotParseTemplate(_),
                fault: FaultSource::User
            })
        ));
    }

    #[test]
    fn template_missing_doc() {
        assert!(matches!(
            Prompt::new("{{title}}: {{overview}}".into(), None),
            Err(NewPromptError {
                kind: NewPromptErrorKind::InvalidFieldsInTemplate(_),
                fault: FaultSource::User
            })
        ));
    }

    #[test]
    fn template_nested_doc() {
        Prompt::new("{{doc.actor.firstName}}: {{doc.actor.lastName}}".into(), None).unwrap();
    }

    #[test]
    fn template_fields() {
        Prompt::new("{% for field in fields %}{{field}}{% endfor %}".into(), None).unwrap();
    }

    #[test]
    fn template_fields_ok() {
        Prompt::new(
            "{% for field in fields %}{{field.name}}: {{field.value}}{% endfor %}".into(),
            None,
        )
        .unwrap();
    }

    #[test]
    fn template_fields_invalid() {
        assert!(matches!(
            // intentionally garbled field
            Prompt::new("{% for field in fields %}{{field.vaelu}} {% endfor %}".into(), None),
            Err(NewPromptError {
                kind: NewPromptErrorKind::InvalidFieldsInTemplate(_),
                fault: FaultSource::User
            })
        ));
    }

    // todo: test truncation
    #[test]
    fn template_truncation() {
        let mut s = "インテル ザー ビーグル".to_string();

        truncate(&mut s, 42);
        assert_eq!(s, "インテル ザー ビーグル");

        assert_eq!(s.len(), 32);
        truncate(&mut s, 32);
        assert_eq!(s, "インテル ザー ビーグル");

        truncate(&mut s, 31);
        assert_eq!(s, "インテル ザー ビーグ");
        truncate(&mut s, 30);
        assert_eq!(s, "インテル ザー ビーグ");
        truncate(&mut s, 28);
        assert_eq!(s, "インテル ザー ビー");
        truncate(&mut s, 26);
        assert_eq!(s, "インテル ザー ビー");
        truncate(&mut s, 25);
        assert_eq!(s, "インテル ザー ビ");

        assert_eq!("イ".len(), 3);
        truncate(&mut s, 3);
        assert_eq!(s, "イ");
        truncate(&mut s, 2);
        assert_eq!(s, "");
    }
}
301 crates/milli/src/prompt/template_checker.rs Normal file
@@ -0,0 +1,301 @@
use liquid::model::{
    ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{Object, ObjectView, ValueView};

#[derive(Debug)]
pub struct TemplateChecker;

#[derive(Debug)]
pub struct DummyDoc;

#[derive(Debug)]
pub struct DummyFields;

#[derive(Debug)]
pub struct DummyField;

const DUMMY_VALUE: &LiquidValue = &LiquidValue::Nil;

impl ObjectView for DummyField {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        2
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        Box::new(["name", "value"].iter().map(|s| KStringCow::from_static(s)))
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(vec![DUMMY_VALUE.as_view(), DUMMY_VALUE.as_view()].into_iter())
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(self.keys().zip(self.values()))
    }

    fn contains_key(&self, index: &str) -> bool {
        index == "name" || index == "value"
    }

    fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
        if self.contains_key(index) {
            Some(DUMMY_VALUE.as_view())
        } else {
            None
        }
    }
}

impl ValueView for DummyField {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.render()
    }

    fn source(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.source()
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue => false,
            State::Empty => false,
            State::Blank => false,
        }
    }

    fn to_kstr(&self) -> KStringCow<'_> {
        DUMMY_VALUE.to_kstr()
    }

    fn to_value(&self) -> LiquidValue {
        let mut this = Object::new();
        this.insert("name".into(), LiquidValue::Nil);
        this.insert("value".into(), LiquidValue::Nil);
        LiquidValue::Object(this)
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}

impl ValueView for DummyFields {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.render()
    }

    fn source(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.source()
    }

    fn type_name(&self) -> &'static str {
        "array"
    }

    fn query_state(&self, state: State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue => false,
            State::Empty => false,
            State::Blank => false,
        }
    }

    fn to_kstr(&self) -> KStringCow<'_> {
        DUMMY_VALUE.to_kstr()
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Array(vec![DummyField.to_value()])
    }

    fn as_array(&self) -> Option<&dyn ArrayView> {
        Some(self)
    }
}

impl ArrayView for DummyFields {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        u16::MAX as i64
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(std::iter::once(DummyField.as_value()))
    }

    fn contains_key(&self, index: i64) -> bool {
        index < self.size()
    }

    fn get(&self, _index: i64) -> Option<&dyn ValueView> {
        Some(DummyField.as_value())
    }
}

impl ObjectView for DummyDoc {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        1000
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        Box::new(std::iter::empty())
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(std::iter::empty())
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(std::iter::empty())
    }

    fn contains_key(&self, _index: &str) -> bool {
        true
    }

    fn get<'s>(&'s self, _index: &str) -> Option<&'s dyn ValueView> {
        // Recursively sends itself
        Some(self)
    }
}
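
// Editor's note (not in the original source): because `contains_key` is always
// true and `get` always returns `Some(self)`, arbitrarily nested paths such as
// `doc.actor.firstName` validate without knowing the real document schema.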

impl ValueView for DummyDoc {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.render()
    }

    fn source(&self) -> DisplayCow<'_> {
        DUMMY_VALUE.source()
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue => false,
            State::Empty => false,
            State::Blank => false,
        }
    }

    fn to_kstr(&self) -> KStringCow<'_> {
        DUMMY_VALUE.to_kstr()
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Nil
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}

impl ObjectView for TemplateChecker {
    fn as_value(&self) -> &dyn ValueView {
        self
    }

    fn size(&self) -> i64 {
        2
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
        Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
    }

    fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
        Box::new(
            std::iter::once(DummyDoc.as_value()).chain(std::iter::once(DummyFields.as_value())),
        )
    }

    fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
        Box::new(self.keys().zip(self.values()))
    }

    fn contains_key(&self, index: &str) -> bool {
        index == "doc" || index == "fields"
    }

    fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
        match index {
            "doc" => Some(DummyDoc.as_value()),
            "fields" => Some(DummyFields.as_value()),
            _ => None,
        }
    }
}

impl ValueView for TemplateChecker {
    fn as_debug(&self) -> &dyn std::fmt::Debug {
        self
    }

    fn render(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectRender::new(self)))
    }

    fn source(&self) -> liquid::model::DisplayCow<'_> {
        DisplayCow::Owned(Box::new(ObjectSource::new(self)))
    }

    fn type_name(&self) -> &'static str {
        "object"
    }

    fn query_state(&self, state: liquid::model::State) -> bool {
        match state {
            State::Truthy => true,
            State::DefaultValue | State::Empty | State::Blank => false,
        }
    }

    fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
        let s = ObjectRender::new(self).to_string();
        KStringCow::from_string(s)
    }

    fn to_value(&self) -> LiquidValue {
        LiquidValue::Object(
            self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
        )
    }

    fn as_object(&self) -> Option<&dyn ObjectView> {
        Some(self)
    }
}
37 crates/milli/src/proximity.rs Normal file
@@ -0,0 +1,37 @@
use std::cmp;

use serde::{Deserialize, Serialize};

use crate::{relative_from_absolute_position, Position};

pub const MAX_DISTANCE: u32 = 4;

pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs <= rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min((lhs - rhs) + 1, MAX_DISTANCE)
    }
}

pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
    let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs);
    let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs);
    if lhs_attr != rhs_attr {
        MAX_DISTANCE
    } else {
        index_proximity(lhs_index as u32, rhs_index as u32)
    }
}

pub fn path_proximity(path: &[Position]) -> u32 {
    path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
}
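
// Editor's sketch (not in the original source): proximity is asymmetric
// (right-to-left order costs one extra step) and saturates at `MAX_DISTANCE`.
#[test]
fn index_proximity_examples() {
    assert_eq!(index_proximity(3, 5), 2); // left-to-right: plain distance
    assert_eq!(index_proximity(5, 3), 3); // right-to-left: distance + 1
    assert_eq!(index_proximity(0, 40), MAX_DISTANCE); // clamped to 4
}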

#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub enum ProximityPrecision {
    #[default]
    ByWord,
    ByAttribute,
}
495 crates/milli/src/score_details.rs Normal file
@@ -0,0 +1,495 @@
use std::cmp::Ordering;

use itertools::Itertools;
use serde::Serialize;

use crate::distance_between_two_points;

#[derive(Debug, Clone, PartialEq)]
pub enum ScoreDetails {
    Words(Words),
    Typo(Typo),
    Proximity(Rank),
    Fid(Rank),
    Position(Rank),
    ExactAttribute(ExactAttribute),
    ExactWords(ExactWords),
    Sort(Sort),
    Vector(Vector),
    GeoSort(GeoSort),

    /// Returned when we don't have the time to finish applying all the subsequent ranking-rules
    Skipped,
}

#[derive(Clone, Copy)]
pub enum ScoreValue<'a> {
    Score(f64),
    Sort(&'a Sort),
    GeoSort(&'a GeoSort),
}

enum RankOrValue<'a> {
    Rank(Rank),
    Sort(&'a Sort),
    GeoSort(&'a GeoSort),
    Score(f64),
}

impl ScoreDetails {
    pub fn local_score(&self) -> Option<f64> {
        self.rank().map(Rank::local_score)
    }

    pub fn rank(&self) -> Option<Rank> {
        match self {
            ScoreDetails::Words(details) => Some(details.rank()),
            ScoreDetails::Typo(details) => Some(details.rank()),
            ScoreDetails::Proximity(details) => Some(*details),
            ScoreDetails::Fid(details) => Some(*details),
            ScoreDetails::Position(details) => Some(*details),
            ScoreDetails::ExactAttribute(details) => Some(details.rank()),
            ScoreDetails::ExactWords(details) => Some(details.rank()),
            ScoreDetails::Sort(_) => None,
            ScoreDetails::GeoSort(_) => None,
            ScoreDetails::Vector(_) => None,
            ScoreDetails::Skipped => Some(Rank { rank: 0, max_rank: 1 }),
        }
    }

    pub fn global_score<'a>(details: impl Iterator<Item = &'a Self> + 'a) -> f64 {
        Self::score_values(details)
            .find_map(|x| {
                let ScoreValue::Score(score) = x else {
                    return None;
                };
                Some(score)
            })
            .unwrap_or(1.0f64)
    }

    pub fn score_values<'a>(
        details: impl Iterator<Item = &'a Self> + 'a,
    ) -> impl Iterator<Item = ScoreValue<'a>> + 'a {
        details
            .map(ScoreDetails::rank_or_value)
            .coalesce(|left, right| match (left, right) {
                (RankOrValue::Rank(left), RankOrValue::Rank(right)) => {
                    Ok(RankOrValue::Rank(Rank::merge(left, right)))
                }
                (left, right) => Err((left, right)),
            })
            .map(|rank_or_value| match rank_or_value {
                RankOrValue::Rank(r) => ScoreValue::Score(r.local_score()),
                RankOrValue::Sort(s) => ScoreValue::Sort(s),
                RankOrValue::GeoSort(g) => ScoreValue::GeoSort(g),
                RankOrValue::Score(s) => ScoreValue::Score(s),
            })
    }

    fn rank_or_value(&self) -> RankOrValue<'_> {
        match self {
            ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()),
            ScoreDetails::Typo(t) => RankOrValue::Rank(t.rank()),
            ScoreDetails::Proximity(p) => RankOrValue::Rank(*p),
            ScoreDetails::Fid(f) => RankOrValue::Rank(*f),
            ScoreDetails::Position(p) => RankOrValue::Rank(*p),
            ScoreDetails::ExactAttribute(e) => RankOrValue::Rank(e.rank()),
            ScoreDetails::ExactWords(e) => RankOrValue::Rank(e.rank()),
            ScoreDetails::Sort(sort) => RankOrValue::Sort(sort),
            ScoreDetails::GeoSort(geosort) => RankOrValue::GeoSort(geosort),
            ScoreDetails::Vector(vector) => {
                RankOrValue::Score(vector.similarity.as_ref().map(|s| *s as f64).unwrap_or(0.0f64))
            }
            ScoreDetails::Skipped => RankOrValue::Rank(Rank { rank: 0, max_rank: 1 }),
        }
    }

    /// Panics
    ///
    /// - If Position is not preceded by Fid
    /// - If Exactness is not preceded by ExactAttribute
    pub fn to_json_map<'a>(
        details: impl Iterator<Item = &'a Self>,
    ) -> serde_json::Map<String, serde_json::Value> {
        let mut order = 0;
        let mut fid_details = None;
        let mut details_map = serde_json::Map::default();
        for details in details {
            match details {
                ScoreDetails::Words(words) => {
                    let words_details = serde_json::json!({
                        "order": order,
                        "matchingWords": words.matching_words,
                        "maxMatchingWords": words.max_matching_words,
                        "score": words.rank().local_score(),
                    });
                    details_map.insert("words".into(), words_details);
                    order += 1;
                }
                ScoreDetails::Typo(typo) => {
                    let typo_details = serde_json::json!({
                        "order": order,
                        "typoCount": typo.typo_count,
                        "maxTypoCount": typo.max_typo_count,
                        "score": typo.rank().local_score(),
                    });
                    details_map.insert("typo".into(), typo_details);
                    order += 1;
                }
                ScoreDetails::Proximity(proximity) => {
                    let proximity_details = serde_json::json!({
                        "order": order,
                        "score": proximity.local_score(),
                    });
                    details_map.insert("proximity".into(), proximity_details);
                    order += 1;
                }
                ScoreDetails::Fid(fid) => {
                    // copy the rank for future use in Position.
                    fid_details = Some(*fid);
                    // For now, fid is a virtual rule always followed by the "position" rule
                    let fid_details = serde_json::json!({
                        "order": order,
                        "attributeRankingOrderScore": fid.local_score(),
                    });
                    details_map.insert("attribute".into(), fid_details);
                    order += 1;
                }
                ScoreDetails::Position(position) => {
                    // For now, position is a virtual rule always preceded by the "fid" rule
                    let attribute_details = details_map
                        .get_mut("attribute")
                        .expect("position not preceded by attribute");
                    let attribute_details = attribute_details
                        .as_object_mut()
                        .expect("attribute details was not an object");
                    let Some(fid_details) = fid_details else {
                        unimplemented!("position not preceded by attribute");
                    };

                    attribute_details
                        .insert("queryWordDistanceScore".into(), position.local_score().into());
                    let score = Rank::global_score([fid_details, *position].iter().copied());
                    attribute_details.insert("score".into(), score.into());

                    // do not update the order since this was already done by fid
                }
                ScoreDetails::ExactAttribute(exact_attribute) => {
                    let exactness_details = serde_json::json!({
                        "order": order,
                        "matchType": exact_attribute,
                        "score": exact_attribute.rank().local_score(),
                    });
                    details_map.insert("exactness".into(), exactness_details);
                    order += 1;
                }
                ScoreDetails::ExactWords(details) => {
                    // For now, exactness is a virtual rule always preceded by the "ExactAttribute" rule
                    let exactness_details = details_map
                        .get_mut("exactness")
                        .expect("Exactness not preceded by exactAttribute");
                    let exactness_details = exactness_details
                        .as_object_mut()
                        .expect("exactness details was not an object");
                    if exactness_details.get("matchType").expect("missing 'matchType'")
                        == &serde_json::json!(ExactAttribute::NoExactMatch)
                    {
                        let score = Rank::global_score(
                            [ExactAttribute::NoExactMatch.rank(), details.rank()].iter().copied(),
                        );
                        // tiny detail, but we want the score to be the last displayed field,
                        // so we're removing it here, adding the other fields, then adding the new score
                        exactness_details.remove("score");
                        exactness_details
                            .insert("matchingWords".into(), details.matching_words.into());
                        exactness_details
                            .insert("maxMatchingWords".into(), details.max_matching_words.into());
                        exactness_details.insert("score".into(), score.into());
                    }
                    // do not update the order since this was already done by exactAttribute
                }
                ScoreDetails::Sort(details) => {
                    let sort = if details.redacted {
                        format!("<hidden-rule-{order}>")
                    } else {
                        format!(
                            "{}:{}",
                            details.field_name,
                            if details.ascending { "asc" } else { "desc" }
                        )
                    };
                    let value =
                        if details.redacted { "<hidden>".into() } else { details.value.clone() };
                    let sort_details = serde_json::json!({
                        "order": order,
                        "value": value,
                    });
                    details_map.insert(sort, sort_details);
                    order += 1;
                }
                ScoreDetails::GeoSort(details) => {
                    let sort = format!(
                        "_geoPoint({}, {}):{}",
                        details.target_point[0],
                        details.target_point[1],
                        if details.ascending { "asc" } else { "desc" }
                    );
                    let point = if let Some(value) = details.value {
                        serde_json::json!({ "lat": value[0], "lng": value[1]})
                    } else {
                        serde_json::Value::Null
                    };
                    let sort_details = serde_json::json!({
                        "order": order,
                        "value": point,
                        "distance": details.distance(),
                    });
                    details_map.insert(sort, sort_details);
                    order += 1;
                }
                ScoreDetails::Vector(s) => {
                    let similarity = s.similarity.as_ref();

                    let details = serde_json::json!({
                        "order": order,
                        "similarity": similarity,
                    });
                    details_map.insert("vectorSort".into(), details);
                    order += 1;
                }
                ScoreDetails::Skipped => {
                    details_map
                        .insert("skipped".to_string(), serde_json::json!({ "order": order }));
                    order += 1;
                }
            }
        }
        details_map
    }
}
|
||||
|
||||
/// The strategy to compute scores.
|
||||
///
|
||||
/// It makes sense to pass down this strategy to the internals of the search, because
|
||||
/// some optimizations (today, mainly skipping ranking rules for universes of a single document)
|
||||
/// are not correct to do when computing the scores.
|
||||
///
|
||||
/// This strategy could feasibly be extended to differentiate between the normalized score and the
|
||||
/// detailed scores, but it is not useful today as the normalized score is *derived from* the
|
||||
/// detailed scores.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub enum ScoringStrategy {
|
||||
/// Don't compute scores
|
||||
#[default]
|
||||
Skip,
|
||||
/// Compute detailed scores
|
||||
Detailed,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Words {
|
||||
pub matching_words: u32,
|
||||
pub max_matching_words: u32,
|
||||
}
|
||||
|
||||
impl Words {
|
||||
pub fn rank(&self) -> Rank {
|
||||
Rank { rank: self.matching_words, max_rank: self.max_matching_words }
|
||||
}
|
||||
|
||||
pub(crate) fn from_rank(rank: Rank) -> Self {
|
||||
Self { matching_words: rank.rank, max_matching_words: rank.max_rank }
|
||||
}
|
||||
}
|
||||
|
||||
/// Structure that is super similar to [`Words`], but whose semantics are a bit different.
///
/// In exactness, the number of matching words can actually be 0 with a non-zero score,
/// if no words from the query appear exactly in the document.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ExactWords {
    pub matching_words: u32,
    pub max_matching_words: u32,
}

impl ExactWords {
    pub fn rank(&self) -> Rank {
        // 0 matching words means last rank (1)
        Rank { rank: self.matching_words + 1, max_rank: self.max_matching_words + 1 }
    }

    pub(crate) fn from_rank(rank: Rank) -> Self {
        // The last rank (1) means that 0 words from the query appear exactly in the document.
        // The first rank (max_rank) means that (max_rank - 1) words from the query appear exactly in the document.
        Self {
            matching_words: rank.rank.saturating_sub(1),
            max_matching_words: rank.max_rank.saturating_sub(1),
        }
    }
}

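// Illustrative sketch (an addition, not part of the original diff): the
// `ExactWords` rank mapping checked on concrete numbers. With 0 exact
// matches out of 3 query words, `rank()` yields the last rank (1 of 4),
// and `from_rank` inverts it.
#[cfg(test)]
mod exact_words_example {
    use super::{ExactWords, Rank};

    #[test]
    fn rank_round_trip() {
        let e = ExactWords { matching_words: 0, max_matching_words: 3 };
        assert_eq!(e.rank(), Rank { rank: 1, max_rank: 4 });
        assert_eq!(ExactWords::from_rank(e.rank()), e);
    }
}
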
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Typo {
    pub typo_count: u32,
    pub max_typo_count: u32,
}

impl Typo {
    pub fn rank(&self) -> Rank {
        Rank {
            rank: (self.max_typo_count + 1).saturating_sub(self.typo_count),
            max_rank: (self.max_typo_count + 1),
        }
    }

    // max_rank = max_typo + 1
    // max_typo = max_rank - 1
    //
    // rank = max_typo - typo + 1
    // rank = max_rank - 1 - typo + 1
    // rank + typo = max_rank
    // typo = max_rank - rank
    pub fn from_rank(rank: Rank) -> Typo {
        Typo {
            typo_count: rank.max_rank.saturating_sub(rank.rank),
            max_typo_count: rank.max_rank.saturating_sub(1),
        }
    }
}

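// Illustrative sketch (an addition, not part of the original diff): the
// rank/typo arithmetic above, checked on concrete numbers. One typo out
// of at most two maps to rank 2 of 3, since fewer typos rank higher.
#[cfg(test)]
mod typo_rank_example {
    use super::{Rank, Typo};

    #[test]
    fn rank_round_trip() {
        let t = Typo { typo_count: 1, max_typo_count: 2 };
        assert_eq!(t.rank(), Rank { rank: 2, max_rank: 3 });
        assert_eq!(Typo::from_rank(t.rank()), t);
    }
}
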
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Rank {
    /// The ordinal rank, such that `max_rank` is the first rank, and 0 is the last rank.
    ///
    /// The higher the better. Documents with a rank of 0 have a score of 0 and are typically never returned
    /// (they don't match the query).
    pub rank: u32,
    /// The maximum possible rank. Documents with this rank have a score of 1.
    ///
    /// The max rank should not be 0.
    pub max_rank: u32,
}

impl Rank {
    pub fn local_score(self) -> f64 {
        self.rank as f64 / self.max_rank as f64
    }

    pub fn global_score(details: impl Iterator<Item = Self>) -> f64 {
        let mut rank = Rank { rank: 1, max_rank: 1 };
        for inner_rank in details {
            rank = Rank::merge(rank, inner_rank);
        }
        rank.local_score()
    }

    pub fn merge(mut outer: Rank, inner: Rank) -> Rank {
        outer.rank = outer.rank.saturating_sub(1);

        outer.rank *= inner.max_rank;
        outer.max_rank *= inner.max_rank;

        outer.rank += inner.rank;

        outer
    }
}

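// Illustrative sketch (an addition, not part of the original diff): how
// per-rule ranks compose into a global score. A 2/3 rank followed by a
// perfect 3/3 rank merges into 6/9, i.e. a global score of 2/3.
#[cfg(test)]
mod rank_merge_example {
    use super::Rank;

    #[test]
    fn global_score_of_two_rules() {
        let first = Rank { rank: 2, max_rank: 3 };
        let second = Rank { rank: 3, max_rank: 3 };
        let score = Rank::global_score([first, second].into_iter());
        assert!((score - 6.0 / 9.0).abs() < f64::EPSILON);
    }
}
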
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum ExactAttribute {
    ExactMatch,
    MatchesStart,
    NoExactMatch,
}

impl ExactAttribute {
    pub fn rank(&self) -> Rank {
        let rank = match self {
            ExactAttribute::ExactMatch => 3,
            ExactAttribute::MatchesStart => 2,
            ExactAttribute::NoExactMatch => 1,
        };
        Rank { rank, max_rank: 3 }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub struct Sort {
    pub field_name: String,
    pub ascending: bool,
    pub redacted: bool,
    pub value: serde_json::Value,
}

impl PartialOrd for Sort {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        if self.ascending != other.ascending {
            return None;
        }
        match (&self.value, &other.value) {
            (serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal),
            (serde_json::Value::Null, _) => Some(Ordering::Less),
            (_, serde_json::Value::Null) => Some(Ordering::Greater),
            // numbers are always before strings
            (serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater),
            (serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less),
            (serde_json::Value::Number(left), serde_json::Value::Number(right)) => {
                // FIXME: unwrap permitted here?
                let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?;
                // 12 < 42, and when ascending, we want to see 12 first, so the smallest.
                // Hence, when ascending, smaller is better.
                Some(if self.ascending { order.reverse() } else { order })
            }
            (serde_json::Value::String(left), serde_json::Value::String(right)) => {
                let order = left.cmp(right);
                // Taking e.g. "a" and "z":
                // "a" < "z", and when ascending, we want to see "a" first, so the smallest.
                // Hence, when ascending, smaller is better.
                Some(if self.ascending { order.reverse() } else { order })
            }
            _ => None,
        }
    }
}

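// Illustrative sketch (an addition, not part of the original diff): with
// `ascending: true`, the smaller value compares as the better (`Greater`)
// one, so a price of 12 beats a price of 42.
#[cfg(test)]
mod sort_order_example {
    use std::cmp::Ordering;

    use super::Sort;

    #[test]
    fn ascending_prefers_smaller_values() {
        let cheap = Sort {
            field_name: "price".to_string(),
            ascending: true,
            redacted: false,
            value: serde_json::json!(12),
        };
        let pricey = Sort { value: serde_json::json!(42), ..cheap.clone() };
        assert_eq!(cheap.partial_cmp(&pricey), Some(Ordering::Greater));
    }
}
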
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct GeoSort {
    pub target_point: [f64; 2],
    pub ascending: bool,
    pub value: Option<[f64; 2]>,
}

impl PartialOrd for GeoSort {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        if self.ascending != other.ascending {
            return None;
        }
        Some(match (self.distance(), other.distance()) {
            (None, None) => Ordering::Equal,
            (None, Some(_)) => Ordering::Less,
            (Some(_), None) => Ordering::Greater,
            (Some(left), Some(right)) => {
                let order = left.partial_cmp(&right)?;
                if self.ascending {
                    // when ascending, the one with the smallest distance has the best score
                    order.reverse()
                } else {
                    order
                }
            }
        })
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct Vector {
    pub similarity: Option<f32>,
}

impl GeoSort {
    pub fn distance(&self) -> Option<f64> {
        self.value.map(|value| distance_between_two_points(&self.target_point, &value))
    }
}

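// Illustrative sketch (an addition, not part of the original diff): for an
// ascending geo sort, the document closer to the target point compares as
// the better (`Greater`) one.
#[cfg(test)]
mod geo_sort_example {
    use std::cmp::Ordering;

    use super::GeoSort;

    #[test]
    fn ascending_prefers_closer_points() {
        let target_point = [48.86, 2.35];
        let near = GeoSort { target_point, ascending: true, value: Some([48.87, 2.36]) };
        let far = GeoSort { target_point, ascending: true, value: Some([40.71, -74.00]) };
        assert!(near.distance() < far.distance());
        assert_eq!(near.partial_cmp(&far), Some(Ordering::Greater));
    }
}
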
846
crates/milli/src/search/facet/facet_distribution.rs
Normal file

@@ -0,0 +1,846 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Display;
use std::ops::ControlFlow;
use std::{fmt, mem};

use heed::types::Bytes;
use heed::BytesDecode;
use indexmap::IndexMap;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};

use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
};
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter::{
    count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
use crate::{FieldId, Index, Result};

/// The default number of values per facet that will
/// be fetched from the key-value store.
pub const DEFAULT_VALUES_PER_FACET: usize = 100;

/// Threshold on the number of candidates that makes
/// the system choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 3000;

/// How should we fetch the facets?
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum OrderBy {
    /// By lexicographic order...
    #[default]
    Lexicographic,
    /// Or by number of docids in common?
    Count,
}

impl Display for OrderBy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            OrderBy::Lexicographic => f.write_str("alphabetically"),
            OrderBy::Count => f.write_str("by count"),
        }
    }
}

pub struct FacetDistribution<'a> {
    facets: Option<HashMap<String, OrderBy>>,
    candidates: Option<RoaringBitmap>,
    max_values_per_facet: usize,
    default_order_by: OrderBy,
    rtxn: &'a heed::RoTxn<'a>,
    index: &'a Index,
}

impl<'a> FacetDistribution<'a> {
    pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> FacetDistribution<'a> {
        FacetDistribution {
            facets: None,
            candidates: None,
            max_values_per_facet: DEFAULT_VALUES_PER_FACET,
            default_order_by: OrderBy::default(),
            rtxn,
            index,
        }
    }

    pub fn facets<I: IntoIterator<Item = (A, OrderBy)>, A: AsRef<str>>(
        &mut self,
        names_ordered_by: I,
    ) -> &mut Self {
        self.facets = Some(
            names_ordered_by
                .into_iter()
                .map(|(name, order_by)| (name.as_ref().to_string(), order_by))
                .collect(),
        );
        self
    }

    pub fn max_values_per_facet(&mut self, max: usize) -> &mut Self {
        self.max_values_per_facet = max;
        self
    }

    pub fn default_order_by(&mut self, order_by: OrderBy) -> &mut Self {
        self.default_order_by = order_by;
        self
    }

    pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
        self.candidates = Some(candidates);
        self
    }

    /// There is a small number of candidates OR we ask for facet string values, so we
    /// decide to iterate over the facet values of each one of them, one by one.
    fn facet_distribution_from_documents(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
        candidates: &RoaringBitmap,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        match facet_type {
            FacetType::Number => {
                let mut lexicographic_distribution = BTreeMap::new();
                let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();

                let db = self.index.field_id_docid_facet_f64s;
                for docid in candidates {
                    key_buffer.truncate(mem::size_of::<FieldId>());
                    key_buffer.extend_from_slice(&docid.to_be_bytes());
                    let iter = db
                        .remap_key_type::<Bytes>()
                        .prefix_iter(self.rtxn, &key_buffer)?
                        .remap_key_type::<FieldDocIdFacetF64Codec>();

                    for result in iter {
                        let ((_, _, value), ()) = result?;
                        *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
                    }
                }

                distribution.extend(
                    lexicographic_distribution
                        .into_iter()
                        .take(self.max_values_per_facet.saturating_sub(distribution.len())),
                );
            }
            FacetType::String => {
                let mut normalized_distribution = BTreeMap::new();
                let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();

                let db = self.index.field_id_docid_facet_strings;
                for docid in candidates {
                    key_buffer.truncate(mem::size_of::<FieldId>());
                    key_buffer.extend_from_slice(&docid.to_be_bytes());
                    let iter = db
                        .remap_key_type::<Bytes>()
                        .prefix_iter(self.rtxn, &key_buffer)?
                        .remap_key_type::<FieldDocIdFacetStringCodec>();

                    for result in iter {
                        let ((_, _, normalized_value), original_value) = result?;
                        let (_, count) = normalized_distribution
                            .entry(normalized_value)
                            .or_insert_with(|| (original_value, 0));
                        *count += 1;

                        // We'd like to break here if we have enough facet values, but we are
                        // collecting them by increasing docid, so higher ranked facets could
                        // be in later docids.
                    }
                }

                let iter = normalized_distribution
                    .into_iter()
                    .take(self.max_values_per_facet.saturating_sub(distribution.len()))
                    .map(|(_normalized, (original, count))| (original.to_string(), count));
                distribution.extend(iter);
            }
        }

        Ok(())
    }

    /// There are too many documents, so we use the facet levels to move through
    /// the facet values and find the associated candidates and values.
    fn facet_numbers_distribution_from_facet_levels(
        &self,
        field_id: FieldId,
        candidates: &RoaringBitmap,
        order_by: OrderBy,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        let search_function = match order_by {
            OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
            OrderBy::Count => count_iterate_over_facet_distribution,
        };

        search_function(
            self.rtxn,
            self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
            field_id,
            candidates,
            |facet_key, nbr_docids, _| {
                let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
                distribution.insert(facet_key.to_string(), nbr_docids);
                if distribution.len() == self.max_values_per_facet {
                    Ok(ControlFlow::Break(()))
                } else {
                    Ok(ControlFlow::Continue(()))
                }
            },
        )
    }

    fn facet_strings_distribution_from_facet_levels(
        &self,
        field_id: FieldId,
        candidates: &RoaringBitmap,
        order_by: OrderBy,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        let search_function = match order_by {
            OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
            OrderBy::Count => count_iterate_over_facet_distribution,
        };

        search_function(
            self.rtxn,
            self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
            field_id,
            candidates,
            |facet_key, nbr_docids, any_docid| {
                let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();

                let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
                let original_string = self
                    .index
                    .field_id_docid_facet_strings
                    .get(self.rtxn, &key)?
                    .unwrap()
                    .to_owned();

                distribution.insert(original_string, nbr_docids);
                if distribution.len() == self.max_values_per_facet {
                    Ok(ControlFlow::Break(()))
                } else {
                    Ok(ControlFlow::Continue(()))
                }
            },
        )
    }

    fn facet_values(
        &self,
        field_id: FieldId,
        order_by: OrderBy,
    ) -> heed::Result<IndexMap<String, u64>> {
        use FacetType::{Number, String};

        let mut distribution = IndexMap::new();
        match (order_by, &self.candidates) {
            (OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
                // Classic search, candidates were specified, we must return facet values only related
                // to those candidates. We also enter here for facet strings for performance reasons.
                self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
                self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
            }
            _ => {
                let universe;
                let candidates = match &self.candidates {
                    Some(cnd) => cnd,
                    None => {
                        universe = self.index.documents_ids(self.rtxn)?;
                        &universe
                    }
                };

                self.facet_numbers_distribution_from_facet_levels(
                    field_id,
                    candidates,
                    order_by,
                    &mut distribution,
                )?;
                self.facet_strings_distribution_from_facet_levels(
                    field_id,
                    candidates,
                    order_by,
                    &mut distribution,
                )?;
            }
        };

        Ok(distribution)
    }

    pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> {
        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let filterable_fields = self.index.filterable_fields(self.rtxn)?;
        let candidates = if let Some(candidates) = self.candidates.clone() {
            candidates
        } else {
            return Ok(Default::default());
        };

        let fields = match &self.facets {
            Some(facets) => {
                let invalid_fields: HashSet<_> = facets
                    .iter()
                    .map(|(name, _)| name)
                    .filter(|facet| !crate::is_faceted(facet, &filterable_fields))
                    .collect();
                if !invalid_fields.is_empty() {
                    return Err(UserError::InvalidFacetsDistribution {
                        invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
                        valid_facets_name: filterable_fields.into_iter().collect(),
                    }
                    .into());
                } else {
                    facets.iter().map(|(name, _)| name).cloned().collect()
                }
            }
            None => filterable_fields,
        };

        let mut distribution = BTreeMap::new();
        for (fid, name) in fields_ids_map.iter() {
            if crate::is_faceted(name, &fields) {
                let min_value = if let Some(min_value) = crate::search::facet::facet_min_value(
                    self.index,
                    self.rtxn,
                    fid,
                    candidates.clone(),
                )? {
                    min_value
                } else {
                    continue;
                };
                let max_value = if let Some(max_value) = crate::search::facet::facet_max_value(
                    self.index,
                    self.rtxn,
                    fid,
                    candidates.clone(),
                )? {
                    max_value
                } else {
                    continue;
                };

                distribution.insert(name.to_string(), (min_value, max_value));
            }
        }

        Ok(distribution)
    }

    pub fn execute(&self) -> Result<BTreeMap<String, IndexMap<String, u64>>> {
        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let filterable_fields = self.index.filterable_fields(self.rtxn)?;

        let fields = match self.facets {
            Some(ref facets) => {
                let invalid_fields: HashSet<_> = facets
                    .iter()
                    .map(|(name, _)| name)
                    .filter(|facet| !crate::is_faceted(facet, &filterable_fields))
                    .collect();
                if !invalid_fields.is_empty() {
                    return Err(UserError::InvalidFacetsDistribution {
                        invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
                        valid_facets_name: filterable_fields.into_iter().collect(),
                    }
                    .into());
                } else {
                    facets.iter().map(|(name, _)| name).cloned().collect()
                }
            }
            None => filterable_fields,
        };

        let mut distribution = BTreeMap::new();
        for (fid, name) in fields_ids_map.iter() {
            if crate::is_faceted(name, &fields) {
                let order_by = self
                    .facets
                    .as_ref()
                    .and_then(|facets| facets.get(name).copied())
                    .unwrap_or(self.default_order_by);
                let values = self.facet_values(fid, order_by)?;
                distribution.insert(name.to_string(), values);
            }
        }

        Ok(distribution)
    }
}

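// Illustrative usage sketch (an addition, not part of the original diff; it
// mirrors the tests below): the builder is chained, then finished with
// either `execute()` for the value counts or `compute_stats()` for the
// numeric (min, max) per facet. On a fresh index both results are empty.
#[cfg(test)]
mod builder_usage_example {
    use super::FacetDistribution;
    use crate::index::tests::TempIndex;

    #[test]
    fn builder_chain() {
        let index = TempIndex::new();
        let rtxn = index.read_txn().unwrap();
        let distribution = FacetDistribution::new(&rtxn, &index)
            .max_values_per_facet(10)
            .candidates((0..100).collect())
            .execute()
            .unwrap();
        assert!(distribution.is_empty());
        let stats = FacetDistribution::new(&rtxn, &index)
            .candidates((0..100).collect())
            .compute_stats()
            .unwrap();
        assert!(stats.is_empty());
    }
}
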
impl fmt::Debug for FacetDistribution<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let FacetDistribution {
            facets,
            candidates,
            max_values_per_facet,
            default_order_by,
            rtxn: _,
            index: _,
        } = self;

        f.debug_struct("FacetDistribution")
            .field("facets", facets)
            .field("candidates", candidates)
            .field("max_values_per_facet", max_values_per_facet)
            .field("default_order_by", default_order_by)
            .finish()
    }
}

#[cfg(test)]
mod tests {
    use std::iter;

    use big_s::S;
    use maplit::hashset;

    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::{milli_snap, FacetDistribution, OrderBy};

    #[test]
    fn few_candidates_few_facet_values() {
        // All the tests here avoid using the code in `facet_distribution_iter` because there aren't
        // enough candidates.

        let mut index = TempIndex::new();
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let documents = documents!([
            { "colour": "Blue" },
            { "colour": " blue" },
            { "colour": "RED" }
        ]);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([0, 1, 2].iter().copied().collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([1, 2].iter().copied().collect())
            .execute()
            .unwrap();

        // I think it would be fine if " blue" was "Blue" instead.
        // We just need to get any non-normalised string I think, even if it's not in
        // the candidates.
        milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([2].iter().copied().collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([0, 1, 2].iter().copied().collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::Count)))
            .candidates([0, 1, 2].iter().copied().collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);
    }

    #[test]
    fn many_candidates_few_facet_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"];

        let mut documents = vec![];
        for i in 0..10_000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 5],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..10_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::Count)))
            .candidates((0..5_000).collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Red": 3000}}"###);
    }

    #[test]
    fn many_candidates_many_facet_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..10_000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .max_values_per_facet(2)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..10_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992");
    }

    #[test]
    fn facet_stats() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
    }

    #[test]
    fn facet_stats_array() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 1776.0)}"###);
    }

    #[test]
    fn facet_stats_mixed_array() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": [facet_values[i % 1000], format!("{}", facet_values[i % 1000] + 1000)],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
    }

    #[test]
    fn facet_mixed_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = if i % 2 == 0 {
                serde_json::json!({
                    "colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
                })
            } else {
                serde_json::json!({
                    "colour": format!("{}", facet_values[i % 1000] + 10000),
                })
            };
            let document = document.as_object().unwrap().clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1998.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (218.0, 1776.0)}"###);
    }
}

301
crates/milli/src/search/facet/facet_distribution_iter.rs
Normal file

@@ -0,0 +1,301 @@
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::ControlFlow;

use heed::Result;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId};

/// Call the given closure on the facet distribution of the candidate documents.
///
/// The arguments to the closure are:
/// - the facet value, as a byte slice
/// - the number of documents among the candidates that contain this facet value
/// - the id of a document which contains the facet value. Note that this document
///   is not necessarily from the list of candidates, it is simply *any* document which
///   contains this facet value.
///
/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
/// keep iterating over the different facet values or stop.
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    callback: CB,
) -> Result<()>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
    let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
        Ok(())
    } else {
        Ok(())
    }
}

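// Illustrative usage sketch (an addition, not part of the original diff; it
// mirrors the tests at the bottom of this file): collect every `f64` facet
// value of field 0 with its document count, in lexicographic order.
#[cfg(test)]
mod lexicographic_usage_example {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::lexicographically_iterate_over_facet_distribution;
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::search::facet::tests::get_simple_index;

    #[test]
    fn collect_all_values() {
        let index = get_simple_index();
        let txn = index.env.read_txn().unwrap();
        let candidates = (0..=255).collect::<RoaringBitmap>();
        let mut pairs = Vec::new();
        lexicographically_iterate_over_facet_distribution(
            &txn,
            index.content,
            0,
            &candidates,
            |facet, count, _any_docid| {
                pairs.push((OrderedF64Codec::bytes_decode(facet).unwrap(), count));
                Ok(ControlFlow::Continue(()))
            },
        )
        .unwrap();
        assert!(!pairs.is_empty());
    }
}
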
pub fn count_iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    mut callback: CB,
) -> Result<()>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    /// # Important
    /// The order of the fields determines the order in which the facet values will be returned.
    /// This struct is inserted in a BinaryHeap and popped later on.
    #[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
    struct LevelEntry<'t> {
        /// The number of candidates in this entry.
        count: u64,
        /// The key level of the entry.
        level: Reverse<u8>,
        /// The left bound key.
        left_bound: &'t [u8],
        /// The number of keys we must look for after `left_bound`.
        group_size: u8,
        /// Any docid in the set of matching documents. Used to find the original facet string.
        any_docid: u32,
    }

    // Represents the list of keys that we must explore.
    let mut heap = BinaryHeap::new();
    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        // We first fill the heap with values from the highest level
        let starting_key =
            FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
        for el in db.range(rtxn, &(&starting_key..))?.take(usize::MAX) {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != field_id {
                break;
            }
            let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            let count = intersection.len();
            if count != 0 {
                heap.push(LevelEntry {
                    count,
                    level: Reverse(key.level),
                    left_bound: key.left_bound,
                    group_size: value.size,
                    any_docid: intersection.min().unwrap(),
                });
            }
        }

        while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
        {
            if let Reverse(0) = level {
                match (callback)(left_bound, count, any_docid)? {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(()),
                }
            } else {
                let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound };
                for el in db.range(rtxn, &(&starting_key..))?.take(group_size as usize) {
                    let (key, value) = el?;
                    // The range is unbounded on the right and the group size for the highest level is MAX,
                    // so we need to check that we are not iterating over the next field id
                    if key.field_id != field_id {
                        break;
                    }
                    let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
                        value.bitmap_bytes,
                        candidates,
                    )?;
                    let count = intersection.len();
                    if count != 0 {
                        heap.push(LevelEntry {
                            count,
                            level: Reverse(key.level),
                            left_bound: key.left_bound,
                            group_size: value.size,
                            any_docid: intersection.min().unwrap(),
                        });
                    }
                }
            }
        }
    }

    Ok(())
}

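// Illustrative sketch (an addition, not part of the original diff): why the
// field order of `LevelEntry` matters. `BinaryHeap::pop` returns the
// `Ord`-greatest entry, and a derived `Ord` compares fields in declaration
// order, so a higher `count` always wins and, on equal counts,
// `Reverse(level)` makes the deepest (smallest) level pop first. Tuples
// compare the same way:
#[cfg(test)]
mod heap_order_example {
    use std::cmp::Reverse;

    #[test]
    fn count_dominates_then_deepest_level() {
        // A higher count beats a lower one, whatever the level.
        assert!((10_u64, Reverse(3_u8)) > (9_u64, Reverse(0_u8)));
        // On equal counts, the lower level (closer to the leaves) wins.
        assert!((10_u64, Reverse(0_u8)) > (10_u64, Reverse(3_u8)));
    }
}
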
/// Iterate over the facet values in lexicographic order.
struct LexicographicFacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
    field_id: u16,
    callback: CB,
}

impl<'t, CB> LexicographicFacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    fn iterate_level_0(
        &mut self,
        candidates: &RoaringBitmap,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            if !docids_in_common.is_empty() {
                let any_docid_in_common = docids_in_common.min().unwrap();
                match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
                {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }

    fn iterate(
        &mut self,
        candidates: &RoaringBitmap,
        level: u8,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        if level == 0 {
            return self.iterate_level_0(candidates, starting_bound, group_size);
        }
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(&starting_key..))?.take(group_size);

        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            if !docids_in_common.is_empty() {
                let cf = self.iterate(
                    &docids_in_common,
                    level - 1,
                    key.left_bound,
                    value.size as usize,
                )?;
                match cf {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }
}

#[cfg(test)]
mod tests {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::lexicographically_iterate_over_facet_distribution;
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::milli_snap;
    use crate::search::facet::tests::{get_random_looking_index, get_simple_index};

    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                |facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
                    results.push_str(&format!("{facet}: {count}\n"));
                    Ok(ControlFlow::Continue(()))
                },
            )
            .unwrap();
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            let mut nbr_facets = 0;
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                |facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
                    if nbr_facets == 100 {
                        Ok(ControlFlow::Break(()))
                    } else {
                        nbr_facets += 1;
                        results.push_str(&format!("{facet}: {count}\n"));
                        Ok(ControlFlow::Continue(()))
                    }
                },
            )
            .unwrap();
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }
}

688
crates/milli/src/search/facet/facet_range_search.rs
Normal file

@@ -0,0 +1,688 @@
use std::ops::{Bound, RangeBounds};

use heed::BytesEncode;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, Result};

/// Find all the document ids for which the given field contains a value contained within
/// the two bounds.
pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>,
    field_id: u16,
    left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
    right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
    universe: Option<&RoaringBitmap>,
    docids: &mut RoaringBitmap,
) -> Result<()>
where
    BoundCodec: for<'a> BytesEncode<'a>,
    for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized,
{
    let inner;
    let left = match left {
        Bound::Included(left) => {
            inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
            Bound::Included(inner.as_ref())
        }
        Bound::Excluded(left) => {
            inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
            Bound::Excluded(inner.as_ref())
        }
        Bound::Unbounded => Bound::Unbounded,
    };
    let inner;
    let right = match right {
        Bound::Included(right) => {
            inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
            Bound::Included(inner.as_ref())
        }
        Bound::Excluded(right) => {
            inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
            Bound::Excluded(inner.as_ref())
        }
        Bound::Unbounded => Bound::Unbounded,
    };
    let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>();
    let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids };
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(starting_left_bound) =
        get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
    {
        let rightmost_bound =
            Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
        let group_size = usize::MAX;
        f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
        Ok(())
    } else {
        Ok(())
    }
}

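// Illustrative usage sketch (an addition, not part of the original diff; it
// mirrors the tests at the bottom of this file): collect the docids of
// field 0 whose facet value lies within the inclusive range [0.0, 42.0].
#[cfg(test)]
mod range_usage_example {
    use std::ops::Bound;

    use roaring::RoaringBitmap;

    use super::find_docids_of_facet_within_bounds;
    use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
    use crate::search::facet::tests::get_simple_index;

    #[test]
    fn inclusive_range() {
        let index = get_simple_index();
        let txn = index.env.read_txn().unwrap();
        let mut docids = RoaringBitmap::new();
        find_docids_of_facet_within_bounds::<OrderedF64Codec>(
            &txn,
            index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
            0,
            &Bound::Included(0.0),
            &Bound::Included(42.0),
            None,
            &mut docids,
        )
        .unwrap();
        assert!(!docids.is_empty());
    }
}
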
/// Fetch the document ids that have a facet with a value between the two given bounds.
struct FacetRangeSearch<'t, 'b, 'bitmap> {
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
    field_id: u16,
    left: Bound<&'b [u8]>,
    right: Bound<&'b [u8]>,
    /// The subset of document ids that are useful for this search.
    /// Great performance optimizations can be achieved by only fetching values matching this subset.
    universe: Option<&'bitmap RoaringBitmap>,
    docids: &'bitmap mut RoaringBitmap,
}

impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
    fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
        let left_key =
            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
        let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // the right side of the iter range is unbounded, so we need to make sure that we are
            // not iterating on the next field id
            if key.field_id != self.field_id {
                return Ok(());
            }
            let should_skip = {
                match self.left {
                    Bound::Included(left) => left > key.left_bound,
                    Bound::Excluded(left) => left >= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_skip {
                continue;
            }
            let should_stop = {
                match self.right {
                    Bound::Included(right) => right < key.left_bound,
                    Bound::Excluded(right) => right <= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_stop {
                break;
            }

            if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
                *self.docids |= match self.universe {
                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                        value.bitmap_bytes,
                        universe,
                    )?,
                    None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
                };
            }
        }
        Ok(())
    }

    /// Recursive part of the algorithm for level > 0.
    ///
    /// It works by visiting a slice of a level and checking whether the range associated
    /// with each visited element is contained within the bounds.
    ///
    /// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating
    /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to
    ///    the roaring bitmap.
    /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively
    ///    on the children of the element from the level below.
    /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating.
    ///    Note that the right bound is found through either the `left_bound` of the *next* element,
    ///    or from the `rightmost_bound` argument.
    ///
    /// ## Arguments
    /// - `level`: the level being visited
    /// - `starting_left_bound`: the left_bound of the first element to visit
    /// - `rightmost_bound`: the right bound of the last element that should be visited
    /// - `group_size`: the number of elements that should be visited
fn run(
|
||||
&mut self,
|
||||
level: u8,
|
||||
starting_left_bound: &'t [u8],
|
||||
rightmost_bound: Bound<&'t [u8]>,
|
||||
group_size: usize,
|
||||
) -> Result<()> {
|
||||
if level == 0 {
|
||||
return self.run_level_0(starting_left_bound, group_size);
|
||||
}
|
||||
|
||||
let left_key =
|
||||
FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
|
||||
let mut iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);
|
||||
|
||||
// We iterate over the range while keeping in memory the previous value
|
||||
let (mut previous_key, mut previous_value) = iter.next().unwrap()?;
|
||||
for el in iter {
|
||||
let (next_key, next_value) = el?;
|
||||
// the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX),
|
||||
// so we need to make sure that we are not iterating on the next field id
|
||||
if next_key.field_id != self.field_id {
|
||||
break;
|
||||
}
|
||||
// now, do we skip, stop, or visit?
|
||||
let should_skip = {
|
||||
match self.left {
|
||||
Bound::Included(left) => left >= next_key.left_bound,
|
||||
Bound::Excluded(left) => left >= next_key.left_bound,
|
||||
Bound::Unbounded => false,
|
||||
}
|
||||
};
|
||||
if should_skip {
|
||||
previous_key = next_key;
|
||||
previous_value = next_value;
|
||||
continue;
|
||||
}
|
||||
|
||||
// should we stop?
|
||||
// We should if the search range doesn't include any
|
||||
// element from the previous key or its successors
|
||||
let should_stop = {
|
||||
match self.right {
|
||||
Bound::Included(right) => right < previous_key.left_bound,
|
||||
Bound::Excluded(right) => right <= previous_key.left_bound,
|
||||
Bound::Unbounded => false,
|
||||
}
|
||||
};
|
||||
if should_stop {
|
||||
return Ok(());
|
||||
}
|
||||
// should we take the whole thing, without recursing down?
|
||||
let should_take_whole_group = {
|
||||
let left_condition = match self.left {
|
||||
Bound::Included(left) => previous_key.left_bound >= left,
|
||||
                    Bound::Excluded(left) => previous_key.left_bound > left,
                    Bound::Unbounded => true,
                };
                let right_condition = match self.right {
                    Bound::Included(right) => next_key.left_bound <= right,
                    Bound::Excluded(right) => next_key.left_bound <= right,
                    Bound::Unbounded => true,
                };
                left_condition && right_condition
            };
            if should_take_whole_group {
                *self.docids |= match self.universe {
                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                        previous_value.bitmap_bytes,
                        universe,
                    )?,
                    None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
                };
                previous_key = next_key;
                previous_value = next_value;
                continue;
            }
            // from here, we should visit the children of the previous element and
            // call the function recursively

            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let rightmost_bound = Bound::Excluded(next_key.left_bound);
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;

            previous_key = next_key;
            previous_value = next_value;
        }
        // previous_key/previous_value are the last element's key/value

        // now, do we skip, stop, or visit?
        let should_skip = {
            match (self.left, rightmost_bound) {
                (Bound::Included(left), Bound::Included(right)) => left > right,
                (Bound::Included(left), Bound::Excluded(right)) => left >= right,
                (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => {
                    left >= right
                }
                (Bound::Unbounded, _) => false,
                (_, Bound::Unbounded) => false, // should never run?
            }
        };
        if should_skip {
            return Ok(());
        }

        // should we stop?
        // We should if the search range doesn't include any
        // element from the previous key or its successors
        let should_stop = {
            match self.right {
                Bound::Included(right) => right < previous_key.left_bound,
                Bound::Excluded(right) => right <= previous_key.left_bound,
                Bound::Unbounded => false,
            }
        };
        if should_stop {
            return Ok(());
        }
        // should we take the whole thing, without recursing down?
        let should_take_whole_group = {
            let left_condition = match self.left {
                Bound::Included(left) => previous_key.left_bound >= left,
                Bound::Excluded(left) => previous_key.left_bound > left,
                Bound::Unbounded => true,
            };
            let right_condition = match (self.right, rightmost_bound) {
                (Bound::Included(right), Bound::Included(rightmost)) => {
                    // we need to stay within the bound ..=right
                    // the element's range goes to ..=rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Included(right), Bound::Excluded(rightmost)) => {
                    // we need to stay within the bound ..=right
                    // the element's range goes to ..rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Excluded(right), Bound::Included(rightmost)) => {
                    // we need to stay within the bound ..right
                    // the element's range goes to ..=rightmost
                    // so the element fits entirely within the bound if rightmost < right
                    rightmost < right
                }
                (Bound::Excluded(right), Bound::Excluded(rightmost)) => {
                    // we need to stay within the bound ..right
                    // the element's range goes to ..rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Unbounded, _) => {
                    // we need to stay within the bound ..inf
                    // so the element always fits entirely within the bound
                    true
                }
                (_, Bound::Unbounded) => {
                    // we need to stay within a finite bound
                    // but the element's range goes to ..inf
                    // so the element never fits entirely within the bound
                    false
                }
            };
            left_condition && right_condition
        };
        if should_take_whole_group {
            *self.docids |= match self.universe {
                Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                    previous_value.bitmap_bytes,
                    universe,
                )?,
                None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
            };
        } else {
            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;
        }

        Ok(())
    }
}
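// Illustrative sketch (not part of milli): the containment predicate used by
// `should_take_whole_group` above, extracted as a standalone function over
// `u32` bounds. A group starting at `group_left` and extending to
// `group_rightmost` fits entirely within the query range when both conditions
// hold. (`Bound` is already imported in this module.)
#[allow(dead_code)]
fn group_fits_within(
    query_left: Bound<u32>,
    query_right: Bound<u32>,
    group_left: u32,
    group_rightmost: Bound<u32>,
) -> bool {
    let left_condition = match query_left {
        Bound::Included(left) => group_left >= left,
        Bound::Excluded(left) => group_left > left,
        Bound::Unbounded => true,
    };
    let right_condition = match (query_right, group_rightmost) {
        (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right,
        (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost <= right,
        (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right,
        (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right,
        (Bound::Unbounded, _) => true,
        (_, Bound::Unbounded) => false,
    };
    left_condition && right_condition
}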
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Bound;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::find_docids_of_facet_within_bounds;
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
|
||||
use crate::milli_snap;
|
||||
use crate::search::facet::tests::{
|
||||
get_random_looking_index, get_random_looking_index_with_multiple_field_ids,
|
||||
get_simple_index, get_simple_index_with_multiple_field_ids,
|
||||
};
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
|
||||
#[test]
|
||||
fn random_looking_index_snap() {
|
||||
let index = get_random_looking_index();
|
||||
milli_snap!(format!("{index}"), @"3256c76a7c1b768a013e78d5fa6e9ff9");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_looking_index_with_multiple_field_ids_snap() {
|
||||
let index = get_random_looking_index_with_multiple_field_ids();
|
||||
milli_snap!(format!("{index}"), @"c3e5fe06a8f1c404ed4935b32c90a89b");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_index_snap() {
|
||||
let index = get_simple_index();
|
||||
milli_snap!(format!("{index}"), @"5dbfa134cc44abeb3ab6242fc182e48e");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn simple_index_with_multiple_field_ids_snap() {
|
||||
let index = get_simple_index_with_multiple_field_ids();
|
||||
milli_snap!(format!("{index}"), @"a4893298218f682bc76357f46777448c");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_range_increasing() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let mut results = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(0.);
|
||||
let end = Bound::Included(i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results.push_str(&format!("0 <= . <= {i} : {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results, format!("included_{i}"));
|
||||
let mut results = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Excluded(0.);
|
||||
let end = Bound::Excluded(i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results.push_str(&format!("0 < . < {i} : {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results, format!("excluded_{i}"));
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn filter_range_decreasing() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=255).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(255.);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
results.push_str(&format!("{i} <= . <= 255 : {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
|
||||
milli_snap!(results, format!("included_{i}"));
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=255).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Excluded(i);
|
||||
let end = Bound::Excluded(255.);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
results.push_str(&format!("{i} < . < 255 : {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
|
||||
milli_snap!(results, format!("excluded_{i}"));
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn filter_range_pinch() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=128).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(255. - i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
results.push_str(&format!(
|
||||
"{i} <= . <= {r} : {docids}\n",
|
||||
r = 255. - i,
|
||||
docids = display_bitmap(&docids)
|
||||
));
|
||||
}
|
||||
|
||||
milli_snap!(results, format!("included_{i}"));
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=128).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Excluded(i);
|
||||
let end = Bound::Excluded(255. - i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
results.push_str(&format!(
|
||||
"{i} < . < {r} {docids}\n",
|
||||
r = 255. - i,
|
||||
docids = display_bitmap(&docids)
|
||||
));
|
||||
}
|
||||
|
||||
milli_snap!(results, format!("excluded_{i}"));
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_range_unbounded() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let mut results = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Unbounded;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results.push_str(&format!(">= {i}: {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results, format!("start_from_included_{i}"));
|
||||
let mut results = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Unbounded;
|
||||
let end = Bound::Included(i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results.push_str(&format!("<= {i}: {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results, format!("end_at_included_{i}"));
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
milli_snap!(
|
||||
&format!("all field_id 0: {}\n", display_bitmap(&docids)),
|
||||
format!("unbounded_field_id_0_{i}")
|
||||
);
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
1,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
milli_snap!(
|
||||
&format!("all field_id 1: {}\n", display_bitmap(&docids)),
|
||||
format!("unbounded_field_id_1_{i}")
|
||||
);
|
||||
|
||||
drop(txn);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_range_exact() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let mut results_0 = String::new();
|
||||
let mut results_1 = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results_0.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
1,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results_1.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results_0, format!("field_id_0_exact_{i}"));
|
||||
milli_snap!(results_1, format!("field_id_1_exact_{i}"));
|
||||
|
||||
drop(txn);
|
||||
}
|
||||
}
|
||||
}
|
230
crates/milli/src/search/facet/facet_sort_ascending.rs
Normal file
|
@ -0,0 +1,230 @@
|
|||
use heed::Result;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{get_first_facet_value, get_highest_level};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
|
||||
/// Return an iterator which iterates over the given candidate documents in
/// ascending order of their facet value for the given field id.
///
/// The documents returned by the iterator are grouped by the facet values that
/// determined their rank. For example, given the documents:
///
/// ```text
/// 0: { "colour": ["blue", "green"] }
/// 1: { "colour": ["blue", "red"] }
/// 2: { "colour": ["orange", "red"] }
/// 3: { "colour": ["green", "red"] }
/// 4: { "colour": ["blue", "orange", "red"] }
/// ```
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
/// over the following elements:
/// ```text
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
/// [3]    // same for "green"
/// [2]    // same for "orange"
/// END
/// ```
/// Note that once a document id is returned by the iterator, it is never returned again.
|
||||
pub fn ascending_facet_sort<'t>(
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
|
||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
||||
let iter = db.range(rtxn, &(first_key..))?.take(usize::MAX);
|
||||
|
||||
Ok(itertools::Either::Left(AscendingFacetSort {
|
||||
rtxn,
|
||||
db,
|
||||
field_id,
|
||||
stack: vec![(candidates, iter)],
|
||||
}))
|
||||
} else {
|
||||
Ok(itertools::Either::Right(std::iter::empty()))
|
||||
}
|
||||
}
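// Usage sketch (hypothetical context: `rtxn`, `db`, and the candidates come
// from the caller; field id 0 is assumed to exist):
//
// let candidates = (0..=4).collect::<RoaringBitmap>();
// for result in ascending_facet_sort(&rtxn, db, 0, candidates)? {
//     let (docids, left_bound) = result?;
//     // `docids` is the subset of the remaining candidates that share the
//     // facet value encoded by `left_bound`; each docid is yielded once.
// }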
|
||||
|
||||
struct AscendingFacetSort<'t, 'e> {
|
||||
rtxn: &'t heed::RoTxn<'e>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
field_id: u16,
|
||||
#[allow(clippy::type_complexity)]
|
||||
stack: Vec<(
|
||||
RoaringBitmap,
|
||||
std::iter::Take<heed::RoRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>>,
|
||||
)>,
|
||||
}
|
||||
|
||||
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
|
||||
type Item = Result<(RoaringBitmap, &'t [u8])>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'outer: loop {
|
||||
let (documents_ids, deepest_iter) = self.stack.last_mut()?;
|
||||
for result in deepest_iter {
|
||||
let (
|
||||
FacetGroupKey { level, left_bound, field_id },
|
||||
FacetGroupValue { size: group_size, mut bitmap },
|
||||
) = result.unwrap();
|
||||
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||
// so we need to check that we are not iterating over the next field id
|
||||
if field_id != self.field_id {
|
||||
return None;
|
||||
}
|
||||
|
||||
// If the last iterator found an empty set of documents it means
// that we found all the documents in the sub level iterations already,
// we can pop this level iterator.
if documents_ids.is_empty() {
    // break out of the for loop into the end of the 'outer loop, which
    // pops the stack
    break;
}
|
||||
|
||||
bitmap &= &*documents_ids;
|
||||
if !bitmap.is_empty() {
|
||||
*documents_ids -= &bitmap;
|
||||
|
||||
if level == 0 {
|
||||
// Since the level is 0, the left_bound is the exact value.
|
||||
return Some(Ok((bitmap, left_bound)));
|
||||
}
|
||||
let starting_key_below =
|
||||
FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound };
|
||||
let iter = match self.db.range(self.rtxn, &(starting_key_below..)) {
|
||||
Ok(iter) => iter,
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
.take(group_size as usize);
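// Descend one level: the child iterator starts at the same left bound
// and is capped at the parent's group size, which yields a depth-first
// traversal of the facet tree down to level 0.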
|
||||
|
||||
self.stack.push((bitmap, iter));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
self.stack.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::milli_snap;
|
||||
use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
|
||||
use crate::search::facet::tests::{
|
||||
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
|
||||
get_simple_index, get_simple_string_index_with_multiple_field_ids,
|
||||
};
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
|
||||
#[test]
|
||||
fn filter_sort_ascending() {
|
||||
let indexes = [get_simple_index(), get_random_looking_index()];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, i);
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_sort_ascending_multiple_field_ids() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, format!("{i}-0"));
|
||||
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, format!("{i}-1"));
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_sort_ascending_with_no_candidates() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for index in indexes {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = RoaringBitmap::new();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_sort_ascending_with_inexisting_field_id() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for index in indexes {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = RoaringBitmap::new();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
244
crates/milli/src/search/facet/facet_sort_descending.rs
Normal file
|
@ -0,0 +1,244 @@
|
|||
use std::ops::Bound;
|
||||
|
||||
use heed::Result;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
|
||||
/// See the documentation for [`ascending_facet_sort`](super::ascending_facet_sort).
///
/// This function does the same thing, but in the opposite order.
|
||||
pub fn descending_facet_sort<'t>(
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
|
||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
||||
let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap();
|
||||
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
|
||||
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
|
||||
Ok(itertools::Either::Left(DescendingFacetSort {
|
||||
rtxn,
|
||||
db,
|
||||
field_id,
|
||||
stack: vec![(candidates, iter, Bound::Included(last_bound))],
|
||||
}))
|
||||
} else {
|
||||
Ok(itertools::Either::Right(std::iter::empty()))
|
||||
}
|
||||
}
|
||||
|
||||
struct DescendingFacetSort<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
field_id: u16,
|
||||
#[allow(clippy::type_complexity)]
|
||||
stack: Vec<(
|
||||
RoaringBitmap,
|
||||
std::iter::Take<
|
||||
heed::RoRevRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
>,
|
||||
Bound<&'t [u8]>,
|
||||
)>,
|
||||
}
|
||||
|
||||
impl<'t> Iterator for DescendingFacetSort<'t> {
|
||||
type Item = Result<(RoaringBitmap, &'t [u8])>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'outer: loop {
|
||||
let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?;
|
||||
for result in deepest_iter.by_ref() {
|
||||
let (
|
||||
FacetGroupKey { level, left_bound, field_id },
|
||||
FacetGroupValue { size: group_size, mut bitmap },
|
||||
) = result.unwrap();
|
||||
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||
// so we need to check that we are not iterating over the next field id
|
||||
if field_id != self.field_id {
|
||||
return None;
|
||||
}
|
||||
// If the last iterator found an empty set of documents it means
|
||||
// that we found all the documents in the sub level iterations already,
|
||||
// we can pop this level iterator.
|
||||
if documents_ids.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
bitmap &= &*documents_ids;
|
||||
if !bitmap.is_empty() {
|
||||
*documents_ids -= &bitmap;
|
||||
|
||||
if level == 0 {
|
||||
// Since we're at the level 0 the left_bound is the exact value.
|
||||
return Some(Ok((bitmap, left_bound)));
|
||||
}
|
||||
let starting_key_below =
|
||||
FacetGroupKey { field_id, level: level - 1, left_bound };
|
||||
|
||||
let end_key_below = match *right_bound {
|
||||
Bound::Included(right) => Bound::Included(FacetGroupKey {
|
||||
field_id,
|
||||
level: level - 1,
|
||||
left_bound: right,
|
||||
}),
|
||||
Bound::Excluded(right) => Bound::Excluded(FacetGroupKey {
|
||||
field_id,
|
||||
level: level - 1,
|
||||
left_bound: right,
|
||||
}),
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
};
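// Note: the child iterator must stop at the parent's current right
// bound; below we then shrink that bound to just before this group,
// since everything to its right has already been visited (the parent
// iterates right-to-left).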
|
||||
let prev_right_bound = *right_bound;
|
||||
*right_bound = Bound::Excluded(left_bound);
|
||||
let iter = match self
|
||||
.db
|
||||
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
.rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_below))
|
||||
{
|
||||
Ok(iter) => iter,
|
||||
Err(e) => return Some(Err(e)),
|
||||
}
|
||||
.take(group_size as usize);
|
||||
|
||||
self.stack.push((bitmap, iter, prev_right_bound));
|
||||
continue 'outer;
|
||||
}
|
||||
*right_bound = Bound::Excluded(left_bound);
|
||||
}
|
||||
self.stack.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::FacetGroupKeyCodec;
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::milli_snap;
|
||||
use crate::search::facet::facet_sort_descending::descending_facet_sort;
|
||||
use crate::search::facet::tests::{
|
||||
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
|
||||
get_simple_index, get_simple_index_with_multiple_field_ids,
|
||||
get_simple_string_index_with_multiple_field_ids,
|
||||
};
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
|
||||
#[test]
|
||||
fn filter_sort_descending() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, i);
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_sort_descending_multiple_field_ids() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, format!("{i}-0"));
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
milli_snap!(results, format!("{i}-1"));
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn filter_sort_descending_with_no_candidates() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for index in indexes {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = RoaringBitmap::new();
|
||||
let mut results = String::new();
|
||||
let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
let mut results = String::new();
|
||||
let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_sort_descending_with_inexisting_field_id() {
|
||||
let indexes = [
|
||||
get_simple_string_index_with_multiple_field_ids(),
|
||||
get_random_looking_string_index_with_multiple_field_ids(),
|
||||
];
|
||||
for index in indexes {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = RoaringBitmap::new();
|
||||
let mut results = String::new();
|
||||
let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
let (docids, _) = el.unwrap();
|
||||
results.push_str(&display_bitmap(&docids));
|
||||
results.push('\n');
|
||||
}
|
||||
assert!(results.is_empty());
|
||||
|
||||
txn.commit().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
1211
crates/milli/src/search/facet/filter.rs
Normal file
File diff suppressed because it is too large
227
crates/milli/src/search/facet/mod.rs
Normal file
|
@ -0,0 +1,227 @@
|
|||
pub use facet_sort_ascending::ascending_facet_sort;
|
||||
pub use facet_sort_descending::descending_facet_sort;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use heed::{BytesDecode, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
|
||||
pub use self::filter::{BadGeoError, Filter};
|
||||
pub use self::search::{FacetValueHit, SearchForFacetValues};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::{Index, Result};
|
||||
|
||||
mod facet_distribution;
|
||||
mod facet_distribution_iter;
|
||||
mod facet_range_search;
|
||||
mod facet_sort_ascending;
|
||||
mod facet_sort_descending;
|
||||
mod filter;
|
||||
mod search;
|
||||
|
||||
fn facet_extreme_value<'t>(
|
||||
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
|
||||
) -> Result<Option<f64>> {
|
||||
let extreme_value =
|
||||
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
|
||||
let (_, extreme_value) = extreme_value?;
|
||||
OrderedF64Codec::bytes_decode(extreme_value)
|
||||
.map(Some)
|
||||
.map_err(heed::Error::Decoding)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn facet_min_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
|
||||
|
||||
pub fn facet_max_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
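// Usage sketch (hypothetical `index`, `rtxn`, field id and candidates):
//
// let min = facet_min_value(&index, &rtxn, 0, candidates.clone())?;
// let max = facet_max_value(&index, &rtxn, 0, candidates)?;
// // Both are `None` when no candidate document has a number facet value
// // for this field.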
|
||||
|
||||
/// Get the first facet value in the facet database
|
||||
pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>(
|
||||
txn: &'t RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<Option<BoundCodec::DItem>>
|
||||
where
|
||||
BoundCodec: BytesDecode<'t>,
|
||||
{
|
||||
let mut level0prefix = vec![];
|
||||
level0prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level0prefix.push(0);
|
||||
let mut level0_iter_forward =
|
||||
db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, level0prefix.as_slice())?;
|
||||
if let Some(first) = level0_iter_forward.next() {
|
||||
let (first_key, _) = first?;
|
||||
let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key)
|
||||
.map_err(heed::Error::Decoding)?;
|
||||
Ok(Some(first_key.left_bound))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the last facet value in the facet database
|
||||
pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>(
|
||||
txn: &'t RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<Option<BoundCodec::DItem>>
|
||||
where
|
||||
BoundCodec: BytesDecode<'t>,
|
||||
{
|
||||
let mut level0prefix = vec![];
|
||||
level0prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level0prefix.push(0);
|
||||
let mut level0_iter_backward =
|
||||
db.remap_types::<Bytes, DecodeIgnore>().rev_prefix_iter(txn, level0prefix.as_slice())?;
|
||||
if let Some(last) = level0_iter_backward.next() {
|
||||
let (last_key, _) = last?;
|
||||
let last_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key)
|
||||
.map_err(heed::Error::Decoding)?;
|
||||
Ok(Some(last_key.left_bound))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
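// Illustrative sketch (assumed standalone helper, not part of this module):
// the level-0 prefix built by the two functions above is the field id in
// big-endian bytes followed by the level byte 0.
#[allow(dead_code)]
fn level0_prefix(field_id: u16) -> Vec<u8> {
    let mut prefix = Vec::with_capacity(3);
    prefix.extend_from_slice(&field_id.to_be_bytes()); // e.g. 2u16 -> [0x00, 0x02]
    prefix.push(0); // level 0
    prefix
}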
|
||||
|
||||
/// Get the height of the highest level in the facet database
|
||||
pub(crate) fn get_highest_level<'t, DC>(
|
||||
txn: &'t RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<u8> {
|
||||
let field_id_prefix = &field_id.to_be_bytes();
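// Keys sort by (field_id, level, left_bound), so the last key under the
// field id prefix belongs to the highest level: a reverse prefix iterator
// reaches it first.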
|
||||
Ok(db
|
||||
.remap_types::<Bytes, DecodeIgnore>()
|
||||
.rev_prefix_iter(txn, field_id_prefix)?
|
||||
.next()
|
||||
.map(|el| {
|
||||
let (key, _) = el.unwrap();
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
|
||||
key.level
|
||||
})
|
||||
.unwrap_or(0))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use rand::{Rng, SeedableRng};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::update::facet::test_helpers::FacetIndex;
|
||||
|
||||
pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
for i in 0..256u16 {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(i as u32);
|
||||
index.insert(&mut txn, 0, &(i as f64), &bitmap);
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
pub fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
|
||||
for key in std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128) {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(key);
|
||||
bitmap.insert(key + 100);
|
||||
index.insert(&mut txn, 0, &(key as f64), &bitmap);
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
for fid in 0..2 {
|
||||
for i in 0..256u16 {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(i as u32);
|
||||
index.insert(&mut txn, fid, &(i as f64), &bitmap);
|
||||
}
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
let keys =
|
||||
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
|
||||
for fid in 0..2 {
|
||||
for &key in &keys {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(key);
|
||||
bitmap.insert(key + 100);
|
||||
index.insert(&mut txn, fid, &(key as f64), &bitmap);
|
||||
}
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
|
||||
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
for fid in 0..2 {
|
||||
for i in 0..256u16 {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(i as u32);
|
||||
if i % 2 == 0 {
|
||||
index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap);
|
||||
} else {
|
||||
index.insert(&mut txn, fid, &"", &bitmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
|
||||
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
let keys =
|
||||
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
|
||||
for fid in 0..2 {
|
||||
for &key in &keys {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(key);
|
||||
bitmap.insert(key + 100);
|
||||
if key % 2 == 0 {
|
||||
index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap);
|
||||
} else {
|
||||
index.insert(&mut txn, fid, &"", &bitmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
txn.commit().unwrap();
|
||||
index
|
||||
}
|
||||
}
|
358
crates/milli/src/search/facet/search.rs
Normal file
|
@ -0,0 +1,358 @@
|
|||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use charabia::normalizer::NormalizerOption;
|
||||
use charabia::{Language, Normalize, StrDetection, Token};
|
||||
use fst::automaton::{Automaton, Str};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use roaring::RoaringBitmap;
|
||||
use tracing::error;
|
||||
|
||||
use crate::error::UserError;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
|
||||
use crate::search::build_dfa;
|
||||
use crate::{DocumentId, FieldId, OrderBy, Result, Search};
|
||||
|
||||
/// The maximum number of values per facet returned by the facet search route.
|
||||
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;
|
||||
|
||||
pub struct SearchForFacetValues<'a> {
|
||||
query: Option<String>,
|
||||
facet: String,
|
||||
search_query: Search<'a>,
|
||||
max_values: usize,
|
||||
is_hybrid: bool,
|
||||
locales: Option<Vec<Language>>,
|
||||
}
|
||||
|
||||
impl<'a> SearchForFacetValues<'a> {
|
||||
pub fn new(
|
||||
facet: String,
|
||||
search_query: Search<'a>,
|
||||
is_hybrid: bool,
|
||||
) -> SearchForFacetValues<'a> {
|
||||
SearchForFacetValues {
|
||||
query: None,
|
||||
facet,
|
||||
search_query,
|
||||
max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
|
||||
is_hybrid,
|
||||
locales: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
|
||||
self.query = Some(query.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn max_values(&mut self, max: usize) -> &mut Self {
|
||||
self.max_values = max;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn locales(&mut self, locales: Vec<Language>) -> &mut Self {
|
||||
self.locales = Some(locales);
|
||||
self
|
||||
}
|
||||
|
||||
fn one_original_value_of(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
facet_str: &str,
|
||||
any_docid: DocumentId,
|
||||
) -> Result<Option<String>> {
|
||||
let index = self.search_query.index;
|
||||
let rtxn = self.search_query.rtxn;
|
||||
let key: (FieldId, _, &str) = (field_id, any_docid, facet_str);
|
||||
Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned()))
|
||||
}
|
||||
|
||||
pub fn execute(&self) -> Result<Vec<FacetValueHit>> {
|
||||
let index = self.search_query.index;
|
||||
let rtxn = self.search_query.rtxn;
|
||||
|
||||
let filterable_fields = index.filterable_fields(rtxn)?;
|
||||
if !filterable_fields.contains(&self.facet) {
|
||||
let (valid_fields, hidden_fields) =
|
||||
index.remove_hidden_fields(rtxn, filterable_fields)?;
|
||||
|
||||
return Err(UserError::InvalidFacetSearchFacetName {
|
||||
field: self.facet.clone(),
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let fid = match fields_ids_map.id(&self.facet) {
|
||||
Some(fid) => fid,
|
||||
// we return an empty list of results when the attribute has been
|
||||
// set as filterable but no document contains this field (yet).
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? {
|
||||
Some(fst) => fst,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
let search_candidates = self.search_query.execute_for_candidates(
|
||||
self.is_hybrid
|
||||
|| self
|
||||
.search_query
|
||||
.semantic
|
||||
.as_ref()
|
||||
.and_then(|semantic| semantic.vector.as_ref())
|
||||
.is_some(),
|
||||
)?;
|
||||
|
||||
let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) {
|
||||
OrderBy::Lexicographic => ValuesCollection::by_lexicographic(self.max_values),
|
||||
OrderBy::Count => ValuesCollection::by_count(self.max_values),
|
||||
};
|
||||
|
||||
match self.query.as_ref() {
|
||||
Some(query) => {
|
||||
let query = normalize_facet_string(query, self.locales.as_deref());
|
||||
let query = query.as_ref();
|
||||
|
||||
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
|
||||
let field_authorizes_typos =
|
||||
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
|
||||
|
||||
if authorize_typos && field_authorizes_typos {
|
||||
let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
|
||||
if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
|
||||
if fst.contains(query) {
|
||||
self.fetch_original_facets_using_normalized(
|
||||
fid,
|
||||
query,
|
||||
query,
|
||||
&search_candidates,
|
||||
&mut results,
|
||||
)?;
|
||||
}
|
||||
} else {
|
||||
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
|
||||
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
|
||||
|
||||
let is_prefix = true;
|
||||
let automaton = if query.len() < one_typo as usize {
|
||||
build_dfa(query, 0, is_prefix)
|
||||
} else if query.len() < two_typos as usize {
|
||||
build_dfa(query, 1, is_prefix)
|
||||
} else {
|
||||
build_dfa(query, 2, is_prefix)
|
||||
};
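// For example, with Meilisearch's default thresholds (one typo from 5
// characters, two typos from 9 -- assumed defaults here), "cat" builds a
// 0-typo DFA, "chairs" a 1-typo DFA, and "elephants" a 2-typo DFA.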
|
||||
|
||||
let mut stream = fst.search(automaton).into_stream();
|
||||
while let Some(facet_value) = stream.next() {
|
||||
let value = std::str::from_utf8(facet_value)?;
|
||||
if self
|
||||
.fetch_original_facets_using_normalized(
|
||||
fid,
|
||||
value,
|
||||
query,
|
||||
&search_candidates,
|
||||
&mut results,
|
||||
)?
|
||||
.is_break()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let automaton = Str::new(query).starts_with();
|
||||
let mut stream = fst.search(automaton).into_stream();
|
||||
while let Some(facet_value) = stream.next() {
|
||||
let value = std::str::from_utf8(facet_value)?;
|
||||
if self
|
||||
.fetch_original_facets_using_normalized(
|
||||
fid,
|
||||
value,
|
||||
query,
|
||||
&search_candidates,
|
||||
&mut results,
|
||||
)?
|
||||
.is_break()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
|
||||
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
|
||||
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
|
||||
result?;
|
||||
let count = search_candidates.intersection_len(&bitmap);
|
||||
if count != 0 {
|
||||
let value = self
|
||||
.one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
|
||||
.unwrap_or_else(|| left_bound.to_string());
|
||||
if results.insert(FacetValueHit { value, count }).is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(results.into_sorted_vec())
|
||||
}
|
||||
|
||||
fn fetch_original_facets_using_normalized(
|
||||
&self,
|
||||
fid: FieldId,
|
||||
value: &str,
|
||||
query: &str,
|
||||
search_candidates: &RoaringBitmap,
|
||||
results: &mut ValuesCollection,
|
||||
) -> Result<ControlFlow<()>> {
|
||||
let index = self.search_query.index;
|
||||
let rtxn = self.search_query.rtxn;
|
||||
|
||||
let database = index.facet_id_normalized_string_strings;
|
||||
let key = (fid, value);
|
||||
let original_strings = match database.get(rtxn, &key)? {
|
||||
Some(original_strings) => original_strings,
|
||||
None => {
|
||||
error!("the facet value is missing from the facet database: {key:?}");
|
||||
return Ok(ControlFlow::Continue(()));
|
||||
}
|
||||
};
|
||||
for original in original_strings {
|
||||
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
|
||||
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
|
||||
Some(FacetGroupValue { bitmap, .. }) => bitmap,
|
||||
None => {
|
||||
error!("the facet value is missing from the facet database: {key:?}");
|
||||
return Ok(ControlFlow::Continue(()));
|
||||
}
|
||||
};
|
||||
let count = search_candidates.intersection_len(&docids);
|
||||
if count != 0 {
|
||||
let value = self
|
||||
.one_original_value_of(fid, &original, docids.min().unwrap())?
|
||||
.unwrap_or_else(|| query.to_string());
|
||||
if results.insert(FacetValueHit { value, count }).is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ControlFlow::Continue(()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, PartialEq)]
|
||||
pub struct FacetValueHit {
|
||||
/// The original facet value
|
||||
pub value: String,
|
||||
/// The number of documents associated to this facet
|
||||
pub count: u64,
|
||||
}
|
||||
|
||||
impl PartialOrd for FacetValueHit {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for FacetValueHit {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value))
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for FacetValueHit {}
|
||||
|
||||
/// A wrapper type that collects the best facet values by
|
||||
/// lexicographic or number of associated values.
|
||||
enum ValuesCollection {
|
||||
/// Keeps the top values according to the lexicographic order.
|
||||
Lexicographic { max: usize, content: Vec<FacetValueHit> },
|
||||
/// Keeps the top values according to the number of values associated to them.
|
||||
///
|
||||
/// Note that it is a max heap and we need to move the smallest counts
|
||||
/// at the top to be able to pop them when we reach the max_values limit.
|
||||
Count { max: usize, content: BinaryHeap<Reverse<FacetValueHit>> },
|
||||
}
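// Minimal sketch (illustrative, not part of the module's API): wrapping
// entries in `Reverse` turns `BinaryHeap`'s max-heap into a min-heap, so
// `peek()` exposes the smallest count -- the entry to evict once the
// collection is full.
#[cfg(test)]
#[test]
fn reverse_heap_peeks_smallest() {
    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    let mut heap = BinaryHeap::new();
    heap.push(Reverse(3u64));
    heap.push(Reverse(1));
    heap.push(Reverse(2));
    assert_eq!(heap.peek(), Some(&Reverse(1))); // smallest count on top
}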
|
||||
|
||||
impl ValuesCollection {
|
||||
pub fn by_lexicographic(max: usize) -> Self {
|
||||
ValuesCollection::Lexicographic { max, content: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn by_count(max: usize) -> Self {
|
||||
ValuesCollection::Count { max, content: BinaryHeap::new() }
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> {
|
||||
match self {
|
||||
ValuesCollection::Lexicographic { max, content } => {
|
||||
if content.len() < *max {
|
||||
content.push(value);
|
||||
if content.len() < *max {
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
ControlFlow::Break(())
|
||||
}
|
||||
ValuesCollection::Count { max, content } => {
|
||||
if content.len() == *max {
|
||||
// Peeking gives us the worst value in the list as
|
||||
// this is a max-heap and we reversed it.
|
||||
let Some(mut peek) = content.peek_mut() else { return ControlFlow::Break(()) };
|
||||
if peek.0.count <= value.count {
|
||||
// Replace the current worst value in the heap
|
||||
// with the new one we received that is better.
|
||||
*peek = Reverse(value);
|
||||
}
|
||||
} else {
|
||||
content.push(Reverse(value));
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the list of facet values in descending order of, either,
|
||||
/// count or lexicographic order of the value depending on the type.
|
||||
pub fn into_sorted_vec(self) -> Vec<FacetValueHit> {
|
||||
match self {
|
||||
ValuesCollection::Lexicographic { content, .. } => content.into_iter().collect(),
|
||||
ValuesCollection::Count { content, .. } => {
|
||||
// Convert the heap into a vec of hits by removing the Reverse wrapper.
// Hits come out in the right order: `into_sorted_vec` sorts the `Reverse`
// wrappers in ascending order, which is descending order of the hits.
|
||||
content.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
    let options = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);

    // Use the locale directly when exactly one is provided; only run language
    // detection when several locales are explicitly provided.
    let language = match locales {
        Some(&[language]) => Some(language),
        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
        _ => None,
    };

    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
        script: detection.script(),
        language,
        ..Default::default()
    };

    token.normalize(&options).lemma.into_owned()
}
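// Usage sketch (hypothetical input; the exact output depends on charabia's
// lossy normalizer, which typically lowercases and strips diacritics):
//
// let normalized = normalize_facet_string("Café", None);
// // expected to yield something like "cafe" (assumption)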
|
|
@ -0,0 +1,260 @@
|
|||
---
|
||||
source: milli/src/search/facet/facet_distribution_iter.rs
|
||||
---
|
||||
0: 1
|
||||
1: 1
|
||||
2: 1
|
||||
3: 1
|
||||
4: 1
|
||||
5: 1
|
||||
6: 1
|
||||
7: 1
|
||||
8: 1
|
||||
9: 1
|
||||
10: 1
|
||||
11: 1
|
||||
12: 1
|
||||
13: 1
|
||||
14: 1
|
||||
15: 1
|
||||
16: 1
|
||||
17: 1
|
||||
18: 1
|
||||
19: 1
|
||||
20: 1
|
||||
21: 1
|
||||
22: 1
|
||||
23: 1
|
||||
24: 1
|
||||
25: 1
|
||||
26: 1
|
||||
27: 1
|
||||
28: 1
|
||||
29: 1
|
||||
30: 1
|
||||
31: 1
|
||||
32: 1
|
||||
33: 1
|
||||
34: 1
|
||||
35: 1
|
||||
36: 1
|
||||
37: 1
|
||||
38: 1
|
||||
39: 1
|
||||
40: 1
|
||||
41: 1
|
||||
42: 1
|
||||
43: 1
|
||||
44: 1
|
||||
45: 1
|
||||
46: 1
|
||||
47: 1
|
||||
48: 1
|
||||
49: 1
|
||||
50: 1
|
||||
51: 1
|
||||
52: 1
|
||||
53: 1
|
||||
54: 1
|
||||
55: 1
|
||||
56: 1
|
||||
57: 1
|
||||
58: 1
|
||||
59: 1
|
||||
60: 1
|
||||
61: 1
|
||||
62: 1
|
||||
63: 1
|
||||
64: 1
|
||||
65: 1
|
||||
66: 1
|
||||
67: 1
|
||||
68: 1
|
||||
69: 1
|
||||
70: 1
|
||||
71: 1
|
||||
72: 1
|
||||
73: 1
|
||||
74: 1
|
||||
75: 1
|
||||
76: 1
|
||||
77: 1
|
||||
78: 1
|
||||
79: 1
|
||||
80: 1
|
||||
81: 1
|
||||
82: 1
|
||||
83: 1
|
||||
84: 1
|
||||
85: 1
|
||||
86: 1
|
||||
87: 1
|
||||
88: 1
|
||||
89: 1
|
||||
90: 1
|
||||
91: 1
|
||||
92: 1
|
||||
93: 1
|
||||
94: 1
|
||||
95: 1
|
||||
96: 1
|
||||
97: 1
|
||||
98: 1
|
||||
99: 1
|
||||
100: 1
|
||||
101: 1
|
||||
102: 1
|
||||
103: 1
|
||||
104: 1
|
||||
105: 1
|
||||
106: 1
|
||||
107: 1
|
||||
108: 1
|
||||
109: 1
|
||||
110: 1
|
||||
111: 1
|
||||
112: 1
|
||||
113: 1
|
||||
114: 1
|
||||
115: 1
|
||||
116: 1
|
||||
117: 1
|
||||
118: 1
|
||||
119: 1
|
||||
120: 1
|
||||
121: 1
|
||||
122: 1
|
||||
123: 1
|
||||
124: 1
|
||||
125: 1
|
||||
126: 1
|
||||
127: 1
|
||||
128: 1
|
||||
129: 1
|
||||
130: 1
|
||||
131: 1
|
||||
132: 1
|
||||
133: 1
|
||||
134: 1
|
||||
135: 1
|
||||
136: 1
|
||||
137: 1
|
||||
138: 1
|
||||
139: 1
|
||||
140: 1
|
||||
141: 1
|
||||
142: 1
|
||||
143: 1
|
||||
144: 1
|
||||
145: 1
|
||||
146: 1
|
||||
147: 1
|
||||
148: 1
|
||||
149: 1
|
||||
150: 1
|
||||
151: 1
|
||||
152: 1
|
||||
153: 1
|
||||
154: 1
|
||||
155: 1
|
||||
156: 1
|
||||
157: 1
|
||||
158: 1
|
||||
159: 1
|
||||
160: 1
|
||||
161: 1
|
||||
162: 1
|
||||
163: 1
|
||||
164: 1
|
||||
165: 1
|
||||
166: 1
|
||||
167: 1
|
||||
168: 1
|
||||
169: 1
|
||||
170: 1
|
||||
171: 1
|
||||
172: 1
|
||||
173: 1
|
||||
174: 1
|
||||
175: 1
|
||||
176: 1
|
||||
177: 1
|
||||
178: 1
|
||||
179: 1
|
||||
180: 1
|
||||
181: 1
|
||||
182: 1
|
||||
183: 1
|
||||
184: 1
|
||||
185: 1
|
||||
186: 1
|
||||
187: 1
|
||||
188: 1
|
||||
189: 1
|
||||
190: 1
|
||||
191: 1
|
||||
192: 1
|
||||
193: 1
|
||||
194: 1
|
||||
195: 1
|
||||
196: 1
|
||||
197: 1
|
||||
198: 1
|
||||
199: 1
|
||||
200: 1
|
||||
201: 1
|
||||
202: 1
|
||||
203: 1
|
||||
204: 1
|
||||
205: 1
|
||||
206: 1
|
||||
207: 1
|
||||
208: 1
|
||||
209: 1
|
||||
210: 1
|
||||
211: 1
|
||||
212: 1
|
||||
213: 1
|
||||
214: 1
|
||||
215: 1
|
||||
216: 1
|
||||
217: 1
|
||||
218: 1
|
||||
219: 1
|
||||
220: 1
|
||||
221: 1
|
||||
222: 1
|
||||
223: 1
|
||||
224: 1
|
||||
225: 1
|
||||
226: 1
|
||||
227: 1
|
||||
228: 1
|
||||
229: 1
|
||||
230: 1
|
||||
231: 1
|
||||
232: 1
|
||||
233: 1
|
||||
234: 1
|
||||
235: 1
|
||||
236: 1
|
||||
237: 1
|
||||
238: 1
|
||||
239: 1
|
||||
240: 1
|
||||
241: 1
|
||||
242: 1
|
||||
243: 1
|
||||
244: 1
|
||||
245: 1
|
||||
246: 1
|
||||
247: 1
|
||||
248: 1
|
||||
249: 1
|
||||
250: 1
|
||||
251: 1
|
||||
252: 1
|
||||
253: 1
|
||||
254: 1
|
||||
255: 1
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
---
|
||||
source: milli/src/search/facet/facet_distribution_iter.rs
|
||||
---
|
||||
3: 2
|
||||
5: 2
|
||||
6: 2
|
||||
9: 2
|
||||
10: 2
|
||||
11: 2
|
||||
14: 2
|
||||
18: 2
|
||||
19: 2
|
||||
24: 2
|
||||
26: 2
|
||||
28: 2
|
||||
29: 2
|
||||
32: 2
|
||||
33: 2
|
||||
35: 2
|
||||
36: 2
|
||||
37: 2
|
||||
38: 2
|
||||
39: 2
|
||||
41: 2
|
||||
46: 2
|
||||
47: 2
|
||||
49: 2
|
||||
52: 2
|
||||
53: 2
|
||||
55: 2
|
||||
59: 2
|
||||
61: 2
|
||||
64: 2
|
||||
68: 2
|
||||
71: 2
|
||||
74: 2
|
||||
75: 2
|
||||
76: 2
|
||||
81: 2
|
||||
83: 2
|
||||
85: 2
|
||||
86: 2
|
||||
88: 2
|
||||
90: 2
|
||||
91: 2
|
||||
92: 2
|
||||
98: 2
|
||||
99: 2
|
||||
101: 2
|
||||
102: 2
|
||||
103: 2
|
||||
107: 2
|
||||
111: 2
|
||||
115: 2
|
||||
119: 2
|
||||
123: 2
|
||||
124: 2
|
||||
130: 2
|
||||
131: 2
|
||||
133: 2
|
||||
135: 2
|
||||
136: 2
|
||||
137: 2
|
||||
139: 2
|
||||
141: 2
|
||||
143: 2
|
||||
144: 2
|
||||
147: 2
|
||||
150: 2
|
||||
156: 1
|
||||
158: 1
|
||||
160: 1
|
||||
162: 1
|
||||
163: 1
|
||||
164: 1
|
||||
167: 1
|
||||
169: 1
|
||||
173: 1
|
||||
177: 1
|
||||
178: 1
|
||||
179: 1
|
||||
181: 1
|
||||
182: 1
|
||||
186: 1
|
||||
189: 1
|
||||
192: 1
|
||||
193: 1
|
||||
195: 1
|
||||
197: 1
|
||||
205: 1
|
||||
206: 1
|
||||
207: 1
|
||||
208: 1
|
||||
209: 1
|
||||
210: 1
|
||||
216: 1
|
||||
219: 1
|
||||
220: 1
|
||||
223: 1
|
||||
226: 1
|
||||
235: 1
|
||||
236: 1
|
||||
238: 1
|
||||
243: 1
|
||||
|
|
@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
0: 1
1: 1
2: 1
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 1
11: 1
12: 1
13: 1
14: 1
15: 1
16: 1
17: 1
18: 1
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 1
37: 1
38: 1
39: 1
40: 1
41: 1
42: 1
43: 1
44: 1
45: 1
46: 1
47: 1
48: 1
49: 1
50: 1
51: 1
52: 1
53: 1
54: 1
55: 1
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 1
64: 1
65: 1
66: 1
67: 1
68: 1
69: 1
70: 1
71: 1
72: 1
73: 1
74: 1
75: 1
76: 1
77: 1
78: 1
79: 1
80: 1
81: 1
82: 1
83: 1
84: 1
85: 1
86: 1
87: 1
88: 1
89: 1
90: 1
91: 1
92: 1
93: 1
94: 1
95: 1
96: 1
97: 1
98: 1
99: 1
@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
3: 2
5: 2
6: 2
9: 2
10: 2
11: 2
14: 2
18: 2
19: 2
24: 2
26: 2
28: 2
29: 2
32: 2
33: 2
35: 2
36: 2
37: 2
38: 2
39: 2
41: 2
46: 2
47: 2
49: 2
52: 2
53: 2
55: 2
59: 2
61: 2
64: 2
68: 2
71: 2
74: 2
75: 2
76: 2
81: 2
83: 2
85: 2
86: 2
88: 2
90: 2
91: 2
92: 2
98: 2
99: 2
101: 2
102: 2
103: 2
107: 2
111: 2
115: 2
119: 2
123: 2
124: 2
130: 2
131: 2
133: 2
135: 2
136: 2
137: 2
139: 2
141: 2
143: 2
144: 2
147: 2
150: 2
156: 1
158: 1
160: 1
162: 1
163: 1
164: 1
167: 1
169: 1
173: 1
177: 1
178: 1
179: 1
181: 1
182: 1
186: 1
189: 1
192: 1
193: 1
195: 1
197: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
216: 1
219: 1
220: 1
223: 1
226: 1
235: 1
236: 1
238: 1
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63
@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []
@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ad8fc873747aaf1d3590e7ccab735985
Some files were not shown because too many files have changed in this diff