Merge branch 'main' into indexer-edition-2024

ManyTheFish 2024-11-06 15:19:18 +01:00
commit 10feeb88f2
1122 changed files with 6265 additions and 5265 deletions


@@ -0,0 +1,846 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Display;
use std::ops::ControlFlow;
use std::{fmt, mem};
use heed::types::Bytes;
use heed::BytesDecode;
use indexmap::IndexMap;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
};
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter::{
count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
use crate::{FieldId, Index, Result};
/// The default number of values per facet that will
/// be fetched from the key-value store.
pub const DEFAULT_VALUES_PER_FACET: usize = 100;
/// Threshold on the number of candidates that makes
/// the system choose between one algorithm and another.
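///
/// Below this threshold, and when ordering lexicographically, the candidate
/// documents are scanned one by one; above it, the facet level trees are
/// traversed instead (see `FacetDistribution::facet_values`).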
const CANDIDATES_THRESHOLD: u64 = 3000;
/// How should we fetch the facets?
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum OrderBy {
/// By lexicographic order...
#[default]
Lexicographic,
/// Or by number of docids in common?
Count,
}
impl Display for OrderBy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
OrderBy::Lexicographic => f.write_str("alphabetically"),
OrderBy::Count => f.write_str("by count"),
}
}
}
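/// Computes the facet distribution, i.e. the number of candidate documents
/// associated with each value of the requested facets.
///
/// A sketch of a typical call, assuming an open index and read transaction
/// (see the tests below for complete setups):
///
/// ```ignore
/// let distribution = FacetDistribution::new(&rtxn, &index)
///     .facets(std::iter::once(("colour", OrderBy::Count)))
///     .candidates(candidates)
///     .max_values_per_facet(10)
///     .execute()?;
/// ```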
pub struct FacetDistribution<'a> {
facets: Option<HashMap<String, OrderBy>>,
candidates: Option<RoaringBitmap>,
max_values_per_facet: usize,
default_order_by: OrderBy,
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
}
impl<'a> FacetDistribution<'a> {
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> FacetDistribution<'a> {
FacetDistribution {
facets: None,
candidates: None,
max_values_per_facet: DEFAULT_VALUES_PER_FACET,
default_order_by: OrderBy::default(),
rtxn,
index,
}
}
pub fn facets<I: IntoIterator<Item = (A, OrderBy)>, A: AsRef<str>>(
&mut self,
names_ordered_by: I,
) -> &mut Self {
self.facets = Some(
names_ordered_by
.into_iter()
.map(|(name, order_by)| (name.as_ref().to_string(), order_by))
.collect(),
);
self
}
pub fn max_values_per_facet(&mut self, max: usize) -> &mut Self {
self.max_values_per_facet = max;
self
}
pub fn default_order_by(&mut self, order_by: OrderBy) -> &mut Self {
self.default_order_by = order_by;
self
}
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
self.candidates = Some(candidates);
self
}
/// There is a small number of candidates OR we ask for facet string values, so we
/// decide to iterate over the facet values of each candidate, one by one.
fn facet_distribution_from_documents(
&self,
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
distribution: &mut IndexMap<String, u64>,
) -> heed::Result<()> {
match facet_type {
FacetType::Number => {
let mut lexicographic_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let db = self.index.field_id_docid_facet_f64s;
for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
.remap_key_type::<Bytes>()
.prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<FieldDocIdFacetF64Codec>();
for result in iter {
let ((_, _, value), ()) = result?;
*lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
}
}
distribution.extend(
lexicographic_distribution
.into_iter()
.take(self.max_values_per_facet.saturating_sub(distribution.len())),
);
}
FacetType::String => {
let mut normalized_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let db = self.index.field_id_docid_facet_strings;
for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
.remap_key_type::<Bytes>()
.prefix_iter(self.rtxn, &key_buffer)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
for result in iter {
let ((_, _, normalized_value), original_value) = result?;
let (_, count) = normalized_distribution
.entry(normalized_value)
.or_insert_with(|| (original_value, 0));
*count += 1;
// We'd like to break here once we have enough facet values, but we are collecting
// them by increasing docid, so higher-ranked facets could appear in later docids.
}
}
let iter = normalized_distribution
.into_iter()
.take(self.max_values_per_facet.saturating_sub(distribution.len()))
.map(|(_normalized, (original, count))| (original.to_string(), count));
distribution.extend(iter);
}
}
Ok(())
}
/// There are too many documents, so we use the facet levels to move through
/// the facet values and find the associated candidates and values.
fn facet_numbers_distribution_from_facet_levels(
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut IndexMap<String, u64>,
) -> heed::Result<()> {
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
self.rtxn,
self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
candidates,
|facet_key, nbr_docids, _| {
let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
distribution.insert(facet_key.to_string(), nbr_docids);
if distribution.len() == self.max_values_per_facet {
Ok(ControlFlow::Break(()))
} else {
Ok(ControlFlow::Continue(()))
}
},
)
}
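/// Same as [`Self::facet_numbers_distribution_from_facet_levels`], but over
/// the facet strings; the original (non-normalized) string of each facet
/// value is fetched back from `field_id_docid_facet_strings`.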
fn facet_strings_distribution_from_facet_levels(
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut IndexMap<String, u64>,
) -> heed::Result<()> {
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
self.rtxn,
self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
candidates,
|facet_key, nbr_docids, any_docid| {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
let original_string = self
.index
.field_id_docid_facet_strings
.get(self.rtxn, &key)?
.unwrap()
.to_owned();
distribution.insert(original_string, nbr_docids);
if distribution.len() == self.max_values_per_facet {
Ok(ControlFlow::Break(()))
} else {
Ok(ControlFlow::Continue(()))
}
},
)
}
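/// Compute the distribution of a single field, dispatching between the
/// per-document scan and the facet-level iterators depending on the number
/// of candidates and the requested order.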
fn facet_values(
&self,
field_id: FieldId,
order_by: OrderBy,
) -> heed::Result<IndexMap<String, u64>> {
use FacetType::{Number, String};
let mut distribution = IndexMap::new();
match (order_by, &self.candidates) {
(OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
}
_ => {
let universe;
let candidates = match &self.candidates {
Some(cnd) => cnd,
None => {
universe = self.index.documents_ids(self.rtxn)?;
&universe
}
};
self.facet_numbers_distribution_from_facet_levels(
field_id,
candidates,
order_by,
&mut distribution,
)?;
self.facet_strings_distribution_from_facet_levels(
field_id,
candidates,
order_by,
&mut distribution,
)?;
}
};
Ok(distribution)
}
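/// Compute the `(min, max)` numeric facet values of each requested field over
/// the current candidates. Returns an empty map when no candidates were set.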
pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
let candidates = if let Some(candidates) = self.candidates.clone() {
candidates
} else {
return Ok(Default::default());
};
let fields = match &self.facets {
Some(facets) => {
let invalid_fields: HashSet<_> = facets
.iter()
.map(|(name, _)| name)
.filter(|facet| !crate::is_faceted(facet, &filterable_fields))
.collect();
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {
facets.iter().map(|(name, _)| name).cloned().collect()
}
}
None => filterable_fields,
};
let mut distribution = BTreeMap::new();
for (fid, name) in fields_ids_map.iter() {
if crate::is_faceted(name, &fields) {
let min_value = if let Some(min_value) = crate::search::facet::facet_min_value(
self.index,
self.rtxn,
fid,
candidates.clone(),
)? {
min_value
} else {
continue;
};
let max_value = if let Some(max_value) = crate::search::facet::facet_max_value(
self.index,
self.rtxn,
fid,
candidates.clone(),
)? {
max_value
} else {
continue;
};
distribution.insert(name.to_string(), (min_value, max_value));
}
}
Ok(distribution)
}
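/// Compute the facet distribution of each requested field, or of every
/// filterable field when none were requested.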
pub fn execute(&self) -> Result<BTreeMap<String, IndexMap<String, u64>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
let fields = match self.facets {
Some(ref facets) => {
let invalid_fields: HashSet<_> = facets
.iter()
.map(|(name, _)| name)
.filter(|facet| !crate::is_faceted(facet, &filterable_fields))
.collect();
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {
facets.iter().map(|(name, _)| name).cloned().collect()
}
}
None => filterable_fields,
};
let mut distribution = BTreeMap::new();
for (fid, name) in fields_ids_map.iter() {
if crate::is_faceted(name, &fields) {
let order_by = self
.facets
.as_ref()
.and_then(|facets| facets.get(name).copied())
.unwrap_or(self.default_order_by);
let values = self.facet_values(fid, order_by)?;
distribution.insert(name.to_string(), values);
}
}
Ok(distribution)
}
}
impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let FacetDistribution {
facets,
candidates,
max_values_per_facet,
default_order_by,
rtxn: _,
index: _,
} = self;
f.debug_struct("FacetDistribution")
.field("facets", facets)
.field("candidates", candidates)
.field("max_values_per_facet", max_values_per_facet)
.field("default_order_by", default_order_by)
.finish()
}
}
#[cfg(test)]
mod tests {
use std::iter;
use big_s::S;
use maplit::hashset;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::{milli_snap, FacetDistribution, OrderBy};
#[test]
fn few_candidates_few_facet_values() {
// All the tests here avoid using the code in `facet_distribution_iter` because there aren't
// enough candidates.
let mut index = TempIndex::new();
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let documents = documents!([
{ "colour": "Blue" },
{ "colour": " blue" },
{ "colour": "RED" }
]);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates([0, 1, 2].iter().copied().collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates([1, 2].iter().copied().collect())
.execute()
.unwrap();
// I think it would be fine if " blue" was "Blue" instead.
// We just need to get any non-normalised string, even if it's not in
// the candidates.
milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates([2].iter().copied().collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates([0, 1, 2].iter().copied().collect())
.max_values_per_facet(1)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::Count)))
.candidates([0, 1, 2].iter().copied().collect())
.max_values_per_facet(1)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);
}
#[test]
fn many_candidates_few_facet_values() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"];
let mut documents = vec![];
for i in 0..10_000 {
let document = serde_json::json!({
"colour": facet_values[i % 5],
})
.as_object()
.unwrap()
.clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.max_values_per_facet(1)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..10_000).collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..5_000).collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..5_000).collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..5_000).collect())
.max_values_per_facet(1)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::Count)))
.candidates((0..5_000).collect())
.max_values_per_facet(1)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Red": 3000}}"###);
}
#[test]
fn many_candidates_many_facet_values() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..10_000 {
let document = serde_json::json!({
"colour": facet_values[i % 1000],
})
.as_object()
.unwrap()
.clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.max_values_per_facet(2)
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..10_000).collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..5_000).collect())
.execute()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992");
}
#[test]
fn facet_stats() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
let document = serde_json::json!({
"colour": facet_values[i % 1000],
})
.as_object()
.unwrap()
.clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates", @"{}");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..1000).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((217..777).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
}
#[test]
fn facet_stats_array() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
let document = serde_json::json!({
"colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
})
.as_object()
.unwrap()
.clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates", @"{}");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..1000).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1999.0)}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((217..777).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 1776.0)}"###);
}
#[test]
fn facet_stats_mixed_array() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
let document = serde_json::json!({
"colour": [facet_values[i % 1000], format!("{}", facet_values[i % 1000] + 1000)],
})
.as_object()
.unwrap()
.clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates", @"{}");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..1000).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((217..777).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
}
#[test]
fn facet_mixed_values() {
let mut index = TempIndex::new_with_map_size(4096 * 10_000);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
let document = if i % 2 == 0 {
serde_json::json!({
"colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
})
} else {
serde_json::json!({
"colour": format!("{}", facet_values[i % 1000] + 10000),
})
};
let document = document.as_object().unwrap().clone();
documents.push(document);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
let txn = index.read_txn().unwrap();
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "no_candidates", @"{}");
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((0..1000).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1998.0)}"###);
let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::default())))
.candidates((217..777).collect())
.compute_stats()
.unwrap();
milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (218.0, 1776.0)}"###);
}
}


@@ -0,0 +1,301 @@
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::ControlFlow;
use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId};
/// Call the given closure on the facet distribution of the candidate documents.
///
/// The arguments to the closure are:
/// - the facet value, as a byte slice
/// - the number of documents among the candidates that contain this facet value
/// - the id of a document which contains the facet value. Note that this document
/// is not necessarily from the list of candidates; it is simply *any* document which
/// contains this facet value.
///
/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
/// keep iterating over the different facet values or stop.
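///
/// A sketch of a callback collecting the first ten facet counts, assuming an
/// open `rtxn`, a facet `db`, and a `candidates` bitmap (see the tests below
/// for complete setups):
///
/// ```ignore
/// let mut results = Vec::new();
/// lexicographically_iterate_over_facet_distribution(
///     &rtxn,
///     db,
///     0, // field id
///     &candidates,
///     |facet_value, count, _any_docid| {
///         results.push((facet_value.to_vec(), count));
///         Ok(if results.len() < 10 { ControlFlow::Continue(()) } else { ControlFlow::Break(()) })
///     },
/// )?;
/// ```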
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
callback: CB,
) -> Result<()>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
Ok(())
} else {
Ok(())
}
}
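/// Call the given closure on the facet distribution of the candidate documents,
/// visiting the facet values by decreasing number of matching candidates rather
/// than in lexicographic order.
///
/// The closure contract is the same as for
/// [`lexicographically_iterate_over_facet_distribution`].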
pub fn count_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
mut callback: CB,
) -> Result<()>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
/// # Important
/// The order of the fields determines the order in which the facet values will be returned.
/// This struct is inserted in a BinaryHeap and popped later on.
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct LevelEntry<'t> {
/// The number of candidates in this entry.
count: u64,
/// The key level of the entry.
level: Reverse<u8>,
/// The left bound key.
left_bound: &'t [u8],
/// The number of keys we must look for after `left_bound`.
group_size: u8,
/// Any docid in the set of matching documents. Used to find the original facet string.
any_docid: u32,
}
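// Because `Ord` is derived, entries compare field by field: a larger `count`
// wins, and for equal counts the smaller `level` (thanks to `Reverse`) pops
// first from the max-heap, so resolved level-0 values are yielded before
// deeper groups with the same count are expanded.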
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
for el in db.range(rtxn, &(&starting_key..))?.take(usize::MAX) {
let (key, value) = el?;
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
});
}
}
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
{
if let Reverse(0) = level {
match (callback)(left_bound, count, any_docid)? {
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(()),
}
} else {
let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound };
for el in db.range(rtxn, &(&starting_key..))?.take(group_size as usize) {
let (key, value) = el?;
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
});
}
}
}
}
}
Ok(())
}
/// Iterate over the facet values in lexicographic order.
struct LexicographicFacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16,
callback: CB,
}
impl<'t, CB> LexicographicFacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
fn iterate_level_0(
&mut self,
candidates: &RoaringBitmap,
starting_bound: &'t [u8],
group_size: usize,
) -> Result<ControlFlow<()>> {
let starting_key =
FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
for el in iter {
let (key, value) = el?;
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
{
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
}
}
}
Ok(ControlFlow::Continue(()))
}
fn iterate(
&mut self,
candidates: &RoaringBitmap,
level: u8,
starting_bound: &'t [u8],
group_size: usize,
) -> Result<ControlFlow<()>> {
if level == 0 {
return self.iterate_level_0(candidates, starting_bound, group_size);
}
let starting_key =
FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
let iter = self.db.range(self.rtxn, &(&starting_key..))?.take(group_size);
for el in iter {
let (key, value) = el?;
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let cf = self.iterate(
&docids_in_common,
level - 1,
key.left_bound,
value.size as usize,
)?;
match cf {
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
}
}
}
Ok(ControlFlow::Continue(()))
}
}
#[cfg(test)]
mod tests {
use std::ops::ControlFlow;
use heed::BytesDecode;
use roaring::RoaringBitmap;
use super::lexicographically_iterate_over_facet_distribution;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::milli_snap;
use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
#[test]
fn filter_distribution_all() {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
lexicographically_iterate_over_facet_distribution(
&txn,
index.content,
0,
&candidates,
|facet, count, _| {
let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
results.push_str(&format!("{facet}: {count}\n"));
Ok(ControlFlow::Continue(()))
},
)
.unwrap();
milli_snap!(results, i);
txn.commit().unwrap();
}
}
#[test]
fn filter_distribution_all_stop_early() {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
let mut nbr_facets = 0;
lexicographically_iterate_over_facet_distribution(
&txn,
index.content,
0,
&candidates,
|facet, count, _| {
let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
if nbr_facets == 100 {
Ok(ControlFlow::Break(()))
} else {
nbr_facets += 1;
results.push_str(&format!("{facet}: {count}\n"));
Ok(ControlFlow::Continue(()))
}
},
)
.unwrap();
milli_snap!(results, i);
txn.commit().unwrap();
}
}
}


@@ -0,0 +1,688 @@
use std::ops::{Bound, RangeBounds};
use heed::BytesEncode;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, Result};
/// Find all the document ids for which the given field contains a value contained within
/// the two bounds.
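///
/// A sketch of a bounded lookup, assuming an open `rtxn` and a facet `db`
/// keyed by `OrderedF64Codec` (see the tests below for complete setups):
///
/// ```ignore
/// let mut docids = RoaringBitmap::new();
/// find_docids_of_facet_within_bounds::<OrderedF64Codec>(
///     &rtxn,
///     db,
///     0, // field id
///     &Bound::Included(0.0),
///     &Bound::Excluded(10.0),
///     None, // no universe restriction
///     &mut docids,
/// )?;
/// ```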
pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>,
field_id: u16,
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
universe: Option<&RoaringBitmap>,
docids: &mut RoaringBitmap,
) -> Result<()>
where
BoundCodec: for<'a> BytesEncode<'a>,
for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized,
{
let inner;
let left = match left {
Bound::Included(left) => {
inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
Bound::Included(inner.as_ref())
}
Bound::Excluded(left) => {
inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
};
let inner;
let right = match right {
Bound::Included(right) => {
inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
Bound::Included(inner.as_ref())
}
Bound::Excluded(right) => {
inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
};
let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids };
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(starting_left_bound) =
get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
{
let rightmost_bound =
Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
let group_size = usize::MAX;
f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
Ok(())
} else {
Ok(())
}
}
/// Fetch the document ids that have a facet with a value between the two given bounds
struct FacetRangeSearch<'t, 'b, 'bitmap> {
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16,
left: Bound<&'b [u8]>,
right: Bound<&'b [u8]>,
/// The subset of document ids that are useful for this search.
/// Great performance optimizations can be achieved by only fetching values matching this subset.
universe: Option<&'bitmap RoaringBitmap>,
docids: &'bitmap mut RoaringBitmap,
}
impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
let left_key =
FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);
for el in iter {
let (key, value) = el?;
// the right side of the iter range is unbounded, so we need to make sure that we are not iterating
// on the next field id
if key.field_id != self.field_id {
return Ok(());
}
let should_skip = {
match self.left {
Bound::Included(left) => left > key.left_bound,
Bound::Excluded(left) => left >= key.left_bound,
Bound::Unbounded => false,
}
};
if should_skip {
continue;
}
let should_stop = {
match self.right {
Bound::Included(right) => right < key.left_bound,
Bound::Excluded(right) => right <= key.left_bound,
Bound::Unbounded => false,
}
};
if should_stop {
break;
}
if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
};
}
}
Ok(())
}
/// Recursive part of the algorithm for level > 0.
///
/// It works by visiting a slice of a level and checking whether the range associated
/// with each visited element is contained within the bounds.
///
/// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating
/// 2. If the element's range is fully contained by the bounds, then all of its docids are added to
/// the roaring bitmap.
/// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively
/// on the children of the element from the level below.
/// 4. If the element's range is greater than the right bound, we do nothing and stop iterating.
/// Note that the right bound is found through either the `left_bound` of the *next* element,
/// or from the `rightmost_bound` argument
///
/// ## Arguments
/// - `level`: the level being visited
/// - `starting_left_bound`: the left_bound of the first element to visit
/// - `rightmost_bound`: the right bound of the last element that should be visited
/// - `group_size`: the number of elements that should be visited
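///
/// For instance (illustrative values), with the bounds `Included(5)` and
/// `Excluded(20)`, and a level-1 slice whose elements cover `[0, 10)`,
/// `[10, 20)` and `[20, 30)`: the first element only partially overlaps the
/// bounds, so we recurse into its children; the second is fully contained, so
/// its docids are added wholesale; and the third starts at the right bound,
/// so we stop.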
fn run(
&mut self,
level: u8,
starting_left_bound: &'t [u8],
rightmost_bound: Bound<&'t [u8]>,
group_size: usize,
) -> Result<()> {
if level == 0 {
return self.run_level_0(starting_left_bound, group_size);
}
let left_key =
FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
let mut iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);
// We iterate over the range while keeping in memory the previous value
let (mut previous_key, mut previous_value) = iter.next().unwrap()?;
for el in iter {
let (next_key, next_value) = el?;
// the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX),
// so we need to make sure that we are not iterating on the next field id
if next_key.field_id != self.field_id {
break;
}
// now, do we skip, stop, or visit?
let should_skip = {
match self.left {
Bound::Included(left) => left >= next_key.left_bound,
Bound::Excluded(left) => left >= next_key.left_bound,
Bound::Unbounded => false,
}
};
if should_skip {
previous_key = next_key;
previous_value = next_value;
continue;
}
// should we stop?
// We should if the search range doesn't include any
// element from the previous key or its successors
let should_stop = {
match self.right {
Bound::Included(right) => right < previous_key.left_bound,
Bound::Excluded(right) => right <= previous_key.left_bound,
Bound::Unbounded => false,
}
};
if should_stop {
return Ok(());
}
// should we take the whole thing, without recursing down?
let should_take_whole_group = {
let left_condition = match self.left {
Bound::Included(left) => previous_key.left_bound >= left,
Bound::Excluded(left) => previous_key.left_bound > left,
Bound::Unbounded => true,
};
let right_condition = match self.right {
Bound::Included(right) => next_key.left_bound <= right,
Bound::Excluded(right) => next_key.left_bound <= right,
Bound::Unbounded => true,
};
left_condition && right_condition
};
if should_take_whole_group {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
previous_key = next_key;
previous_value = next_value;
continue;
}
// from here, we should visit the children of the previous element and
// call the function recursively
let level = level - 1;
let starting_left_bound = previous_key.left_bound;
let rightmost_bound = Bound::Excluded(next_key.left_bound);
let group_size = previous_value.size as usize;
self.run(level, starting_left_bound, rightmost_bound, group_size)?;
previous_key = next_key;
previous_value = next_value;
}
// previous_key/previous_value are the last element's key/value
// now, do we skip, stop, or visit?
let should_skip = {
match (self.left, rightmost_bound) {
(Bound::Included(left), Bound::Included(right)) => left > right,
(Bound::Included(left), Bound::Excluded(right)) => left >= right,
(Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => {
left >= right
}
(Bound::Unbounded, _) => false,
(_, Bound::Unbounded) => false, // should never run?
}
};
if should_skip {
return Ok(());
}
// should we stop?
// We should if the search range doesn't include any
// element from the previous key or its successors
let should_stop = {
match self.right {
Bound::Included(right) => right < previous_key.left_bound,
Bound::Excluded(right) => right <= previous_key.left_bound,
Bound::Unbounded => false,
}
};
if should_stop {
return Ok(());
}
// should we take the whole thing, without recursing down?
let should_take_whole_group = {
let left_condition = match self.left {
Bound::Included(left) => previous_key.left_bound >= left,
Bound::Excluded(left) => previous_key.left_bound > left,
Bound::Unbounded => true,
};
let right_condition = match (self.right, rightmost_bound) {
(Bound::Included(right), Bound::Included(rightmost)) => {
// we need to stay within the bound ..=right
// the element's range goes to ..=rightmost
// so the element fits entirely within the bound if rightmost <= right
rightmost <= right
}
(Bound::Included(right), Bound::Excluded(rightmost)) => {
// we need to stay within the bound ..=right
// the element's range goes to ..rightmost
// so the element fits entirely within the bound if rightmost <= right
rightmost <= right
}
(Bound::Excluded(right), Bound::Included(rightmost)) => {
// we need to stay within the bound ..right
// the element's range goes to ..=rightmost
// so the element fits entirely within the bound if rightmost < right
rightmost < right
}
(Bound::Excluded(right), Bound::Excluded(rightmost)) => {
// we need to stay within the bound ..right
// the element's range goes to ..rightmost
// so the element fits entirely within the bound if rightmost <= right
rightmost <= right
}
(Bound::Unbounded, _) => {
// we need to stay within the bound ..inf
// so the element always fits entirely within the bound
true
}
(_, Bound::Unbounded) => {
// we need to stay within a finite bound
// but the element's range goes to ..inf
// so the element never fits entirely within the bound
false
}
};
left_condition && right_condition
};
if should_take_whole_group {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
} else {
let level = level - 1;
let starting_left_bound = previous_key.left_bound;
let group_size = previous_value.size as usize;
self.run(level, starting_left_bound, rightmost_bound, group_size)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::ops::Bound;
use roaring::RoaringBitmap;
use super::find_docids_of_facet_within_bounds;
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::milli_snap;
use crate::search::facet::tests::{
get_random_looking_index, get_random_looking_index_with_multiple_field_ids,
get_simple_index, get_simple_index_with_multiple_field_ids,
};
use crate::snapshot_tests::display_bitmap;
#[test]
fn random_looking_index_snap() {
let index = get_random_looking_index();
milli_snap!(format!("{index}"), @"3256c76a7c1b768a013e78d5fa6e9ff9");
}
#[test]
fn random_looking_index_with_multiple_field_ids_snap() {
let index = get_random_looking_index_with_multiple_field_ids();
milli_snap!(format!("{index}"), @"c3e5fe06a8f1c404ed4935b32c90a89b");
}
#[test]
fn simple_index_snap() {
let index = get_simple_index();
milli_snap!(format!("{index}"), @"5dbfa134cc44abeb3ab6242fc182e48e");
}
#[test]
fn simple_index_with_multiple_field_ids_snap() {
let index = get_simple_index_with_multiple_field_ids();
milli_snap!(format!("{index}"), @"a4893298218f682bc76357f46777448c");
}
#[test]
fn filter_range_increasing() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
get_random_looking_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let mut results = String::new();
for i in 0..=255 {
let i = i as f64;
let start = Bound::Included(0.);
let end = Bound::Included(i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results.push_str(&format!("0 <= . <= {i} : {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("included_{i}"));
let mut results = String::new();
for i in 0..=255 {
let i = i as f64;
let start = Bound::Excluded(0.);
let end = Bound::Excluded(i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results.push_str(&format!("0 < . < {i} : {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("excluded_{i}"));
txn.commit().unwrap();
}
}
#[test]
fn filter_range_decreasing() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
get_random_looking_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let mut results = String::new();
for i in (0..=255).rev() {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Included(255.);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
results.push_str(&format!("{i} <= . <= 255 : {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("included_{i}"));
let mut results = String::new();
for i in (0..=255).rev() {
let i = i as f64;
let start = Bound::Excluded(i);
let end = Bound::Excluded(255.);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
results.push_str(&format!("{i} < . < 255 : {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("excluded_{i}"));
txn.commit().unwrap();
}
}
#[test]
fn filter_range_pinch() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
get_random_looking_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let mut results = String::new();
for i in (0..=128).rev() {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Included(255. - i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
results.push_str(&format!(
"{i} <= . <= {r} : {docids}\n",
r = 255. - i,
docids = display_bitmap(&docids)
));
}
milli_snap!(results, format!("included_{i}"));
let mut results = String::new();
for i in (0..=128).rev() {
let i = i as f64;
let start = Bound::Excluded(i);
let end = Bound::Excluded(255. - i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
results.push_str(&format!(
"{i} < . < {r} {docids}\n",
r = 255. - i,
docids = display_bitmap(&docids)
));
}
milli_snap!(results, format!("excluded_{i}"));
txn.commit().unwrap();
}
}
#[test]
fn filter_range_unbounded() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
get_random_looking_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let mut results = String::new();
for i in 0..=255 {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Unbounded;
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results.push_str(&format!(">= {i}: {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("start_from_included_{i}"));
let mut results = String::new();
for i in 0..=255 {
let i = i as f64;
let start = Bound::Unbounded;
let end = Bound::Included(i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results.push_str(&format!("<= {i}: {}\n", display_bitmap(&docids)));
}
milli_snap!(results, format!("end_at_included_{i}"));
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&Bound::Unbounded,
&Bound::Unbounded,
None,
&mut docids,
)
.unwrap();
milli_snap!(
&format!("all field_id 0: {}\n", display_bitmap(&docids)),
format!("unbounded_field_id_0_{i}")
);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
1,
&Bound::Unbounded,
&Bound::Unbounded,
None,
&mut docids,
)
.unwrap();
milli_snap!(
&format!("all field_id 1: {}\n", display_bitmap(&docids)),
format!("unbounded_field_id_1_{i}")
);
drop(txn);
}
}
#[test]
fn filter_range_exact() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
get_random_looking_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let mut results_0 = String::new();
let mut results_1 = String::new();
for i in 0..=255 {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Included(i);
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
0,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results_0.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
let mut docids = RoaringBitmap::new();
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
&txn,
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
1,
&start,
&end,
None,
&mut docids,
)
.unwrap();
#[allow(clippy::format_push_string)]
results_1.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
}
milli_snap!(results_0, format!("field_id_0_exact_{i}"));
milli_snap!(results_1, format!("field_id_1_exact_{i}"));
drop(txn);
}
}
}


@@ -0,0 +1,230 @@
use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
/// Return an iterator which iterates over the given candidate documents in
/// ascending order of their facet value for the given field id.
///
/// The documents returned by the iterator are grouped by the facet values that
/// determined their rank. For example, given the documents:
///
/// ```text
/// 0: { "colour": ["blue", "green"] }
/// 1: { "colour": ["blue", "red"] }
/// 2: { "colour": ["orange", "red"] }
/// 3: { "colour": ["green", "red"] }
/// 4: { "colour": ["blue", "orange", "red"] }
/// ```
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
/// over the following elements:
/// ```text
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
/// [3] // same for "green"
/// [2] // same for "orange"
/// END
/// ```
/// Note that once a document id is returned by the iterator, it is never returned again.
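///
/// A sketch of a call, assuming an open `rtxn` and a facet `db` (see the
/// tests below for complete setups):
///
/// ```ignore
/// for result in ascending_facet_sort(&rtxn, db, 0, candidates)? {
///     let (docids, left_bound) = result?;
///     // `left_bound` contains the raw bytes of the facet value shared by `docids`
/// }
/// ```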
pub fn ascending_facet_sort<'t>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
Ok(itertools::Either::Left(AscendingFacetSort {
rtxn,
db,
field_id,
stack: vec![(candidates, iter)],
}))
} else {
Ok(itertools::Either::Right(std::iter::empty()))
}
}
struct AscendingFacetSort<'t, 'e> {
rtxn: &'t heed::RoTxn<'e>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
#[allow(clippy::type_complexity)]
stack: Vec<(
RoaringBitmap,
std::iter::Take<heed::RoRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>>,
)>,
}
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
type Item = Result<(RoaringBitmap, &'t [u8])>;
fn next(&mut self) -> Option<Self::Item> {
'outer: loop {
let (documents_ids, deepest_iter) = self.stack.last_mut()?;
for result in deepest_iter {
let (
FacetGroupKey { level, left_bound, field_id },
FacetGroupValue { size: group_size, mut bitmap },
) = result.unwrap();
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if field_id != self.field_id {
return None;
}
// If the last iterator found an empty set of documents it means
// that we already found all the documents in the sub-level iterations,
// so we can pop this level iterator.
if documents_ids.is_empty() {
// break out of the for loop into the end of the 'outer loop, which
// pops the stack
break;
}
bitmap &= &*documents_ids;
if !bitmap.is_empty() {
*documents_ids -= &bitmap;
if level == 0 {
// Since the level is 0, the left_bound is the exact value.
return Some(Ok((bitmap, left_bound)));
}
let starting_key_below =
FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound };
let iter = match self.db.range(self.rtxn, &(starting_key_below..)) {
Ok(iter) => iter,
Err(e) => return Some(Err(e)),
}
.take(group_size as usize);
self.stack.push((bitmap, iter));
continue 'outer;
}
}
self.stack.pop();
}
}
}
#[cfg(test)]
mod tests {
use roaring::RoaringBitmap;
use crate::milli_snap;
use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
use crate::search::facet::tests::{
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
get_simple_index, get_simple_string_index_with_multiple_field_ids,
};
use crate::snapshot_tests::display_bitmap;
#[test]
fn filter_sort_ascending() {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, i);
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_ascending_multiple_field_ids() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-0"));
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-1"));
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_ascending_with_no_candidates() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for index in indexes {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_ascending_with_inexisting_field_id() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for index in indexes {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
txn.commit().unwrap();
}
}
}

@@ -0,0 +1,244 @@
use std::ops::Bound;
use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
/// See the documentation for [`ascending_facet_sort`](super::ascending_facet_sort).
///
/// This function does the same thing, but in the opposite order.
pub fn descending_facet_sort<'t>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap();
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
Ok(itertools::Either::Left(DescendingFacetSort {
rtxn,
db,
field_id,
stack: vec![(candidates, iter, Bound::Included(last_bound))],
}))
} else {
Ok(itertools::Either::Right(std::iter::empty()))
}
}
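// Illustrative sketch, not part of the original change: draining the sorted
// iterator into a flat list of docids, from the highest facet value down to
// the lowest. The arguments are the same ones `descending_facet_sort` takes.
#[allow(dead_code)]
fn collect_descending<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Vec<u32>> {
    let mut docids = Vec::new();
    for result in descending_facet_sort(rtxn, db, field_id, candidates)? {
        // Each item pairs the docids sharing a facet value with its raw left bound.
        let (bitmap, _left_bound) = result?;
        docids.extend(bitmap);
    }
    Ok(docids)
}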
struct DescendingFacetSort<'t> {
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
#[allow(clippy::type_complexity)]
stack: Vec<(
RoaringBitmap,
std::iter::Take<
heed::RoRevRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
>,
Bound<&'t [u8]>,
)>,
}
impl<'t> Iterator for DescendingFacetSort<'t> {
type Item = Result<(RoaringBitmap, &'t [u8])>;
fn next(&mut self) -> Option<Self::Item> {
'outer: loop {
let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?;
for result in deepest_iter.by_ref() {
let (
FacetGroupKey { level, left_bound, field_id },
FacetGroupValue { size: group_size, mut bitmap },
) = result.unwrap();
// Defensive check: make sure we never iterate over keys
// that belong to the next field id.
if field_id != self.field_id {
return None;
}
// If the remaining set of candidate documents is empty, it means
// we already found all of them in the sub-level iterations,
// so we can pop this level iterator.
if documents_ids.is_empty() {
break;
}
bitmap &= &*documents_ids;
if !bitmap.is_empty() {
*documents_ids -= &bitmap;
if level == 0 {
// Since we're at level 0, the left_bound is the exact value.
return Some(Ok((bitmap, left_bound)));
}
let starting_key_below =
FacetGroupKey { field_id, level: level - 1, left_bound };
let end_key_below = match *right_bound {
Bound::Included(right) => Bound::Included(FacetGroupKey {
field_id,
level: level - 1,
left_bound: right,
}),
Bound::Excluded(right) => Bound::Excluded(FacetGroupKey {
field_id,
level: level - 1,
left_bound: right,
}),
Bound::Unbounded => Bound::Unbounded,
};
let prev_right_bound = *right_bound;
*right_bound = Bound::Excluded(left_bound);
let iter = match self
.db
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
.rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_below))
{
Ok(iter) => iter,
Err(e) => return Some(Err(e)),
}
.take(group_size as usize);
self.stack.push((bitmap, iter, prev_right_bound));
continue 'outer;
}
*right_bound = Bound::Excluded(left_bound);
}
self.stack.pop();
}
}
}
#[cfg(test)]
mod tests {
use roaring::RoaringBitmap;
use crate::heed_codec::facet::FacetGroupKeyCodec;
use crate::heed_codec::BytesRefCodec;
use crate::milli_snap;
use crate::search::facet::facet_sort_descending::descending_facet_sort;
use crate::search::facet::tests::{
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
get_simple_index, get_simple_index_with_multiple_field_ids,
get_simple_string_index_with_multiple_field_ids,
};
use crate::snapshot_tests::display_bitmap;
#[test]
fn filter_sort_descending() {
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, i);
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_descending_multiple_field_ids() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-0"));
let mut results = String::new();
let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-1"));
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_descending_with_no_candidates() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for index in indexes {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
txn.commit().unwrap();
}
}
#[test]
fn filter_sort_descending_with_inexisting_field_id() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for index in indexes {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
for el in iter {
let (docids, _) = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());
txn.commit().unwrap();
}
}
}

File diff suppressed because it is too large

@@ -0,0 +1,227 @@
pub use facet_sort_ascending::ascending_facet_sort;
pub use facet_sort_descending::descending_facet_sort;
use heed::types::{Bytes, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter};
pub use self::search::{FacetValueHit, SearchForFacetValues};
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result};
mod facet_distribution;
mod facet_distribution_iter;
mod facet_range_search;
mod facet_sort_ascending;
mod facet_sort_descending;
mod filter;
mod search;
fn facet_extreme_value<'t>(
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
) -> Result<Option<f64>> {
let extreme_value =
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
let (_, extreme_value) = extreme_value?;
OrderedF64Codec::bytes_decode(extreme_value)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into)
}
pub fn facet_min_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
pub fn facet_max_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
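/// Hypothetical helper, shown only as an illustration of how the two
/// functions above compose: fetch both extremes of a numeric facet at once.
#[allow(dead_code)]
pub fn facet_value_range<'t>(
    index: &'t Index,
    rtxn: &'t heed::RoTxn<'t>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Option<(f64, f64)>> {
    let min = facet_min_value(index, rtxn, field_id, candidates.clone())?;
    let max = facet_max_value(index, rtxn, field_id, candidates)?;
    // Both sides are `Some` as soon as one facet value exists for this field.
    Ok(min.zip(max))
}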
/// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>(
txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
BoundCodec: BytesDecode<'t>,
{
let mut level0prefix = vec![];
level0prefix.extend_from_slice(&field_id.to_be_bytes());
level0prefix.push(0);
let mut level0_iter_forward =
db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, level0prefix.as_slice())?;
if let Some(first) = level0_iter_forward.next() {
let (first_key, _) = first?;
let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key)
.map_err(heed::Error::Decoding)?;
Ok(Some(first_key.left_bound))
} else {
Ok(None)
}
}
/// Get the last facet value in the facet database
pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>(
txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
BoundCodec: BytesDecode<'t>,
{
let mut level0prefix = vec![];
level0prefix.extend_from_slice(&field_id.to_be_bytes());
level0prefix.push(0);
let mut level0_iter_backward =
db.remap_types::<Bytes, DecodeIgnore>().rev_prefix_iter(txn, level0prefix.as_slice())?;
if let Some(last) = level0_iter_backward.next() {
let (last_key, _) = last?;
let last_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key)
.map_err(heed::Error::Decoding)?;
Ok(Some(last_key.left_bound))
} else {
Ok(None)
}
}
/// Get the height of the highest level in the facet database
pub(crate) fn get_highest_level<'t, DC>(
txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16,
) -> heed::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes();
Ok(db
.remap_types::<Bytes, DecodeIgnore>()
.rev_prefix_iter(txn, field_id_prefix)?
.next()
.map(|el| {
let (key, _) = el.unwrap();
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
key.level
})
.unwrap_or(0))
}
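// Not in the original change: a small sketch of the key-layout assumption the
// helpers above share. Facet keys start with the big-endian field id followed
// by the level byte, so the level-0 prefix for a field is built like this.
#[allow(dead_code)]
fn level0_prefix(field_id: u16) -> Vec<u8> {
    let mut prefix = Vec::with_capacity(3);
    prefix.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes, big-endian
    prefix.push(0); // level 0
    prefix
}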
#[cfg(test)]
pub(crate) mod tests {
use rand::{Rng, SeedableRng};
use roaring::RoaringBitmap;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::heed_codec::StrRefCodec;
use crate::update::facet::test_helpers::FacetIndex;
pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(i as u32);
index.insert(&mut txn, 0, &(i as f64), &bitmap);
}
txn.commit().unwrap();
index
}
pub fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
for key in std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128) {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(key);
bitmap.insert(key + 100);
index.insert(&mut txn, 0, &(key as f64), &bitmap);
}
txn.commit().unwrap();
index
}
pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
for fid in 0..2 {
for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(i as u32);
index.insert(&mut txn, fid, &(i as f64), &bitmap);
}
}
txn.commit().unwrap();
index
}
pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for fid in 0..2 {
for &key in &keys {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(key);
bitmap.insert(key + 100);
index.insert(&mut txn, fid, &(key as f64), &bitmap);
}
}
txn.commit().unwrap();
index
}
pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
for fid in 0..2 {
for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(i as u32);
if i % 2 == 0 {
index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap);
} else {
index.insert(&mut txn, fid, &"", &bitmap);
}
}
}
txn.commit().unwrap();
index
}
pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for fid in 0..2 {
for &key in &keys {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(key);
bitmap.insert(key + 100);
if key % 2 == 0 {
index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap);
} else {
index.insert(&mut txn, fid, &"", &bitmap);
}
}
}
txn.commit().unwrap();
index
}
}

@@ -0,0 +1,358 @@
use std::cmp::{Ordering, Reverse};
use std::collections::BinaryHeap;
use std::ops::ControlFlow;
use charabia::normalizer::NormalizerOption;
use charabia::{Language, Normalize, StrDetection, Token};
use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;
use tracing::error;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::search::build_dfa;
use crate::{DocumentId, FieldId, OrderBy, Result, Search};
/// The maximum number of values per facet returned by the facet search route.
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;
pub struct SearchForFacetValues<'a> {
query: Option<String>,
facet: String,
search_query: Search<'a>,
max_values: usize,
is_hybrid: bool,
locales: Option<Vec<Language>>,
}
impl<'a> SearchForFacetValues<'a> {
pub fn new(
facet: String,
search_query: Search<'a>,
is_hybrid: bool,
) -> SearchForFacetValues<'a> {
SearchForFacetValues {
query: None,
facet,
search_query,
max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
is_hybrid,
locales: None,
}
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
self.query = Some(query.into());
self
}
pub fn max_values(&mut self, max: usize) -> &mut Self {
self.max_values = max;
self
}
pub fn locales(&mut self, locales: Vec<Language>) -> &mut Self {
self.locales = Some(locales);
self
}
fn one_original_value_of(
&self,
field_id: FieldId,
facet_str: &str,
any_docid: DocumentId,
) -> Result<Option<String>> {
let index = self.search_query.index;
let rtxn = self.search_query.rtxn;
let key: (FieldId, _, &str) = (field_id, any_docid, facet_str);
Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned()))
}
pub fn execute(&self) -> Result<Vec<FacetValueHit>> {
let index = self.search_query.index;
let rtxn = self.search_query.rtxn;
let filterable_fields = index.filterable_fields(rtxn)?;
if !filterable_fields.contains(&self.facet) {
let (valid_fields, hidden_fields) =
index.remove_hidden_fields(rtxn, filterable_fields)?;
return Err(UserError::InvalidFacetSearchFacetName {
field: self.facet.clone(),
valid_fields,
hidden_fields,
}
.into());
}
let fields_ids_map = index.fields_ids_map(rtxn)?;
let fid = match fields_ids_map.id(&self.facet) {
Some(fid) => fid,
// we return an empty list of results when the attribute has been
// set as filterable but no document contains this field (yet).
None => return Ok(Vec::new()),
};
let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? {
Some(fst) => fst,
None => return Ok(Vec::new()),
};
let search_candidates = self.search_query.execute_for_candidates(
self.is_hybrid
|| self
.search_query
.semantic
.as_ref()
.and_then(|semantic| semantic.vector.as_ref())
.is_some(),
)?;
let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) {
OrderBy::Lexicographic => ValuesCollection::by_lexicographic(self.max_values),
OrderBy::Count => ValuesCollection::by_count(self.max_values),
};
match self.query.as_ref() {
Some(query) => {
let query = normalize_facet_string(query, self.locales.as_deref());
let query = query.as_ref();
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
let field_authorizes_typos =
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
if authorize_typos && field_authorizes_typos {
let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
if fst.contains(query) {
self.fetch_original_facets_using_normalized(
fid,
query,
query,
&search_candidates,
&mut results,
)?;
}
} else {
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
let is_prefix = true;
let automaton = if query.len() < one_typo as usize {
build_dfa(query, 0, is_prefix)
} else if query.len() < two_typos as usize {
build_dfa(query, 1, is_prefix)
} else {
build_dfa(query, 2, is_prefix)
};
let mut stream = fst.search(automaton).into_stream();
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
if self
.fetch_original_facets_using_normalized(
fid,
value,
query,
&search_candidates,
&mut results,
)?
.is_break()
{
break;
}
}
}
} else {
let automaton = Str::new(query).starts_with();
let mut stream = fst.search(automaton).into_stream();
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
if self
.fetch_original_facets_using_normalized(
fid,
value,
query,
&search_candidates,
&mut results,
)?
.is_break()
{
break;
}
}
}
}
None => {
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
result?;
let count = search_candidates.intersection_len(&bitmap);
if count != 0 {
let value = self
.one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
.unwrap_or_else(|| left_bound.to_string());
if results.insert(FacetValueHit { value, count }).is_break() {
break;
}
}
}
}
}
Ok(results.into_sorted_vec())
}
fn fetch_original_facets_using_normalized(
&self,
fid: FieldId,
value: &str,
query: &str,
search_candidates: &RoaringBitmap,
results: &mut ValuesCollection,
) -> Result<ControlFlow<()>> {
let index = self.search_query.index;
let rtxn = self.search_query.rtxn;
let database = index.facet_id_normalized_string_strings;
let key = (fid, value);
let original_strings = match database.get(rtxn, &key)? {
Some(original_strings) => original_strings,
None => {
error!("the facet value is missing from the facet database: {key:?}");
return Ok(ControlFlow::Continue(()));
}
};
for original in original_strings {
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!("the facet value is missing from the facet database: {key:?}");
return Ok(ControlFlow::Continue(()));
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, &original, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
if results.insert(FacetValueHit { value, count }).is_break() {
break;
}
}
}
Ok(ControlFlow::Continue(()))
}
}
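// Usage sketch (hypothetical facet name and values, not part of this commit):
// a typical facet-value search over an already-configured `Search`.
//
//     let mut facet_search = SearchForFacetValues::new("genre".to_string(), search, false);
//     facet_search.query("rom").max_values(10);
//     for FacetValueHit { value, count } in facet_search.execute()? {
//         println!("{value}: {count}");
//     }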
#[derive(Debug, Clone, serde::Serialize, PartialEq)]
pub struct FacetValueHit {
/// The original facet value
pub value: String,
/// The number of documents associated to this facet
pub count: u64,
}
impl PartialOrd for FacetValueHit {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for FacetValueHit {
fn cmp(&self, other: &Self) -> Ordering {
self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value))
}
}
impl Eq for FacetValueHit {}
/// A wrapper type that collects the best facet values by
/// lexicographic or number of associated values.
enum ValuesCollection {
/// Keeps the top values according to the lexicographic order.
Lexicographic { max: usize, content: Vec<FacetValueHit> },
/// Keeps the top values according to the number of values associated to them.
///
/// Note that it is a max heap and we need to move the smallest counts
/// to the top to be able to pop them when we reach the max_values limit.
Count { max: usize, content: BinaryHeap<Reverse<FacetValueHit>> },
}
impl ValuesCollection {
pub fn by_lexicographic(max: usize) -> Self {
ValuesCollection::Lexicographic { max, content: Vec::new() }
}
pub fn by_count(max: usize) -> Self {
ValuesCollection::Count { max, content: BinaryHeap::new() }
}
pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> {
match self {
ValuesCollection::Lexicographic { max, content } => {
if content.len() < *max {
content.push(value);
if content.len() < *max {
return ControlFlow::Continue(());
}
}
ControlFlow::Break(())
}
ValuesCollection::Count { max, content } => {
if content.len() == *max {
// Peeking gives us the worst value in the list as
// this is a max-heap and we reversed it.
let Some(mut peek) = content.peek_mut() else { return ControlFlow::Break(()) };
if peek.0.count <= value.count {
// Replace the current worst value in the heap
// with the new one we received that is better.
*peek = Reverse(value);
}
} else {
content.push(Reverse(value));
}
ControlFlow::Continue(())
}
}
}
/// Returns the list of facet values in descending order of, either,
/// count or lexicographic order of the value depending on the type.
pub fn into_sorted_vec(self) -> Vec<FacetValueHit> {
match self {
ValuesCollection::Lexicographic { content, .. } => content.into_iter().collect(),
ValuesCollection::Count { content, .. } => {
// Convert the heap into a vec of hits by removing the Reverse wrapper.
// Hits come out in the right order: `into_sorted_vec` returns ascending
// order, which the Reverse wrapper turns into descending order of count.
content.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect()
}
}
}
}
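// Not part of the original change: a minimal sketch of the top-k behaviour of
// the count-based collection, assuming distinct counts; with max = 2 the
// lowest count is evicted and results come back in descending count order.
#[cfg(test)]
mod values_collection_sketch {
    use super::*;

    #[test]
    fn keeps_top_counts() {
        let mut collection = ValuesCollection::by_count(2);
        for (value, count) in [("a", 1), ("b", 5), ("c", 3)] {
            let _ = collection.insert(FacetValueHit { value: value.to_string(), count });
        }
        let hits = collection.into_sorted_vec();
        // "a" (count 1) was evicted when "c" (count 3) arrived.
        assert_eq!(hits.len(), 2);
        assert_eq!(hits[0].count, 5);
        assert_eq!(hits[1].count, 3);
    }
}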
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language,
..Default::default()
};
token.normalize(&options).lemma.into_owned()
}
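// Behaviour sketch (illustrative values, not part of this commit): with a
// single explicit locale the detection step is skipped and the locale is used
// as-is; lossy normalization typically lowercases and strips diacritics.
//
//     let normalized = normalize_facet_string("Déjà Vu", Some(&[Language::Fra]));
//     // e.g. "deja vu"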

@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
0: 1
1: 1
2: 1
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 1
11: 1
12: 1
13: 1
14: 1
15: 1
16: 1
17: 1
18: 1
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 1
37: 1
38: 1
39: 1
40: 1
41: 1
42: 1
43: 1
44: 1
45: 1
46: 1
47: 1
48: 1
49: 1
50: 1
51: 1
52: 1
53: 1
54: 1
55: 1
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 1
64: 1
65: 1
66: 1
67: 1
68: 1
69: 1
70: 1
71: 1
72: 1
73: 1
74: 1
75: 1
76: 1
77: 1
78: 1
79: 1
80: 1
81: 1
82: 1
83: 1
84: 1
85: 1
86: 1
87: 1
88: 1
89: 1
90: 1
91: 1
92: 1
93: 1
94: 1
95: 1
96: 1
97: 1
98: 1
99: 1
100: 1
101: 1
102: 1
103: 1
104: 1
105: 1
106: 1
107: 1
108: 1
109: 1
110: 1
111: 1
112: 1
113: 1
114: 1
115: 1
116: 1
117: 1
118: 1
119: 1
120: 1
121: 1
122: 1
123: 1
124: 1
125: 1
126: 1
127: 1
128: 1
129: 1
130: 1
131: 1
132: 1
133: 1
134: 1
135: 1
136: 1
137: 1
138: 1
139: 1
140: 1
141: 1
142: 1
143: 1
144: 1
145: 1
146: 1
147: 1
148: 1
149: 1
150: 1
151: 1
152: 1
153: 1
154: 1
155: 1
156: 1
157: 1
158: 1
159: 1
160: 1
161: 1
162: 1
163: 1
164: 1
165: 1
166: 1
167: 1
168: 1
169: 1
170: 1
171: 1
172: 1
173: 1
174: 1
175: 1
176: 1
177: 1
178: 1
179: 1
180: 1
181: 1
182: 1
183: 1
184: 1
185: 1
186: 1
187: 1
188: 1
189: 1
190: 1
191: 1
192: 1
193: 1
194: 1
195: 1
196: 1
197: 1
198: 1
199: 1
200: 1
201: 1
202: 1
203: 1
204: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
211: 1
212: 1
213: 1
214: 1
215: 1
216: 1
217: 1
218: 1
219: 1
220: 1
221: 1
222: 1
223: 1
224: 1
225: 1
226: 1
227: 1
228: 1
229: 1
230: 1
231: 1
232: 1
233: 1
234: 1
235: 1
236: 1
237: 1
238: 1
239: 1
240: 1
241: 1
242: 1
243: 1
244: 1
245: 1
246: 1
247: 1
248: 1
249: 1
250: 1
251: 1
252: 1
253: 1
254: 1
255: 1

@@ -0,0 +1,105 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
3: 2
5: 2
6: 2
9: 2
10: 2
11: 2
14: 2
18: 2
19: 2
24: 2
26: 2
28: 2
29: 2
32: 2
33: 2
35: 2
36: 2
37: 2
38: 2
39: 2
41: 2
46: 2
47: 2
49: 2
52: 2
53: 2
55: 2
59: 2
61: 2
64: 2
68: 2
71: 2
74: 2
75: 2
76: 2
81: 2
83: 2
85: 2
86: 2
88: 2
90: 2
91: 2
92: 2
98: 2
99: 2
101: 2
102: 2
103: 2
107: 2
111: 2
115: 2
119: 2
123: 2
124: 2
130: 2
131: 2
133: 2
135: 2
136: 2
137: 2
139: 2
141: 2
143: 2
144: 2
147: 2
150: 2
156: 1
158: 1
160: 1
162: 1
163: 1
164: 1
167: 1
169: 1
173: 1
177: 1
178: 1
179: 1
181: 1
182: 1
186: 1
189: 1
192: 1
193: 1
195: 1
197: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
216: 1
219: 1
220: 1
223: 1
226: 1
235: 1
236: 1
238: 1
243: 1

@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
0: 1
1: 1
2: 1
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 1
11: 1
12: 1
13: 1
14: 1
15: 1
16: 1
17: 1
18: 1
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 1
37: 1
38: 1
39: 1
40: 1
41: 1
42: 1
43: 1
44: 1
45: 1
46: 1
47: 1
48: 1
49: 1
50: 1
51: 1
52: 1
53: 1
54: 1
55: 1
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 1
64: 1
65: 1
66: 1
67: 1
68: 1
69: 1
70: 1
71: 1
72: 1
73: 1
74: 1
75: 1
76: 1
77: 1
78: 1
79: 1
80: 1
81: 1
82: 1
83: 1
84: 1
85: 1
86: 1
87: 1
88: 1
89: 1
90: 1
91: 1
92: 1
93: 1
94: 1
95: 1
96: 1
97: 1
98: 1
99: 1

@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
3: 2
5: 2
6: 2
9: 2
10: 2
11: 2
14: 2
18: 2
19: 2
24: 2
26: 2
28: 2
29: 2
32: 2
33: 2
35: 2
36: 2
37: 2
38: 2
39: 2
41: 2
46: 2
47: 2
49: 2
52: 2
53: 2
55: 2
59: 2
61: 2
64: 2
68: 2
71: 2
74: 2
75: 2
76: 2
81: 2
83: 2
85: 2
86: 2
88: 2
90: 2
91: 2
92: 2
98: 2
99: 2
101: 2
102: 2
103: 2
107: 2
111: 2
115: 2
119: 2
123: 2
124: 2
130: 2
131: 2
133: 2
135: 2
136: 2
137: 2
139: 2
141: 2
143: 2
144: 2
147: 2
150: 2
156: 1
158: 1
160: 1
162: 1
163: 1
164: 1
167: 1
169: 1
173: 1
177: 1
178: 1
179: 1
181: 1
182: 1
186: 1
189: 1
192: 1
193: 1
195: 1
197: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
216: 1
219: 1
220: 1
223: 1
226: 1
235: 1
236: 1
238: 1

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63

@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []

@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ad8fc873747aaf1d3590e7ccab735985

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7c6cc88697da835d33877b2df41fa1cb

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ad8fc873747aaf1d3590e7ccab735985

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7c6cc88697da835d33877b2df41fa1cb

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9a8c7343b4735d37704748cabcd51ff2

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
898a7dc25a1441bc3e7e2a8a62d99090

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9a8c7343b4735d37704748cabcd51ff2

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
898a7dc25a1441bc3e7e2a8a62d99090

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: []

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: []

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[200, ]
[201, ]
[202, ]
[203, ]
[204, ]
[205, ]
[206, ]
[207, ]
[208, ]
[209, ]
[210, ]
[211, ]
[212, ]
[213, ]
[214, ]
[215, ]
[216, ]
[217, ]
[218, ]
[219, ]
[220, ]
[221, ]
[222, ]
[223, ]
[224, ]
[225, ]
[226, ]
[227, ]
[228, ]
[229, ]
[230, ]
[231, ]
[232, ]
[233, ]
[234, ]
[235, ]
[236, ]
[237, ]
[238, ]
[239, ]
[240, ]
[241, ]
[242, ]
[243, ]
[244, ]
[245, ]
[246, ]
[247, ]
[248, ]
[249, ]
[250, ]
[251, ]
[252, ]
[253, ]
[254, ]
[255, ]

@@ -0,0 +1,54 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, ]
[202, ]
[203, ]
[207, ]
[211, ]
[215, ]
[219, ]
[223, ]
[224, ]
[230, ]
[231, ]
[233, ]
[235, ]
[236, ]
[237, ]
[239, ]
[241, ]
[243, ]
[244, ]
[247, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[263, ]
[264, ]
[267, ]
[269, ]
[273, ]
[277, ]
[278, ]
[279, ]
[281, ]
[282, ]
[286, ]
[289, ]
[292, ]
[293, ]
[295, ]
[297, ]
[205, ]
[206, ]
[208, ]
[209, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]
[202, ]
[224, ]
[230, ]
[236, ]
[244, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[264, ]
[278, ]
[282, ]
[286, ]
[292, ]
[206, ]
[208, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]
[202, ]
[224, ]
[230, ]
[236, ]
[244, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[264, ]
[278, ]
[282, ]
[286, ]
[292, ]
[206, ]
[208, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[255, ]
[254, ]
[253, ]
[252, ]
[251, ]
[250, ]
[249, ]
[248, ]
[247, ]
[246, ]
[245, ]
[244, ]
[243, ]
[242, ]
[241, ]
[240, ]
[239, ]
[238, ]
[237, ]
[236, ]
[235, ]
[234, ]
[233, ]
[232, ]
[231, ]
[230, ]
[229, ]
[228, ]
[227, ]
[226, ]
[225, ]
[224, ]
[223, ]
[222, ]
[221, ]
[220, ]
[219, ]
[218, ]
[217, ]
[216, ]
[215, ]
[214, ]
[213, ]
[212, ]
[211, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[204, ]
[203, ]
[202, ]
[201, ]
[200, ]

@@ -0,0 +1,54 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[243, ]
[238, ]
[236, ]
[235, ]
[226, ]
[223, ]
[220, ]
[219, ]
[216, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[297, ]
[295, ]
[293, ]
[292, ]
[289, ]
[286, ]
[282, ]
[281, ]
[279, ]
[278, ]
[277, ]
[273, ]
[269, ]
[267, ]
[264, ]
[263, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[247, ]
[244, ]
[241, ]
[239, ]
[237, ]
[233, ]
[231, ]
[230, ]
[224, ]
[215, ]
[211, ]
[203, ]
[202, ]
[201, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[255, ]
[254, ]
[253, ]
[252, ]
[251, ]
[250, ]
[249, ]
[248, ]
[247, ]
[246, ]
[245, ]
[244, ]
[243, ]
[242, ]
[241, ]
[240, ]
[239, ]
[238, ]
[237, ]
[236, ]
[235, ]
[234, ]
[233, ]
[232, ]
[231, ]
[230, ]
[229, ]
[228, ]
[227, ]
[226, ]
[225, ]
[224, ]
[223, ]
[222, ]
[221, ]
[220, ]
[219, ]
[218, ]
[217, ]
[216, ]
[215, ]
[214, ]
[213, ]
[212, ]
[211, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[204, ]
[203, ]
[202, ]
[201, ]
[200, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[254, ]
[252, ]
[250, ]
[248, ]
[246, ]
[244, ]
[242, ]
[240, ]
[238, ]
[236, ]
[234, ]
[232, ]
[230, ]
[228, ]
[226, ]
[224, ]
[222, ]
[220, ]
[218, ]
[216, ]
[214, ]
[212, ]
[210, ]
[208, ]
[206, ]
[204, ]
[202, ]
[200, ]
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[254, ]
[252, ]
[250, ]
[248, ]
[246, ]
[244, ]
[242, ]
[240, ]
[238, ]
[236, ]
[234, ]
[232, ]
[230, ]
[228, ]
[226, ]
[224, ]
[222, ]
[220, ]
[218, ]
[216, ]
[214, ]
[212, ]
[210, ]
[208, ]
[206, ]
[204, ]
[202, ]
[200, ]
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[238, ]
[236, ]
[226, ]
[220, ]
[216, ]
[210, ]
[208, ]
[206, ]
[292, ]
[286, ]
[282, ]
[278, ]
[264, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[244, ]
[230, ]
[224, ]
[202, ]
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[238, ]
[236, ]
[226, ]
[220, ]
[216, ]
[210, ]
[208, ]
[206, ]
[292, ]
[286, ]
[282, ]
[278, ]
[264, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[244, ]
[230, ]
[224, ]
[202, ]
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]

@@ -0,0 +1,187 @@
/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
/// All credits for this code go to BurntSushi.
use fst::Automaton;
pub struct StartsWith<A>(pub A);
/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);
impl<A: Automaton> Clone for StartsWithState<A>
where
A::State: Clone,
{
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
/// The inner state of a `StartsWithState<A>`.
pub enum StartsWithStateKind<A: Automaton> {
/// Sink state that is reached when the automaton has matched the prefix.
Done,
/// State in which the automaton is while it hasn't matched the prefix.
Running(A::State),
}
impl<A: Automaton> Clone for StartsWithStateKind<A>
where
A::State: Clone,
{
fn clone(&self) -> Self {
match self {
StartsWithStateKind::Done => StartsWithStateKind::Done,
StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
}
}
}
impl<A: Automaton> Automaton for StartsWith<A> {
type State = StartsWithState<A>;
fn start(&self) -> StartsWithState<A> {
StartsWithState({
let inner = self.0.start();
if self.0.is_match(&inner) {
StartsWithStateKind::Done
} else {
StartsWithStateKind::Running(inner)
}
})
}
fn is_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(_) => false,
}
}
fn can_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
}
}
fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(_) => false,
}
}
fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
StartsWithState(match state.0 {
StartsWithStateKind::Done => StartsWithStateKind::Done,
StartsWithStateKind::Running(ref inner) => {
let next_inner = self.0.accept(inner, byte);
if self.0.is_match(&next_inner) {
StartsWithStateKind::Done
} else {
StartsWithStateKind::Running(next_inner)
}
}
})
}
}
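// Usage sketch (not part of the original change): wrapping `fst::automaton::Str`
// so the whole automaton matches any key that begins with the given prefix.
//
//     use fst::automaton::Str;
//     let starts_with_cat = StartsWith(Str::new("cat"));
//     // `set.search(starts_with_cat)` then streams every key prefixed by "cat".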
/// An automaton that matches when one of its component automata match.
#[derive(Clone, Debug)]
pub struct Union<A, B>(pub A, pub B);
/// The `Automaton` state for `Union<A, B>`.
pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
where
A::State: Clone,
B::State: Clone,
{
fn clone(&self) -> Self {
Self(self.0.clone(), self.1.clone())
}
}
impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
type State = UnionState<A, B>;
fn start(&self) -> UnionState<A, B> {
UnionState(self.0.start(), self.1.start())
}
fn is_match(&self, state: &UnionState<A, B>) -> bool {
self.0.is_match(&state.0) || self.1.is_match(&state.1)
}
fn can_match(&self, state: &UnionState<A, B>) -> bool {
self.0.can_match(&state.0) || self.1.can_match(&state.1)
}
fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
}
fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
}
}
/// An automaton that matches when both of its component automata match.
#[derive(Clone, Debug)]
pub struct Intersection<A, B>(pub A, pub B);
/// The `Automaton` state for `Intersection<A, B>`.
pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
where
A::State: Clone,
B::State: Clone,
{
fn clone(&self) -> Self {
Self(self.0.clone(), self.1.clone())
}
}
impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
type State = IntersectionState<A, B>;
fn start(&self) -> IntersectionState<A, B> {
IntersectionState(self.0.start(), self.1.start())
}
fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
self.0.is_match(&state.0) && self.1.is_match(&state.1)
}
fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
self.0.can_match(&state.0) && self.1.can_match(&state.1)
}
fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
}
fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
}
}
/// An automaton that matches exactly when the automaton it wraps does not.
#[derive(Clone, Debug)]
pub struct Complement<A>(pub A);
/// The `Automaton` state for `Complement<A>`.
pub struct ComplementState<A: Automaton>(pub A::State);
impl<A: Automaton> Clone for ComplementState<A>
where
A::State: Clone,
{
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
impl<A: Automaton> Automaton for Complement<A> {
type State = ComplementState<A>;
fn start(&self) -> ComplementState<A> {
ComplementState(self.0.start())
}
fn is_match(&self, state: &ComplementState<A>) -> bool {
!self.0.is_match(&state.0)
}
fn can_match(&self, state: &ComplementState<A>) -> bool {
!self.0.will_always_match(&state.0)
}
fn will_always_match(&self, state: &ComplementState<A>) -> bool {
!self.0.can_match(&state.0)
}
fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
ComplementState(self.0.accept(&state.0, byte))
}
}
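// Combined sketch (illustrative): the combinators above compose like ordinary
// set operations. Assuming `exact_abc` is some automaton matching exactly
// "abc", this matches every key that starts with "a" except "abc" itself.
//
//     use fst::automaton::Str;
//     let automaton = Intersection(StartsWith(Str::new("a")), Complement(exact_abc));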

@@ -0,0 +1,289 @@
use std::cmp::Ordering;
use itertools::Itertools;
use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::SemanticSearch;
use crate::{MatchingWords, Result, Search, SearchResult};
struct ScoreWithRatioResult {
matching_words: MatchingWords,
candidates: RoaringBitmap,
document_scores: Vec<(u32, ScoreWithRatio)>,
degraded: bool,
used_negative_operator: bool,
}
type ScoreWithRatio = (Vec<ScoreDetails>, f32);
#[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
fn compare_scores(
&(ref left_scores, left_ratio): &ScoreWithRatio,
&(ref right_scores, right_ratio): &ScoreWithRatio,
) -> Ordering {
let mut left_it = ScoreDetails::score_values(left_scores.iter());
let mut right_it = ScoreDetails::score_values(right_scores.iter());
loop {
let left = left_it.next();
let right = right_it.next();
match (left, right) {
(None, None) => return Ordering::Equal,
(None, Some(_)) => return Ordering::Less,
(Some(_), None) => return Ordering::Greater,
(Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => {
let left = left * left_ratio as f64;
let right = right * right_ratio as f64;
if (left - right).abs() <= f64::EPSILON {
continue;
}
return left.partial_cmp(&right).unwrap();
}
(Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => {
match left.partial_cmp(right).unwrap() {
Ordering::Equal => continue,
order => return order,
}
}
(Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => {
match left.partial_cmp(right).unwrap() {
Ordering::Equal => continue,
order => return order,
}
}
(Some(ScoreValue::Score(x)), Some(_)) => {
return if x == 0. { Ordering::Less } else { Ordering::Greater }
}
(Some(_), Some(ScoreValue::Score(x))) => {
return if x == 0. { Ordering::Greater } else { Ordering::Less }
}
// this combination should be unreachable: geo sort and regular sort are never compared
(Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_)))
| (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => {
unreachable!("Unexpected geo and sort comparison")
}
}
}
}
impl ScoreWithRatioResult {
fn new(results: SearchResult, ratio: f32) -> Self {
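        // Tag every document score with the weight (ratio) of this search so that
        // `compare_scores` can weight them during the merge.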
let document_scores = results
.documents_ids
.into_iter()
.zip(results.document_scores.into_iter().map(|scores| (scores, ratio)))
.collect();
Self {
matching_words: results.matching_words,
candidates: results.candidates,
document_scores,
degraded: results.degraded,
used_negative_operator: results.used_negative_operator,
}
}
#[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
fn merge(
vector_results: Self,
keyword_results: Self,
from: usize,
length: usize,
) -> (SearchResult, u32) {
#[derive(Clone, Copy)]
enum ResultSource {
Semantic,
Keyword,
}
let mut semantic_hit_count = 0;
let mut documents_ids = Vec::with_capacity(
vector_results.document_scores.len() + keyword_results.document_scores.len(),
);
let mut document_scores = Vec::with_capacity(
vector_results.document_scores.len() + keyword_results.document_scores.len(),
);
let mut documents_seen = RoaringBitmap::new();
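        // Merge the two score-sorted lists, preferring the higher weighted score at
        // each step, and keep only the first occurrence of every document id.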
for ((docid, (main_score, _sub_score)), source) in vector_results
.document_scores
.into_iter()
.zip(std::iter::repeat(ResultSource::Semantic))
.merge_by(
keyword_results
.document_scores
.into_iter()
.zip(std::iter::repeat(ResultSource::Keyword)),
|((_, left), _), ((_, right), _)| {
// the first value is the one with the greatest score
compare_scores(left, right).is_ge()
},
)
// remove documents we already saw
.filter(|((docid, _), _)| documents_seen.insert(*docid))
// start skipping **after** the filter
.skip(from)
// take **after** skipping
.take(length)
{
if let ResultSource::Semantic = source {
semantic_hit_count += 1;
}
documents_ids.push(docid);
// TODO: pass both scores to documents_score in some way?
document_scores.push(main_score);
}
(
SearchResult {
matching_words: keyword_results.matching_words,
candidates: vector_results.candidates | keyword_results.candidates,
documents_ids,
document_scores,
degraded: vector_results.degraded | keyword_results.degraded,
used_negative_operator: vector_results.used_negative_operator
| keyword_results.used_negative_operator,
},
semantic_hit_count,
)
}
}
impl<'a> Search<'a> {
#[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<(SearchResult, Option<u32>)> {
// TODO: find a classier way to achieve this than resetting the vector and query params
// create separate keyword and semantic searches
let mut search = Search {
query: self.query.clone(),
filter: self.filter.clone(),
offset: 0,
limit: self.limit + self.offset,
sort_criteria: self.sort_criteria.clone(),
distinct: self.distinct.clone(),
searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy,
terms_matching_strategy: self.terms_matching_strategy,
scoring_strategy: ScoringStrategy::Detailed,
words_limit: self.words_limit,
exhaustive_number_hits: self.exhaustive_number_hits,
rtxn: self.rtxn,
index: self.index,
semantic: self.semantic.clone(),
time_budget: self.time_budget.clone(),
ranking_score_threshold: self.ranking_score_threshold,
locales: self.locales.clone(),
};
let semantic = search.semantic.take();
let keyword_results = search.execute()?;
// completely skip semantic search if the results of the keyword search are good enough
if self.results_good_enough(&keyword_results, semantic_ratio) {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
}
// placeholder search (no query): there is nothing to embed, so no vector search
let Some(query) = search.query.take() else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
// no embedder, no semantic search
let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
let vector_query = match vector {
Some(vector_query) => vector_query,
None => {
// attempt to embed the vector
let span = tracing::trace_span!(target: "search::hybrid", "embed_one");
let _entered = span.enter();
match embedder.embed_one(query) {
Ok(embedding) => embedding,
Err(error) => {
tracing::error!(error=%error, "Embedding failed");
return Ok((keyword_results, Some(0)));
}
}
}
};
search.semantic =
Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized });
// TODO: would be better to have two distinct functions at this point
let vector_results = search.execute()?;
let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio);
let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);
let (merge_results, semantic_hit_count) =
ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
assert!(merge_results.documents_ids.len() <= self.limit);
Ok((merge_results, Some(semantic_hit_count)))
}
fn results_good_enough(&self, keyword_results: &SearchResult, semantic_ratio: f32) -> bool {
// A result is good enough if its weighted keyword score is above this threshold,
// e.g. a raw score > 0.9 with a semantic ratio of 0.5 (0.9 * 0.5 = 0.45)
const GOOD_ENOUGH_SCORE: f64 = 0.45;
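        // e.g. with a semantic ratio of 0.2, the keyword weight is 0.8, so a raw
        // keyword score must reach 0.45 / 0.8 = 0.5625 to be considered good enough.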
// 1. we check that we got a sufficient number of results
if keyword_results.document_scores.len() < self.limit + self.offset {
return false;
}
// 2. and that all results have a good enough score.
// we need to check every result because, due to sort-like rules, they are not necessarily in relevancy order
for score in &keyword_results.document_scores {
let score = ScoreDetails::global_score(score.iter());
if score * ((1.0 - semantic_ratio) as f64) < GOOD_ENOUGH_SCORE {
return false;
}
}
true
}
}
fn return_keyword_results(
limit: usize,
offset: usize,
SearchResult {
matching_words,
candidates,
mut documents_ids,
mut document_scores,
degraded,
used_negative_operator,
}: SearchResult,
) -> (SearchResult, Option<u32>) {
let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
// technically redundant, because documents_ids.len() == document_scores.len();
// kept as defensive programming
offset >= document_scores.len()
{
(vec![], vec![])
} else {
// apply the pagination in place: rotate_left(offset) moves the first `offset`
// entries to the back (cannot panic: offset < len was checked above), then keep
// at most `limit` of the remaining entries, without resurrecting the rotated-off
// prefix when fewer than `offset + limit` results are available.
let kept = limit.min(documents_ids.len() - offset);
documents_ids.rotate_left(offset);
documents_ids.truncate(kept);
document_scores.rotate_left(offset);
document_scores.truncate(kept);
(documents_ids, document_scores)
};
(
SearchResult {
matching_words,
candidates,
documents_ids,
document_scores,
degraded,
used_negative_operator,
},
Some(0),
)
}

@@ -0,0 +1,413 @@
use std::fmt;
use std::sync::Arc;
use charabia::Language;
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
use self::new::{execute_vector_search, PartialSearchResult};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
Result, SearchContext, TimeBudget, UserError,
};
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
pub mod facet;
mod fst_utils;
pub mod hybrid;
pub mod new;
pub mod similar;
#[derive(Debug, Clone)]
pub struct SemanticSearch {
vector: Option<Vec<f32>>,
embedder_name: String,
embedder: Arc<Embedder>,
quantized: bool,
}
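/// Builder for a search request. A minimal sketch of the intended usage (an open
/// index and read transaction are assumed to exist):
///
/// ```ignore
/// let mut search = Search::new(&rtxn, &index);
/// search.query("kafka").offset(0).limit(20);
/// let SearchResult { documents_ids, .. } = search.execute()?;
/// ```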
pub struct Search<'a> {
query: Option<String>,
// this should be linked to the String in the query
filter: Option<Filter<'a>>,
offset: usize,
limit: usize,
sort_criteria: Option<Vec<AscDesc>>,
distinct: Option<String>,
searchable_attributes: Option<&'a [String]>,
geo_strategy: new::GeoSortStrategy,
terms_matching_strategy: TermsMatchingStrategy,
scoring_strategy: ScoringStrategy,
words_limit: usize,
exhaustive_number_hits: bool,
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
semantic: Option<SemanticSearch>,
time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
locales: Option<Vec<Language>>,
}
impl<'a> Search<'a> {
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Search<'a> {
Search {
query: None,
filter: None,
offset: 0,
limit: 20,
sort_criteria: None,
distinct: None,
searchable_attributes: None,
geo_strategy: new::GeoSortStrategy::default(),
terms_matching_strategy: TermsMatchingStrategy::default(),
scoring_strategy: Default::default(),
exhaustive_number_hits: false,
words_limit: 10,
rtxn,
index,
semantic: None,
locales: None,
time_budget: TimeBudget::max(),
ranking_score_threshold: None,
}
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
self.query = Some(query.into());
self
}
pub fn semantic(
&mut self,
embedder_name: String,
embedder: Arc<Embedder>,
quantized: bool,
vector: Option<Vec<f32>>,
) -> &mut Search<'a> {
self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector });
self
}
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
self.offset = offset;
self
}
pub fn limit(&mut self, limit: usize) -> &mut Search<'a> {
self.limit = limit;
self
}
pub fn sort_criteria(&mut self, criteria: Vec<AscDesc>) -> &mut Search<'a> {
self.sort_criteria = Some(criteria);
self
}
pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> {
self.distinct = Some(distinct);
self
}
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
self.searchable_attributes = Some(searchable);
self
}
pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
self.terms_matching_strategy = value;
self
}
pub fn scoring_strategy(&mut self, value: ScoringStrategy) -> &mut Search<'a> {
self.scoring_strategy = value;
self
}
pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
self.words_limit = value;
self
}
pub fn filter(&mut self, condition: Filter<'a>) -> &mut Search<'a> {
self.filter = Some(condition);
self
}
#[cfg(test)]
pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> {
self.geo_strategy = strategy;
self
}
/// Forces the search to exhaustively compute the number of candidates,
/// this will increase the search time but allows finite pagination.
pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
self.exhaustive_number_hits = exhaustive_number_hits;
self
}
pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> {
self.time_budget = time_budget;
self
}
pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> {
self.ranking_score_threshold = Some(ranking_score_threshold);
self
}
pub fn locales(&mut self, locales: Vec<Language>) -> &mut Search<'a> {
self.locales = Some(locales);
self
}
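    /// Return only the universe of candidates, without ranking them. A vector search
    /// ranks the whole filtered universe, so computing the filter is enough; a
    /// keyword search has to be executed to know its candidates.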
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
if has_vector_search {
let ctx = SearchContext::new(self.index, self.rtxn)?;
filtered_universe(ctx.index, ctx.txn, &self.filter)
} else {
Ok(self.execute()?.candidates)
}
}
pub fn execute(&self) -> Result<SearchResult> {
let mut ctx = SearchContext::new(self.index, self.rtxn)?;
if let Some(searchable_attributes) = self.searchable_attributes {
ctx.attributes_to_search_on(searchable_attributes)?;
}
if let Some(distinct) = &self.distinct {
let filterable_fields = ctx.index.filterable_fields(ctx.txn)?;
if !crate::is_faceted(distinct, &filterable_fields) {
let (valid_fields, hidden_fields) =
ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?;
return Err(Error::UserError(UserError::InvalidDistinctAttribute {
field: distinct.clone(),
valid_fields,
hidden_fields,
}));
}
}
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
let PartialSearchResult {
located_query_terms,
candidates,
documents_ids,
document_scores,
degraded,
used_negative_operator,
} = match self.semantic.as_ref() {
Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => {
execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.offset,
self.limit,
embedder_name,
embedder,
*quantized,
self.time_budget.clone(),
self.ranking_score_threshold,
)?
}
_ => execute_search(
&mut ctx,
self.query.as_deref(),
self.terms_matching_strategy,
self.scoring_strategy,
self.exhaustive_number_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
self.time_budget.clone(),
self.ranking_score_threshold,
self.locales.as_ref(),
)?,
};
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
None => MatchingWords::default(),
};
Ok(SearchResult {
matching_words,
candidates,
document_scores,
documents_ids,
degraded,
used_negative_operator,
})
}
}
impl fmt::Debug for Search<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let Search {
query,
filter,
offset,
limit,
sort_criteria,
distinct,
searchable_attributes,
geo_strategy: _,
terms_matching_strategy,
scoring_strategy,
words_limit,
exhaustive_number_hits,
rtxn: _,
index: _,
semantic,
time_budget,
ranking_score_threshold,
locales,
} = self;
f.debug_struct("Search")
.field("query", query)
.field("vector", &"[...]")
.field("filter", filter)
.field("offset", offset)
.field("limit", limit)
.field("sort_criteria", sort_criteria)
.field("distinct", distinct)
.field("searchable_attributes", searchable_attributes)
.field("terms_matching_strategy", terms_matching_strategy)
.field("scoring_strategy", scoring_strategy)
.field("exhaustive_number_hits", exhaustive_number_hits)
.field("words_limit", words_limit)
.field(
"semantic.embedder_name",
&semantic.as_ref().map(|semantic| &semantic.embedder_name),
)
.field("time_budget", time_budget)
.field("ranking_score_threshold", ranking_score_threshold)
.field("locales", locales)
.finish()
}
}
#[derive(Default, Debug)]
pub struct SearchResult {
pub matching_words: MatchingWords,
pub candidates: RoaringBitmap,
pub documents_ids: Vec<DocumentId>,
pub document_scores: Vec<Vec<ScoreDetails>>,
pub degraded: bool,
pub used_negative_operator: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TermsMatchingStrategy {
// remove last word first
Last,
// all words are mandatory
All,
// remove more frequent word first
Frequency,
}
impl Default for TermsMatchingStrategy {
fn default() -> Self {
Self::Last
}
}
fn get_first(s: &str) -> &str {
match s.chars().next() {
Some(c) => &s[..c.len_utf8()],
None => panic!("unexpected empty query"),
}
}
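/// Build a Levenshtein DFA matching `word` with up to `typos` edits (capped at 2),
/// optionally matching any continuation of the word when `is_prefix` is true.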
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
let lev = match typos {
0 => &LEVDIST0,
1 => &LEVDIST1,
_ => &LEVDIST2,
};
if is_prefix {
lev.build_prefix_dfa(word)
} else {
lev.build_dfa(word)
}
}
#[cfg(test)]
mod test {
#[allow(unused_imports)]
use super::*;
#[cfg(feature = "japanese")]
#[cfg(not(feature = "chinese-pinyin"))]
#[test]
fn test_kanji_language_detection() {
use crate::index::tests::TempIndex;
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
{ "id": 1, "title": "東京のお寿司。" },
{ "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
]))
.unwrap();
let txn = index.write_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("東京");
let SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids, vec![1]);
}
#[cfg(feature = "korean")]
#[test]
fn test_hangul_language_detection() {
use crate::index::tests::TempIndex;
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
{ "id": 1, "title": "김밥먹을래。" },
{ "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
]))
.unwrap();
let txn = index.write_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("김밥");
let SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids, vec![1]);
}
}

@@ -0,0 +1,366 @@
use roaring::RoaringBitmap;
use super::logger::SearchLogger;
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
use super::SearchContext;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
use crate::{Result, TimeBudget};
pub struct BucketSortOutput {
pub docids: Vec<u32>,
pub scores: Vec<Vec<ScoreDetails>>,
pub all_candidates: RoaringBitmap,
pub degraded: bool,
}
// TODO: would probably be good to regroup some of these inside of a struct?
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")]
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
query: &Q,
distinct: Option<&str>,
universe: &RoaringBitmap,
from: usize,
length: usize,
scoring_strategy: ScoringStrategy,
logger: &mut dyn SearchLogger<Q>,
time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
) -> Result<BucketSortOutput> {
logger.initial_query(query);
logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe);
let distinct_field = match distinct {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
let distinct_fid = if let Some(field) = distinct_field {
ctx.index.fields_ids_map(ctx.txn)?.id(field)
} else {
None
};
if universe.len() < from as u64 {
return Ok(BucketSortOutput {
docids: vec![],
scores: vec![],
all_candidates: universe.clone(),
degraded: false,
});
}
if ranking_rules.is_empty() {
if let Some(distinct_fid) = distinct_fid {
let mut excluded = RoaringBitmap::new();
let mut results = vec![];
for docid in universe.iter() {
if results.len() >= from + length {
break;
}
if excluded.contains(docid) {
continue;
}
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
results.push(docid);
}
let mut all_candidates = universe - excluded;
all_candidates.extend(results.iter().copied());
// drain the skipped elements from the results
// this **must** be done **after** writing all the results into `all_candidates` to ensure
// that e.g. estimatedTotalHits stays correct.
if results.len() >= from {
results.drain(..from);
} else {
results.clear();
}
return Ok(BucketSortOutput {
scores: vec![Default::default(); results.len()],
docids: results,
all_candidates,
degraded: false,
});
} else {
let docids: Vec<u32> = universe.iter().skip(from).take(length).collect();
return Ok(BucketSortOutput {
scores: vec![Default::default(); docids.len()],
docids,
all_candidates: universe.clone(),
degraded: false,
});
};
}
let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
let mut ranking_rule_scores: Vec<ScoreDetails> = vec![];
let mut ranking_rule_universes: Vec<RoaringBitmap> =
vec![RoaringBitmap::default(); ranking_rules_len];
ranking_rule_universes[0].clone_from(universe);
let mut cur_ranking_rule_index = 0;
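    // Invariants of the loop below:
    // - `ranking_rule_universes[i]` holds the documents that rule `i` still has to sort;
    // - `ranking_rule_scores` stacks one score per ranking rule currently entered, so
    //   its length follows `cur_ranking_rule_index`.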
/// Finish iterating over the current ranking rule, yielding
/// control to the parent (or finishing the search if not possible).
/// Update the universes accordingly and inform the logger.
macro_rules! back {
() => {
// FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
// assert!(
// ranking_rule_universes[cur_ranking_rule_index].is_empty(),
// "The ranking rule {} did not sort its bucket exhaustively",
// ranking_rules[cur_ranking_rule_index].id()
// );
logger.end_iteration_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(),
&ranking_rule_universes[cur_ranking_rule_index],
);
ranking_rule_universes[cur_ranking_rule_index].clear();
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
if cur_ranking_rule_index == 0 {
break;
} else {
cur_ranking_rule_index -= 1;
}
if ranking_rule_scores.len() > cur_ranking_rule_index {
ranking_rule_scores.pop();
}
};
}
let mut all_candidates = universe.clone();
let mut valid_docids = vec![];
let mut valid_scores = vec![];
let mut cur_offset = 0usize;
macro_rules! maybe_add_to_results {
($candidates:expr) => {
maybe_add_to_results(
ctx,
from,
length,
logger,
&mut valid_docids,
&mut valid_scores,
&mut all_candidates,
&mut ranking_rule_universes,
&mut ranking_rules,
cur_ranking_rule_index,
&mut cur_offset,
distinct_fid,
&ranking_rule_scores,
$candidates,
)?;
};
}
while valid_docids.len() < length {
if time_budget.exceeded() {
loop {
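                // The time budget is exceeded: unwind the ranking rules, flushing each
                // remaining universe as a single `Skipped`-scored bucket so the results
                // are still filled up to `length`, then return with `degraded: true`.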
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
ranking_rule_scores.push(ScoreDetails::Skipped);
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
maybe_add_to_results!(bucket);
ranking_rule_scores.pop();
if cur_ranking_rule_index == 0 {
break;
}
back!();
}
return Ok(BucketSortOutput {
scores: valid_scores,
docids: valid_docids,
all_candidates,
degraded: true,
});
}
// The universe for this bucket is empty (or a lone document we don't need to rank
// when scoring is skipped), so there is nothing to sort; go back to the parent ranking rule.
if ranking_rule_universes[cur_ranking_rule_index].is_empty()
|| (scoring_strategy == ScoringStrategy::Skip
&& ranking_rule_universes[cur_ranking_rule_index].len() == 1)
{
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
maybe_add_to_results!(bucket);
back!();
continue;
}
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(
ctx,
logger,
&ranking_rule_universes[cur_ranking_rule_index],
)?
else {
back!();
continue;
};
ranking_rule_scores.push(next_bucket.score);
logger.next_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(),
&ranking_rule_universes[cur_ranking_rule_index],
&next_bucket.candidates,
);
debug_assert!(
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
);
// remove candidates from the universe without adding them to result if their score is below the threshold
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
if cur_ranking_rule_index == ranking_rules_len - 1
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from
{
maybe_add_to_results!(next_bucket.candidates);
ranking_rule_scores.pop();
continue;
}
cur_ranking_rule_index += 1;
ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates);
logger.start_iteration_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(),
&next_bucket.query,
&ranking_rule_universes[cur_ranking_rule_index],
);
ranking_rules[cur_ranking_rule_index].start_iteration(
ctx,
logger,
&next_bucket.candidates,
&next_bucket.query,
)?;
}
Ok(BucketSortOutput {
docids: valid_docids,
scores: valid_scores,
all_candidates,
degraded: false,
})
}
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
/// into account and inform the logger.
#[allow(clippy::too_many_arguments)]
fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
from: usize,
length: usize,
logger: &mut dyn SearchLogger<Q>,
valid_docids: &mut Vec<u32>,
valid_scores: &mut Vec<Vec<ScoreDetails>>,
all_candidates: &mut RoaringBitmap,
ranking_rule_universes: &mut [RoaringBitmap],
ranking_rules: &mut [BoxRankingRule<'ctx, Q>],
cur_ranking_rule_index: usize,
cur_offset: &mut usize,
distinct_fid: Option<u16>,
ranking_rule_scores: &[ScoreDetails],
candidates: RoaringBitmap,
) -> Result<()> {
// First apply the distinct rule on the candidates, reducing the universes if necessary
let candidates = if let Some(distinct_fid) = distinct_fid {
let DistinctOutput { remaining, excluded } =
apply_distinct_rule(ctx, distinct_fid, &candidates)?;
for universe in ranking_rule_universes.iter_mut() {
*universe -= &excluded;
*all_candidates -= &excluded;
}
remaining
} else {
candidates.clone()
};
*all_candidates |= &candidates;
// if the candidates are empty, there is nothing to do;
if candidates.is_empty() {
return Ok(());
}
// if we still haven't reached the first document to return
if *cur_offset < from {
// and if no document from this bucket can be returned
if *cur_offset + (candidates.len() as usize) < from {
// then just skip the bucket
logger.skip_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(),
&candidates,
);
} else {
// otherwise, skip some of the documents and add some of the rest, in order of ids
let candidates_vec = candidates.iter().collect::<Vec<_>>();
let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);
logger.skip_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(),
&skipped_candidates.iter().collect(),
);
let candidates =
candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
logger.add_to_results(&candidates);
valid_docids.extend_from_slice(&candidates);
valid_scores
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
}
} else {
// if we have passed the offset already, add some of the documents (up to the limit)
let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
logger.add_to_results(&candidates);
valid_docids.extend_from_slice(&candidates);
valid_scores
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
}
*cur_offset += candidates.len() as usize;
Ok(())
}

@@ -0,0 +1,719 @@
use std::borrow::Cow;
use std::collections::hash_map::Entry;
use std::hash::Hash;
use fxhash::FxHashMap;
use grenad::MergeFunction;
use heed::types::Bytes;
use heed::{BytesEncode, Database, RoTxn};
use roaring::RoaringBitmap;
use super::interner::Interned;
use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::proximity::ProximityPrecision;
use crate::update::MergeCboRoaringBitmaps;
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
};
/// A cache storing pointers to values in the LMDB databases.
///
/// Used for performance reasons only. By using this cache, we avoid performing a
/// database lookup and instead get a direct reference to the value using a fast
/// local HashMap lookup.
#[derive(Default)]
pub struct DatabaseCache<'ctx> {
pub word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
pub prefix_word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
pub exact_word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
pub word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
}
impl<'ctx> DatabaseCache<'ctx> {
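    /// Get a bitmap from `db`, caching the raw LMDB bytes under `cache_key` so that
    /// subsequent lookups skip the database. When a `universe` is provided, the
    /// serialized bitmap is intersected with it while decoding.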
fn get_value<'v, K1, KC>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
universe: Option<&RoaringBitmap>,
db: Database<KC, Bytes>,
) -> Result<Option<RoaringBitmap>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
entry.insert(bitmap_ptr);
}
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
}
}
fn get_value_length<'v, K1, KC>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
) -> Result<Option<u64>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
entry.insert(bitmap_ptr);
}
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into)
}
fn get_value_from_keys<'v, K1, KC, MF>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_keys: &'v [KC::EItem],
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
universe: Option<&RoaringBitmap>,
merger: MF,
) -> Result<Option<RoaringBitmap>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
KC::EItem: Sized,
MF: MergeFunction,
crate::Error: From<MF::Error>,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
[] => None,
[key] => db.get(txn, key)?.map(Cow::Borrowed),
keys => {
let bitmaps = keys
.iter()
.filter_map(|key| db.get(txn, key).transpose())
.map(|v| v.map(Cow::Borrowed))
.collect::<std::result::Result<Vec<Cow<'_, [u8]>>, _>>()?;
if bitmaps.is_empty() {
None
} else {
Some(merger.merge(&[], &bitmaps[..])?)
}
}
};
entry.insert(bitmap_ptr);
}
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
}
}
}
impl<'ctx> SearchContext<'ctx> {
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
if let Some(fst) = self.db_cache.words_fst.clone() {
Ok(fst)
} else {
let fst = self.index.words_fst(self.txn)?;
self.db_cache.words_fst = Some(fst.clone());
Ok(fst)
}
}
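    /// Fetch the docids containing the given word. Original query words are looked up
    /// in both the exact and the tolerant databases and the two bitmaps are unioned;
    /// derived words (from typos or prefixes) only ever come from the tolerant database.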
pub fn word_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Word,
) -> Result<Option<RoaringBitmap>> {
match word {
Word::Original(word) => {
let exact = self.get_db_exact_word_docids(universe, word)?;
let tolerant = self.get_db_word_docids(universe, word)?;
Ok(match (exact, tolerant) {
(None, None) => None,
(None, Some(tolerant)) => Some(tolerant),
(Some(exact), None) => Some(exact),
(Some(exact), Some(tolerant)) => {
let mut both = exact;
both |= tolerant;
Some(both)
}
})
}
Word::Derived(word) => self.get_db_word_docids(universe, word),
}
}
/// Retrieve or insert the given value in the `word_docids` database.
fn get_db_word_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
MergeCboRoaringBitmaps,
)
}
None => DatabaseCache::get_value(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
universe,
self.index.word_docids.remap_data_type::<Bytes>(),
),
}
}
fn get_db_exact_word_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
MergeCboRoaringBitmaps,
)
}
None => DatabaseCache::get_value(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids,
universe,
self.index.exact_word_docids.remap_data_type::<Bytes>(),
),
}
}
pub fn word_prefix_docids(
&mut self,
universe: Option<&RoaringBitmap>,
prefix: Word,
) -> Result<Option<RoaringBitmap>> {
match prefix {
Word::Original(prefix) => {
let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?;
let tolerant = self.get_db_word_prefix_docids(universe, prefix)?;
Ok(match (exact, tolerant) {
(None, None) => None,
(None, Some(tolerant)) => Some(tolerant),
(Some(exact), None) => Some(exact),
(Some(exact), Some(tolerant)) => {
let mut both = exact;
both |= tolerant;
Some(both)
}
})
}
Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix),
}
}
/// Retrieve or insert the given value in the `word_prefix_docids` database.
fn get_db_word_prefix_docids(
&mut self,
universe: Option<&RoaringBitmap>,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
MergeCboRoaringBitmaps,
)
}
None => DatabaseCache::get_value(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids,
universe,
self.index.word_prefix_docids.remap_data_type::<Bytes>(),
),
}
}
fn get_db_exact_word_prefix_docids(
&mut self,
universe: Option<&RoaringBitmap>,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
MergeCboRoaringBitmaps,
)
}
None => DatabaseCache::get_value(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids,
universe,
self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
),
}
}
pub fn get_db_word_pair_proximity_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>,
word2: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
ProximityPrecision::ByAttribute => {
// Force the proximity to 0 because, with ByAttribute precision, there are
// only 2 possible distances:
// 1. words in the same attribute: in that case the DB contains (0, word1, word2)
// 2. words in different attributes: no DB entry for these two words.
let proximity = 0;
let docids = if let Some(docids) =
self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2))
{
docids
.as_ref()
.map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
.transpose()
.map_err(heed::Error::Decoding)?
} else {
// Compute the distance at the attribute level and store it in the cache.
let fids = self.index.searchable_fields_ids(self.txn)?;
let mut docids = RoaringBitmap::new();
for fid in fids {
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?;
if let (Some(word1_docids), Some(word2_docids)) =
(word1_docids, word2_docids)
{
docids |= word1_docids & word2_docids;
}
}
let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
.map(Cow::into_owned)
.map(Cow::Owned)
.map(Some)
.map_err(heed::Error::Decoding)?;
self.db_cache
.word_pair_proximity_docids
.insert((proximity, word1, word2), encoded);
Some(docids)
};
Ok(docids)
}
ProximityPrecision::ByWord => DatabaseCache::get_value(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
universe,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
),
}
}
pub fn get_db_word_pair_proximity_docids_len(
&mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>,
word2: Interned<String>,
proximity: u8,
) -> Result<Option<u64>> {
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
ProximityPrecision::ByAttribute => Ok(self
.get_db_word_pair_proximity_docids(universe, word1, word2, proximity)?
.map(|d| d.len())),
ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
),
}
}
pub fn get_db_word_prefix_pair_proximity_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>,
prefix2: Interned<String>,
mut proximity: u8,
) -> Result<Option<RoaringBitmap>> {
let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default();
if proximity_precision == ProximityPrecision::ByAttribute {
// Force the proximity to 0 because, with ByAttribute precision, there are
// only 2 possible distances:
// 1. words in the same attribute: in that case the DB contains (0, word1, word2)
// 2. words in different attributes: no DB entry for these two words.
proximity = 0;
}
let docids = if let Some(docids) =
self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2))
{
docids.clone()
} else {
let prefix_docids = match proximity_precision {
ProximityPrecision::ByAttribute => {
// Compute the distance at the attribute level and store it in the cache.
let fids = self.index.searchable_fields_ids(self.txn)?;
let mut prefix_docids = RoaringBitmap::new();
// for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache.
for fid in fids {
let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
let prefix2_docids =
self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?;
if let (Some(word1_docids), Some(prefix2_docids)) =
(word1_docids, prefix2_docids)
{
prefix_docids |= word1_docids & prefix2_docids;
}
}
prefix_docids
}
ProximityPrecision::ByWord => {
// compute docids using prefix iter and store the result in the cache.
let key = U8StrStrCodec::bytes_encode(&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
))
.unwrap()
.into_owned();
let mut prefix_docids = RoaringBitmap::new();
let remap_key_type = self
.index
.word_pair_proximity_docids
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?;
for result in remap_key_type {
let (_, docids) = result?;
prefix_docids |= docids;
}
prefix_docids
}
};
self.db_cache
.word_prefix_pair_proximity_docids
.insert((proximity, word1, prefix2), Some(prefix_docids.clone()));
Some(prefix_docids)
};
Ok(docids)
}
pub fn get_db_prefix_word_pair_proximity_docids(
&mut self,
universe: Option<&RoaringBitmap>,
left_prefix: Interned<String>,
right: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
// only accept exact matches on reverted positions
self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity)
}
pub fn get_db_word_fid_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>,
fid: u16,
) -> Result<Option<RoaringBitmap>> {
// if the requested fid isn't in the restricted list, return None.
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
return Ok(None);
}
DatabaseCache::get_value(
self.txn,
(word, fid),
&(self.word_interner.get(word).as_str(), fid),
&mut self.db_cache.word_fid_docids,
universe,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
)
}
pub fn get_db_word_prefix_fid_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word_prefix: Interned<String>,
fid: u16,
) -> Result<Option<RoaringBitmap>> {
// if the requested fid isn't in the restricted list, return None.
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
return Ok(None);
}
DatabaseCache::get_value(
self.txn,
(word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids,
universe,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
)
}
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
let fids = match self.db_cache.word_fids.entry(word) {
Entry::Occupied(fids) => fids.get().clone(),
Entry::Vacant(entry) => {
let mut key = self.word_interner.get(word).as_bytes().to_owned();
key.push(0);
let mut fids = vec![];
let remap_key_type = self
.index
.word_fid_docids
.remap_types::<Bytes, Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, fid), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value)));
fids.push(fid);
}
entry.insert(fids.clone());
fids
}
};
Ok(fids)
}
pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
Entry::Occupied(fids) => fids.get().clone(),
Entry::Vacant(entry) => {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut fids = vec![];
let remap_key_type = self
.index
.word_prefix_fid_docids
.remap_types::<Bytes, Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, fid), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_fid_docids
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
fids.push(fid);
}
entry.insert(fids.clone());
fids
}
};
Ok(fids)
}
pub fn get_db_word_position_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word, position),
&(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids,
universe,
self.index.word_position_docids.remap_data_type::<Bytes>(),
)
}
pub fn get_db_word_prefix_position_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word_prefix: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids,
universe,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
)
}
pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
let positions = match self.db_cache.word_positions.entry(word) {
Entry::Occupied(positions) => positions.get().clone(),
Entry::Vacant(entry) => {
let mut key = self.word_interner.get(word).as_bytes().to_owned();
key.push(0);
let mut positions = vec![];
let remap_key_type = self
.index
.word_position_docids
.remap_types::<Bytes, Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, position), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_position_docids
.insert((word, position), Some(Cow::Borrowed(value)));
positions.push(position);
}
entry.insert(positions.clone());
positions
}
};
Ok(positions)
}
pub fn get_db_word_prefix_positions(
&mut self,
word_prefix: Interned<String>,
) -> Result<Vec<u16>> {
let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) {
Entry::Occupied(positions) => positions.get().clone(),
Entry::Vacant(entry) => {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut positions = vec![];
let remap_key_type = self
.index
.word_prefix_position_docids
.remap_types::<Bytes, Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, position), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_position_docids
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
positions.push(position);
}
entry.insert(positions.clone());
positions
}
};
Ok(positions)
}
}

@@ -0,0 +1,123 @@
use heed::types::{Bytes, Str, Unit};
use heed::{Database, RoPrefix, RoTxn};
use roaring::RoaringBitmap;
const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result, SearchContext};
pub struct DistinctOutput {
pub remaining: RoaringBitmap,
pub excluded: RoaringBitmap,
}
/// Return a [`DistinctOutput`] containing:
/// - `remaining`: a set of docids built such that exactly one element from `candidates`
/// is kept for each distinct value inside the given field. If the field does not exist, it
/// is considered unique.
/// - `excluded`: the set of document ids that contain a value for the given field that occurs
/// in the given candidates.
pub fn apply_distinct_rule(
ctx: &mut SearchContext<'_>,
field_id: u16,
candidates: &RoaringBitmap,
) -> Result<DistinctOutput> {
let mut excluded = RoaringBitmap::new();
let mut remaining = RoaringBitmap::new();
for docid in candidates {
if excluded.contains(docid) {
continue;
}
distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?;
remaining.push(docid);
}
Ok(DistinctOutput { remaining, excluded })
}
/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
pub fn distinct_single_docid(
index: &Index,
txn: &RoTxn<'_>,
field_id: u16,
docid: u32,
excluded: &mut RoaringBitmap,
) -> Result<()> {
for item in facet_string_values(docid, field_id, index, txn)? {
let ((_, _, facet_value), _) = item?;
if let Some(facet_docids) = facet_value_docids(
index.facet_id_string_docids.remap_types(),
txn,
field_id,
facet_value,
)? {
*excluded |= facet_docids;
}
}
for item in facet_number_values(docid, field_id, index, txn)? {
let ((_, _, facet_value), _) = item?;
if let Some(facet_docids) =
facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)?
{
*excluded |= facet_docids;
}
}
Ok(())
}
/// Return all the docids containing the given value in the given field
fn facet_value_docids(
database: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
txn: &RoTxn<'_>,
field_id: u16,
facet_value: &[u8],
) -> heed::Result<Option<RoaringBitmap>> {
database
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })
.map(|opt| opt.map(|v| v.bitmap))
}
/// Return an iterator over each number value in the given field of the given document.
fn facet_number_values<'a>(
docid: u32,
field_id: u16,
index: &Index,
txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Unit>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<Bytes>()
.prefix_iter(txn, &key)?
.remap_key_type();
Ok(iter)
}
/// Return an iterator over each string value in the given field of the given document.
pub fn facet_string_values<'a>(
docid: u32,
field_id: u16,
index: &Index,
txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Str>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<Bytes>()
.prefix_iter(txn, &key)?
.remap_types();
Ok(iter)
}
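/// Build the key prefix `[field_id (2 bytes BE) | docid (4 bytes BE)]` used to iterate
/// over all the facet values of one document in one field.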
#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}

@@ -0,0 +1,299 @@
use heed::types::Bytes;
use roaring::{MultiOps, RoaringBitmap};
use super::query_graph::QueryGraph;
use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::score_details::{self, ScoreDetails};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm;
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
/// A ranking rule that produces 3 disjoint buckets:
///
/// 1. Documents from the universe whose value is exactly the query.
/// 2. Documents from the universe not in (1) whose value starts with the query.
/// 3. Documents from the universe not in (1) or (2).
pub struct ExactAttribute {
state: State,
}
impl ExactAttribute {
pub fn new() -> Self {
Self { state: Default::default() }
}
}
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
fn id(&self) -> String {
"exact_attribute".to_owned()
}
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
universe: &roaring::RoaringBitmap,
query: &QueryGraph,
) -> Result<()> {
self.state = State::start_iteration(ctx, universe, query)?;
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
fn next_bucket(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
universe: &roaring::RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
let state = std::mem::take(&mut self.state);
let (state, output) = State::next(state, universe);
self.state = state;
Ok(output)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.state = Default::default();
}
}
/// Inner state of the ranking rule.
#[derive(Default)]
enum State {
/// State between two iterations
#[default]
Uninitialized,
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
/// but isn't the exact query.
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
/// The next calls to `next` will output the input universe.
Empty(QueryGraph),
}
/// The candidates sorted by attributes
///
/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field.
struct FieldCandidates {
/// The candidates that start with all the words of the query in the field
start_with_exact: RoaringBitmap,
/// The candidates that have the same number of words as the query in the field
exact_word_count: RoaringBitmap,
}
impl State {
fn start_iteration(
ctx: &mut SearchContext<'_>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<Self> {
struct ExactTermInfo {
exact_term: ExactTerm,
start_position: u16,
start_term_id: u8,
position_count: usize,
}
let mut exact_terms: Vec<ExactTermInfo> =
Vec::with_capacity(query_graph.nodes.len() as usize);
for (_, node) in query_graph.nodes.iter() {
match &node.data {
QueryNodeData::Term(term) => {
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
exact_term
} else {
continue;
};
exact_terms.push(ExactTermInfo {
exact_term,
start_position: *term.positions.start(),
start_term_id: *term.term_ids.start(),
position_count: term.positions.len(),
});
}
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
}
}
exact_terms.sort_by_key(|x| x.start_term_id);
exact_terms.dedup_by_key(|x| x.start_term_id);
let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count);
// bail if there is a "hole" (missing word) in the remaining query graph
if let Some(e) = exact_terms.first() {
if e.start_term_id != 0 {
return Ok(State::Empty(query_graph.clone()));
}
} else {
return Ok(State::Empty(query_graph.clone()));
}
let mut previous_id = 0;
for e in exact_terms.iter() {
if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 {
return Ok(State::Empty(query_graph.clone()));
} else {
previous_id = e.start_term_id;
}
}
// Sample query: "sunflower are pretty", where
// "sunflower" is at position 0 in attribute A,
// "are" is at position 1 in attribute B,
// and "pretty" is at position 2 in attribute C.
// We want to eliminate such documents: first check that, for each term, some
// attribute contains this term at the correct position, using the
// word_position_docids database.
let mut candidates = universe.clone();
let words_positions: Vec<(Vec<_>, _)> = exact_terms
.iter()
.map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position))
.collect();
for (words, position) in &words_positions {
if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone()));
}
'words: for (offset, word) in words.iter().enumerate() {
let offset = offset as u16;
let word = if let Some(word) = word {
word
} else {
continue 'words;
};
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
// longer phrases we'll be losing on precision here.
let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = ctx
.get_db_word_position_docids(Some(universe), *word, bucketed_position)?
.unwrap_or_default();
candidates &= word_position_docids;
if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone()));
}
}
}
let candidates = candidates;
if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone()));
}
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
// then check that there exists at least one attribute that has all of the terms
for fid in searchable_fields_ids {
let intersection = MultiOps::intersection(
words_positions
.iter()
.flat_map(|(words, ..)| words.iter())
// ignore stop words in phrases
.flatten()
.map(|word| -> Result<_> {
Ok(ctx
.get_db_word_fid_docids(Some(&candidates), *word, fid)?
.unwrap_or_default())
}),
)?;
if !intersection.is_empty() {
// Although not really worth it in terms of performance,
// it would be good to put this in the cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
let bitmap_bytes = ctx
.index
.field_id_word_count_docids
.remap_data_type::<Bytes>()
.get(ctx.txn, &(fid, count_all_positions as u8))?;
match bitmap_bytes {
Some(bytes) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
}
None => RoaringBitmap::default(),
}
} else {
RoaringBitmap::default()
};
candidates_per_attribute.push(FieldCandidates {
start_with_exact: intersection,
exact_word_count: candidates_with_exact_word_count,
});
}
}
// Note: we can have "false positives" where the per-position check passes across
// different attributes while the single attribute that has all the terms contains
// them in the incorrect order.
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
}
fn next(
state: State,
universe: &RoaringBitmap,
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
let (state, output) = match state {
State::Uninitialized => (state, None),
State::ExactAttribute(query_graph, candidates_per_attribute) => {
// TODO: it could be much faster to do the intersections before the unions...
// or maybe candidates_per_attribute does not contain anything outside the universe
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|FieldCandidates { start_with_exact, exact_word_count }| {
start_with_exact & exact_word_count
},
));
candidates &= universe;
(
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
Some(RankingRuleOutput {
query: query_graph,
candidates,
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::ExactMatch,
),
}),
)
}
State::AttributeStarts(query_graph, candidates_per_attribute) => {
// TODO: it could be much faster to do the intersections before the unions...
// or maybe candidates_per_attribute does not contain anything outside the universe
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|FieldCandidates { mut start_with_exact, exact_word_count }| {
start_with_exact -= exact_word_count;
start_with_exact
},
));
candidates &= universe;
(
State::Empty(query_graph.clone()),
Some(RankingRuleOutput {
query: query_graph,
candidates,
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::MatchesStart,
),
}),
)
}
State::Empty(query_graph) => (
State::Empty(query_graph.clone()),
Some(RankingRuleOutput {
query: query_graph,
candidates: universe.clone(),
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::NoExactMatch,
),
}),
),
};
(state, output)
}
}

View file

@ -0,0 +1,309 @@
use std::collections::VecDeque;
use std::iter::FromIterator;
use heed::types::{Bytes, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
use super::facet_string_values;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::score_details::{self, ScoreDetails};
use crate::{
distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
SearchLogger,
};
const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;
#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
/// Return an iterator over each number value in the given field of the given document.
fn facet_number_values<'a>(
docid: u32,
field_id: u16,
index: &Index,
txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<Bytes>()
.prefix_iter(txn, &key)?
.remap_key_type();
Ok(iter)
}
/// Define the strategy used by the geo sort.
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {
AlwaysIterative(usize),
AlwaysRtree(usize),
Dynamic(usize),
}
impl Default for Strategy {
fn default() -> Self {
Strategy::Dynamic(1000)
}
}
impl Strategy {
pub fn use_rtree(&self, candidates: usize) -> bool {
match self {
Strategy::AlwaysIterative(_) => false,
Strategy::AlwaysRtree(_) => true,
Strategy::Dynamic(i) => candidates >= *i,
}
}
pub fn cache_size(&self) -> usize {
match self {
Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i,
}
}
}
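// Illustrative check (a sketch added for clarity, not part of the original
// sources): `Dynamic(1000)` switches from the iterative scan to the rtree once
// the candidate set reaches 1000 documents, while the cache size is the same
// for every variant.
#[cfg(test)]
#[test]
fn strategy_picks_rtree_on_large_candidate_sets() {
    let strategy = Strategy::Dynamic(1000);
    assert!(!strategy.use_rtree(999));
    assert!(strategy.use_rtree(1_000));
    assert_eq!(strategy.cache_size(), 1_000);
}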
pub struct GeoSort<Q: RankingRuleQueryTrait> {
query: Option<Q>,
strategy: Strategy,
ascending: bool,
point: [f64; 2],
field_ids: Option<[u16; 2]>,
rtree: Option<RTree<GeoPoint>>,
cached_sorted_docids: VecDeque<(u32, [f64; 2])>,
geo_candidates: RoaringBitmap,
}
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
pub fn new(
strategy: Strategy,
geo_faceted_docids: RoaringBitmap,
point: [f64; 2],
ascending: bool,
) -> Result<Self> {
Ok(Self {
query: None,
strategy,
ascending,
point,
geo_candidates: geo_faceted_docids,
field_ids: None,
rtree: None,
cached_sorted_docids: VecDeque::new(),
})
}
/// Refill the internal buffer of cached docids based on the strategy.
/// Drop the rtree if we don't need it anymore.
fn fill_buffer(
&mut self,
ctx: &mut SearchContext<'_>,
geo_candidates: &RoaringBitmap,
) -> Result<()> {
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
debug_assert!(self.cached_sorted_docids.is_empty());
// lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree`
let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) {
if let Some(rtree) = self.rtree.as_ref() {
// get rtree from cache
Some(rtree)
} else {
let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree");
// insert the rtree into the cache and return it.
// Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation.
Some(&*self.rtree.insert(rtree))
}
} else {
None
};
let cache_size = self.strategy.cache_size();
if let Some(rtree) = rtree {
if self.ascending {
let point = lat_lng_to_xyz(&self.point);
for point in rtree.nearest_neighbor_iter(&point) {
if geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.push_back(point.data);
if self.cached_sorted_docids.len() >= cache_size {
break;
}
}
}
} else {
// in the case of the desc geo sort we look for the closest point to the antipode of the queried point,
// and we insert the points in reverse order; they get reversed back when emptying the cache later on
let point = lat_lng_to_xyz(&opposite_of(self.point));
for point in rtree.nearest_neighbor_iter(&point) {
if geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.push_front(point.data);
if self.cached_sorted_docids.len() >= cache_size {
break;
}
}
}
}
} else {
// the iterative version
let [lat, lng] = self.field_ids.unwrap();
let mut documents = geo_candidates
.iter()
.map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) })
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
// computing the distance between two points is expensive thus we cache the result
documents
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
self.cached_sorted_docids.extend(documents);
};
Ok(())
}
}
/// Extracts the lat and long values from a single document.
///
/// If it is not able to find it in the facet number index, it will extract it
/// from the facet string index and parse it as an f64 (matching how the geo extraction behaves).
fn geo_value(
docid: u32,
field_lat: u16,
field_lng: u16,
index: &Index,
rtxn: &RoTxn<'_>,
) -> Result<[f64; 2]> {
let extract_geo = |geo_field: u16| -> Result<f64> {
match facet_number_values(docid, geo_field, index, rtxn)?.next() {
Some(Ok(((_, _, geo), ()))) => Ok(geo),
Some(Err(e)) => Err(e.into()),
None => match facet_string_values(docid, geo_field, index, rtxn)?.next() {
Some(Ok((_, geo))) => {
Ok(geo.parse::<f64>().expect("cannot parse geo field as f64"))
}
Some(Err(e)) => Err(e.into()),
None => panic!("A geo faceted document doesn't contain any lat or lng"),
},
}
};
let lat = extract_geo(field_lat)?;
let lng = extract_geo(field_lng)?;
Ok([lat, lng])
}
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
fn id(&self) -> String {
"geo_sort".to_owned()
}
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<Q>,
universe: &RoaringBitmap,
query: &Q,
) -> Result<()> {
assert!(self.query.is_none());
self.query = Some(query.clone());
let geo_candidates = &self.geo_candidates & universe;
if geo_candidates.is_empty() {
return Ok(());
}
let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
self.field_ids = Some([lat, lng]);
self.fill_buffer(ctx, &geo_candidates)?;
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
#[allow(clippy::only_used_in_recursion)]
fn next_bucket(
&mut self,
ctx: &mut SearchContext<'ctx>,
logger: &mut dyn SearchLogger<Q>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Q>>> {
let query = self.query.as_ref().unwrap().clone();
let geo_candidates = &self.geo_candidates & universe;
if geo_candidates.is_empty() {
return Ok(Some(RankingRuleOutput {
query,
candidates: universe.clone(),
score: ScoreDetails::GeoSort(score_details::GeoSort {
target_point: self.point,
ascending: self.ascending,
value: None,
}),
}));
}
let ascending = self.ascending;
let next = |cache: &mut VecDeque<_>| {
if ascending {
cache.pop_front()
} else {
cache.pop_back()
}
};
while let Some((id, point)) = next(&mut self.cached_sorted_docids) {
if geo_candidates.contains(id) {
return Ok(Some(RankingRuleOutput {
query,
candidates: RoaringBitmap::from_iter([id]),
score: ScoreDetails::GeoSort(score_details::GeoSort {
target_point: self.point,
ascending: self.ascending,
value: Some(point),
}),
}));
}
}
// if we got out of this loop it means we've exhausted our cache.
// we need to refill it and run the function again.
self.fill_buffer(ctx, &geo_candidates)?;
self.next_bucket(ctx, logger, universe)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
// we do not reset the rtree here, it could be used in a next iteration
self.query = None;
self.cached_sorted_docids.clear();
}
}
/// Compute the antipodal coordinate of `coord`
fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] {
coord[0] *= -1.;
// in the case of x,0 we want to return x,180
if coord[1] > 0. {
coord[1] -= 180.;
} else {
coord[1] += 180.;
}
coord
}
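// A quick check of the antipode computation above (an illustrative sketch, not
// part of the original sources): the latitude flips sign and the longitude is
// shifted by 180 degrees towards the opposite hemisphere.
#[cfg(test)]
#[test]
fn opposite_of_flips_lat_and_shifts_lng() {
    assert_eq!(opposite_of([10.0, 70.0]), [-10.0, -110.0]);
    assert_eq!(opposite_of([0.0, 0.0]), [0.0, 180.0]);
    assert_eq!(opposite_of([-45.0, -90.0]), [45.0, 90.0]);
}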

View file

@ -0,0 +1,431 @@
/*! Implementation of a generic graph-based ranking rule.
A graph-based ranking rule is a ranking rule that works by representing
its possible operations and their relevancy cost as a directed acyclic multi-graph
built on top of the query graph. It then computes its buckets by finding the
cheapest paths from the start node to the end node and computing the document ids
that satisfy those paths.
For example, the proximity ranking rule builds a graph where the edges between two
nodes represent a condition that the term of the source node is in a certain proximity
to the term of the destination node. With the query "pretty house by" where the term
"pretty" has three possible proximities to the term "house" and "house" has two
proximities to "by", the graph will look like this:
```txt
START --0--> pretty --1/2/3--> house --1/2--> by --0--> END
```
The proximity ranking rule's first bucket will be determined by the union of all
the shortest paths from START to END, which in this case is:
```txt
START --0--> pretty --1--> house --1--> by --0--> END
```
The path's corresponding document ids are found by taking the intersection of the
document ids of each edge. That is, we find the documents where both `pretty` is
1-close to `house` AND `house` is 1-close to `by`.
For the second bucket, we get the union of the second-cheapest paths, which are:
```txt
START --0--> pretty --1--> house --2--> by --0--> END
START --0--> pretty --2--> house --1--> by --0--> END
```
That is, we find the documents where either:
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/
use std::collections::BTreeSet;
use std::ops::ControlFlow;
use roaring::RoaringBitmap;
use super::interner::{Interned, MappedInterner};
use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::ranking_rule_graph::{
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::score_details::Rank;
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::PathVisitor;
use crate::{Result, TermsMatchingStrategy};
pub type Words = GraphBasedRankingRule<WordsGraph>;
impl GraphBasedRankingRule<WordsGraph> {
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
}
}
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
impl GraphBasedRankingRule<ProximityGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
Self::new_with_id("proximity".to_owned(), terms_matching_strategy)
}
}
pub type Fid = GraphBasedRankingRule<FidGraph>;
impl GraphBasedRankingRule<FidGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
Self::new_with_id("fid".to_owned(), terms_matching_strategy)
}
}
pub type Position = GraphBasedRankingRule<PositionGraph>;
impl GraphBasedRankingRule<PositionGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
Self::new_with_id("position".to_owned(), terms_matching_strategy)
}
}
pub type Typo = GraphBasedRankingRule<TypoGraph>;
impl GraphBasedRankingRule<TypoGraph> {
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
}
}
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
impl GraphBasedRankingRule<ExactnessGraph> {
pub fn new() -> Self {
Self::new_with_id("exactness".to_owned(), None)
}
}
/// A generic graph-based ranking rule
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
id: String,
terms_matching_strategy: Option<TermsMatchingStrategy>,
// When the ranking rule is not iterating over its buckets,
// its state is `None`.
state: Option<GraphBasedRankingRuleState<G>>,
}
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
/// Creates the ranking rule with the given identifier
pub fn new_with_id(id: String, terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
Self { id, terms_matching_strategy, state: None }
}
}
/// The internal state of a graph-based ranking rule during iteration
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
/// The current graph
graph: RankingRuleGraph<G>,
/// Cache to retrieve the docids associated with each edge
conditions_cache: ConditionDocIdsCache<G>,
/// Cache used to optimistically discard paths that resolve to no documents.
dead_ends_cache: DeadEndsCache<G::Condition>,
/// A structure giving the list of possible costs from each node to the end node
all_costs: MappedInterner<QueryNode, Vec<u64>>,
/// An index in the first element of `all_costs`, giving the cost of the next bucket
cur_cost: u64,
/// One above the highest possible cost for this rule
next_max_cost: u64,
}
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
fn id(&self) -> String {
self.id.clone()
}
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<()> {
// the `next_max_cost` is the successor integer to the maximum cost of the paths in the graph.
//
// When there is a matching strategy, it also factors the additional costs of:
// 1. The words that are matched in phrases
// 2. Skipping words (by adding them to the paths with a cost)
let mut next_max_cost = 1;
let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
// add the cost of the phrase to the next_max_cost
next_max_cost += query_graph
.words_in_phrases_count(ctx)
// remove 1 from the words in phrases count, because when there is a phrase we can now have a document
// where only the phrase is matching, and none of the non-phrase words.
// With the `1` that `next_max_cost` is initialized with, this gets counted twice.
.saturating_sub(1) as u64;
match terms_matching_strategy {
TermsMatchingStrategy::Last => {
let removal_order =
query_graph.removal_order_for_terms_matching_strategy_last(ctx);
let mut forbidden_nodes =
SmallBitmap::for_interned_values_in(&query_graph.nodes);
let mut costs = query_graph.nodes.map(|_| None);
// FIXME: this works because only the words ranking rule uses TermsMatchingStrategy at the moment.
for ns in removal_order {
for n in ns.iter() {
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
}
forbidden_nodes.union(&ns);
}
costs
}
TermsMatchingStrategy::Frequency => {
let removal_order =
query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
let mut forbidden_nodes =
SmallBitmap::for_interned_values_in(&query_graph.nodes);
let mut costs = query_graph.nodes.map(|_| None);
// FIXME: this works because only the words ranking rule uses TermsMatchingStrategy at the moment.
for ns in removal_order {
for n in ns.iter() {
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
}
forbidden_nodes.union(&ns);
}
costs
}
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
}
} else {
query_graph.nodes.map(|_| None)
};
let graph = RankingRuleGraph::build(ctx, query_graph.clone(), removal_cost)?;
let condition_docids_cache = ConditionDocIdsCache::default();
let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner);
// Then pre-compute the cost of all paths from each node to the end node
let all_costs = graph.find_all_costs_to_end();
next_max_cost +=
all_costs.get(graph.query_graph.root_node).iter().copied().max().unwrap_or(0);
let state = GraphBasedRankingRuleState {
graph,
conditions_cache: condition_docids_cache,
dead_ends_cache,
all_costs,
cur_cost: 0,
next_max_cost,
};
self.state = Some(state);
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
fn next_bucket(
&mut self,
ctx: &mut SearchContext<'ctx>,
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`;
// this should never happen
let mut state = self.state.take().unwrap();
let all_costs = state.all_costs.get(state.graph.query_graph.root_node);
// Retrieve the cost of the paths to compute
let Some(&cost) = all_costs.iter().find(|c| **c >= state.cur_cost) else {
self.state = None;
return Ok(None);
};
state.cur_cost = cost + 1;
let mut bucket = RoaringBitmap::new();
let GraphBasedRankingRuleState {
graph,
conditions_cache: condition_docids_cache,
dead_ends_cache,
all_costs,
cur_cost: _,
next_max_cost,
} = &mut state;
let rank = *next_max_cost - cost;
let score = G::rank_to_score(Rank { rank: rank as u32, max_rank: *next_max_cost as u32 });
let mut universe = universe.clone();
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
let mut good_paths = vec![];
let mut considered_paths = vec![];
// For each path of the given cost, we will compute its associated
// document ids.
// In case the path does not resolve to any document id, we try to figure out why
// and update the `dead_ends_cache` accordingly.
// Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces
// the number of future candidate paths given by that same function.
let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
considered_paths.push(path.to_vec());
// If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
if universe.is_empty() {
return Ok(ControlFlow::Break(()));
}
// `visit_paths` performs a depth-first search, so the previously visited path
// is likely to share a prefix with the current one.
// We stored the previous path and the docids associated to each of its prefixes in `subpaths_docids`.
// We take advantage of this to avoid computing the docids associated with the common prefix between
// the old and current path.
let idx_of_first_different_condition = {
let mut idx = 0;
for (&last_c, cur_c) in path.iter().zip(subpaths_docids.iter().map(|x| x.0)) {
if last_c == cur_c {
idx += 1;
} else {
break;
}
}
subpaths_docids.truncate(idx);
idx
};
// Then for the remaining of the path, we continue computing docids.
for latest_condition in path[idx_of_first_different_condition..].iter().copied() {
let success = visit_path_condition(
ctx,
graph,
&universe,
dead_ends_cache,
condition_docids_cache,
&mut subpaths_docids,
&mut nodes_with_removed_outgoing_conditions,
latest_condition,
)?;
if !success {
return Ok(ControlFlow::Continue(()));
}
}
assert!(subpaths_docids.iter().map(|x| x.0).eq(path.iter().copied()));
let path_docids =
subpaths_docids.pop().map(|x| x.1).unwrap_or_else(|| universe.clone());
assert!(!path_docids.is_empty());
// Accumulate the path for logging purposes only
good_paths.push(path.to_vec());
for &condition in path {
used_conditions.insert(condition);
}
bucket |= &path_docids;
// Reduce the size of the universe so that we can more optimistically discard candidate paths
universe -= &path_docids;
for (_, docids) in subpaths_docids.iter_mut() {
*docids -= &path_docids;
}
if universe.is_empty() {
Ok(ControlFlow::Break(()))
} else {
Ok(ControlFlow::Continue(()))
}
})?;
logger.log_internal_state(graph);
logger.log_internal_state(&good_paths);
// We modify the next query graph so that it only contains the subgraph
// that was used to compute this bucket
let paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>> = good_paths
.into_iter()
.map(|path| {
path.into_iter()
.map(|condition| {
let (a, b) =
condition_docids_cache.get_subsets_used_by_condition(condition);
(a.clone(), b.clone())
})
.collect()
})
.collect();
let next_query_graph = QueryGraph::build_from_paths(paths);
#[allow(clippy::comparison_chain)]
if nodes_with_removed_outgoing_conditions.len() == 1 {
graph.update_all_costs_before_node(
*nodes_with_removed_outgoing_conditions.first().unwrap(),
all_costs,
);
} else if nodes_with_removed_outgoing_conditions.len() > 1 {
*all_costs = graph.find_all_costs_to_end();
}
self.state = Some(state);
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket, score }))
}
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.state = None;
}
}
/// Returns false if the intersection between the condition
/// docids and the previous path docids is empty.
#[allow(clippy::too_many_arguments)]
fn visit_path_condition<G: RankingRuleGraphTrait>(
ctx: &mut SearchContext<'_>,
graph: &mut RankingRuleGraph<G>,
universe: &RoaringBitmap,
dead_ends_cache: &mut DeadEndsCache<G::Condition>,
condition_docids_cache: &mut ConditionDocIdsCache<G>,
subpath: &mut Vec<(Interned<G::Condition>, RoaringBitmap)>,
nodes_with_removed_outgoing_conditions: &mut BTreeSet<Interned<QueryNode>>,
latest_condition: Interned<G::Condition>,
) -> Result<bool> {
let condition_docids = &condition_docids_cache
.get_computed_condition(ctx, latest_condition, graph, universe)?
.docids;
if condition_docids.is_empty() {
// 1. Store in the cache that this edge is empty for this universe
dead_ends_cache.forbid_condition(latest_condition);
// 2. remove all the edges with this condition from the ranking rule graph
let source_nodes = graph.remove_edges_with_condition(latest_condition);
nodes_with_removed_outgoing_conditions.extend(source_nodes);
return Ok(false);
}
let latest_path_docids = if let Some((_, prev_docids)) = subpath.last() {
prev_docids & condition_docids
} else {
condition_docids.clone()
};
if !latest_path_docids.is_empty() {
subpath.push((latest_condition, latest_path_docids));
return Ok(true);
}
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
// First, we know that this path is empty, and thus any path
// that is a superset of it will also be empty.
dead_ends_cache.forbid_condition_after_prefix(subpath.iter().map(|x| x.0), latest_condition);
if subpath.len() <= 1 {
return Ok(false);
}
let mut subprefix = vec![];
// It's a dead end if the intersection between this edge and any
// previous prefix is disjoint with the universe.
// We already know that the intersection with the last one
// is empty.
for (past_condition, sp_docids) in subpath[..subpath.len() - 1].iter() {
subprefix.push(*past_condition);
if condition_docids.is_disjoint(sp_docids) {
dead_ends_cache
.forbid_condition_after_prefix(subprefix.iter().copied(), latest_condition);
}
}
Ok(false)
}

View file

@ -0,0 +1,259 @@
use std::fmt;
use std::hash::Hash;
use std::marker::PhantomData;
use fxhash::FxHashMap;
use super::small_bitmap::SmallBitmap;
/// An index within an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]).
pub struct Interned<T> {
idx: u16,
_phantom: PhantomData<T>,
}
impl<T> Interned<T> {
/// Create an interned value manually from its raw index within the interner.
pub fn from_raw(idx: u16) -> Self {
Self { idx, _phantom: PhantomData }
}
/// Get the raw index from the interned value
pub fn into_raw(self) -> u16 {
self.idx
}
}
/// A [`DedupInterner`] is used to store a unique copy of a value of type `T`. This value
/// is then identified by a lightweight index of type [`Interned<T>`], which can
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
/// can be retrieved using `self.get(interned)`. A set of values within the interner can be
/// efficiently managed using [`SmallBitmap<T>`](super::small_bitmap::SmallBitmap).
///
/// A dedup-interner can contain a maximum of `u16::MAX` values.
#[derive(Clone)]
pub struct DedupInterner<T> {
stable_store: Vec<T>,
lookup: FxHashMap<T, Interned<T>>,
}
impl<T> Default for DedupInterner<T> {
fn default() -> Self {
Self { stable_store: Default::default(), lookup: Default::default() }
}
}
impl<T> DedupInterner<T> {
/// Convert the dedup-interner into a fixed-size interner, such that new
/// elements cannot be added to it anymore.
pub fn freeze(self) -> FixedSizeInterner<T> {
FixedSizeInterner { stable_store: self.stable_store }
}
}
impl<T> DedupInterner<T>
where
T: Clone + Eq + Hash,
{
/// Insert the given value into the dedup-interner, and return
/// its index.
pub fn insert(&mut self, s: T) -> Interned<T> {
if let Some(interned) = self.lookup.get(&s) {
*interned
} else {
assert!(self.stable_store.len() < u16::MAX as usize);
self.stable_store.push(s.clone());
let interned = Interned::from_raw(self.stable_store.len() as u16 - 1);
self.lookup.insert(s, interned);
interned
}
}
/// Get a reference to the interned value.
pub fn get(&self, interned: Interned<T>) -> &T {
&self.stable_store[interned.idx as usize]
}
}
/// A fixed-length store for values of type `T`, where each value is identified
/// by an index of type [`Interned<T>`].
#[derive(Clone)]
pub struct FixedSizeInterner<T> {
stable_store: Vec<T>,
}
impl<T: Clone> FixedSizeInterner<T> {
/// Create a fixed-size interner of the given length containing
/// clones of the given value.
pub fn new(length: u16, value: T) -> Self {
Self { stable_store: vec![value; length as usize] }
}
}
impl<T> FixedSizeInterner<T> {
pub fn from_vec(store: Vec<T>) -> Self {
Self { stable_store: store }
}
pub fn all_interned_values(&self) -> SmallBitmap<T> {
let mut b = SmallBitmap::for_interned_values_in(self);
for i in self.indexes() {
b.insert(i);
}
b
}
pub fn get(&self, interned: Interned<T>) -> &T {
&self.stable_store[interned.idx as usize]
}
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
&mut self.stable_store[interned.idx as usize]
}
pub fn len(&self) -> u16 {
self.stable_store.len() as u16
}
pub fn map_move<U>(self, map_f: impl Fn(T) -> U) -> FixedSizeInterner<U> {
FixedSizeInterner { stable_store: self.stable_store.into_iter().map(map_f).collect() }
}
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
MappedInterner {
stable_store: self.stable_store.iter().map(map_f).collect(),
_phantom: PhantomData,
}
}
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
}
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
}
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
}
/// A fixed-length store for values of type `T`, where each value is identified
/// by an index of type [`Interned<T>`].
#[derive(Clone)]
pub struct Interner<T> {
stable_store: Vec<T>,
}
impl<T> Default for Interner<T> {
fn default() -> Self {
Self { stable_store: vec![] }
}
}
impl<T> Interner<T> {
pub fn from_vec(v: Vec<T>) -> Self {
Self { stable_store: v }
}
pub fn get(&self, interned: Interned<T>) -> &T {
&self.stable_store[interned.idx as usize]
}
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
&mut self.stable_store[interned.idx as usize]
}
pub fn push(&mut self, value: T) -> Interned<T> {
assert!(self.stable_store.len() < u16::MAX as usize);
self.stable_store.push(value);
Interned::from_raw(self.stable_store.len() as u16 - 1)
}
pub fn len(&self) -> u16 {
self.stable_store.len() as u16
}
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
MappedInterner {
stable_store: self.stable_store.iter().map(map_f).collect(),
_phantom: PhantomData,
}
}
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
}
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
}
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
pub fn freeze(self) -> FixedSizeInterner<T> {
FixedSizeInterner { stable_store: self.stable_store }
}
}
/// A store of values of type `T`, each linked to a value of type `From`
/// stored in another interner. To create a mapped interner, use the
/// `map` method on [`FixedSizeInterner`] or [`MappedInterner`].
///
/// Values in this interner are indexed with [`Interned<From>`].
#[derive(Clone)]
pub struct MappedInterner<From, T> {
stable_store: Vec<T>,
_phantom: PhantomData<From>,
}
impl<From, T> MappedInterner<From, T> {
pub fn get(&self, interned: Interned<From>) -> &T {
&self.stable_store[interned.idx as usize]
}
pub fn get_mut(&mut self, interned: Interned<From>) -> &mut T {
&mut self.stable_store[interned.idx as usize]
}
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<From, U> {
MappedInterner {
stable_store: self.stable_store.iter().map(map_f).collect(),
_phantom: PhantomData,
}
}
pub fn iter(&self) -> impl Iterator<Item = (Interned<From>, &T)> {
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<From>, &mut T)> {
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
}
}
// Interned<T> boilerplate implementations
impl<T> Hash for Interned<T> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.idx.hash(state);
}
}
impl<T> Ord for Interned<T> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.idx.cmp(&other.idx)
}
}
impl<T> PartialOrd for Interned<T> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<T> Eq for Interned<T> {}
impl<T> PartialEq for Interned<T> {
fn eq(&self, other: &Self) -> bool {
self.idx == other.idx
}
}
impl<T> Clone for Interned<T> {
fn clone(&self) -> Self {
*self
}
}
impl<T> Copy for Interned<T> {}
impl<T> fmt::Display for Interned<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.idx, f)
}
}
impl<T> fmt::Debug for Interned<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&self.idx, f)
}
}
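// Illustrative tests (a sketch, not part of the original sources) showing the
// interner contract: inserting an equal value twice yields the same
// `Interned<T>`, and freezing keeps indexes and values stable.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dedup_interner_returns_the_same_index_for_equal_values() {
        let mut interner = DedupInterner::<String>::default();
        let a = interner.insert("house".to_owned());
        let b = interner.insert("house".to_owned());
        let c = interner.insert("pretty".to_owned());
        // "house" was deduplicated, "pretty" got a fresh index.
        assert_eq!(a, b);
        assert_ne!(a, c);

        let frozen = interner.freeze();
        assert_eq!(frozen.get(a), "house");
        assert_eq!(frozen.len(), 2);
    }
}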

View file

@ -0,0 +1,17 @@
/// Maximum number of tokens we consider in a single search.
pub const MAX_TOKEN_COUNT: usize = 1_000;
/// Maximum number of prefixes that can be derived from a single word.
pub const MAX_PREFIX_COUNT: usize = 1_000;
/// Maximum number of words that can be derived from a single word with a distance of one to that word.
pub const MAX_ONE_TYPO_COUNT: usize = 150;
/// Maximum number of words that can be derived from a single word with a distance of two to that word.
pub const MAX_TWO_TYPOS_COUNT: usize = 50;
/// Maximum number of synonym phrases that can be derived from a single word.
pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;
/// Maximum number of words inside all the synonym phrases that can be derived from a single word.
///
/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
pub const MAX_SYNONYM_WORD_COUNT: usize = 100;
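// Illustrative sketch (not part of the original sources): these constants are
// meant to cap derived candidate lists, e.g. keeping at most MAX_PREFIX_COUNT
// prefixes for a single word.
#[cfg(test)]
#[test]
fn limits_cap_candidate_lists() {
    let prefixes: Vec<u32> = (0..5_000).collect();
    let kept: Vec<u32> = prefixes.into_iter().take(MAX_PREFIX_COUNT).collect();
    assert_eq!(kept.len(), MAX_PREFIX_COUNT);
}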

View file

@ -0,0 +1,81 @@
// #[cfg(test)]
pub mod visual;
use std::any::Any;
use roaring::RoaringBitmap;
use super::ranking_rules::BoxRankingRule;
use super::{RankingRule, RankingRuleQueryTrait};
/// Trait for structures that log the execution of a search query.
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
/// Logs the initial query
fn initial_query(&mut self, _query: &Q);
/// Logs the value of the initial set of all candidates
fn initial_universe(&mut self, _universe: &RoaringBitmap);
/// Logs the query that was used to compute the set of all candidates
fn query_for_initial_universe(&mut self, _query: &Q);
/// Logs the ranking rules used to perform the search query
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]);
/// Logs the start of a ranking rule's iteration.
fn start_iteration_ranking_rule(
&mut self,
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, Q>,
_query: &Q,
_universe: &RoaringBitmap,
) {
}
/// Logs the end of the computation of a ranking rule bucket
fn next_bucket_ranking_rule(
&mut self,
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, Q>,
_universe: &RoaringBitmap,
_candidates: &RoaringBitmap,
) {
}
/// Logs the skipping of a ranking rule bucket
fn skip_bucket_ranking_rule(
&mut self,
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, Q>,
_candidates: &RoaringBitmap,
) {
}
/// Logs the end of a ranking rule's iteration.
fn end_iteration_ranking_rule(
&mut self,
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, Q>,
_universe: &RoaringBitmap,
) {
}
/// Logs the addition of document ids to the final results
fn add_to_results(&mut self, _docids: &[u32]);
/// Logs an internal state in the search algorithms
fn log_internal_state(&mut self, _rr: &dyn Any);
}
/// A dummy [`SearchLogger`] which does nothing.
pub struct DefaultSearchLogger;
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
fn initial_query(&mut self, _query: &Q) {}
fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
fn query_for_initial_universe(&mut self, _query: &Q) {}
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]) {}
fn add_to_results(&mut self, _docids: &[u32]) {}
fn log_internal_state(&mut self, _rr: &dyn Any) {}
}
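// A minimal sketch (not part of the original sources) of a custom logger:
// it only counts how many documents reached the final results and relies on
// the trait's default no-op methods for everything else.
#[cfg(test)]
#[allow(dead_code)]
struct CountingLogger {
    returned: usize,
}

#[cfg(test)]
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for CountingLogger {
    fn initial_query(&mut self, _query: &Q) {}
    fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
    fn query_for_initial_universe(&mut self, _query: &Q) {}
    fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]) {}
    fn add_to_results(&mut self, docids: &[u32]) {
        self.returned += docids.len();
    }
    fn log_internal_state(&mut self, _rr: &dyn Any) {}
}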

View file

@ -0,0 +1,554 @@
use std::any::Any;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::Instant;
use roaring::RoaringBitmap;
use crate::search::new::interner::Interned;
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::{
Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
WordsCondition, WordsGraph,
};
use crate::search::new::ranking_rules::BoxRankingRule;
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
use crate::Result;
pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize },
ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
TypoGraph { graph: RankingRuleGraph<TypoGraph> },
TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
WordsGraph { graph: RankingRuleGraph<WordsGraph> },
WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
FidGraph { graph: RankingRuleGraph<FidGraph> },
FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
PositionGraph { graph: RankingRuleGraph<PositionGraph> },
PositionPaths { paths: Vec<Vec<Interned<PositionCondition>>> },
}
enum Location {
Words,
Typo,
Proximity,
Fid,
Position,
Other,
}
#[derive(Default)]
pub struct VisualSearchLogger {
initial_query: Option<QueryGraph>,
initial_query_time: Option<Instant>,
query_for_universe: Option<QueryGraph>,
initial_universe: Option<RoaringBitmap>,
ranking_rules_ids: Option<Vec<String>>,
events: Vec<SearchEvents>,
location: Vec<Location>,
}
impl SearchLogger<QueryGraph> for VisualSearchLogger {
fn initial_query(&mut self, query: &QueryGraph) {
self.initial_query = Some(query.clone());
self.initial_query_time = Some(Instant::now());
}
fn query_for_initial_universe(&mut self, query: &QueryGraph) {
self.query_for_universe = Some(query.clone());
}
fn initial_universe(&mut self, universe: &RoaringBitmap) {
self.initial_universe = Some(universe.clone());
}
fn ranking_rules(&mut self, rr: &[BoxRankingRule<'_, QueryGraph>]) {
self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
}
fn start_iteration_ranking_rule(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'_, QueryGraph>,
_query: &QueryGraph,
universe: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleStartIteration {
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.push(match ranking_rule.id().as_str() {
"words" => Location::Words,
"typo" => Location::Typo,
"proximity" => Location::Proximity,
"fid" => Location::Fid,
"position" => Location::Position,
_ => Location::Other,
});
}
fn next_bucket_ranking_rule(
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
universe: &RoaringBitmap,
bucket: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleNextBucket {
ranking_rule_idx,
universe_len: universe.len(),
bucket_len: bucket.len(),
});
}
fn skip_bucket_ranking_rule(
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
bucket: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleSkipBucket {
ranking_rule_idx,
bucket_len: bucket.len(),
})
}
fn end_iteration_ranking_rule(
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
_universe: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
self.location.pop();
}
fn add_to_results(&mut self, docids: &[u32]) {
self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() });
}
/// Logs the internal state of the ranking rule
fn log_internal_state(&mut self, state: &dyn Any) {
let Some(location) = self.location.last() else { return };
match location {
Location::Words => {
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
}
}
Location::Typo => {
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<TypoGraph>>() {
self.events.push(SearchEvents::TypoGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<TypoCondition>>>>() {
self.events.push(SearchEvents::TypoPaths { paths: paths.clone() });
}
}
Location::Proximity => {
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<ProximityGraph>>() {
self.events.push(SearchEvents::ProximityGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<ProximityCondition>>>>()
{
self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() });
}
}
Location::Fid => {
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<FidGraph>>() {
self.events.push(SearchEvents::FidGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<FidCondition>>>>() {
self.events.push(SearchEvents::FidPaths { paths: paths.clone() });
}
}
Location::Position => {
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<PositionGraph>>() {
self.events.push(SearchEvents::PositionGraph { graph: graph.clone() });
}
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<PositionCondition>>>>() {
self.events.push(SearchEvents::PositionPaths { paths: paths.clone() });
}
}
Location::Other => {}
}
}
}
impl VisualSearchLogger {
pub fn finish<'ctx>(self, ctx: &'ctx mut SearchContext<'ctx>, folder: &Path) -> Result<()> {
let mut f = DetailedLoggerFinish::new(ctx, folder)?;
f.finish(self)?;
Ok(())
}
}
struct DetailedLoggerFinish<'ctx> {
ctx: &'ctx mut SearchContext<'ctx>,
/// The folder where all the files should be printed
folder_path: PathBuf,
/// The main file visualising the search request
index_file: BufWriter<File>,
/// A vector of counters where each counter at index i represents the number of times
/// that the ranking rule at idx i-1 was called since its last call to `start_iteration`.
/// This is used to uniquely identify a point in the sequence diagram.
rr_action_counter: Vec<usize>,
/// The file storing information about the internal state of the latest active ranking rule
file_for_internal_state: Option<BufWriter<File>>,
}
impl<'ctx> DetailedLoggerFinish<'ctx> {
fn cur_file(&mut self) -> &mut BufWriter<File> {
if let Some(file) = self.file_for_internal_state.as_mut() {
file
} else {
&mut self.index_file
}
}
fn pop_rr_action(&mut self) {
self.file_for_internal_state = None;
self.rr_action_counter.pop();
}
fn push_new_rr_action(&mut self) {
self.file_for_internal_state = None;
self.rr_action_counter.push(0);
}
fn increment_cur_rr_action(&mut self) {
self.file_for_internal_state = None;
if let Some(c) = self.rr_action_counter.last_mut() {
*c += 1;
}
}
fn id_of_timestamp(&self) -> String {
let mut s = String::new();
for t in self.rr_action_counter.iter() {
s.push_str(&format!("{t}_"));
}
s
}
fn id_of_extend_results(&self) -> String {
let mut s = String::new();
s.push_str("results.\"");
s.push_str(&self.id_of_timestamp());
s.push('"');
s
}
fn id_of_last_rr_action(&self) -> String {
let mut s = String::new();
let rr_id = if self.rr_action_counter.is_empty() {
"start.\"".to_owned()
} else {
format!("{}.\"", self.rr_action_counter.len() - 1)
};
s.push_str(&rr_id);
s.push_str(&self.id_of_timestamp());
s.push('"');
s
}
fn make_new_file_for_internal_state_if_needed(&mut self) -> Result<()> {
if self.file_for_internal_state.is_some() {
return Ok(());
}
let timestamp = self.id_of_timestamp();
let id = self.id_of_last_rr_action();
let new_file_path = self.folder_path.join(format!("{timestamp}.d2"));
self.file_for_internal_state = Some(BufWriter::new(File::create(new_file_path)?));
writeln!(
&mut self.index_file,
"{id} {{
link: \"{timestamp}.d2.svg\"
}}"
)?;
Ok(())
}
fn new(ctx: &'ctx mut SearchContext<'ctx>, folder_path: &Path) -> Result<Self> {
let index_path = folder_path.join("index.d2");
let index_file = BufWriter::new(File::create(index_path)?);
Ok(Self {
ctx,
folder_path: folder_path.to_owned(),
index_file,
rr_action_counter: vec![],
file_for_internal_state: None,
})
}
fn finish(&mut self, logger: VisualSearchLogger) -> Result<()> {
writeln!(&mut self.index_file, "direction: right")?;
if let Some(qg) = logger.initial_query {
writeln!(&mut self.index_file, "Initial Query Graph: {{")?;
self.write_query_graph(&qg)?;
writeln!(&mut self.index_file, "}}")?;
}
if let Some(qg) = logger.query_for_universe {
writeln!(&mut self.index_file, "Query Graph Used To Compute Universe: {{")?;
self.write_query_graph(&qg)?;
writeln!(&mut self.index_file, "}}")?;
}
let Some(ranking_rules_ids) = logger.ranking_rules_ids else { return Ok(()) };
writeln!(&mut self.index_file, "Control Flow Between Ranking Rules: {{")?;
writeln!(&mut self.index_file, "shape: sequence_diagram")?;
writeln!(&mut self.index_file, "start")?;
for (idx, rr_id) in ranking_rules_ids.iter().enumerate() {
writeln!(&mut self.index_file, "{idx}: {rr_id}")?;
}
writeln!(&mut self.index_file, "results")?;
for event in logger.events {
self.write_event(event)?;
}
writeln!(&mut self.index_file, "}}")?;
Ok(())
}
fn write_event(&mut self, e: SearchEvents) -> Result<()> {
match e {
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, universe_len } => {
assert!(ranking_rule_idx == self.rr_action_counter.len());
self.write_start_iteration(universe_len)?;
}
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe_len, bucket_len } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_next_bucket(bucket_len, universe_len)?;
}
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, bucket_len } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?;
}
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?;
}
SearchEvents::ExtendResults { new } => {
self.write_extend_results(new)?;
}
SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::ProximityPaths { paths } => {
self.write_rr_graph_paths::<ProximityGraph>(paths)?;
}
SearchEvents::TypoGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::TypoPaths { paths } => {
self.write_rr_graph_paths::<TypoGraph>(paths)?;
}
SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::WordsPaths { paths } => {
self.write_rr_graph_paths::<WordsGraph>(paths)?;
}
SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::FidPaths { paths } => {
self.write_rr_graph_paths::<FidGraph>(paths)?;
}
SearchEvents::PositionGraph { graph } => self.write_rr_graph(&graph)?,
SearchEvents::PositionPaths { paths } => {
self.write_rr_graph_paths::<PositionGraph>(paths)?;
}
}
Ok(())
}
fn write_query_graph(&mut self, qg: &QueryGraph) -> Result<()> {
writeln!(self.cur_file(), "direction: right")?;
for (node_id, node) in qg.nodes.iter() {
if matches!(node.data, QueryNodeData::Deleted) {
continue;
}
self.write_query_node(node_id, node)?;
for edge in node.successors.iter() {
writeln!(self.cur_file(), "{node_id} -> {edge};\n").unwrap();
}
}
Ok(())
}
fn write_start_iteration(&mut self, _universe_len: u64) -> Result<()> {
let parent_action_id = self.id_of_last_rr_action();
self.push_new_rr_action();
let self_action_id = self.id_of_last_rr_action();
writeln!(&mut self.index_file, "{parent_action_id} -> {self_action_id} : start iteration")?;
writeln!(
&mut self.index_file,
"{self_action_id} {{
style {{
fill: \"#D8A7B1\"
}}
}}"
)?;
Ok(())
}
fn write_next_bucket(&mut self, bucket_len: u64, universe_len: u64) -> Result<()> {
let cur_action_id = self.id_of_last_rr_action();
self.increment_cur_rr_action();
let next_action_id = self.id_of_last_rr_action();
writeln!(
&mut self.index_file,
"{cur_action_id} -> {next_action_id} : next bucket {bucket_len}/{universe_len}"
)?;
Ok(())
}
fn write_skip_bucket(&mut self, bucket_len: u64) -> Result<()> {
let cur_action_id = self.id_of_last_rr_action();
self.increment_cur_rr_action();
let next_action_id = self.id_of_last_rr_action();
writeln!(
&mut self.index_file,
"{cur_action_id} -> {next_action_id} : skip bucket ({bucket_len})"
)?;
Ok(())
}
fn write_end_iteration(&mut self) -> Result<()> {
let cur_action_id = self.id_of_last_rr_action();
self.pop_rr_action();
let parent_action_id = self.id_of_last_rr_action();
writeln!(&mut self.index_file, "{cur_action_id} -> {parent_action_id} : end iteration",)?;
Ok(())
}
fn write_extend_results(&mut self, new: Vec<u32>) -> Result<()> {
if new.is_empty() {
return Ok(());
}
let cur_action_id = self.id_of_last_rr_action();
let results_id = self.id_of_extend_results();
let docids = new.iter().collect::<Vec<_>>();
let len = new.len();
writeln!(
&mut self.index_file,
"{cur_action_id} -> {results_id} : \"add {len}\"
{results_id} {{
tooltip: \"{docids:?}\"
style {{
fill: \"#B6E2D3\"
}}
}}
"
)?;
Ok(())
}
fn write_query_node(&mut self, node_idx: Interned<QueryNode>, node: &QueryNode) -> Result<()> {
let Self {
ctx, index_file, file_for_internal_state: active_ranking_rule_state_file, ..
} = self;
let file = if let Some(file) = active_ranking_rule_state_file.as_mut() {
file
} else {
index_file
};
match &node.data {
QueryNodeData::Term(LocatedQueryTermSubset {
term_subset,
positions: _,
term_ids: _,
}) => {
writeln!(
file,
"{node_idx} : \"{}\" {{
shape: class
max_nbr_typo: {}",
term_subset.description(ctx),
term_subset.max_typo_cost(ctx)
)?;
for w in term_subset.all_single_words_except_prefix_db(ctx)? {
let w = ctx.word_interner.get(w.interned());
writeln!(file, "{w}: word")?;
}
for p in term_subset.all_phrases(ctx)? {
writeln!(file, "{}: phrase", p.description(ctx))?;
}
if let Some(w) = term_subset.use_prefix_db(ctx) {
let w = ctx.word_interner.get(w.interned());
writeln!(file, "{w}: prefix db")?;
}
writeln!(file, "}}")?;
}
QueryNodeData::Deleted => panic!(),
QueryNodeData::Start => {
writeln!(file, "{node_idx} : START")?;
}
QueryNodeData::End => {
writeln!(file, "{node_idx} : END")?;
}
}
Ok(())
}
fn write_rr_graph<R: RankingRuleGraphTrait>(
&mut self,
graph: &RankingRuleGraph<R>,
) -> Result<()> {
self.make_new_file_for_internal_state_if_needed()?;
writeln!(self.cur_file(), "direction: right")?;
writeln!(self.cur_file(), "Graph {{")?;
for (node_idx, node) in graph.query_graph.nodes.iter() {
if matches!(&node.data, QueryNodeData::Deleted) {
continue;
}
self.write_query_node(node_idx, node)?;
}
for (_edge_id, edge) in graph.edges_store.iter() {
let Some(edge) = edge else { continue };
let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge;
match &details {
None => {
writeln!(
self.cur_file(),
"{source_node} -> {dest_node} : \"always cost {cost}\"",
)?;
}
Some(condition) => {
writeln!(
self.cur_file(),
"{source_node} -> {dest_node} : \"{condition} cost {cost}\"",
cost = edge.cost,
)?;
}
}
}
writeln!(self.cur_file(), "}}")?;
Ok(())
}
fn write_rr_graph_paths<R: RankingRuleGraphTrait>(
&mut self,
paths: Vec<Vec<Interned<R::Condition>>>,
) -> Result<()> {
self.make_new_file_for_internal_state_if_needed()?;
let file = if let Some(file) = self.file_for_internal_state.as_mut() {
file
} else {
&mut self.index_file
};
writeln!(file, "Path {{")?;
for (path_idx, condition_indexes) in paths.iter().enumerate() {
writeln!(file, "{path_idx} {{")?;
for condition in condition_indexes.iter() {
writeln!(file, "{condition}")?;
}
for couple_edges in condition_indexes.windows(2) {
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
writeln!(file, "{src_edge_idx} -> {dest_edge_idx}")?;
}
writeln!(file, "}}")?;
}
writeln!(file, "}}")?;
Ok(())
}
}

View file

@ -0,0 +1,139 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};
struct MatchIntervalWithScore {
interval: [usize; 2],
score: [i16; 3],
}
// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be ordered, so +1 for each space between words
*order_score += words_in_phrase_minus_one;
// distance will always be 1, so -1 for each space between words
*distance_score -= words_in_phrase_minus_one;
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
let m_last_word_pos = match m.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
// in case last match is a phrase, count score for its words
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniq_score, distance_score, order_score]
}
/// Returns the first and last match where the score computed by `get_interval_score` is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
if matches.is_empty() {
panic!("`matches` should not be empty at this point");
}
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval: Option<MatchIntervalWithScore> = None;
let mut save_best_interval = |interval_first, interval_last| {
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
let is_interval_score_better = best_interval
.as_ref()
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
if is_interval_score_better {
best_interval = Some(MatchIntervalWithScore {
interval: [interval_first, interval_last],
score: interval_score,
});
}
};
// we compute the matches interval if we have at least 2 matches.
// current interval positions.
let mut interval_first = 0;
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the interval grow to more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until the next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, checking whether it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
// if index is 0 there is no last viable match
if index != 0 {
let interval_last = index - 1;
// keep interval if it's the best
save_best_interval(interval_first, interval_last);
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
if interval_first == matches.len() {
interval_first -= 1;
break;
}
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
if interval_first_match_first_word_pos > next_match_last_word_pos
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
{
break;
}
}
}
}
// compute the last interval score and compare it to the best one.
let interval_last = matches.len() - 1;
// if the interval is a single match, we need to make sure it's
// not a phrase longer than the crop window
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
save_best_interval(interval_first, interval_last);
}
// if none of the matches fit the criteria above, default to the first one
best_interval.map_or(
[&matches[0], &matches[0]],
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
)
}
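// Worked example (an illustrative sketch, not part of the original sources;
// it assumes `WordId` is a plain integer type): two single-word matches at
// word positions 0 and 2, with increasing word ids, score 2 unique matches,
// a distance penalty of -2, and 1 ordered pair.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn interval_score_counts_uniques_distance_and_order() {
        let matches = vec![
            Match {
                char_count: 6,
                ids: vec![0],
                position: MatchPosition::Word { word_position: 0, token_position: 0 },
            },
            Match {
                char_count: 5,
                ids: vec![1],
                position: MatchPosition::Word { word_position: 2, token_position: 2 },
            },
        ];
        // [unique matches, distance score, order score]
        assert_eq!(get_interval_score(&matches), [2, -2, 1]);
    }
}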

View file

@ -0,0 +1,62 @@
use super::matching_words::WordId;
#[derive(Clone, Debug)]
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: [usize; 2],
// position of the first and last token in the phrase in the whole text.
token_positions: [usize; 2],
},
}
#[derive(Clone, Debug)]
pub struct Match {
pub char_count: usize,
// ids of the query words that match.
pub ids: Vec<WordId>,
pub position: MatchPosition,
}
impl Match {
pub(super) fn get_first_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
}
}
pub(super) fn get_last_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
}
}
pub(super) fn get_first_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
}
}
pub(super) fn get_last_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
}
}
pub(super) fn get_word_count(&self) -> usize {
match self.position {
MatchPosition::Word { .. } => 1,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
}
}
}
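// A minimal sketch (same-file test, hypothetical positions) of the accessors:
// a single word spans one position, a phrase spans `lwp - fwp + 1` words.
#[cfg(test)]
mod match_position_sketch {
use super::{Match, MatchPosition};
#[test]
fn word_count_of_word_and_phrase() {
let word = Match {
char_count: 5,
ids: vec![0],
position: MatchPosition::Word { word_position: 3, token_position: 6 },
};
let phrase = Match {
char_count: 9,
ids: vec![0, 1],
position: MatchPosition::Phrase { word_positions: [3, 5], token_positions: [6, 10] },
};
assert_eq!(word.get_word_count(), 1);
assert_eq!(phrase.get_word_count(), 3);
assert_eq!(phrase.get_first_word_pos(), 3);
assert_eq!(phrase.get_last_word_pos(), 5);
}
}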

View file

@ -0,0 +1,331 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use charabia::Token;
use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use crate::SearchContext;
pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
}
pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
}
/// Structure created from a query tree,
/// referencing the words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
}
impl MatchingWords {
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
// Extract and centralize the different phrases and words to match, stored in a QueryTerm,
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
let (matching_words, matching_phrases) = term.all_computed_derivations();
for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
}
words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
// Sort words to put prefixes at the bottom, prioritizing the exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
Self {
phrases,
words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
}
/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) =
word.char_indices().take(located_words.original_char_count).last()
else {
continue;
};
let prefix_length = char_index + c.len_utf8();
let (char_count, byte_len) = token.original_lengths(prefix_length);
let ids = &located_words.positions;
return Some(MatchType::Full { ids, char_count, byte_len });
// else we exact match the token.
} else if token.lemma() == word {
let ids = &located_words.positions;
return Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
});
}
}
}
None
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
// create a PartialMatch struct so that it computes the first match,
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids };
partial.match_token(self.token).or_else(|| self.next())
}
// If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;
let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};
// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(Self { matching_words, ids }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
})
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}
}
}
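// A minimal sketch (same-file test, hypothetical phrase) of walking a
// PartialMatch across consecutive tokens until it becomes Full or breaks:
#[cfg(test)]
mod partial_match_sketch {
use std::borrow::Cow;
use charabia::{Token, TokenKind};
use super::{MatchType, PartialMatch};
fn token(lemma: &'static str) -> Token<'static> {
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed(lemma),
char_end: lemma.chars().count(),
byte_end: lemma.len(),
..Default::default()
}
}
#[test]
fn completes_over_two_tokens() {
let ids = 0..=0;
let partial = PartialMatch { matching_words: vec![Some("t"), Some("he")], ids: &ids };
// the first token matches but doesn't complete the phrase.
let MatchType::Partial(partial) = partial.match_token(&token("t")).unwrap() else {
panic!("expected a partial match")
};
// the second token completes it into a full match.
assert!(matches!(partial.match_token(&token("he")), Some(MatchType::Full { .. })));
}
}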
impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
let phrases: Vec<_> = phrases
.iter()
.map(|p| {
(
phrase_interner
.get(p.value)
.words
.iter()
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.positions.clone(),
)
})
.collect();
let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
.collect::<Vec<_>>()
})
.collect();
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}
#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;
use charabia::{TokenKind, TokenizerBuilder};
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::index::tests::TempIndex;
use crate::search::new::query_term::ExtractedTokens;
pub(crate) fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
]))
.unwrap();
temp_index
}
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
}
}

View file

@ -0,0 +1,929 @@
mod best_match_interval;
mod r#match;
mod matching_words;
mod simple_token_kind;
use charabia::{Language, SeparatorKind, Token, Tokenizer};
use either::Either;
pub use matching_words::MatchingWords;
use matching_words::{MatchType, PartialMatch};
use r#match::{Match, MatchPosition};
use serde::Serialize;
use simple_token_kind::SimpleTokenKind;
use std::{
borrow::Cow,
cmp::{max, min},
};
const DEFAULT_CROP_MARKER: &str = "…";
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
/// Structure used to build a Matcher that allows customizing the formatting tags.
pub struct MatcherBuilder<'m> {
matching_words: MatchingWords,
tokenizer: Tokenizer<'m>,
crop_marker: Option<String>,
highlight_prefix: Option<String>,
highlight_suffix: Option<String>,
}
impl<'m> MatcherBuilder<'m> {
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
Self {
matching_words,
tokenizer,
crop_marker: None,
highlight_prefix: None,
highlight_suffix: None,
}
}
pub fn crop_marker(&mut self, marker: String) -> &Self {
self.crop_marker = Some(marker);
self
}
pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
self.highlight_prefix = Some(prefix);
self
}
pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
self.highlight_suffix = Some(suffix);
self
}
pub fn build<'t, 'lang>(
&self,
text: &'t str,
locales: Option<&'lang [Language]>,
) -> Matcher<'t, 'm, '_, 'lang> {
let crop_marker = match &self.crop_marker {
Some(marker) => marker.as_str(),
None => DEFAULT_CROP_MARKER,
};
let highlight_prefix = match &self.highlight_prefix {
Some(marker) => marker.as_str(),
None => DEFAULT_HIGHLIGHT_PREFIX,
};
let highlight_suffix = match &self.highlight_suffix {
Some(marker) => marker.as_str(),
None => DEFAULT_HIGHLIGHT_SUFFIX,
};
Matcher {
text,
matching_words: &self.matching_words,
tokenizer: &self.tokenizer,
crop_marker,
highlight_prefix,
highlight_suffix,
matches: None,
locales,
}
}
}
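// A minimal usage sketch (illustrative only; the tag strings and text are made up):
//
// let mut builder = MatcherBuilder::new(matching_words, tokenizer);
// builder.highlight_prefix("<mark>".to_string());
// builder.highlight_suffix("</mark>".to_string());
// let mut matcher = builder.build("text to highlight and crop", None);
// let formatted = matcher.format(FormatOptions { highlight: true, crop: Some(10) });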
#[derive(Copy, Clone, Default, Debug)]
pub struct FormatOptions {
pub highlight: bool,
pub crop: Option<usize>,
}
impl FormatOptions {
pub fn merge(self, other: Self) -> Self {
Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
}
pub fn should_format(&self) -> bool {
self.highlight || self.crop.is_some()
}
}
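// A small sketch (same-file test, hypothetical values) of `merge`:
// `highlight` is OR-ed and the first defined crop size wins.
#[cfg(test)]
mod format_options_sketch {
use super::FormatOptions;
#[test]
fn merge_prefers_first_crop() {
let a = FormatOptions { highlight: false, crop: Some(10) };
let b = FormatOptions { highlight: true, crop: Some(20) };
let merged = a.merge(b);
assert!(merged.highlight);
assert_eq!(merged.crop, Some(10));
}
}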
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
pub start: usize,
pub length: usize,
}
/// Structure used to analyze a string, compute words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
text: &'t str,
matching_words: &'b MatchingWords,
tokenizer: &'b Tokenizer<'tokenizer>,
locales: Option<&'lang [Language]>,
crop_marker: &'b str,
highlight_prefix: &'b str,
highlight_suffix: &'b str,
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
}
impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
/// Iterates over tokens and saves any of them that match the query.
fn compute_matches(&mut self) -> &mut Self {
/// Some words are counted as matches only if they are close together and in the right order;
/// compute_partial_match peeks into the next words to validate whether the match is complete.
fn compute_partial_match<'a>(
mut partial: PartialMatch<'a>,
first_token_position: usize,
first_word_position: usize,
first_word_char_start: &usize,
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
matches: &mut Vec<Match>,
) -> bool {
for (token_position, word_position, word) in words_positions {
partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full,
// we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => partial,
// the partial match is now full, we keep this match and advance positions
Some(MatchType::Full { ids, .. }) => {
// save the token that closes the partial match as a match.
matches.push(Match {
char_count: word.char_end - *first_word_char_start,
ids: ids.clone().collect(),
position: MatchPosition::Phrase {
word_positions: [first_word_position, word_position],
token_positions: [first_token_position, token_position],
},
});
// the match is complete, we return true.
return true;
}
// no match, break out to try the next match type.
None => break,
};
}
// the match is not complete, we return false.
false
}
let tokens: Vec<_> =
self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect();
let mut matches = Vec::new();
let mut words_positions = tokens
.iter()
.scan((0, 0), |(token_position, word_position), token| {
let current_token_position = *token_position;
let current_word_position = *word_position;
*token_position += 1;
if !token.is_separator() {
*word_position += 1;
}
Some((current_token_position, current_word_position, token))
})
.filter(|(_, _, token)| !token.is_separator());
while let Some((token_position, word_position, word)) = words_positions.next() {
for match_type in self.matching_words.match_token(word) {
match match_type {
// we match, we save the current token as a match,
// then we continue the rest of the tokens.
MatchType::Full { ids, char_count, .. } => {
let ids: Vec<_> = ids.clone().collect();
matches.push(Match {
char_count,
ids,
position: MatchPosition::Word { word_position, token_position },
});
break;
}
// we match partially, iterate over next tokens to check if we can complete the match.
MatchType::Partial(partial) => {
// if the match is completed, we break the matching loop over the current token,
// then we continue with the rest of the tokens.
let mut wp = words_positions.clone();
if compute_partial_match(
partial,
token_position,
word_position,
&word.char_start,
&mut wp,
&mut matches,
) {
words_positions = wp;
break;
}
}
}
}
}
self.matches = Some((tokens, matches));
self
}
/// Returns boundaries of the words that match the query.
pub fn matches(&mut self) -> Vec<MatchBounds> {
match &self.matches {
None => self.compute_matches().matches(),
Some((tokens, matches)) => matches
.iter()
.map(|m| MatchBounds {
start: tokens[m.get_first_token_pos()].byte_start,
// TODO: Why is this in chars, while start is in bytes?
length: m.char_count,
})
.collect(),
}
}
/// Returns the bounds in byte index of the crop window.
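/// A worked example (hypothetical text): with `crop_size = 5` and a single
/// match on "world" in "one two three world four five six", the window grows
/// alternately on each side of the match ("three world four", then
/// "two three world four five") until 5 words are inside or the text ends.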
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
let (
mut remaining_words,
is_iterating_forward,
before_tokens_starting_index,
after_tokens_starting_index,
) = if !matches.is_empty() {
let [matches_first, matches_last] =
best_match_interval::find_best_match_interval(matches, crop_size);
let matches_size =
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
let is_crop_size_gte_match_size = crop_size >= matches_size;
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
let remaining_words = if is_crop_size_gte_match_size {
crop_size - matches_size
} else {
// in case the matches size is greater than the crop size, which implies there's only one match,
// we count words backwards, because we have to remove words, as they're extra words outside of
// the crop window
matches_size - crop_size
};
let after_tokens_starting_index = if matches_size == 0 {
0
} else {
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
if last_match_last_token_position_plus_one < tokens.len() {
last_match_last_token_position_plus_one
} else {
// we have matched the end of possible tokens, there's nothing to advance
tokens.len() - 1
}
};
(
remaining_words,
is_iterating_forward,
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
after_tokens_starting_index,
)
} else {
(crop_size, true, 0, 0)
};
// create the initial state of the crop window: 2 iterators starting from the matches positions,
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
// an iterator ...
let mut after_tokens = if is_iterating_forward {
// ... starting from the last match token position and going towards the end of the text.
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
} else {
// ... starting from the last match token position and going towards the start of the text.
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
};
// grows the crop window peeking in both directions
// until the window contains the good number of words:
while remaining_words > 0 {
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
let after_token_kind =
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
match (before_token_kind, after_token_kind) {
// we can expand both sides.
(Some(before_token_kind), Some(after_token_kind)) => {
match (before_token_kind, after_token_kind) {
// if they are both separators and are the same kind then advance both,
// or expand on the soft separator side.
(
SimpleTokenKind::Separator(before_token_separator_kind),
SimpleTokenKind::Separator(after_token_separator_kind),
) => {
if before_token_separator_kind == after_token_separator_kind {
before_tokens.next();
// this avoids having a trailing separator before the crop marker.
if remaining_words > 1 {
after_tokens.next();
}
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
after_tokens.next();
} else {
before_tokens.next();
}
}
// if one of the tokens is a word, we expand on the side of the word.
// left is a word, advance left.
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
before_tokens.next();
remaining_words -= 1;
}
// right is a word, advance right.
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
after_tokens.next();
remaining_words -= 1;
}
// both are words, advance left then right if remaining_words > 0.
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
before_tokens.next();
remaining_words -= 1;
if remaining_words > 0 {
after_tokens.next();
remaining_words -= 1;
}
}
}
}
// the end of the text is reached, advance left.
(Some(before_token_kind), None) => {
before_tokens.next();
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
remaining_words -= 1;
}
}
// the start of the text is reached, advance right.
(None, Some(after_token_kind)) => {
after_tokens.next();
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
remaining_words -= 1;
}
}
// no more token to add.
(None, None) => break,
}
}
// finally, keep the byte index of each bound of the crop window.
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
[crop_byte_start, crop_byte_end]
}
/// Returns the formatted version of the original text.
pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
if !format_options.highlight && format_options.crop.is_none() {
// compute matches is not needed if no highlight nor crop is requested.
Cow::Borrowed(self.text)
} else {
match &self.matches {
Some((tokens, matches)) => {
// If the text has to be cropped, crop around the best interval.
let [crop_byte_start, crop_byte_end] = match format_options.crop {
Some(crop_size) if crop_size > 0 => {
self.crop_bounds(tokens, matches, crop_size)
}
_ => [0, self.text.len()],
};
let mut formatted = Vec::new();
// push crop marker if it's not the start of the text.
if crop_byte_start > 0 && !self.crop_marker.is_empty() {
formatted.push(self.crop_marker);
}
let mut byte_index = crop_byte_start;
if format_options.highlight {
// insert highlight markers around matches.
for m in matches {
let [m_byte_start, m_byte_end] = match m.position {
MatchPosition::Word { token_position, .. } => {
let token = &tokens[token_position];
[&token.byte_start, &token.byte_end]
}
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
[&tokens[ftp].byte_start, &tokens[ltp].byte_end]
}
};
// skip matches out of the crop window
if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
continue;
}
// adjust start and end to the crop window size
let [m_byte_start, m_byte_end] = [
max(m_byte_start, &crop_byte_start),
min(m_byte_end, &crop_byte_end),
];
// push text that is positioned before our matches
if byte_index < *m_byte_start {
formatted.push(&self.text[byte_index..*m_byte_start]);
}
formatted.push(self.highlight_prefix);
// TODO: This is additional work done, charabia::token::Token byte_len
// should already get us the original byte length, however, that doesn't work as
// it's supposed to, investigate why
let highlight_byte_index = self.text[*m_byte_start..]
.char_indices()
.nth(m.char_count)
.map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix);
// if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < *m_byte_end {
formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
}
byte_index = *m_byte_end;
}
}
// push the rest of the text between last match and the end of crop.
if byte_index < crop_byte_end {
formatted.push(&self.text[byte_index..crop_byte_end]);
}
// push crop marker if it's not the end of the text.
if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
formatted.push(self.crop_marker);
}
if formatted.len() == 1 {
// avoid concatenating if there is already 1 slice.
Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
} else {
Cow::Owned(formatted.concat())
}
}
None => self.compute_matches().format(format_options),
}
}
}
}
#[cfg(test)]
mod tests {
use charabia::TokenizerBuilder;
use matching_words::tests::temp_index_with_documents;
use super::*;
use crate::index::tests::TempIndex;
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
impl<'a> MatcherBuilder<'a> {
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
let mut ctx = SearchContext::new(index, rtxn).unwrap();
let universe = filtered_universe(ctx.index, ctx.txn, &None).unwrap();
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
&mut ctx,
Some(query),
crate::TermsMatchingStrategy::default(),
crate::score_details::ScoringStrategy::Skip,
false,
universe,
&None,
&None,
crate::search::new::GeoSortStrategy::default(),
0,
100,
Some(10),
&mut crate::DefaultSearchLogger,
&mut crate::DefaultSearchLogger,
TimeBudget::max(),
None,
None,
)
.unwrap();
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
None => MatchingWords::default(),
};
MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
}
}
#[test]
fn format_identity() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = FormatOptions { highlight: false, crop: None };
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text, None);
// no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text, None);
// no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text, None);
// no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text);
}
#[test]
fn format_highlight() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = FormatOptions { highlight: true, crop: None };
// empty text.
let text = "";
let mut matcher = builder.build(text, None);
assert_eq!(&matcher.format(format_options), "");
// text containing only separators.
let text = ":-)";
let mut matcher = builder.build(text, None);
assert_eq!(&matcher.format(format_options), ":-)");
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text, None);
// no crop should return the complete text, because there are no matches.
assert_eq!(&matcher.format(format_options), &text);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
);
}
#[test]
fn highlight_unicode() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "world");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing prefix match.
let text = "Ŵôřlḑôle";
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>ôle"
);
// Text containing unicode match.
let text = "Ŵôřlḑ";
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Ŵôřlḑ</em>"
);
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "westfali");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing unicode match.
let text = "Westfália";
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>Westfáli</em>a"
);
}
#[test]
fn format_crop() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = FormatOptions { highlight: false, crop: Some(10) };
// empty text.
let text = "";
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@""
);
// text containing only separators.
let text = ":-)";
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@":-)"
);
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text, None);
// no highlight should return the first 10 words with a marker at the end.
insta::assert_snapshot!(
matcher.format(format_options),
@"A quick brown fox can not jump 32 feet, right…"
);
// Text without any match starting by a separator.
let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
let mut matcher = builder.build(text, None);
// no highlight should return the first 10 words with a marker at the end.
insta::assert_snapshot!(
matcher.format(format_options),
@"(A quick brown fox can not jump 32 feet, right…"
);
// Test phrase propagation
let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
let mut matcher = builder.build(text, None);
// should crop the phrase instead of cropping around the match.
insta::assert_snapshot!(
matcher.format(format_options),
@"…Split The World is a book written by Emily Henry…"
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text, None);
// no highlight should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…future to build a world with the boy she loves…"
);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text, None);
// no highlight should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…she loves. Emily Henry: The Love That Split The World."
);
// Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void";
let mut matcher = builder.build(text, None);
// crop should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
// Text containing matches with different density.
let text = "split void the void void world void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text, None);
// crop should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
// Text containing matches with same word.
let text = "split split split split split split void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text, None);
// crop should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void split the world void void"
);
}
#[test]
fn format_highlight_crop() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = FormatOptions { highlight: true, crop: Some(10) };
// empty text.
let text = "";
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@""
);
// text containing only separators.
let text = ":-)";
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@":-)"
);
// Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let mut matcher = builder.build(text, None);
// both should return the first 10 words with a marker at the end.
insta::assert_snapshot!(
matcher.format(format_options),
@"A quick brown fox can not jump 32 feet, right…"
);
// Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves.";
let mut matcher = builder.build(text, None);
// both should return the last 10 words with a marker at the start and highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…future to build a <em>world</em> with <em>the</em> boy she loves…"
);
// Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let mut matcher = builder.build(text, None);
// both should return the last 10 words with a marker at the start and highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
);
// Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void";
let mut matcher = builder.build(text, None);
// crop should return the last 10 words with a marker at the start.
insta::assert_snapshot!(
matcher.format(format_options),
@"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
);
}
#[test]
fn format_highlight_crop_phrase_query() {
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
let temp_index = TempIndex::new();
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
temp_index
.add_documents(documents!([
{ "id": 1, "text": text }
]))
.unwrap();
let rtxn = temp_index.read_txn().unwrap();
let format_options = FormatOptions { highlight: true, crop: Some(10) };
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
let mut matcher = builder.build(text, None);
// should return 10 words with a marker at the start as well as at the end, and the highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…the power to split <em>the world</em> between those who embraced…"
);
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
let mut matcher = builder.build(text, None);
// should highlight "those" and the phrase "and those".
insta::assert_snapshot!(
matcher.format(format_options),
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention had the power to split the world\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>The groundbreaking invention had the power to split the world</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention had the power to split the world between those\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>The groundbreaking invention had the power to split the world</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
// TODO: Should include exclamation mark without crop markers
@"…between those who <em>embraced progress and those who resisted change</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"groundbreaking invention\" \"split the world between\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"groundbreaking invention\" \"had the power to split the world between those\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"…<em>invention</em> <em>had the power to split the world between those</em>…"
);
}
#[test]
fn smaller_crop_size() {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let text = "void void split the world void void.";
// set a smaller crop size
let format_options = FormatOptions { highlight: false, crop: Some(2) };
let mut matcher = builder.build(text, None);
// because crop size < query size, partially format matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…split the…"
);
// set a smaller crop size
let format_options = FormatOptions { highlight: false, crop: Some(1) };
let mut matcher = builder.build(text, None);
// because crop size < query size, partially format matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…split…"
);
// set crop size to 0
let format_options = FormatOptions { highlight: false, crop: Some(0) };
let mut matcher = builder.build(text, None);
// because crop size is 0, crop is ignored.
insta::assert_snapshot!(
matcher.format(format_options),
@"void void split the world void void."
);
}
#[test]
fn partial_matches() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut builder =
MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
builder.highlight_prefix("_".to_string());
builder.highlight_suffix("_".to_string());
let format_options = FormatOptions { highlight: true, crop: None };
let text = "the do or die can't be he do and or isn't he";
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"_the_ _do or_ die can't be he do and or isn'_t he_"
);
}
}

View file

@ -0,0 +1,15 @@
use charabia::{SeparatorKind, Token, TokenKind};
pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}
impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}

View file

@ -0,0 +1,883 @@
mod bucket_sort;
mod db_cache;
mod distinct;
mod geo_sort;
mod graph_based_ranking_rule;
mod interner;
mod limits;
mod logger;
pub mod matches;
mod query_graph;
mod query_term;
mod ranking_rule_graph;
mod ranking_rules;
mod resolve_query_graph;
mod small_bitmap;
mod exact_attribute;
mod sort;
mod vector_sort;
#[cfg(test)]
mod tests;
use std::collections::HashSet;
use bucket_sort::{bucket_sort, BucketSortOutput};
use charabia::{Language, TokenizerBuilder};
use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
use query_graph::{QueryGraph, QueryNode};
use query_term::{
located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
};
use ranking_rules::{
BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
};
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
use self::distinct::facet_string_values;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
use crate::{
AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
UserError, Weight,
};
/// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> {
pub index: &'ctx Index,
pub txn: &'ctx RoTxn<'ctx>,
pub db_cache: DatabaseCache<'ctx>,
pub word_interner: DedupInterner<String>,
pub phrase_interner: DedupInterner<Phrase>,
pub term_interner: Interner<QueryTerm>,
pub phrase_docids: PhraseDocIdsCache,
pub restricted_fids: Option<RestrictedFids>,
}
impl<'ctx> SearchContext<'ctx> {
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
let searchable_fids = index.searchable_fields_and_weights(txn)?;
let exact_attributes_ids = index.exact_attributes_ids(txn)?;
let mut exact = Vec::new();
let mut tolerant = Vec::new();
for (_name, fid, weight) in searchable_fids {
if exact_attributes_ids.contains(&fid) {
exact.push((fid, weight));
} else {
tolerant.push((fid, weight));
}
}
Ok(Self {
index,
txn,
db_cache: <_>::default(),
word_interner: <_>::default(),
phrase_interner: <_>::default(),
term_interner: <_>::default(),
phrase_docids: <_>::default(),
restricted_fids: None,
})
}
pub fn attributes_to_search_on(
&mut self,
attributes_to_search_on: &'ctx [String],
) -> Result<()> {
let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
let mut wildcard = false;
let mut restricted_fids = RestrictedFids::default();
for field_name in attributes_to_search_on {
if field_name == "*" {
wildcard = true;
// we cannot exit early as we want to return an error in case of unknown fields
continue;
}
let searchable_weight =
searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
let (fid, weight) = match searchable_weight {
// The field id exists and the field is searchable
Some((_name, fid, weight)) => (*fid, *weight),
// The field is not searchable but the user didn't define any searchable attributes
None if user_defined_searchable.is_none() => continue,
// The field is not searchable => User error
None => {
let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
self.txn,
searchable_fields_weights.iter().map(|(name, _, _)| name),
)?;
let field = field_name.to_string();
return Err(UserError::InvalidSearchableAttribute {
field,
valid_fields,
hidden_fields,
}
.into());
}
};
if exact_attributes_ids.contains(&fid) {
restricted_fids.exact.push((fid, weight));
} else {
restricted_fids.tolerant.push((fid, weight));
};
}
if wildcard {
self.restricted_fids = None;
} else {
self.restricted_fids = Some(restricted_fids);
}
Ok(())
}
}
#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
pub enum Word {
Original(Interned<String>),
Derived(Interned<String>),
}
impl Word {
pub fn interned(&self) -> Interned<String> {
match self {
Word::Original(word) => *word,
Word::Derived(word) => *word,
}
}
}
#[derive(Debug, Clone, Default)]
pub struct RestrictedFids {
pub tolerant: Vec<(FieldId, Weight)>,
pub exact: Vec<(FieldId, Weight)>,
}
impl RestrictedFids {
pub fn contains(&self, fid: &FieldId) -> bool {
self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
}
}
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
fn resolve_maximally_reduced_query_graph(
ctx: &mut SearchContext<'_>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
matching_strategy: TermsMatchingStrategy,
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<RoaringBitmap> {
let mut graph = query_graph.clone();
let nodes_to_remove = match matching_strategy {
TermsMatchingStrategy::Last => query_graph
.removal_order_for_terms_matching_strategy_last(ctx)
.iter()
.flat_map(|x| x.iter())
.collect(),
TermsMatchingStrategy::Frequency => query_graph
.removal_order_for_terms_matching_strategy_frequency(ctx)?
.iter()
.flat_map(|x| x.iter())
.collect(),
TermsMatchingStrategy::All => vec![],
};
graph.remove_nodes_keep_edges(&nodes_to_remove);
logger.query_for_initial_universe(&graph);
let docids = compute_query_graph_docids(ctx, &graph, universe)?;
Ok(docids)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
fn resolve_universe(
ctx: &mut SearchContext<'_>,
initial_universe: &RoaringBitmap,
query_graph: &QueryGraph,
matching_strategy: TermsMatchingStrategy,
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<RoaringBitmap> {
resolve_maximally_reduced_query_graph(
ctx,
initial_universe,
query_graph,
matching_strategy,
logger,
)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_words(
ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
negative_words: &[Word],
) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new();
for &word in negative_words {
if let Some(bitmap) = ctx.word_docids(universe, word)? {
negative_bitmap |= bitmap;
}
}
Ok(negative_bitmap)
}
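// For example (hypothetical query): `table -chair` yields a single negative
// word, "chair", and every document containing "chair" ends up in the
// returned bitmap, later subtracted from the universe by the caller.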
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_phrases(
ctx: &mut SearchContext<'_>,
negative_phrases: &[LocatedQueryTerm],
) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new();
for term in negative_phrases {
let query_term = ctx.term_interner.get(term.value);
if let Some(phrase) = query_term.original_phrase() {
negative_bitmap |= ctx.get_phrase_docids(phrase)?;
}
}
Ok(negative_bitmap)
}
/// Return the list of initialised ranking rules to be used for a placeholder search.
fn get_ranking_rules_for_placeholder_search<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
let mut sort = false;
let mut sorted_fields = HashSet::new();
let mut geo_sorted = false;
let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
match rr {
// These rules need a query to have an effect; ignore them in placeholder search
crate::Criterion::Words
| crate::Criterion::Typo
| crate::Criterion::Attribute
| crate::Criterion::Proximity
| crate::Criterion::Exactness => continue,
crate::Criterion::Sort => {
if sort {
continue;
}
resolve_sort_criteria(
sort_criteria,
ctx,
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
)?;
sort = true;
}
crate::Criterion::Asc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
}
crate::Criterion::Desc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
}
}
}
Ok(ranking_rules)
}
#[allow(clippy::too_many_arguments)]
fn get_ranking_rules_for_vector<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
limit_plus_offset: usize,
target: &[f32],
embedder_name: &str,
embedder: &Embedder,
quantized: bool,
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
// vector search
let mut sort = false;
let mut sorted_fields = HashSet::new();
let mut geo_sorted = false;
let mut vector = false;
let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
match rr {
crate::Criterion::Words
| crate::Criterion::Typo
| crate::Criterion::Proximity
| crate::Criterion::Attribute
| crate::Criterion::Exactness => {
if !vector {
let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
let vector_sort = VectorSort::new(
ctx,
target.to_vec(),
vector_candidates,
limit_plus_offset,
embedder_name,
embedder,
quantized,
)?;
ranking_rules.push(Box::new(vector_sort));
vector = true;
}
}
crate::Criterion::Sort => {
if sort {
continue;
}
resolve_sort_criteria(
sort_criteria,
ctx,
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
)?;
sort = true;
}
crate::Criterion::Asc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
}
crate::Criterion::Desc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
}
}
}
Ok(ranking_rules)
}
/// Return the list of initialised ranking rules to be used for a query graph search.
fn get_ranking_rules_for_query_graph_search<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
terms_matching_strategy: TermsMatchingStrategy,
) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
// query graph search
let mut words = false;
let mut typo = false;
let mut proximity = false;
let mut sort = false;
let mut attribute = false;
let mut exactness = false;
let mut sorted_fields = HashSet::new();
let mut geo_sorted = false;
// Don't add the `words` ranking rule if the term matching strategy is `All`
if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
words = true;
}
let mut ranking_rules: Vec<BoxRankingRule<'ctx, QueryGraph>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
// Add Words before any of: typo, proximity, attribute, exactness
match rr {
crate::Criterion::Typo
| crate::Criterion::Attribute
| crate::Criterion::Proximity
| crate::Criterion::Exactness => {
if !words {
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
words = true;
}
}
_ => {}
}
match rr {
crate::Criterion::Words => {
if words {
continue;
}
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
words = true;
}
crate::Criterion::Typo => {
if typo {
continue;
}
typo = true;
ranking_rules.push(Box::new(Typo::new(None)));
}
crate::Criterion::Proximity => {
if proximity {
continue;
}
proximity = true;
ranking_rules.push(Box::new(Proximity::new(None)));
}
crate::Criterion::Attribute => {
if attribute {
continue;
}
attribute = true;
ranking_rules.push(Box::new(Fid::new(None)));
ranking_rules.push(Box::new(Position::new(None)));
}
crate::Criterion::Sort => {
if sort {
continue;
}
resolve_sort_criteria(
sort_criteria,
ctx,
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
)?;
sort = true;
}
crate::Criterion::Exactness => {
if exactness {
continue;
}
ranking_rules.push(Box::new(ExactAttribute::new()));
ranking_rules.push(Box::new(Exactness::new()));
exactness = true;
}
crate::Criterion::Asc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
}
crate::Criterion::Desc(field_name) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
}
}
}
Ok(ranking_rules)
}
fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
sort_criteria: &Option<Vec<AscDesc>>,
ctx: &SearchContext<'ctx>,
ranking_rules: &mut Vec<BoxRankingRule<'ctx, Query>>,
sorted_fields: &mut HashSet<String>,
geo_sorted: &mut bool,
geo_strategy: geo_sort::Strategy,
) -> Result<()> {
let sort_criteria = sort_criteria.clone().unwrap_or_default();
ranking_rules.reserve(sort_criteria.len());
for criterion in sort_criteria {
match criterion {
AscDesc::Asc(Member::Field(field_name)) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
}
AscDesc::Desc(Member::Field(field_name)) => {
if sorted_fields.contains(&field_name) {
continue;
}
sorted_fields.insert(field_name.clone());
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
}
AscDesc::Asc(Member::Geo(point)) => {
if *geo_sorted {
continue;
}
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
ranking_rules.push(Box::new(GeoSort::new(
geo_strategy,
geo_faceted_docids,
point,
true,
)?));
}
AscDesc::Desc(Member::Geo(point)) => {
if *geo_sorted {
continue;
}
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
ranking_rules.push(Box::new(GeoSort::new(
geo_strategy,
geo_faceted_docids,
point,
false,
)?));
}
};
}
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
pub fn filtered_universe(
index: &Index,
txn: &RoTxn<'_>,
filters: &Option<Filter<'_>>,
) -> Result<RoaringBitmap> {
Ok(if let Some(filters) = filters {
filters.evaluate(txn, index)?
} else {
index.documents_ids(txn)?
})
}
#[allow(clippy::too_many_arguments)]
pub fn execute_vector_search(
ctx: &mut SearchContext<'_>,
vector: &[f32],
scoring_strategy: ScoringStrategy,
universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
embedder_name: &str,
embedder: &Embedder,
quantized: bool,
time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
) -> Result<PartialSearchResult> {
check_sort_criteria(ctx, sort_criteria.as_ref())?;
// FIXME: input universe = universe & documents_with_vectors
// for now if we're computing embeddings for ALL documents, we can assume that this is just universe
let ranking_rules = get_ranking_rules_for_vector(
ctx,
sort_criteria,
geo_strategy,
from + length,
vector,
embedder_name,
embedder,
quantized,
)?;
let mut placeholder_search_logger = logger::DefaultSearchLogger;
let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
&mut placeholder_search_logger;
let BucketSortOutput { docids, scores, all_candidates, degraded } = bucket_sort(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
scoring_strategy,
placeholder_search_logger,
time_budget,
ranking_score_threshold,
)?;
Ok(PartialSearchResult {
candidates: all_candidates,
document_scores: scores,
documents_ids: docids,
located_query_terms: None,
degraded,
used_negative_operator: false,
})
}
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search::main")]
pub fn execute_search(
ctx: &mut SearchContext<'_>,
query: Option<&str>,
terms_matching_strategy: TermsMatchingStrategy,
scoring_strategy: ScoringStrategy,
exhaustive_number_hits: bool,
mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
words_limit: Option<usize>,
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
locales: Option<&Vec<Language>>,
) -> Result<PartialSearchResult> {
check_sort_criteria(ctx, sort_criteria.as_ref())?;
let mut used_negative_operator = false;
let mut located_query_terms = None;
let query_terms = if let Some(query) = query {
let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
let entered = span.enter();
// We make sure that the analyzer is aware of the stop words;
// this ensures that the query builder is able to properly remove them.
let mut tokbuilder = TokenizerBuilder::new();
let stop_words = ctx.index.stop_words(ctx.txn)?;
if let Some(ref stop_words) = stop_words {
tokbuilder.stop_words(stop_words);
}
let separators = ctx.index.allowed_separators(ctx.txn)?;
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
if let Some(ref separators) = separators {
tokbuilder.separators(separators);
}
let dictionary = ctx.index.dictionary(ctx.txn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
if let Some(ref dictionary) = dictionary {
tokbuilder.words_dict(dictionary);
}
let db_locales;
match locales {
Some(locales) => {
if !locales.is_empty() {
tokbuilder.allow_list(locales);
}
}
None => {
// If no locales are specified, we use the locales specified in the localized attributes rules
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
let localized_fields = match &ctx.restricted_fids {
// if AttributeToSearchOn is set, use the restricted list of ids
Some(restricted_fids) => {
let iter = restricted_fids
.exact
.iter()
.chain(restricted_fids.tolerant.iter())
.map(|(fid, _)| *fid);
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
}
// Otherwise use the full list of ids coming from the index searchable fields
None => LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields.into_iter(),
),
};
db_locales = localized_fields.all_locales();
if !db_locales.is_empty() {
tokbuilder.allow_list(&db_locales);
}
}
};
let tokenizer = tokbuilder.build();
drop(entered);
let span = tracing::trace_span!(target: "search::tokens", "tokenize");
let entered = span.enter();
let tokens = tokenizer.tokenize(query);
drop(entered);
let ExtractedTokens { query_terms, negative_words, negative_phrases } =
located_query_terms_from_tokens(ctx, tokens, words_limit)?;
used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
universe -= ignored_documents;
universe -= ignored_phrases;
if query_terms.is_empty() {
// Do a placeholder search instead
None
} else {
Some(query_terms)
}
} else {
None
};
let bucket_sort_output = if let Some(query_terms) = query_terms {
let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(new_located_query_terms);
let ranking_rules = get_ranking_rules_for_query_graph_search(
ctx,
sort_criteria,
geo_strategy,
terms_matching_strategy,
)?;
universe &=
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;
bucket_sort(
ctx,
ranking_rules,
&graph,
distinct.as_deref(),
&universe,
from,
length,
scoring_strategy,
query_graph_logger,
time_budget,
ranking_score_threshold,
)?
} else {
let ranking_rules =
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
bucket_sort(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
scoring_strategy,
placeholder_search_logger,
time_budget,
ranking_score_threshold,
)?
};
let BucketSortOutput { docids, scores, mut all_candidates, degraded } = bucket_sort_output;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
// The candidates are the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set.
if exhaustive_number_hits {
let distinct_field = match distinct.as_deref() {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
if let Some(f) = distinct_field {
if let Some(distinct_fid) = fields_ids_map.id(f) {
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
}
}
}
Ok(PartialSearchResult {
candidates: all_candidates,
document_scores: scores,
documents_ids: docids,
located_query_terms,
degraded,
used_negative_operator,
})
}
fn check_sort_criteria(
ctx: &SearchContext<'_>,
sort_criteria: Option<&Vec<AscDesc>>,
) -> Result<()> {
let sort_criteria = if let Some(sort_criteria) = sort_criteria {
sort_criteria
} else {
return Ok(());
};
if sort_criteria.is_empty() {
return Ok(());
}
// We check that the sort ranking rule exists and return an
// error if the sort criteria are used while that rule is missing.
let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort);
if sort_ranking_rule_missing {
return Err(UserError::SortRankingRuleMissing.into());
}
// We check that we are allowed to use the sort criteria, i.e.
// that they are declared in the sortable fields.
let sortable_fields = ctx.index.sortable_fields(ctx.txn)?;
for asc_desc in sort_criteria {
match asc_desc.member() {
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
let (valid_fields, hidden_fields) =
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
return Err(UserError::InvalidSortableAttribute {
field: field.to_string(),
valid_fields,
hidden_fields,
}
.into());
}
Member::Geo(_) if !sortable_fields.contains("_geo") => {
let (valid_fields, hidden_fields) =
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
return Err(UserError::InvalidSortableAttribute {
field: "_geo".to_string(),
valid_fields,
hidden_fields,
}
.into());
}
_ => (),
}
}
Ok(())
}
pub struct PartialSearchResult {
pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
pub candidates: RoaringBitmap,
pub documents_ids: Vec<DocumentId>,
pub document_scores: Vec<Vec<ScoreDetails>>,
pub degraded: bool,
pub used_negative_operator: bool,
}


@ -0,0 +1,536 @@
use std::cmp::{Ordering, Reverse};
use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};
use fxhash::{FxHashMap, FxHasher};
use roaring::RoaringBitmap;
use super::interner::{FixedSizeInterner, Interned};
use super::query_term::{
self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
};
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::search::new::interner::Interner;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::Result;
/// A node of the [`QueryGraph`].
///
/// There are four types of nodes:
/// 1. `Start`: unique, represents the start of the query
/// 2. `End`: unique, represents the end of the query
/// 3. `Deleted` : represents a node that was deleted.
/// All deleted nodes are unreachable from the start node.
/// 4. `Term` is a regular node representing a word or combination of words
/// from the user query.
#[derive(Clone)]
pub struct QueryNode {
pub data: QueryNodeData,
pub predecessors: SmallBitmap<QueryNode>,
pub successors: SmallBitmap<QueryNode>,
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryNodeData {
Term(LocatedQueryTermSubset),
Deleted,
Start,
End,
}
/**
A graph representing all the ways to interpret the user's search query.
## Example 1
For the search query `sunflower`, we need to register the following things:
- we need to look for the exact word `sunflower`
- but also any word which is 1 or 2 typos apart from `sunflower`
- and every word that contains the prefix `sunflower`
- and also the couple of adjacent words `sun flower`
- as well as all the user-defined synonyms of `sunflower`
All these derivations of a word will be stored in [`QueryTerm`].
## Example 2
For the search query `summer house by`.
We also look for all word derivations of each term. And we also need to consider
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
Furthermore, we need to know which words these ngrams replace. This is done by creating the
following graph, where each node also contains a list of derivations:
```txt
            ┌─▶ houseby ──────────────┐
            │                         ▼
START ─▶ summer ─▶ house ─▶ by ────▶ END
  │  │                      ▲         ▲
  │  └──▶ summerhouse ──────┘         │
  └─────▶ summerhouseby ──────────────┘
```
Note also that each node has a range of positions associated with it,
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
is registered with the positions `1..=2`. When two nodes are connected by an edge,
it means that they are potentially next to each other in the user's search query
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
and the transformations that were done on the query graph).
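
## Illustrative sketch

A minimal, hypothetical traversal of the graph (a sketch under assumed
bindings, not a doctest from this crate):

```ignore
let (graph, located_terms) = QueryGraph::from_query(ctx, &terms)?;
for (_id, node) in graph.nodes.iter() {
    // every node is either Start, End, Deleted, or a Term subset
}
```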
*/
#[derive(Clone)]
pub struct QueryGraph {
/// The index of the start node within `self.nodes`
pub root_node: Interned<QueryNode>,
/// The index of the end node within `self.nodes`
pub end_node: Interned<QueryNode>,
/// The list of all query nodes
pub nodes: FixedSizeInterner<QueryNode>,
}
impl QueryGraph {
/// Build the query graph from the parsed user search query and return an updated list
/// of the located query terms, which also contains the ngrams.
pub fn from_query(
ctx: &mut SearchContext<'_>,
// The terms here must be consecutive
terms: &[LocatedQueryTerm],
) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
let mut new_located_query_terms = terms.to_vec();
let nbr_typos = number_of_typos_allowed(ctx)?;
let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
let root_node = 0;
let end_node = 1;
// We could consider generalizing to 4, 5, 6, 7, etc. ngrams
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
(vec![], vec![], vec![root_node]);
let original_terms_len = terms.len();
for term_idx in 0..original_terms_len {
let mut new_nodes = vec![];
let new_node_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
term_subset: QueryTermSubset::full(terms[term_idx].value),
positions: terms[term_idx].positions.clone(),
term_ids: term_idx as u8..=term_idx as u8,
}),
);
new_nodes.push(new_node_idx);
if !prev1.is_empty() {
if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
{
new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
term_subset: QueryTermSubset::full(ngram.value),
positions: ngram.positions,
term_ids: term_idx as u8 - 1..=term_idx as u8,
}),
);
new_nodes.push(ngram_idx);
}
}
if !prev2.is_empty() {
if let Some(ngram) =
query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
{
new_located_query_terms.push(ngram.clone());
let ngram_idx = add_node(
&mut nodes_data,
QueryNodeData::Term(LocatedQueryTermSubset {
term_subset: QueryTermSubset::full(ngram.value),
positions: ngram.positions,
term_ids: term_idx as u8 - 2..=term_idx as u8,
}),
);
new_nodes.push(ngram_idx);
}
}
(prev0, prev1, prev2) = (new_nodes, prev0, prev1);
}
let root_node = Interned::from_raw(root_node);
let end_node = Interned::from_raw(end_node);
let mut nodes = FixedSizeInterner::new(
nodes_data.len() as u16,
QueryNode {
data: QueryNodeData::Deleted,
predecessors: SmallBitmap::new(nodes_data.len() as u16),
successors: SmallBitmap::new(nodes_data.len() as u16),
},
);
for (node_idx, node_data) in nodes_data.into_iter().enumerate() {
let node = nodes.get_mut(Interned::from_raw(node_idx as u16));
node.data = node_data;
}
let mut graph = QueryGraph { root_node, end_node, nodes };
graph.build_initial_edges();
Ok((graph, new_located_query_terms))
}
/// Remove the given nodes, connecting all their predecessors to all their successors.
pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned<QueryNode>]) {
for &node_id in nodes {
let node = self.nodes.get(node_id);
let old_node_pred = node.predecessors.clone();
let old_node_succ = node.successors.clone();
for pred in old_node_pred.iter() {
let pred_successors = &mut self.nodes.get_mut(pred).successors;
pred_successors.remove(node_id);
pred_successors.union(&old_node_succ);
}
for succ in old_node_succ.iter() {
let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors;
succ_predecessors.remove(node_id);
succ_predecessors.union(&old_node_pred);
}
let node = self.nodes.get_mut(node_id);
node.data = QueryNodeData::Deleted;
node.predecessors.clear();
node.successors.clear();
}
}
/// Remove the given nodes and all their edges from the query graph.
pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
for &node_id in nodes {
let node = &self.nodes.get(node_id);
let old_node_pred = node.predecessors.clone();
let old_node_succ = node.successors.clone();
for pred in old_node_pred.iter() {
self.nodes.get_mut(pred).successors.remove(node_id);
}
for succ in old_node_succ.iter() {
self.nodes.get_mut(succ).predecessors.remove(node_id);
}
let node = self.nodes.get_mut(node_id);
node.data = QueryNodeData::Deleted;
node.predecessors.clear();
node.successors.clear();
}
}
/// Simplify the query graph by removing all nodes that are disconnected from
/// the start or end nodes.
pub fn simplify(&mut self) {
loop {
let mut nodes_to_remove = vec![];
for (node_idx, node) in self.nodes.iter() {
if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted)
&& node.successors.is_empty())
|| (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted)
&& node.predecessors.is_empty())
{
nodes_to_remove.push(node_idx);
}
}
if nodes_to_remove.is_empty() {
break;
} else {
self.remove_nodes(&nodes_to_remove);
}
}
}
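/// Connect the nodes of the graph: the successors of a node are the nodes
/// whose first term id is the smallest term id strictly greater than the
/// node's last term id.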
fn build_initial_edges(&mut self) {
for (_, node) in self.nodes.iter_mut() {
node.successors.clear();
node.predecessors.clear();
}
for node_id in self.nodes.indexes() {
let node = self.nodes.get(node_id);
let end_prev_term_id = match &node.data {
QueryNodeData::Term(term) => *term.term_ids.end() as i16,
QueryNodeData::Start => -1,
QueryNodeData::Deleted => continue,
QueryNodeData::End => continue,
};
let successors = {
let mut successors = SmallBitmap::for_interned_values_in(&self.nodes);
let mut min = i16::MAX;
for (node_id, node) in self.nodes.iter() {
let start_next_term_id = match &node.data {
QueryNodeData::Term(term) => *term.term_ids.start() as i16,
QueryNodeData::End => i16::MAX,
QueryNodeData::Start => continue,
QueryNodeData::Deleted => continue,
};
if start_next_term_id <= end_prev_term_id {
continue;
}
match start_next_term_id.cmp(&min) {
Ordering::Less => {
min = start_next_term_id;
successors.clear();
successors.insert(node_id);
}
Ordering::Equal => {
successors.insert(node_id);
}
Ordering::Greater => continue,
}
}
successors
};
let node = self.nodes.get_mut(node_id);
node.successors = successors.clone();
for successor in successors.iter() {
let successor = self.nodes.get_mut(successor);
successor.predecessors.insert(node_id);
}
}
}
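/// Compute the buckets of nodes to remove, in order, for the `Frequency`
/// terms matching strategy: the most frequent terms (and the terms matching
/// no document at all) are removed first, so that the rarest, most
/// discriminating terms are kept the longest.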
pub fn removal_order_for_terms_matching_strategy_frequency(
&self,
ctx: &mut SearchContext<'_>,
) -> Result<Vec<SmallBitmap<QueryNode>>> {
// lookup frequency for each term
let mut term_with_frequency: Vec<(u8, u64)> = {
let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
for (_, node) in self.nodes.iter() {
match &node.data {
QueryNodeData::Term(t) => {
let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?;
for id in t.term_ids.clone() {
term_docids
.entry(id)
.and_modify(|curr| *curr |= &docids)
.or_insert_with(|| docids.clone());
}
}
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
}
}
term_docids
.into_iter()
.map(|(idx, docids)| match docids.len() {
0 => (idx, u64::MAX),
frequency => (idx, frequency),
})
.collect()
};
term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
let mut term_weight = BTreeMap::new();
let mut weight: u16 = 1;
let mut peekable = term_with_frequency.into_iter().peekable();
while let Some((idx, frequency)) = peekable.next() {
term_weight.insert(idx, weight);
if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
weight += 1;
}
}
let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
}
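/// Compute the buckets of nodes to remove, in order, for the `Last` terms
/// matching strategy: the terms at the end of the query are removed first.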
pub fn removal_order_for_terms_matching_strategy_last(
&self,
ctx: &SearchContext<'_>,
) -> Vec<SmallBitmap<QueryNode>> {
let (first_term_idx, last_term_idx) = {
let mut first_term_idx = u8::MAX;
let mut last_term_idx = 0u8;
for (_, node) in self.nodes.iter() {
match &node.data {
QueryNodeData::Term(t) => {
if *t.term_ids.end() > last_term_idx {
last_term_idx = *t.term_ids.end();
}
if *t.term_ids.start() < first_term_idx {
first_term_idx = *t.term_ids.start();
}
}
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
}
}
(first_term_idx, last_term_idx)
};
if first_term_idx >= last_term_idx {
return vec![];
}
let cost_of_term_idx = |term_idx: u8| {
let rank = 1 + last_term_idx - term_idx;
rank as u16
};
self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
}
pub fn removal_order_for_terms_matching_strategy(
&self,
ctx: &SearchContext<'_>,
order: impl Fn(u8) -> u16,
) -> Vec<SmallBitmap<QueryNode>> {
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
let mut at_least_one_mandatory_term = false;
for (node_id, node) in self.nodes.iter() {
let QueryNodeData::Term(t) = &node.data else { continue };
if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
at_least_one_mandatory_term = true;
continue;
}
let mut cost = 0;
for id in t.term_ids.clone() {
cost = std::cmp::max(cost, order(id));
}
nodes_to_remove
.entry(cost)
.or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes))
.insert(node_id);
}
let mut res: Vec<_> = nodes_to_remove.into_values().collect();
if !at_least_one_mandatory_term {
res.pop();
}
res
}
/// Number of words in the phrases in this query graph
pub(crate) fn words_in_phrases_count(&self, ctx: &SearchContext<'_>) -> usize {
let mut word_count = 0;
for (_, node) in self.nodes.iter() {
match &node.data {
QueryNodeData::Term(term) => {
let Some(phrase) = term.term_subset.original_phrase(ctx) else { continue };
let phrase = ctx.phrase_interner.get(phrase);
word_count += phrase.words.iter().copied().filter(|a| a.is_some()).count()
}
_ => continue,
}
}
word_count
}
}
fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
let new_node_idx = nodes_data.len() as u16;
nodes_data.push(node_data);
new_node_idx
}
impl QueryGraph {
/*
Build a query graph from a list of paths
The paths are composed of source and dest terms.
For example, consider the following paths:
```txt
PATH 1 : a -> b1 -> c1 -> d -> e1
PATH 2 : a -> b2 -> c2 -> d -> e2
```
Then the resulting graph will be:
```txt
    ┌─▶ b1 ─▶ c1 ─▶ d ─▶ e1
a ──┤
    └─▶ b2 ─▶ c2 ─▶ d ─▶ e2
```
*/
pub fn build_from_paths(
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
) -> Self {
let mut node_data = Interner::default();
let root_node = node_data.push(QueryNodeData::Start);
let end_node = node_data.push(QueryNodeData::End);
let mut paths_with_single_terms = vec![];
for path in paths {
let mut processed_path = vec![];
let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
for (start_term, dest_term) in path {
if let Some(prev_dest_term) = prev_dest_term.take() {
if let Some(mut start_term) = start_term {
if start_term.term_ids == prev_dest_term.term_ids {
start_term.term_subset.intersect(&prev_dest_term.term_subset);
processed_path.push(start_term);
} else {
processed_path.push(prev_dest_term);
processed_path.push(start_term);
}
} else {
processed_path.push(prev_dest_term);
}
} else if let Some(start_term) = start_term {
processed_path.push(start_term);
}
prev_dest_term = Some(dest_term);
}
if let Some(prev_dest_term) = prev_dest_term {
processed_path.push(prev_dest_term);
}
paths_with_single_terms.push(processed_path);
}
let mut paths_with_single_terms_and_suffix_hash = vec![];
for path in paths_with_single_terms {
let mut hasher = FxHasher::default();
let mut path_with_hash = vec![];
for term in path.into_iter().rev() {
term.hash(&mut hasher);
path_with_hash.push((term, hasher.finish()));
}
path_with_hash.reverse();
paths_with_single_terms_and_suffix_hash.push(path_with_hash);
}
let mut node_data_id_for_term_and_suffix_hash =
FxHashMap::<(LocatedQueryTermSubset, u64), Interned<QueryNodeData>>::default();
let mut paths_with_ids = vec![];
for path in paths_with_single_terms_and_suffix_hash {
let mut path_with_ids = vec![];
for (term, suffix_hash) in path {
let node_data_id = node_data_id_for_term_and_suffix_hash
.entry((term.clone(), suffix_hash))
.or_insert_with(|| node_data.push(QueryNodeData::Term(term)));
path_with_ids.push(Interned::from_raw(node_data_id.into_raw()));
}
paths_with_ids.push(path_with_ids);
}
let nodes_data = node_data.freeze();
let nodes_data_len = nodes_data.len();
let mut nodes = nodes_data.map_move(|n| QueryNode {
data: n,
predecessors: SmallBitmap::new(nodes_data_len),
successors: SmallBitmap::new(nodes_data_len),
});
let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
for path in paths_with_ids {
let mut prev_node_id = root_node;
for node_id in path {
let prev_node = nodes.get_mut(prev_node_id);
prev_node.successors.insert(node_id);
let node = nodes.get_mut(node_id);
node.predecessors.insert(prev_node_id);
prev_node_id = node_id;
}
let prev_node = nodes.get_mut(prev_node_id);
prev_node.successors.insert(end_node);
let node = nodes.get_mut(end_node);
node.predecessors.insert(prev_node_id);
}
QueryGraph { root_node, end_node, nodes }
}
}


@ -0,0 +1,428 @@
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::ops::ControlFlow;
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore;
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Lazy, TwoTypoTerm};
use crate::search::new::{limits, SearchContext};
use crate::search::{build_dfa, get_first};
use crate::{Result, MAX_WORD_LENGTH};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos {
Zero,
One,
Two,
}
pub enum ZeroOrOneTypo {
Zero,
One,
}
impl Interned<QueryTerm> {
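/// Lazily compute the missing derivation sets of this term: the one-typo
/// subterm when at most one typo is allowed, or both the one- and two-typo
/// subterms otherwise.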
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
let s = ctx.term_interner.get_mut(self);
if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
assert!(s.two_typo.is_uninit());
// Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
self.initialize_one_typo_subterm(ctx)?;
let s = ctx.term_interner.get_mut(self);
assert!(s.one_typo.is_init());
s.two_typo = Lazy::Init(TwoTypoTerm::default());
} else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
assert!(s.two_typo.is_uninit());
self.initialize_one_and_two_typo_subterm(ctx)?;
let s = ctx.term_interner.get_mut(self);
assert!(s.one_typo.is_init() && s.two_typo.is_init());
}
Ok(())
}
}
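/// Visit the words of the given FST that strictly extend `word_interned`,
/// i.e. its zero-typo prefix derivations, until the callback breaks.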
fn find_zero_typo_prefix_derivations(
word_interned: Interned<String>,
fst: fst::Set<Cow<'_, [u8]>>,
word_interner: &mut DedupInterner<String>,
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
) -> Result<()> {
let word = word_interner.get(word_interned).to_owned();
let word = word.as_str();
let prefix = Str::new(word).starts_with();
let mut stream = fst.search(prefix).into_stream();
while let Some(derived_word) = stream.next() {
let derived_word = std::str::from_utf8(derived_word)?.to_owned();
let derived_word_interned = word_interner.insert(derived_word);
if derived_word_interned != word_interned {
let cf = visit(derived_word_interned)?;
if cf.is_break() {
break;
}
}
}
Ok(())
}
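/// Visit the words of the index that are within one typo of the given word
/// (restricted to words sharing its first character), reporting for each
/// derived word whether it is an exact match or one typo away.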
fn find_zero_one_typo_derivations(
ctx: &mut SearchContext<'_>,
word_interned: Interned<String>,
is_prefix: bool,
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
) -> Result<()> {
let fst = ctx.get_words_fst()?;
let word = ctx.word_interner.get(word_interned).to_owned();
let word = word.as_str();
let dfa = build_dfa(word, 1, is_prefix);
let starts = StartsWith(Str::new(get_first(word)));
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
while let Some((derived_word, state)) = stream.next() {
let derived_word = std::str::from_utf8(derived_word)?;
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
let d = dfa.distance(state.1);
match d.to_u8() {
0 => {
if derived_word != word_interned {
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
if cf.is_break() {
break;
}
}
}
1 => {
let cf = visit(derived_word, ZeroOrOneTypo::One)?;
if cf.is_break() {
break;
}
}
_ => {
unreachable!("One typo dfa produced multiple typos")
}
}
}
Ok(())
}
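/// Visit the words of the given FST that are within two typos of the given
/// word; a typo on the first letter is always counted as two typos.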
fn find_zero_one_two_typo_derivations(
word_interned: Interned<String>,
is_prefix: bool,
fst: fst::Set<Cow<'_, [u8]>>,
word_interner: &mut DedupInterner<String>,
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
) -> Result<()> {
let word = word_interner.get(word_interned).to_owned();
let word = word.as_str();
let starts = StartsWith(Str::new(get_first(word)));
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
let second_dfa = build_dfa(word, 2, is_prefix);
let second = Intersection(&second_dfa, &starts);
let automaton = Union(first, &second);
let mut stream = fst.search_with_state(automaton).into_stream();
while let Some((derived_word, state)) = stream.next() {
let derived_word = std::str::from_utf8(derived_word)?;
let derived_word_interned = word_interner.insert(derived_word.to_owned());
// If the typo is on the first letter, we know the number of typos
// is two
if get_first(derived_word) != get_first(word) {
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
if cf.is_break() {
break;
}
} else {
// Otherwise, we know that the second DFA matched, so we compute the
// correct distance from it
let d = second_dfa.distance((state.1).0);
match d.to_u8() {
0 => {
if derived_word_interned != word_interned {
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
if cf.is_break() {
break;
}
}
}
1 => {
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
if cf.is_break() {
break;
}
}
2 => {
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
if cf.is_break() {
break;
}
}
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
}
}
}
Ok(())
}
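/// Build a `QueryTerm` whose zero-typo derivations (exact word, prefix
/// derivations, synonyms, prefix database entry) are computed eagerly,
/// while the one- and two-typo derivations are left uninitialized until
/// they are actually needed.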
pub fn partially_initialized_term_from_word(
ctx: &mut SearchContext<'_>,
word: &str,
max_typo: u8,
is_prefix: bool,
is_ngram: bool,
) -> Result<QueryTerm> {
let word_interned = ctx.word_interner.insert(word.to_owned());
if word.len() > MAX_WORD_LENGTH {
return Ok({
QueryTerm {
original: ctx.word_interner.insert(word.to_owned()),
ngram_words: None,
is_prefix: false,
max_levenshtein_distance: 0,
zero_typo: <_>::default(),
one_typo: Lazy::Init(<_>::default()),
two_typo: Lazy::Init(<_>::default()),
}
});
}
let fst = ctx.index.words_fst(ctx.txn)?;
let use_prefix_db = is_prefix
&& (ctx
.index
.word_prefix_docids
.remap_data_type::<DecodeIgnore>()
.get(ctx.txn, word)?
.is_some()
|| (!is_ngram
&& ctx
.index
.exact_word_prefix_docids
.remap_data_type::<DecodeIgnore>()
.get(ctx.txn, word)?
.is_some()));
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
let mut zero_typo = None;
let mut prefix_of = BTreeSet::new();
if fst.contains(word) {
zero_typo = Some(word_interned);
}
if is_prefix && use_prefix_db.is_none() {
find_zero_typo_prefix_derivations(
word_interned,
fst,
&mut ctx.word_interner,
|derived_word| {
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
prefix_of.insert(derived_word);
Ok(ControlFlow::Continue(()))
} else {
Ok(ControlFlow::Break(()))
}
},
)?;
}
let synonyms = ctx.index.synonyms(ctx.txn)?;
let mut synonym_word_count = 0;
let synonyms = synonyms
.get(&vec![word.to_owned()])
.cloned()
.unwrap_or_default()
.into_iter()
.take(limits::MAX_SYNONYM_PHRASE_COUNT)
.filter_map(|words| {
if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
return None;
}
synonym_word_count += words.len();
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
Some(ctx.phrase_interner.insert(Phrase { words }))
})
.collect();
let zero_typo =
ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
Ok(QueryTerm {
original: word_interned,
ngram_words: None,
max_levenshtein_distance: max_typo,
is_prefix,
zero_typo,
one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit,
})
}
fn find_split_words(ctx: &mut SearchContext<'_>, word: &str) -> Result<Option<Interned<Phrase>>> {
if let Some((l, r)) = split_best_frequency(ctx, word)? {
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
} else {
Ok(None)
}
}
impl Interned<QueryTerm> {
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self);
let allows_split_words = self_mut.allows_split_words();
let QueryTerm {
original,
is_prefix,
one_typo,
max_levenshtein_distance: max_nbr_typos,
..
} = self_mut;
let original = *original;
let is_prefix = *is_prefix;
if one_typo.is_init() {
return Ok(());
}
let mut one_typo_words = BTreeSet::new();
if *max_nbr_typos > 0 {
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
match nbr_typos {
ZeroOrOneTypo::Zero => {}
ZeroOrOneTypo::One => {
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word);
} else {
return Ok(ControlFlow::Break(()));
}
}
}
Ok(ControlFlow::Continue(()))
})?;
}
let split_words = if allows_split_words {
let original_str = ctx.word_interner.get(original).to_owned();
find_split_words(ctx, original_str.as_str())?
} else {
None
};
let self_mut = ctx.term_interner.get_mut(self);
// Only add the split words to the derivations if:
// 1. the term is neither an ngram nor a phrase; OR
// 2. the term is an ngram, but the split words are different from the ngram's component words
let split_words = if let Some((ngram_words, split_words)) =
self_mut.ngram_words.as_ref().zip(split_words.as_ref())
{
let Phrase { words } = ctx.phrase_interner.get(*split_words);
if ngram_words.iter().ne(words.iter().flatten()) {
Some(*split_words)
} else {
None
}
} else {
split_words
};
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
self_mut.one_typo = Lazy::Init(one_typo);
Ok(())
}
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self);
let QueryTerm {
original,
is_prefix,
two_typo,
max_levenshtein_distance: max_nbr_typos,
..
} = self_mut;
let original_str = ctx.word_interner.get(*original).to_owned();
if two_typo.is_init() {
return Ok(());
}
let mut one_typo_words = BTreeSet::new();
let mut two_typo_words = BTreeSet::new();
if *max_nbr_typos > 0 {
find_zero_one_two_typo_derivations(
*original,
*is_prefix,
ctx.index.words_fst(ctx.txn)?,
&mut ctx.word_interner,
|derived_word, nbr_typos| {
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
{
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
return Ok(ControlFlow::Break(()));
}
match nbr_typos {
NumberOfTypos::Zero => {}
NumberOfTypos::One => {
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word);
}
}
NumberOfTypos::Two => {
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
two_typo_words.insert(derived_word);
}
}
}
Ok(ControlFlow::Continue(()))
},
)?;
}
let split_words = find_split_words(ctx, original_str.as_str())?;
let self_mut = ctx.term_interner.get_mut(self);
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
self_mut.one_typo = Lazy::Init(one_typo);
self_mut.two_typo = Lazy::Init(two_typo);
Ok(())
}
}
/// Split the original word into the two words that appear the
/// most next to each other in the index.
///
/// Return `None` if the original word cannot be split.
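///
/// For example (illustrative values): for `"sunflower"`, every split point
/// is tried (`s|unflower`, `su|nflower`, ..., `sunflowe|r`) and the pair
/// with the highest word-pair proximity frequency in the index, perhaps
/// `("sun", "flower")`, is returned.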
fn split_best_frequency(
ctx: &mut SearchContext<'_>,
original: &str,
) -> Result<Option<(Interned<String>, Interned<String>)>> {
let chars = original.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = original.split_at(i);
let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned());
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? {
if best.map_or(true, |(old, _, _)| frequency > old) {
best = Some((frequency, left, right));
}
}
}
Ok(best.map(|(_, left, right)| (left, right)))
}


@ -0,0 +1,510 @@
mod compute_derivations;
mod ntypo_subset;
mod parse_query;
mod phrase;
use std::collections::BTreeSet;
use std::iter::FromIterator;
use std::ops::RangeInclusive;
use either::Either;
pub use ntypo_subset::NTypoTermSubset;
pub use parse_query::{
located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
};
pub use phrase::Phrase;
use super::interner::{DedupInterner, Interned};
use super::{limits, SearchContext, Word};
use crate::Result;
/// A set of word derivations attached to a location in the search query.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct LocatedQueryTermSubset {
pub term_subset: QueryTermSubset,
pub positions: RangeInclusive<u16>,
pub term_ids: RangeInclusive<u8>,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct QueryTermSubset {
original: Interned<QueryTerm>,
zero_typo_subset: NTypoTermSubset,
one_typo_subset: NTypoTermSubset,
two_typo_subset: NTypoTermSubset,
/// `true` if the term cannot be deleted through the term matching strategy
///
/// Note that there are other reasons for which a term cannot be deleted, such as
/// being a phrase. In that case, this field could be set to `false`, but it
/// still wouldn't be deletable by the term matching strategy.
mandatory: bool,
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct QueryTerm {
original: Interned<String>,
ngram_words: Option<Vec<Interned<String>>>,
max_levenshtein_distance: u8,
is_prefix: bool,
zero_typo: ZeroTypoTerm,
// May not be computed yet
one_typo: Lazy<OneTypoTerm>,
// May not be computed yet
two_typo: Lazy<TwoTypoTerm>,
}
// SubTerms will be in a dedup interner
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct ZeroTypoTerm {
/// The original phrase, if any
phrase: Option<Interned<Phrase>>,
/// A single word equivalent to the original term, with zero typos
exact: Option<Interned<String>>,
/// All the words that contain the original word as prefix
prefix_of: BTreeSet<Interned<String>>,
/// All the synonyms of the original word or phrase
synonyms: BTreeSet<Interned<Phrase>>,
/// A prefix in the prefix databases matching the original word
use_prefix_db: Option<Interned<String>>,
}
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct OneTypoTerm {
/// The original word split into multiple consecutive words
split_words: Option<Interned<Phrase>>,
/// Words that are 1 typo away from the original word
one_typo: BTreeSet<Interned<String>>,
}
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct TwoTypoTerm {
/// Words that are 2 typos away from the original word
two_typos: BTreeSet<Interned<String>>,
}
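/// A value that may not have been computed yet: `Uninit` until it is
/// computed, `Init` afterwards.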
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Lazy<T> {
Uninit,
Init(T),
}
impl<T> Lazy<T> {
pub fn is_init(&self) -> bool {
match self {
Lazy::Uninit => false,
Lazy::Init(_) => true,
}
}
pub fn is_uninit(&self) -> bool {
match self {
Lazy::Uninit => true,
Lazy::Init(_) => false,
}
}
}
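/// A term that must be matched exactly: either a full phrase or a single word.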
#[derive(Clone, Copy)]
pub enum ExactTerm {
Phrase(Interned<Phrase>),
Word(Interned<String>),
}
impl ExactTerm {
pub fn interned_words<'ctx>(
&self,
ctx: &'ctx SearchContext<'ctx>,
) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
match *self {
ExactTerm::Phrase(phrase) => {
let phrase = ctx.phrase_interner.get(phrase);
Either::Left(phrase.words.iter().copied())
}
ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
}
}
}
impl QueryTermSubset {
pub fn is_mandatory(&self) -> bool {
self.mandatory
}
pub fn make_mandatory(&mut self) {
self.mandatory = true;
}
pub fn exact_term(&self, ctx: &SearchContext<'_>) -> Option<ExactTerm> {
let full_query_term = ctx.term_interner.get(self.original);
if full_query_term.ngram_words.is_some() {
return None;
}
if let Some(phrase) = full_query_term.zero_typo.phrase {
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
} else if let Some(word) = full_query_term.zero_typo.exact {
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
} else {
None
}
}
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
Self {
original: for_term,
zero_typo_subset: NTypoTermSubset::Nothing,
one_typo_subset: NTypoTermSubset::Nothing,
two_typo_subset: NTypoTermSubset::Nothing,
mandatory: false,
}
}
pub fn full(for_term: Interned<QueryTerm>) -> Self {
Self {
original: for_term,
zero_typo_subset: NTypoTermSubset::All,
one_typo_subset: NTypoTermSubset::All,
two_typo_subset: NTypoTermSubset::All,
mandatory: false,
}
}
pub fn union(&mut self, other: &Self) {
assert!(self.original == other.original);
self.zero_typo_subset.union(&other.zero_typo_subset);
self.one_typo_subset.union(&other.one_typo_subset);
self.two_typo_subset.union(&other.two_typo_subset);
}
pub fn intersect(&mut self, other: &Self) {
assert!(self.original == other.original);
self.zero_typo_subset.intersect(&other.zero_typo_subset);
self.one_typo_subset.intersect(&other.one_typo_subset);
self.two_typo_subset.intersect(&other.two_typo_subset);
}
pub fn use_prefix_db(&self, ctx: &SearchContext<'_>) -> Option<Word> {
let original = ctx.term_interner.get(self.original);
let use_prefix_db = original.zero_typo.use_prefix_db?;
let word = match &self.zero_typo_subset {
NTypoTermSubset::All => Some(use_prefix_db),
NTypoTermSubset::Subset { words, phrases: _ } => {
if words.contains(&use_prefix_db) {
Some(use_prefix_db)
} else {
None
}
}
NTypoTermSubset::Nothing => None,
};
word.map(|word| {
if original.ngram_words.is_some() {
Word::Derived(word)
} else {
Word::Original(word)
}
})
}
pub fn all_single_words_except_prefix_db(
&self,
ctx: &mut SearchContext<'_>,
) -> Result<BTreeSet<Word>> {
let mut result = BTreeSet::default();
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
self.original.compute_fully_if_needed(ctx)?;
}
let original = ctx.term_interner.get_mut(self.original);
match &self.zero_typo_subset {
NTypoTermSubset::All => {
let ZeroTypoTerm {
phrase: _,
exact: zero_typo,
prefix_of,
synonyms: _,
use_prefix_db: _,
} = &original.zero_typo;
result.extend(zero_typo.iter().copied().map(|w| {
if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
result.extend(prefix_of.iter().copied().map(|w| {
if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
}
NTypoTermSubset::Subset { words, phrases: _ } => {
let ZeroTypoTerm {
phrase: _,
exact: zero_typo,
prefix_of,
synonyms: _,
use_prefix_db: _,
} = &original.zero_typo;
if let Some(zero_typo) = zero_typo {
if words.contains(zero_typo) {
if original.ngram_words.is_some() {
result.insert(Word::Derived(*zero_typo));
} else {
result.insert(Word::Original(*zero_typo));
}
}
}
result.extend(prefix_of.intersection(words).copied().map(|w| {
if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
}
NTypoTermSubset::Nothing => {}
}
match &self.one_typo_subset {
NTypoTermSubset::All => {
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
else {
panic!()
};
result.extend(one_typo.iter().copied().map(Word::Derived))
}
NTypoTermSubset::Subset { words, phrases: _ } => {
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
else {
panic!()
};
result.extend(one_typo.intersection(words).copied().map(Word::Derived));
}
NTypoTermSubset::Nothing => {}
};
match &self.two_typo_subset {
NTypoTermSubset::All => {
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
result.extend(two_typos.iter().copied().map(Word::Derived));
}
NTypoTermSubset::Subset { words, phrases: _ } => {
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
result.extend(two_typos.intersection(words).copied().map(Word::Derived));
}
NTypoTermSubset::Nothing => {}
};
Ok(result)
}
pub fn all_phrases(&self, ctx: &mut SearchContext<'_>) -> Result<BTreeSet<Interned<Phrase>>> {
let mut result = BTreeSet::default();
if !self.one_typo_subset.is_empty() {
self.original.compute_fully_if_needed(ctx)?;
}
let original = ctx.term_interner.get_mut(self.original);
let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
&original.zero_typo;
result.extend(phrase.iter().copied());
result.extend(synonyms.iter().copied());
match &self.one_typo_subset {
NTypoTermSubset::All => {
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
else {
panic!();
};
result.extend(split_words.iter().copied());
}
NTypoTermSubset::Subset { phrases, .. } => {
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
else {
panic!();
};
if let Some(split_words) = split_words {
if phrases.contains(split_words) {
result.insert(*split_words);
}
}
}
NTypoTermSubset::Nothing => {}
}
Ok(result)
}
pub fn original_phrase(&self, ctx: &SearchContext<'_>) -> Option<Interned<Phrase>> {
let t = ctx.term_interner.get(self.original);
if let Some(p) = t.zero_typo.phrase {
if self.zero_typo_subset.contains_phrase(p) {
return Some(p);
}
}
None
}
pub fn max_typo_cost(&self, ctx: &SearchContext<'_>) -> u8 {
let t = ctx.term_interner.get(self.original);
match t.max_levenshtein_distance {
0 => {
if t.allows_split_words() {
1
} else {
0
}
}
1 => {
if self.one_typo_subset.is_empty() {
0
} else {
1
}
}
2 => {
if self.two_typo_subset.is_empty() {
if self.one_typo_subset.is_empty() {
0
} else {
1
}
} else {
2
}
}
_ => panic!(),
}
}
pub fn keep_only_exact_term(&mut self, ctx: &SearchContext<'_>) {
if let Some(term) = self.exact_term(ctx) {
match term {
ExactTerm::Phrase(p) => {
self.zero_typo_subset = NTypoTermSubset::Subset {
words: BTreeSet::new(),
phrases: BTreeSet::from_iter([p]),
};
self.clear_one_typo_subset();
self.clear_two_typo_subset();
}
ExactTerm::Word(w) => {
self.zero_typo_subset = NTypoTermSubset::Subset {
words: BTreeSet::from_iter([w]),
phrases: BTreeSet::new(),
};
self.clear_one_typo_subset();
self.clear_two_typo_subset();
}
}
}
}
pub fn clear_zero_typo_subset(&mut self) {
self.zero_typo_subset = NTypoTermSubset::Nothing;
}
pub fn clear_one_typo_subset(&mut self) {
self.one_typo_subset = NTypoTermSubset::Nothing;
}
pub fn clear_two_typo_subset(&mut self) {
self.two_typo_subset = NTypoTermSubset::Nothing;
}
pub fn description(&self, ctx: &SearchContext<'_>) -> String {
let t = ctx.term_interner.get(self.original);
ctx.word_interner.get(t.original).to_owned()
}
}
impl ZeroTypoTerm {
fn is_empty(&self) -> bool {
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
phrase.is_none()
&& zero_typo.is_none()
&& prefix_of.is_empty()
&& synonyms.is_empty()
&& use_prefix_db.is_none()
}
}
impl OneTypoTerm {
fn is_empty(&self) -> bool {
let OneTypoTerm { split_words, one_typo } = self;
one_typo.is_empty() && split_words.is_none()
}
}
impl TwoTypoTerm {
fn is_empty(&self) -> bool {
let TwoTypoTerm { two_typos } = self;
two_typos.is_empty()
}
}
impl QueryTerm {
fn is_empty(&self) -> bool {
let Lazy::Init(one_typo) = &self.one_typo else {
return false;
};
let Lazy::Init(two_typo) = &self.two_typo else {
return false;
};
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
}
fn allows_split_words(&self) -> bool {
self.zero_typo.phrase.is_none()
}
}
impl Interned<QueryTerm> {
/// Return the original word from the given query term
fn original_single_word(self, ctx: &SearchContext<'_>) -> Option<Interned<String>> {
let self_ = ctx.term_interner.get(self);
if self_.ngram_words.is_some() {
None
} else {
Some(self_.original)
}
}
}
/// A query term coupled with its position in the user's search query.
#[derive(Clone)]
pub struct LocatedQueryTerm {
pub value: Interned<QueryTerm>,
pub positions: RangeInclusive<u16>,
}
impl LocatedQueryTerm {
/// Return `true` iff the term is empty
pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
interner.get(self.value).is_empty()
}
}
impl QueryTerm {
pub fn is_cached_prefix(&self) -> bool {
self.zero_typo.use_prefix_db.is_some()
}
pub fn is_prefix(&self) -> bool {
self.is_prefix
}
pub fn original_word(&self, ctx: &SearchContext<'_>) -> String {
ctx.word_interner.get(self.original).clone()
}
pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
self.zero_typo.phrase
}
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
let mut words = BTreeSet::new();
let mut phrases = BTreeSet::new();
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
&self.zero_typo;
words.extend(zero_typo.iter().copied());
words.extend(prefix_of.iter().copied());
phrases.extend(phrase.iter().copied());
phrases.extend(synonyms.iter().copied());
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
words.extend(one_typo.iter().copied());
phrases.extend(split_words.iter().copied());
};
if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
words.extend(two_typos.iter().copied());
};
(words.into_iter().collect(), phrases.into_iter().collect())
}
}


@ -0,0 +1,79 @@
use std::collections::BTreeSet;
use super::Phrase;
use crate::search::new::interner::Interned;
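/// The subset of the derivations of a query term that are kept at a given
/// number of typos: all of them, none of them, or an explicit set of words
/// and phrases.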
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum NTypoTermSubset {
All,
Subset {
words: BTreeSet<Interned<String>>,
phrases: BTreeSet<Interned<Phrase>>,
// TODO: prefixes: BTreeSet<Interned<String>>,
},
Nothing,
}
impl NTypoTermSubset {
pub fn contains_word(&self, word: Interned<String>) -> bool {
match self {
NTypoTermSubset::All => true,
NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
NTypoTermSubset::Nothing => false,
}
}
pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
match self {
NTypoTermSubset::All => true,
NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
NTypoTermSubset::Nothing => false,
}
}
pub fn is_empty(&self) -> bool {
match self {
NTypoTermSubset::All => false,
NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
NTypoTermSubset::Nothing => true,
}
}
pub fn union(&mut self, other: &Self) {
match self {
Self::All => {}
Self::Subset { words, phrases } => match other {
Self::All => {
*self = Self::All;
}
Self::Subset { words: w2, phrases: p2 } => {
words.extend(w2);
phrases.extend(p2);
}
Self::Nothing => {}
},
Self::Nothing => {
*self = other.clone();
}
}
}
pub fn intersect(&mut self, other: &Self) {
match self {
Self::All => *self = other.clone(),
Self::Subset { words, phrases } => match other {
Self::All => {}
Self::Subset { words: w2, phrases: p2 } => {
let mut ws = BTreeSet::new();
for w in words.intersection(w2) {
ws.insert(*w);
}
let mut ps = BTreeSet::new();
for p in phrases.intersection(p2) {
ps.insert(*p);
}
*words = ws;
*phrases = ps;
}
Self::Nothing => *self = Self::Nothing,
},
Self::Nothing => {}
}
}
}


@ -0,0 +1,382 @@
use std::collections::BTreeSet;
use charabia::normalizer::NormalizedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use super::compute_derivations::partially_initialized_term_from_word;
use super::{LocatedQueryTerm, ZeroTypoTerm};
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
use crate::search::new::Word;
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
#[derive(Clone)]
/// Extraction of the content of a query.
pub struct ExtractedTokens {
/// The terms to search for in the database.
pub query_terms: Vec<LocatedQueryTerm>,
/// The words that must not appear in the results.
pub negative_words: Vec<Word>,
/// The phrases that must not appear in the results.
pub negative_phrases: Vec<LocatedQueryTerm>,
}
/// Convert the tokenised search query into a list of located query terms.
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
pub fn located_query_terms_from_tokens(
ctx: &mut SearchContext<'_>,
query: NormalizedTokenIter<'_, '_, '_, '_>,
words_limit: Option<usize>,
) -> Result<ExtractedTokens> {
let nbr_typos = number_of_typos_allowed(ctx)?;
let mut query_terms = Vec::new();
let mut negative_phrase = false;
let mut phrase: Option<PhraseBuilder> = None;
let mut encountered_whitespace = true;
let mut negative_next_token = false;
let mut negative_words = Vec::new();
let mut negative_phrases = Vec::new();
let parts_limit = words_limit.unwrap_or(usize::MAX);
// start with the last position as we will wrap around to position 0 at the beginning of the loop below.
let mut position = u16::MAX;
let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
while let Some(token) = peekable.next() {
if token.lemma().is_empty() {
continue;
}
// early return if word limit is exceeded
if query_terms.len() >= parts_limit {
return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
}
match token.kind {
TokenKind::Word | TokenKind::StopWord => {
// On first loop, goes from u16::MAX to 0, then normal increment.
position = position.wrapping_add(1);
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word.
if let Some(phrase) = &mut phrase {
phrase.push_word(ctx, &token, position)
} else if negative_next_token {
let word = token.lemma().to_string();
let word = Word::Original(ctx.word_interner.insert(word));
negative_words.push(word);
negative_next_token = false;
} else if peekable.peek().is_some() {
match token.kind {
TokenKind::Word => {
let word = token.lemma();
let term = partially_initialized_term_from_word(
ctx,
word,
nbr_typos(word),
false,
false,
)?;
let located_term = LocatedQueryTerm {
value: ctx.term_interner.push(term),
positions: position..=position,
};
query_terms.push(located_term);
}
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
}
} else {
let word = token.lemma();
let term = partially_initialized_term_from_word(
ctx,
word,
nbr_typos(word),
true,
false,
)?;
let located_term = LocatedQueryTerm {
value: ctx.term_interner.push(term),
positions: position..=position,
};
query_terms.push(located_term);
}
}
TokenKind::Separator(separator_kind) => {
// Add a position penalty for hard separators (e.g. `.`), so that
// words on either side are considered far apart
if let SeparatorKind::Hard = separator_kind {
position = position.wrapping_add(7);
}
phrase = 'phrase: {
let phrase = phrase.take();
// If we have a hard separator inside a phrase, we immediately start a new phrase
let phrase = if separator_kind == SeparatorKind::Hard {
if let Some(phrase) = phrase {
if let Some(located_query_term) = phrase.build(ctx) {
// as we are evaluating a negative operator we put the phrase
// in the negative phrases *but* we don't reset the negative operator
// as we are immediately starting a new negative phrase.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {
query_terms.push(located_query_term);
}
}
Some(PhraseBuilder::empty())
} else {
None
}
} else {
phrase
};
// We close and start a new phrase depending on the number of double quotes
let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
if quote_count == 0 {
break 'phrase phrase;
}
// Consume the closing quote and the phrase
if let Some(phrase) = phrase {
// Per the check above, quote_count > 0
quote_count -= 1;
if let Some(located_query_term) = phrase.build(ctx) {
// we were evaluating a negative operator so we
// put the phrase in the negative phrases
if negative_phrase {
negative_phrases.push(located_query_term);
negative_phrase = false;
} else {
query_terms.push(located_query_term);
}
}
}
// Start new phrase if the token ends with an opening quote
if quote_count % 2 == 1 {
negative_phrase = negative_next_token;
Some(PhraseBuilder::empty())
} else {
None
}
};
negative_next_token =
phrase.is_none() && token.lemma() == "-" && encountered_whitespace;
}
_ => (),
}
encountered_whitespace =
token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some();
}
// If a quote is never closed, we consider the rest of the query as a phrase.
if let Some(phrase) = phrase.take() {
if let Some(located_query_term) = phrase.build(ctx) {
// put the phrase in the negative set if we are evaluating a negative operator.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {
query_terms.push(located_query_term);
}
}
}
Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
}
pub fn number_of_typos_allowed<'ctx>(
ctx: &SearchContext<'ctx>,
) -> Result<impl Fn(&str) -> u8 + 'ctx> {
let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
let exact_words = ctx.index.exact_words(ctx.txn)?;
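// Illustratively, with thresholds of 5 and 9 characters (assumed values;
// the actual ones come from the index settings fetched above): "kite"
// allows 0 typos, "kitten" allows 1, "sunflowers" allows 2, and any word
// listed in `exact_words` always allows 0.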
Ok(Box::new(move |word: &str| {
if !authorize_typos
|| word.len() < min_len_one_typo as usize
|| exact_words.as_ref().map_or(false, |fst| fst.contains(word))
{
0
} else if word.len() < min_len_two_typos as usize {
1
} else {
2
}
}))
}
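/// Try to build an ngram query term by concatenating consecutive single-word
/// terms (e.g. `sun` + `flower` -> `sunflower`); return `None` when a term is
/// a phrase, when the terms are not adjacent, or when the concatenation
/// exceeds the maximum word length.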
pub fn make_ngram(
ctx: &mut SearchContext<'_>,
terms: &[LocatedQueryTerm],
number_of_typos_allowed: &impl Fn(&str) -> u8,
) -> Result<Option<LocatedQueryTerm>> {
assert!(!terms.is_empty());
for t in terms {
if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
return Ok(None);
}
}
for ts in terms.windows(2) {
let [t1, t2] = ts else { panic!() };
if *t1.positions.end() != t2.positions.start() - 1 {
return Ok(None);
}
}
let mut words_interned = vec![];
for term in terms {
if let Some(original_term_word) = term.value.original_single_word(ctx) {
words_interned.push(original_term_word);
} else {
return Ok(None);
}
}
let words =
words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
let start = *terms.first().as_ref().unwrap().positions.start();
let end = *terms.last().as_ref().unwrap().positions.end();
let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
let ngram_str = words.join("");
if ngram_str.len() > MAX_WORD_LENGTH {
return Ok(None);
}
let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
let max_nbr_typos =
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
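// One reading of the formula above: an ngram built from `n` words is allowed
// `n - 1` fewer typos than a plain word of the same length, since each
// removed separator already accounts for one edit.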
let mut term =
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
// Now add the synonyms
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
term.zero_typo.synonyms.extend(
index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
ctx.phrase_interner.insert(Phrase { words })
}),
);
let term = QueryTerm {
original: ngram_str_interned,
ngram_words: Some(words_interned),
is_prefix,
max_levenshtein_distance: max_nbr_typos,
zero_typo: term.zero_typo,
one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit,
};
let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
Ok(Some(term))
}
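/// Accumulates the words of a quoted phrase (stop words become `None`
/// placeholders) together with the range of positions the phrase spans.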
struct PhraseBuilder {
words: Vec<Option<crate::search::new::Interned<String>>>,
start: u16,
end: u16,
}
impl PhraseBuilder {
fn empty() -> Self {
Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
}
fn is_empty(&self) -> bool {
self.words.is_empty() || self.words.iter().all(Option::is_none)
}
// precondition: token has kind Word or StopWord
fn push_word(
&mut self,
ctx: &mut SearchContext<'_>,
token: &charabia::Token<'_>,
position: u16,
) {
if self.is_empty() {
self.start = position;
}
self.end = position;
if let TokenKind::StopWord = token.kind {
self.words.push(None);
} else {
// token has kind Word
let word = ctx.word_interner.insert(token.lemma().to_string());
self.words.push(Some(word));
}
}
fn build(self, ctx: &mut SearchContext<'_>) -> Option<LocatedQueryTerm> {
if self.is_empty() {
return None;
}
Some(LocatedQueryTerm {
value: ctx.term_interner.push({
let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
let phrase_desc = phrase.description(ctx);
QueryTerm {
original: ctx.word_interner.insert(phrase_desc),
ngram_words: None,
max_levenshtein_distance: 0,
is_prefix: false,
zero_typo: ZeroTypoTerm {
phrase: Some(phrase),
exact: None,
prefix_of: BTreeSet::default(),
synonyms: BTreeSet::default(),
use_prefix_db: None,
},
one_typo: Lazy::Uninit,
two_typo: Lazy::Uninit,
}
}),
positions: self.start..=self.end,
})
}
}
#[cfg(test)]
mod tests {
use charabia::TokenizerBuilder;
use super::*;
use crate::index::tests::TempIndex;
fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
]))
.unwrap();
temp_index
}
#[test]
fn start_with_hard_separator() -> Result<()> {
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize(".");
let index = temp_index_with_documents();
let rtxn = index.read_txn()?;
let mut ctx = SearchContext::new(&index, &rtxn)?;
        // before the fix for <https://github.com/meilisearch/meilisearch/issues/3785>, this panicked with `attempt to add with overflow`
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(query_terms.is_empty());
Ok(())
}
}

View file

@ -0,0 +1,21 @@
use itertools::Itertools;
use crate::search::new::interner::Interned;
use crate::SearchContext;
/// A phrase in the user's search query, consisting of several words
/// that must appear side-by-side in the search results.
#[derive(Default, Clone, PartialEq, Eq, Hash)]
pub struct Phrase {
pub words: Vec<Option<Interned<String>>>,
}
impl Interned<Phrase> {
pub fn description(self, ctx: &SearchContext<'_>) -> String {
let p = ctx.phrase_interner.get(self);
p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
}
pub fn words(self, ctx: &SearchContext<'_>) -> Vec<Option<Interned<String>>> {
let p = ctx.phrase_interner.get(self);
p.words.clone()
}
}
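// Sketch of the intended behavior (illustrative, not a doctest): stop words
// are stored as `None` and skipped by `description`, so a phrase whose words
// are [Some("war"), None, None, Some("worlds")] is described as "war worlds",
// while `words` returns the full vector, `None`s included.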

View file

@ -0,0 +1,92 @@
use std::collections::HashSet;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, MappedInterner};
use crate::search::new::query_graph::{QueryNode, QueryNodeData};
use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, SearchContext};
use crate::Result;
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Build the ranking rule graph from the given query graph
pub fn build(
ctx: &mut SearchContext<'_>,
query_graph: QueryGraph,
cost_of_ignoring_node: MappedInterner<QueryNode, Option<(u32, SmallBitmap<QueryNode>)>>,
) -> Result<Self> {
let QueryGraph { nodes: graph_nodes, .. } = &query_graph;
let mut conditions_interner = DedupInterner::default();
let mut edges_store = DedupInterner::default();
let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new());
for (source_id, source_node) in graph_nodes.iter() {
let new_edges = edges_of_node.get_mut(source_id);
for dest_idx in source_node.successors.iter() {
let src_term = match &source_node.data {
QueryNodeData::Term(t) => Some(t),
QueryNodeData::Start => None,
QueryNodeData::Deleted | QueryNodeData::End => panic!(),
};
let dest_node = graph_nodes.get(dest_idx);
let dest_term = match &dest_node.data {
QueryNodeData::Term(t) => t,
QueryNodeData::End => {
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost: 0,
condition: None,
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
}));
new_edges.insert(new_edge_id);
continue;
}
QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
};
if let Some((cost_of_ignoring, forbidden_nodes)) =
cost_of_ignoring_node.get(dest_idx)
{
let dest = graph_nodes.get(dest_idx);
let dest_size = match &dest.data {
QueryNodeData::Term(term) => term.term_ids.len(),
_ => panic!(),
};
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost: *cost_of_ignoring * dest_size as u32,
condition: None,
nodes_to_skip: forbidden_nodes.clone(),
}));
new_edges.insert(new_edge_id);
}
let edges = G::build_edges(ctx, &mut conditions_interner, src_term, dest_term)?;
if edges.is_empty() {
continue;
}
for (cost, condition) in edges {
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost,
condition: Some(condition),
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
}));
new_edges.insert(new_edge_id);
}
}
}
let edges_store = edges_store.freeze();
let edges_of_node =
edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store));
let conditions_interner = conditions_interner.freeze();
Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner })
}
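    // Summary sketch of the edges inserted above (illustrative): for each
    // source -> dest pair of the query graph, up to three kinds of edges are
    // created:
    //   1. a zero-cost, unconditional edge into the END node;
    //   2. a "skip" edge with no condition and a cost of
    //      `cost_of_ignoring * term_ids.len()`, carrying the forbidden nodes
    //      computed by the caller;
    //   3. one conditional edge per (cost, condition) pair returned by
    //      `G::build_edges`.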
}

View file

@ -0,0 +1,400 @@
/** Implements a "PathVisitor" which finds all paths of a certain cost
from the START to END node of a ranking rule graph.
A path is a list of conditions. A condition is the data associated with
an edge, given by the ranking rule. Some edges don't have a condition associated
with them; they are "unconditional". These kinds of edges are used to "skip" a node.
The algorithm uses a depth-first search. It benefits from two main optimisations:
- The list of all possible costs to go from any node to the END node is precomputed
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
untraversable depending on what other edges were selected.
These two optimisations are meant to avoid traversing edges that wouldn't lead
to a valid path. In practically all cases, we avoid the exponential complexity
that is inherent to depth-first search in a large ranking rule graph.
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
conditions with each list of traversed conditions.
For example, the DeadEndsCache could say the following:
- Immediately, from the start, the conditions `[a,b]` are forbidden
- if we take the condition `c`, then the conditions `[e]` are also forbidden
- and if after that, we take `f`, then `[h,i]` are also forbidden
- etc.
- if we take `g`, then `[f]` is also forbidden
- etc.
- etc.
As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
When a path is found from START to END, we give it to the `visit` closure.
This closure takes a mutable reference to the `DeadEndsCache`. This means that
the caller can update this cache. Therefore, we must handle the case where the
DeadEndsCache has been updated. This means potentially backtracking up to the point
where the traversed conditions are all allowed by the new DeadEndsCache.
The algorithm also implements the `TermsMatchingStrategy` logic.
Some edges are augmented with a list of "nodes_to_skip". Skipping
a node means "reaching this node through an unconditional edge". If we have
already traversed (i.e. not skipped) a node that is in this list, then we know that we
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
future node that was present in the "nodes_to_skip" list.
The caller can decide to stop the path finding algorithm
by returning a `ControlFlow::Break` from the `visit` closure.
*/
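// A compact restatement of the example above (conditions a..i are hypothetical):
//
//   prefix traversed    forbidden conditions
//   []                  {a, b}
//   [c]                 {a, b, e}
//   [c, f]              {a, b, e, h, i}
//   [g]                 {a, b, f}
//
// The forbidden set only grows as the traversed prefix grows, which is what
// lets the visitor recompute it incrementally after each condition is pushed.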
use std::collections::{BTreeSet, VecDeque};
use std::iter::FromIterator;
use std::ops::ControlFlow;
use fxhash::FxHashSet;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{Interned, MappedInterner};
use crate::search::new::query_graph::QueryNode;
use crate::search::new::small_bitmap::SmallBitmap;
use crate::Result;
/// Closure which processes a path found by the `PathVisitor`
type VisitFn<'f, G> = &'f mut dyn FnMut(
// the path as a list of conditions
&[Interned<<G as RankingRuleGraphTrait>::Condition>],
&mut RankingRuleGraph<G>,
// a mutable reference to the DeadEndsCache, to update it in case the given
// path doesn't resolve to any valid document ids
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
) -> Result<ControlFlow<()>>;
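// Sketch of a conforming `visit` closure (illustrative; the concrete dead-end
// bookkeeping depends on the caller and is elided here):
//
//   let mut visit = |path: &[Interned<G::Condition>],
//                    graph: &mut RankingRuleGraph<G>,
//                    dead_ends_cache: &mut DeadEndsCache<G::Condition>| {
//       // Resolve `path` to document ids; if the result is empty, record the
//       // offending conditions in `dead_ends_cache` so that future paths are
//       // pruned, then continue the traversal:
//       Ok(std::ops::ControlFlow::Continue(()))
//       // ...or return Ok(ControlFlow::Break(())) to stop the path finder.
//   };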
/// A structure which is kept but not updated during the traversal of the graph.
/// It can, however, be updated by the `visit` closure once a valid path has been found.
struct VisitorContext<'a, G: RankingRuleGraphTrait> {
graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
}
/// The internal state of the traversal algorithm
struct VisitorState<G: RankingRuleGraphTrait> {
/// Budget from the current node to the end node
remaining_cost: u64,
/// Previously visited conditions, in order.
path: Vec<Interned<G::Condition>>,
/// Previously visited conditions, as an efficient and compact set.
visited_conditions: SmallBitmap<G::Condition>,
    /// Previously visited (i.e. not skipped) nodes, as an efficient and compact set.
visited_nodes: SmallBitmap<QueryNode>,
/// The conditions that cannot be visited anymore
forbidden_conditions: SmallBitmap<G::Condition>,
/// The nodes that cannot be visited anymore (they must be skipped)
nodes_to_skip: SmallBitmap<QueryNode>,
}
/// See module documentation
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
state: VisitorState<G>,
ctx: VisitorContext<'a, G>,
}
impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
pub fn new(
cost: u64,
graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
) -> Self {
Self {
state: VisitorState {
remaining_cost: cost,
path: vec![],
visited_conditions: SmallBitmap::for_interned_values_in(&graph.conditions_interner),
visited_nodes: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
forbidden_conditions: SmallBitmap::for_interned_values_in(
&graph.conditions_interner,
),
nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
},
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
}
}
/// See module documentation
pub fn visit_paths(mut self, visit: VisitFn<'_, G>) -> Result<()> {
let _ =
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
Ok(())
}
}
impl<G: RankingRuleGraphTrait> VisitorState<G> {
/// Visits a node: traverse all its valid conditional and unconditional edges.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_node(
&mut self,
from_node: Interned<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
        // Tracks whether any valid path was found from this point.
        // If one was, the DeadEndsCache may have been updated by the `visit` closure,
        // and we may need to do more work to backtrack.
let mut any_valid = false;
let edges = ctx.graph.edges_of_node.get(from_node).clone();
for edge_idx in edges.iter() {
            // The edge may be `None` if it was deleted.
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
if self.remaining_cost < edge.cost as u64 {
continue;
}
self.remaining_cost -= edge.cost as u64;
let cf = match edge.condition {
Some(condition) => self.visit_condition(
condition,
edge.dest_node,
&edge.nodes_to_skip,
visit,
ctx,
)?,
None => self.visit_no_condition(edge.dest_node, &edge.nodes_to_skip, visit, ctx)?,
};
self.remaining_cost += edge.cost as u64;
let ControlFlow::Continue(next_any_valid) = cf else {
return Ok(ControlFlow::Break(()));
};
any_valid |= next_any_valid;
if next_any_valid {
// backtrack as much as possible if a valid path was found and the dead_ends_cache
// was updated such that the current prefix is now invalid
self.forbidden_conditions = ctx
.dead_ends_cache
.forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied());
if self.visited_conditions.intersects(&self.forbidden_conditions) {
return Ok(ControlFlow::Continue(true));
}
}
}
Ok(ControlFlow::Continue(any_valid))
}
/// Visits an unconditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_no_condition(
&mut self,
dest_node: Interned<QueryNode>,
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
if !ctx
.all_costs_from_node
.get(dest_node)
.iter()
.any(|next_cost| *next_cost == self.remaining_cost)
{
return Ok(ControlFlow::Continue(false));
}
// We've reached the END node!
if dest_node == ctx.graph.query_graph.end_node {
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
            // We could change the return type of the visit closure so that the caller
            // tells us whether the dead ends cache was updated or not.
            // Alternatively, the DeadEndsCache could carry a generation number,
            // so that we wouldn't need to juggle these booleans at all.
match control_flow {
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
}
} else {
            let old_nodes_to_skip = self.nodes_to_skip.clone();
            self.nodes_to_skip.union(edge_new_nodes_to_skip);
            let cf = self.visit_node(dest_node, visit, ctx)?;
            self.nodes_to_skip = old_nodes_to_skip;
Ok(cf)
}
}
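    // Pruning sketch for the cost check above (numbers are hypothetical): if
    // `remaining_cost == 3` but `all_costs_from_node.get(dest_node) == [1, 4]`,
    // no suffix starting at `dest_node` can consume exactly the remaining
    // budget, so the edge is abandoned without recursing.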
/// Visits a conditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_condition(
&mut self,
condition: Interned<G::Condition>,
dest_node: Interned<QueryNode>,
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
assert!(dest_node != ctx.graph.query_graph.end_node);
if self.forbidden_conditions.contains(condition)
|| self.nodes_to_skip.contains(dest_node)
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes)
{
return Ok(ControlFlow::Continue(false));
}
        // Check that, from the destination node, at least one reachable cost
        // corresponds exactly to our remaining budget.
if !ctx
.all_costs_from_node
.get(dest_node)
.iter()
.any(|next_cost| *next_cost == self.remaining_cost)
{
return Ok(ControlFlow::Continue(false));
}
self.path.push(condition);
self.visited_nodes.insert(dest_node);
self.visited_conditions.insert(condition);
let old_forb_cond = self.forbidden_conditions.clone();
if let Some(next_forbidden) =
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
{
self.forbidden_conditions.union(&next_forbidden);
}
let old_nodes_to_skip = self.nodes_to_skip.clone();
self.nodes_to_skip.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?;
self.nodes_to_skip = old_nodes_to_skip;
self.forbidden_conditions = old_forb_cond;
self.visited_conditions.remove(condition);
self.visited_nodes.remove(dest_node);
self.path.pop();
Ok(cf)
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
    /// Compute, for every node, the sorted, deduplicated list of every
    /// possible cost from that node to the END node, walking the graph backward.
    pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
if cur_node == self.query_graph.end_node {
*costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
return;
}
let mut self_costs = Vec::<u64>::new();
let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() {
let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
let succ_node = edge.dest_node;
let succ_costs = costs_to_end.get(succ_node);
for succ_cost in succ_costs {
self_costs.push(edge.cost as u64 + succ_cost);
}
}
self_costs.sort_unstable();
self_costs.dedup();
*costs_to_end.get_mut(cur_node) = self_costs;
});
costs_to_end
}
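    // Worked example on a hypothetical four-node graph with edges
    //   START -1-> A, START -2-> B, A -3-> END, B -1-> END:
    // the backward pass yields costs_to_end(END) = [0], costs_to_end(A) = [3],
    // costs_to_end(B) = [1], and costs_to_end(START) = [3, 4] (sorted and
    // deduplicated), since START reaches END for 2 + 1 = 3 via B and
    // 1 + 3 = 4 via A.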
pub fn update_all_costs_before_node(
&self,
node_with_removed_outgoing_conditions: Interned<QueryNode>,
costs: &mut MappedInterner<QueryNode, Vec<u64>>,
) {
// Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
// We first check that no other node is contributing the same total cost to a predecessor before removing
// the cost from the predecessor.
self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
let mut costs_to_remove = FxHashSet::default();
costs_to_remove.extend(costs.get(cur_node).iter().copied());
let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() {
let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
for cost in costs.get(edge.dest_node).iter() {
costs_to_remove.remove(&(*cost + edge.cost as u64));
if costs_to_remove.is_empty() {
return;
}
}
}
if costs_to_remove.is_empty() {
return;
}
let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
for c in costs_to_remove {
new_costs.remove(&c);
}
*costs.get_mut(cur_node) = new_costs.into_iter().collect();
});
}
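    // Worked example (hypothetical): let costs(P) = [3, 4] with edges
    //   P -1-> Q, costs(Q) = [2]   and   P -2-> R, costs(R) = [2].
    // Both 1 + 2 = 3 and 2 + 2 = 4 are still contributed, so `costs_to_remove`
    // empties and the traversal stops at P. If the edge to R had just been
    // removed, 4 would no longer be justified and costs(P) would shrink to [3].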
/// Traverse the graph backwards from the given node such that every time
/// a node is visited, we are guaranteed that all its successors either:
/// 1. have already been visited; OR
/// 2. were not reachable from the given node
pub fn traverse_breadth_first_backward(
&self,
from: Interned<QueryNode>,
mut visit: impl FnMut(Interned<QueryNode>),
) {
let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
{
            // Walk backward to collect the set of nodes reachable from `from`;
            // the nodes that are not reachable will later be marked as already visited.
let mut stack = VecDeque::new();
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
enqueued.insert(from);
stack.push_back(from);
while let Some(n) = stack.pop_front() {
if reachable.contains(n) {
continue;
}
reachable.insert(n);
for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
};
let mut unreachable_or_visited =
SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
for (n, _) in self.query_graph.nodes.iter() {
if !reachable.contains(n) {
unreachable_or_visited.insert(n);
}
}
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
let mut stack = VecDeque::new();
enqueued.insert(from);
stack.push_back(from);
while let Some(cur_node) = stack.pop_front() {
if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
stack.push_back(cur_node);
continue;
}
unreachable_or_visited.insert(cur_node);
visit(cur_node);
for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
}
}
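// Illustration on a hypothetical diamond graph (predecessors shown):
//   END <- {A, B},  A <- {START},  B <- {START}
// Visiting backward from END yields END first, then A and B, and START last:
// if START is popped before both A and B have been visited, it is pushed back
// onto the queue, which is how the guarantee above is enforced.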

Some files were not shown because too many files have changed in this diff