mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-07-04 04:17:10 +02:00)

Merge branch 'main' into all-cpus-in-import-dump
Commit 75a7e40a27, 74 changed files with 2402 additions and 1726 deletions
crates/milli/src/constants.rs
@@ -1,6 +1,13 @@
-pub static VERSION_MAJOR: &str = env!("CARGO_PKG_VERSION_MAJOR");
-pub static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR");
-pub static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH");
+pub const VERSION_MAJOR: u32 = parse_u32(env!("CARGO_PKG_VERSION_MAJOR"));
+pub const VERSION_MINOR: u32 = parse_u32(env!("CARGO_PKG_VERSION_MINOR"));
+pub const VERSION_PATCH: u32 = parse_u32(env!("CARGO_PKG_VERSION_PATCH"));
+
+const fn parse_u32(s: &str) -> u32 {
+    match u32::from_str_radix(s, 10) {
+        Ok(version) => version,
+        Err(_) => panic!("could not parse as u32"),
+    }
+}
 
 pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
 pub const RESERVED_GEO_FIELD_NAME: &str = "_geo";
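A note on the hunk above: moving from string statics to u32 consts pushes version parsing to compile time. A minimal standalone sketch of the behavior, assuming a toolchain where u32::from_str_radix is callable in const contexts (const-stable since Rust 1.82):

    const fn parse_u32(s: &str) -> u32 {
        match u32::from_str_radix(s, 10) {
            Ok(version) => version,
            Err(_) => panic!("could not parse as u32"),
        }
    }

    // Evaluated at compile time: a malformed version string fails the build
    // instead of panicking at runtime.
    const MAJOR: u32 = parse_u32("1");

    fn main() {
        assert_eq!(MAJOR, 1);
    }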
crates/milli/src/disabled_typos_terms.rs (new file, +50)
@@ -0,0 +1,50 @@
+use heed::{
+    types::{SerdeJson, Str},
+    RoTxn, RwTxn,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::{index::main_key, Index};
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct DisabledTyposTerms {
+    pub disable_on_numbers: bool,
+}
+
+impl Index {
+    pub fn disabled_typos_terms(&self, txn: &RoTxn<'_>) -> heed::Result<DisabledTyposTerms> {
+        self.main
+            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
+            .get(txn, main_key::DISABLED_TYPOS_TERMS)
+            .map(|option| option.unwrap_or_default())
+    }
+
+    pub(crate) fn put_disabled_typos_terms(
+        &self,
+        txn: &mut RwTxn<'_>,
+        disabled_typos_terms: &DisabledTyposTerms,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeJson<DisabledTyposTerms>>().put(
+            txn,
+            main_key::DISABLED_TYPOS_TERMS,
+            disabled_typos_terms,
+        )?;
+
+        Ok(())
+    }
+
+    pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
+        self.main
+            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
+            .delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
+        Ok(())
+    }
+}
+
+impl DisabledTyposTerms {
+    pub fn is_exact(&self, word: &str) -> bool {
+        // If disable_on_numbers is true, we disable the word if it contains only numbers or punctuation
+        self.disable_on_numbers && word.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
+    }
+}
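Given the is_exact predicate above, a quick sketch of which tokens it flags when disable_on_numbers is enabled (the values are illustrative):

    let terms = DisabledTyposTerms { disable_on_numbers: true };
    assert!(terms.is_exact("2025"));    // digits only: typo tolerance disabled
    assert!(terms.is_exact("3.14"));    // digits plus punctuation: disabled too
    assert!(!terms.is_exact("hello"));  // ordinary word: typos still tolerated
    assert!(!terms.is_exact("v1"));     // contains a letter, so not flagged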
File diff suppressed because it is too large
@@ -12,6 +12,7 @@ mod asc_desc;
 mod attribute_patterns;
 mod criterion;
 pub mod database_stats;
+pub mod disabled_typos_terms;
 mod error;
 mod external_documents_ids;
 pub mod facet;
@@ -1,10 +1,11 @@
 use std::collections::BTreeSet;
 use std::fmt::{Debug, Display};
-use std::ops::Bound::{self, Excluded, Included};
+use std::ops::Bound::{self, Excluded, Included, Unbounded};
 
 use either::Either;
 pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token};
 use heed::types::LazyDecode;
+use heed::BytesEncode;
 use memchr::memmem::Finder;
 use roaring::{MultiOps, RoaringBitmap};
 use serde_json::Value;
@@ -14,7 +15,7 @@ use crate::constants::RESERVED_GEO_FIELD_NAME;
 use crate::error::{Error, UserError};
 use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
 use crate::heed_codec::facet::{
-    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, OrderedF64Codec,
+    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::index::db_name::FACET_ID_STRING_DOCIDS;
 use crate::{
@@ -271,7 +272,7 @@ impl<'a> Filter<'a> {
         // as the facets values are all in the same database and prefixed by the
         // field id and the level.
 
-        let (left, right) = match operator {
+        let (number_bounds, (left_str, right_str)) = match operator {
             // return an error if the filter is not allowed for this field
             Condition::GreaterThan(_)
             | Condition::GreaterThanOrEqual(_)
@@ -305,17 +306,37 @@
             ));
         }
         Condition::GreaterThan(val) => {
-            (Excluded(val.parse_finite_float()?), Included(f64::MAX))
+            let number = val.parse_finite_float().ok();
+            let number_bounds = number.map(|number| (Excluded(number), Included(f64::MAX)));
+            let str_bounds = (Excluded(val.value()), Unbounded);
+            (number_bounds, str_bounds)
         }
         Condition::GreaterThanOrEqual(val) => {
-            (Included(val.parse_finite_float()?), Included(f64::MAX))
+            let number = val.parse_finite_float().ok();
+            let number_bounds = number.map(|number| (Included(number), Included(f64::MAX)));
+            let str_bounds = (Included(val.value()), Unbounded);
+            (number_bounds, str_bounds)
         }
+        Condition::LowerThan(val) => {
+            let number = val.parse_finite_float().ok();
+            let number_bounds = number.map(|number| (Included(f64::MIN), Excluded(number)));
+            let str_bounds = (Unbounded, Excluded(val.value()));
+            (number_bounds, str_bounds)
+        }
-        Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse_finite_float()?)),
         Condition::LowerThanOrEqual(val) => {
-            (Included(f64::MIN), Included(val.parse_finite_float()?))
+            let number = val.parse_finite_float().ok();
+            let number_bounds = number.map(|number| (Included(f64::MIN), Included(number)));
+            let str_bounds = (Unbounded, Included(val.value()));
+            (number_bounds, str_bounds)
         }
         Condition::Between { from, to } => {
-            (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
+            let from_number = from.parse_finite_float().ok();
+            let to_number = to.parse_finite_float().ok();
+
+            let number_bounds =
+                from_number.zip(to_number).map(|(from, to)| (Included(from), Included(to)));
+            let str_bounds = (Included(from.value()), Included(to.value()));
+            (number_bounds, str_bounds)
         }
         Condition::Null => {
             let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
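The net effect of the rewritten arms: a range operand that fails to parse as a finite float no longer aborts with an error; the filter simply skips the number database and keeps the lexicographic string bounds. A standalone sketch of the LowerThan logic (names hypothetical, parse_finite_float approximated with str::parse plus a finiteness check):

    use std::ops::Bound::{self, Excluded, Included, Unbounded};

    fn lower_than_bounds(raw: &str) -> (Option<(Bound<f64>, Bound<f64>)>, (Bound<&str>, Bound<&str>)) {
        let number = raw.parse::<f64>().ok().filter(|n| n.is_finite());
        let number_bounds = number.map(|n| (Included(f64::MIN), Excluded(n)));
        let str_bounds = (Unbounded, Excluded(raw));
        (number_bounds, str_bounds)
    }

    fn main() {
        // "100" parses: both the number and the string databases get queried.
        assert!(lower_than_bounds("100").0.is_some());
        // "inf" parses to f64::INFINITY, which is not finite: only the string
        // bounds remain, so `price < inf` matches string facet values instead
        // of erroring.
        assert!(lower_than_bounds("inf").0.is_none());
    }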
@@ -415,29 +436,47 @@
         };
 
         let mut output = RoaringBitmap::new();
-        Self::explore_facet_number_levels(
-            rtxn,
-            numbers_db,
-            field_id,
-            left,
-            right,
-            universe,
-            &mut output,
-        )?;
+
+        if let Some((left_number, right_number)) = number_bounds {
+            Self::explore_facet_levels(
+                rtxn,
+                numbers_db,
+                field_id,
+                &left_number,
+                &right_number,
+                universe,
+                &mut output,
+            )?;
+        }
+
+        Self::explore_facet_levels(
+            rtxn,
+            strings_db,
+            field_id,
+            &left_str,
+            &right_str,
+            universe,
+            &mut output,
+        )?;
 
         Ok(output)
     }
 
     /// Aggregates the documents ids that are part of the specified range automatically
     /// going deeper through the levels.
-    fn explore_facet_number_levels(
-        rtxn: &heed::RoTxn<'_>,
-        db: heed::Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
+    fn explore_facet_levels<'data, BoundCodec>(
+        rtxn: &'data heed::RoTxn<'data>,
+        db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>,
         field_id: FieldId,
-        left: Bound<f64>,
-        right: Bound<f64>,
+        left: &'data Bound<<BoundCodec as heed::BytesEncode<'data>>::EItem>,
+        right: &'data Bound<<BoundCodec as heed::BytesEncode<'data>>::EItem>,
         universe: Option<&RoaringBitmap>,
         output: &mut RoaringBitmap,
-    ) -> Result<()> {
+    ) -> Result<()>
+    where
+        BoundCodec: for<'b> BytesEncode<'b>,
+        for<'b> <BoundCodec as BytesEncode<'b>>::EItem: Sized + PartialOrd,
+    {
         match (left, right) {
             // lower TO upper when lower > upper must return no result
             (Included(l), Included(r)) if l > r => return Ok(()),
@@ -446,8 +485,8 @@
             (Excluded(l), Included(r)) if l >= r => return Ok(()),
             (_, _) => (),
         }
-        facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>(
-            rtxn, db, field_id, &left, &right, universe, output,
+        facet_range_search::find_docids_of_facet_within_bounds::<BoundCodec>(
+            rtxn, db, field_id, left, right, universe, output,
         )?;
 
         Ok(())
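In short, explore_facet_number_levels becomes the codec-generic explore_facet_levels, so the same level-descending traversal can serve both the f64-keyed facet number database and the string-keyed facet string database. That is also why the OrderedF64Codec import disappears from this file and the hard-coded codec in find_docids_of_facet_within_bounds gives way to the BoundCodec type parameter.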
@@ -1249,28 +1288,24 @@ mod tests {
         let result = filter.evaluate(&rtxn, &index).unwrap();
         assert!(result.contains(0));
         let filter = Filter::from_str("price < inf").unwrap().unwrap();
-        assert!(matches!(
-            filter.evaluate(&rtxn, &index),
-            Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_)))
-        ));
+        let result = filter.evaluate(&rtxn, &index).unwrap();
+        // this is allowed due to filters with strings
+        assert!(result.contains(1));
 
         let filter = Filter::from_str("price = NaN").unwrap().unwrap();
         let result = filter.evaluate(&rtxn, &index).unwrap();
         assert!(result.is_empty());
         let filter = Filter::from_str("price < NaN").unwrap().unwrap();
-        assert!(matches!(
-            filter.evaluate(&rtxn, &index),
-            Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_)))
-        ));
+        let result = filter.evaluate(&rtxn, &index).unwrap();
+        assert!(result.contains(1));
 
         let filter = Filter::from_str("price = infinity").unwrap().unwrap();
         let result = filter.evaluate(&rtxn, &index).unwrap();
         assert!(result.contains(2));
         let filter = Filter::from_str("price < infinity").unwrap().unwrap();
-        assert!(matches!(
-            filter.evaluate(&rtxn, &index),
-            Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_)))
-        ));
+        let result = filter.evaluate(&rtxn, &index).unwrap();
+        assert!(result.contains(0));
+        assert!(result.contains(1));
     }
 
     #[test]
@@ -8,7 +8,7 @@ use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
 pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
-use self::new::{execute_vector_search, PartialSearchResult};
+use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
 use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::vector::Embedder;
@@ -269,6 +269,12 @@ impl<'a> Search<'a> {
             )?,
         };
 
+        if let Some(VectorStoreStats { total_time, total_queries, total_results }) =
+            ctx.vector_store_stats
+        {
+            tracing::debug!("Vector store stats: total_time={total_time:.02?}, total_queries={total_queries}, total_results={total_results}");
+        }
+
         // consume context and located_query_terms to build MatchingWords.
         let matching_words = match located_query_terms {
             Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
@@ -22,6 +22,8 @@ mod vector_sort;
 mod tests;
 
 use std::collections::HashSet;
+use std::ops::AddAssign;
+use std::time::Duration;
 
 use bucket_sort::{bucket_sort, BucketSortOutput};
 use charabia::{Language, TokenizerBuilder};
@@ -72,6 +74,7 @@ pub struct SearchContext<'ctx> {
     pub phrase_docids: PhraseDocIdsCache,
     pub restricted_fids: Option<RestrictedFids>,
     pub prefix_search: PrefixSearch,
+    pub vector_store_stats: Option<VectorStoreStats>,
 }
 
 impl<'ctx> SearchContext<'ctx> {
@@ -101,6 +104,7 @@ impl<'ctx> SearchContext<'ctx> {
             phrase_docids: <_>::default(),
             restricted_fids: None,
             prefix_search,
+            vector_store_stats: None,
         })
     }
 
@@ -166,6 +170,25 @@ impl<'ctx> SearchContext<'ctx> {
     }
 }
 
+#[derive(Debug, Default)]
+pub struct VectorStoreStats {
+    /// The total time spent on vector search.
+    pub total_time: Duration,
+    /// The number of searches performed.
+    pub total_queries: usize,
+    /// The number of nearest neighbors found.
+    pub total_results: usize,
+}
+
+impl AddAssign for VectorStoreStats {
+    fn add_assign(&mut self, other: Self) {
+        let Self { total_time, total_queries, total_results } = self;
+        *total_time += other.total_time;
+        *total_queries += other.total_queries;
+        *total_results += other.total_results;
+    }
+}
+
 #[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
 pub enum Word {
     Original(Interned<String>),
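Since VectorStoreStats derives Default and implements AddAssign, per-query measurements can be folded together with +=. A small sketch:

    use std::time::Duration;

    let mut total = VectorStoreStats::default();
    total += VectorStoreStats {
        total_time: Duration::from_millis(3),
        total_queries: 1,
        total_results: 20,
    };
    total += VectorStoreStats {
        total_time: Duration::from_millis(5),
        total_queries: 1,
        total_results: 15,
    };
    assert_eq!(total.total_queries, 2);
    assert_eq!(total.total_results, 35);
    assert_eq!(total.total_time, Duration::from_millis(8));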
@@ -1,8 +1,10 @@
 use std::iter::FromIterator;
+use std::time::Instant;
 
 use roaring::RoaringBitmap;
 
 use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
+use super::VectorStoreStats;
 use crate::score_details::{self, ScoreDetails};
 use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
 use crate::{DocumentId, Result, SearchContext, SearchLogger};
@@ -53,9 +55,15 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
     ) -> Result<()> {
         let target = &self.target;
 
+        let before = Instant::now();
         let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
         let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
         self.cached_sorted_docids = results.into_iter();
+        *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {
+            total_time: before.elapsed(),
+            total_queries: 1,
+            total_results: self.cached_sorted_docids.len(),
+        };
 
         Ok(())
     }
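The ctx.vector_store_stats.get_or_insert_default() call lazily initializes the Option<VectorStoreStats> on the first vector query, after which += (the AddAssign impl above) accumulates across ranking-rule invocations. Worth noting: Option::get_or_insert_default needs a reasonably recent toolchain, as it was only stabilized around Rust 1.83.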
@@ -1,4 +0,0 @@
----
-source: milli/src/index.rs
----
-[0, ]

@@ -1,4 +0,0 @@
----
-source: milli/src/index.rs
----
-[]

@@ -0,0 +1,4 @@
+---
+source: crates/milli/src/test_index.rs
+---
+[0, ]

@@ -0,0 +1,4 @@
+---
+source: crates/milli/src/test_index.rs
+---
+[]
crates/milli/src/test_index.rs (new file, +1399)
File diff suppressed because it is too large
@@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         // merge all deletions
         let obkv = KvReaderDelAdd::from_slice(value);
         if let Some(value) = obkv.get(DelAdd::Deletion) {
-            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
+            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid)
+                || settings_diff.old.disabled_typos_terms.is_exact(w);
             buffer.clear();
             let mut obkv = KvWriterDelAdd::new(&mut buffer);
             obkv.insert(DelAdd::Deletion, value)?;
@@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         }
         // merge all additions
         if let Some(value) = obkv.get(DelAdd::Addition) {
-            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
+            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid)
+                || settings_diff.new.disabled_typos_terms.is_exact(w);
             buffer.clear();
             let mut obkv = KvWriterDelAdd::new(&mut buffer);
             obkv.insert(DelAdd::Addition, value)?;
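The two added `|| settings_diff.{old,new}.disabled_typos_terms.is_exact(w)` clauses are what route purely numeric words into the exact-word database during incremental updates: such a word is added to (and deleted from) exact_word_docids exactly as if its field had been declared an exact attribute, which is the existing mechanism that exempts a word from typo matching.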
@@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index(
             unreachable!();
         };
         let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
-        let clonable_exact_word_docids =
-            unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
 
         word_docids_builder.push(word_docids_reader.into_cursor()?);
         exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
         word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
         fst_merger_builder.push(clonable_word_docids.into_cursor()?);
-        fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
     }
 
     let word_docids_merger = word_docids_builder.build();
@@ -319,8 +319,11 @@ impl WordDocidsExtractors {
         let doc_alloc = &context.doc_alloc;
 
         let exact_attributes = index.exact_attributes(rtxn)?;
-        let is_exact_attribute =
-            |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
+        let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
+        let is_exact = |fname: &str, word: &str| {
+            exact_attributes.iter().any(|attr| contained_in(fname, attr))
+                || disabled_typos_terms.is_exact(word)
+        };
         match document_change {
             DocumentChange::Deletion(inner) => {
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
@@ -328,7 +331,7 @@ impl WordDocidsExtractors {
                         fid,
                         pos,
                         word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                         inner.docid(),
                         doc_alloc,
                     )
@@ -356,7 +359,7 @@ impl WordDocidsExtractors {
                         fid,
                         pos,
                         word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                         inner.docid(),
                         doc_alloc,
                     )
@@ -372,7 +375,7 @@ impl WordDocidsExtractors {
                         fid,
                         pos,
                         word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                         inner.docid(),
                         doc_alloc,
                     )
@@ -389,7 +392,7 @@ impl WordDocidsExtractors {
                         fid,
                         pos,
                         word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                         inner.docid(),
                         doc_alloc,
                     )
@@ -9,6 +9,7 @@ pub use document_operation::{DocumentOperation, PayloadStats};
 use hashbrown::HashMap;
 use heed::RwTxn;
 pub use partial_dump::PartialDump;
+pub use post_processing::recompute_word_fst_from_word_docids_database;
 pub use update_by_function::UpdateByFunction;
 pub use write::ChannelCongestion;
 use write::{build_vectors, update_index, write_to_db};
@@ -131,6 +131,20 @@ fn compute_word_fst(
     }
 }
 
+pub fn recompute_word_fst_from_word_docids_database(index: &Index, wtxn: &mut RwTxn) -> Result<()> {
+    let fst = fst::Set::default().map_data(std::borrow::Cow::Owned)?;
+    let mut word_fst_builder = WordFstBuilder::new(&fst)?;
+    let words = index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>();
+    for res in words {
+        let (word, _) = res?;
+        word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?;
+    }
+    let (word_fst_mmap, _) = word_fst_builder.build(index, wtxn)?;
+    index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
+
+    Ok(())
+}
+
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")]
 fn compute_facet_search_database(
     index: &Index,
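The helper iterates the word docids database keys only (via DecodeIgnore) and registers every word as an addition in a fresh FST. A standalone sketch of the same rebuild idea using the fst crate directly (assuming the fst 0.4 API; the real code goes through WordFstBuilder and LMDB):

    use fst::SetBuilder;

    fn main() -> Result<(), fst::Error> {
        let mut builder = SetBuilder::memory();
        // LMDB iterates keys in sorted order, which is exactly what an FST
        // builder requires.
        for word in ["1337", "apple", "banana"] {
            builder.insert(word)?;
        }
        let set = builder.into_set();
        assert!(set.contains("apple"));
        Ok(())
    }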
@@ -17,6 +17,7 @@ use super::IndexerConfig;
 use crate::attribute_patterns::PatternMatch;
 use crate::constants::RESERVED_GEO_FIELD_NAME;
 use crate::criterion::Criterion;
+use crate::disabled_typos_terms::DisabledTyposTerms;
 use crate::error::UserError;
 use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
 use crate::filterable_attributes_rules::match_faceted_field;
@@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> {
     synonyms: Setting<BTreeMap<String, Vec<String>>>,
     primary_key: Setting<String>,
     authorize_typos: Setting<bool>,
+    disable_on_numbers: Setting<bool>,
     min_word_len_two_typos: Setting<u8>,
     min_word_len_one_typo: Setting<u8>,
     exact_words: Setting<BTreeSet<String>>,
@@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             synonyms: Setting::NotSet,
             primary_key: Setting::NotSet,
             authorize_typos: Setting::NotSet,
+            disable_on_numbers: Setting::NotSet,
             exact_words: Setting::NotSet,
             min_word_len_two_typos: Setting::NotSet,
             min_word_len_one_typo: Setting::NotSet,
@@ -354,6 +357,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.min_word_len_one_typo = Setting::Reset;
     }
 
+    pub fn set_disable_on_numbers(&mut self, disable_on_numbers: bool) {
+        self.disable_on_numbers = Setting::Set(disable_on_numbers);
+    }
+
+    pub fn reset_disable_on_numbers(&mut self) {
+        self.disable_on_numbers = Setting::Reset;
+    }
+
     pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
         self.exact_words = Setting::Set(words);
     }
@@ -866,6 +877,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(())
     }
 
+    fn update_disabled_typos_terms(&mut self) -> Result<()> {
+        let mut disabled_typos_terms = self.index.disabled_typos_terms(self.wtxn)?;
+        match self.disable_on_numbers {
+            Setting::Set(disable_on_numbers) => {
+                disabled_typos_terms.disable_on_numbers = disable_on_numbers;
+            }
+            Setting::Reset => {
+                self.index.delete_disabled_typos_terms(self.wtxn)?;
+                disabled_typos_terms.disable_on_numbers =
+                    DisabledTyposTerms::default().disable_on_numbers;
+            }
+            Setting::NotSet => (),
+        }
+
+        self.index.put_disabled_typos_terms(self.wtxn, &disabled_typos_terms)?;
+        Ok(())
+    }
+
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
@@ -1246,6 +1275,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.update_prefix_search()?;
         self.update_facet_search()?;
         self.update_localized_attributes_rules()?;
+        self.update_disabled_typos_terms()?;
 
         let embedding_config_updates = self.update_embedding_configs()?;
 
@@ -1327,6 +1357,7 @@ impl InnerIndexSettingsDiff {
                 || old_settings.prefix_search != new_settings.prefix_search
                 || old_settings.localized_attributes_rules
                     != new_settings.localized_attributes_rules
+                || old_settings.disabled_typos_terms != new_settings.disabled_typos_terms
         };
 
         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
@@ -1526,6 +1557,7 @@ pub(crate) struct InnerIndexSettings {
     pub user_defined_searchable_attributes: Option<Vec<String>>,
     pub sortable_fields: HashSet<String>,
     pub exact_attributes: HashSet<FieldId>,
+    pub disabled_typos_terms: DisabledTyposTerms,
     pub proximity_precision: ProximityPrecision,
     pub embedding_configs: EmbeddingConfigs,
     pub geo_fields_ids: Option<(FieldId, FieldId)>,
@@ -1574,7 +1606,7 @@ impl InnerIndexSettings {
             .map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
         let builder = MetadataBuilder::from_index(index, rtxn)?;
         let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
-
+        let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
         Ok(Self {
             stop_words,
             allowed_separators,
@@ -1592,6 +1624,7 @@ impl InnerIndexSettings {
             geo_fields_ids,
             prefix_search,
             facet_search,
+            disabled_typos_terms,
         })
     }
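End to end, the new disable_on_numbers knob flows from the builder into update_disabled_typos_terms and finally into the DISABLED_TYPOS_TERMS key of the main database. A hypothetical usage sketch (the exact Settings::execute signature varies between milli versions, so treat the callbacks as placeholders):

    let mut wtxn = index.write_txn()?;
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    builder.set_disable_on_numbers(true);
    builder.execute(|_| (), || false)?; // progress callback, abort hook
    wtxn.commit()?;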
@@ -896,6 +896,7 @@ fn test_correct_settings_init() {
             localized_attributes_rules,
             prefix_search,
             facet_search,
+            disable_on_numbers,
         } = settings;
         assert!(matches!(searchable_fields, Setting::NotSet));
         assert!(matches!(displayed_fields, Setting::NotSet));
@@ -923,6 +924,7 @@ fn test_correct_settings_init() {
         assert!(matches!(localized_attributes_rules, Setting::NotSet));
         assert!(matches!(prefix_search, Setting::NotSet));
         assert!(matches!(facet_search, Setting::NotSet));
+        assert!(matches!(disable_on_numbers, Setting::NotSet));
     })
     .unwrap();
 }
@@ -1,12 +1,14 @@
 mod v1_12;
 mod v1_13;
 mod v1_14;
+mod v1_15;
 
 use heed::RwTxn;
 use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
 use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
 use v1_14::Latest_V1_13_To_Latest_V1_14;
+use v1_15::Latest_V1_14_To_Latest_V1_15;
 
 use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
 use crate::progress::{Progress, VariableNameStep};
 use crate::{Index, InternalError, Result};
@@ -23,12 +25,16 @@ trait UpgradeIndex {
 }
 
 /// Return true if the cached stats of the index must be regenerated
-pub fn upgrade(
+pub fn upgrade<MSP>(
     wtxn: &mut RwTxn,
     index: &Index,
     db_version: (u32, u32, u32),
+    must_stop_processing: MSP,
     progress: Progress,
-) -> Result<bool> {
+) -> Result<bool>
+where
+    MSP: Fn() -> bool + Sync,
+{
     let from = index.get_version(wtxn)?.unwrap_or(db_version);
     let upgrade_functions: &[&dyn UpgradeIndex] = &[
         &V1_12_To_V1_12_3 {},
@@ -36,6 +42,10 @@ pub fn upgrade(
         &V1_13_0_To_V1_13_1 {},
         &V1_13_1_To_Latest_V1_13 {},
         &Latest_V1_13_To_Latest_V1_14 {},
+        &Latest_V1_14_To_Latest_V1_15 {},
+        // This is the last upgrade function, it will be called when the index is up to date.
+        // any other upgrade function should be added before this one.
+        &ToCurrentNoOp {},
     ];
 
     let start = match from {
@@ -43,8 +53,9 @@ pub fn upgrade(
         (1, 12, 3..) => 1,
         (1, 13, 0) => 2,
         (1, 13, _) => 4,
+        (1, 14, _) => 5,
         // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
-        (1, 14, _) => 4,
+        (1, 15, _) => 6,
         (major, minor, patch) => {
             return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
         }
@@ -56,6 +67,9 @@ pub fn upgrade(
     let mut current_version = from;
     let mut regenerate_stats = false;
     for (i, upgrade) in upgrade_path.iter().enumerate() {
+        if (must_stop_processing)() {
+            return Err(crate::Error::InternalError(InternalError::AbortedIndexation));
+        }
         let target = upgrade.target_version();
         progress.update_progress(VariableNameStep::<UpgradeVersion>::new(
             format!(
@@ -77,3 +91,22 @@ pub fn upgrade(
 
     Ok(regenerate_stats)
 }
+
+#[allow(non_camel_case_types)]
+struct ToCurrentNoOp {}
+
+impl UpgradeIndex for ToCurrentNoOp {
+    fn upgrade(
+        &self,
+        _wtxn: &mut RwTxn,
+        _index: &Index,
+        _original: (u32, u32, u32),
+        _progress: Progress,
+    ) -> Result<bool> {
+        Ok(false)
+    }
+
+    fn target_version(&self) -> (u32, u32, u32) {
+        (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
+    }
+}
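To see why the match arms change, it helps to line the indices up against the new upgrade_functions slice. A self-contained sketch of the dispatch (the (1, 12, 0..=2) arm is an assumption from context, since the hunk starts below it):

    fn start_index(from: (u32, u32, u32)) -> Option<usize> {
        match from {
            (1, 12, 0..=2) => Some(0), // assumed arm, elided by the hunk above
            (1, 12, 3..) => Some(1),
            (1, 13, 0) => Some(2),
            (1, 13, _) => Some(4),
            (1, 14, _) => Some(5), // new: run Latest_V1_14_To_Latest_V1_15, then the no-op
            (1, 15, _) => Some(6), // new: only the trailing ToCurrentNoOp
            _ => None,             // CannotUpgradeToVersion in the real code
        }
    }

    fn main() {
        assert_eq!(start_index((1, 14, 2)), Some(5));
        assert_eq!(start_index((1, 15, 0)), Some(6));
    }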
@@ -1,7 +1,6 @@
 use heed::RwTxn;
 
 use super::UpgradeIndex;
-use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
 use crate::database_stats::DatabaseStats;
 use crate::progress::Progress;
 use crate::{make_enum_progress, Index, Result};
@@ -51,10 +50,6 @@ impl UpgradeIndex for V1_13_1_To_Latest_V1_13 {
     }
 
     fn target_version(&self) -> (u32, u32, u32) {
-        (
-            VERSION_MAJOR.parse().unwrap(),
-            VERSION_MINOR.parse().unwrap(),
-            VERSION_PATCH.parse().unwrap(),
-        )
+        (1, 13, 3)
     }
 }
crates/milli/src/update/upgrade/v1_15.rs (new file, +35)
@@ -0,0 +1,35 @@
+use heed::RwTxn;
+
+use super::UpgradeIndex;
+use crate::progress::Progress;
+use crate::update::new::indexer::recompute_word_fst_from_word_docids_database;
+use crate::{make_enum_progress, Index, Result};
+
+#[allow(non_camel_case_types)]
+pub(super) struct Latest_V1_14_To_Latest_V1_15();
+
+impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
+    fn upgrade(
+        &self,
+        wtxn: &mut RwTxn,
+        index: &Index,
+        _original: (u32, u32, u32),
+        progress: Progress,
+    ) -> Result<bool> {
+        // Recompute the word FST from the word docids database.
+        make_enum_progress! {
+            enum TypoTolerance {
+                RecomputeWordFst,
+            }
+        };
+
+        progress.update_progress(TypoTolerance::RecomputeWordFst);
+        recompute_word_fst_from_word_docids_database(index, wtxn)?;
+
+        Ok(false)
+    }
+
+    fn target_version(&self) -> (u32, u32, u32) {
+        (1, 15, 0)
+    }
+}
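This migration is the consumer of the recompute_word_fst_from_word_docids_database helper exported earlier in the diff: it rebuilds the words FST once at upgrade time, presumably so that pre-v1.15 indexes end up consistent with the new FST construction shown in the typed-chunk hunk, where exact word docids are no longer merged into the FST separately.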