Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-07-04 04:17:10 +02:00

Merge remote-tracking branch 'origin/main' into search-refactor

This commit is contained in commit a81165f0d8.
282 changed files with 4457 additions and 587 deletions.

@ -81,6 +81,8 @@ impl FromStr for Member {
         if is_reserved_keyword(text)
             || text.starts_with("_geoRadius(")
+            || text.starts_with("_geoBoundingBox(")
             || text.starts_with("_geo(")
+            || text.starts_with("_geoDistance(")
         {
             return Err(AscDescError::ReservedKeyword { name: text.to_string() })?;
         }

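With these two additional prefixes, any sort expression that textually starts with a reserved geo rule is rejected up front with `ReservedKeyword`. A minimal standalone sketch of the same prefix test (the helper name and the simplification are ours, not milli's exact code):

    // Sketch: reject reserved geo keywords in sort expressions.
    fn is_reserved_sort_expression(text: &str) -> bool {
        text.starts_with("_geoRadius(")
            || text.starts_with("_geoBoundingBox(")
            || text.starts_with("_geo(")
            || text.starts_with("_geoDistance(")
    }
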
@ -265,6 +267,13 @@ mod tests {
             ("_geoPoint(0, -180.000001):desc", GeoError(BadGeoError::Lng(-180.000001))),
             ("_geoPoint(159.256, 130):asc", GeoError(BadGeoError::Lat(159.256))),
             ("_geoPoint(12, -2021):desc", GeoError(BadGeoError::Lng(-2021.))),
+            ("_geo(12, -2021):asc", ReservedKeyword { name: S("_geo(12, -2021)") }),
+            ("_geo(12, -2021):desc", ReservedKeyword { name: S("_geo(12, -2021)") }),
+            ("_geoDistance(12, -2021):asc", ReservedKeyword { name: S("_geoDistance(12, -2021)") }),
+            (
+                "_geoDistance(12, -2021):desc",
+                ReservedKeyword { name: S("_geoDistance(12, -2021)") },
+            ),
         ];

         for (req, expected_error) in invalid_req {

@ -1222,11 +1222,22 @@ impl Index {
         let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;

         let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
+        let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new();
+        let mut total = 0;
         for sl in self.script_language_docids.iter(rtxn)? {
             let ((script, language), docids) = sl?;

             // keep only Languages that contain at least 1 document.
             if !soft_deleted_documents.is_superset(&docids) {
+                let remaining_documents_count = (docids - &soft_deleted_documents).len();
+                total += remaining_documents_count;
+                if remaining_documents_count > 0 {
+                    script_language_doc_count.push((script, language, remaining_documents_count));
+                }
             }
+        }
+
+        let threshold = total / 20; // 5% (arbitrary)
+        for (script, language, count) in script_language_doc_count {
+            if count > threshold {
                 if let Some(languages) = script_language.get_mut(&script) {
                     (*languages).push(language);
                 } else {

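In other words: the method first counts, per (script, language) pair, the documents that are not soft-deleted, then keeps only the languages whose count exceeds 5% of the grand total. A standalone sketch of that filtering step (simplified to string keys, not milli's types):

    use std::collections::HashMap;

    // Sketch: keep only languages backing more than 5% of the counted documents.
    fn frequent_languages(counts: Vec<(String, String, u64)>) -> HashMap<String, Vec<String>> {
        let total: u64 = counts.iter().map(|(_, _, c)| c).sum();
        let threshold = total / 20; // 5%, the same arbitrary cut-off as above
        let mut out: HashMap<String, Vec<String>> = HashMap::new();
        for (script, language, count) in counts {
            if count > threshold {
                out.entry(script).or_default().push(language);
            }
        }
        out
    }
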
@ -1586,35 +1597,35 @@ pub(crate) mod tests {

         // match a document in the middle of the rectangle
         let search_result = search
-            .filter(Filter::from_str("_geoBoundingBox([10, -10], [-10, 10])").unwrap().unwrap())
+            .filter(Filter::from_str("_geoBoundingBox([10, 10], [-10, -10])").unwrap().unwrap())
             .execute()
             .unwrap();
         insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0]>");

         // select everything
         let search_result = search
-            .filter(Filter::from_str("_geoBoundingBox([90, -180], [-90, 180])").unwrap().unwrap())
+            .filter(Filter::from_str("_geoBoundingBox([90, 180], [-90, -180])").unwrap().unwrap())
             .execute()
             .unwrap();
         insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0, 1, 2, 3, 4]>");

         // go on the edge of the longitude
         let search_result = search
-            .filter(Filter::from_str("_geoBoundingBox([0, 180], [0, -170])").unwrap().unwrap())
+            .filter(Filter::from_str("_geoBoundingBox([0, -170], [0, 180])").unwrap().unwrap())
             .execute()
             .unwrap();
         insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1]>");

         // go on the other edge of the longitude
         let search_result = search
-            .filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -180])").unwrap().unwrap())
+            .filter(Filter::from_str("_geoBoundingBox([0, -180], [0, 170])").unwrap().unwrap())
             .execute()
             .unwrap();
         insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[2]>");

         // wrap around the longitude
         let search_result = search
-            .filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -170])").unwrap().unwrap())
+            .filter(Filter::from_str("_geoBoundingBox([0, -170], [0, 170])").unwrap().unwrap())
             .execute()
             .unwrap();
         insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1, 2]>");

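The last three cases exercise boxes that touch or cross the 180th meridian: when the box's east longitude is smaller than its west one, it wraps around the earth. A sketch of the membership rule those snapshots verify (plain floats, not milli's filter machinery):

    // Sketch: is `lng` inside [west, east], where the box may wrap at 180/-180?
    fn lng_in_box(lng: f64, west: f64, east: f64) -> bool {
        if west <= east {
            (west..=east).contains(&lng)
        } else {
            // wrapping box: [west, 180] U [-180, east]
            lng >= west || lng <= east
        }
    }
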
@ -1640,20 +1651,26 @@ pub(crate) mod tests {
             .filter(Filter::from_str("_geoBoundingBox([-80, 0], [80, 0])").unwrap().unwrap())
             .execute()
             .unwrap_err();
-        insta::assert_display_snapshot!(error, @r###"
+        insta::assert_display_snapshot!(
+            error,
+            @r###"
         The top latitude `-80` is below the bottom latitude `80`.
         32:33 _geoBoundingBox([-80, 0], [80, 0])
-        "###);
+        "###
+        );

         // send a top latitude lower than the bottom latitude
         let error = search
             .filter(Filter::from_str("_geoBoundingBox([-10, 0], [10, 0])").unwrap().unwrap())
             .execute()
             .unwrap_err();
-        insta::assert_display_snapshot!(error, @r###"
+        insta::assert_display_snapshot!(
+            error,
+            @r###"
         The top latitude `-10` is below the bottom latitude `10`.
         32:33 _geoBoundingBox([-10, 0], [10, 0])
-        "###);
+        "###
+        );
     }

     #[test]

@ -74,6 +74,7 @@ use std::collections::{BTreeMap, HashMap};
 use std::convert::{TryFrom, TryInto};
 use std::hash::BuildHasherDefault;

+use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
 pub use filter_parser::{Condition, FilterCondition, Span, Token};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;

@ -323,6 +324,10 @@ pub fn is_faceted_by(field: &str, facet: &str) -> bool {
         && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
 }

+pub fn normalize_facet(original: &str) -> String {
+    CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
+}
+
 #[cfg(test)]
 mod tests {
     use serde_json::json;

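`normalize_facet` now gives both the indexing side and the filtering side one shared definition of facet-string normalization (trim, compatibility decomposition, lowercase). A usage sketch of the function added above:

    // Sketch: two spellings of the same facet value normalize identically,
    // so they hit the same `left_bound` key in the facet database.
    assert_eq!(normalize_facet("Café "), normalize_facet("café"));
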
@ -54,8 +54,6 @@ impl Display for BadGeoError {
 enum FilterError<'a> {
     AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet<String> },
     ParseGeoError(BadGeoError),
-    ReservedGeo(&'a str),
-    Reserved(&'a str),
     TooDeep,
 }
 impl<'a> std::error::Error for FilterError<'a> {}

@ -96,12 +94,6 @@ impl<'a> Display for FilterError<'a> {
                 "Too many filter conditions, can't process more than {} filters.",
                 MAX_FILTER_DEPTH
             ),
-            Self::ReservedGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` field coordinates.", keyword),
-            Self::Reserved(keyword) => write!(
-                f,
-                "`{}` is a reserved keyword and thus can't be used as a filter expression.",
-                keyword
-            ),
             Self::ParseGeoError(error) => write!(f, "{}", error),
         }
     }

@ -230,7 +222,7 @@ impl<'a> Filter<'a> {
                 &FacetGroupKey {
                     field_id,
                     level: 0,
-                    left_bound: &val.value().to_lowercase(),
+                    left_bound: &crate::normalize_facet(val.value()),
                 },
             )?
             .map(|v| v.bitmap)

@ -332,23 +324,10 @@ impl<'a> Filter<'a> {
                     Ok(RoaringBitmap::new())
                 }
             } else {
-                match fid.value() {
-                    attribute @ "_geo" => {
-                        Err(fid.as_external_error(FilterError::ReservedGeo(attribute)))?
-                    }
-                    attribute if attribute.starts_with("_geoPoint(") => {
-                        Err(fid.as_external_error(FilterError::ReservedGeo("_geoPoint")))?
-                    }
-                    attribute @ "_geoDistance" => {
-                        Err(fid.as_external_error(FilterError::Reserved(attribute)))?
-                    }
-                    attribute => {
-                        Err(fid.as_external_error(FilterError::AttributeNotFilterable {
-                            attribute,
-                            filterable_fields: filterable_fields.clone(),
-                        }))?
-                    }
-                }
+                Err(fid.as_external_error(FilterError::AttributeNotFilterable {
+                    attribute: fid.value(),
+                    filterable_fields: filterable_fields.clone(),
+                }))?
             }
         }
         FilterCondition::Or(subfilters) => {

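After this simplification every non-filterable field, the reserved `_geo` family included, reaches the user through the single `AttributeNotFilterable` error instead of four bespoke match arms. A sketch of what that unified path amounts to (the message text here is illustrative, not the exact milli wording):

    use std::collections::HashSet;

    // Sketch: one error constructor instead of a match on the attribute name.
    fn not_filterable_message(attribute: &str, filterable: &HashSet<String>) -> String {
        let mut fields: Vec<_> = filterable.iter().cloned().collect();
        fields.sort();
        format!(
            "Attribute `{}` is not filterable. Available filterable attributes are: {}.",
            attribute,
            fields.join(", ")
        )
    }
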
@ -419,54 +398,56 @@ impl<'a> Filter<'a> {
                     }))?
                 }
             }
-            FilterCondition::GeoBoundingBox { top_left_point, bottom_right_point } => {
+            FilterCondition::GeoBoundingBox { top_right_point, bottom_left_point } => {
                 if filterable_fields.contains("_geo") {
-                    let top_left: [f64; 2] = [
-                        top_left_point[0].parse_finite_float()?,
-                        top_left_point[1].parse_finite_float()?,
+                    let top_right: [f64; 2] = [
+                        top_right_point[0].parse_finite_float()?,
+                        top_right_point[1].parse_finite_float()?,
                     ];
-                    let bottom_right: [f64; 2] = [
-                        bottom_right_point[0].parse_finite_float()?,
-                        bottom_right_point[1].parse_finite_float()?,
+                    let bottom_left: [f64; 2] = [
+                        bottom_left_point[0].parse_finite_float()?,
+                        bottom_left_point[1].parse_finite_float()?,
                     ];
-                    if !(-90.0..=90.0).contains(&top_left[0]) {
+                    if !(-90.0..=90.0).contains(&top_right[0]) {
                         return Err(
-                            top_left_point[0].as_external_error(BadGeoError::Lat(top_left[0]))
+                            top_right_point[0].as_external_error(BadGeoError::Lat(top_right[0]))
                         )?;
                     }
-                    if !(-180.0..=180.0).contains(&top_left[1]) {
+                    if !(-180.0..=180.0).contains(&top_right[1]) {
                         return Err(
-                            top_left_point[1].as_external_error(BadGeoError::Lng(top_left[1]))
+                            top_right_point[1].as_external_error(BadGeoError::Lng(top_right[1]))
                         )?;
                     }
-                    if !(-90.0..=90.0).contains(&bottom_right[0]) {
-                        return Err(bottom_right_point[0]
-                            .as_external_error(BadGeoError::Lat(bottom_right[0])))?;
+                    if !(-90.0..=90.0).contains(&bottom_left[0]) {
+                        return Err(bottom_left_point[0]
+                            .as_external_error(BadGeoError::Lat(bottom_left[0])))?;
                     }
-                    if !(-180.0..=180.0).contains(&bottom_right[1]) {
-                        return Err(bottom_right_point[1]
-                            .as_external_error(BadGeoError::Lng(bottom_right[1])))?;
+                    if !(-180.0..=180.0).contains(&bottom_left[1]) {
+                        return Err(bottom_left_point[1]
+                            .as_external_error(BadGeoError::Lng(bottom_left[1])))?;
                     }
-                    if top_left[0] < bottom_right[0] {
-                        return Err(bottom_right_point[1].as_external_error(
-                            BadGeoError::BoundingBoxTopIsBelowBottom(top_left[0], bottom_right[0]),
+                    if top_right[0] < bottom_left[0] {
+                        return Err(bottom_left_point[1].as_external_error(
+                            BadGeoError::BoundingBoxTopIsBelowBottom(top_right[0], bottom_left[0]),
                         ))?;
                     }

                     // Instead of writing a custom `GeoBoundingBox` filter we're simply going to re-use the range
                     // filter to create the following filter;
-                    // `_geo.lat {top_left[0]} TO {bottom_right[0]} AND _geo.lng {top_left[1]} TO {bottom_right[1]}`
+                    // `_geo.lat {top_right[0]} TO {bottom_left[0]} AND _geo.lng {top_right[1]} TO {bottom_left[1]}`
                     // As we can see, we need to use a bunch of tokens that don't exist in the original filter,
                     // thus we're going to create tokens that point to a random span but contain our text.

-                    let geo_lat_token =
-                        Token::new(top_left_point[0].original_span(), Some("_geo.lat".to_string()));
+                    let geo_lat_token = Token::new(
+                        top_right_point[0].original_span(),
+                        Some("_geo.lat".to_string()),
+                    );

                     let condition_lat = FilterCondition::Condition {
                         fid: geo_lat_token,
                         op: Condition::Between {
-                            from: bottom_right_point[0].clone(),
-                            to: top_left_point[0].clone(),
+                            from: bottom_left_point[0].clone(),
+                            to: top_right_point[0].clone(),
                         },
                     };

@ -476,27 +457,29 @@ impl<'a> Filter<'a> {
                         filterable_fields,
                     )?;

-                    let geo_lng_token =
-                        Token::new(top_left_point[1].original_span(), Some("_geo.lng".to_string()));
-                    let selected_lng = if top_left[1] > bottom_right[1] {
+                    let geo_lng_token = Token::new(
+                        top_right_point[1].original_span(),
+                        Some("_geo.lng".to_string()),
+                    );
+                    let selected_lng = if top_right[1] < bottom_left[1] {
                         // In this case the bounding box is wrapping around the earth (going from 180 to -180).
                         // We need to update the lng part of the filter from;
-                        // `_geo.lng {top_left[1]} TO {bottom_right[1]}` to
-                        // `_geo.lng {top_left[1]} TO 180 AND _geo.lng -180 TO {bottom_right[1]}`
+                        // `_geo.lng {top_right[1]} TO {bottom_left[1]}` to
+                        // `_geo.lng {bottom_left[1]} TO 180 AND _geo.lng -180 TO {top_right[1]}`

                         let min_lng_token = Token::new(
-                            top_left_point[1].original_span(),
+                            top_right_point[1].original_span(),
                             Some("-180.0".to_string()),
                         );
                         let max_lng_token = Token::new(
-                            top_left_point[1].original_span(),
+                            top_right_point[1].original_span(),
                             Some("180.0".to_string()),
                         );

                         let condition_left = FilterCondition::Condition {
                             fid: geo_lng_token.clone(),
                             op: Condition::Between {
-                                from: top_left_point[1].clone(),
+                                from: bottom_left_point[1].clone(),
                                 to: max_lng_token,
                             },
                         };

@ -510,7 +493,7 @@ impl<'a> Filter<'a> {
                             fid: geo_lng_token,
                             op: Condition::Between {
                                 from: min_lng_token,
-                                to: bottom_right_point[1].clone(),
+                                to: top_right_point[1].clone(),
                             },
                         };
                         let right = Filter { condition: condition_right }.inner_evaluate(

@ -524,8 +507,8 @@ impl<'a> Filter<'a> {
                         let condition_lng = FilterCondition::Condition {
                             fid: geo_lng_token,
                             op: Condition::Between {
-                                from: top_left_point[1].clone(),
-                                to: bottom_right_point[1].clone(),
+                                from: bottom_left_point[1].clone(),
+                                to: top_right_point[1].clone(),
                             },
                         };
                         Filter { condition: condition_lng }.inner_evaluate(

@ -537,10 +520,12 @@ impl<'a> Filter<'a> {

                     Ok(selected_lat & selected_lng)
                 } else {
-                    Err(top_left_point[0].as_external_error(FilterError::AttributeNotFilterable {
-                        attribute: "_geo",
-                        filterable_fields: filterable_fields.clone(),
-                    }))?
+                    Err(top_right_point[0].as_external_error(
+                        FilterError::AttributeNotFilterable {
+                            attribute: "_geo",
+                            filterable_fields: filterable_fields.clone(),
+                        },
+                    ))?
                 }
             }
         }

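Taken together, the block above rewrites `_geoBoundingBox` as plain range filters: latitude is always a single `bottom TO top` range, and longitude is either one range or, when the box wraps the 180th meridian, the union of two. A condensed sketch of that decomposition (plain tuples instead of milli's `Token`/`FilterCondition` types):

    // Sketch: a bounding box becomes one lat range and one or two lng ranges.
    fn bbox_ranges(top_right: [f64; 2], bottom_left: [f64; 2]) -> ((f64, f64), Vec<(f64, f64)>) {
        let lat = (bottom_left[0], top_right[0]); // bottom TO top
        let lng = if top_right[1] < bottom_left[1] {
            // the box wraps around the earth: west TO 180, plus -180 TO east
            vec![(bottom_left[1], 180.0), (-180.0, top_right[1])]
        } else {
            vec![(bottom_left[1], top_right[1])]
        };
        (lat, lng)
    }
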
@ -24,6 +24,9 @@ pub fn located_query_terms_from_string(

     let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
     while let Some(token) = peekable.next() {
+        if token.lemma().is_empty() {
+            continue;
+        }
         // early return if word limit is exceeded
         if located_terms.len() >= parts_limit {
             return Ok(located_terms);

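Two guards operate in this loop: tokens whose lemma normalizes to the empty string are skipped, and extraction stops as soon as `parts_limit` words have been collected. The same pattern, as a standalone sketch:

    // Sketch: collect at most `parts_limit` words, skipping empty lemmas.
    fn first_words<'a>(lemmas: impl Iterator<Item = &'a str>, parts_limit: usize) -> Vec<&'a str> {
        let mut words = Vec::new();
        for lemma in lemmas {
            if lemma.is_empty() {
                continue; // nothing left after normalization
            }
            if words.len() >= parts_limit {
                break; // early return once the word limit is reached
            }
            words.push(lemma);
        }
        words
    }
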
@ -3,12 +3,14 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};

-use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use obkv::KvReader;
 use roaring::RoaringBitmap;
 use serde_json::Value;

 use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
+use crate::update::index_documents::MergeFn;
 use crate::{
     absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
 };

@ -33,7 +35,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let max_memory = indexer.max_memory_by_thread();

     let mut documents_ids = RoaringBitmap::new();
-    let mut script_language_pair = HashMap::new();
+    let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         concat_u32s_array,

@ -43,13 +45,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         max_memory,
     );

-    let mut key_buffer = Vec::new();
-    let mut field_buffer = String::new();
-    let mut builder = TokenizerBuilder::new();
+    let mut buffers = Buffers::default();
+    let mut tokenizer_builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
-        builder.stop_words(stop_words);
+        tokenizer_builder.stop_words(stop_words);
     }
-    let tokenizer = builder.build();
+    let tokenizer = tokenizer_builder.build();

     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {

@ -57,49 +58,122 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
             .try_into()
             .map(u32::from_be_bytes)
             .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-        let obkv = obkv::KvReader::<FieldId>::new(value);
+        let obkv = KvReader::<FieldId>::new(value);

         documents_ids.push(document_id);
-        key_buffer.clear();
-        key_buffer.extend_from_slice(&document_id.to_be_bytes());
+        buffers.key_buffer.clear();
+        buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());

-        for (field_id, field_bytes) in obkv.iter() {
-            if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
-                let value =
-                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
-                field_buffer.clear();
-                if let Some(field) = json_to_string(&value, &mut field_buffer) {
-                    let tokens = process_tokens(tokenizer.tokenize(field))
-                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
+        let mut script_language_word_count = HashMap::new();

-                    for (index, token) in tokens {
-                        if let Some(language) = token.language {
-                            let script = token.script;
-                            let entry = script_language_pair
-                                .entry((script, language))
-                                .or_insert_with(RoaringBitmap::new);
-                            entry.push(document_id);
+        extract_tokens_from_document(
+            &obkv,
+            searchable_fields,
+            &tokenizer,
+            max_positions_per_attributes,
+            &mut buffers,
+            &mut script_language_word_count,
+            &mut docid_word_positions_sorter,
+        )?;

+        // if we detect a potential mistake in the language detection,
+        // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
+        // context: https://github.com/meilisearch/meilisearch/issues/3565
+        if script_language_word_count
+            .values()
+            .map(Vec::as_slice)
+            .any(potential_language_detection_error)
+        {
+            // build an allow list with the most frequent detected languages in the document.
+            let script_language: HashMap<_, _> =
+                script_language_word_count.iter().filter_map(most_frequent_languages).collect();

+            // if the allow list is empty, meaning that no Language is considered frequent,
+            // then we don't rerun the extraction.
+            if !script_language.is_empty() {
+                // build a new temporary tokenizer including the allow list.
+                let mut tokenizer_builder = TokenizerBuilder::new();
+                if let Some(stop_words) = stop_words {
+                    tokenizer_builder.stop_words(stop_words);
+                }
+                tokenizer_builder.allow_list(&script_language);
+                let tokenizer = tokenizer_builder.build();

+                script_language_word_count.clear();

+                // rerun the extraction.
+                extract_tokens_from_document(
+                    &obkv,
+                    searchable_fields,
+                    &tokenizer,
+                    max_positions_per_attributes,
+                    &mut buffers,
+                    &mut script_language_word_count,
+                    &mut docid_word_positions_sorter,
+                )?;
+            }
+        }

+        for (script, languages_frequency) in script_language_word_count {
+            for (language, _) in languages_frequency {
+                let entry = script_language_docids
+                    .entry((script, language))
+                    .or_insert_with(RoaringBitmap::new);
+                entry.push(document_id);
+            }
+        }
     }

+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_docids))
+}

+fn extract_tokens_from_document<T: AsRef<[u8]>>(
+    obkv: &KvReader<FieldId>,
+    searchable_fields: &Option<HashSet<FieldId>>,
+    tokenizer: &Tokenizer<T>,
+    max_positions_per_attributes: u32,
+    buffers: &mut Buffers,
+    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
+    docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    for (field_id, field_bytes) in obkv.iter() {
+        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+            let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+            buffers.field_buffer.clear();
+            if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
+                let tokens = process_tokens(tokenizer.tokenize(field))
+                    .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

+                for (index, token) in tokens {
+                    // if a language has been detected for the token, we update the counter.
+                    if let Some(language) = token.language {
+                        let script = token.script;
+                        let entry =
+                            script_language_word_count.entry(script).or_insert_with(Vec::new);
+                        match entry.iter_mut().find(|(l, _)| *l == language) {
+                            Some((_, n)) => *n += 1,
+                            None => entry.push((language, 1)),
+                        }
+                    }
-                        let token = token.lemma().trim();
-                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                            key_buffer.truncate(mem::size_of::<u32>());
-                            key_buffer.extend_from_slice(token.as_bytes());
+                    let token = token.lemma().trim();
+                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                        buffers.key_buffer.truncate(mem::size_of::<u32>());
+                        buffers.key_buffer.extend_from_slice(token.as_bytes());

-                            let position: u16 = index
-                                .try_into()
-                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                            let position = absolute_from_relative_position(field_id, position);
-                            docid_word_positions_sorter
-                                .insert(&key_buffer, position.to_ne_bytes())?;
-                        }
+                        let position: u16 = index
+                            .try_into()
+                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+                        let position = absolute_from_relative_position(field_id, position);
+                        docid_word_positions_sorter
+                            .insert(&buffers.key_buffer, position.to_ne_bytes())?;
+                    }
+                }
+            }
+        }
+    }

-    sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (documents_ids, reader, script_language_pair))
+    Ok(())
 }

 /// Transform a JSON value into a string that can be indexed.

@ -183,3 +257,46 @@ fn process_tokens<'a>(
         })
         .filter(|(_, t)| t.is_word())
 }
+
+fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
+    if languages_frequency.len() > 1 {
+        let threshold = compute_language_frequency_threshold(languages_frequency);
+        languages_frequency.iter().any(|(_, c)| *c <= threshold)
+    } else {
+        false
+    }
+}
+
+fn most_frequent_languages(
+    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
+) -> Option<(Script, Vec<Language>)> {
+    if languages_frequency.len() > 1 {
+        let threshold = compute_language_frequency_threshold(languages_frequency);
+
+        let languages: Vec<_> =
+            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();
+
+        if languages.is_empty() {
+            None
+        } else {
+            Some((*script, languages))
+        }
+    } else {
+        None
+    }
+}
+
+fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
+    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
+    total / 10 // 10% is a completely arbitrary value.
+}
+
+#[derive(Default)]
+struct Buffers {
+    // the key buffer is the concatenation of the internal document id with the field id.
+    // The buffer has to be completely cleared between documents,
+    // and the field id part must be cleared between each field.
+    key_buffer: Vec<u8>,
+    // the field buffer used for each field's deserialization; it must be cleared between each field.
+    field_buffer: String,
+}

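A worked example of the 10% rule these helpers implement: for frequencies [(Eng, 18), (Fra, 2)] the total is 20, so the threshold is 2; Fra's count is <= 2, which flags a potential detection error, and `most_frequent_languages` keeps only Eng. As a test-style sketch (assuming charabia's `Language::Eng`/`Language::Fra` variants):

    #[test]
    fn language_threshold_example() {
        let freq = vec![(Language::Eng, 18), (Language::Fra, 2)];
        assert_eq!(compute_language_frequency_threshold(&freq), 2); // 20 / 10
        assert!(potential_language_detection_error(&freq)); // Fra: 2 <= 2
    }
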
@ -4,7 +4,6 @@ use std::fs::File;
 use std::io;
 use std::mem::size_of;

-use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
 use heed::zerocopy::AsBytes;
 use heed::BytesEncode;
 use roaring::RoaringBitmap;

@ -136,9 +135,7 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
             }
         }
         Value::String(original) => {
-            let normalized = CompatibilityDecompositionNormalizer
-                .normalize_str(original.trim())
-                .to_lowercase();
+            let normalized = crate::normalize_facet(original);
             output_strings.push((normalized, original.clone()));
         }
         Value::Array(values) => {

@ -565,8 +565,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 self.index.put_primary_key(self.wtxn, primary_key)?;
                 Ok(())
             } else {
-                let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
-                Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
+                let curr_primary_key = self.index.primary_key(self.wtxn)?.unwrap().to_string();
+                if primary_key == &curr_primary_key {
+                    Ok(())
+                } else {
+                    Err(UserError::PrimaryKeyCannotBeChanged(curr_primary_key).into())
+                }
             }
         }
         Setting::Reset => {

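This makes primary-key updates idempotent: re-submitting the current key succeeds, while actually changing it still errors. The decision, as a minimal standalone sketch (hypothetical function, illustrative error text):

    // Sketch: idempotent primary-key update check.
    fn check_primary_key_update(current: &str, requested: &str) -> Result<(), String> {
        if requested == current {
            Ok(()) // same key: nothing to do
        } else {
            Err(format!("Index already has a primary key: `{current}`."))
        }
    }
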
@ -1332,6 +1336,17 @@ mod tests {
         .unwrap();
     wtxn.commit().unwrap();

+    // Updating settings with the same primary key should do nothing
+    let mut wtxn = index.write_txn().unwrap();
+    index
+        .update_settings_using_wtxn(&mut wtxn, |settings| {
+            settings.set_primary_key(S("mykey"));
+        })
+        .unwrap();
+    assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
+    wtxn.commit().unwrap();
+
+    // Updating the settings with a different (or no) primary key causes an error
     let mut wtxn = index.write_txn().unwrap();
     let error = index
         .update_settings_using_wtxn(&mut wtxn, |settings| {