Fix facet normalization

This commit is contained in:
ManyTheFish 2023-03-29 10:57:02 +02:00
parent b744f33530
commit efea1e5837
3 changed files with 7 additions and 5 deletions

View File

@ -22,6 +22,7 @@ use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};
use std::hash::BuildHasherDefault;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
pub use filter_parser::{Condition, FilterCondition, Span, Token};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
@ -252,6 +253,10 @@ pub fn is_faceted_by(field: &str, facet: &str) -> bool {
&& field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
}
/// Produces the normalized form of a facet string value.
///
/// The input is stripped of leading and trailing whitespace, passed through
/// charabia's `CompatibilityDecompositionNormalizer`, and finally lowercased.
/// Having a single helper lets every caller that needs a normalized facet
/// string apply the exact same steps in the exact same order.
pub fn normalize_facet(original: &str) -> String {
    // Surrounding whitespace is dropped before any Unicode processing.
    let trimmed = original.trim();
    // Run the compatibility decomposition normalizer over the trimmed input.
    let decomposed = CompatibilityDecompositionNormalizer.normalize_str(trimmed);
    // Lowercasing happens last, on the already-decomposed text.
    decomposed.to_lowercase()
}
#[cfg(test)]
mod tests {
use serde_json::json;

View File

@ -230,7 +230,7 @@ impl<'a> Filter<'a> {
&FacetGroupKey {
field_id,
level: 0,
left_bound: &val.value().to_lowercase(),
left_bound: &crate::normalize_facet(val.value()),
},
)?
.map(|v| v.bitmap)

View File

@ -4,7 +4,6 @@ use std::fs::File;
use std::io;
use std::mem::size_of;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use roaring::RoaringBitmap;
@ -136,9 +135,7 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
}
}
Value::String(original) => {
let normalized = CompatibilityDecompositionNormalizer
.normalize_str(original.trim())
.to_lowercase();
let normalized = crate::normalize_facet(original);
output_strings.push((normalized, original.clone()));
}
Value::Array(values) => {