3372: Enhance facet string normalization r=ManyTheFish a=ManyTheFish

# Pull Request

Use compatibility decomposition normalizer in facet string extraction in order to have a more human friendly sort order.

Now, [é (U+00E9)](https://www.compart.com/fr/unicode/U+00E9) is converted to [e (U+0065)](https://www.compart.com/fr/unicode/U+0065) + [◌́ (U+0301)](https://www.compart.com/fr/unicode/U+0301). This way any word starting with an accented/diacritized version of a character is put just after the words starting with the unaccented version of the character.

## Related issue

Fixes #3260 


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
bors[bot] 2023-01-18 17:17:53 +00:00 committed by GitHub
commit b3166df7ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 18 deletions

View File

@ -4,6 +4,7 @@ use std::fs::File;
use std::io;
use std::mem::size_of;
use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use roaring::RoaringBitmap;
@ -135,7 +136,9 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
}
}
Value::String(original) => {
let normalized = original.trim().to_lowercase();
let normalized = CompatibilityDecompositionNormalizer
.normalize_str(original.trim())
.to_lowercase();
output_strings.push((normalized, original.clone()));
}
Value::Array(values) => {

View File

@ -1,17 +1,17 @@
{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1}
{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2}
{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3}
{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"}
{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"}
{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null}
{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]}
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8}
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"}
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"}
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12}
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]}
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}}
{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]}
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16}
{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""}
{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"etiopia","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1}
{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"fehérorosz","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2}
{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"etiopia","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3}
{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"fehérorosz","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"}
{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"észak-korea","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"}
{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"etiopia","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null}
{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"fehérorosz","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]}
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"észak-korea","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8}
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"etiopia","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"}
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"észak-korea","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"fehérorosz","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"}
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"etiopia","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12}
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"fehérorosz","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]}
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"észak-korea","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}}
{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"etiopia","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]}
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"fehérorosz","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16}
{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"észak-korea","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""}