mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-26 05:07:28 +01:00
implement crop around
This commit is contained in:
parent
56c9633c53
commit
7473cc6e27
@ -1,6 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::collections::{BTreeMap, HashSet, VecDeque};
|
||||||
use std::collections::{BTreeMap, HashSet};
|
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
use std::{borrow::Cow, collections::HashMap};
|
||||||
|
|
||||||
use anyhow::bail;
|
use anyhow::bail;
|
||||||
use either::Either;
|
use either::Either;
|
||||||
@ -157,7 +157,12 @@ impl Index {
|
|||||||
|
|
||||||
let stop_words = fst::Set::default();
|
let stop_words = fst::Set::default();
|
||||||
let highlighter =
|
let highlighter =
|
||||||
Highlighter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
Formatter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
||||||
|
|
||||||
|
let to_crop = to_crop_ids
|
||||||
|
.into_iter()
|
||||||
|
.map(|id| (id, query.crop_length))
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
|
||||||
for (_id, obkv) in self.documents(&rtxn, documents_ids)? {
|
for (_id, obkv) in self.documents(&rtxn, documents_ids)? {
|
||||||
let document = make_document(&all_attributes, &fields_ids_map, obkv)?;
|
let document = make_document(&all_attributes, &fields_ids_map, obkv)?;
|
||||||
@ -168,7 +173,7 @@ impl Index {
|
|||||||
&matching_words,
|
&matching_words,
|
||||||
all_formatted.as_ref().as_slice(),
|
all_formatted.as_ref().as_slice(),
|
||||||
&to_highlight_ids,
|
&to_highlight_ids,
|
||||||
&to_crop_ids,
|
&to_crop,
|
||||||
)?;
|
)?;
|
||||||
let hit = SearchHit {
|
let hit = SearchHit {
|
||||||
document,
|
document,
|
||||||
@ -230,11 +235,11 @@ fn make_document(
|
|||||||
fn compute_formatted<A: AsRef<[u8]>>(
|
fn compute_formatted<A: AsRef<[u8]>>(
|
||||||
field_ids_map: &FieldsIdsMap,
|
field_ids_map: &FieldsIdsMap,
|
||||||
obkv: obkv::KvReader,
|
obkv: obkv::KvReader,
|
||||||
highlighter: &Highlighter<A>,
|
highlighter: &Formatter<A>,
|
||||||
matching_words: &impl Matcher,
|
matching_words: &impl Matcher,
|
||||||
all_formatted: &[FieldId],
|
all_formatted: &[FieldId],
|
||||||
to_highlight_fields: &HashSet<FieldId>,
|
to_highlight_fields: &HashSet<FieldId>,
|
||||||
to_crop_fields: &HashSet<FieldId>,
|
to_crop_fields: &HashMap<FieldId, Option<usize>>,
|
||||||
) -> anyhow::Result<Document> {
|
) -> anyhow::Result<Document> {
|
||||||
let mut document = Document::new();
|
let mut document = Document::new();
|
||||||
|
|
||||||
@ -242,15 +247,12 @@ fn compute_formatted<A: AsRef<[u8]>>(
|
|||||||
if let Some(value) = obkv.get(*field) {
|
if let Some(value) = obkv.get(*field) {
|
||||||
let mut value: Value = serde_json::from_slice(value)?;
|
let mut value: Value = serde_json::from_slice(value)?;
|
||||||
|
|
||||||
let need_to_crop = if to_crop_fields.contains(field) {
|
value = highlighter.format_value(
|
||||||
Some(200) // TO CHANGE
|
value,
|
||||||
} else {
|
matching_words,
|
||||||
None
|
to_crop_fields.get(field).copied().flatten(),
|
||||||
};
|
to_highlight_fields.contains(field),
|
||||||
|
);
|
||||||
if to_highlight_fields.contains(field) {
|
|
||||||
value = highlighter.format_value(value, matching_words, need_to_crop, to_highlight_fields.contains(field));
|
|
||||||
}
|
|
||||||
|
|
||||||
// This unwrap must be safe since we got the ids from the fields_ids_map just
|
// This unwrap must be safe since we got the ids from the fields_ids_map just
|
||||||
// before.
|
// before.
|
||||||
@ -284,12 +286,12 @@ impl Matcher for MatchingWords {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Highlighter<'a, A> {
|
struct Formatter<'a, A> {
|
||||||
analyzer: Analyzer<'a, A>,
|
analyzer: Analyzer<'a, A>,
|
||||||
marks: (String, String),
|
marks: (String, String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
|
||||||
pub fn new(stop_words: &'a fst::Set<A>, marks: (String, String)) -> Self {
|
pub fn new(stop_words: &'a fst::Set<A>, marks: (String, String)) -> Self {
|
||||||
let mut config = AnalyzerConfig::default();
|
let mut config = AnalyzerConfig::default();
|
||||||
config.stop_words(stop_words);
|
config.stop_words(stop_words);
|
||||||
@ -305,10 +307,11 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
|||||||
matcher: &impl Matcher,
|
matcher: &impl Matcher,
|
||||||
need_to_crop: Option<usize>,
|
need_to_crop: Option<usize>,
|
||||||
need_to_highlight: bool,
|
need_to_highlight: bool,
|
||||||
) -> Value {
|
) -> Value {
|
||||||
match value {
|
match value {
|
||||||
Value::String(old_string) => {
|
Value::String(old_string) => {
|
||||||
let value = self.format_string(old_string, matcher, need_to_crop, need_to_highlight);
|
let value =
|
||||||
|
self.format_string(old_string, matcher, need_to_crop, need_to_highlight);
|
||||||
Value::String(value)
|
Value::String(value)
|
||||||
}
|
}
|
||||||
Value::Array(values) => Value::Array(
|
Value::Array(values) => Value::Array(
|
||||||
@ -326,41 +329,67 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
|
|||||||
value => value,
|
value => value,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn format_string(&self, s: String, matcher: &impl Matcher, need_to_crop: Option<usize>, need_to_highlight: bool) -> String {
|
fn format_string(
|
||||||
|
&self,
|
||||||
|
s: String,
|
||||||
|
matcher: &impl Matcher,
|
||||||
|
need_to_crop: Option<usize>,
|
||||||
|
need_to_highlight: bool,
|
||||||
|
) -> String {
|
||||||
let analyzed = self.analyzer.analyze(&s);
|
let analyzed = self.analyzer.analyze(&s);
|
||||||
|
|
||||||
let tokens: Box<dyn Iterator<Item=(&str, Token)>> = match need_to_crop {
|
let tokens: Box<dyn Iterator<Item = (&str, Token)>> = match need_to_crop {
|
||||||
Some(crop_len) => {
|
Some(crop_len) => {
|
||||||
let mut taken = 0;
|
let mut buffer = VecDeque::new();
|
||||||
let iter = analyzed
|
let mut tokens = analyzed.reconstruct().peekable();
|
||||||
.reconstruct()
|
let mut taken_before = 0;
|
||||||
.skip_while(|(_, token)| !matcher.matches(token.text()))
|
while let Some((word, token)) = tokens.next_if(|(_, token)| !matcher.matches(token.text())) {
|
||||||
|
buffer.push_back((word, token));
|
||||||
|
taken_before += word.chars().count();
|
||||||
|
while taken_before > crop_len {
|
||||||
|
if let Some((word, _)) = buffer.pop_front() {
|
||||||
|
taken_before -= word.chars().count();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(token) = tokens.next() {
|
||||||
|
buffer.push_back(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut taken_after = 0;
|
||||||
|
|
||||||
|
let after_iter = tokens
|
||||||
.take_while(move |(word, _)| {
|
.take_while(move |(word, _)| {
|
||||||
let take = taken < crop_len;
|
let take = taken_after <= crop_len;
|
||||||
taken += word.chars().count();
|
taken_after += word.chars().count();
|
||||||
take
|
take
|
||||||
});
|
});
|
||||||
|
let iter = buffer
|
||||||
|
.into_iter()
|
||||||
|
.chain(after_iter);
|
||||||
|
|
||||||
Box::new(iter)
|
Box::new(iter)
|
||||||
},
|
}
|
||||||
None => Box::new(analyzed.reconstruct()),
|
None => Box::new(analyzed.reconstruct()),
|
||||||
};
|
};
|
||||||
|
|
||||||
tokens.map(|(word, token)| {
|
tokens
|
||||||
if need_to_highlight && token.is_word() && matcher.matches(token.text()){
|
.map(|(word, token)| {
|
||||||
let mut new_word = String::new();
|
if need_to_highlight && token.is_word() && matcher.matches(token.text()) {
|
||||||
new_word.push_str(&self.marks.0);
|
let mut new_word = String::new();
|
||||||
new_word.push_str(&word);
|
new_word.push_str(&self.marks.0);
|
||||||
new_word.push_str(&self.marks.1);
|
new_word.push_str(&word);
|
||||||
new_word
|
new_word.push_str(&self.marks.1);
|
||||||
} else {
|
new_word
|
||||||
word.to_string()
|
} else {
|
||||||
}
|
word.to_string()
|
||||||
})
|
}
|
||||||
.collect::<String>()
|
})
|
||||||
|
.collect::<String>()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fn parse_facets(
|
fn parse_facets(
|
||||||
facets: &Value,
|
facets: &Value,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
@ -412,7 +441,7 @@ mod test {
|
|||||||
fn no_formatted() {
|
fn no_formatted() {
|
||||||
let stop_words = fst::Set::default();
|
let stop_words = fst::Set::default();
|
||||||
let highlighter =
|
let highlighter =
|
||||||
Highlighter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
Formatter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
||||||
|
|
||||||
let mut fields = FieldsIdsMap::new();
|
let mut fields = FieldsIdsMap::new();
|
||||||
let id = fields.insert("test").unwrap();
|
let id = fields.insert("test").unwrap();
|
||||||
@ -439,7 +468,8 @@ mod test {
|
|||||||
&all_formatted,
|
&all_formatted,
|
||||||
&to_highlight_ids,
|
&to_highlight_ids,
|
||||||
&to_crop_ids,
|
&to_crop_ids,
|
||||||
).unwrap();
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
assert!(value.is_empty());
|
assert!(value.is_empty());
|
||||||
}
|
}
|
||||||
@ -448,7 +478,7 @@ mod test {
|
|||||||
fn formatted_no_highlight() {
|
fn formatted_no_highlight() {
|
||||||
let stop_words = fst::Set::default();
|
let stop_words = fst::Set::default();
|
||||||
let highlighter =
|
let highlighter =
|
||||||
Highlighter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
Formatter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
||||||
|
|
||||||
let mut fields = FieldsIdsMap::new();
|
let mut fields = FieldsIdsMap::new();
|
||||||
let id = fields.insert("test").unwrap();
|
let id = fields.insert("test").unwrap();
|
||||||
@ -475,7 +505,8 @@ mod test {
|
|||||||
&all_formatted,
|
&all_formatted,
|
||||||
&to_highlight_ids,
|
&to_highlight_ids,
|
||||||
&to_crop_ids,
|
&to_crop_ids,
|
||||||
).unwrap();
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(value["test"], "hello");
|
assert_eq!(value["test"], "hello");
|
||||||
}
|
}
|
||||||
@ -484,7 +515,7 @@ mod test {
|
|||||||
fn formatted_with_highlight() {
|
fn formatted_with_highlight() {
|
||||||
let stop_words = fst::Set::default();
|
let stop_words = fst::Set::default();
|
||||||
let highlighter =
|
let highlighter =
|
||||||
Highlighter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
Formatter::new(&stop_words, (String::from("<em>"), String::from("</em>")));
|
||||||
|
|
||||||
let mut fields = FieldsIdsMap::new();
|
let mut fields = FieldsIdsMap::new();
|
||||||
let id = fields.insert("test").unwrap();
|
let id = fields.insert("test").unwrap();
|
||||||
@ -511,7 +542,8 @@ mod test {
|
|||||||
&all_formatted,
|
&all_formatted,
|
||||||
&to_highlight_ids,
|
&to_highlight_ids,
|
||||||
&to_crop_ids,
|
&to_crop_ids,
|
||||||
).unwrap();
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(value["test"], "<em>hello</em>");
|
assert_eq!(value["test"], "<em>hello</em>");
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user