From 246286f0ebede97517bc262fc9cf67448e221194 Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 2 Mar 2021 11:14:10 +0100
Subject: [PATCH] take hard separator into account

---
 milli/src/update/index_documents/store.rs | 41 +++++++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 96d1098f9..05974d55e 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -471,14 +471,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
                 };
 
                 let analyzed = self.analyzer.analyze(&content);
-                let tokens = analyzed
-                    .tokens()
-                    .filter(|t| t.is_word())
-                    .map(|t| t.text().to_string());
+                let tokens = process_tokens(analyzed.tokens());
 
-                for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
+                for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
                     let position = (attr as usize * MAX_POSITION + pos) as u32;
-                    words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
+                    words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
                 }
             }
         }
@@ -609,6 +606,36 @@ enum FacetValue {
     Integer(i64),
 }
 
+/// Take an iterator over tokens and compute each word's relative position depending on
+/// separator kinds: if it's a `Hard` separator we add an additional relative proximity of 8
+/// between words, else we keep the standard proximity of 1 between words.
+fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
+    tokens
+        .skip_while(|token| token.is_separator().is_some())
+        .scan((0, None), |(offset, prev_kind), token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                    *offset += match *prev_kind {
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(_) => 1,
+                        None => 0,
+                    };
+                    *prev_kind = Some(token.kind)
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *prev_kind = Some(token.kind);
+                }
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
+                    *prev_kind = Some(token.kind);
+                }
+                _ => (),
+            }
+            Some((*offset, token))
+        })
+        .filter(|(_, t)| t.is_word())
+}
+
 fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
     use FacetValue::*;