From e09eec37bc4fc6529129f24fd45c7c6d28ec2297 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 25 Aug 2021 15:09:46 +0200 Subject: [PATCH] Handle distance addition with hard separators --- .../extract/extract_docid_word_positions.rs | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index fb3372660..894a193bf 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -3,7 +3,8 @@ use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; use roaring::RoaringBitmap; use serde_json::Value; @@ -61,11 +62,8 @@ pub fn extract_docid_word_positions( field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); - let tokens = analyzed - .tokens() - .filter(Token::is_word) - .enumerate() - .take_while(|(i, _)| (*i as u32) < ONE_ATTRIBUTE); + let tokens = process_tokens(analyzed.tokens()) + .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); for (index, token) in tokens { let token = token.text().trim(); @@ -134,3 +132,36 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st None } } + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's a `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standard proximity of 1 between words.
+fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator().is_some()) + .scan((0, None), |(offset, prev_kind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +}