hard separator offset

mpostma 2020-11-26 10:18:36 +01:00 committed by many
parent 8843062604
commit e616b1e356
3 changed files with 45 additions and 26 deletions

View File

@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::hash::{Hash, Hasher};
 use std::ops::Range;
 use std::time::Instant;
@@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
 use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
 use log::debug;
-use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::{Token, token::SeparatorKind};
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use sdset::{Set, SetBuf, SetOperation};
 
@@ -175,13 +175,20 @@ where I: IntoIterator<Item=Operation>,
 const MAX_NGRAM: usize = 3;
 
-fn split_query_string(s: &str) -> Vec<(usize, String)> {
+fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, String)> {
     // TODO: Use global instance instead
-    let analyzer = Analyzer::new(AnalyzerConfig::default());
+    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
     analyzer
         .analyze(s)
         .tokens()
-        .filter(|t| !t.is_stopword())
+        .scan(0, |offset, mut token| {
+            token.char_index += *offset;
+            if let Some(SeparatorKind::Hard) = token.is_separator() {
+                *offset += 8;
+            }
+            Some(token)
+        })
+        .filter(|t| t.is_word())
         .enumerate()
         .map(|(i, Token { word, .. })| (i, word.to_string()))
         .collect()
 }
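
The `scan` adapter above is the core of the change: it threads a running offset through the token stream, bumping each token's `char_index` by it and growing it by 8 whenever a hard separator (a sentence-level break such as `.`) goes by, so every word after a hard separator lands 8 extra positions away, presumably far enough for the proximity criterion to stop treating the words as related. A minimal, self-contained sketch of the same pattern, using a toy token type in place of `meilisearch_tokenizer`'s `Token`:

```rust
// A toy token type; the names are illustrative, not the meilisearch_tokenizer API.
#[derive(Debug)]
struct Tok {
    word: String,
    char_index: usize,
    hard_separator: bool,
}

fn offset_positions(tokens: Vec<Tok>) -> Vec<Tok> {
    tokens
        .into_iter()
        // `scan` threads the accumulated offset through the whole stream.
        .scan(0, |offset, mut tok| {
            tok.char_index += *offset;
            if tok.hard_separator {
                // Everything after a hard separator moves 8 positions away.
                *offset += 8;
            }
            Some(tok)
        })
        // Separators were only needed to update the offset; drop them.
        .filter(|t| !t.hard_separator)
        .collect()
}

fn main() {
    let toks = vec![
        Tok { word: "new".into(), char_index: 0, hard_separator: false },
        Tok { word: "york".into(), char_index: 1, hard_separator: false },
        Tok { word: ".".into(), char_index: 2, hard_separator: true },
        Tok { word: "subway".into(), char_index: 3, hard_separator: false },
    ];
    for t in offset_positions(toks) {
        println!("{} -> {}", t.word, t.char_index);
    }
    // Prints: new -> 0, york -> 1, subway -> 11
}
```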
@@ -193,7 +200,13 @@ pub fn create_query_tree(
     query: &str,
 ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
 {
-    let words = split_query_string(query);
+    // TODO: use a shared analyzer instance
+    let words = split_query_string(query, ctx.stop_words
+        .stream()
+        .into_strs()
+        .unwrap_or_default()
+        .into_iter()
+        .collect());
 
     let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
 
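
On the calling side, `create_query_tree` now materializes the index's stop words, stored as an `fst::Set`, into the `HashSet<String>` that `split_query_string` expects. A sketch of that conversion in isolation, assuming the non-generic `fst::Set` API of the fst version in use at the time (the helper name is mine):

```rust
use std::collections::HashSet;

use fst::Set;

fn stop_words_set(stop_words: &Set) -> HashSet<String> {
    // Stream every key out of the fst and collect it as an owned String.
    // `into_strs` fails on non-UTF-8 keys, in which case we fall back to
    // an empty list, mirroring the `unwrap_or_default` in the diff.
    stop_words
        .stream()
        .into_strs()
        .unwrap_or_default()
        .into_iter()
        .collect()
}
```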

View File

@@ -1,11 +1,10 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
-use std::println;
 
 use meilisearch_schema::IndexedPos;
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
-use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::{Token, token::SeparatorKind};
 use sdset::SetBuf;
 
 use crate::{DocIndex, DocumentId};
@@ -45,11 +44,18 @@ impl RawIndexer {
         let mut number_of_words = 0;
 
         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
-            print!("token: {}", token.word);
+        for (word_pos, token) in analyzed_text.tokens()
+            .scan(0, |offset, mut token| {
+                token.char_index += *offset;
+                if let Some(SeparatorKind::Hard) = token.is_separator() {
+                    *offset += 8;
+                }
+                Some(token)
+            })
+            .filter(|t| t.is_word())
+            .enumerate() {
             let must_continue = index_token(
                 token,
-                token_index,
                 word_pos,
                 id,
                 indexed_pos,
@@ -72,37 +78,39 @@ impl RawIndexer {
     where
         I: IntoIterator<Item = &'s str>,
     {
-        let mut token_index_offset = 0;
         let mut byte_offset = 0;
         let mut word_offset = 0;
 
         for s in iter.into_iter() {
-            let current_token_index_offset = token_index_offset;
             let current_byte_offset = byte_offset;
             let current_word_offset = word_offset;
 
             let analyzed_text = self.analyzer.analyze(s);
             let tokens = analyzed_text
                 .tokens()
-                .enumerate()
-                .filter(|(_, t)| t.is_word())
-                .map(|(i, mut t)| {
+                .scan(0, |offset, mut token| {
+                    token.char_index += *offset;
+                    if let Some(SeparatorKind::Hard) = token.is_separator() {
+                        *offset += 8;
+                    }
+                    Some(token)
+                })
+                .filter(|t| t.is_word())
+                .map(|mut t| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
-                    (i + current_token_index_offset, t)
+                    t
                 })
                 .enumerate()
                 .map(|(i, t)| (i + current_word_offset, t));
 
-            for (word_pos, (token_index, token)) in tokens {
-                token_index_offset = token_index + 1;
+            for (word_pos, token) in tokens {
                 word_offset = word_pos + 1;
                 byte_offset = token.byte_end + 1;
 
                 let must_continue = index_token(
                     token,
                     word_pos,
-                    token_index,
                     id,
                     indexed_pos,
                     self.word_limit,
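
`index_text_seq` gets the same `scan` treatment and additionally stitches successive text fragments together: `word_offset` and `byte_offset` outlive each loop iteration, so the next fragment's tokens keep numbering where the previous fragment stopped, which also makes the old `token_index_offset` bookkeeping and its extra `enumerate` redundant. A simplified sketch of just the offset stitching, with whitespace splitting as a hypothetical stand-in for the analyzer:

```rust
fn index_fragments<'a, I: IntoIterator<Item = &'a str>>(iter: I) {
    let mut byte_offset = 0;
    let mut word_offset = 0;

    for s in iter {
        let current_byte_offset = byte_offset;
        let current_word_offset = word_offset;

        // Stand-in for the analyzer: (byte_start, byte_end, word) triples,
        // computed within the fragment and shifted into the global byte space.
        let tokens = s
            .split_whitespace()
            .map(|w| {
                let start = w.as_ptr() as usize - s.as_ptr() as usize;
                (start + current_byte_offset, start + w.len() + current_byte_offset, w)
            })
            .enumerate()
            .map(|(i, t)| (i + current_word_offset, t));

        for (word_pos, (byte_start, byte_end, word)) in tokens {
            // The next fragment picks up right after this fragment's last token.
            word_offset = word_pos + 1;
            byte_offset = byte_end + 1;
            println!("{} @ word {}, bytes {}..{}", word, word_pos, byte_start, byte_end);
        }
    }
}

fn main() {
    // The second fragment continues numbering after the first:
    // new @ word 0, york @ word 1, subway @ word 2, line @ word 3.
    index_fragments(vec!["new york", "subway line"]);
}
```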
@@ -144,7 +152,6 @@ impl RawIndexer {
 
 fn index_token(
     token: Token,
-    position: usize,
     word_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
@@ -153,7 +160,6 @@ fn index_token(
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool
 {
-    println!(" position {}, limit: {}", position, word_limit);
     if word_pos >= word_limit {
         return false;
     }