mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
hard separator offset
This commit is contained in:
parent
8843062604
commit
e616b1e356
@ -1,5 +1,5 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::hash::{Hash, Hasher};
|
use std::hash::{Hash, Hasher};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
|
|||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use itertools::{EitherOrBoth, merge_join_by};
|
use itertools::{EitherOrBoth, merge_join_by};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::Token;
|
use meilisearch_tokenizer::{Token, token::SeparatorKind};
|
||||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||||
use sdset::{Set, SetBuf, SetOperation};
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
|
|
||||||
@ -175,13 +175,20 @@ where I: IntoIterator<Item=Operation>,
|
|||||||
|
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
|
|
||||||
fn split_query_string(s: &str) -> Vec<(usize, String)> {
|
fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, String)> {
|
||||||
// TODO: Use global instance instead
|
// TODO: Use global instance instead
|
||||||
let analyzer = Analyzer::new(AnalyzerConfig::default());
|
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
|
||||||
analyzer
|
analyzer
|
||||||
.analyze(s)
|
.analyze(s)
|
||||||
.tokens()
|
.tokens()
|
||||||
.filter(|t| !t.is_stopword())
|
.scan(0, |offset, mut token| {
|
||||||
|
token.char_index += *offset;
|
||||||
|
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
||||||
|
*offset += 8;
|
||||||
|
}
|
||||||
|
Some(token)
|
||||||
|
})
|
||||||
|
.filter(|t| t.is_word())
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, Token { word, .. })| (i, word.to_string()))
|
.map(|(i, Token { word, .. })| (i, word.to_string()))
|
||||||
.collect()
|
.collect()
|
||||||
@ -193,7 +200,13 @@ pub fn create_query_tree(
|
|||||||
query: &str,
|
query: &str,
|
||||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||||
{
|
{
|
||||||
let words = split_query_string(query);
|
// TODO: use a shared analyzer instance
|
||||||
|
let words = split_query_string(query, ctx.stop_words
|
||||||
|
.stream()
|
||||||
|
.into_strs()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter().
|
||||||
|
collect());
|
||||||
|
|
||||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||||
|
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::println;
|
|
||||||
|
|
||||||
use meilisearch_schema::IndexedPos;
|
use meilisearch_schema::IndexedPos;
|
||||||
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
|
||||||
use meilisearch_tokenizer::Token;
|
use meilisearch_tokenizer::{Token, token::SeparatorKind};
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
use crate::{DocIndex, DocumentId};
|
use crate::{DocIndex, DocumentId};
|
||||||
@ -45,11 +44,18 @@ impl RawIndexer {
|
|||||||
let mut number_of_words = 0;
|
let mut number_of_words = 0;
|
||||||
|
|
||||||
let analyzed_text = self.analyzer.analyze(text);
|
let analyzed_text = self.analyzer.analyze(text);
|
||||||
for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
|
for (word_pos, token) in analyzed_text.tokens()
|
||||||
print!("token: {}", token.word);
|
.scan(0, |offset, mut token| {
|
||||||
|
token.char_index += *offset;
|
||||||
|
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
||||||
|
*offset += 8;
|
||||||
|
}
|
||||||
|
Some(token)
|
||||||
|
})
|
||||||
|
.filter(|t| t.is_word())
|
||||||
|
.enumerate() {
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
token_index,
|
|
||||||
word_pos,
|
word_pos,
|
||||||
id,
|
id,
|
||||||
indexed_pos,
|
indexed_pos,
|
||||||
@ -72,37 +78,39 @@ impl RawIndexer {
|
|||||||
where
|
where
|
||||||
I: IntoIterator<Item = &'s str>,
|
I: IntoIterator<Item = &'s str>,
|
||||||
{
|
{
|
||||||
let mut token_index_offset = 0;
|
|
||||||
let mut byte_offset = 0;
|
let mut byte_offset = 0;
|
||||||
let mut word_offset = 0;
|
let mut word_offset = 0;
|
||||||
|
|
||||||
for s in iter.into_iter() {
|
for s in iter.into_iter() {
|
||||||
let current_token_index_offset = token_index_offset;
|
|
||||||
let current_byte_offset = byte_offset;
|
let current_byte_offset = byte_offset;
|
||||||
let current_word_offset = word_offset;
|
let current_word_offset = word_offset;
|
||||||
|
|
||||||
let analyzed_text = self.analyzer.analyze(s);
|
let analyzed_text = self.analyzer.analyze(s);
|
||||||
let tokens = analyzed_text
|
let tokens = analyzed_text
|
||||||
.tokens()
|
.tokens()
|
||||||
.enumerate()
|
.scan(0, |offset, mut token| {
|
||||||
.filter(|(_, t)| t.is_word())
|
token.char_index += *offset;
|
||||||
.map(|(i, mut t)| {
|
if let Some(SeparatorKind::Hard) = token.is_separator() {
|
||||||
|
*offset += 8;
|
||||||
|
}
|
||||||
|
Some(token)
|
||||||
|
})
|
||||||
|
.filter(|t| t.is_word())
|
||||||
|
.map(|mut t| {
|
||||||
t.byte_start = t.byte_start + current_byte_offset;
|
t.byte_start = t.byte_start + current_byte_offset;
|
||||||
t.byte_end = t.byte_end + current_byte_offset;
|
t.byte_end = t.byte_end + current_byte_offset;
|
||||||
(i + current_token_index_offset, t)
|
t
|
||||||
})
|
})
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, t)| (i + current_word_offset, t));
|
.map(|(i, t)| (i + current_word_offset, t));
|
||||||
|
|
||||||
for (word_pos, (token_index, token)) in tokens {
|
for (word_pos, token) in tokens {
|
||||||
token_index_offset = token_index + 1;
|
|
||||||
word_offset = word_pos + 1;
|
word_offset = word_pos + 1;
|
||||||
byte_offset = token.byte_end + 1;
|
byte_offset = token.byte_end + 1;
|
||||||
|
|
||||||
let must_continue = index_token(
|
let must_continue = index_token(
|
||||||
token,
|
token,
|
||||||
word_pos,
|
word_pos,
|
||||||
token_index,
|
|
||||||
id,
|
id,
|
||||||
indexed_pos,
|
indexed_pos,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
@ -144,7 +152,6 @@ impl RawIndexer {
|
|||||||
|
|
||||||
fn index_token(
|
fn index_token(
|
||||||
token: Token,
|
token: Token,
|
||||||
position: usize,
|
|
||||||
word_pos: usize,
|
word_pos: usize,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
indexed_pos: IndexedPos,
|
indexed_pos: IndexedPos,
|
||||||
@ -153,7 +160,6 @@ fn index_token(
|
|||||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||||
) -> bool
|
) -> bool
|
||||||
{
|
{
|
||||||
println!(" position {}, limit: {}", position, word_limit);
|
|
||||||
if word_pos >= word_limit {
|
if word_pos >= word_limit {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user