fix indexing length

mpostma 2020-11-26 20:01:53 +01:00 committed by many
parent 206308c1aa
commit c6434f609c
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
3 changed files with 39 additions and 37 deletions

View File

@@ -181,27 +181,25 @@ fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set<A>)
     analyzer
         .analyze(s)
         .tokens()
-        .scan((0, None), |(offset, sepcat), mut token| {
+        .scan((0, false), |(offset, is_hard_sep), mut token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                    if let Some(SeparatorKind::Hard) = sepcat {
+                    if *is_hard_sep {
                         *offset += 8;
+                    } else {
+                        *offset += 1;
                     }
-                    *sepcat = None;
+                    *is_hard_sep = false;
                     token.char_index += *offset;
                 }
                 TokenKind::Separator(SeparatorKind::Hard) => {
-                    *sepcat = Some(SeparatorKind::Hard);
-                }
-                TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                    *sepcat = Some(SeparatorKind::Soft);
+                    *is_hard_sep = true;
                 }
                 _ => (),
             }
-            Some(token)
+            Some((*offset, token))
         })
-        .filter(|t| t.is_word())
-        .enumerate()
+        .filter(|(_, t)| t.is_word())
         .map(|(i, Token { word, .. })| (i, word.to_string()))
         .collect()
 }
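
The new scan state replaces the old SeparatorKind tracking: a word's position now advances by 1 between ordinary words and jumps by 8 across a hard separator, and that position (rather than a plain enumerate index) is what the iterator yields as the word index. A minimal standalone sketch of this numbering, using a hypothetical Kind enum and plain tuples in place of the analyzer's Token type:

// Illustrative sketch only: words get consecutive positions, and a hard
// separator (".", "!", ...) opens a gap of 8, mirroring the scan state above.
#[derive(Clone, Copy, PartialEq)]
enum Kind { Word, HardSep, SoftSep }

fn positions(tokens: &[(Kind, &str)]) -> Vec<(usize, String)> {
    tokens
        .iter()
        .scan((0, false), |(offset, is_hard_sep), (kind, word)| {
            match kind {
                Kind::Word => {
                    // +8 when the previous separator was hard, +1 otherwise
                    *offset += if *is_hard_sep { 8 } else { 1 };
                    *is_hard_sep = false;
                }
                Kind::HardSep => *is_hard_sep = true,
                Kind::SoftSep => (),
            }
            Some((*offset, *kind, word.to_string()))
        })
        // keep only words, carrying their computed position along
        .filter(|(_, kind, _)| *kind == Kind::Word)
        .map(|(pos, _, word)| (pos, word))
        .collect()
}

fn main() {
    let tokens = [
        (Kind::Word, "hello"),
        (Kind::SoftSep, " "),
        (Kind::Word, "world"),
        (Kind::HardSep, "."),
        (Kind::Word, "again"),
    ];
    // prints [(1, "hello"), (2, "world"), (10, "again")]: the hard separator
    // pushes "again" 8 positions past "world"
    println!("{:?}", positions(&tokens));
}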

View File

@@ -50,31 +50,32 @@ where
         let mut number_of_words = 0;
         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, token) in analyzed_text.tokens()
-            .scan((0, None), |(offset, sepcat), mut token| {
+        for (token_pos, (word_pos, token)) in analyzed_text
+            .tokens()
+            .scan((0, false), |(offset, is_hard_sep), mut token| {
                 match token.kind {
-                    TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                        if let Some(SeparatorKind::Hard) = sepcat {
+                    TokenKind::Word => {
+                        if *is_hard_sep {
                             *offset += 8;
+                        } else {
+                            *offset += 1;
                         }
-                        *sepcat = None;
+                        *is_hard_sep = false;
                         token.char_index += *offset;
                     }
                     TokenKind::Separator(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Hard);
-                    }
-                    TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Soft);
+                        *is_hard_sep = true;
                     }
                     _ => (),
                 }
-                Some(token)
+                Some((*offset, token))
            })
-            .filter(|t| t.is_word())
+            .filter(|(_, t)| t.is_word())
             .enumerate() {
             let must_continue = index_token(
                 token,
                 word_pos,
+                token_pos,
                 id,
                 indexed_pos,
                 self.word_limit,
@@ -106,41 +107,41 @@ where
             let analyzed_text = self.analyzer.analyze(s);
             let tokens = analyzed_text
                 .tokens()
-                .scan((0, None), |(offset, sepcat), mut token| {
+                .scan((0, false), |(offset, is_hard_sep), mut token| {
                     match token.kind {
                         TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                            if let Some(SeparatorKind::Hard) = sepcat {
+                            if *is_hard_sep {
                                 *offset += 8;
+                            } else {
+                                *offset += 1;
                             }
-                            *sepcat = None;
+                            *is_hard_sep = false;
                             token.char_index += *offset;
                         }
                         TokenKind::Separator(SeparatorKind::Hard) => {
-                            *sepcat = Some(SeparatorKind::Hard);
-                        }
-                        TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                            *sepcat = Some(SeparatorKind::Soft);
+                            *is_hard_sep = true;
                         }
                         _ => (),
                     }
-                    Some(token)
+                    Some((*offset, token))
                 })
-                .filter(|t| t.is_word())
-                .map(|mut t| {
+                .filter(|(_, t)| t.is_word())
+                .map(|(i, mut t)| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
-                    t
+                    (i, t)
                 })
-                .enumerate()
-                .map(|(i, t)| (i + current_word_offset, t));
-            for (word_pos, token) in tokens {
+                .map(|(i, t)| (i + current_word_offset, t))
+                .enumerate();
+            for (token_pos, (word_pos, token)) in tokens {
                 word_offset = word_pos + 1;
                 byte_offset = token.byte_end + 1;
                 let must_continue = index_token(
                     token,
                     word_pos,
+                    token_pos,
                     id,
                     indexed_pos,
                     self.word_limit,
@@ -183,6 +184,7 @@ where
 fn index_token(
     token: Token,
     word_pos: usize,
+    token_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
@@ -190,7 +192,7 @@ fn index_token(
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool
 {
-    if word_pos >= word_limit {
+    if token_pos >= word_limit {
         return false;
     }
@@ -330,7 +332,7 @@ mod tests {
         let Indexed {
             words_doc_indexes, ..
         } = indexer.build();
-        assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some());
     }

     #[test]

View File

@@ -102,6 +102,8 @@ async fn placeholder_search_witch_crop() {
         "cropLength": 20
     });
+    println!("here");
     test_post_get_search!(server, query, |response, status_code| {
         assert_eq!(status_code, 200);