fix indexed document length bug

mpostma 2020-06-30 17:11:15 +02:00
parent 0953d99198
commit 3423c0b246
2 changed files with 38 additions and 1 deletion
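
Summary, as read from the diff below: the word-limit check in index_token compared token.word_index against word_limit, but word_index is not a plain token counter; the tests further down show it jumping ahead (0, 1, 9, 17, ...) when the text contains hard separators, so a long document could hit the limit after far fewer indexed tokens than intended. The commit adds an index field to Token, a counter that the tokenizers increment by exactly one per emitted token, and compares that against word_limit instead.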

View File

@@ -124,7 +124,7 @@ fn index_token<A>(
 ) -> bool
 where A: AsRef<[u8]>,
 {
-    if token.word_index >= word_limit {
+    if token.index >= word_limit {
         return false;
     }

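For reference, a minimal sketch of what the changed condition amounts to. The names and the trimmed-down Token below are hypothetical stand-ins, not the real index_token signature from this codebase; the concrete numbers come from the "ouch" assertion in the tests further down.

    // Simplified stand-in for the Token type touched by this commit.
    #[derive(Debug)]
    struct Token<'a> {
        word: &'a str,
        index: usize,      // new field: running position in the token sequence
        word_index: usize, // word position; jumps ahead at hard separators
    }

    // Hypothetical helper mirroring the changed check: a token is indexed only
    // while fewer than word_limit tokens have been produced so far.
    fn within_word_limit(token: &Token<'_>, word_limit: usize) -> bool {
        // the pre-commit check compared token.word_index >= word_limit,
        // which cut documents short once word_index had jumped past the limit
        token.index < word_limit
    }

    fn main() {
        // "ouch" from one of the tests below: only the 4th token, but word_index is already 17
        let token = Token { word: "ouch", index: 3, word_index: 17 };
        let word_limit = 10; // arbitrary example limit
        // old check: 17 >= 10, token dropped; new check: 3 < 10, token kept
        assert!(within_word_limit(&token, word_limit));
        println!("{:?} kept: {}", token, within_word_limit(&token, word_limit));
    }
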
View File

@@ -101,11 +101,14 @@ pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub struct Token<'a> {
     pub word: &'a str,
+    /// index of the token in the token sequence
+    pub index: usize,
     pub word_index: usize,
     pub char_index: usize,
 }

 pub struct Tokenizer<'a> {
+    count: usize,
     inner: &'a str,
     word_index: usize,
     char_index: usize,
@@ -121,6 +124,7 @@ impl<'a> Tokenizer<'a> {
             .fold((0, 0), chars_count_index);

         Tokenizer {
+            count: 0,
             inner: &string[index..],
             word_index: 0,
             char_index: count,
@@ -150,6 +154,7 @@ impl<'a> Iterator for Tokenizer<'a> {
            let token = Token {
                word: string,
+               index: self.count,
                word_index: self.word_index,
                char_index: self.char_index,
            };
@@ -158,6 +163,7 @@ impl<'a> Iterator for Tokenizer<'a> {
                self.word_index += 1;
            }
+           self.count += 1;
            self.char_index += count;
            self.inner = &self.inner[index..];
@@ -175,6 +181,7 @@ where
 {
     inner: I,
     current: Option<Peekable<Tokenizer<'a>>>,
+    count: usize,
     word_offset: usize,
     char_offset: usize,
 }
@@ -188,6 +195,7 @@ where
         SeqTokenizer {
             inner: iter,
             current,
+            count: 0,
             word_offset: 0,
             char_offset: 0,
         }
@@ -209,6 +217,7 @@ where
                // to the token before returning it
                let token = Token {
                    word: token.word,
+                   index: self.count,
                    word_index: token.word_index + self.word_offset,
                    char_index: token.char_index + self.char_offset,
                };
@@ -249,6 +258,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "salut",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -261,6 +271,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "yo",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -276,6 +287,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "yo",
+                index: 0,
                 word_index: 0,
                 char_index: 4
             })
@@ -284,6 +296,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lolo",
+                index: 1,
                 word_index: 1,
                 char_index: 7
             })
@@ -292,6 +305,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "aïe",
+                index: 2,
                 word_index: 9,
                 char_index: 13
             })
@@ -300,6 +314,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "ouch",
+                index: 3,
                 word_index: 17,
                 char_index: 18
             })
@@ -312,6 +327,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "yo",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -320,6 +336,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lolo",
+                index: 1,
                 word_index: 8,
                 char_index: 5
             })
@@ -328,6 +345,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "wtf",
+                index: 2,
                 word_index: 16,
                 char_index: 12
             })
@@ -336,6 +354,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lol",
+                index: 3,
                 word_index: 17,
                 char_index: 18
             })
@@ -344,6 +363,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "aïe",
+                index: 4,
                 word_index: 25,
                 char_index: 24
             })
@@ -359,6 +379,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "yo",
+                index: 0,
                 word_index: 0,
                 char_index: 4
             })
@@ -367,6 +388,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "😂",
+                index: 1,
                 word_index: 1,
                 char_index: 7
             })
@@ -375,6 +397,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "aïe",
+                index: 2,
                 word_index: 9,
                 char_index: 10
             })
@@ -387,6 +410,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "yo",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -395,6 +419,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lolo",
+                index: 1,
                 word_index: 8,
                 char_index: 5
             })
@@ -403,6 +428,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "😱",
+                index: 2,
                 word_index: 16,
                 char_index: 12
             })
@@ -411,6 +437,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lol",
+                index: 3,
                 word_index: 17,
                 char_index: 16
             })
@@ -419,6 +446,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "😣",
+                index: 4,
                 word_index: 25,
                 char_index: 22
             })
@@ -434,6 +462,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ec4}",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -442,6 +471,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lolilol",
+                index: 1,
                 word_index: 1,
                 char_index: 1
             })
@@ -450,6 +480,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ec7}",
+                index: 2,
                 word_index: 2,
                 char_index: 8
             })
@@ -462,6 +493,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ec4}",
+                index: 0,
                 word_index: 0,
                 char_index: 0
             })
@@ -470,6 +502,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ed3}",
+                index: 1,
                 word_index: 1,
                 char_index: 1
             })
@@ -478,6 +511,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ef2}",
+                index: 2,
                 word_index: 2,
                 char_index: 2
             })
@@ -486,6 +520,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "lolilol",
+                index: 3,
                 word_index: 3,
                 char_index: 4
             })
@@ -494,6 +529,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "hello",
+                index: 4,
                 word_index: 4,
                 char_index: 14
             })
@@ -502,6 +538,7 @@ mod tests {
             tokenizer.next(),
             Some(Token {
                 word: "\u{2ec7}",
+                index: 5,
                 word_index: 5,
                 char_index: 23
             })
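
To make the difference between the two counters concrete, here is a small sketch driven by (word, index, word_index) triples copied from one of the assertions above. The input strings themselves are not part of this diff, the limit of 20 is an arbitrary example value, and the reading that word_index jumps at hard separators is inferred from these numbers.

    fn main() {
        // (word, index, word_index) triples as asserted in one of the tests above
        let tokens = [
            ("yo", 0, 0),
            ("lolo", 1, 8),
            ("wtf", 2, 16),
            ("lol", 3, 17),
            ("aïe", 4, 25),
        ];

        let word_limit = 20; // arbitrary example limit

        for (word, index, word_index) in tokens {
            let kept_before = word_index < word_limit; // old check, on word_index
            let kept_after = index < word_limit;       // new check, on index
            println!("{word:>4}: index={index} word_index={word_index:>2} kept before fix: {kept_before}, after fix: {kept_after}");
        }
    }

With the old check and this example limit, "aïe" (word_index 25) is already past the limit even though it is only the fifth token of the document; the new per-token index keeps counting 0 through 4 and lets it through.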