fix suggestions

mpostma 2020-12-03 12:34:22 +01:00 committed by many
parent 8b149c9aa3
commit 8e64a24d19
3 changed files with 15 additions and 23 deletions


@@ -395,7 +395,6 @@ mod tests {
         let mut writer = db.main_write_txn().unwrap();
         let word = normalize_str(word);
-        println!("synonym: {}", word);
         let alternatives = self
             .index
@@ -1261,7 +1260,6 @@ mod tests {
         let builder = store.query_builder();
         let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap();
-        println!("documents: {:#?}", documents);
         let mut iter = documents.into_iter();
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@@ -1297,7 +1295,6 @@ mod tests {
         let builder = store.query_builder();
         let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap();
         let mut iter = documents.into_iter();
-        // this test was in the opposite order, I am not sure why...
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));


@@ -14,10 +14,7 @@ const WORD_LENGTH_LIMIT: usize = 80;
 type Word = Vec<u8>; // TODO make it be a SmallVec
-pub struct RawIndexer<'a, A>
-where
-    A: AsRef<[u8]>
-{
+pub struct RawIndexer<'a, A> {
     word_limit: usize, // the maximum number of indexed words
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
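Reviewer note on the hunk above: the `where A: AsRef<[u8]>` clause is dropped from the struct declaration; leaving the type parameter unbounded on the struct and constraining it only on the impl blocks that need it is the usual Rust idiom. A minimal, self-contained sketch of that pattern (the `Indexer` stand-in and its fields are illustrative, not the real `RawIndexer`):

use std::collections::BTreeMap;

// The generic parameter carries no bound on the struct itself.
struct Indexer<A> {
    word_limit: usize,
    words_doc_indexes: BTreeMap<Vec<u8>, Vec<u32>>,
    stop_words: A,
}

// The bound lives only on the impl block that actually needs it.
impl<A: AsRef<[u8]>> Indexer<A> {
    fn new(stop_words: A) -> Self {
        Indexer { word_limit: 1000, words_doc_indexes: BTreeMap::new(), stop_words }
    }

    fn stop_words_bytes(&self) -> &[u8] {
        self.stop_words.as_ref()
    }
}

fn main() {
    let indexer = Indexer::new(b"the and or".to_vec());
    assert_eq!(indexer.stop_words_bytes().len(), 10);
    assert_eq!(indexer.word_limit, 1000);
}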
@@ -73,25 +70,24 @@ where
         number_of_words
     }
-    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
+    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
     where
         I: IntoIterator<Item = &'s str>,
     {
         let mut byte_offset = 0;
         let mut word_offset = 0;
-        for s in iter.into_iter() {
+        for text in text_iter.into_iter() {
             let current_byte_offset = byte_offset;
             let current_word_offset = word_offset;
-            let analyzed_text = self.analyzer.analyze(s);
+            let analyzed_text = self.analyzer.analyze(text);
             let tokens = process_tokens(analyzed_text.tokens())
                 .map(|(i, mut t)| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
-                    (i, t)
+                    (i + current_word_offset, t)
                 })
-                .map(|(i, t)| (i + current_word_offset, t))
                 .enumerate();
             for (token_pos, (word_pos, token)) in tokens {
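Reviewer note on the hunk above: the separate `.map(|(i, t)| (i + current_word_offset, t))` pass is folded into the preceding closure, which already rewrites the byte offsets, so every token is adjusted in a single pass. A tiny sketch of why the two shapes are equivalent (plain integers stand in for the real tokens):

fn main() {
    let current_word_offset: usize = 3;

    // before: adjust the token, then shift its word index in a second map
    let two_pass: Vec<(usize, u32)> = (0..4u32)
        .map(|t| (t as usize, t * 10))
        .map(|(i, t)| (i + current_word_offset, t))
        .collect();

    // after: apply the shift inside the first closure
    let one_pass: Vec<(usize, u32)> = (0..4u32)
        .map(|t| (t as usize + current_word_offset, t * 10))
        .collect();

    assert_eq!(two_pass, one_pass);
}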
@@ -143,21 +139,22 @@ where
 fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
-        .scan((0, None), |(offset, sepkind), token| {
+        .scan((0, None), |(offset, prev_kind), token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                    *offset += match *sepkind {
+                    *offset += match *prev_kind {
                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                         Some(_) => 1,
                         None => 0,
                     };
-                    *sepkind = Some(token.kind)
+                    *prev_kind = Some(token.kind)
                 }
                 TokenKind::Separator(SeparatorKind::Hard) => {
-                    *sepkind = Some(token.kind);
+                    *prev_kind = Some(token.kind);
                }
-                TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => {
-                    *sepkind = Some(token.kind);
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
+                    *prev_kind = Some(token.kind);
                 }
                 _ => (),
             }
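Reviewer note on the hunk above: the renamed `prev_kind` accumulator drives the word-offset rule: the first word sits at position 0, a word that follows a hard separator jumps 8 positions, any other following word advances by 1, and with the new guard a soft separator is only recorded while no hard separator is pending. A minimal, self-contained mock of that rule (the `Kind` enum and `positions` helper are simplified stand-ins, not the meilisearch-tokenizer types):

#[derive(Clone, Copy, PartialEq, Debug)]
enum Kind {
    Word,
    Hard, // e.g. "." or "!"
    Soft, // e.g. " " or ","
}

// Mirrors the scan above: returns (word_position, input_index) for each word.
fn positions(kinds: &[Kind]) -> Vec<(usize, usize)> {
    let mut prev: Option<Kind> = None;
    let mut offset = 0;
    let mut out = Vec::new();
    for (i, &kind) in kinds.iter().enumerate() {
        match kind {
            Kind::Word => {
                offset += match prev {
                    Some(Kind::Hard) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                prev = Some(kind);
                out.push((offset, i));
            }
            Kind::Hard => prev = Some(kind),
            // a soft separator never overrides a pending hard one
            Kind::Soft if prev != Some(Kind::Hard) => prev = Some(kind),
            Kind::Soft => {}
        }
    }
    out
}

fn main() {
    use Kind::*;
    // "hello, world. bye" tokenizes as Word Soft Word Hard Soft Word
    let got = positions(&[Word, Soft, Word, Hard, Soft, Word]);
    // the word after the soft separator moves by 1, the one after the hard stop by 8
    assert_eq!(got, vec![(0, 0), (1, 2), (9, 5)]);
}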
@@ -226,12 +223,12 @@ mod tests {
     #[test]
     fn test_process_token() {
-        let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !";
+        let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
         let stopwords = Set::default();
         let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
         let analyzer = analyzer.analyze(text);
-        let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect();
-        println!("tokens: {:?}", tokens);
+        let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
+        assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]);
     }
     #[test]


@@ -102,8 +102,6 @@ async fn placeholder_search_witch_crop() {
         "cropLength": 20
     });
-    println!("here");
     test_post_get_search!(server, query, |response, status_code| {
         assert_eq!(status_code, 200);