diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 201610d44..b8cbe0ac6 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -395,7 +395,6 @@ mod tests {
             let mut writer = db.main_write_txn().unwrap();
 
             let word = normalize_str(word);
-            println!("synonym: {}", word);
 
             let alternatives = self
                 .index
@@ -1261,7 +1260,6 @@ mod tests {
 
         let builder = store.query_builder();
         let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap();
-        println!("documents: {:#?}", documents);
         let mut iter = documents.into_iter();
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@@ -1297,7 +1295,6 @@ mod tests {
         let builder = store.query_builder();
         let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap();
         let mut iter = documents.into_iter();
-        // this test was in the opposite order, I am not sure why...
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));
diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs
index 6e9afc677..3a7519c90 100644
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -14,10 +14,7 @@ const WORD_LENGTH_LIMIT: usize = 80;
 
 type Word = Vec<u8>; // TODO make it be a SmallVec<u8>
 
-pub struct RawIndexer<'a, A>
-where
-    A: AsRef<[u8]>
-{
+pub struct RawIndexer<'a, A> {
     word_limit: usize, // the maximum number of indexed words
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
@@ -73,25 +70,24 @@
         number_of_words
     }
 
-    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
+    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
     where
        I: IntoIterator<Item = &'s str>,
     {
         let mut byte_offset = 0;
         let mut word_offset = 0;
 
-        for s in iter.into_iter() {
+        for text in text_iter.into_iter() {
             let current_byte_offset = byte_offset;
             let current_word_offset = word_offset;
 
-            let analyzed_text = self.analyzer.analyze(s);
+            let analyzed_text = self.analyzer.analyze(text);
             let tokens = process_tokens(analyzed_text.tokens())
                 .map(|(i, mut t)| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
-                    (i, t)
+                    (i + current_word_offset, t)
                 })
-                .map(|(i, t)| (i + current_word_offset, t))
                 .enumerate();
 
             for (token_pos, (word_pos, token)) in tokens {
@@ -143,21 +139,22 @@
 
 fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
-        .scan((0, None), |(offset, sepkind), token| {
+        .scan((0, None), |(offset, prev_kind), token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                    *offset += match *sepkind {
+                    *offset += match *prev_kind {
                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                         Some(_) => 1,
                         None => 0,
                     };
-                    *sepkind = Some(token.kind)
+                    *prev_kind = Some(token.kind)
                 }
                 TokenKind::Separator(SeparatorKind::Hard) => {
-                    *sepkind = Some(token.kind);
+                    *prev_kind = Some(token.kind);
                 }
-                TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => {
-                    *sepkind = Some(token.kind);
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
+                    *prev_kind = Some(token.kind);
                 }
                 _ => (),
             }
@@ -226,12 +223,12 @@ mod tests {
 
     #[test]
    fn test_process_token() {
-        let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !";
+        let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
         let stopwords = Set::default();
         let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
         let analyzer = analyzer.analyze(text);
-        let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect();
-        println!("tokens: {:?}", tokens);
+        let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
+        assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]);
     }
 
     #[test]
diff --git a/meilisearch-http/tests/placeholder_search.rs b/meilisearch-http/tests/placeholder_search.rs
index fb1286248..048ab7f8b 100644
--- a/meilisearch-http/tests/placeholder_search.rs
+++ b/meilisearch-http/tests/placeholder_search.rs
@@ -102,8 +102,6 @@ async fn placeholder_search_witch_crop() {
         "cropLength": 20
     });
 
-    println!("here");
-
     test_post_get_search!(server, query, |response, status_code| {
         assert_eq!(status_code, 200);