fix suggestions

This commit is contained in:
mpostma 2020-12-03 12:34:22 +01:00 committed by many
parent 8b149c9aa3
commit 8e64a24d19
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
3 changed files with 15 additions and 23 deletions

View File

@ -395,7 +395,6 @@ mod tests {
let mut writer = db.main_write_txn().unwrap();
let word = normalize_str(word);
println!("synonym: {}", word);
let alternatives = self
.index
@ -1261,7 +1260,6 @@ mod tests {
let builder = store.query_builder();
let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap();
println!("documents: {:#?}", documents);
let mut iter = documents.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@ -1297,7 +1295,6 @@ mod tests {
let builder = store.query_builder();
let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap();
let mut iter = documents.into_iter();
// this test was in the opposite order, I am not sure why...
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));

View File

@ -14,10 +14,7 @@ const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer<'a, A>
where
A: AsRef<[u8]>
{
pub struct RawIndexer<'a, A> {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
@ -73,25 +70,24 @@ where
number_of_words
}
pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
where
I: IntoIterator<Item = &'s str>,
{
let mut byte_offset = 0;
let mut word_offset = 0;
for s in iter.into_iter() {
for text in text_iter.into_iter() {
let current_byte_offset = byte_offset;
let current_word_offset = word_offset;
let analyzed_text = self.analyzer.analyze(s);
let analyzed_text = self.analyzer.analyze(text);
let tokens = process_tokens(analyzed_text.tokens())
.map(|(i, mut t)| {
t.byte_start = t.byte_start + current_byte_offset;
t.byte_end = t.byte_end + current_byte_offset;
(i, t)
(i + current_word_offset, t)
})
.map(|(i, t)| (i + current_word_offset, t))
.enumerate();
for (token_pos, (word_pos, token)) in tokens {
@ -143,21 +139,22 @@ where
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.scan((0, None), |(offset, sepkind), token| {
.scan((0, None), |(offset, prev_kind), token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
*offset += match *sepkind {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*sepkind = Some(token.kind)
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*sepkind = Some(token.kind);
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => {
*sepkind = Some(token.kind);
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
*prev_kind = Some(token.kind);
}
_ => (),
}
@ -226,12 +223,12 @@ mod tests {
#[test]
fn test_process_token() {
let text = " Zut, laspirateur, jai oublié de léteindre !";
let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
let stopwords = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
let analyzer = analyzer.analyze(text);
let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect();
println!("tokens: {:?}", tokens);
let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
assert_eq!(tokens, ["", "", "包含", "一千多万", "", "", "", "", "标记", "平衡", "语料库"]);
}
#[test]

View File

@ -102,8 +102,6 @@ async fn placeholder_search_witch_crop() {
"cropLength": 20
});
println!("here");
test_post_get_search!(server, query, |response, status_code| {
assert_eq!(status_code, 200);