MeiliSearch/meilisearch-core/src/raw_indexer.rs

use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;

/// Maximum length, in bytes, of a word that is allowed into the index.
/// Words whose byte representation is longer than this are skipped.
const WORD_LENGTH_LIMIT: usize = 80;

/// A word stored as its raw bytes.
type Word = Vec<u8>; // TODO make it be a SmallVec

/// Accumulates, token by token, the index entries of the texts fed to it.
///
/// The accumulated data is turned into an [`Indexed`] by `RawIndexer::build`.
pub struct RawIndexer {
    /// Indexing of a text stops at the first token whose word index is
    /// greater than or equal to this limit.
    word_limit: usize, // the maximum number of indexed words
    /// Lowercased words found in this set are not indexed.
    stop_words: fst::Set,
    /// For each word, every position at which it was seen (unsorted, may
    /// contain duplicates until `build` is called).
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    /// For each document, every word that was indexed for it (unsorted, may
    /// contain duplicates until `build` is called).
    docs_words: HashMap<DocumentId, Vec<Word>>,
}

/// The finalized output of a [`RawIndexer`].
pub struct Indexed {
    /// For each indexed word, the set of positions at which it appears.
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    /// For each document, the set of words that were indexed for it.
    pub docs_words: HashMap<DocumentId, fst::Set>,
}

impl RawIndexer {
    /// Creates an indexer that ignores the given `stop_words` and uses a
    /// word limit of 1000.
    pub fn new(stop_words: fst::Set) -> RawIndexer {
        Self::with_word_limit(stop_words, 1000)
    }

    /// Creates an indexer that ignores the given `stop_words` and stops
    /// indexing a text at the first token whose word index reaches `limit`.
    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
        let words_doc_indexes = BTreeMap::new();
        let docs_words = HashMap::new();

        RawIndexer {
            word_limit: limit,
            stop_words,
            words_doc_indexes,
            docs_words,
        }
    }

    /// Tokenizes `text` and indexes its tokens for the document `id` under
    /// the attribute `indexed_pos`.
    ///
    /// Returns the number of tokens that were visited. This count includes
    /// tokens that are stop words as well as the token on which indexing
    /// stopped, if any.
    pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
        let mut visited = 0;

        for token in Tokenizer::new(text) {
            // Counted up front: the token that ends the loop is part of the total.
            visited += 1;

            let keep_going = index_token(
                token,
                id,
                indexed_pos,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !keep_going {
                break;
            }
        }

        visited
    }

    /// Tokenizes a sequence of strings with a `SeqTokenizer` and indexes the
    /// resulting tokens for the document `id` under the attribute
    /// `indexed_pos`.
    ///
    /// Indexing stops at the first token that `index_token` refuses.
    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
    where
        I: IntoIterator<Item = &'a str>,
    {
        for token in SeqTokenizer::new(iter.into_iter()) {
            let keep_going = index_token(
                token,
                id,
                indexed_pos,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !keep_going {
                break;
            }
        }
    }

    /// Consumes the indexer and returns the finalized [`Indexed`] data.
    ///
    /// Each word's positions are turned into a set, and each document's words
    /// are sorted and deduplicated before being packed into an `fst::Set`.
    pub fn build(self) -> Indexed {
        let RawIndexer {
            words_doc_indexes: raw_positions,
            docs_words: raw_docs_words,
            ..
        } = self;

        let mut words_doc_indexes = BTreeMap::new();
        for (word, indexes) in raw_positions {
            words_doc_indexes.insert(word, SetBuf::from_dirty(indexes));
        }

        let mut docs_words = HashMap::with_capacity(raw_docs_words.len());
        for (id, mut words) in raw_docs_words {
            // The words are sorted and deduplicated before the set is built.
            words.sort_unstable();
            words.dedup();
            docs_words.insert(id, fst::Set::from_iter(words).unwrap());
        }

        Indexed {
            words_doc_indexes,
            docs_words,
        }
    }
}

/// Indexes a single token for the document `id`.
///
/// The token's word is lowercased first. Stop words are skipped. Otherwise the
/// lowercased word is recorded in both maps, provided it is at most
/// `WORD_LENGTH_LIMIT` bytes long. When the lowercased word contains no CJK
/// character, its deunicoded form is recorded as well, as long as that form is
/// non-empty, differs from the lowercased word and also fits the length limit.
///
/// Returns `false` when the caller must stop feeding tokens: either the
/// token's word index reached `word_limit`, or the token's position could not
/// be converted into a `DocIndex`. Returns `true` in every other case,
/// including when the word was skipped.
fn index_token(
    token: Token,
    id: DocumentId,
    indexed_pos: IndexedPos,
    word_limit: usize,
    stop_words: &fst::Set,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
    if token.word_index >= word_limit {
        return false;
    }

    let lower = token.word.to_lowercase();
    let token = Token {
        word: &lower,
        ..token
    };

    // Stop words are ignored but do not interrupt the indexing.
    if stop_words.contains(&token.word) {
        return true;
    }

    let docindex = match token_to_docindex(id, indexed_pos, token) {
        Some(docindex) => docindex,
        None => return false,
    };

    // A too-long word is dropped together with its deunicoded form.
    if lower.len() > WORD_LENGTH_LIMIT {
        return true;
    }

    // Registers one spelling of the word in both maps, at the same position.
    let mut record = |word: Word| {
        words_doc_indexes
            .entry(word.clone())
            .or_default()
            .push(docindex);
        docs_words.entry(id).or_default().push(word);
    };

    record(lower.as_bytes().to_vec());

    // Words containing CJK characters are never deunicoded.
    if lower.contains(is_cjk) {
        return true;
    }

    let unidecoded = deunicode_with_tofu(&lower, "");
    let is_new_spelling = !unidecoded.is_empty() && unidecoded != lower;
    if is_new_spelling && unidecoded.len() <= WORD_LENGTH_LIMIT {
        record(unidecoded.into_bytes());
    }

    true
}

/// Builds the `DocIndex` that describes where `token` sits in the document
/// `id`, for the attribute `indexed_pos`.
///
/// Returns `None` when the token's word index, its char index or the number of
/// chars of its word does not fit in a `u16`.
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
    Some(DocIndex {
        document_id: id,
        attribute: indexed_pos.0,
        word_index: u16::try_from(token.word_index).ok()?,
        char_index: u16::try_from(token.char_index).ok()?,
        // The length is counted in chars, not in bytes.
        char_length: u16::try_from(token.word.chars().count()).ok()?,
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use meilisearch_schema::IndexedPos;

    /// Words joined by a typographic apostrophe (’) are indexed separately,
    /// and an accented word is indexed under both its accented and its
    /// deunicoded spelling.
    #[test]
    fn strange_apostrophe() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes.get("éteindre".as_bytes()).is_some());
    }

    /// Same expectations as `strange_apostrophe`, going through
    /// `index_text_seq` instead of `index_text`.
    #[test]
    fn strange_apostrophe_in_sequence() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes.get("éteindre".as_bytes()).is_some());
    }

    /// Words listed as stop words are left out of the index while the other
    /// words of the text are still indexed.
    #[test]
    fn basic_stop_words() {
        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
        let stop_words = fst::Set::from_iter(stop_words).unwrap();

        let mut indexer = RawIndexer::new(stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes.get("éteindre".as_bytes()).is_some());
    }

    /// A word is still indexed under its original spelling even when nothing
    /// usable comes out of deunicoding it, and no empty word is ever indexed.
    #[test]
    fn no_empty_unidecode() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "🇯🇵";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get("🇯🇵".as_bytes()).is_some());
        // The point of this test: an empty deunicoded form must not be indexed.
        assert!(words_doc_indexes.get(&b""[..]).is_none());
    }
}
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								use std::collections::{BTreeMap, HashMap};
 								use std::convert::TryFrom;
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								use crate::{DocIndex, DocumentId};
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								use deunicode::deunicode_with_tofu;
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								use meilisearch_schema::IndexedPos;
-												Rename MeiliDB into MeiliSearch

											
										
										
											2019-11-26 11:06:55 +01:00
+								use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								use sdset::SetBuf;
-												Ignore words that are too long

											
										
										
											2019-11-10 17:41:32 +01:00
+								const WORD_LENGTH_LIMIT: usize = 80;
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								type Word = Vec<u8>; // TODO make it be a SmallVec
 								pub struct RawIndexer {
 								    word_limit: usize, // the maximum number of indexed words
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								    stop_words: fst::Set,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
 								    docs_words: HashMap<DocumentId, Vec<Word>>,
 								}
 								pub struct Indexed {
 								    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
 								    pub docs_words: HashMap<DocumentId, fst::Set>,
 								}
 								impl RawIndexer {
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								    pub fn new(stop_words: fst::Set) -> RawIndexer {
 								        RawIndexer::with_word_limit(stop_words, 1000)
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    }
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        RawIndexer {
 								            word_limit: limit,
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								            stop_words,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								            words_doc_indexes: BTreeMap::new(),
 								            docs_words: HashMap::new(),
 								        }
 								    }
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								    pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
-												Make the RawIndexer index_text method return the number of words

											
										
										
											2019-10-14 13:56:52 +02:00
+								        let mut number_of_words = 0;
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								        for token in Tokenizer::new(text) {
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								            let must_continue = index_token(
 								                token,
 								                id,
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								                indexed_pos,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								                self.word_limit,
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								                &self.stop_words,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								                &mut self.words_doc_indexes,
 								                &mut self.docs_words,
 								            );
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								            number_of_words += 1;
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								            if !must_continue {
 								                break;
 								            }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        }
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								        number_of_words
 								    }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								    where
 								        I: IntoIterator<Item = &'a str>,
 								    {
 								        let iter = iter.into_iter();
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        for token in SeqTokenizer::new(iter) {
 								            let must_continue = index_token(
 								                token,
 								                id,
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								                indexed_pos,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								                self.word_limit,
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								                &self.stop_words,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								                &mut self.words_doc_indexes,
 								                &mut self.docs_words,
 								            );
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								            if !must_continue {
 								                break;
 								            }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        }
 								    }
 								    pub fn build(self) -> Indexed {
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        let words_doc_indexes = self
 								            .words_doc_indexes
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								            .into_iter()
 								            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
 								            .collect();
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        let docs_words = self
 								            .docs_words
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								            .into_iter()
 								            .map(|(id, mut words)| {
 								                words.sort_unstable();
 								                words.dedup();
 								                (id, fst::Set::from_iter(words).unwrap())
 								            })
 								            .collect();
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        Indexed {
 								            words_doc_indexes,
 								            docs_words,
 								        }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    }
 								}
 								fn index_token(
 								    token: Token,
 								    id: DocumentId,
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								    indexed_pos: IndexedPos,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    word_limit: usize,
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								    stop_words: &fst::Set,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
 								    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								) -> bool {
 								    if token.word_index >= word_limit {
 								        return false;
 								    }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								    let lower = token.word.to_lowercase();
 								    let token = Token {
 								        word: &lower,
 								        ..token
 								    };
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
+								    if !stop_words.contains(&token.word) {
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								        match token_to_docindex(id, indexed_pos, token) {
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
+								            Some(docindex) => {
 								                let word = Vec::from(token.word);
-												Ignore words that are too long

											
										
										
											2019-11-10 17:41:32 +01:00
 								                if word.len() <= WORD_LENGTH_LIMIT {
 								                    words_doc_indexes
 								                        .entry(word.clone())
 								                        .or_insert_with(Vec::new)
 								                        .push(docindex);
 								                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
 								                    if !lower.contains(is_cjk) {
 								                        let unidecoded = deunicode_with_tofu(&lower, "");
 								                        if unidecoded != lower && !unidecoded.is_empty() {
 								                            let word = Vec::from(unidecoded);
 								                            if word.len() <= WORD_LENGTH_LIMIT {
 								                                words_doc_indexes
 								                                    .entry(word.clone())
 								                                    .or_insert_with(Vec::new)
 								                                    .push(docindex);
 								                                docs_words.entry(id).or_insert_with(Vec::new).push(word);
 								                            }
 								                        }
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								                    }
 								                }
 								            }
-												Fix an highlighting problem when query was longer than original text

											
										
										
											2019-11-05 16:40:34 +01:00
+								            None => return false,
-												Improve the indexer to not not deunicode before indexing

Revert of #179

											
										
										
											2019-11-04 16:09:32 +01:00
+								        }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    }
 								    true
 								}
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    let word_index = u16::try_from(token.word_index).ok()?;
 								    let char_index = u16::try_from(token.char_index).ok()?;
 								    let char_length = u16::try_from(token.word.chars().count()).ok()?;
 								    let docindex = DocIndex {
 								        document_id: id,
-												squash-me

											
										
										
											2020-01-10 18:20:30 +01:00
+								        attribute: indexed_pos.0,
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        word_index,
 								        char_index,
 								        char_length,
 								    };
 								    Some(docindex)
 								}
 								#[cfg(test)]
 								mod tests {
 								    use super::*;
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								    use meilisearch_schema::IndexedPos;
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
 								    #[test]
 								    fn strange_apostrophe() {
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								        let mut indexer = RawIndexer::new(fst::Set::default());
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
 								        let docid = DocumentId(0);
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        let indexed_pos = IndexedPos(0);
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        indexer.index_text(docid, indexed_pos, text);
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        let Indexed {
 								            words_doc_indexes, ..
 								        } = indexer.build();
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
 								        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        assert!(words_doc_indexes
-												Update the tests

											
										
										
											2019-11-04 16:10:31 +01:00
+								            .get(&"éteindre".to_owned().into_bytes())
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								            .is_some());
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    }
 								    #[test]
 								    fn strange_apostrophe_in_sequence() {
-												Make the RawIndexer support stop words

											
										
										
											2019-10-29 15:53:45 +01:00
+								        let mut indexer = RawIndexer::new(fst::Set::default());
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
 								        let docid = DocumentId(0);
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        let indexed_pos = IndexedPos(0);
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        indexer.index_text_seq(docid, indexed_pos, text);
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        let Indexed {
 								            words_doc_indexes, ..
 								        } = indexer.build();
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
 								        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								        assert!(words_doc_indexes
-												Update the tests

											
										
										
											2019-11-04 16:10:31 +01:00
+								            .get(&"éteindre".to_owned().into_bytes())
-												Cargo fmt pass

											
										
										
											2019-10-18 13:05:28 +02:00
+								            .is_some());
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								    }
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
 								    #[test]
 								    fn basic_stop_words() {
 								        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
 								        let stop_words = fst::Set::from_iter(stop_words).unwrap();
 								        let mut indexer = RawIndexer::new(stop_words);
 								        let docid = DocumentId(0);
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        let indexed_pos = IndexedPos(0);
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
+								        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        indexer.index_text(docid, indexed_pos, text);
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
 								        let Indexed {
 								            words_doc_indexes, ..
 								        } = indexer.build();
 								        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
 								        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
 								        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
 								        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
 								        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
 								        assert!(words_doc_indexes
-												Update the tests

											
										
										
											2019-11-04 16:10:31 +01:00
+								            .get(&"éteindre".to_owned().into_bytes())
-												Add a test to ensure that the indexer support stop words

											
										
										
											2019-10-29 16:04:48 +01:00
+								            .is_some());
 								    }
-												Check that the unidecoded words are not empty

											
										
										
											2019-11-04 16:58:02 +01:00
 								    #[test]
 								    fn no_empty_unidecode() {
 								        let mut indexer = RawIndexer::new(fst::Set::default());
 								        let docid = DocumentId(0);
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        let indexed_pos = IndexedPos(0);
-												Check that the unidecoded words are not empty

											
										
										
											2019-11-04 16:58:02 +01:00
+								        let text = "🇯🇵";
-												introduce a new schemaless way

											
										
										
											2020-01-13 19:10:58 +01:00
+								        indexer.index_text(docid, indexed_pos, text);
-												Check that the unidecoded words are not empty

											
										
										
											2019-11-04 16:58:02 +01:00
 								        let Indexed {
 								            words_doc_indexes, ..
 								        } = indexer.build();
 								        assert!(words_doc_indexes
 								            .get(&"🇯🇵".to_owned().into_bytes())
 								            .is_some());
 								    }
-												Introduce a basically working rkv based MeiliDB

											
										
										
											2019-10-02 17:34:32 +02:00
+								}