MeiliSearch/src/lib.rs

use std::collections::HashMap;
use std::hash::BuildHasherDefault;
use std::time::Instant;

use cow_utils::CowUtils;
use fst::{IntoStreamer, Streamer};
use fxhash::FxHasher32;
use heed::types::*;
use heed::{PolyDatabase, Database};
use levenshtein_automata::LevenshteinAutomatonBuilder;
use roaring::RoaringBitmap;
use slice_group_by::StrGroupBy;

/// A `HashMap` using the fast, non-cryptographic FxHasher32 hash function.
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
/// Strings and byte vectors stored inline when they fit in 32 bytes.
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
/// A big-endian `u32`, used as the key of the documents database.
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
pub type DocumentId = u32;

/// Splits the given string into contiguous groups of characters and only
/// yields the groups that start with an alphanumeric character.
pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
}
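
// Not part of the original file: a minimal sketch of what the tokenizer is
// expected to yield, assuming punctuation and whitespace are the only
// non-alphanumeric characters in the sample query.
#[cfg(test)]
mod alphanumeric_tokens_tests {
    use super::alphanumeric_tokens;

    #[test]
    fn groups_starting_with_alphanumeric_chars_are_kept() {
        // ", ", "-" and "!" form their own groups and are filtered out.
        let tokens: Vec<_> = alphanumeric_tokens("hello, world-42!").collect();
        assert_eq!(tokens, vec!["hello", "world", "42"]);
    }
}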

#[derive(Clone)]
pub struct Index {
    /// The untyped main database, storing index-wide values like the headers and the words FST.
    pub main: PolyDatabase,
    /// Maps a word to the serialized bitmap of the documents ids containing it.
    pub postings_ids: Database<Str, ByteSlice>,
    /// Maps a word prefix to the serialized bitmap of the documents ids containing this prefix.
    pub prefix_postings_ids: Database<Str, ByteSlice>,
    /// Maps a document id to the raw content of the document.
    pub documents: Database<OwnedType<BEU32>, ByteSlice>,
}

impl Index {
    pub fn new(env: &heed::Env) -> heed::Result<Index> {
        let main = env.create_poly_database(None)?;
        let postings_ids = env.create_database(Some("postings-ids"))?;
        let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
        let documents = env.create_database(Some("documents"))?;

        Ok(Index {
            main,
            postings_ids,
            prefix_postings_ids,
            documents,
        })
    }

    pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
        self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
    }

    pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
        let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
            Some(bytes) => fst::Set::new(bytes)?,
            None => return Ok(Vec::new()),
        };

        // Building these factories is not free.
        let lev0 = LevenshteinAutomatonBuilder::new(0, true);
        let lev1 = LevenshteinAutomatonBuilder::new(1, true);
        let lev2 = LevenshteinAutomatonBuilder::new(2, true);

        let words: Vec<_> = alphanumeric_tokens(query).collect();
        let number_of_words = words.len();

        // The number of typos tolerated grows with the length of the word and
        // the last word of the query is matched as a prefix.
        let dfas = words.into_iter().enumerate().map(|(i, word)| {
            let word = word.cow_to_lowercase();
            let is_last = i + 1 == number_of_words;
            let dfa = match word.len() {
                0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
                5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
                _ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
            };
            (word, dfa)
        });

        let mut intersect_result: Option<RoaringBitmap> = None;
        for (word, dfa) in dfas {
            let before = Instant::now();

            let mut union_result = RoaringBitmap::default();
            if word.len() <= 4 {
                // Short words tolerate zero typo: fetch the precomputed
                // prefix postings for this word directly.
                if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(4)])? {
                    union_result = RoaringBitmap::deserialize_from(ids)?;
                }
            } else {
                // Otherwise walk the words FST with the Levenshtein DFA and
                // union the postings of every word it accepts.
                let mut stream = fst.search(dfa).into_stream();
                while let Some(word) = stream.next() {
                    let word = std::str::from_utf8(word)?;
                    if let Some(ids) = self.postings_ids.get(rtxn, word)? {
                        let right = RoaringBitmap::deserialize_from(ids)?;
                        union_result.union_with(&right);
                    }
                }
                eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
            }

            // A document must contain every word of the query: intersect the
            // per-word unions as we go.
            intersect_result = match intersect_result.take() {
                Some(mut left) => {
                    let before = Instant::now();
                    let left_len = left.len();
                    left.intersect_with(&union_result);
                    eprintln!("intersect between {:?} and {:?} took {:.02?}",
                        left_len, union_result.len(), before.elapsed());
                    Some(left)
                },
                None => Some(union_result),
            };
        }

        Ok(intersect_result.unwrap_or_default().iter().take(20).collect())
    }
}
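
// Not part of the original file: a hedged end-to-end sketch of how the index
// is meant to be driven. It opens an LMDB environment in a temporary
// directory (the path and map size are arbitrary choices) and searches an
// empty index, which simply returns no document ids because no "words-fst"
// has been written to the main database yet.
#[cfg(test)]
mod index_usage_sketch {
    use super::Index;

    #[test]
    fn searching_an_empty_index_returns_nothing() -> anyhow::Result<()> {
        let path = std::env::temp_dir().join("mini-index-usage-sketch.mdb");
        std::fs::create_dir_all(&path)?;

        let env = heed::EnvOpenOptions::new()
            .map_size(10 * 1024 * 1024) // 10 MiB is plenty for an empty index
            .max_dbs(5)
            .open(&path)?;

        let index = Index::new(&env)?;
        let rtxn = env.read_txn()?;
        assert!(index.search(&rtxn, "hello world")?.is_empty());
        Ok(())
    }
}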