mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-01 17:15:46 +01:00
feat(search): Use an external Levenshtein automaton implementation
This commit is contained in:
parent
80c05a9b50
commit
2cb26e327c
27
Cargo.lock
generated
27
Cargo.lock
generated
@ -91,15 +91,6 @@ dependencies = [
|
|||||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fst-levenshtein"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
dependencies = [
|
|
||||||
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fuchsia-zircon"
|
name = "fuchsia-zircon"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
@ -167,6 +158,14 @@ name = "lazycell"
|
|||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "levenshtein_automata"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "git+https://github.com/tantivy-search/levenshtein-automata.git#ba2b62e3631593c408e2b9b8bb95c430384a331e"
|
||||||
|
dependencies = [
|
||||||
|
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.40"
|
version = "0.2.40"
|
||||||
@ -307,9 +306,9 @@ dependencies = [
|
|||||||
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)",
|
||||||
"serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -603,11 +602,6 @@ dependencies = [
|
|||||||
"percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "utf8-ranges"
|
|
||||||
version = "0.1.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.2.8"
|
version = "0.2.8"
|
||||||
@ -659,7 +653,6 @@ dependencies = [
|
|||||||
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
|
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
|
||||||
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
|
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
|
||||||
"checksum fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d94485a00b1827b861dd9d1a2cc9764f9044d4c535514c0760a5a2012ef3399f"
|
"checksum fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d94485a00b1827b861dd9d1a2cc9764f9044d4c535514c0760a5a2012ef3399f"
|
||||||
"checksum fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "64f12af1569dd78afbefe476034bbdce0372d18e9dc75b634bde0e7b8bf994c8"
|
|
||||||
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
||||||
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
||||||
"checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c"
|
"checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c"
|
||||||
@ -670,6 +663,7 @@ dependencies = [
|
|||||||
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
||||||
"checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d"
|
"checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d"
|
||||||
"checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef"
|
"checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef"
|
||||||
|
"checksum levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)" = "<none>"
|
||||||
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
|
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
|
||||||
"checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
|
"checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
|
||||||
"checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2"
|
"checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2"
|
||||||
@ -716,7 +710,6 @@ dependencies = [
|
|||||||
"checksum unicode-normalization 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "90d662d111b0dbb08a180f2761026cba648c258023c355954a7c00e00e354636"
|
"checksum unicode-normalization 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "90d662d111b0dbb08a180f2761026cba648c258023c355954a7c00e00e354636"
|
||||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||||
"checksum url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f808aadd8cfec6ef90e4a14eb46f24511824d1ac596b9682703c87056c8678b7"
|
"checksum url 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f808aadd8cfec6ef90e4a14eb46f24511824d1ac596b9682703c87056c8678b7"
|
||||||
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
|
|
||||||
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
|
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
|
||||||
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
|
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
|
||||||
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
|
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
|
||||||
|
@ -7,7 +7,7 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
|||||||
bincode = "1.0"
|
bincode = "1.0"
|
||||||
env_logger = { version = "0.3", default-features = false }
|
env_logger = { version = "0.3", default-features = false }
|
||||||
fst = "0.3"
|
fst = "0.3"
|
||||||
fst-levenshtein = "0.2"
|
levenshtein_automata = { git = "https://github.com/tantivy-search/levenshtein-automata.git", features = ["fst_automaton"] }
|
||||||
futures = "0.1"
|
futures = "0.1"
|
||||||
lazy_static = "1.0"
|
lazy_static = "1.0"
|
||||||
serde = "1.0"
|
serde = "1.0"
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
|
#[macro_use] extern crate lazy_static;
|
||||||
extern crate env_logger;
|
extern crate env_logger;
|
||||||
extern crate fst;
|
extern crate fst;
|
||||||
extern crate fst_levenshtein;
|
|
||||||
extern crate futures;
|
extern crate futures;
|
||||||
#[macro_use] extern crate lazy_static;
|
extern crate levenshtein_automata;
|
||||||
extern crate raptor;
|
extern crate raptor;
|
||||||
extern crate tokio_minihttp;
|
extern crate tokio_minihttp;
|
||||||
extern crate tokio_proto;
|
extern crate tokio_proto;
|
||||||
@ -14,8 +14,8 @@ use std::path::Path;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Read, BufReader};
|
use std::io::{Read, BufReader};
|
||||||
|
|
||||||
use fst_levenshtein::Levenshtein;
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
|
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||||
use futures::future;
|
use futures::future;
|
||||||
use tokio_minihttp::{Request, Response, Http};
|
use tokio_minihttp::{Request, Response, Http};
|
||||||
use tokio_proto::TcpServer;
|
use tokio_proto::TcpServer;
|
||||||
@ -30,10 +30,17 @@ lazy_static! {
|
|||||||
|
|
||||||
FstMap::from_bytes(map, &values).unwrap()
|
FstMap::from_bytes(map, &values).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static ref LEV_AUT_BLDR_0: LevenshteinAutomatonBuilder = LevenshteinAutomatonBuilder::new(0, false);
|
||||||
|
static ref LEV_AUT_BLDR_1: LevenshteinAutomatonBuilder = LevenshteinAutomatonBuilder::new(1, false);
|
||||||
|
static ref LEV_AUT_BLDR_2: LevenshteinAutomatonBuilder = LevenshteinAutomatonBuilder::new(2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct MainService {
|
struct MainService {
|
||||||
map: &'static FstMap<u64>,
|
map: &'static FstMap<u64>,
|
||||||
|
lev_aut_bldr_0: &'static LevenshteinAutomatonBuilder,
|
||||||
|
lev_aut_bldr_1: &'static LevenshteinAutomatonBuilder,
|
||||||
|
lev_aut_bldr_2: &'static LevenshteinAutomatonBuilder,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn construct_body<'f, S>(mut stream: S) -> String
|
fn construct_body<'f, S>(mut stream: S) -> String
|
||||||
@ -71,14 +78,12 @@ impl Service for MainService {
|
|||||||
if let Some((_, key)) = url.query_pairs().find(|&(ref k, _)| k == "q") {
|
if let Some((_, key)) = url.query_pairs().find(|&(ref k, _)| k == "q") {
|
||||||
let key = key.to_lowercase();
|
let key = key.to_lowercase();
|
||||||
|
|
||||||
// TODO prefer using the `tantivy-search/levenshtein-automata` instead
|
|
||||||
let lev = if key.len() <= 4 {
|
let lev = if key.len() <= 4 {
|
||||||
// TODO prefer using AlwaysMatch with max_len ?
|
self.lev_aut_bldr_0.build_dfa(&key)
|
||||||
Levenshtein::new(&key, 0).unwrap()
|
|
||||||
} else if key.len() <= 8 {
|
} else if key.len() <= 8 {
|
||||||
Levenshtein::new(&key, 1).unwrap()
|
self.lev_aut_bldr_1.build_dfa(&key)
|
||||||
} else {
|
} else {
|
||||||
Levenshtein::new(&key, 2).unwrap()
|
self.lev_aut_bldr_2.build_dfa(&key)
|
||||||
};
|
};
|
||||||
|
|
||||||
let stream = self.map.search(lev).into_stream();
|
let stream = self.map.search(lev).into_stream();
|
||||||
@ -105,5 +110,10 @@ fn main() {
|
|||||||
drop(env_logger::init());
|
drop(env_logger::init());
|
||||||
let addr = "0.0.0.0:8080".parse().unwrap();
|
let addr = "0.0.0.0:8080".parse().unwrap();
|
||||||
|
|
||||||
TcpServer::new(Http, addr).serve(|| Ok(MainService { map: &MAP }))
|
TcpServer::new(Http, addr).serve(|| Ok(MainService {
|
||||||
|
map: &MAP,
|
||||||
|
lev_aut_bldr_0: &LEV_AUT_BLDR_0,
|
||||||
|
lev_aut_bldr_1: &LEV_AUT_BLDR_1,
|
||||||
|
lev_aut_bldr_2: &LEV_AUT_BLDR_2,
|
||||||
|
}))
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user