mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 12:54:26 +01:00
chore: Rename bin into indexer
This commit is contained in:
parent
d210e5d8db
commit
879e28fb7d
72
Cargo.lock
generated
72
Cargo.lock
generated
@ -1,15 +1,15 @@
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.0.0"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.2.2"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
@ -17,7 +17,7 @@ name = "fst"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33"
|
||||
dependencies = [
|
||||
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
@ -28,15 +28,15 @@ source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ec
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein_automata"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77"
|
||||
version = "0.1.1"
|
||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#9d01a14e57ded8e7a9a8d2b4e790f7b364e710b4"
|
||||
dependencies = [
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.40"
|
||||
version = "0.2.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
@ -44,13 +44,13 @@ name = "memmap"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "0.3.8"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -58,46 +58,46 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "0.5.2"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raptor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
|
||||
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
|
||||
"levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
|
||||
"serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.54"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.54"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "0.13.9"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
@ -108,7 +108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -126,19 +126,19 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[metadata]
|
||||
"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0"
|
||||
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
|
||||
"checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7"
|
||||
"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9"
|
||||
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>"
|
||||
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
|
||||
"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
|
||||
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
|
||||
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
|
||||
"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1"
|
||||
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
|
||||
"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7"
|
||||
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
|
||||
"checksum serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "db9c1726bdebaed7ac8afb7028672e068e12cf1b0b97cddd742a3a7939159699"
|
||||
"checksum serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5121751b76f5a2e6f51b4c0d07976f4f04e33ae7a981467c2845e7cd4b67a114"
|
||||
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
|
||||
"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6"
|
||||
"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035"
|
||||
"checksum serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "e9a2d9a9ac5120e0f768801ca2b58ad6eec929dc9d1d616c162f208869c2ce95"
|
||||
"checksum serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "0a90213fa7e0f5eac3f7afe2d5ff6b088af515052cc7303bd68c7e3b91a3fb79"
|
||||
"checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d"
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
|
||||
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
|
||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
@ -87,13 +87,14 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raptor-bin"
|
||||
name = "raptor-indexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"raptor 0.1.0",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -136,6 +137,11 @@ name = "unicode-xid"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "unidecode"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.4"
|
||||
@ -172,6 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f3ad6d546e765177cf3dded3c2e424a8040f870083a0e64064746b958ece9cb1"
|
||||
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
|
||||
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
|
||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "raptor-bin"
|
||||
name = "raptor-indexer"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
|
||||
@ -8,6 +8,8 @@ raptor = { path = ".." }
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
unidecode = "0.3"
|
||||
|
||||
[profile.release]
|
||||
# lto = true
|
||||
debug = true
|
||||
lto = true
|
@ -3,14 +3,33 @@
|
||||
|
||||
extern crate raptor;
|
||||
extern crate serde_json;
|
||||
#[macro_use] extern crate serde_derive;
|
||||
extern crate unidecode;
|
||||
|
||||
use std::path::Path;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, BufRead};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufReader, BufRead};
|
||||
use std::iter;
|
||||
|
||||
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
|
||||
use serde_json::from_str;
|
||||
use unidecode::unidecode;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Product {
|
||||
title: String,
|
||||
product_id: u64,
|
||||
ft: String,
|
||||
}
|
||||
|
||||
fn set_readonly<P>(path: P, readonly: bool) -> io::Result<()>
|
||||
where P: AsRef<Path>
|
||||
{
|
||||
let mut perms = fs::metadata(&path)?.permissions();
|
||||
perms.set_readonly(readonly);
|
||||
fs::set_permissions(&path, perms)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let data = File::open("products.json_lines").unwrap();
|
||||
@ -39,31 +58,40 @@ fn main() {
|
||||
for line in data.lines() {
|
||||
let line = line.unwrap();
|
||||
|
||||
let product: serde_json::Value = from_str(&line).unwrap();
|
||||
let product: Product = from_str(&line).unwrap();
|
||||
|
||||
// TODO use a real tokenizer
|
||||
let title = iter::repeat(0).zip(product["title"].as_str().expect("invalid `title`").split_whitespace())
|
||||
.filter(|(_, s)| !common_words.contains(*s))
|
||||
.enumerate();
|
||||
let description = iter::repeat(1).zip(product["ft"].as_str().expect("invalid `ft`").split_whitespace())
|
||||
.filter(|(_, s)| !common_words.contains(*s))
|
||||
.enumerate();
|
||||
let title = iter::repeat(0).zip(product.title.split_whitespace()).enumerate();
|
||||
let description = iter::repeat(1).zip(product.ft.split_whitespace()).enumerate();
|
||||
|
||||
let words = title.chain(description);
|
||||
for (i, (attr, word)) in words {
|
||||
if common_words.contains(word) { continue }
|
||||
let doc_index = DocIndex {
|
||||
document: product["product_id"].as_u64().expect("invalid `product_id`"),
|
||||
document: product.product_id,
|
||||
attribute: attr,
|
||||
attribute_index: i as u32,
|
||||
};
|
||||
builder.insert(word.to_lowercase(), doc_index);
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
builder.insert(word_lower, doc_index);
|
||||
}
|
||||
}
|
||||
|
||||
let map = File::create("map.fst").unwrap();
|
||||
let values = File::create("values.vecs").unwrap();
|
||||
|
||||
let (map, values) = builder.build(map, values).unwrap();
|
||||
|
||||
set_readonly("map.fst", true).unwrap();
|
||||
set_readonly("values.vecs", true).unwrap();
|
||||
|
||||
println!("Checking the dump consistency...");
|
||||
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };
|
||||
}
|
@ -27,8 +27,13 @@ fn main() {
|
||||
automatons.push(lev);
|
||||
}
|
||||
|
||||
let mut limit: Option<usize> = env::var("RAPTOR_OUTPUT_LIMIT").ok().and_then(|x| x.parse().ok());
|
||||
let mut stream = RankedStream::new(&map, map.values(), automatons);
|
||||
while let Some(document_id) = stream.next() {
|
||||
if limit == Some(0) { println!("..."); break }
|
||||
|
||||
println!("{:?}", document_id);
|
||||
|
||||
if let Some(ref mut limit) = limit { *limit -= 1 }
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user