From 879e28fb7d105ed6f5c166f8f4eb6ecf6a94f03e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 24 Jun 2018 01:28:27 +0200 Subject: [PATCH] chore: Rename bin into indexer --- Cargo.lock | 72 +++++++++++----------- {raptor-bin => raptor-indexer}/.gitignore | 0 {raptor-bin => raptor-indexer}/Cargo.lock | 9 ++- {raptor-bin => raptor-indexer}/Cargo.toml | 6 +- {raptor-bin => raptor-indexer}/src/main.rs | 52 ++++++++++++---- raptor-search/src/main.rs | 5 ++ 6 files changed, 93 insertions(+), 51 deletions(-) rename {raptor-bin => raptor-indexer}/.gitignore (100%) rename {raptor-bin => raptor-indexer}/Cargo.lock (95%) rename {raptor-bin => raptor-indexer}/Cargo.toml (74%) rename {raptor-bin => raptor-indexer}/src/main.rs (54%) diff --git a/Cargo.lock b/Cargo.lock index 4be02d973..19ce15488 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,15 +1,15 @@ [[package]] name = "bincode" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "byteorder" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -17,7 +17,7 @@ name = "fst" version = "0.3.0" source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33" dependencies = [ - "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -28,15 +28,15 @@ source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ec [[package]] name = 
"levenshtein_automata" -version = "0.1.0" -source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77" +version = "0.1.1" +source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#9d01a14e57ded8e7a9a8d2b4e790f7b364e710b4" dependencies = [ "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", ] [[package]] name = "libc" -version = "0.2.40" +version = "0.2.42" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -44,13 +44,13 @@ name = "memmap" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "proc-macro2" -version = "0.3.8" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -58,46 +58,46 @@ dependencies = [ [[package]] name = "quote" -version = "0.5.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "raptor" version = "0.1.0" dependencies = [ - "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", - "levenshtein_automata 0.1.0 
(git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", - "serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", + "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", + "serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "serde" -version = "1.0.54" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde_derive" -version = "1.0.54" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "syn" -version = "0.13.9" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -108,7 +108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "winapi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
dependencies = [ "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -126,19 +126,19 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] -"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0" -"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87" +"checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7" +"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" -"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" -"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" +"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" +"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" -"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7" -"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = 
"9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8" -"checksum serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "db9c1726bdebaed7ac8afb7028672e068e12cf1b0b97cddd742a3a7939159699" -"checksum serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5121751b76f5a2e6f51b4c0d07976f4f04e33ae7a981467c2845e7cd4b67a114" -"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038" +"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" +"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" +"checksum serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "e9a2d9a9ac5120e0f768801ca2b58ad6eec929dc9d1d616c162f208869c2ce95" +"checksum serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "0a90213fa7e0f5eac3f7afe2d5ff6b088af515052cc7303bd68c7e3b91a3fb79" +"checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" -"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3" +"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/raptor-bin/.gitignore b/raptor-indexer/.gitignore similarity index 100% rename from raptor-bin/.gitignore rename to raptor-indexer/.gitignore diff --git a/raptor-bin/Cargo.lock b/raptor-indexer/Cargo.lock similarity index 95% rename from raptor-bin/Cargo.lock rename to raptor-indexer/Cargo.lock index 9cdc1b50b..a275da9de 100644 --- a/raptor-bin/Cargo.lock +++ b/raptor-indexer/Cargo.lock @@ -87,13 +87,14 @@ dependencies = [ ] [[package]] -name = "raptor-bin" +name = "raptor-indexer" version = "0.1.0" dependencies = [ "raptor 0.1.0", "serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -136,6 +137,11 @@ name = "unicode-xid" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unidecode" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.4" @@ -172,6 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f3ad6d546e765177cf3dded3c2e424a8040f870083a0e64064746b958ece9cb1" "checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" "checksum winapi 
0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/raptor-bin/Cargo.toml b/raptor-indexer/Cargo.toml similarity index 74% rename from raptor-bin/Cargo.toml rename to raptor-indexer/Cargo.toml index dde1a965d..f42a0a2f4 100644 --- a/raptor-bin/Cargo.toml +++ b/raptor-indexer/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "raptor-bin" +name = "raptor-indexer" version = "0.1.0" authors = ["Kerollmops "] @@ -8,6 +8,8 @@ raptor = { path = ".." } serde = "1.0" serde_derive = "1.0" serde_json = "1.0" +unidecode = "0.3" [profile.release] -# lto = true +debug = true +lto = true diff --git a/raptor-bin/src/main.rs b/raptor-indexer/src/main.rs similarity index 54% rename from raptor-bin/src/main.rs rename to raptor-indexer/src/main.rs index 306e372af..bc50f6894 100644 --- a/raptor-bin/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -3,14 +3,33 @@ extern crate raptor; extern crate serde_json; +#[macro_use] extern crate serde_derive; +extern crate unidecode; +use std::path::Path; use std::collections::HashSet; -use std::fs::File; -use std::io::{BufReader, BufRead}; +use std::fs::{self, File}; +use std::io::{self, BufReader, BufRead}; use std::iter; use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex}; use serde_json::from_str; +use unidecode::unidecode; + +#[derive(Debug, Deserialize)] +struct Product { + title: String, + product_id: u64, + ft: String, +} + +fn set_readonly

<P>(path: P, readonly: bool) -> io::Result<()> +where P: AsRef<Path> +{ + let mut perms = fs::metadata(&path)?.permissions(); + perms.set_readonly(readonly); + fs::set_permissions(&path, perms) +} fn main() { let data = File::open("products.json_lines").unwrap(); @@ -39,31 +58,40 @@ fn main() { for line in data.lines() { let line = line.unwrap(); - let product: serde_json::Value = from_str(&line).unwrap(); + let product: Product = from_str(&line).unwrap(); - // TODO use a real tokenizer - let title = iter::repeat(0).zip(product["title"].as_str().expect("invalid `title`").split_whitespace()) - .filter(|(_, s)| !common_words.contains(*s)) - .enumerate(); - let description = iter::repeat(1).zip(product["ft"].as_str().expect("invalid `ft`").split_whitespace()) - .filter(|(_, s)| !common_words.contains(*s)) - .enumerate(); + let title = iter::repeat(0).zip(product.title.split_whitespace()).enumerate(); + let description = iter::repeat(1).zip(product.ft.split_whitespace()).enumerate(); let words = title.chain(description); for (i, (attr, word)) in words { + if common_words.contains(word) { continue } let doc_index = DocIndex { - document: product["product_id"].as_u64().expect("invalid `product_id`"), + document: product.product_id, attribute: attr, attribute_index: i as u32, }; - builder.insert(word.to_lowercase(), doc_index); + // insert the exact representation + let word_lower = word.to_lowercase(); + + // and the unidecoded lowercased version + let word_unidecoded = unidecode(word).to_lowercase(); + if word_lower != word_unidecoded { + builder.insert(word_unidecoded, doc_index); + } + + builder.insert(word_lower, doc_index); } } let map = File::create("map.fst").unwrap(); let values = File::create("values.vecs").unwrap(); + let (map, values) = builder.build(map, values).unwrap(); + set_readonly("map.fst", true).unwrap(); + set_readonly("values.vecs", true).unwrap(); + println!("Checking the dump consistency..."); unsafe { DocIndexMap::from_paths("map.fst", 
"values.vecs").unwrap() }; } diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index caa118b8a..6e987e2f1 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -27,8 +27,13 @@ fn main() { automatons.push(lev); } + let mut limit: Option = env::var("RAPTOR_OUTPUT_LIMIT").ok().and_then(|x| x.parse().ok()); let mut stream = RankedStream::new(&map, map.values(), automatons); while let Some(document_id) = stream.next() { + if limit == Some(0) { println!("..."); break } + println!("{:?}", document_id); + + if let Some(ref mut limit) = limit { *limit -= 1 } } }