chore: Rename bin into indexer

This commit is contained in:
Kerollmops 2018-06-24 01:28:27 +02:00 committed by Clément Renault
parent d210e5d8db
commit 879e28fb7d
6 changed files with 93 additions and 51 deletions

72
Cargo.lock generated
View File

@ -1,15 +1,15 @@
[[package]]
name = "bincode"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "byteorder"
version = "1.2.2"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -17,7 +17,7 @@ name = "fst"
version = "0.3.0"
source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33"
dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
@ -28,15 +28,15 @@ source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ec
[[package]]
name = "levenshtein_automata"
version = "0.1.0"
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77"
version = "0.1.1"
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#9d01a14e57ded8e7a9a8d2b4e790f7b364e710b4"
dependencies = [
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
]
[[package]]
name = "libc"
version = "0.2.40"
version = "0.2.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -44,13 +44,13 @@ name = "memmap"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "proc-macro2"
version = "0.3.8"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -58,46 +58,46 @@ dependencies = [
[[package]]
name = "quote"
version = "0.5.2"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "raptor"
version = "0.1.0"
dependencies = [
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
"levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
"serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde"
version = "1.0.54"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde_derive"
version = "1.0.54"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "syn"
version = "0.13.9"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
@ -108,7 +108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.3.4"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -126,19 +126,19 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0"
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
"checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7"
"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9"
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>"
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1"
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7"
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
"checksum serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "db9c1726bdebaed7ac8afb7028672e068e12cf1b0b97cddd742a3a7939159699"
"checksum serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5121751b76f5a2e6f51b4c0d07976f4f04e33ae7a981467c2845e7cd4b67a114"
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6"
"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035"
"checksum serde 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "e9a2d9a9ac5120e0f768801ca2b58ad6eec929dc9d1d616c162f208869c2ce95"
"checksum serde_derive 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "0a90213fa7e0f5eac3f7afe2d5ff6b088af515052cc7303bd68c7e3b91a3fb79"
"checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@ -87,13 +87,14 @@ dependencies = [
]
[[package]]
name = "raptor-bin"
name = "raptor-indexer"
version = "0.1.0"
dependencies = [
"raptor 0.1.0",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -136,6 +137,11 @@ name = "unicode-xid"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unidecode"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.3.4"
@ -172,6 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f3ad6d546e765177cf3dded3c2e424a8040f870083a0e64064746b958ece9cb1"
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@ -1,5 +1,5 @@
[package]
name = "raptor-bin"
name = "raptor-indexer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
@ -8,6 +8,8 @@ raptor = { path = ".." }
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
unidecode = "0.3"
[profile.release]
# lto = true
debug = true
lto = true

View File

@ -3,14 +3,33 @@
extern crate raptor;
extern crate serde_json;
#[macro_use] extern crate serde_derive;
extern crate unidecode;
use std::path::Path;
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufReader, BufRead};
use std::fs::{self, File};
use std::io::{self, BufReader, BufRead};
use std::iter;
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
use serde_json::from_str;
use unidecode::unidecode;
#[derive(Debug, Deserialize)]
struct Product {
title: String,
product_id: u64,
ft: String,
}
fn set_readonly<P>(path: P, readonly: bool) -> io::Result<()>
where P: AsRef<Path>
{
let mut perms = fs::metadata(&path)?.permissions();
perms.set_readonly(readonly);
fs::set_permissions(&path, perms)
}
fn main() {
let data = File::open("products.json_lines").unwrap();
@ -39,31 +58,40 @@ fn main() {
for line in data.lines() {
let line = line.unwrap();
let product: serde_json::Value = from_str(&line).unwrap();
let product: Product = from_str(&line).unwrap();
// TODO use a real tokenizer
let title = iter::repeat(0).zip(product["title"].as_str().expect("invalid `title`").split_whitespace())
.filter(|(_, s)| !common_words.contains(*s))
.enumerate();
let description = iter::repeat(1).zip(product["ft"].as_str().expect("invalid `ft`").split_whitespace())
.filter(|(_, s)| !common_words.contains(*s))
.enumerate();
let title = iter::repeat(0).zip(product.title.split_whitespace()).enumerate();
let description = iter::repeat(1).zip(product.ft.split_whitespace()).enumerate();
let words = title.chain(description);
for (i, (attr, word)) in words {
if common_words.contains(word) { continue }
let doc_index = DocIndex {
document: product["product_id"].as_u64().expect("invalid `product_id`"),
document: product.product_id,
attribute: attr,
attribute_index: i as u32,
};
builder.insert(word.to_lowercase(), doc_index);
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
builder.insert(word_unidecoded, doc_index);
}
builder.insert(word_lower, doc_index);
}
}
let map = File::create("map.fst").unwrap();
let values = File::create("values.vecs").unwrap();
let (map, values) = builder.build(map, values).unwrap();
set_readonly("map.fst", true).unwrap();
set_readonly("values.vecs", true).unwrap();
println!("Checking the dump consistency...");
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };
}

View File

@ -27,8 +27,13 @@ fn main() {
automatons.push(lev);
}
let mut limit: Option<usize> = env::var("RAPTOR_OUTPUT_LIMIT").ok().and_then(|x| x.parse().ok());
let mut stream = RankedStream::new(&map, map.values(), automatons);
while let Some(document_id) = stream.next() {
if limit == Some(0) { println!("..."); break }
println!("{:?}", document_id);
if let Some(ref mut limit) = limit { *limit -= 1 }
}
}