2018-04-22 17:34:41 +02:00
|
|
|
// TODO make the raptor binary expose multiple subcommands
|
|
|
|
// make only one binary
|
|
|
|
|
|
|
|
extern crate raptor;
|
|
|
|
extern crate serde_json;
|
|
|
|
|
2018-04-22 18:10:01 +02:00
|
|
|
use std::collections::HashSet;
|
2018-04-22 17:34:41 +02:00
|
|
|
use std::fs::File;
|
|
|
|
use std::io::{BufReader, BufRead};
|
2018-05-13 15:12:15 +02:00
|
|
|
use std::iter;
|
2018-04-22 17:34:41 +02:00
|
|
|
|
2018-05-27 11:15:05 +02:00
|
|
|
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
|
2018-04-22 17:34:41 +02:00
|
|
|
use serde_json::from_str;
|
|
|
|
|
|
|
|
fn main() {
|
|
|
|
let data = File::open("products.json_lines").unwrap();
|
|
|
|
let data = BufReader::new(data);
|
|
|
|
|
2018-04-22 18:10:01 +02:00
|
|
|
let common_words = {
|
2018-05-13 15:12:15 +02:00
|
|
|
match File::open("fr.stopwords.txt") {
|
|
|
|
Ok(file) => {
|
|
|
|
let file = BufReader::new(file);
|
|
|
|
let mut set = HashSet::new();
|
|
|
|
for line in file.lines().filter_map(|l| l.ok()) {
|
|
|
|
for word in line.split_whitespace() {
|
|
|
|
set.insert(word.to_owned());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
set
|
|
|
|
},
|
|
|
|
Err(e) => {
|
|
|
|
eprintln!("{:?}", e);
|
|
|
|
HashSet::new()
|
|
|
|
},
|
2018-04-22 18:10:01 +02:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2018-05-27 11:15:05 +02:00
|
|
|
let mut builder = DocIndexMapBuilder::new();
|
2018-04-22 17:34:41 +02:00
|
|
|
for line in data.lines() {
|
|
|
|
let line = line.unwrap();
|
|
|
|
|
2018-05-13 15:12:15 +02:00
|
|
|
let product: serde_json::Value = from_str(&line).unwrap();
|
2018-04-22 17:34:41 +02:00
|
|
|
|
2018-05-13 15:12:15 +02:00
|
|
|
// TODO use a real tokenizer
|
|
|
|
let title = iter::repeat(0).zip(product["title"].as_str().expect("invalid `title`").split_whitespace())
|
|
|
|
.filter(|(_, s)| !common_words.contains(*s))
|
|
|
|
.enumerate();
|
|
|
|
let description = iter::repeat(1).zip(product["ft"].as_str().expect("invalid `ft`").split_whitespace())
|
|
|
|
.filter(|(_, s)| !common_words.contains(*s))
|
|
|
|
.enumerate();
|
2018-04-22 17:34:41 +02:00
|
|
|
|
2018-05-13 15:12:15 +02:00
|
|
|
let words = title.chain(description);
|
|
|
|
for (i, (attr, word)) in words {
|
2018-05-27 11:15:05 +02:00
|
|
|
let doc_index = DocIndex {
|
|
|
|
document: product["product_id"].as_u64().expect("invalid `product_id`"),
|
|
|
|
attribute: attr,
|
|
|
|
attribute_index: i as u32,
|
2018-05-13 15:12:15 +02:00
|
|
|
};
|
2018-05-27 15:23:43 +02:00
|
|
|
builder.insert(word.to_lowercase(), doc_index);
|
2018-04-22 17:34:41 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let map = File::create("map.fst").unwrap();
|
|
|
|
let values = File::create("values.vecs").unwrap();
|
|
|
|
let (map, values) = builder.build(map, values).unwrap();
|
|
|
|
|
2018-05-27 11:15:05 +02:00
|
|
|
println!("Checking the dump consistency...");
|
|
|
|
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };
|
2018-04-22 17:34:41 +02:00
|
|
|
}
|