mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-03-29 11:00:39 +01:00

The product_id is an id that identify each group variant (e.g. color, size...), it is not useful for the relevance for the moment.
116 lines
3.7 KiB
Rust
116 lines
3.7 KiB
Rust
// TODO make the raptor binary expose multiple subcommand
|
|
// make only one binary
|
|
|
|
#[macro_use] extern crate serde_derive;
|
|
|
|
use std::path::Path;
|
|
use std::collections::{HashSet, BTreeMap};
|
|
use std::io::{self, BufReader, BufRead};
|
|
use std::fs::File;
|
|
use std::{env, iter};
|
|
|
|
use raptor::{MetadataBuilder, DocIndex};
|
|
use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
|
|
use serde_json::from_str;
|
|
use unidecode::unidecode;
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct Product {
|
|
title: String,
|
|
group_id: u64,
|
|
ft: String,
|
|
}
|
|
|
|
type CommonWords = HashSet<String>;
|
|
|
|
fn common_words<P>(path: P) -> io::Result<CommonWords>
|
|
where P: AsRef<Path>,
|
|
{
|
|
let file = File::open(path)?;
|
|
let file = BufReader::new(file);
|
|
let mut set = HashSet::new();
|
|
for line in file.lines().filter_map(|l| l.ok()) {
|
|
for word in line.split_whitespace() {
|
|
set.insert(word.to_owned());
|
|
}
|
|
}
|
|
Ok(set)
|
|
}
|
|
|
|
fn main() {
|
|
let data_path = env::args().nth(1).expect("Missing data json lines file (e.g. products.json_lines)");
|
|
let data = File::open(data_path).unwrap();
|
|
let data = BufReader::new(data);
|
|
|
|
let common_path = "fr.stopwords.txt";
|
|
let common_words = common_words(common_path).unwrap_or_else(|e| {
|
|
println!("{:?}: {:?}", common_path, e);
|
|
HashSet::new()
|
|
});
|
|
|
|
// TODO add a subcommand to pack these files in a tar.xxx archive
|
|
let random_name = moby_name_gen::random_name();
|
|
let map_file = format!("{}.map", random_name);
|
|
let idx_file = format!("{}.idx", random_name);
|
|
let sst_file = format!("{}.sst", random_name);
|
|
|
|
let env_options = EnvOptions::new();
|
|
let cf_options = ColumnFamilyOptions::new();
|
|
let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
|
|
sst_file_writer.open(&sst_file).expect("open the sst file");
|
|
|
|
let map = File::create(&map_file).unwrap();
|
|
let indexes = File::create(&idx_file).unwrap();
|
|
let mut builder = MetadataBuilder::new(map, indexes);
|
|
let mut fields = BTreeMap::new();
|
|
|
|
for line in data.lines() {
|
|
let line = line.unwrap();
|
|
|
|
let product: Product = from_str(&line).unwrap();
|
|
|
|
{
|
|
let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
|
|
let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
|
|
|
|
let words = title.chain(description);
|
|
for (i, (attr, word)) in words {
|
|
let doc_index = DocIndex {
|
|
document: product.group_id,
|
|
attribute: attr,
|
|
attribute_index: i as u32,
|
|
};
|
|
// insert the exact representation
|
|
let word_lower = word.to_lowercase();
|
|
|
|
// and the unidecoded lowercased version
|
|
let word_unidecoded = unidecode(word).to_lowercase();
|
|
if word_lower != word_unidecoded {
|
|
builder.insert(word_unidecoded, doc_index);
|
|
}
|
|
|
|
builder.insert(word_lower, doc_index);
|
|
}
|
|
}
|
|
|
|
// TODO simplify this by using functions and
|
|
// use the MetadataBuilder internal BTreeMap ?
|
|
let key = format!("{}-title", product.group_id);
|
|
let value = product.title;
|
|
fields.insert(key, value);
|
|
|
|
let key = format!("{}-description", product.group_id);
|
|
let value = product.ft;
|
|
fields.insert(key, value);
|
|
}
|
|
|
|
for (key, value) in fields {
|
|
sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
|
|
}
|
|
let _sst_file_info = sst_file_writer.finish().unwrap();
|
|
|
|
builder.finish().unwrap();
|
|
|
|
println!("Succesfully created {:?} dump.", random_name);
|
|
}
|