mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-23 05:00:06 +01:00
cli: Make indexing of JSON lines work
This commit is contained in:
parent
820f38bd8a
commit
2fa7178ed1
14
Cargo.lock
generated
14
Cargo.lock
generated
@ -6,6 +6,15 @@ dependencies = [
|
|||||||
"nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
"nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bincode"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
@ -294,6 +303,7 @@ dependencies = [
|
|||||||
name = "raptor"
|
name = "raptor"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -376,6 +386,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
name = "smallvec"
|
name = "smallvec"
|
||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
@ -625,6 +638,7 @@ dependencies = [
|
|||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
"checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef"
|
"checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef"
|
||||||
|
"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0"
|
||||||
"checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf"
|
"checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf"
|
||||||
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
|
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
|
||||||
"checksum bytes 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1b7db437d718977f6dc9b2e3fd6fc343c02ac6b899b73fdd2179163447bd9ce9"
|
"checksum bytes 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1b7db437d718977f6dc9b2e3fd6fc343c02ac6b899b73fdd2179163447bd9ce9"
|
||||||
|
@ -4,6 +4,7 @@ version = "0.1.0"
|
|||||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
bincode = "1.0"
|
||||||
env_logger = { version = "0.3", default-features = false }
|
env_logger = { version = "0.3", default-features = false }
|
||||||
fst = "0.3"
|
fst = "0.3"
|
||||||
futures = "0.1"
|
futures = "0.1"
|
||||||
@ -13,5 +14,5 @@ tokio-service = "0.1"
|
|||||||
serde = "1.0"
|
serde = "1.0"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
serde_derive = "1.0"
|
serde_derive = "1.0"
|
||||||
smallvec = "0.6"
|
smallvec = { version = "0.6", features = ["serde"] }
|
||||||
url = "1.7"
|
url = "1.7"
|
||||||
|
54
src/bin/raptor-cli.rs
Normal file
54
src/bin/raptor-cli.rs
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
// TODO make the raptor binary expose multiple subcommands
|
||||||
|
// make only one binary
|
||||||
|
|
||||||
|
extern crate fst;
|
||||||
|
extern crate raptor;
|
||||||
|
extern crate serde_json;
|
||||||
|
#[macro_use] extern crate serde_derive;
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufReader, BufRead};
|
||||||
|
|
||||||
|
use fst::Streamer;
|
||||||
|
use serde_json::from_str;
|
||||||
|
|
||||||
|
use raptor::{MultiMapBuilder, MultiMap};
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
struct Product {
|
||||||
|
product_id: u64,
|
||||||
|
title: String,
|
||||||
|
ft: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let data = File::open("products.json_lines").unwrap();
|
||||||
|
let data = BufReader::new(data);
|
||||||
|
|
||||||
|
let mut builder = MultiMapBuilder::new();
|
||||||
|
for line in data.lines() {
|
||||||
|
let line = line.unwrap();
|
||||||
|
|
||||||
|
let product: Product = from_str(&line).unwrap();
|
||||||
|
|
||||||
|
// TODO filter words here !!!
|
||||||
|
let title = product.title.split_whitespace();
|
||||||
|
let description = product.ft.split_whitespace();
|
||||||
|
let words = title.chain(description);
|
||||||
|
|
||||||
|
for word in words {
|
||||||
|
builder.insert(word, product.product_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let map = File::create("map.fst").unwrap();
|
||||||
|
let values = File::create("values.vecs").unwrap();
|
||||||
|
let (map, values) = builder.build(map, values).unwrap();
|
||||||
|
|
||||||
|
let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() };
|
||||||
|
|
||||||
|
let mut stream = map.stream();
|
||||||
|
while let Some(x) = stream.next() {
|
||||||
|
println!("{:?}", x);
|
||||||
|
}
|
||||||
|
}
|
80
src/lib.rs
80
src/lib.rs
@ -1,3 +1,4 @@
|
|||||||
|
extern crate bincode;
|
||||||
extern crate fst;
|
extern crate fst;
|
||||||
extern crate serde;
|
extern crate serde;
|
||||||
extern crate serde_json;
|
extern crate serde_json;
|
||||||
@ -5,19 +6,15 @@ extern crate serde_json;
|
|||||||
extern crate smallvec;
|
extern crate smallvec;
|
||||||
|
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::str::from_utf8_unchecked;
|
||||||
|
|
||||||
pub use fst::MapBuilder;
|
pub use fst::MapBuilder;
|
||||||
use smallvec::SmallVec;
|
use smallvec::SmallVec;
|
||||||
|
|
||||||
type SmallVec16<T> = SmallVec<[T; 16]>;
|
type SmallVec16<T> = SmallVec<[T; 16]>;
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
|
||||||
struct Product<'a> {
|
|
||||||
product_id: u64,
|
|
||||||
title: &'a str,
|
|
||||||
ft: &'a str,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct MultiMap {
|
pub struct MultiMap {
|
||||||
map: fst::Map,
|
map: fst::Map,
|
||||||
@ -25,6 +22,27 @@ pub struct MultiMap {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl MultiMap {
|
impl MultiMap {
|
||||||
|
pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<MultiMap>
|
||||||
|
where
|
||||||
|
P: AsRef<Path>,
|
||||||
|
Q: AsRef<Path>
|
||||||
|
{
|
||||||
|
let map = fst::Map::from_path(map)?;
|
||||||
|
|
||||||
|
// TODO handle error !!!
|
||||||
|
let values_file = File::open(values).unwrap();
|
||||||
|
let values = bincode::deserialize_from(values_file).unwrap();
|
||||||
|
|
||||||
|
Ok(MultiMap { map, values })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stream(&self) -> Stream {
|
||||||
|
Stream {
|
||||||
|
inner: self.map.stream(),
|
||||||
|
values: &self.values,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
|
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
|
||||||
self.map.contains_key(key)
|
self.map.contains_key(key)
|
||||||
}
|
}
|
||||||
@ -34,22 +52,43 @@ impl MultiMap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> {
|
||||||
|
inner: fst::map::Stream<'a, A>,
|
||||||
|
values: &'a [SmallVec16<u64>],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> {
|
||||||
|
type Item = (&'a str, &'a [u64]);
|
||||||
|
|
||||||
|
fn next(&'a mut self) -> Option<Self::Item> {
|
||||||
|
// Here we can't just `map` because of some borrow rules
|
||||||
|
match self.inner.next() {
|
||||||
|
Some((key, i)) => {
|
||||||
|
let key = unsafe { from_utf8_unchecked(key) };
|
||||||
|
Some((key, &*self.values[i as usize]))
|
||||||
|
},
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct MultiMapBuilder<'a> {
|
pub struct MultiMapBuilder {
|
||||||
map: Vec<(&'a str, u64)>,
|
map: Vec<(String, u64)>,
|
||||||
values: Vec<SmallVec16<u64>>,
|
values: Vec<SmallVec16<u64>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> MultiMapBuilder<'a> {
|
impl<'a> MultiMapBuilder {
|
||||||
pub fn new() -> MultiMapBuilder<'a> {
|
pub fn new() -> MultiMapBuilder {
|
||||||
MultiMapBuilder {
|
MultiMapBuilder {
|
||||||
map: Vec::new(),
|
map: Vec::new(),
|
||||||
values: Vec::new(),
|
values: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn insert(&mut self, key: &'a str, value: u64) {
|
pub fn insert<S: Into<String>>(&mut self, key: S, value: u64) {
|
||||||
match self.map.binary_search_by_key(&key, |&(k, _)| k) {
|
let key = key.into();
|
||||||
|
match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) {
|
||||||
Ok(index) => {
|
Ok(index) => {
|
||||||
let (_, index) = self.map[index];
|
let (_, index) = self.map[index];
|
||||||
let values = &mut self.values[index as usize];
|
let values = &mut self.values[index as usize];
|
||||||
@ -78,4 +117,19 @@ impl<'a> MultiMapBuilder<'a> {
|
|||||||
values: self.values.into_boxed_slice(),
|
values: self.values.into_boxed_slice(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
|
||||||
|
where
|
||||||
|
W: Write,
|
||||||
|
X: Write
|
||||||
|
{
|
||||||
|
let mut builder = MapBuilder::new(map_wrt)?;
|
||||||
|
builder.extend_iter(self.map)?;
|
||||||
|
let map = builder.into_inner()?;
|
||||||
|
|
||||||
|
// TODO handle that !!!
|
||||||
|
bincode::serialize_into(&mut values_wrt, &self.values).unwrap();
|
||||||
|
|
||||||
|
Ok((map, values_wrt))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user