feat: Introduce a way to retrieve distinct documents

This commit is contained in:
Clément Renault 2018-10-17 13:35:34 +02:00
parent 3acac1458f
commit 37c709c9a9
8 changed files with 167 additions and 84 deletions

View file

@ -117,14 +117,14 @@ impl CsvIndexer {
}
}
fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_index: u64, attr: u8, words: I)
fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I)
where A: io::Write,
B: io::Write,
I: IntoIterator<Item=(usize, &'a str)>,
{
for (index, word) in words {
let doc_index = DocIndex {
document: doc_index,
document_id: doc_id,
attribute: attr,
attribute_index: index as u32,
};

View file

@ -122,14 +122,14 @@ impl JsonLinesIndexer {
}
}
fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_index: u64, attr: u8, words: I)
fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I)
where A: io::Write,
B: io::Write,
I: IntoIterator<Item=(usize, &'a str)>,
{
for (index, word) in words {
let doc_index = DocIndex {
document: doc_index,
document_id: doc_id,
attribute: attr,
attribute_index: index as u32,
};

View file

@ -5,8 +5,8 @@ use std::path::PathBuf;
use elapsed::measure_time;
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
use raptor::rank::{criterion, Config, RankedStream, Document};
use raptor::{automaton, Metadata, CommonWords};
use raptor::rank::{criterion, RankedStreamBuilder};
#[derive(Debug, StructOpt)]
pub struct CommandConsole {
@ -62,6 +62,13 @@ impl ConsoleSearch {
}
}
// "Sony" "PlayStation 4 500GB"
/// Returns the first four bytes of the value stored under the key
/// `"<doc.id>-title"` in `database`.
///
/// When the stored value is shorter than four bytes, an empty vector is
/// returned instead of a partial prefix.
///
/// # Panics
///
/// Panics if either `unwrap` on the lookup result fails (presumably a
/// database read error or a missing title entry; confirm against the
/// `rocksdb` API in use).
fn starts_with_playstation(doc: &Document, database: &DB) -> Vec<u8> {
    let key = format!("{}-title", doc.id);
    let stored = database.get(key.as_bytes()).unwrap().unwrap();

    // `get(0..4)` yields `None` for values shorter than four bytes, in which
    // case the result is empty rather than a truncated prefix.
    match stored.get(0..4) {
        Some(prefix) => prefix.to_vec(),
        None => Vec::new(),
    }
}
fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) {
let mut automatons = Vec::new();
for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) {
@ -69,10 +76,15 @@ fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query:
automatons.push(lev);
}
let mut builder = RankedStreamBuilder::new(metadata, automatons);
builder.criteria(criterion::default());
let config = Config {
metadata: metadata,
automatons: automatons,
criteria: criterion::default(),
distinct: ((), 1),
};
let stream = RankedStream::new(config);
let mut stream = builder.build();
// let documents = stream.retrieve_distinct_documents(|doc| starts_with_playstation(doc, database), 0..20);
let documents = stream.retrieve_documents(0..20);
for document in documents {

View file

@ -7,11 +7,10 @@ use std::path::PathBuf;
use std::error::Error;
use std::sync::Arc;
use raptor::rank::{criterion, RankedStreamBuilder};
use raptor::rank::{criterion, Config, RankedStream};
use raptor::{automaton, Metadata, CommonWords};
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
use warp::Filter;
use structopt::StructOpt;
#[derive(Debug, StructOpt)]
@ -99,10 +98,14 @@ where M: AsRef<Metadata>,
automatons.push(lev);
}
let mut builder = RankedStreamBuilder::new(metadata.as_ref(), automatons);
builder.criteria(criterion::default());
let config = Config {
metadata: metadata.as_ref(),
automatons: automatons,
criteria: criterion::default(),
distinct: ((), 1),
};
let stream = RankedStream::new(config);
let mut stream = builder.build();
let documents = stream.retrieve_documents(0..20);
let mut body = Vec::new();