mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-22 19:27:27 +01:00
Introduce an example file to test indexing and searching csv documents
This commit is contained in:
parent
0bfba3e4ba
commit
2236ebbd42
@ -11,6 +11,7 @@ crossbeam-channel = "0.3.9"
|
||||
deunicode = "1.0.0"
|
||||
env_logger = "0.7.0"
|
||||
hashbrown = { version = "0.6.0", features = ["serde"] }
|
||||
lmdb-rkv = "0.12.3"
|
||||
log = "0.4.8"
|
||||
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
|
||||
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||
@ -41,3 +42,10 @@ features = ["fst_automaton"]
|
||||
[dependencies.fst]
|
||||
git = "https://github.com/Kerollmops/fst.git"
|
||||
branch = "arc-byte-slice"
|
||||
|
||||
[dev-dependencies]
|
||||
csv = "1.0.7"
|
||||
rustyline = { version = "5.0.0", default-features = false }
|
||||
structopt = "0.3.2"
|
||||
termcolor = "1.0.4"
|
||||
toml = "0.5.3"
|
||||
|
346
meilidb-core/examples/from_file.rs
Normal file
346
meilidb-core/examples/from_file.rs
Normal file
@ -0,0 +1,346 @@
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::error::Error;
|
||||
use std::io::Write;
|
||||
use std::iter::FromIterator;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::{Instant, Duration};
|
||||
use std::{fs, io};
|
||||
|
||||
use rustyline::{Editor, Config};
|
||||
use serde::{Serialize, Deserialize};
|
||||
use structopt::StructOpt;
|
||||
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
||||
|
||||
use meilidb_core::{Highlight, Database};
|
||||
use meilidb_schema::SchemaAttr;
|
||||
|
||||
const INDEX_NAME: &str = "default";
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
struct IndexCommand {
|
||||
/// The destination where the database must be created.
|
||||
#[structopt(parse(from_os_str))]
|
||||
database_path: PathBuf,
|
||||
|
||||
/// The csv file to index.
|
||||
#[structopt(parse(from_os_str))]
|
||||
csv_data_path: PathBuf,
|
||||
|
||||
/// The path to the schema.
|
||||
#[structopt(long, parse(from_os_str))]
|
||||
schema: PathBuf,
|
||||
|
||||
#[structopt(long)]
|
||||
update_group_size: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
struct SearchCommand {
|
||||
/// The destination where the database must be created.
|
||||
#[structopt(parse(from_os_str))]
|
||||
database_path: PathBuf,
|
||||
|
||||
/// Timeout after which the search will return results.
|
||||
#[structopt(long)]
|
||||
fetch_timeout_ms: Option<u64>,
|
||||
|
||||
/// The number of returned results
|
||||
#[structopt(short, long, default_value = "10")]
|
||||
number_results: usize,
|
||||
|
||||
/// The number of characters before and after the first match
|
||||
#[structopt(short = "C", long, default_value = "35")]
|
||||
char_context: usize,
|
||||
|
||||
/// Fields that must be displayed.
|
||||
displayed_fields: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
enum Command {
|
||||
Index(IndexCommand),
|
||||
Search(SearchCommand),
|
||||
}
|
||||
|
||||
impl Command {
|
||||
fn path(&self) -> &Path {
|
||||
match self {
|
||||
Command::Index(command) => &command.database_path,
|
||||
Command::Search(command) => &command.database_path,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
struct Document(HashMap<String, String>);
|
||||
|
||||
fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dyn Error>> {
|
||||
let start = Instant::now();
|
||||
|
||||
let rkv = database.rkv.read().unwrap();
|
||||
let index = database.open_index(INDEX_NAME)?;
|
||||
|
||||
let schema = {
|
||||
let string = fs::read_to_string(&command.schema)?;
|
||||
toml::from_str(&string).unwrap()
|
||||
};
|
||||
|
||||
let writer = rkv.write().unwrap();
|
||||
match index.main.schema(&writer)? {
|
||||
Some(current_schema) => {
|
||||
if current_schema != schema {
|
||||
return Err(meilidb_core::Error::SchemaDiffer.into())
|
||||
}
|
||||
writer.abort();
|
||||
},
|
||||
None => index.schema_update(writer, schema)?,
|
||||
}
|
||||
|
||||
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
|
||||
let mut raw_record = csv::StringRecord::new();
|
||||
let headers = rdr.headers()?.clone();
|
||||
|
||||
let mut max_update_id = 0;
|
||||
let mut i = 0;
|
||||
let mut end_of_file = false;
|
||||
|
||||
while !end_of_file {
|
||||
let mut additions = index.documents_addition();
|
||||
|
||||
loop {
|
||||
end_of_file = !rdr.read_record(&mut raw_record)?;
|
||||
if end_of_file { break }
|
||||
|
||||
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
||||
Ok(document) => document,
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
additions.update_document(document);
|
||||
|
||||
print!("\rindexing document {}", i);
|
||||
i += 1;
|
||||
|
||||
if let Some(group_size) = command.update_group_size {
|
||||
if i % group_size == 0 { break }
|
||||
}
|
||||
}
|
||||
|
||||
println!();
|
||||
|
||||
let writer = rkv.write().unwrap();
|
||||
println!("committing update...");
|
||||
let update_id = additions.finalize(writer)?;
|
||||
max_update_id = max_update_id.max(update_id);
|
||||
}
|
||||
|
||||
loop {
|
||||
println!("Waiting for update {}", max_update_id);
|
||||
|
||||
let reader = rkv.read().unwrap();
|
||||
if let Some(_) = index.updates_results.update_result(&reader, max_update_id)? {
|
||||
break
|
||||
}
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
}
|
||||
|
||||
println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
||||
let mut stdout = StandardStream::stdout(ColorChoice::Always);
|
||||
let mut highlighted = false;
|
||||
|
||||
for range in ranges.windows(2) {
|
||||
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
|
||||
if highlighted {
|
||||
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
|
||||
}
|
||||
write!(&mut stdout, "{}", &text[start..end])?;
|
||||
stdout.reset()?;
|
||||
highlighted = !highlighted;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
|
||||
let mut byte_index = 0;
|
||||
let mut byte_length = 0;
|
||||
|
||||
for (n, (i, c)) in text.char_indices().enumerate() {
|
||||
if n == index {
|
||||
byte_index = i;
|
||||
}
|
||||
|
||||
if n + 1 == index + length {
|
||||
byte_length = i - byte_index + c.len_utf8();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
(byte_index, byte_length)
|
||||
}
|
||||
|
||||
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
|
||||
let mut byte_indexes = BTreeMap::new();
|
||||
|
||||
for highlight in highlights {
|
||||
let char_index = highlight.char_index as usize;
|
||||
let char_length = highlight.char_length as usize;
|
||||
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||
|
||||
match byte_indexes.entry(byte_index) {
|
||||
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
||||
Entry::Occupied(mut entry) => {
|
||||
if *entry.get() < byte_length {
|
||||
entry.insert(byte_length);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut title_areas = Vec::new();
|
||||
title_areas.push(0);
|
||||
for (byte_index, length) in byte_indexes {
|
||||
title_areas.push(byte_index);
|
||||
title_areas.push(byte_index + length);
|
||||
}
|
||||
title_areas.push(text.len());
|
||||
title_areas.sort_unstable();
|
||||
title_areas
|
||||
}
|
||||
|
||||
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
|
||||
///
|
||||
/// ```no_run
|
||||
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
///
|
||||
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
|
||||
///
|
||||
/// let (text, matches) = crop_text(&text, matches, 35);
|
||||
/// ```
|
||||
fn crop_text(
|
||||
text: &str,
|
||||
highlights: impl IntoIterator<Item=Highlight>,
|
||||
context: usize,
|
||||
) -> (String, Vec<Highlight>)
|
||||
{
|
||||
let mut highlights = highlights.into_iter().peekable();
|
||||
|
||||
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
|
||||
let start = char_index.saturating_sub(context);
|
||||
let text = text.chars().skip(start).take(context * 2).collect();
|
||||
|
||||
let highlights = highlights
|
||||
.take_while(|m| {
|
||||
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
|
||||
})
|
||||
.map(|highlight| {
|
||||
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
|
||||
})
|
||||
.collect();
|
||||
|
||||
(text, highlights)
|
||||
}
|
||||
|
||||
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
|
||||
let rkv = database.rkv.read().unwrap();
|
||||
let index = database.open_index(INDEX_NAME)?;
|
||||
let reader = rkv.read().unwrap();
|
||||
|
||||
let schema = index.main.schema(&reader)?;
|
||||
let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
|
||||
|
||||
let fields = command.displayed_fields.iter().map(String::as_str);
|
||||
let fields = HashSet::from_iter(fields);
|
||||
|
||||
let config = Config::builder().auto_add_history(true).build();
|
||||
let mut readline = Editor::<()>::with_config(config);
|
||||
let _ = readline.load_history("query-history.txt");
|
||||
|
||||
for result in readline.iter("Searching for: ") {
|
||||
match result {
|
||||
Ok(query) => {
|
||||
let start_total = Instant::now();
|
||||
|
||||
let builder = index.query_builder();
|
||||
let documents = builder.query(&reader, &query, 0..command.number_results)?;
|
||||
|
||||
let mut retrieve_duration = Duration::default();
|
||||
|
||||
let number_of_documents = documents.len();
|
||||
for mut doc in documents {
|
||||
|
||||
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
||||
|
||||
let start_retrieve = Instant::now();
|
||||
let result = index.document::<Document, _>(&reader, Some(&fields), doc.id);
|
||||
retrieve_duration += start_retrieve.elapsed();
|
||||
|
||||
match result {
|
||||
Ok(Some(document)) => {
|
||||
println!("raw-id: {:?}", doc.id);
|
||||
for (name, text) in document.0 {
|
||||
print!("{}: ", name);
|
||||
|
||||
let attr = schema.attribute(&name).unwrap();
|
||||
let highlights = doc.highlights.iter()
|
||||
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
||||
.cloned();
|
||||
let (text, highlights) = crop_text(&text, highlights, command.char_context);
|
||||
let areas = create_highlight_areas(&text, &highlights);
|
||||
display_highlights(&text, &areas)?;
|
||||
println!();
|
||||
}
|
||||
},
|
||||
Ok(None) => eprintln!("missing document"),
|
||||
Err(e) => eprintln!("{}", e),
|
||||
}
|
||||
|
||||
let mut matching_attributes = HashSet::new();
|
||||
for highlight in doc.highlights {
|
||||
let attr = SchemaAttr::new(highlight.attribute);
|
||||
let name = schema.attribute_name(attr);
|
||||
matching_attributes.insert(name);
|
||||
}
|
||||
|
||||
let matching_attributes = Vec::from_iter(matching_attributes);
|
||||
println!("matching in: {:?}", matching_attributes);
|
||||
|
||||
println!();
|
||||
}
|
||||
|
||||
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
|
||||
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
|
||||
},
|
||||
Err(err) => {
|
||||
println!("Error: {:?}", err);
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
readline.save_history("query-history.txt").unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
env_logger::init();
|
||||
|
||||
let opt = Command::from_args();
|
||||
let database = Database::open_or_create(opt.path())?;
|
||||
|
||||
match opt {
|
||||
Command::Index(command) => index_command(command, database),
|
||||
Command::Search(command) => search_command(command, database),
|
||||
}
|
||||
}
|
@ -1,78 +0,0 @@
|
||||
use std::{fs, path::Path};
|
||||
|
||||
use serde_json::json;
|
||||
use rkv::{Manager, Rkv, SingleStore, Value, StoreOptions};
|
||||
|
||||
use meilidb_core::{Database, MResult};
|
||||
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};
|
||||
|
||||
fn main() -> MResult<()> {
|
||||
env_logger::init();
|
||||
|
||||
let path = Path::new("test.rkv");
|
||||
let database = Database::open_or_create(path)?;
|
||||
let rkv = database.rkv.read().unwrap();
|
||||
println!("{:?}", database.indexes_names());
|
||||
|
||||
let hello = database.open_index("hello")?;
|
||||
let hello1 = database.open_index("hello1")?;
|
||||
let hello2 = database.open_index("hello2")?;
|
||||
|
||||
let mut builder = SchemaBuilder::with_identifier("id");
|
||||
builder.new_attribute("alpha", DISPLAYED);
|
||||
builder.new_attribute("beta", DISPLAYED | INDEXED);
|
||||
builder.new_attribute("gamma", INDEXED);
|
||||
let schema = builder.build();
|
||||
|
||||
let writer = rkv.write()?;
|
||||
|
||||
hello.schema_update(writer, schema)?;
|
||||
|
||||
let object = json!({
|
||||
"id": 23,
|
||||
"beta": "hello",
|
||||
});
|
||||
|
||||
let mut additions = hello.documents_addition();
|
||||
additions.extend(vec![object]);
|
||||
|
||||
let writer = rkv.write()?;
|
||||
|
||||
additions.finalize(writer)?;
|
||||
|
||||
// {
|
||||
// let mut writer = env.write().unwrap();
|
||||
// let mut raw_indexer = RawIndexer::new();
|
||||
|
||||
// let docid = DocumentId(0);
|
||||
// let attr = SchemaAttr(0);
|
||||
// let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
|
||||
// raw_indexer.index_text(docid, attr, text);
|
||||
|
||||
// let Indexed { words_doc_indexes, .. } = raw_indexer.build();
|
||||
|
||||
// let mut fst_builder = fst::SetBuilder::memory();
|
||||
// fst_builder.extend_iter(words_doc_indexes.keys()).unwrap();
|
||||
// let bytes = fst_builder.into_inner().unwrap();
|
||||
// let fst = fst::raw::Fst::from_bytes(bytes).unwrap();
|
||||
// let fst = fst::Set::from(fst);
|
||||
|
||||
// words.put_words_fst(&mut writer, &fst).unwrap();
|
||||
|
||||
// for (word, indexes) in words_doc_indexes {
|
||||
// words.put_words_indexes(&mut writer, &word, &indexes).unwrap();
|
||||
// }
|
||||
|
||||
// writer.commit().unwrap();
|
||||
// }
|
||||
|
||||
let builder = hello.query_builder();
|
||||
let reader = rkv.read().unwrap();
|
||||
let documents = builder.query(&reader, "hello", 0..20).unwrap();
|
||||
|
||||
println!("{:?}", documents);
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_secs(2));
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user