// Mirror of https://github.com/meilisearch/MeiliSearch
// Synced 2024-11-11 15:38:55 +01:00
// 216 lines, 6.8 KiB, Rust
#[global_allocator]
|
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|
|
|
use std::collections::btree_map::{BTreeMap, Entry};
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::iter::FromIterator;
|
|
use std::io::{self, Write};
|
|
use std::time::{Instant, Duration};
|
|
use std::path::PathBuf;
|
|
use std::error::Error;
|
|
|
|
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
|
|
use structopt::StructOpt;
|
|
use meilidb_core::Match;
|
|
|
|
use meilidb_data::schema::SchemaAttr;
|
|
use meilidb_data::Database;
|
|
|
|
#[derive(Debug, StructOpt)]
|
|
pub struct Opt {
|
|
/// The destination where the database must be created
|
|
#[structopt(parse(from_os_str))]
|
|
pub database_path: PathBuf,
|
|
|
|
/// Fields that must be displayed.
|
|
pub displayed_fields: Vec<String>,
|
|
|
|
/// The number of returned results
|
|
#[structopt(short = "n", long = "number-results", default_value = "10")]
|
|
pub number_results: usize,
|
|
|
|
/// The number of characters before and after the first match
|
|
#[structopt(short = "C", long = "context", default_value = "35")]
|
|
pub char_context: usize,
|
|
}
|
|
|
|
/// A document as retrieved from the index: each field name mapped to its text.
type Document = HashMap<String, String>;
|
|
|
|
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
|
let mut stdout = StandardStream::stdout(ColorChoice::Always);
|
|
let mut highlighted = false;
|
|
|
|
for range in ranges.windows(2) {
|
|
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
|
|
if highlighted {
|
|
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
|
|
}
|
|
write!(&mut stdout, "{}", &text[start..end])?;
|
|
stdout.reset()?;
|
|
highlighted = !highlighted;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Converts a range expressed in chars into the matching range in bytes.
///
/// * `index` - position of the first char of the range inside `text`.
/// * `length` - number of chars covered by the range.
///
/// Returns `(byte_index, byte_length)`, so that
/// `&text[byte_index..byte_index + byte_length]` is the requested sub-string.
///
/// The range is clamped to the text: a range running past the last char stops
/// at `text.len()`, and an `index` past the last char yields `(text.len(), 0)`.
/// A zero-length range yields the byte offset of `index` and a length of 0.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    // Byte offset of every char boundary, including the one after the last char.
    let mut boundaries = text
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(text.len()));

    let byte_start = boundaries.nth(index).unwrap_or(text.len());

    // `nth` consumed everything up to and including the start boundary, so the
    // end boundary is now `length - 1` positions further in the iterator.
    let byte_end = match length {
        0 => byte_start,
        _ => boundaries.nth(length - 1).unwrap_or(text.len()),
    };

    (byte_start, byte_end - byte_start)
}
|
|
|
|
fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
|
|
let mut byte_indexes = BTreeMap::new();
|
|
|
|
for match_ in matches {
|
|
let char_index = match_.char_index as usize;
|
|
let char_length = match_.char_length as usize;
|
|
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
|
|
|
match byte_indexes.entry(byte_index) {
|
|
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
|
Entry::Occupied(mut entry) => {
|
|
if *entry.get() < byte_length {
|
|
entry.insert(byte_length);
|
|
}
|
|
},
|
|
}
|
|
}
|
|
|
|
let mut title_areas = Vec::new();
|
|
title_areas.push(0);
|
|
for (byte_index, length) in byte_indexes {
|
|
title_areas.push(byte_index);
|
|
title_areas.push(byte_index + length);
|
|
}
|
|
title_areas.push(text.len());
|
|
title_areas.sort_unstable();
|
|
title_areas
|
|
}
|
|
|
|
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
|
|
///
|
|
/// ```no_run
|
|
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
|
|
///
|
|
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
|
|
///
|
|
/// let (text, matches) = crop_text(&text, matches, 35);
|
|
/// ```
|
|
fn crop_text(
|
|
text: &str,
|
|
matches: impl IntoIterator<Item=Match>,
|
|
context: usize,
|
|
) -> (String, Vec<Match>)
|
|
{
|
|
let mut matches = matches.into_iter().peekable();
|
|
|
|
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
|
|
let start = char_index.saturating_sub(context);
|
|
let text = text.chars().skip(start).take(context * 2).collect();
|
|
|
|
let matches = matches
|
|
.take_while(|m| {
|
|
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
|
|
})
|
|
.map(|match_| {
|
|
Match { char_index: match_.char_index - start as u16, ..match_ }
|
|
})
|
|
.collect();
|
|
|
|
(text, matches)
|
|
}
|
|
|
|
fn main() -> Result<(), Box<Error>> {
|
|
let _ = env_logger::init();
|
|
let opt = Opt::from_args();
|
|
|
|
let start = Instant::now();
|
|
let database = Database::start_default(&opt.database_path)?;
|
|
|
|
let mut buffer = String::new();
|
|
let input = io::stdin();
|
|
|
|
let index = database.open_index("default")?.unwrap();
|
|
let schema = index.schema();
|
|
|
|
println!("database prepared for you in {:.2?}", start.elapsed());
|
|
|
|
let fields = opt.displayed_fields.iter().map(String::as_str);
|
|
let fields = HashSet::from_iter(fields);
|
|
|
|
loop {
|
|
print!("Searching for: ");
|
|
io::stdout().flush()?;
|
|
|
|
if input.read_line(&mut buffer)? == 0 { break }
|
|
let query = buffer.trim_end_matches('\n');
|
|
|
|
let start_total = Instant::now();
|
|
|
|
let builder = index.query_builder();
|
|
let documents = builder.query(query, 0..opt.number_results);
|
|
|
|
let mut retrieve_duration = Duration::default();
|
|
|
|
let number_of_documents = documents.len();
|
|
for mut doc in documents {
|
|
|
|
doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index));
|
|
|
|
let start_retrieve = Instant::now();
|
|
let result = index.document::<Document>(Some(&fields), doc.id);
|
|
retrieve_duration += start_retrieve.elapsed();
|
|
|
|
match result {
|
|
Ok(Some(document)) => {
|
|
for (name, text) in document {
|
|
print!("{}: ", name);
|
|
|
|
let attr = schema.attribute(&name).unwrap();
|
|
let matches = doc.matches.iter()
|
|
.filter(|m| SchemaAttr::new(m.attribute) == attr)
|
|
.cloned();
|
|
let (text, matches) = crop_text(&text, matches, opt.char_context);
|
|
let areas = create_highlight_areas(&text, &matches);
|
|
display_highlights(&text, &areas)?;
|
|
println!();
|
|
}
|
|
},
|
|
Ok(None) => eprintln!("missing document"),
|
|
Err(e) => eprintln!("{}", e),
|
|
}
|
|
|
|
let mut matching_attributes = HashSet::new();
|
|
for _match in doc.matches {
|
|
let attr = SchemaAttr::new(_match.attribute);
|
|
let name = schema.attribute_name(attr);
|
|
matching_attributes.insert(name);
|
|
}
|
|
|
|
let matching_attributes = Vec::from_iter(matching_attributes);
|
|
println!("matching in: {:?}", matching_attributes);
|
|
|
|
println!();
|
|
}
|
|
|
|
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
|
|
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
|
|
buffer.clear();
|
|
}
|
|
|
|
Ok(())
|
|
}
|