mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
Add support for quoted query phrases
This commit is contained in:
parent
1f7035f18f
commit
eefc6d7c44
@ -12,9 +12,9 @@ use heed::types::*;
|
|||||||
use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
|
use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
use slice_group_by::StrGroupBy;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use mega_mini_indexer::alphanumeric_tokens;
|
|
||||||
use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId};
|
use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId};
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
@ -23,6 +23,11 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|||||||
|
|
||||||
static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); // AtomicU32 ?
|
static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); // AtomicU32 ?
|
||||||
|
|
||||||
|
pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
|
||||||
|
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
||||||
|
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
#[derive(Debug, StructOpt)]
|
||||||
#[structopt(name = "mm-indexer", about = "The indexer side of the MMI project.")]
|
#[structopt(name = "mm-indexer", about = "The indexer side of the MMI project.")]
|
||||||
struct Opt {
|
struct Opt {
|
||||||
@ -186,7 +191,7 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
|
|||||||
let document_id = DocumentId::try_from(document_id).context("Generated id is too big")?;
|
let document_id = DocumentId::try_from(document_id).context("Generated id is too big")?;
|
||||||
|
|
||||||
for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
||||||
for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
for (_pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
||||||
if !word.is_empty() && word.len() < 500 { // LMDB limits
|
if !word.is_empty() && word.len() < 500 { // LMDB limits
|
||||||
let word = word.cow_to_lowercase();
|
let word = word.cow_to_lowercase();
|
||||||
postings_ids.entry(SmallVec32::from(word.as_bytes()))
|
postings_ids.entry(SmallVec32::from(word.as_bytes()))
|
||||||
|
26
src/lib.rs
26
src/lib.rs
@ -1,3 +1,6 @@
|
|||||||
|
mod query;
|
||||||
|
|
||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@ -10,7 +13,8 @@ use heed::{PolyDatabase, Database};
|
|||||||
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use slice_group_by::StrGroupBy;
|
|
||||||
|
use self::query::{QueryWord, alphanumeric_quoted_tokens};
|
||||||
|
|
||||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||||
@ -22,11 +26,6 @@ pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
|
|||||||
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
||||||
pub type DocumentId = u32;
|
pub type DocumentId = u32;
|
||||||
|
|
||||||
pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
|
|
||||||
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
|
||||||
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Index {
|
pub struct Index {
|
||||||
pub main: PolyDatabase,
|
pub main: PolyDatabase,
|
||||||
@ -60,17 +59,20 @@ impl Index {
|
|||||||
let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
||||||
let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
||||||
|
|
||||||
let words: Vec<_> = alphanumeric_tokens(query).collect();
|
let words: Vec<_> = alphanumeric_quoted_tokens(query).collect();
|
||||||
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let number_of_words = words.len();
|
let number_of_words = words.len();
|
||||||
let dfas = words.into_iter().enumerate().map(|(i, word)| {
|
let dfas = words.into_iter().enumerate().map(|(i, word)| {
|
||||||
let word = word.cow_to_lowercase();
|
let (word, quoted) = match word {
|
||||||
|
QueryWord::Free(word) => (word.cow_to_lowercase(), false),
|
||||||
|
QueryWord::Quoted(word) => (Cow::Borrowed(word), true),
|
||||||
|
};
|
||||||
let is_last = i + 1 == number_of_words;
|
let is_last = i + 1 == number_of_words;
|
||||||
let is_prefix = is_last && !ends_with_whitespace;
|
let is_prefix = is_last && !ends_with_whitespace && !quoted;
|
||||||
let dfa = match word.len() {
|
let dfa = match word.len() {
|
||||||
0..=4 => if is_prefix { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
|
0..=4 => if is_prefix { lev0.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev0.build_dfa(&word) },
|
||||||
5..=8 => if is_prefix { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
|
5..=8 => if is_prefix { lev1.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev1.build_dfa(&word) },
|
||||||
_ => if is_prefix { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
|
_ => if is_prefix { lev2.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev2.build_dfa(&word) },
|
||||||
};
|
};
|
||||||
(word, dfa)
|
(word, dfa)
|
||||||
});
|
});
|
||||||
|
145
src/query.rs
Normal file
145
src/query.rs
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
/// A token extracted from a user query by [`alphanumeric_quoted_tokens`].
#[derive(Debug, PartialEq, Eq)]
pub enum QueryWord<'a> {
    /// A run of alphanumeric characters found outside of double quotes.
    Free(&'a str),
    /// The text found between two double quotes, kept verbatim (it may
    /// contain spaces and punctuation). An unterminated quote extends to the
    /// end of the query.
    Quoted(&'a str),
}

/// Splits `string` into free words and double-quoted sections.
///
/// * Outside of quotes, every non-alphanumeric character is a separator and
///   each maximal run of alphanumeric characters is yielded as
///   [`QueryWord::Free`].
/// * A `"` opens a quoted section; everything up to the next `"` (or up to
///   the end of the input when the quote is never closed) is yielded as a
///   single [`QueryWord::Quoted`].
///
/// Empty tokens are never produced: an empty quoted section (`""`) or a
/// dangling trailing quote yields nothing, exactly like consecutive
/// separators yield no empty free word.
///
/// All yielded slices borrow from `string`; nothing is allocated.
pub fn alphanumeric_quoted_tokens(string: &str) -> impl Iterator<Item = QueryWord<'_>> {
    use QueryWord::{Free, Quoted};

    /// Where the tokenizer currently is; the payload is the byte offset at
    /// which the token being accumulated starts.
    enum State {
        /// Outside of quotes.
        Free(usize),
        /// Inside a quoted section.
        Quoted(usize),
        /// Input exhausted and the trailing token (if any) already emitted.
        Fused,
    }

    let mut state = State::Free(0);
    let mut chars = string.char_indices();

    std::iter::from_fn(move || loop {
        let (i, after, c) = match chars.next() {
            Some((i, c)) => (i, i + c.len_utf8(), c),
            None => {
                // End of input: flush the pending token once, then stay fused.
                return match std::mem::replace(&mut state, State::Fused) {
                    State::Free(s) if s < string.len() => Some(Free(&string[s..])),
                    State::Quoted(s) if s < string.len() => Some(Quoted(&string[s..])),
                    _ => None,
                };
            }
        };

        match state {
            // Closing quote: emit the quoted text unless it is empty.
            State::Quoted(s) if c == '"' => {
                state = State::Free(after);
                if i > s {
                    return Some(Quoted(&string[s..i]));
                }
            }
            // Any other character inside quotes belongs to the quoted text.
            State::Quoted(_) => {}
            // Opening quote: it also terminates the free word in progress.
            State::Free(s) if c == '"' => {
                state = State::Quoted(after);
                if i > s {
                    return Some(Free(&string[s..i]));
                }
            }
            // Separator: terminate the free word in progress, if any.
            State::Free(s) if !c.is_alphanumeric() => {
                state = State::Free(after);
                if i > s {
                    return Some(Free(&string[s..i]));
                }
            }
            // Alphanumeric character: keep accumulating the free word.
            State::Free(_) => {}
            // `Fused` is only set once `chars` is exhausted, so no character
            // can be observed in this state; stay terminated regardless.
            State::Fused => return None,
        }
    })
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::QueryWord::{Free, Quoted};
    use super::*;

    /// Runs the tokenizer until it is exhausted and gathers every token.
    fn tokens(query: &str) -> Vec<QueryWord> {
        alphanumeric_quoted_tokens(query).collect()
    }

    // A fully quoted query is a single quoted token.
    #[test]
    fn one_quoted_string() {
        assert_eq!(tokens("\"hello\""), vec![Quoted("hello")]);
    }

    // A quote that is never closed extends to the end of the query.
    #[test]
    fn one_pending_quoted_string() {
        assert_eq!(tokens("\"hello"), vec![Quoted("hello")]);
    }

    // Without any quote the query is a plain free word.
    #[test]
    fn one_non_quoted_string() {
        assert_eq!(tokens("hello"), vec![Free("hello")]);
    }

    // A closing quote acts as a separator for the word that follows it.
    #[test]
    fn quoted_directly_followed_by_free_strings() {
        assert_eq!(tokens("\"hello\"world"), vec![Quoted("hello"), Free("world")]);
    }

    // An opening quote terminates the free word that precedes it.
    #[test]
    fn free_directly_followed_by_quoted_strings() {
        assert_eq!(tokens("hello\"world\""), vec![Free("hello"), Quoted("world")]);
    }

    // Whitespace between a free word and a quoted section produces no token.
    #[test]
    fn free_followed_by_quoted_strings() {
        assert_eq!(tokens("hello \"world\""), vec![Free("hello"), Quoted("world")]);
    }

    // Separators, including a trailing one, never produce empty free words.
    #[test]
    fn multiple_spaces_separated_strings() {
        assert_eq!(tokens("hello world "), vec![Free("hello"), Free("world")]);
    }

    // Free and quoted tokens can alternate any number of times.
    #[test]
    fn multi_interleaved_quoted_free_strings() {
        assert_eq!(
            tokens("hello \"world\" coucou \"monde\""),
            vec![Free("hello"), Quoted("world"), Free("coucou"), Quoted("monde")],
        );
    }
}
|
Loading…
Reference in New Issue
Block a user