mirror of https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 06:44:27 +01:00
Introduce the query words mapping along with the query tree
parent 4f7a7ea0bb
commit da8abebfa2

10 Cargo.lock (generated)
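In short: create_query_tree now returns the query tree together with a mapping from each QueryId to the range of original query words it covers. A minimal sketch of the new call shape, assuming `reader`, `context`, and `query` are set up as in the example hunk further down:

    // Sketch only: setup elided; `reader`, `context`, and `query` are assumed
    // to be in scope as in the example binary changed below.
    let (operation, mapping) = create_query_tree(reader, &context, query).unwrap();

    // `mapping: HashMap<QueryId, Range<usize>>` ties every generated query id,
    // including ids synthesized for synonyms, split phrases, and concatenations,
    // back to the original query words it stands for.
    for (id, range) in &mapping {
        println!("query {} covers words {:?}", id, range);
    }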
Cargo.lock
@@ -799,6 +799,14 @@ dependencies = [
  "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "intervaltree"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "smallvec 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "iovec"
 version = "0.1.4"
@@ -952,6 +960,7 @@ dependencies = [
  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "intervaltree 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2715,6 +2724,7 @@
 "checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e"
 "checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
 "checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
+"checksum intervaltree 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "af39074dd8d5eff756ddea3d8f34c7ae287d4dadb6f29fb1b67ca6b3f5036482"
 "checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
 "checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
 "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
meilisearch-core/Cargo.toml
@@ -17,6 +17,7 @@ env_logger = "0.7.0"
 fst = { version = "0.3.5", default-features = false }
 hashbrown = { version = "0.6.0", features = ["serde"] }
 heed = "0.6.1"
+intervaltree = "0.2.4"
 itertools = "0.8.2" # kill me please
 levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
 log = "0.4.8"
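The new intervaltree dependency backs QueryWordsMapper::mapping in the new module further down. A minimal sketch of the two parts of its API that module relies on, building a tree with from_iter over (Range, value) pairs and running point queries that yield Elements; the data here is made up:

    use std::iter::FromIterator;
    use intervaltree::{Element, IntervalTree};

    // Hypothetical data: ranges over word positions mapped to a payload.
    let tree = IntervalTree::from_iter(vec![(0..2, "a"), (1..3, "b")]);

    // `query_point` yields every element whose range contains the point.
    for Element { range, value } in tree.query_point(1) {
        println!("{:?} -> {}", range, value); // prints both 0..2 and 1..3
    }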
@@ -61,8 +61,9 @@ where
         prefix_postings_lists: prefix_postings_lists_cache_store,
     };
 
-    let operation = create_query_tree(reader, &context, query).unwrap();
+    let (operation, mapping) = create_query_tree(reader, &context, query).unwrap();
     println!("{:?}", operation);
+    println!("{:?}", mapping);
 
     let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap();
     println!("found {} documents", docids.len());
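For context on what the second println! above shows: every generated query id maps to a half-open range on a common word axis, one that grows when a replacement has more words than the text it replaces (see the tests in the new module below). A hypothetical consumer, not part of this commit, could use overlap of those ranges to tell whether two generated queries concern the same part of the input:

    use std::collections::HashMap;
    use std::ops::Range;

    pub type QueryId = usize; // mirrors the crate's alias

    // Hypothetical helper, not part of this commit: two generated queries
    // relate to overlapping parts of the user's input iff their mapped
    // ranges intersect on the common, "grown" word axis.
    fn covers_same_words(mapping: &HashMap<QueryId, Range<usize>>, a: QueryId, b: QueryId) -> bool {
        match (mapping.get(&a), mapping.get(&b)) {
            (Some(ra), Some(rb)) => ra.start < rb.end && rb.start < ra.end,
            _ => false,
        }
    }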
meilisearch-core/src/lib.rs
@@ -11,6 +11,7 @@ mod levenshtein;
 mod number;
 mod query_builder;
 mod query_tree;
+mod query_words_mapper;
 mod ranked_map;
 mod raw_document;
 mod reordered_attrs;
@@ -28,6 +29,7 @@ pub use self::raw_document::RawDocument;
 pub use self::store::Index;
 pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
 pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
+pub use query_words_mapper::QueryWordsMapper;
 
 use compact_arena::SmallArena;
 use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
meilisearch-core/src/query_tree.rs
@@ -1,5 +1,7 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
+use std::ops::Range;
 use std::time::Instant;
 use std::{cmp, fmt, iter::once};
 
@@ -11,8 +13,9 @@ use fst::{IntoStreamer, Streamer};
 use crate::database::MainT;
 use crate::{store, DocumentId, DocIndex, MResult};
 use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
+use crate::QueryWordsMapper;
 
-#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Clone, PartialEq, Eq, Hash)]
 pub enum Operation {
     And(Vec<Operation>),
     Or(Vec<Operation>),
@@ -39,36 +42,49 @@ impl fmt::Debug for Operation {
     }
 }
 
+impl Operation {
+    fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
+        Operation::Query(Query { id, prefix, kind: QueryKind::Tolerant(s.to_string()) })
+    }
+
+    fn exact(id: QueryId, prefix: bool, s: &str) -> Operation {
+        Operation::Query(Query { id, prefix, kind: QueryKind::Exact(s.to_string()) })
+    }
+
+    fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
+        Operation::Query(Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) })
+    }
+}
+
 pub type QueryId = usize;
 
-#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Clone, Eq)]
 pub struct Query {
     pub id: QueryId,
     pub prefix: bool,
     pub kind: QueryKind,
 }
 
-#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+impl PartialEq for Query {
+    fn eq(&self, other: &Self) -> bool {
+        self.prefix == other.prefix && self.kind == other.kind
+    }
+}
+
+impl Hash for Query {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.prefix.hash(state);
+        self.kind.hash(state);
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
 pub enum QueryKind {
     Tolerant(String),
     Exact(String),
     Phrase(Vec<String>),
 }
 
-impl Query {
-    fn tolerant(id: QueryId, prefix: bool, s: &str) -> Query {
-        Query { id, prefix, kind: QueryKind::Tolerant(s.to_string()) }
-    }
-
-    fn exact(id: QueryId, prefix: bool, s: &str) -> Query {
-        Query { id, prefix, kind: QueryKind::Exact(s.to_string()) }
-    }
-
-    fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Query {
-        Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) }
-    }
-}
-
 impl fmt::Debug for Query {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let Query { id, prefix, kind } = self;
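Worth noting in the hunk above: Query now compares and hashes by prefix and kind only, so queries that differ only by id collapse to the same word in sets and maps. A quick sketch of the consequence, using the types as defined above:

    // Sketch using the types above: equality and hashing now ignore `id`,
    // so the same word generated under different ids dedupes as one query.
    let a = Query { id: 1,  prefix: false, kind: QueryKind::Exact("york".to_string()) };
    let b = Query { id: 42, prefix: false, kind: QueryKind::Exact("york".to_string()) };
    assert_eq!(a, b); // ids differ, queries are still "the same word"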
@@ -151,54 +167,88 @@ where I: IntoIterator<Item=Operation>,
 
 const MAX_NGRAM: usize = 3;
 
-pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str) -> MResult<Operation> {
+pub fn create_query_tree(
+    reader: &heed::RoTxn<MainT>,
+    ctx: &Context,
+    query: &str,
+) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
+{
     let query = query.to_lowercase();
 
     let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
-    let words = words.filter(|s| !s.contains(char::is_whitespace)).enumerate();
-    let words: Vec<_> = words.collect();
+    let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
 
+    let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
     let mut ngrams = Vec::new();
     for ngram in 1..=MAX_NGRAM {
 
         let ngiter = words.windows(ngram).enumerate().map(|(i, group)| {
-            let before = words[..i].windows(1);
-            let after = words[i + ngram..].windows(1);
-            before.chain(Some(group)).chain(after)
+            let before = words[0..i].windows(1).enumerate().map(|(i, g)| (i..i + 1, g));
+            let after = words[i + ngram..].windows(1)
+                .enumerate()
+                .map(move |(j, g)| (i + j + ngram..i + j + ngram + 1, g));
+            before.chain(Some((i..i + ngram, group))).chain(after)
         });
 
         for group in ngiter {
             let mut ops = Vec::new();
-
-            for (is_last, words) in is_last(group) {
+            for (is_last, (range, words)) in is_last(group) {
 
                 let mut alts = Vec::new();
                 match words {
                     [(id, word)] => {
-                        let phrase = split_best_frequency(reader, ctx, word)?
-                            .map(|ws| Query::phrase2(*id, is_last, ws))
-                            .map(Operation::Query);
+                        let mut idgen = ((id + 1) * 100)..;
+
+                        let phrase = split_best_frequency(reader, ctx, word)?
+                            .map(|ws| {
+                                let id = idgen.next().unwrap();
+                                idgen.next().unwrap();
+                                mapper.declare(range.clone(), id, &[ws.0, ws.1]);
+                                Operation::phrase2(id, is_last, ws)
+                            });
 
-                        let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| {
-                            let iter = alts.into_iter().map(|w| Query::exact(*id, false, &w)).map(Operation::Query);
-                            create_operation(iter, Operation::And)
-                        });
+                        let synonyms = fetch_synonyms(reader, ctx, &[word])?
+                            .into_iter()
+                            .map(|alts| {
+                                let id = idgen.next().unwrap();
+                                mapper.declare(range.clone(), id, &alts);
+
+                                let mut idgen = once(id).chain(&mut idgen);
+                                let iter = alts.into_iter().map(|w| {
+                                    let id = idgen.next().unwrap();
+                                    Operation::exact(id, false, &w)
+                                });
+
+                                create_operation(iter, Operation::And)
+                            });
 
-                        let query = Query::tolerant(*id, is_last, word);
+                        let query = Operation::tolerant(*id, is_last, word);
 
-                        alts.push(Operation::Query(query));
+                        alts.push(query);
                         alts.extend(synonyms.chain(phrase));
                     },
                     words => {
                         let id = words[0].0;
+                        let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
 
                         let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
 
                         for synonym in fetch_synonyms(reader, ctx, &words)? {
-                            let synonym = synonym.into_iter().map(|s| Operation::Query(Query::exact(id, false, &s)));
-                            let synonym = create_operation(synonym, Operation::And);
-                            alts.push(synonym);
+                            let id = idgen.next().unwrap();
+                            mapper.declare(range.clone(), id, &synonym);
+
+                            let mut idgen = once(id).chain(&mut idgen);
+                            let synonym = synonym.into_iter().map(|s| {
+                                let id = idgen.next().unwrap();
+                                Operation::exact(id, false, &s)
+                            });
+                            alts.push(create_operation(synonym, Operation::And));
                         }
 
-                        let query = Query::exact(id, is_last, &words.concat());
-                        alts.push(Operation::Query(query));
+                        let id = idgen.next().unwrap();
+                        let concat = words.concat();
+                        alts.push(Operation::exact(id, is_last, &concat));
+                        mapper.declare(range.clone(), id, &[concat]);
                     }
                 }
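A note on the id scheme introduced in this hunk: ids for derived queries are drawn from a counter starting at (id + 1) * 100 (or (id + 1) * 100^ngram for n-grams), which appears intended to keep them disjoint from the original word ids, and from each other, as long as fewer than a hundred alternatives are generated per word. A worked example under that assumption; which derived query ends up with which id is illustrative:

    // Sketch: how derived-query ids are allocated around original word 1.
    let id = 1; // original word id, e.g. "york"
    let mut idgen = ((id + 1) * 100)..; // derived ids start at 200
    assert_eq!(idgen.next(), Some(200)); // e.g. the split-phrase query...
    assert_eq!(idgen.next(), Some(201)); // ...which reserves one id per phrase word
    assert_eq!(idgen.next(), Some(202)); // next alternative, e.g. a synonym group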
@@ -210,7 +260,10 @@ pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str
         }
     }
 
-    Ok(create_operation(ngrams, Operation::Or))
+    let mapping = mapper.mapping();
+    let operation = create_operation(ngrams, Operation::Or);
+
+    Ok((operation, mapping))
 }
 
 pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>), Cow<'txn, Set<DocIndex>>>;
415 meilisearch-core/src/query_words_mapper.rs (new file)
@@ -0,0 +1,415 @@
use std::collections::HashMap;
use std::iter::FromIterator;
use std::ops::Range;
use intervaltree::{Element, IntervalTree};

pub type QueryId = usize;

pub struct QueryWordsMapper {
    originals: Vec<String>,
    mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
}

impl QueryWordsMapper {
    pub fn new<I, A>(originals: I) -> QueryWordsMapper
    where I: IntoIterator<Item = A>,
          A: ToString,
    {
        let originals = originals.into_iter().map(|s| s.to_string()).collect();
        QueryWordsMapper { originals, mappings: HashMap::new() }
    }

    pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I)
    where I: IntoIterator<Item = A>,
          A: ToString,
    {
        assert!(range.len() != 0);
        assert!(self.originals.get(range.clone()).is_some());
        assert!(id >= self.originals.len());

        let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect();

        assert!(!replacement.is_empty());

        // We detect words at the end and at the front of the
        // replacement that are common with the originals:
        //
        //     x a b c d e f g
        //       ^^^/   \^^^
        //     a b x c d k j e f
        //     ^^^           ^^^
        //

        let left = &self.originals[..range.start];
        let right = &self.originals[range.end..];

        let common_left = longest_common_prefix(left, &replacement);
        let common_right = longest_common_prefix(&replacement, right);

        for i in 0..common_left {
            let range = range.start - common_left + i..range.start - common_left + i + 1;
            let replacement = vec![replacement[i].clone()];
            self.mappings.insert(id + i, (range, replacement));
        }

        {
            let replacement = replacement[common_left..replacement.len() - common_right].iter().cloned().collect();
            self.mappings.insert(id + common_left, (range.clone(), replacement));
        }

        for i in 0..common_right {
            let id = id + replacement.len() - common_right + i;
            let range = range.end + i..range.end + i + 1;
            let replacement = vec![replacement[replacement.len() - common_right + i].clone()];
            self.mappings.insert(id, (range, replacement));
        }
    }

    pub fn mapping(self) -> HashMap<QueryId, Range<usize>> {
        let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v)));
        let intervals = IntervalTree::from_iter(mappings);

        let mut output = HashMap::new();
        let mut offset = 0;

        // We map each original word to the biggest number of
        // associated words.
        for i in 0..self.originals.len() {
            let max = intervals.query_point(i)
                .filter_map(|e| {
                    if e.range.end - 1 == i {
                        let len = e.value.1.iter().skip(i - e.range.start).count();
                        if len != 0 { Some(len) } else { None }
                    } else { None }
                })
                .max()
                .unwrap_or(1);

            let range = i + offset..i + offset + max;
            output.insert(i, range);
            offset += max - 1;
        }

        // We retrieve the range that each original word
        // is mapped to and apply it to each of the words.
        for i in 0..self.originals.len() {

            let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i);
            for Element { range, value: (id, words) } in iter {

                // We ask for the complete range mapped to the area we map.
                let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start);
                let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end);
                let range = start..end;

                // We map each query id to one word until the last,
                // which we map to the remaining words.
                let add = range.len() - words.len();
                for (j, x) in range.take(words.len()).enumerate() {
                    let add = if j == words.len() - 1 { add } else { 0 }; // is last?
                    let range = x..x + 1 + add;
                    output.insert(id + j, range);
                }
            }
        }

        output
    }
}

fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
    let mut best = None;
    for i in (0..a.len()).rev() {
        let count = a[i..].iter().zip(b).take_while(|(a, b)| a == b).count();
        best = match best {
            Some(old) if count > old => Some(count),
            Some(_) => break,
            None => Some(count),
        };
    }
    best.unwrap_or(0)
}
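Despite its name, longest_common_prefix(a, b) above returns the length of the longest suffix of a that is also a prefix of b; that is what lets declare strip replacement words that merely repeat the surrounding context. A quick check of that reading against the implementation:

    // Longest suffix of `a` that is a prefix of `b`:
    assert_eq!(longest_common_prefix(&["x", "a", "b"], &["a", "b", "c"]), 2);
    // No overlap at all:
    assert_eq!(longest_common_prefix(&["x"], &["y"]), 0);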
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn original_unmodified() {
        let query = ["new", "york", "city", "subway"];
        //             0       1       2        3
        let mut builder = QueryWordsMapper::new(&query);

        // new york = new york city
        builder.declare(0..2, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // new = new york city
        builder.declare(0..1, 7, &["new", "york", "city"]);
        //                    ^      7       8       9

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // new
        assert_eq!(mapping[&1], 1..2); // york
        assert_eq!(mapping[&2], 2..3); // city
        assert_eq!(mapping[&3], 3..4); // subway

        assert_eq!(mapping[&4], 0..1); // new
        assert_eq!(mapping[&5], 1..2); // york
        assert_eq!(mapping[&6], 2..3); // city

        assert_eq!(mapping[&7], 0..1); // new
        assert_eq!(mapping[&8], 1..2); // york
        assert_eq!(mapping[&9], 2..3); // city
    }

    #[test]
    fn original_unmodified2() {
        let query = ["new", "york", "city", "subway"];
        //             0       1       2        3
        let mut builder = QueryWordsMapper::new(&query);

        // city subway = new york city underground train
        builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]);
        //                    ^      4       5       6         7            8

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // new
        assert_eq!(mapping[&1], 1..2); // york
        assert_eq!(mapping[&2], 2..3); // city
        assert_eq!(mapping[&3], 3..5); // subway

        assert_eq!(mapping[&4], 0..1); // new
        assert_eq!(mapping[&5], 1..2); // york
        assert_eq!(mapping[&6], 2..3); // city
        assert_eq!(mapping[&7], 3..4); // underground
        assert_eq!(mapping[&8], 4..5); // train
    }

    #[test]
    fn original_unmodified3() {
        let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"];
        //            0    1    2    3    4    5    6    7    8    9    10
        let mut builder = QueryWordsMapper::new(&query);

        // c d = a b x c d k j e f
        builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]);
        //                    ^^     11   12   13   14   15   16   17   18   19

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // a
        assert_eq!(mapping[&1], 1..2); // b
        assert_eq!(mapping[&2], 2..3); // x
        assert_eq!(mapping[&3], 3..4); // x
        assert_eq!(mapping[&4], 4..5); // a
        assert_eq!(mapping[&5], 5..6); // b
        assert_eq!(mapping[&6], 6..7); // c
        assert_eq!(mapping[&7], 7..11); // d
        assert_eq!(mapping[&8], 11..12); // e
        assert_eq!(mapping[&9], 12..13); // f
        assert_eq!(mapping[&10], 13..14); // g

        assert_eq!(mapping[&11], 4..5); // a
        assert_eq!(mapping[&12], 5..6); // b
        assert_eq!(mapping[&13], 6..7); // x
        assert_eq!(mapping[&14], 7..8); // c
        assert_eq!(mapping[&15], 8..9); // d
        assert_eq!(mapping[&16], 9..10); // k
        assert_eq!(mapping[&17], 10..11); // j
        assert_eq!(mapping[&18], 11..12); // e
        assert_eq!(mapping[&19], 12..13); // f
    }

    #[test]
    fn simple_growing() {
        let query = ["new", "york", "subway"];
        //             0       1       2
        let mut builder = QueryWordsMapper::new(&query);

        // new york = new york city
        builder.declare(0..2, 3, &["new", "york", "city"]);
        //                    ^      3       4       5

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // new
        assert_eq!(mapping[&1], 1..3); // york
        assert_eq!(mapping[&2], 3..4); // subway
        assert_eq!(mapping[&3], 0..1); // new
        assert_eq!(mapping[&4], 1..2); // york
        assert_eq!(mapping[&5], 2..3); // city
    }

    #[test]
    fn same_place_growings() {
        let query = ["NY", "subway"];
        //            0       1
        let mut builder = QueryWordsMapper::new(&query);

        // NY = new york
        builder.declare(0..1, 2, &["new", "york"]);
        //                    ^      2       3

        // NY = new york city
        builder.declare(0..1, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // NY = NYC
        builder.declare(0..1, 7, &["NYC"]);
        //                    ^      7

        // NY = new york city
        builder.declare(0..1, 8, &["new", "york", "city"]);
        //                    ^      8       9       10

        // subway = underground train
        builder.declare(1..2, 11, &["underground", "train"]);
        //                    ^^        11            12

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..3); // NY
        assert_eq!(mapping[&1], 3..5); // subway
        assert_eq!(mapping[&2], 0..1); // new
        assert_eq!(mapping[&3], 1..3); // york
        assert_eq!(mapping[&4], 0..1); // new
        assert_eq!(mapping[&5], 1..2); // york
        assert_eq!(mapping[&6], 2..3); // city
        assert_eq!(mapping[&7], 0..3); // NYC
        assert_eq!(mapping[&8], 0..1); // new
        assert_eq!(mapping[&9], 1..2); // york
        assert_eq!(mapping[&10], 2..3); // city
        assert_eq!(mapping[&11], 3..4); // underground
        assert_eq!(mapping[&12], 4..5); // train
    }

    #[test]
    fn bigger_growing() {
        let query = ["NYC", "subway"];
        //             0       1
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city
        builder.declare(0..1, 2, &["new", "york", "city"]);
        //                    ^      2       3       4

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..3); // NYC
        assert_eq!(mapping[&1], 3..4); // subway
        assert_eq!(mapping[&2], 0..1); // new
        assert_eq!(mapping[&3], 1..2); // york
        assert_eq!(mapping[&4], 2..3); // city
    }

    #[test]
    fn middle_query_growing() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // great
        assert_eq!(mapping[&1], 1..2); // awesome
        assert_eq!(mapping[&2], 2..5); // NYC
        assert_eq!(mapping[&3], 5..6); // subway
        assert_eq!(mapping[&4], 2..3); // new
        assert_eq!(mapping[&5], 3..4); // york
        assert_eq!(mapping[&6], 4..5); // city
    }

    #[test]
    fn end_query_growing() {
        let query = ["NYC", "subway"];
        //             0       1
        let mut builder = QueryWordsMapper::new(&query);

        // subway = underground train
        builder.declare(1..2, 2, &["underground", "train"]);
        //                    ^        2             3

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // NYC
        assert_eq!(mapping[&1], 1..3); // subway
        assert_eq!(mapping[&2], 1..2); // underground
        assert_eq!(mapping[&3], 2..3); // train
    }

    #[test]
    fn multiple_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^        7             8

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // great
        assert_eq!(mapping[&1], 1..2); // awesome
        assert_eq!(mapping[&2], 2..5); // NYC
        assert_eq!(mapping[&3], 5..7); // subway
        assert_eq!(mapping[&4], 2..3); // new
        assert_eq!(mapping[&5], 3..4); // york
        assert_eq!(mapping[&6], 4..5); // city
        assert_eq!(mapping[&7], 5..6); // underground
        assert_eq!(mapping[&8], 6..7); // train
    }

    #[test]
    fn multiple_probable_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^        7             8

        // great awesome = good
        builder.declare(0..2, 9, &["good"]);
        //                    ^      9

        // awesome NYC = NY
        builder.declare(1..3, 10, &["NY"]);
        //                    ^^      10

        // NYC subway = metro
        builder.declare(2..4, 11, &["metro"]);
        //                    ^^      11

        let mapping = builder.mapping();

        assert_eq!(mapping[&0], 0..1); // great
        assert_eq!(mapping[&1], 1..2); // awesome
        assert_eq!(mapping[&2], 2..5); // NYC
        assert_eq!(mapping[&3], 5..7); // subway
        assert_eq!(mapping[&4], 2..3); // new
        assert_eq!(mapping[&5], 3..4); // york
        assert_eq!(mapping[&6], 4..5); // city
        assert_eq!(mapping[&7], 5..6); // underground
        assert_eq!(mapping[&8], 6..7); // train
        assert_eq!(mapping[&9], 0..2); // good
        assert_eq!(mapping[&10], 1..5); // NY
        assert_eq!(mapping[&11], 2..7); // metro
    }
}