mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
Merge branch 'search-refactor-typo-attributes' into search-refactor
This commit is contained in:
commit
644e136aee
@ -4,10 +4,14 @@ use std::hash::Hash;
|
|||||||
|
|
||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesEncode, Database, RoTxn};
|
use heed::{BytesDecode, BytesEncode, Database, RoTxn};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::interner::Interned;
|
use super::interner::Interned;
|
||||||
use crate::{Result, SearchContext};
|
use super::Word;
|
||||||
|
use crate::{
|
||||||
|
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
||||||
|
};
|
||||||
|
|
||||||
/// A cache storing pointers to values in the LMDB databases.
|
/// A cache storing pointers to values in the LMDB databases.
|
||||||
///
|
///
|
||||||
@ -25,6 +29,7 @@ pub struct DatabaseCache<'ctx> {
|
|||||||
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
|
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
|
|
||||||
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||||
@ -64,28 +69,103 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> {
|
||||||
|
match word {
|
||||||
|
Word::Original(word) => {
|
||||||
|
let exact = self.get_db_exact_word_docids(word)?;
|
||||||
|
let tolerant = self.get_db_word_docids(word)?;
|
||||||
|
Ok(match (exact, tolerant) {
|
||||||
|
(None, None) => None,
|
||||||
|
(None, Some(tolerant)) => Some(tolerant),
|
||||||
|
(Some(exact), None) => Some(exact),
|
||||||
|
(Some(exact), Some(tolerant)) => {
|
||||||
|
let mut both = exact;
|
||||||
|
both |= tolerant;
|
||||||
|
Some(both)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Word::Derived(word) => self.get_db_word_docids(word),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Retrieve or insert the given value in the `word_docids` database.
|
/// Retrieve or insert the given value in the `word_docids` database.
|
||||||
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
|
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
word,
|
word,
|
||||||
self.word_interner.get(word).as_str(),
|
self.word_interner.get(word).as_str(),
|
||||||
&mut self.db_cache.word_docids,
|
&mut self.db_cache.word_docids,
|
||||||
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_db_exact_word_docids(
|
||||||
|
&mut self,
|
||||||
|
word: Interned<String>,
|
||||||
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
word,
|
||||||
|
self.word_interner.get(word).as_str(),
|
||||||
|
&mut self.db_cache.exact_word_docids,
|
||||||
|
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)?
|
||||||
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
|
||||||
|
match prefix {
|
||||||
|
Word::Original(prefix) => {
|
||||||
|
let exact = self.get_db_exact_word_prefix_docids(prefix)?;
|
||||||
|
let tolerant = self.get_db_word_prefix_docids(prefix)?;
|
||||||
|
Ok(match (exact, tolerant) {
|
||||||
|
(None, None) => None,
|
||||||
|
(None, Some(tolerant)) => Some(tolerant),
|
||||||
|
(Some(exact), None) => Some(exact),
|
||||||
|
(Some(exact), Some(tolerant)) => {
|
||||||
|
let mut both = exact;
|
||||||
|
both |= tolerant;
|
||||||
|
Some(both)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
||||||
pub fn get_db_word_prefix_docids(
|
fn get_db_word_prefix_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
prefix: Interned<String>,
|
prefix: Interned<String>,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
prefix,
|
prefix,
|
||||||
self.word_interner.get(prefix).as_str(),
|
self.word_interner.get(prefix).as_str(),
|
||||||
&mut self.db_cache.word_prefix_docids,
|
&mut self.db_cache.word_prefix_docids,
|
||||||
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_db_exact_word_prefix_docids(
|
||||||
|
&mut self,
|
||||||
|
prefix: Interned<String>,
|
||||||
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
prefix,
|
||||||
|
self.word_interner.get(prefix).as_str(),
|
||||||
|
&mut self.db_cache.exact_word_prefix_docids,
|
||||||
|
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)?
|
||||||
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_pair_proximity_docids(
|
pub fn get_db_word_pair_proximity_docids(
|
||||||
@ -93,7 +173,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
word1: Interned<String>,
|
word1: Interned<String>,
|
||||||
word2: Interned<String>,
|
word2: Interned<String>,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
(proximity, word1, word2),
|
(proximity, word1, word2),
|
||||||
@ -104,7 +184,32 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
),
|
),
|
||||||
&mut self.db_cache.word_pair_proximity_docids,
|
&mut self.db_cache.word_pair_proximity_docids,
|
||||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_db_word_pair_proximity_docids_len(
|
||||||
|
&mut self,
|
||||||
|
word1: Interned<String>,
|
||||||
|
word2: Interned<String>,
|
||||||
|
proximity: u8,
|
||||||
|
) -> Result<Option<u64>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
(proximity, word1, word2),
|
||||||
|
&(
|
||||||
|
proximity,
|
||||||
|
self.word_interner.get(word1).as_str(),
|
||||||
|
self.word_interner.get(word2).as_str(),
|
||||||
|
),
|
||||||
|
&mut self.db_cache.word_pair_proximity_docids,
|
||||||
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)?
|
||||||
|
.map(|bytes| {
|
||||||
|
CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())
|
||||||
|
})
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_prefix_pair_proximity_docids(
|
pub fn get_db_word_prefix_pair_proximity_docids(
|
||||||
@ -112,7 +217,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
word1: Interned<String>,
|
word1: Interned<String>,
|
||||||
prefix2: Interned<String>,
|
prefix2: Interned<String>,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
(proximity, word1, prefix2),
|
(proximity, word1, prefix2),
|
||||||
@ -123,14 +228,16 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
),
|
),
|
||||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
||||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
left_prefix: Interned<String>,
|
left_prefix: Interned<String>,
|
||||||
right: Interned<String>,
|
right: Interned<String>,
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
(proximity, left_prefix, right),
|
(proximity, left_prefix, right),
|
||||||
@ -141,34 +248,40 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
),
|
),
|
||||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
||||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_position_docids(
|
pub fn get_db_word_position_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
word: Interned<String>,
|
word: Interned<String>,
|
||||||
position: u16,
|
position: u16,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
(word, position),
|
(word, position),
|
||||||
&(self.word_interner.get(word).as_str(), position),
|
&(self.word_interner.get(word).as_str(), position),
|
||||||
&mut self.db_cache.word_position_docids,
|
&mut self.db_cache.word_position_docids,
|
||||||
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_fid_docids(
|
pub fn get_db_word_fid_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
word: Interned<String>,
|
word: Interned<String>,
|
||||||
fid: u16,
|
fid: u16,
|
||||||
) -> Result<Option<&'ctx [u8]>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
self.txn,
|
self.txn,
|
||||||
(word, fid),
|
(word, fid),
|
||||||
&(self.word_interner.get(word).as_str(), fid),
|
&(self.word_interner.get(word).as_str(), fid),
|
||||||
&mut self.db_cache.word_fid_docids,
|
&mut self.db_cache.word_fid_docids,
|
||||||
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
||||||
)
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
use heed::BytesDecode;
|
|
||||||
use roaring::{MultiOps, RoaringBitmap};
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
use super::query_graph::QueryGraph;
|
use super::query_graph::QueryGraph;
|
||||||
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||||
use crate::search::new::query_graph::QueryNodeData;
|
use crate::search::new::query_graph::QueryNodeData;
|
||||||
use crate::search::new::query_term::ExactTerm;
|
use crate::search::new::query_term::ExactTerm;
|
||||||
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
use crate::{Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
/// A ranking rule that produces 3 disjoint buckets:
|
/// A ranking rule that produces 3 disjoint buckets:
|
||||||
///
|
///
|
||||||
@ -161,10 +160,8 @@ impl State {
|
|||||||
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
|
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
|
||||||
// longer phrases we'll be losing on precision here.
|
// longer phrases we'll be losing on precision here.
|
||||||
let bucketed_position = crate::bucketed_position(position + offset);
|
let bucketed_position = crate::bucketed_position(position + offset);
|
||||||
let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
|
let word_position_docids =
|
||||||
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(),
|
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default();
|
||||||
)
|
|
||||||
.unwrap_or_default();
|
|
||||||
candidates &= word_position_docids;
|
candidates &= word_position_docids;
|
||||||
if candidates.is_empty() {
|
if candidates.is_empty() {
|
||||||
return Ok(State::Empty(query_graph.clone()));
|
return Ok(State::Empty(query_graph.clone()));
|
||||||
@ -191,11 +188,7 @@ impl State {
|
|||||||
// ignore stop words words in phrases
|
// ignore stop words words in phrases
|
||||||
.flatten()
|
.flatten()
|
||||||
.map(|word| -> Result<_> {
|
.map(|word| -> Result<_> {
|
||||||
Ok(ctx
|
Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default())
|
||||||
.get_db_word_fid_docids(*word, fid)?
|
|
||||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
|
||||||
.unwrap_or_default()
|
|
||||||
.unwrap_or_default())
|
|
||||||
}),
|
}),
|
||||||
)?;
|
)?;
|
||||||
intersection &= &candidates;
|
intersection &= &candidates;
|
||||||
|
@ -427,14 +427,14 @@ fill: \"#B6E2D3\"
|
|||||||
)?;
|
)?;
|
||||||
|
|
||||||
for w in term_subset.all_single_words_except_prefix_db(ctx)? {
|
for w in term_subset.all_single_words_except_prefix_db(ctx)? {
|
||||||
let w = ctx.word_interner.get(w);
|
let w = ctx.word_interner.get(w.interned());
|
||||||
writeln!(file, "{w}: word")?;
|
writeln!(file, "{w}: word")?;
|
||||||
}
|
}
|
||||||
for p in term_subset.all_phrases(ctx)? {
|
for p in term_subset.all_phrases(ctx)? {
|
||||||
writeln!(file, "{}: phrase", p.description(ctx))?;
|
writeln!(file, "{}: phrase", p.description(ctx))?;
|
||||||
}
|
}
|
||||||
if let Some(w) = term_subset.use_prefix_db(ctx) {
|
if let Some(w) = term_subset.use_prefix_db(ctx) {
|
||||||
let w = ctx.word_interner.get(w);
|
let w = ctx.word_interner.get(w.interned());
|
||||||
writeln!(file, "{w}: prefix db")?;
|
writeln!(file, "{w}: prefix db")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,6 +50,8 @@ use ranking_rules::{BoxRankingRule, RankingRule};
|
|||||||
use resolve_query_graph::compute_query_graph_docids;
|
use resolve_query_graph::compute_query_graph_docids;
|
||||||
use sort::Sort;
|
use sort::Sort;
|
||||||
|
|
||||||
|
use self::interner::Interned;
|
||||||
|
|
||||||
/// A structure used throughout the execution of a search query.
|
/// A structure used throughout the execution of a search query.
|
||||||
pub struct SearchContext<'ctx> {
|
pub struct SearchContext<'ctx> {
|
||||||
pub index: &'ctx Index,
|
pub index: &'ctx Index,
|
||||||
@ -75,6 +77,21 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
|
||||||
|
pub enum Word {
|
||||||
|
Original(Interned<String>),
|
||||||
|
Derived(Interned<String>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Word {
|
||||||
|
pub fn interned(&self) -> Interned<String> {
|
||||||
|
match self {
|
||||||
|
Word::Original(word) => *word,
|
||||||
|
Word::Derived(word) => *word,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
|
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn resolve_maximally_reduced_query_graph(
|
fn resolve_maximally_reduced_query_graph(
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
use fst::automaton::Str;
|
|
||||||
use fst::{Automaton, IntoStreamer, Streamer};
|
|
||||||
use heed::types::DecodeIgnore;
|
|
||||||
use heed::BytesDecode;
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
|
|
||||||
|
use fst::automaton::Str;
|
||||||
|
use fst::{Automaton, IntoStreamer, Streamer};
|
||||||
|
use heed::types::DecodeIgnore;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
use crate::search::new::query_term::TwoTypoTerm;
|
use crate::search::new::query_term::TwoTypoTerm;
|
||||||
use crate::search::new::{limits, SearchContext};
|
use crate::search::new::{limits, SearchContext};
|
||||||
use crate::search::{build_dfa, get_first};
|
use crate::search::{build_dfa, get_first};
|
||||||
use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
|
use crate::{Result, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub enum NumberOfTypos {
|
pub enum NumberOfTypos {
|
||||||
@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word(
|
|||||||
word: &str,
|
word: &str,
|
||||||
max_typo: u8,
|
max_typo: u8,
|
||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
|
is_ngram: bool,
|
||||||
) -> Result<QueryTerm> {
|
) -> Result<QueryTerm> {
|
||||||
let word_interned = ctx.word_interner.insert(word.to_owned());
|
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||||
|
|
||||||
@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word(
|
|||||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||||
|
|
||||||
let use_prefix_db = is_prefix
|
let use_prefix_db = is_prefix
|
||||||
&& ctx
|
&& (ctx
|
||||||
.index
|
.index
|
||||||
.word_prefix_docids
|
.word_prefix_docids
|
||||||
.remap_data_type::<DecodeIgnore>()
|
.remap_data_type::<DecodeIgnore>()
|
||||||
.get(ctx.txn, word)?
|
.get(ctx.txn, word)?
|
||||||
.is_some();
|
.is_some()
|
||||||
|
|| (!is_ngram
|
||||||
|
&& ctx
|
||||||
|
.index
|
||||||
|
.exact_word_prefix_docids
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.get(ctx.txn, word)?
|
||||||
|
.is_some()));
|
||||||
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
||||||
|
|
||||||
let mut zero_typo = None;
|
let mut zero_typo = None;
|
||||||
@ -385,9 +393,7 @@ fn split_best_frequency(
|
|||||||
let left = ctx.word_interner.insert(left.to_owned());
|
let left = ctx.word_interner.insert(left.to_owned());
|
||||||
let right = ctx.word_interner.insert(right.to_owned());
|
let right = ctx.word_interner.insert(right.to_owned());
|
||||||
|
|
||||||
if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
|
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
|
||||||
let frequency =
|
|
||||||
CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
|
|
||||||
if best.map_or(true, |(old, _, _)| frequency > old) {
|
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||||
best = Some((frequency, left, right));
|
best = Some((frequency, left, right));
|
||||||
}
|
}
|
||||||
|
@ -3,18 +3,18 @@ mod ntypo_subset;
|
|||||||
mod parse_query;
|
mod parse_query;
|
||||||
mod phrase;
|
mod phrase;
|
||||||
|
|
||||||
use super::interner::{DedupInterner, Interned};
|
|
||||||
use super::{limits, SearchContext};
|
|
||||||
use crate::Result;
|
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::ops::RangeInclusive;
|
use std::ops::RangeInclusive;
|
||||||
|
|
||||||
|
use compute_derivations::partially_initialized_term_from_word;
|
||||||
use either::Either;
|
use either::Either;
|
||||||
pub use ntypo_subset::NTypoTermSubset;
|
pub use ntypo_subset::NTypoTermSubset;
|
||||||
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
||||||
pub use phrase::Phrase;
|
pub use phrase::Phrase;
|
||||||
|
|
||||||
use compute_derivations::partially_initialized_term_from_word;
|
use super::interner::{DedupInterner, Interned};
|
||||||
|
use super::{limits, SearchContext, Word};
|
||||||
|
use crate::Result;
|
||||||
|
|
||||||
/// A set of word derivations attached to a location in the search query.
|
/// A set of word derivations attached to a location in the search query.
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
@ -159,12 +159,12 @@ impl QueryTermSubset {
|
|||||||
self.two_typo_subset.intersect(&other.two_typo_subset);
|
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
|
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> {
|
||||||
let original = ctx.term_interner.get(self.original);
|
let original = ctx.term_interner.get(self.original);
|
||||||
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
||||||
return None
|
return None
|
||||||
};
|
};
|
||||||
match &self.zero_typo_subset {
|
let word = match &self.zero_typo_subset {
|
||||||
NTypoTermSubset::All => Some(use_prefix_db),
|
NTypoTermSubset::All => Some(use_prefix_db),
|
||||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
// TODO: use a subset of prefix words instead
|
// TODO: use a subset of prefix words instead
|
||||||
@ -175,12 +175,19 @@ impl QueryTermSubset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Nothing => None,
|
NTypoTermSubset::Nothing => None,
|
||||||
}
|
};
|
||||||
|
word.map(|word| {
|
||||||
|
if original.ngram_words.is_some() {
|
||||||
|
Word::Derived(word)
|
||||||
|
} else {
|
||||||
|
Word::Original(word)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
pub fn all_single_words_except_prefix_db(
|
pub fn all_single_words_except_prefix_db(
|
||||||
&self,
|
&self,
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
) -> Result<BTreeSet<Interned<String>>> {
|
) -> Result<BTreeSet<Word>> {
|
||||||
let mut result = BTreeSet::default();
|
let mut result = BTreeSet::default();
|
||||||
// TODO: a compute_partially funtion
|
// TODO: a compute_partially funtion
|
||||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||||
@ -197,8 +204,20 @@ impl QueryTermSubset {
|
|||||||
synonyms: _,
|
synonyms: _,
|
||||||
use_prefix_db: _,
|
use_prefix_db: _,
|
||||||
} = &original.zero_typo;
|
} = &original.zero_typo;
|
||||||
result.extend(zero_typo.iter().copied());
|
result.extend(zero_typo.iter().copied().map(|w| {
|
||||||
result.extend(prefix_of.iter().copied());
|
if original.ngram_words.is_some() {
|
||||||
|
Word::Derived(w)
|
||||||
|
} else {
|
||||||
|
Word::Original(w)
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
result.extend(prefix_of.iter().copied().map(|w| {
|
||||||
|
if original.ngram_words.is_some() {
|
||||||
|
Word::Derived(w)
|
||||||
|
} else {
|
||||||
|
Word::Original(w)
|
||||||
|
}
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
let ZeroTypoTerm {
|
let ZeroTypoTerm {
|
||||||
@ -210,10 +229,20 @@ impl QueryTermSubset {
|
|||||||
} = &original.zero_typo;
|
} = &original.zero_typo;
|
||||||
if let Some(zero_typo) = zero_typo {
|
if let Some(zero_typo) = zero_typo {
|
||||||
if words.contains(zero_typo) {
|
if words.contains(zero_typo) {
|
||||||
result.insert(*zero_typo);
|
if original.ngram_words.is_some() {
|
||||||
|
result.insert(Word::Derived(*zero_typo));
|
||||||
|
} else {
|
||||||
|
result.insert(Word::Original(*zero_typo));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result.extend(prefix_of.intersection(words).copied());
|
result.extend(prefix_of.intersection(words).copied().map(|w| {
|
||||||
|
if original.ngram_words.is_some() {
|
||||||
|
Word::Derived(w)
|
||||||
|
} else {
|
||||||
|
Word::Original(w)
|
||||||
|
}
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Nothing => {}
|
NTypoTermSubset::Nothing => {}
|
||||||
}
|
}
|
||||||
@ -223,13 +252,13 @@ impl QueryTermSubset {
|
|||||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||||
panic!()
|
panic!()
|
||||||
};
|
};
|
||||||
result.extend(one_typo.iter().copied())
|
result.extend(one_typo.iter().copied().map(Word::Derived))
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||||
panic!()
|
panic!()
|
||||||
};
|
};
|
||||||
result.extend(one_typo.intersection(words));
|
result.extend(one_typo.intersection(words).copied().map(Word::Derived));
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Nothing => {}
|
NTypoTermSubset::Nothing => {}
|
||||||
};
|
};
|
||||||
@ -239,13 +268,13 @@ impl QueryTermSubset {
|
|||||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||||
panic!()
|
panic!()
|
||||||
};
|
};
|
||||||
result.extend(two_typos.iter().copied());
|
result.extend(two_typos.iter().copied().map(Word::Derived));
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||||
panic!()
|
panic!()
|
||||||
};
|
};
|
||||||
result.extend(two_typos.intersection(words));
|
result.extend(two_typos.intersection(words).copied().map(Word::Derived));
|
||||||
}
|
}
|
||||||
NTypoTermSubset::Nothing => {}
|
NTypoTermSubset::Nothing => {}
|
||||||
};
|
};
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
|
use charabia::normalizer::NormalizedTokenIter;
|
||||||
|
use charabia::{SeparatorKind, TokenKind};
|
||||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
/// Convert the tokenised search query into a list of located query terms.
|
/// Convert the tokenised search query into a list of located query terms.
|
||||||
// TODO: checking if the positions are correct for phrases, separators, ngrams
|
// TODO: checking if the positions are correct for phrases, separators, ngrams
|
||||||
@ -51,6 +51,7 @@ pub fn located_query_terms_from_string(
|
|||||||
word,
|
word,
|
||||||
nbr_typos(word),
|
nbr_typos(word),
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
)?;
|
)?;
|
||||||
let located_term = LocatedQueryTerm {
|
let located_term = LocatedQueryTerm {
|
||||||
value: ctx.term_interner.push(term),
|
value: ctx.term_interner.push(term),
|
||||||
@ -62,8 +63,13 @@ pub fn located_query_terms_from_string(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let word = token.lemma();
|
let word = token.lemma();
|
||||||
let term =
|
let term = partially_initialized_term_from_word(
|
||||||
partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
|
ctx,
|
||||||
|
word,
|
||||||
|
nbr_typos(word),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
)?;
|
||||||
let located_term = LocatedQueryTerm {
|
let located_term = LocatedQueryTerm {
|
||||||
value: ctx.term_interner.push(term),
|
value: ctx.term_interner.push(term),
|
||||||
positions: position..=position,
|
positions: position..=position,
|
||||||
@ -195,7 +201,8 @@ pub fn make_ngram(
|
|||||||
let max_nbr_typos =
|
let max_nbr_typos =
|
||||||
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
||||||
|
|
||||||
let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
|
let mut term =
|
||||||
|
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
|
||||||
|
|
||||||
// Now add the synonyms
|
// Now add the synonyms
|
||||||
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
use heed::BytesDecode;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||||
use crate::search::new::interner::{DedupInterner, Interned};
|
use crate::search::new::interner::{DedupInterner, Interned};
|
||||||
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
|
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
|
||||||
use crate::{Result, RoaringBitmapCodec, SearchContext};
|
use crate::search::new::Word;
|
||||||
|
use crate::{Result, SearchContext};
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
pub enum ExactnessCondition {
|
pub enum ExactnessCondition {
|
||||||
@ -27,8 +27,8 @@ fn compute_docids(
|
|||||||
let mut candidates = match exact_term {
|
let mut candidates = match exact_term {
|
||||||
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
|
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
|
||||||
ExactTerm::Word(word) => {
|
ExactTerm::Word(word) => {
|
||||||
if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
|
if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? {
|
||||||
RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)?
|
word_candidates
|
||||||
} else {
|
} else {
|
||||||
return Ok(Default::default());
|
return Ok(Default::default());
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
use heed::BytesDecode;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::ProximityCondition;
|
use super::ProximityCondition;
|
||||||
@ -10,8 +9,8 @@ use crate::search::new::interner::Interned;
|
|||||||
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
||||||
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
||||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::search::new::SearchContext;
|
use crate::search::new::{SearchContext, Word};
|
||||||
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
|
use crate::Result;
|
||||||
|
|
||||||
pub fn compute_docids(
|
pub fn compute_docids(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
@ -55,8 +54,8 @@ pub fn compute_docids(
|
|||||||
{
|
{
|
||||||
compute_prefix_edges(
|
compute_prefix_edges(
|
||||||
ctx,
|
ctx,
|
||||||
left_word,
|
left_word.interned(),
|
||||||
right_prefix,
|
right_prefix.interned(),
|
||||||
left_phrase,
|
left_phrase,
|
||||||
forward_proximity,
|
forward_proximity,
|
||||||
backward_proximity,
|
backward_proximity,
|
||||||
@ -92,9 +91,7 @@ pub fn compute_docids(
|
|||||||
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
|
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
|
} else if let Some(left_word_docids) = ctx.word_docids(left_word)? {
|
||||||
let left_word_docids =
|
|
||||||
RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?;
|
|
||||||
if universe.is_disjoint(&left_word_docids) {
|
if universe.is_disjoint(&left_word_docids) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -104,7 +101,7 @@ pub fn compute_docids(
|
|||||||
for (right_word, right_phrase) in right_derivs {
|
for (right_word, right_phrase) in right_derivs {
|
||||||
compute_non_prefix_edges(
|
compute_non_prefix_edges(
|
||||||
ctx,
|
ctx,
|
||||||
left_word,
|
left_word.interned(),
|
||||||
right_word,
|
right_word,
|
||||||
left_phrase,
|
left_phrase,
|
||||||
right_phrase,
|
right_phrase,
|
||||||
@ -155,7 +152,7 @@ fn compute_prefix_edges(
|
|||||||
if let Some(new_docids) =
|
if let Some(new_docids) =
|
||||||
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
|
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
|
||||||
{
|
{
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & new_docids;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_left_words.insert(left_word);
|
used_left_words.insert(left_word);
|
||||||
used_right_prefix.insert(right_prefix);
|
used_right_prefix.insert(right_prefix);
|
||||||
@ -170,7 +167,7 @@ fn compute_prefix_edges(
|
|||||||
left_word,
|
left_word,
|
||||||
backward_proximity,
|
backward_proximity,
|
||||||
)? {
|
)? {
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & new_docids;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_left_words.insert(left_word);
|
used_left_words.insert(left_word);
|
||||||
used_right_prefix.insert(right_prefix);
|
used_right_prefix.insert(right_prefix);
|
||||||
@ -217,7 +214,7 @@ fn compute_non_prefix_edges(
|
|||||||
if let Some(new_docids) =
|
if let Some(new_docids) =
|
||||||
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
|
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
|
||||||
{
|
{
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & new_docids;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_left_words.insert(word1);
|
used_left_words.insert(word1);
|
||||||
used_right_words.insert(word2);
|
used_right_words.insert(word2);
|
||||||
@ -231,7 +228,7 @@ fn compute_non_prefix_edges(
|
|||||||
if let Some(new_docids) =
|
if let Some(new_docids) =
|
||||||
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
|
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
|
||||||
{
|
{
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & new_docids;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_left_words.insert(word2);
|
used_left_words.insert(word2);
|
||||||
used_right_words.insert(word1);
|
used_right_words.insert(word1);
|
||||||
@ -246,7 +243,7 @@ fn compute_non_prefix_edges(
|
|||||||
fn last_words_of_term_derivations(
|
fn last_words_of_term_derivations(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
t: &QueryTermSubset,
|
t: &QueryTermSubset,
|
||||||
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> {
|
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Word)>> {
|
||||||
let mut result = BTreeSet::new();
|
let mut result = BTreeSet::new();
|
||||||
|
|
||||||
for w in t.all_single_words_except_prefix_db(ctx)? {
|
for w in t.all_single_words_except_prefix_db(ctx)? {
|
||||||
@ -256,7 +253,7 @@ fn last_words_of_term_derivations(
|
|||||||
let phrase = ctx.phrase_interner.get(p);
|
let phrase = ctx.phrase_interner.get(p);
|
||||||
let last_term_of_phrase = phrase.words.last().unwrap();
|
let last_term_of_phrase = phrase.words.last().unwrap();
|
||||||
if let Some(last_word) = last_term_of_phrase {
|
if let Some(last_word) = last_term_of_phrase {
|
||||||
result.insert((Some(p), *last_word));
|
result.insert((Some(p), Word::Original(*last_word)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -269,7 +266,7 @@ fn first_word_of_term_iter(
|
|||||||
let mut result = BTreeSet::new();
|
let mut result = BTreeSet::new();
|
||||||
let all_words = t.all_single_words_except_prefix_db(ctx)?;
|
let all_words = t.all_single_words_except_prefix_db(ctx)?;
|
||||||
for w in all_words {
|
for w in all_words {
|
||||||
result.insert((w, None));
|
result.insert((w.interned(), None));
|
||||||
}
|
}
|
||||||
for p in t.all_phrases(ctx)? {
|
for p in t.all_phrases(ctx)? {
|
||||||
let phrase = ctx.phrase_interner.get(p);
|
let phrase = ctx.phrase_interner.get(p);
|
||||||
|
@ -3,16 +3,15 @@
|
|||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::BytesDecode;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::interner::Interned;
|
use super::interner::Interned;
|
||||||
use super::query_graph::QueryNodeData;
|
use super::query_graph::QueryNodeData;
|
||||||
use super::query_term::{Phrase, QueryTermSubset};
|
use super::query_term::{Phrase, QueryTermSubset};
|
||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::{QueryGraph, SearchContext};
|
use super::{QueryGraph, SearchContext, Word};
|
||||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
|
use crate::Result;
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct PhraseDocIdsCache {
|
pub struct PhraseDocIdsCache {
|
||||||
@ -36,8 +35,8 @@ pub fn compute_query_term_subset_docids(
|
|||||||
) -> Result<RoaringBitmap> {
|
) -> Result<RoaringBitmap> {
|
||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
for word in term.all_single_words_except_prefix_db(ctx)? {
|
for word in term.all_single_words_except_prefix_db(ctx)? {
|
||||||
if let Some(word_docids) = ctx.get_db_word_docids(word)? {
|
if let Some(word_docids) = ctx.word_docids(word)? {
|
||||||
docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?;
|
docids |= word_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for phrase in term.all_phrases(ctx)? {
|
for phrase in term.all_phrases(ctx)? {
|
||||||
@ -45,9 +44,8 @@ pub fn compute_query_term_subset_docids(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(prefix) = term.use_prefix_db(ctx) {
|
if let Some(prefix) = term.use_prefix_db(ctx) {
|
||||||
if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? {
|
if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? {
|
||||||
docids |=
|
docids |= prefix_docids;
|
||||||
RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -127,9 +125,8 @@ pub fn compute_phrase_docids(
|
|||||||
}
|
}
|
||||||
if words.len() == 1 {
|
if words.len() == 1 {
|
||||||
if let Some(word) = &words[0] {
|
if let Some(word) = &words[0] {
|
||||||
if let Some(word_docids) = ctx.get_db_word_docids(*word)? {
|
if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
|
||||||
return RoaringBitmapCodec::bytes_decode(word_docids)
|
return Ok(word_docids);
|
||||||
.ok_or(heed::Error::Decoding.into());
|
|
||||||
} else {
|
} else {
|
||||||
return Ok(RoaringBitmap::new());
|
return Ok(RoaringBitmap::new());
|
||||||
}
|
}
|
||||||
@ -158,7 +155,7 @@ pub fn compute_phrase_docids(
|
|||||||
{
|
{
|
||||||
if dist == 0 {
|
if dist == 0 {
|
||||||
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
|
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
|
||||||
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
|
Some(m) => bitmaps.push(m),
|
||||||
// If there are no documents for this pair, there will be no
|
// If there are no documents for this pair, there will be no
|
||||||
// results for the phrase query.
|
// results for the phrase query.
|
||||||
None => return Ok(RoaringBitmap::new()),
|
None => return Ok(RoaringBitmap::new()),
|
||||||
@ -169,7 +166,7 @@ pub fn compute_phrase_docids(
|
|||||||
if let Some(m) =
|
if let Some(m) =
|
||||||
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
|
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
|
||||||
{
|
{
|
||||||
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
|
bitmap |= m;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if bitmap.is_empty() {
|
if bitmap.is_empty() {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user