mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-05 04:28:55 +01:00
Merge pull request #122 from meilisearch/attribute-criterion
Introduce the Attribute criterion
This commit is contained in:
commit
5a10de1b9f
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -122,6 +122,12 @@ version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
|
||||
|
||||
[[package]]
|
||||
name = "big_s"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8"
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.3.1"
|
||||
@ -1251,6 +1257,7 @@ name = "milli"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"big_s",
|
||||
"bstr",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
@ -1957,9 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1"
|
||||
|
||||
[[package]]
|
||||
name = "roaring"
|
||||
version = "0.6.5"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f"
|
||||
checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
|
@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::fs::{create_dir_all, File};
|
||||
use std::net::SocketAddr;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@ -228,7 +228,6 @@ enum UpdateMeta {
|
||||
ClearDocuments,
|
||||
Settings(Settings),
|
||||
Facets(Facets),
|
||||
WordsPrefixes(WordsPrefixes),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -281,6 +280,14 @@ struct WordsPrefixes {
|
||||
max_prefix_length: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct WordsLevelPositions {
|
||||
level_group_size: Option<NonZeroU32>,
|
||||
min_level_size: Option<NonZeroU32>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let opt = Opt::from_args();
|
||||
@ -479,21 +486,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
Err(e) => Err(e)
|
||||
}
|
||||
}
|
||||
UpdateMeta::WordsPrefixes(settings) => {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut wtxn = index_cloned.write_txn()?;
|
||||
let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned);
|
||||
if let Some(value) = settings.threshold {
|
||||
builder.threshold(value);
|
||||
}
|
||||
if let Some(value) = settings.max_prefix_length {
|
||||
builder.max_prefix_length(value);
|
||||
}
|
||||
match builder.execute() {
|
||||
Ok(()) => wtxn.commit().map_err(Into::into),
|
||||
Err(e) => Err(e)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let meta = match result {
|
||||
@ -910,19 +902,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
warp::reply()
|
||||
});
|
||||
|
||||
let update_store_cloned = update_store.clone();
|
||||
let update_status_sender_cloned = update_status_sender.clone();
|
||||
let change_words_prefixes_route = warp::filters::method::post()
|
||||
.and(warp::path!("words-prefixes"))
|
||||
.and(warp::body::json())
|
||||
.map(move |settings: WordsPrefixes| {
|
||||
let meta = UpdateMeta::WordsPrefixes(settings);
|
||||
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
||||
eprintln!("update {} registered", update_id);
|
||||
warp::reply()
|
||||
});
|
||||
|
||||
let update_store_cloned = update_store.clone();
|
||||
let update_status_sender_cloned = update_status_sender.clone();
|
||||
let abort_update_id_route = warp::filters::method::delete()
|
||||
@ -997,7 +976,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.or(clearing_route)
|
||||
.or(change_settings_route)
|
||||
.or(change_facet_levels_route)
|
||||
.or(change_words_prefixes_route)
|
||||
.or(update_ws_route);
|
||||
|
||||
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
||||
|
@ -11,7 +11,7 @@ csv = "1.1.5"
|
||||
heed = "0.10.6"
|
||||
jemallocator = "0.3.2"
|
||||
milli = { path = "../milli" }
|
||||
roaring = "0.6.5"
|
||||
roaring = "0.6.6"
|
||||
serde_json = "1.0.62"
|
||||
stderrlog = "0.5.1"
|
||||
structopt = { version = "0.3.21", default-features = false }
|
||||
|
@ -5,7 +5,7 @@ use std::{str, io, fmt};
|
||||
use anyhow::Context;
|
||||
use byte_unit::Byte;
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::Index;
|
||||
use milli::{Index, TreeLevel};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use Command::*;
|
||||
@ -19,9 +19,11 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids";
|
||||
const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids";
|
||||
const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
|
||||
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
|
||||
const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids";
|
||||
const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values";
|
||||
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
|
||||
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
|
||||
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
|
||||
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids";
|
||||
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values";
|
||||
const DOCUMENTS_DB_NAME: &str = "documents";
|
||||
|
||||
const ALL_DATABASE_NAMES: &[&str] = &[
|
||||
@ -31,8 +33,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[
|
||||
DOCID_WORD_POSITIONS_DB_NAME,
|
||||
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
||||
FACET_FIELD_ID_VALUE_DOCIDS_NAME,
|
||||
FIELD_ID_DOCID_FACET_VALUES_NAME,
|
||||
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
|
||||
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
|
||||
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME,
|
||||
FIELD_ID_DOCID_FACET_VALUES_DB_NAME,
|
||||
DOCUMENTS_DB_NAME,
|
||||
];
|
||||
|
||||
@ -114,6 +118,27 @@ enum Command {
|
||||
field_name: String,
|
||||
},
|
||||
|
||||
/// Outputs a CSV with the documents ids along with the word level positions where it appears.
|
||||
WordsLevelPositionsDocids {
|
||||
/// Display the whole documents ids in details.
|
||||
#[structopt(long)]
|
||||
full_display: bool,
|
||||
|
||||
/// Words appearing in the documents.
|
||||
words: Vec<String>,
|
||||
},
|
||||
|
||||
/// Outputs a CSV with the documents ids along with
|
||||
/// the word prefix level positions where it appears.
|
||||
WordPrefixesLevelPositionsDocids {
|
||||
/// Display the whole documents ids in details.
|
||||
#[structopt(long)]
|
||||
full_display: bool,
|
||||
|
||||
/// Prefixes of words appearing in the documents.
|
||||
prefixes: Vec<String>,
|
||||
},
|
||||
|
||||
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
|
||||
DocidsWordsPositions {
|
||||
/// Display the whole positions in detail.
|
||||
@ -221,6 +246,12 @@ fn main() -> anyhow::Result<()> {
|
||||
FacetValuesDocids { full_display, field_name } => {
|
||||
facet_values_docids(&index, &rtxn, !full_display, field_name)
|
||||
},
|
||||
WordsLevelPositionsDocids { full_display, words } => {
|
||||
words_level_positions_docids(&index, &rtxn, !full_display, words)
|
||||
},
|
||||
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
|
||||
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
|
||||
},
|
||||
DocidsWordsPositions { full_display, internal_documents_ids } => {
|
||||
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
|
||||
},
|
||||
@ -319,9 +350,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
word_level_position_docids,
|
||||
word_prefix_level_position_docids,
|
||||
facet_field_id_value_docids,
|
||||
field_id_docid_facet_values: _,
|
||||
documents,
|
||||
documents
|
||||
} = index;
|
||||
|
||||
let main_name = "main";
|
||||
@ -330,6 +363,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
let docid_word_positions_name = "docid_word_positions";
|
||||
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
|
||||
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
||||
let word_level_position_docids_name = "word_level_position_docids";
|
||||
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
|
||||
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
||||
let documents_name = "documents";
|
||||
|
||||
@ -386,6 +421,20 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
}
|
||||
|
||||
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word, level, left, right), value) = result?;
|
||||
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
}
|
||||
|
||||
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||
let ((word, level, left, right), value) = result?;
|
||||
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
|
||||
if heap.len() > limit { heap.pop(); }
|
||||
}
|
||||
|
||||
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
for (field_id, field_type) in faceted_fields {
|
||||
@ -524,6 +573,84 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
||||
fn words_level_positions_docids(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
words: Vec<String>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
|
||||
|
||||
for word in words.iter().map(AsRef::as_ref) {
|
||||
let range = {
|
||||
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
|
||||
left..=right
|
||||
};
|
||||
for result in index.word_level_position_docids.range(rtxn, &range)? {
|
||||
let ((w, level, left, right), docids) = result?;
|
||||
|
||||
let count = docids.len().to_string();
|
||||
let docids = if debug {
|
||||
format!("{:?}", docids)
|
||||
} else {
|
||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||
};
|
||||
let position_range = if level == TreeLevel::min_value() {
|
||||
format!("{:?}", left)
|
||||
} else {
|
||||
format!("{:?}", left..=right)
|
||||
};
|
||||
let level = level.to_string();
|
||||
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
||||
fn word_prefixes_level_positions_docids(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
prefixes: Vec<String>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
|
||||
|
||||
for word in prefixes.iter().map(AsRef::as_ref) {
|
||||
let range = {
|
||||
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
|
||||
left..=right
|
||||
};
|
||||
for result in index.word_prefix_level_position_docids.range(rtxn, &range)? {
|
||||
let ((w, level, left, right), docids) = result?;
|
||||
|
||||
let count = docids.len().to_string();
|
||||
let docids = if debug {
|
||||
format!("{:?}", docids)
|
||||
} else {
|
||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||
};
|
||||
let position_range = if level == TreeLevel::min_value() {
|
||||
format!("{:?}", left)
|
||||
} else {
|
||||
format!("{:?}", left..=right)
|
||||
};
|
||||
let level = level.to_string();
|
||||
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
||||
fn docids_words_positions(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
@ -715,6 +842,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
|
||||
fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
let Index {
|
||||
env: _,
|
||||
main,
|
||||
word_docids,
|
||||
word_prefix_docids,
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
word_level_position_docids,
|
||||
word_prefix_level_position_docids,
|
||||
facet_field_id_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents,
|
||||
} = index;
|
||||
|
||||
let names = if names.is_empty() {
|
||||
ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect()
|
||||
} else {
|
||||
@ -723,30 +865,35 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
||||
|
||||
for name in names {
|
||||
let database = match name.as_str() {
|
||||
MAIN_DB_NAME => &index.main,
|
||||
WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(),
|
||||
WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
|
||||
DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
|
||||
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
|
||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(),
|
||||
FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(),
|
||||
DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
|
||||
MAIN_DB_NAME => &main,
|
||||
WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(),
|
||||
WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(),
|
||||
DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(),
|
||||
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(),
|
||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
|
||||
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
|
||||
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(),
|
||||
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(),
|
||||
DOCUMENTS_DB_NAME => documents.as_polymorph(),
|
||||
unknown => anyhow::bail!("unknown database {:?}", unknown),
|
||||
};
|
||||
|
||||
let mut key_size: u64 = 0;
|
||||
let mut val_size: u64 = 0;
|
||||
let mut number_entries: u64 = 0;
|
||||
for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
|
||||
let (k, v) = result?;
|
||||
key_size += k.len() as u64;
|
||||
val_size += v.len() as u64;
|
||||
number_entries += 1;
|
||||
}
|
||||
|
||||
println!("The {} database weigh:", name);
|
||||
println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true));
|
||||
println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true));
|
||||
println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true));
|
||||
println!("\tnumber of entries: {}", number_entries);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
@ -27,7 +27,7 @@ once_cell = "1.5.2"
|
||||
ordered-float = "2.1.1"
|
||||
rayon = "1.5.0"
|
||||
regex = "1.4.3"
|
||||
roaring = "0.6.5"
|
||||
roaring = "0.6.6"
|
||||
serde = { version = "1.0.123", features = ["derive"] }
|
||||
serde_json = { version = "1.0.62", features = ["preserve_order"] }
|
||||
slice-group-by = "0.2.6"
|
||||
@ -52,13 +52,11 @@ logging_timer = "1.0.0"
|
||||
tinytemplate = "=1.1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
big_s = "1.0.2"
|
||||
criterion = "0.3.4"
|
||||
maplit = "1.0.2"
|
||||
rand = "0.8.3"
|
||||
|
||||
[build-dependencies]
|
||||
fst = "0.4.5"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
|
@ -2,6 +2,7 @@ mod beu32_str_codec;
|
||||
mod obkv_codec;
|
||||
mod roaring_bitmap;
|
||||
mod roaring_bitmap_length;
|
||||
mod str_level_position_codec;
|
||||
mod str_str_u8_codec;
|
||||
pub mod facet;
|
||||
|
||||
@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
pub use self::obkv_codec::ObkvCodec;
|
||||
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
pub use self::str_level_position_codec::StrLevelPositionCodec;
|
||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||
|
45
milli/src/heed_codec/str_level_position_codec.rs
Normal file
45
milli/src/heed_codec/str_level_position_codec.rs
Normal file
@ -0,0 +1,45 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::mem::size_of;
|
||||
use std::str;
|
||||
|
||||
use crate::TreeLevel;
|
||||
|
||||
/// Codec for keys composed of a word, a `TreeLevel` and two `u32` positions.
///
/// The encoded form is the UTF-8 bytes of the word followed by one byte for
/// the level and the two positions written as big-endian `u32`s.
pub struct StrLevelPositionCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
|
||||
type DItem = (&'a str, TreeLevel, u32, u32);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
|
||||
|
||||
if bytes.len() < footer_len { return None }
|
||||
|
||||
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let word = str::from_utf8(word).ok()?;
|
||||
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?;
|
||||
let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?;
|
||||
let level = TreeLevel::try_from(*level).ok()?;
|
||||
|
||||
Some((word, level, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
|
||||
type EItem = (&'a str, TreeLevel, u32, u32);
|
||||
|
||||
fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let left = left.to_be_bytes();
|
||||
let right = right.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
|
||||
bytes.extend_from_slice(word.as_bytes());
|
||||
bytes.push((*level).into());
|
||||
bytes.extend_from_slice(&left[..]);
|
||||
bytes.extend_from_slice(&right[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution,
|
||||
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
|
||||
use crate::{
|
||||
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
|
||||
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
|
||||
};
|
||||
use crate::facet::FacetType;
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
@ -52,6 +52,10 @@ pub struct Index {
|
||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
||||
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the word, level and position range with the docids that correspond to it.
|
||||
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
|
||||
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the globally ordered value with the docids that correspond to it.
|
||||
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||
/// Maps the document id, the facet field id and the globally ordered value.
|
||||
@ -62,7 +66,7 @@ pub struct Index {
|
||||
|
||||
impl Index {
|
||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
||||
options.max_dbs(9);
|
||||
options.max_dbs(11);
|
||||
|
||||
let env = options.open(path)?;
|
||||
let main = env.create_poly_database(Some("main"))?;
|
||||
@ -71,6 +75,8 @@ impl Index {
|
||||
let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
|
||||
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
||||
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
|
||||
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
|
||||
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
|
||||
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
|
||||
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
|
||||
let documents = env.create_database(Some("documents"))?;
|
||||
@ -94,6 +100,8 @@ impl Index {
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
word_level_position_docids,
|
||||
word_prefix_level_position_docids,
|
||||
facet_field_id_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents,
|
||||
|
@ -9,6 +9,7 @@ pub mod facet;
|
||||
pub mod heed_codec;
|
||||
pub mod index;
|
||||
pub mod proximity;
|
||||
pub mod tree_level;
|
||||
pub mod update;
|
||||
|
||||
use std::borrow::Cow;
|
||||
@ -22,11 +23,12 @@ use serde_json::{Map, Value};
|
||||
pub use self::criterion::{Criterion, default_criteria};
|
||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||
pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
|
||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
|
||||
pub use self::tree_level::TreeLevel;
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
|
@ -31,32 +31,10 @@ pub struct AscDesc<'t> {
|
||||
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
faceted_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
}
|
||||
|
||||
impl<'t> AscDesc<'t> {
|
||||
pub fn initial_asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_name: String,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::initial(index, rtxn, query_tree, candidates, field_name, true)
|
||||
}
|
||||
|
||||
pub fn initial_desc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_name: String,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
Self::initial(index, rtxn, query_tree, candidates, field_name, false)
|
||||
}
|
||||
|
||||
pub fn asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> {
|
||||
Self::new(index, rtxn, parent, field_name, false)
|
||||
}
|
||||
|
||||
fn initial(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
field_name: String,
|
||||
ascending: bool,
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let faceted_fields = index.faceted_fields(rtxn)?;
|
||||
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
|
||||
|
||||
let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?;
|
||||
let candidates = match &query_tree {
|
||||
Some(qt) => {
|
||||
let context = CriteriaBuilder::new(rtxn, index)?;
|
||||
let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?;
|
||||
if let Some(candidates) = candidates {
|
||||
qt_candidates.intersect_with(&candidates);
|
||||
}
|
||||
qt_candidates
|
||||
},
|
||||
None => candidates.unwrap_or(faceted_candidates.clone()),
|
||||
};
|
||||
|
||||
Ok(AscDesc {
|
||||
index,
|
||||
rtxn,
|
||||
field_name,
|
||||
field_id,
|
||||
facet_type,
|
||||
ascending,
|
||||
query_tree,
|
||||
candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?,
|
||||
faceted_candidates,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn new(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> {
|
||||
candidates: Box::new(std::iter::empty()),
|
||||
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
parent,
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -156,12 +93,9 @@ impl<'t> Criterion for AscDesc<'t> {
|
||||
|
||||
match self.candidates.next().transpose()? {
|
||||
None => {
|
||||
let query_tree = self.query_tree.take();
|
||||
let bucket_candidates = take(&mut self.bucket_candidates);
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(wdcache)? {
|
||||
match self.parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
let candidates_is_some = candidates.is_some();
|
||||
self.query_tree = query_tree;
|
||||
let candidates = match (&self.query_tree, candidates) {
|
||||
(_, Some(mut candidates)) => {
|
||||
@ -176,11 +110,22 @@ impl<'t> Criterion for AscDesc<'t> {
|
||||
},
|
||||
(None, None) => take(&mut self.faceted_candidates),
|
||||
};
|
||||
if bucket_candidates.is_empty() {
|
||||
self.bucket_candidates.union_with(&candidates);
|
||||
} else {
|
||||
|
||||
// If our parent returns candidates it means that the bucket
|
||||
// candidates were already computed before and we can use them.
|
||||
//
|
||||
// If not, we must use the just computed candidates as our bucket
|
||||
// candidates.
|
||||
if candidates_is_some {
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
} else {
|
||||
self.bucket_candidates.union_with(&candidates);
|
||||
}
|
||||
|
||||
if candidates.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.candidates = facet_ordered(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
@ -193,27 +138,11 @@ impl<'t> Criterion for AscDesc<'t> {
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => if query_tree.is_none() && bucket_candidates.is_empty() {
|
||||
return Ok(None)
|
||||
},
|
||||
}
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates: Some(RoaringBitmap::new()),
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
Some(candidates) => {
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(candidates),
|
||||
bucket_candidates,
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
}
|
||||
|
737
milli/src/search/criteria/attribute.rs
Normal file
737
milli/src/search/criteria/attribute.rs
Normal file
@ -0,0 +1,737 @@
|
||||
use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap};
|
||||
use std::collections::{BTreeMap, HashMap, btree_map};
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{TreeLevel, search::build_dfa};
|
||||
use crate::search::criteria::Query;
|
||||
use crate::search::query_tree::{Operation, QueryKind};
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use super::{Criterion, CriterionResult, Context, resolve_query_tree};
|
||||
|
||||
/// To be able to divide integers by the number of words in the query
|
||||
/// we want to find a multiplier that allows us to divide by any number between 1 and 10.
|
||||
/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple).
|
||||
const LCM_10_FIRST_NUMBERS: u32 = 2520;
|
||||
|
||||
/// To compute the interval size of a level,
|
||||
/// we use 4 as the exponentiation base and the level as the exponent.
|
||||
const LEVEL_EXPONENTIATION_BASE: u32 = 4;
|
||||
|
||||
pub struct Attribute<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>,
|
||||
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
|
||||
}
|
||||
|
||||
impl<'t> Attribute<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Attribute {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
candidates: None,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent,
|
||||
flattened_query_tree: None,
|
||||
current_buckets: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Attribute<'t> {
|
||||
#[logging_timer::time("Attribute::{}")]
|
||||
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
||||
loop {
|
||||
match (&self.query_tree, &mut self.candidates) {
|
||||
(_, Some(candidates)) if candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some(qt), Some(candidates)) => {
|
||||
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| {
|
||||
flatten_query_tree(&qt)
|
||||
});
|
||||
|
||||
let found_candidates = if candidates.len() < 1000 {
|
||||
let current_buckets = match self.current_buckets.as_mut() {
|
||||
Some(current_buckets) => current_buckets,
|
||||
None => {
|
||||
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
|
||||
self.current_buckets.get_or_insert(new_buckets.into_iter())
|
||||
},
|
||||
};
|
||||
|
||||
match current_buckets.next() {
|
||||
Some((_score, candidates)) => candidates,
|
||||
None => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
}
|
||||
} else {
|
||||
match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
candidates.difference_with(&found_candidates);
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(found_candidates),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some(qt), None) => {
|
||||
let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?;
|
||||
self.bucket_candidates |= &query_tree_candidates;
|
||||
self.candidates = Some(query_tree_candidates);
|
||||
},
|
||||
(None, Some(_)) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(None, None) => {
|
||||
match self.parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: None,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree;
|
||||
self.candidates = candidates;
|
||||
self.bucket_candidates |= bucket_candidates;
|
||||
self.flattened_query_tree = None;
|
||||
self.current_buckets = None;
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word,
|
||||
/// it will begin at the first non-empty interval and will return every interval without
|
||||
/// jumping over empty intervals.
|
||||
struct WordLevelIterator<'t, 'q> {
|
||||
inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>,
|
||||
level: TreeLevel,
|
||||
interval_size: u32,
|
||||
word: Cow<'q, str>,
|
||||
in_prefix_cache: bool,
|
||||
inner_next: Option<(u32, u32, RoaringBitmap)>,
|
||||
current_interval: Option<(u32, u32)>,
|
||||
}
|
||||
|
||||
impl<'t, 'q> WordLevelIterator<'t, 'q> {
|
||||
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
|
||||
match ctx.word_position_last_level(&word, in_prefix_cache)? {
|
||||
Some(level) => {
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
|
||||
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
|
||||
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
|
||||
},
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
|
||||
let level = level.min(&self.level).clone();
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
|
||||
let word = self.word.clone();
|
||||
let in_prefix_cache = self.in_prefix_cache;
|
||||
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
|
||||
|
||||
Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None})
|
||||
}
|
||||
|
||||
fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left }
|
||||
|
||||
let inner_next = match self.inner_next.take() {
|
||||
Some(inner_next) => Some(inner_next),
|
||||
None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)),
|
||||
};
|
||||
|
||||
match inner_next {
|
||||
Some((left, right, docids)) => {
|
||||
match self.current_interval {
|
||||
Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
|
||||
let blank_left = last_left + self.interval_size;
|
||||
let blank_right = last_right + self.interval_size;
|
||||
self.current_interval = Some((blank_left, blank_right));
|
||||
self.inner_next = Some((left, right, docids));
|
||||
Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
|
||||
},
|
||||
_ => {
|
||||
self.current_interval = Some((left, right));
|
||||
Ok(Some((left, right, docids)))
|
||||
}
|
||||
}
|
||||
},
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// QueryLevelIterator is an pseudo-Iterator for a Query,
|
||||
/// It contains WordLevelIterators and is chainned with other QueryLevelIterator.
|
||||
struct QueryLevelIterator<'t, 'q> {
|
||||
parent: Option<Box<QueryLevelIterator<'t, 'q>>>,
|
||||
inner: Vec<WordLevelIterator<'t, 'q>>,
|
||||
level: TreeLevel,
|
||||
accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
|
||||
parent_accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
|
||||
interval_to_skip: usize,
|
||||
}
|
||||
|
||||
impl<'t, 'q> QueryLevelIterator<'t, 'q> {
|
||||
fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec<Query>, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> {
|
||||
let mut inner = Vec::with_capacity(queries.len());
|
||||
for query in queries {
|
||||
match &query.kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if !query.prefix || ctx.in_prefix_cache(&word) {
|
||||
let word = Cow::Borrowed(query.kind.word());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? {
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
} else {
|
||||
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? {
|
||||
let word = Cow::Owned(word.to_owned());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? {
|
||||
let word = Cow::Owned(word.to_owned());
|
||||
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
|
||||
inner.push(word_level_iterator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone());
|
||||
match highest {
|
||||
Some(level) => Ok(Some(Self {
|
||||
parent: None,
|
||||
inner,
|
||||
level,
|
||||
accumulator: vec![],
|
||||
parent_accumulator: vec![],
|
||||
interval_to_skip: 0,
|
||||
})),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self {
|
||||
self.parent = Some(Box::new(parent));
|
||||
self
|
||||
}
|
||||
|
||||
/// create a new QueryLevelIterator with a lower level than the current one.
|
||||
fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result<Self> {
|
||||
let (level, parent) = match &self.parent {
|
||||
Some(parent) => {
|
||||
let parent = parent.dig(ctx)?;
|
||||
(parent.level.min(self.level), Some(Box::new(parent)))
|
||||
},
|
||||
None => (self.level.saturating_sub(1), None),
|
||||
};
|
||||
|
||||
let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten();
|
||||
let mut inner = Vec::with_capacity(self.inner.len());
|
||||
for word_level_iterator in self.inner.iter() {
|
||||
inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
|
||||
}
|
||||
|
||||
Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0})
|
||||
}
|
||||
|
||||
fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None;
|
||||
let u8_level = Into::<u8>::into(level);
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
|
||||
for wli in self.inner.iter_mut() {
|
||||
let wli_u8_level = Into::<u8>::into(wli.level.clone());
|
||||
let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
|
||||
for _ in 0..accumulated_count {
|
||||
if let Some((next_left, _, next_docids)) = wli.next()? {
|
||||
accumulated = match accumulated.take(){
|
||||
Some((acc_left, acc_right, mut acc_docids)) => {
|
||||
acc_docids |= next_docids;
|
||||
Some((acc_left, acc_right, acc_docids))
|
||||
},
|
||||
None => Some((next_left, next_left + interval_size, next_docids)),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(accumulated)
|
||||
}
|
||||
|
||||
/// return the next meta-interval created from inner WordLevelIterators,
|
||||
/// and from eventual chainned QueryLevelIterator.
|
||||
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
|
||||
let parent_result = match self.parent.as_mut() {
|
||||
Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
match parent_result {
|
||||
Some(parent_next) => {
|
||||
let inner_next = self.inner_next(tree_level)?;
|
||||
self.interval_to_skip += interval_to_skip(
|
||||
&self.parent_accumulator,
|
||||
&self.accumulator,
|
||||
self.interval_to_skip,
|
||||
allowed_candidates
|
||||
);
|
||||
self.accumulator.push(inner_next);
|
||||
self.parent_accumulator.push(parent_next);
|
||||
let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
|
||||
|
||||
for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) {
|
||||
if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
|
||||
match merged_interval.as_mut() {
|
||||
Some((_, _, merged_docids)) => *merged_docids |= a & b,
|
||||
None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(merged_interval)
|
||||
},
|
||||
None => {
|
||||
let level = self.level;
|
||||
match self.inner_next(level)? {
|
||||
Some((left, right, mut candidates)) => {
|
||||
self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
|
||||
candidates &= allowed_candidates;
|
||||
Ok(Some((left, right, candidates)))
|
||||
|
||||
},
|
||||
None => {
|
||||
self.accumulator = vec![None];
|
||||
Ok(None)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Count the number of interval that can be skiped when we make the cross-intersections
|
||||
/// in order to compute the next meta-interval.
|
||||
/// A pair of intervals is skiped when both intervals doesn't contain any allowed docids.
|
||||
fn interval_to_skip(
|
||||
parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
|
||||
current_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
|
||||
already_skiped: usize,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> usize {
|
||||
parent_accumulator.into_iter()
|
||||
.zip(current_accumulator.into_iter())
|
||||
.skip(already_skiped)
|
||||
.take_while(|(parent, current)| {
|
||||
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
|
||||
let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
|
||||
skip_parent && skip_current
|
||||
})
|
||||
.count()
|
||||
|
||||
}
|
||||
|
||||
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
|
||||
/// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates.
|
||||
struct Branch<'t, 'q> {
|
||||
query_level_iterator: QueryLevelIterator<'t, 'q>,
|
||||
last_result: (u32, u32, RoaringBitmap),
|
||||
tree_level: TreeLevel,
|
||||
branch_size: u32,
|
||||
}
|
||||
|
||||
impl<'t, 'q> Branch<'t, 'q> {
|
||||
/// return the next meta-interval of the branch,
|
||||
/// and update inner interval in order to be ranked by the BinaryHeap.
|
||||
fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> {
|
||||
let tree_level = self.query_level_iterator.level;
|
||||
match self.query_level_iterator.next(allowed_candidates, tree_level)? {
|
||||
Some(last_result) => {
|
||||
self.last_result = last_result;
|
||||
self.tree_level = tree_level;
|
||||
Ok(true)
|
||||
},
|
||||
None => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// make the current Branch iterate over smaller intervals.
|
||||
fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> {
|
||||
self.query_level_iterator = self.query_level_iterator.dig(ctx)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// because next() method could be time consuming,
|
||||
/// update inner interval in order to be ranked by the binary_heap without computing it,
|
||||
/// the next() method should be called when the real interval is needed.
|
||||
fn lazy_next(&mut self) {
|
||||
let u8_level = Into::<u8>::into(self.tree_level.clone());
|
||||
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
|
||||
let (left, right, _) = self.last_result;
|
||||
|
||||
self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new());
|
||||
}
|
||||
|
||||
/// return the score of the current inner interval.
|
||||
fn compute_rank(&self) -> u32 {
|
||||
// we compute a rank from the left interval.
|
||||
let (left, _, _) = self.last_result;
|
||||
left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
|
||||
}
|
||||
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let self_rank = self.compute_rank();
|
||||
let other_rank = other.compute_rank();
|
||||
let left_cmp = self_rank.cmp(&other_rank).reverse();
|
||||
// on level: lower is better,
|
||||
// we want to dig faster into levels on interesting branches.
|
||||
let level_cmp = self.tree_level.cmp(&other.tree_level).reverse();
|
||||
|
||||
left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, 'q> Ord for Branch<'t, 'q> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.cmp(other)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, 'q> PartialOrd for Branch<'t, 'q> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, 'q> PartialEq for Branch<'t, 'q> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, 'q> Eq for Branch<'t, 'q> {}
|
||||
|
||||
fn initialize_query_level_iterators<'t, 'q>(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
branches: &'q Vec<Vec<Vec<Query>>>,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> {
|
||||
|
||||
let mut positions = BinaryHeap::with_capacity(branches.len());
|
||||
for branch in branches {
|
||||
let mut branch_positions = Vec::with_capacity(branch.len());
|
||||
for queries in branch {
|
||||
match QueryLevelIterator::new(ctx, queries, wdcache)? {
|
||||
Some(qli) => branch_positions.push(qli),
|
||||
None => {
|
||||
// the branch seems to be invalid, so we skip it.
|
||||
branch_positions.clear();
|
||||
break;
|
||||
},
|
||||
}
|
||||
}
|
||||
// QueryLevelIterator need to be sorted by level and folded in descending order.
|
||||
branch_positions.sort_unstable_by_key(|qli| qli.level);
|
||||
let folded_query_level_iterators = branch_positions
|
||||
.into_iter()
|
||||
.fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold {
|
||||
Some(fold) => {
|
||||
qli.parent(fold);
|
||||
Some(qli)
|
||||
},
|
||||
None => Some(qli),
|
||||
});
|
||||
|
||||
if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
|
||||
let tree_level = folded_query_level_iterators.level;
|
||||
let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?;
|
||||
if let Some(last_result) = last_result {
|
||||
let branch = Branch {
|
||||
last_result,
|
||||
tree_level,
|
||||
query_level_iterator: folded_query_level_iterators,
|
||||
branch_size: branch.len() as u32,
|
||||
};
|
||||
positions.push(branch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(positions)
|
||||
}
|
||||
|
||||
fn set_compute_candidates<'t>(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
branches: &Vec<Vec<Vec<Query>>>,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Option<RoaringBitmap>>
|
||||
{
|
||||
let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
|
||||
let lowest_level = TreeLevel::min_value();
|
||||
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
|
||||
let mut allowed_candidates = allowed_candidates.clone();
|
||||
|
||||
while let Some(mut branch) = branches_heap.peek_mut() {
|
||||
let is_lowest_level = branch.tree_level == lowest_level;
|
||||
let branch_rank = branch.compute_rank();
|
||||
// if current is worst than best we break to return
|
||||
// candidates that correspond to the best rank
|
||||
if let Some((best_rank, _)) = final_candidates {
|
||||
if branch_rank > best_rank { break }
|
||||
}
|
||||
let _left = branch.last_result.0;
|
||||
let candidates = take(&mut branch.last_result.2);
|
||||
if candidates.is_empty() {
|
||||
// we don't have candidates, get next interval.
|
||||
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
|
||||
}
|
||||
else if is_lowest_level {
|
||||
// we have candidates, but we can't dig deeper.
|
||||
allowed_candidates -= &candidates;
|
||||
final_candidates = match final_candidates.take() {
|
||||
// we add current candidates to best candidates
|
||||
Some((best_rank, mut best_candidates)) => {
|
||||
best_candidates |= candidates;
|
||||
branch.lazy_next();
|
||||
Some((best_rank, best_candidates))
|
||||
},
|
||||
// we take current candidates as best candidates
|
||||
None => {
|
||||
branch.lazy_next();
|
||||
Some((branch_rank, candidates))
|
||||
},
|
||||
};
|
||||
} else {
|
||||
// we have candidates, lets dig deeper in levels.
|
||||
branch.dig(ctx)?;
|
||||
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Ok(final_candidates.map(|(_rank, candidates)| candidates))
|
||||
}
|
||||
|
||||
fn linear_compute_candidates(
|
||||
ctx: &dyn Context,
|
||||
branches: &Vec<Vec<Vec<Query>>>,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
|
||||
{
|
||||
fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
|
||||
let mut min_rank = u64::max_value();
|
||||
for branch in branches {
|
||||
|
||||
let branch_len = branch.len();
|
||||
let mut branch_rank = Vec::with_capacity(branch_len);
|
||||
for derivates in branch {
|
||||
let mut position = None;
|
||||
for Query { prefix, kind } in derivates {
|
||||
// find the best position of the current word in the document.
|
||||
let current_position = match kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
word_derivations(word, true, 0, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
} else {
|
||||
words_positions.get(word)
|
||||
.map(|positions| positions.iter().next())
|
||||
.flatten()
|
||||
}
|
||||
},
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
},
|
||||
};
|
||||
|
||||
match (position, current_position) {
|
||||
(Some(p), Some(cp)) => position = Some(cmp::min(p, cp)),
|
||||
(None, Some(cp)) => position = Some(cp),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
// if a position is found, we add it to the branch score,
|
||||
// otherwise the branch is considered as unfindable in this document and we break.
|
||||
if let Some(position) = position {
|
||||
branch_rank.push(position as u64);
|
||||
} else {
|
||||
branch_rank.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !branch_rank.is_empty() {
|
||||
branch_rank.sort_unstable();
|
||||
// because several words in same query can't match all a the position 0,
|
||||
// we substract the word index to the position.
|
||||
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
|
||||
// here we do the means of the words of the branch
|
||||
min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
|
||||
}
|
||||
}
|
||||
|
||||
min_rank
|
||||
}
|
||||
|
||||
fn word_derivations<'a>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||
{
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
match dfa.eval(document_word) {
|
||||
Distance::Exact(_) => Some(positions),
|
||||
Distance::AtLeast(_) => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
let mut candidates = BTreeMap::new();
|
||||
for docid in allowed_candidates {
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
let rank = compute_candidate_rank(branches, words_positions);
|
||||
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
// TODO can we keep refs of Query
|
||||
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
|
||||
use crate::search::criteria::Operation::{And, Or, Consecutive};
|
||||
|
||||
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> {
|
||||
match tail.split_first() {
|
||||
Some((thead, tail)) => {
|
||||
let tail = and_recurse(thead, tail);
|
||||
let mut out = Vec::new();
|
||||
for array in recurse(head) {
|
||||
for tail_array in &tail {
|
||||
let mut array = array.clone();
|
||||
array.extend(tail_array.iter().cloned());
|
||||
out.push(array);
|
||||
}
|
||||
}
|
||||
out
|
||||
},
|
||||
None => recurse(head),
|
||||
}
|
||||
}
|
||||
|
||||
fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> {
|
||||
match op {
|
||||
And(ops) | Consecutive(ops) => {
|
||||
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
|
||||
},
|
||||
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
|
||||
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
|
||||
} else {
|
||||
ops.into_iter().map(recurse).flatten().collect()
|
||||
},
|
||||
Operation::Query(query) => vec![vec![vec![query.clone()]]],
|
||||
}
|
||||
}
|
||||
|
||||
recurse(query_tree)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use big_s::S;

    use crate::search::criteria::QueryKind;
    use super::*;

    /// Builds a non-prefix exact query on `word`.
    fn exact(word: &str) -> Query {
        Query { prefix: false, kind: QueryKind::exact(S(word)) }
    }

    /// Builds the query tree leaf matching `exact(word)`.
    fn leaf(word: &str) -> Operation {
        Operation::Query(exact(word))
    }

    #[test]
    fn simple_flatten_query_tree() {
        let query_tree = Operation::Or(false, vec![
            leaf("manythefish"),
            Operation::And(vec![leaf("manythe"), leaf("fish")]),
            Operation::And(vec![
                leaf("many"),
                Operation::Or(false, vec![
                    leaf("thefish"),
                    Operation::And(vec![leaf("the"), leaf("fish")]),
                ]),
            ]),
        ]);

        let expected = vec![
            vec![vec![exact("manythefish")]],
            vec![vec![exact("manythe")], vec![exact("fish")]],
            vec![vec![exact("many")], vec![exact("thefish")]],
            vec![vec![exact("many")], vec![exact("the")], vec![exact("fish")]],
        ];

        let result = flatten_query_tree(&query_tree);
        assert_eq!(expected, result);
    }
}
|
@ -1,135 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::WordDerivationsCache;
|
||||
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
|
||||
|
||||
/// The result of a call to the fetcher.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct FetcherResult {
|
||||
/// The query tree corresponding to the current bucket of the last criterion.
|
||||
pub query_tree: Option<Operation>,
|
||||
/// The candidates of the current bucket of the last criterion.
|
||||
pub candidates: RoaringBitmap,
|
||||
/// Candidates that comes from the current bucket of the initial criterion.
|
||||
pub bucket_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
pub struct Fetcher<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Candidates,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
should_get_documents_ids: bool,
|
||||
wdcache: WordDerivationsCache,
|
||||
}
|
||||
|
||||
impl<'t> Fetcher<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Fetcher {
|
||||
ctx,
|
||||
query_tree,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
parent: None,
|
||||
should_get_documents_ids: true,
|
||||
wdcache: WordDerivationsCache::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
) -> Self
|
||||
{
|
||||
Fetcher {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
candidates: Candidates::default(),
|
||||
parent: Some(parent),
|
||||
should_get_documents_ids: true,
|
||||
wdcache: WordDerivationsCache::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("Fetcher::{}")]
|
||||
pub fn next(&mut self) -> anyhow::Result<Option<FetcherResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})",
|
||||
self.should_get_documents_ids, self.candidates,
|
||||
);
|
||||
|
||||
let should_get_documents_ids = take(&mut self.should_get_documents_ids);
|
||||
match &mut self.candidates {
|
||||
Allowed(_) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) if should_get_documents_ids => {
|
||||
let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?;
|
||||
docids.intersect_with(&candidates);
|
||||
docids
|
||||
},
|
||||
_ => candidates,
|
||||
};
|
||||
|
||||
return Ok(Some(FetcherResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
Forbidden(_) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(&mut self.wdcache)? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
let candidates = match (&query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
|
||||
(None, None) => RoaringBitmap::new(),
|
||||
};
|
||||
|
||||
return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates }))
|
||||
},
|
||||
None => if should_get_documents_ids {
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?,
|
||||
None => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
return Ok(Some(FetcherResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
}
|
||||
},
|
||||
None => if should_get_documents_ids {
|
||||
let candidates = match &self.query_tree {
|
||||
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?,
|
||||
None => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
return Ok(Some(FetcherResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
}
|
||||
return Ok(None);
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
53
milli/src/search/criteria/final.rs
Normal file
53
milli/src/search/criteria/final.rs
Normal file
@ -0,0 +1,53 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::WordDerivationsCache;
|
||||
use super::{resolve_query_tree, Criterion, CriterionResult, Context};
|
||||
|
||||
/// The result of a call to the fetcher.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct FinalResult {
|
||||
/// The query tree corresponding to the current bucket of the last criterion.
|
||||
pub query_tree: Option<Operation>,
|
||||
/// The candidates of the current bucket of the last criterion.
|
||||
pub candidates: RoaringBitmap,
|
||||
/// Candidates that comes from the current bucket of the initial criterion.
|
||||
pub bucket_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
pub struct Final<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
wdcache: WordDerivationsCache,
|
||||
}
|
||||
|
||||
impl<'t> Final<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
|
||||
Final { ctx, parent, wdcache: WordDerivationsCache::new() }
|
||||
}
|
||||
|
||||
#[logging_timer::time("Final::{}")]
|
||||
pub fn next(&mut self) -> anyhow::Result<Option<FinalResult>> {
|
||||
loop {
|
||||
debug!("Final iteration");
|
||||
|
||||
match self.parent.next(&mut self.wdcache)? {
|
||||
Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => {
|
||||
let candidates = match (&query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
|
||||
(None, None) => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
bucket_candidates.union_with(&candidates);
|
||||
|
||||
return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }));
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
28
milli/src/search/criteria/initial.rs
Normal file
28
milli/src/search/criteria/initial.rs
Normal file
@ -0,0 +1,28 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::WordDerivationsCache;
|
||||
|
||||
use super::{Criterion, CriterionResult};
|
||||
|
||||
pub struct Initial {
|
||||
answer: Option<CriterionResult>
|
||||
}
|
||||
|
||||
impl Initial {
|
||||
pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial {
|
||||
let answer = CriterionResult {
|
||||
query_tree,
|
||||
candidates: candidates.clone(),
|
||||
bucket_candidates: candidates.take().unwrap_or_default(),
|
||||
};
|
||||
Initial { answer: Some(answer) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Criterion for Initial {
|
||||
#[logging_timer::time("Initial::{}")]
|
||||
fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
||||
Ok(self.answer.take())
|
||||
}
|
||||
}
|
@ -4,21 +4,25 @@ use std::borrow::Cow;
|
||||
use anyhow::bail;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}};
|
||||
use crate::{Index, DocumentId};
|
||||
|
||||
use super::query_tree::{Operation, Query, QueryKind};
|
||||
use self::asc_desc::AscDesc;
|
||||
use self::attribute::Attribute;
|
||||
use self::r#final::Final;
|
||||
use self::initial::Initial;
|
||||
use self::proximity::Proximity;
|
||||
use self::typo::Typo;
|
||||
use self::words::Words;
|
||||
use self::asc_desc::AscDesc;
|
||||
use self::proximity::Proximity;
|
||||
use self::fetcher::Fetcher;
|
||||
|
||||
mod asc_desc;
|
||||
mod attribute;
|
||||
mod initial;
|
||||
mod proximity;
|
||||
mod typo;
|
||||
mod words;
|
||||
mod asc_desc;
|
||||
mod proximity;
|
||||
pub mod fetcher;
|
||||
pub mod r#final;
|
||||
|
||||
pub trait Criterion {
|
||||
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>>;
|
||||
@ -59,7 +63,8 @@ impl Default for Candidates {
|
||||
Self::Forbidden(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
pub trait Context {
|
||||
|
||||
pub trait Context<'c> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
|
||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
@ -68,6 +73,8 @@ pub trait Context {
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
|
||||
fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>;
|
||||
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
|
||||
}
|
||||
pub struct CriteriaBuilder<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
@ -76,7 +83,7 @@ pub struct CriteriaBuilder<'t> {
|
||||
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
|
||||
}
|
||||
|
||||
impl<'a> Context for CriteriaBuilder<'a> {
|
||||
impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
||||
self.index.documents_ids(self.rtxn)
|
||||
}
|
||||
@ -115,6 +122,48 @@ impl<'a> Context for CriteriaBuilder<'a> {
|
||||
}
|
||||
Ok(words_positions)
|
||||
}
|
||||
|
||||
fn word_position_iterator(
|
||||
&self,
|
||||
word: &str,
|
||||
level: TreeLevel,
|
||||
in_prefix_cache: bool,
|
||||
left: Option<u32>,
|
||||
right: Option<u32>
|
||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>
|
||||
{
|
||||
let range = {
|
||||
let left = left.unwrap_or(u32::min_value());
|
||||
let right = right.unwrap_or(u32::max_value());
|
||||
let left = (word, level, left, left);
|
||||
let right = (word, level, right, right);
|
||||
left..=right
|
||||
};
|
||||
let db = match in_prefix_cache {
|
||||
true => self.index.word_prefix_level_position_docids,
|
||||
false => self.index.word_level_position_docids,
|
||||
};
|
||||
|
||||
Ok(Box::new(db.range(self.rtxn, &range)?))
|
||||
}
|
||||
|
||||
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
|
||||
let range = {
|
||||
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
|
||||
left..=right
|
||||
};
|
||||
let db = match in_prefix_cache {
|
||||
true => self.index.word_prefix_level_position_docids,
|
||||
false => self.index.word_level_position_docids,
|
||||
};
|
||||
let last_level = db
|
||||
.remap_data_type::<heed::types::DecodeIgnore>()
|
||||
.range(self.rtxn, &range)?.last().transpose()?
|
||||
.map(|((_, level, _, _), _)| level);
|
||||
|
||||
Ok(last_level)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> CriteriaBuilder<'t> {
|
||||
@ -126,42 +175,26 @@ impl<'t> CriteriaBuilder<'t> {
|
||||
|
||||
pub fn build(
|
||||
&'t self,
|
||||
mut query_tree: Option<Operation>,
|
||||
mut facet_candidates: Option<RoaringBitmap>,
|
||||
) -> anyhow::Result<Fetcher<'t>>
|
||||
query_tree: Option<Operation>,
|
||||
facet_candidates: Option<RoaringBitmap>,
|
||||
) -> anyhow::Result<Final<'t>>
|
||||
{
|
||||
use crate::criterion::Criterion as Name;
|
||||
|
||||
let mut criterion = None as Option<Box<dyn Criterion>>;
|
||||
let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box<dyn Criterion>;
|
||||
for name in self.index.criteria(&self.rtxn)? {
|
||||
criterion = Some(match criterion.take() {
|
||||
Some(father) => match name {
|
||||
Name::Typo => Box::new(Typo::new(self, father)),
|
||||
Name::Words => Box::new(Words::new(self, father)),
|
||||
Name::Proximity => Box::new(Proximity::new(self, father)),
|
||||
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?),
|
||||
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?),
|
||||
_otherwise => father,
|
||||
},
|
||||
None => match name {
|
||||
Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())),
|
||||
Name::Asc(field) => {
|
||||
Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?)
|
||||
},
|
||||
Name::Desc(field) => {
|
||||
Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?)
|
||||
},
|
||||
_otherwise => continue,
|
||||
},
|
||||
});
|
||||
criterion = match name {
|
||||
Name::Typo => Box::new(Typo::new(self, criterion)),
|
||||
Name::Words => Box::new(Words::new(self, criterion)),
|
||||
Name::Proximity => Box::new(Proximity::new(self, criterion)),
|
||||
Name::Attribute => Box::new(Attribute::new(self, criterion)),
|
||||
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?),
|
||||
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?),
|
||||
_otherwise => criterion,
|
||||
};
|
||||
}
|
||||
|
||||
match criterion {
|
||||
Some(criterion) => Ok(Fetcher::new(self, criterion)),
|
||||
None => Ok(Fetcher::initial(self, query_tree, facet_candidates)),
|
||||
}
|
||||
Ok(Final::new(self, criterion))
|
||||
}
|
||||
}
|
||||
|
||||
@ -362,9 +395,10 @@ pub mod test {
|
||||
word_prefix_docids: HashMap<String, RoaringBitmap>,
|
||||
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||
docid_words: HashMap<u32, Vec<String>>,
|
||||
}
|
||||
|
||||
impl<'a> Context for TestContext<'a> {
|
||||
impl<'c> Context<'c> for TestContext<'c> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
||||
Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids))
|
||||
}
|
||||
@ -395,7 +429,24 @@ pub mod test {
|
||||
self.word_prefix_docids.contains_key(&word.to_string())
|
||||
}
|
||||
|
||||
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
if let Some(docid_words) = self.docid_words.get(&docid) {
|
||||
Ok(docid_words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))))
|
||||
.collect()
|
||||
)
|
||||
} else {
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
|
||||
// Left unimplemented in the test context: any call panics through `todo!()`.
fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
// Left unimplemented in the test context: any call panics through `todo!()`.
fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@ -431,50 +482,58 @@ pub mod test {
|
||||
s("morning") => random_postings(rng, 125),
|
||||
};
|
||||
|
||||
let mut docid_words = HashMap::new();
|
||||
for (word, docids) in word_docids.iter() {
|
||||
for docid in docids {
|
||||
let words = docid_words.entry(docid).or_insert(vec![]);
|
||||
words.push(word.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let word_prefix_docids = hashmap!{
|
||||
s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
|
||||
s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
|
||||
s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
|
||||
};
|
||||
|
||||
let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")];
|
||||
let hello_world_split = (hello_world.len() / 2) as usize;
|
||||
let hello_world_1 = hello_world.iter().take(hello_world_split).collect();
|
||||
let hello_world_2 = hello_world.iter().skip(hello_world_split).collect();
|
||||
|
||||
let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")];
|
||||
let hello_word_split = (hello_word.len() / 2) as usize;
|
||||
let hello_word_4 = hello_word.iter().take(hello_word_split).collect();
|
||||
let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect();
|
||||
let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect();
|
||||
let word_pair_proximity_docids = hashmap!{
|
||||
(s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")],
|
||||
(s("hello"), s("world"), 1) => hello_world_1,
|
||||
(s("hello"), s("world"), 4) => hello_world_2,
|
||||
(s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")],
|
||||
(s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
|
||||
(s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
|
||||
(s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
|
||||
(s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
|
||||
(s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")],
|
||||
(s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")],
|
||||
(s("hello"), s("word"), 4) => hello_word_4,
|
||||
(s("hello"), s("word"), 6) => hello_word_6,
|
||||
(s("hello"), s("word"), 7) => hello_word_7,
|
||||
(s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")],
|
||||
(s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
|
||||
(s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")],
|
||||
(s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
|
||||
let mut word_pair_proximity_docids = HashMap::new();
|
||||
let mut word_prefix_pair_proximity_docids = HashMap::new();
|
||||
for (lword, lcandidates) in &word_docids {
|
||||
for (rword, rcandidates) in &word_docids {
|
||||
if lword == rword { continue }
|
||||
let candidates = lcandidates & rcandidates;
|
||||
for candidate in candidates {
|
||||
if let Some(docid_words) = docid_words.get(&candidate) {
|
||||
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
||||
let rposition = docid_words.iter().position(|w| w == rword).unwrap();
|
||||
let key = if lposition < rposition {
|
||||
(s(lword), s(rword), (rposition - lposition) as i32)
|
||||
} else {
|
||||
(s(lword), s(rword), (lposition - rposition + 1) as i32)
|
||||
};
|
||||
|
||||
let word_prefix_pair_proximity_docids = hashmap!{
|
||||
(s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(),
|
||||
(s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(),
|
||||
(s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(),
|
||||
(s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(),
|
||||
(s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(),
|
||||
(s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(),
|
||||
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (pword, pcandidates) in &word_prefix_docids {
|
||||
if lword.starts_with(pword) { continue }
|
||||
let candidates = lcandidates & pcandidates;
|
||||
for candidate in candidates {
|
||||
if let Some(docid_words) = docid_words.get(&candidate) {
|
||||
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
||||
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
||||
let key = if lposition < rposition {
|
||||
(s(lword), s(pword), (rposition - lposition) as i32)
|
||||
} else {
|
||||
(s(lword), s(pword), (lposition - rposition + 1) as i32)
|
||||
};
|
||||
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut keys = word_docids.keys().collect::<Vec<_>>();
|
||||
keys.sort_unstable();
|
||||
@ -486,6 +545,7 @@ pub mod test {
|
||||
word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
docid_words,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -8,48 +8,29 @@ use log::debug;
|
||||
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
||||
use crate::search::{build_dfa, WordDerivationsCache};
|
||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
||||
use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
||||
|
||||
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
|
||||
|
||||
pub struct Proximity<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<(usize, Operation)>,
|
||||
ctx: &'t dyn Context<'t>,
|
||||
/// ((max_proximity, query_tree), allowed_candidates)
|
||||
state: Option<(Option<(usize, Operation)>, RoaringBitmap)>,
|
||||
proximity: u8,
|
||||
candidates: Candidates,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates_cache: Cache,
|
||||
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
|
||||
}
|
||||
|
||||
impl<'t> Proximity<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Proximity {
|
||||
ctx,
|
||||
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)),
|
||||
state: None,
|
||||
proximity: 0,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: Cache::new(),
|
||||
plane_sweep_cache: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Proximity {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
proximity: 0,
|
||||
candidates: Candidates::default(),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
parent,
|
||||
candidates_cache: Cache::new(),
|
||||
plane_sweep_cache: None,
|
||||
}
|
||||
@ -59,27 +40,20 @@ impl<'t> Proximity<'t> {
|
||||
impl<'t> Criterion for Proximity<'t> {
|
||||
#[logging_timer::time("Proximity::{}")]
|
||||
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
loop {
|
||||
debug!("Proximity at iteration {} (max {:?}) ({:?})",
|
||||
debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
|
||||
self.proximity,
|
||||
self.query_tree.as_ref().map(|(mp, _)| mp),
|
||||
self.candidates,
|
||||
self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)),
|
||||
self.state.as_ref().map(|(_, cd)| cd),
|
||||
);
|
||||
|
||||
match (&mut self.query_tree, &mut self.candidates) {
|
||||
(_, Allowed(candidates)) if candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take().map(|(_, qt)| qt),
|
||||
candidates: Some(take(&mut self.candidates).into_inner()),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
match &mut self.state {
|
||||
Some((_, candidates)) if candidates.is_empty() => {
|
||||
self.state = None; // reset state
|
||||
},
|
||||
(Some((max_prox, query_tree)), Allowed(candidates)) => {
|
||||
Some((Some((max_prox, query_tree)), candidates)) => {
|
||||
if self.proximity as usize > *max_prox {
|
||||
// reset state to (None, Forbidden(_))
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
self.state = None; // reset state
|
||||
} else {
|
||||
let mut new_candidates = if candidates.len() <= 1000 {
|
||||
if let Some(cache) = self.plane_sweep_cache.as_mut() {
|
||||
@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> {
|
||||
candidates
|
||||
},
|
||||
None => {
|
||||
// reset state to (None, Forbidden(_))
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
self.state = None; // reset state
|
||||
continue
|
||||
},
|
||||
}
|
||||
@ -120,83 +92,58 @@ impl<'t> Criterion for Proximity<'t> {
|
||||
candidates.difference_with(&new_candidates);
|
||||
self.proximity += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree.clone()),
|
||||
candidates: Some(new_candidates),
|
||||
bucket_candidates,
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
}
|
||||
},
|
||||
(Some((max_prox, query_tree)), Forbidden(candidates)) => {
|
||||
if self.proximity as usize > *max_prox {
|
||||
self.query_tree = None;
|
||||
self.candidates = Candidates::default();
|
||||
} else {
|
||||
let mut new_candidates = resolve_candidates(
|
||||
self.ctx,
|
||||
&query_tree,
|
||||
self.proximity,
|
||||
&mut self.candidates_cache,
|
||||
wdcache,
|
||||
)?;
|
||||
|
||||
new_candidates.difference_with(&candidates);
|
||||
candidates.union_with(&new_candidates);
|
||||
self.proximity += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree.clone()),
|
||||
candidates: Some(new_candidates),
|
||||
bucket_candidates,
|
||||
}));
|
||||
}
|
||||
},
|
||||
(None, Allowed(_)) => {
|
||||
let candidates = take(&mut self.candidates).into_inner();
|
||||
Some((None, candidates)) => {
|
||||
let candidates = take(candidates);
|
||||
self.state = None; // reset state
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: Some(candidates.clone()),
|
||||
bucket_candidates: candidates,
|
||||
}));
|
||||
},
|
||||
(None, Forbidden(_)) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(wdcache)? {
|
||||
None => {
|
||||
match self.parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: None,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
let candidates_is_some = candidates.is_some();
|
||||
let candidates = match (&query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?,
|
||||
(None, None) => RoaringBitmap::new(),
|
||||
};
|
||||
|
||||
if bucket_candidates.is_empty() {
|
||||
self.bucket_candidates.union_with(&candidates);
|
||||
} else {
|
||||
// If our parent returns candidates it means that the bucket
|
||||
// candidates were already computed before and we can use them.
|
||||
//
|
||||
// If not, we must use the just computed candidates as our bucket
|
||||
// candidates.
|
||||
if candidates_is_some {
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
} else {
|
||||
self.bucket_candidates.union_with(&candidates);
|
||||
}
|
||||
|
||||
self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
|
||||
let query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
|
||||
self.state = Some((query_tree, candidates));
|
||||
self.proximity = 0;
|
||||
self.candidates = Candidates::Allowed(candidates);
|
||||
self.plane_sweep_cache = None;
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,41 +9,24 @@ use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
|
||||
|
||||
pub struct Typo<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_tree: Option<(usize, Operation)>,
|
||||
number_typos: u8,
|
||||
candidates: Candidates,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl<'t> Typo<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Typo {
|
||||
ctx,
|
||||
query_tree: query_tree.map(|op| (maximum_typo(&op), op)),
|
||||
number_typos: 0,
|
||||
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Typo {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
number_typos: 0,
|
||||
candidates: Candidates::default(),
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
parent,
|
||||
candidates_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> {
|
||||
candidates.difference_with(&new_candidates);
|
||||
self.number_typos += 1;
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => new_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(new_query_tree),
|
||||
candidates: Some(new_candidates),
|
||||
bucket_candidates,
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
}
|
||||
},
|
||||
@ -145,9 +123,14 @@ impl<'t> Criterion for Typo<'t> {
|
||||
}));
|
||||
},
|
||||
(None, Forbidden(_)) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(wdcache)? {
|
||||
match self.parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: None,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree.map(|op| (maximum_typo(&op), op));
|
||||
self.number_typos = 0;
|
||||
@ -157,9 +140,6 @@ impl<'t> Criterion for Typo<'t> {
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -334,8 +314,8 @@ fn resolve_candidates<'t>(
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::*;
|
||||
use super::super::initial::Initial;
|
||||
use super::super::test::TestContext;
|
||||
|
||||
#[test]
|
||||
@ -345,8 +325,10 @@ mod test {
|
||||
let facet_candidates = None;
|
||||
|
||||
let mut wdcache = WordDerivationsCache::new();
|
||||
let mut criteria = Typo::initial(&context, query_tree, facet_candidates);
|
||||
let parent = Initial::new(query_tree, facet_candidates);
|
||||
let mut criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none());
|
||||
assert!(criteria.next(&mut wdcache).unwrap().is_none());
|
||||
}
|
||||
|
||||
@ -364,7 +346,8 @@ mod test {
|
||||
let facet_candidates = None;
|
||||
|
||||
let mut wdcache = WordDerivationsCache::new();
|
||||
let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates);
|
||||
let parent = Initial::new(Some(query_tree), facet_candidates);
|
||||
let mut criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let candidates_1 = context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
@ -413,7 +396,8 @@ mod test {
|
||||
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
|
||||
|
||||
let mut wdcache = WordDerivationsCache::new();
|
||||
let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone()));
|
||||
let parent = Initial::new(query_tree, Some(facet_candidates.clone()));
|
||||
let mut criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let expected = CriterionResult {
|
||||
query_tree: None,
|
||||
@ -442,7 +426,8 @@ mod test {
|
||||
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
|
||||
|
||||
let mut wdcache = WordDerivationsCache::new();
|
||||
let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone()));
|
||||
let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone()));
|
||||
let mut criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let candidates_1 = context.word_docids("split").unwrap().unwrap()
|
||||
& context.word_docids("this").unwrap().unwrap()
|
||||
@ -456,7 +441,7 @@ mod test {
|
||||
]),
|
||||
])),
|
||||
candidates: Some(&candidates_1 & &facet_candidates),
|
||||
bucket_candidates: candidates_1 & &facet_candidates,
|
||||
bucket_candidates: facet_candidates.clone(),
|
||||
};
|
||||
|
||||
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1));
|
||||
@ -478,7 +463,7 @@ mod test {
|
||||
]),
|
||||
])),
|
||||
candidates: Some(&candidates_2 & &facet_candidates),
|
||||
bucket_candidates: candidates_2 & &facet_candidates,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
};
|
||||
|
||||
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2));
|
||||
|
@ -8,38 +8,22 @@ use crate::search::query_tree::Operation;
|
||||
use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache};
|
||||
|
||||
pub struct Words<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_trees: Vec<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl<'t> Words<'t> {
|
||||
pub fn initial(
|
||||
ctx: &'t dyn Context,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
) -> Self
|
||||
{
|
||||
Words {
|
||||
ctx,
|
||||
query_trees: query_tree.map(explode_query_tree).unwrap_or_default(),
|
||||
candidates,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
candidates_cache: HashMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Words {
|
||||
ctx,
|
||||
query_trees: Vec::default(),
|
||||
candidates: None,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
parent,
|
||||
candidates_cache: HashMap::default(),
|
||||
}
|
||||
}
|
||||
@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> {
|
||||
found_candidates.intersect_with(&candidates);
|
||||
candidates.difference_with(&found_candidates);
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => found_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(qt),
|
||||
candidates: Some(found_candidates),
|
||||
bucket_candidates,
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some(qt), None) => {
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => RoaringBitmap::new(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(qt),
|
||||
candidates: None,
|
||||
bucket_candidates,
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(None, Some(_)) => {
|
||||
@ -97,9 +71,14 @@ impl<'t> Criterion for Words<'t> {
|
||||
}));
|
||||
},
|
||||
(None, None) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(wdcache)? {
|
||||
match self.parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: None,
|
||||
bucket_candidates,
|
||||
}));
|
||||
},
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
|
||||
self.candidates = candidates;
|
||||
@ -108,9 +87,6 @@ impl<'t> Criterion for Words<'t> {
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -13,9 +13,8 @@ use once_cell::sync::Lazy;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
|
||||
|
||||
use crate::search::criteria::fetcher::{Fetcher, FetcherResult};
|
||||
use crate::{DocumentId, Index};
|
||||
use crate::search::criteria::r#final::{Final, FinalResult};
|
||||
use crate::{Index, DocumentId};
|
||||
|
||||
pub use self::facet::{
|
||||
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
|
||||
@ -162,14 +161,14 @@ impl<'a> Search<'a> {
|
||||
&self,
|
||||
mut distinct: impl for<'c> Distinct<'c>,
|
||||
matching_words: MatchingWords,
|
||||
mut criteria: Fetcher,
|
||||
mut criteria: Final,
|
||||
) -> anyhow::Result<SearchResult> {
|
||||
let mut offset = self.offset;
|
||||
let mut initial_candidates = RoaringBitmap::new();
|
||||
let mut excluded_documents = RoaringBitmap::new();
|
||||
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||
|
||||
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
||||
while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
||||
debug!("Number of candidates found {}", candidates.len());
|
||||
|
||||
let excluded = take(&mut excluded_documents);
|
||||
|
51
milli/src/tree_level.rs
Normal file
51
milli/src/tree_level.rs
Normal file
@ -0,0 +1,51 @@
|
||||
use std::convert::TryFrom;
|
||||
use std::fmt;
|
||||
|
||||
/// Highest value a [`TreeLevel`] can hold.
///
/// This is just before the lowest printable character (space, sp, 32).
const MAX_VALUE: u8 = 31;

/// Error returned when a raw byte cannot be turned into a [`TreeLevel`].
#[derive(Debug, Copy, Clone)]
pub enum Error {
    /// The given value is greater than the highest supported level.
    LevelTooHigh(u8),
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Error::LevelTooHigh(value) => {
                write!(f, "tree level {} is too high (maximum is {})", value, MAX_VALUE)
            }
        }
    }
}

// Implementing the standard error trait lets callers propagate this error
// with `?` into boxed or `anyhow`-style error types.
impl std::error::Error for Error {}

/// A level number, always within `0..=MAX_VALUE` when built through
/// [`TreeLevel::min_value`], [`TreeLevel::max_value`] or `TryFrom<u8>`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct TreeLevel(u8);

impl TreeLevel {
    /// Returns the highest valid level.
    pub const fn max_value() -> TreeLevel {
        TreeLevel(MAX_VALUE)
    }

    /// Returns the lowest valid level (zero).
    pub const fn min_value() -> TreeLevel {
        TreeLevel(0)
    }

    /// Returns this level minus `rhs`, clamped at the minimum level instead of
    /// underflowing.
    pub fn saturating_sub(&self, rhs: u8) -> TreeLevel {
        TreeLevel(self.0.saturating_sub(rhs))
    }
}

// `From` is implemented rather than `Into`: the standard blanket impl still
// provides `Into<u8> for TreeLevel`, so `level.into()` keeps working while
// `u8::from(level)` becomes available too.
impl From<TreeLevel> for u8 {
    fn from(level: TreeLevel) -> u8 {
        level.0
    }
}

impl TryFrom<u8> for TreeLevel {
    type Error = Error;

    /// Accepts any value in `0..=MAX_VALUE`; anything above yields
    /// [`Error::LevelTooHigh`] carrying the rejected value.
    fn try_from(value: u8) -> Result<TreeLevel, Error> {
        match value {
            0..=MAX_VALUE => Ok(TreeLevel(value)),
            _ => Err(Error::LevelTooHigh(value)),
        }
    }
}

impl fmt::Display for TreeLevel {
    /// Formats the level as its plain decimal number.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}
|
@ -28,6 +28,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
word_level_position_docids,
|
||||
word_prefix_level_position_docids,
|
||||
facet_field_id_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents,
|
||||
@ -55,6 +57,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
docid_word_positions.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_level_position_docids.clear(self.wtxn)?;
|
||||
word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||
facet_field_id_value_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_values.clear(self.wtxn)?;
|
||||
documents.clear(self.wtxn)?;
|
||||
|
@ -88,6 +88,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
word_level_position_docids,
|
||||
word_prefix_level_position_docids,
|
||||
facet_field_id_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents,
|
||||
@ -329,6 +331,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word level position docids.
|
||||
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids.difference_with(&self.documents_ids);
|
||||
if docids.is_empty() {
|
||||
iter.del_current()?;
|
||||
} else if docids.len() != previous_len {
|
||||
iter.put_current(bytes, &docids)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word prefix level position docids.
|
||||
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids.difference_with(&self.documents_ids);
|
||||
if docids.is_empty() {
|
||||
iter.del_current()?;
|
||||
} else if docids.len() != previous_len {
|
||||
iter.put_current(bytes, &docids)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
Ok(self.documents_ids.len())
|
||||
}
|
||||
}
|
||||
|
@ -52,6 +52,14 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -
|
||||
cbo_roaring_bitmap_merge(values)
|
||||
}
|
||||
|
||||
pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||
cbo_roaring_bitmap_merge(values)
|
||||
}
|
||||
|
||||
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||
cbo_roaring_bitmap_merge(values)
|
||||
}
|
||||
|
||||
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||
cbo_roaring_bitmap_merge(values)
|
||||
}
|
||||
|
@ -2,7 +2,8 @@ use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::str;
|
||||
use std::sync::mpsc::sync_channel;
|
||||
use std::time::Instant;
|
||||
|
||||
@ -13,17 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy
|
||||
use heed::types::ByteSlice;
|
||||
use log::{debug, info, error};
|
||||
use memmap::Mmap;
|
||||
use rayon::ThreadPool;
|
||||
use rayon::prelude::*;
|
||||
use rayon::ThreadPool;
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
use crate::index::Index;
|
||||
use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep};
|
||||
use crate::update::{
|
||||
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
|
||||
WordPrefixPairProximityDocids,
|
||||
};
|
||||
use self::store::{Store, Readers};
|
||||
pub use self::merge_function::{
|
||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||
docid_word_positions_merge, documents_merge, facet_field_value_docids_merge,
|
||||
field_id_docid_facet_values_merge,
|
||||
docid_word_positions_merge, documents_merge,
|
||||
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
|
||||
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
||||
};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
|
||||
@ -262,6 +267,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
facet_min_level_size: Option<NonZeroUsize>,
|
||||
words_prefix_threshold: Option<f64>,
|
||||
max_prefix_length: Option<usize>,
|
||||
words_positions_level_group_size: Option<NonZeroU32>,
|
||||
words_positions_min_level_size: Option<NonZeroU32>,
|
||||
update_method: IndexDocumentsMethod,
|
||||
update_format: UpdateFormat,
|
||||
autogenerate_docids: bool,
|
||||
@ -289,6 +296,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
facet_min_level_size: None,
|
||||
words_prefix_threshold: None,
|
||||
max_prefix_length: None,
|
||||
words_positions_level_group_size: None,
|
||||
words_positions_min_level_size: None,
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
update_format: UpdateFormat::Json,
|
||||
autogenerate_docids: true,
|
||||
@ -402,6 +411,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
enum DatabaseType {
|
||||
Main,
|
||||
WordDocids,
|
||||
WordLevel0PositionDocids,
|
||||
FacetLevel0ValuesDocids,
|
||||
}
|
||||
|
||||
@ -467,6 +477,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
let mut word_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
||||
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len());
|
||||
let mut documents_readers = Vec::with_capacity(readers.len());
|
||||
@ -476,6 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
word_docids,
|
||||
docid_word_positions,
|
||||
words_pairs_proximities_docids,
|
||||
word_level_position_docids,
|
||||
facet_field_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents
|
||||
@ -484,6 +496,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
word_docids_readers.push(word_docids);
|
||||
docid_word_positions_readers.push(docid_word_positions);
|
||||
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
||||
word_level_position_docids_readers.push(word_level_position_docids);
|
||||
facet_field_value_docids_readers.push(facet_field_value_docids);
|
||||
field_id_docid_facet_values_readers.push(field_id_docid_facet_values);
|
||||
documents_readers.push(documents);
|
||||
@ -514,6 +527,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
facet_field_value_docids_readers,
|
||||
facet_field_value_docids_merge,
|
||||
),
|
||||
(
|
||||
DatabaseType::WordLevel0PositionDocids,
|
||||
word_level_position_docids_readers,
|
||||
word_level_position_docids_merge,
|
||||
),
|
||||
]
|
||||
.into_par_iter()
|
||||
.for_each(|(dbtype, readers, merge)| {
|
||||
@ -569,7 +587,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
||||
|
||||
let mut database_count = 0;
|
||||
let total_databases = 7;
|
||||
let total_databases = 8;
|
||||
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: 0,
|
||||
@ -661,7 +679,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
)?;
|
||||
},
|
||||
DatabaseType::FacetLevel0ValuesDocids => {
|
||||
debug!("Writing the facet values docids into LMDB on disk...");
|
||||
debug!("Writing the facet level 0 values docids into LMDB on disk...");
|
||||
let db = *self.index.facet_field_id_value_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
@ -671,6 +689,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
DatabaseType::WordLevel0PositionDocids => {
|
||||
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
||||
let db = *self.index.word_level_position_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
db,
|
||||
content,
|
||||
word_level_position_docids_merge,
|
||||
write_method,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
database_count += 1;
|
||||
@ -694,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
builder.execute()?;
|
||||
|
||||
// Run the words prefixes update operation.
|
||||
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
|
||||
if let Some(value) = self.words_prefix_threshold {
|
||||
builder.threshold(value);
|
||||
}
|
||||
@ -706,6 +732,37 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
}
|
||||
builder.execute()?;
|
||||
|
||||
// Run the word prefix docids update operation.
|
||||
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.execute()?;
|
||||
|
||||
// Run the word prefix pair proximity docids update operation.
|
||||
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.execute()?;
|
||||
|
||||
// Run the words level positions update operation.
|
||||
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
if let Some(value) = self.words_positions_level_group_size {
|
||||
builder.level_group_size(value);
|
||||
}
|
||||
if let Some(value) = self.words_positions_min_level_size {
|
||||
builder.min_level_size(value);
|
||||
}
|
||||
builder.execute()?;
|
||||
|
||||
debug_assert_eq!(database_count, total_databases);
|
||||
|
||||
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
|
||||
|
@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId
|
||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||
use super::merge_function::{
|
||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
||||
word_level_position_docids_merge, facet_field_value_docids_merge,
|
||||
field_id_docid_facet_values_merge,
|
||||
};
|
||||
|
||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||
@ -43,6 +44,7 @@ pub struct Readers {
|
||||
pub word_docids: Reader<FileFuse>,
|
||||
pub docid_word_positions: Reader<FileFuse>,
|
||||
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
||||
pub word_level_position_docids: Reader<FileFuse>,
|
||||
pub facet_field_value_docids: Reader<FileFuse>,
|
||||
pub field_id_docid_facet_values: Reader<FileFuse>,
|
||||
pub documents: Reader<FileFuse>,
|
||||
@ -69,6 +71,7 @@ pub struct Store<'s, A> {
|
||||
main_sorter: Sorter<MergeFn>,
|
||||
word_docids_sorter: Sorter<MergeFn>,
|
||||
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
||||
word_level_position_docids_sorter: Sorter<MergeFn>,
|
||||
facet_field_value_docids_sorter: Sorter<MergeFn>,
|
||||
field_id_docid_facet_values_sorter: Sorter<MergeFn>,
|
||||
// MTBL writers
|
||||
@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
) -> anyhow::Result<Self>
|
||||
{
|
||||
// We divide the max memory by the number of sorter the Store have.
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4));
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
|
||||
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
|
||||
|
||||
let main_sorter = create_sorter(
|
||||
@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let word_level_position_docids_sorter = create_sorter(
|
||||
word_level_position_docids_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let facet_field_value_docids_sorter = create_sorter(
|
||||
facet_field_value_docids_merge,
|
||||
chunk_compression_type,
|
||||
@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
main_sorter,
|
||||
word_docids_sorter,
|
||||
words_pairs_proximities_docids_sorter,
|
||||
word_level_position_docids_sorter,
|
||||
facet_field_value_docids_sorter,
|
||||
field_id_docid_facet_values_sorter,
|
||||
// MTBL writers
|
||||
@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
|
||||
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
|
||||
|
||||
words_positions.clear();
|
||||
|
||||
@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_word_position_docids(
|
||||
writer: &mut Sorter<MergeFn>,
|
||||
document_id: DocumentId,
|
||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut data_buffer = Vec::new();
|
||||
|
||||
for (word, positions) in words_positions {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word.as_bytes());
|
||||
key_buffer.push(0); // level 0
|
||||
|
||||
for position in positions {
|
||||
key_buffer.truncate(word.len() + 1);
|
||||
let position_bytes = position.to_be_bytes();
|
||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||
|
||||
data_buffer.clear();
|
||||
let positions = RoaringBitmap::from_iter(Some(document_id));
|
||||
// We serialize the positions into a buffer.
|
||||
CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer)
|
||||
.with_context(|| "could not serialize positions")?;
|
||||
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key_buffer) {
|
||||
writer.insert(&key_buffer, &data_buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_value_docids<I>(
|
||||
sorter: &mut Sorter<MergeFn>,
|
||||
iter: I,
|
||||
@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
|
||||
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
||||
|
||||
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?;
|
||||
|
||||
@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
let main = writer_into_reader(main_wtr, shrink_size)?;
|
||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?;
|
||||
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
word_docids,
|
||||
docid_word_positions,
|
||||
words_pairs_proximities_docids,
|
||||
word_level_position_docids,
|
||||
facet_field_value_docids,
|
||||
field_id_docid_facet_values,
|
||||
documents,
|
||||
|
@ -6,7 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::update_builder::UpdateBuilder;
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::words_prefixes::WordsPrefixes;
|
||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
|
||||
pub use self::words_level_positions::WordsLevelPositions;
|
||||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
mod available_documents_ids;
|
||||
mod clear_documents;
|
||||
@ -16,5 +19,7 @@ mod index_documents;
|
||||
mod settings;
|
||||
mod update_builder;
|
||||
mod update_step;
|
||||
mod words_prefixes;
|
||||
|
||||
mod word_prefix_docids;
|
||||
mod word_prefix_pair_proximity_docids;
|
||||
mod words_level_positions;
|
||||
mod words_prefixes_fst;
|
||||
|
@ -2,7 +2,7 @@ use grenad::CompressionType;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use crate::Index;
|
||||
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes};
|
||||
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
|
||||
|
||||
pub struct UpdateBuilder<'a> {
|
||||
pub(crate) log_every_n: Option<usize>,
|
||||
@ -135,19 +135,4 @@ impl<'a> UpdateBuilder<'a> {
|
||||
|
||||
builder
|
||||
}
|
||||
|
||||
pub fn words_prefixes<'t, 'u, 'i>(
|
||||
self,
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordsPrefixes<'t, 'u, 'i>
|
||||
{
|
||||
let mut builder = WordsPrefixes::new(wtxn, index, self.update_id);
|
||||
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
|
||||
builder
|
||||
}
|
||||
}
|
||||
|
75
milli/src/update/word_prefix_docids.rs
Normal file
75
milli/src/update/word_prefix_docids.rs
Normal file
@ -0,0 +1,75 @@
|
||||
use std::str;
|
||||
|
||||
use crate::Index;
|
||||
use fst::Streamer;
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database};
|
||||
|
||||
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||
WordPrefixDocids {
|
||||
wtxn,
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self) -> anyhow::Result<()> {
|
||||
// Clear the word prefix docids database.
|
||||
self.index.word_prefix_docids.clear(self.wtxn)?;
|
||||
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
|
||||
// It is forbidden to keep a mutable reference into the database
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
word_docids_merge,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We iterate over all the prefixes and retrieve the corresponding docids.
|
||||
let mut prefix_stream = prefix_fst.stream();
|
||||
while let Some(bytes) = prefix_stream.next() {
|
||||
let prefix = str::from_utf8(bytes)?;
|
||||
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.prefix_iter(self.wtxn, prefix)? {
|
||||
let (_word, data) = result?;
|
||||
prefix_docids_sorter.insert(prefix, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(prefix_fst);
|
||||
|
||||
// We finally write the word prefix docids into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_docids.as_polymorph(),
|
||||
prefix_docids_sorter,
|
||||
word_docids_merge,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
89
milli/src/update/word_prefix_pair_proximity_docids.rs
Normal file
89
milli/src/update/word_prefix_pair_proximity_docids.rs
Normal file
@ -0,0 +1,89 @@
|
||||
use std::str;
|
||||
|
||||
use fst::automaton::{Automaton, Str};
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
use grenad::CompressionType;
|
||||
use heed::BytesEncode;
|
||||
use heed::types::ByteSlice;
|
||||
use log::debug;
|
||||
|
||||
use crate::Index;
|
||||
use crate::heed_codec::StrStrU8Codec;
|
||||
use crate::update::index_documents::{
|
||||
WriteMethod, create_sorter, sorter_into_lmdb_database,
|
||||
words_pairs_proximities_docids_merge,
|
||||
};
|
||||
|
||||
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
|
||||
{
|
||||
WordPrefixPairProximityDocids {
|
||||
wtxn,
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self) -> anyhow::Result<()> {
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
|
||||
// Here we create a sorter akin to the previous one.
|
||||
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
|
||||
words_pairs_proximities_docids_merge,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We insert all the word pairs corresponding to the word-prefix pairs
|
||||
// where the prefixes appears in the prefix FST previously constructed.
|
||||
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.iter(self.wtxn)? {
|
||||
let ((word1, word2, prox), data) = result?;
|
||||
let automaton = Str::new(word2).starts_with();
|
||||
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||
while let Some(prefix) = matching_prefixes.next() {
|
||||
let prefix = str::from_utf8(prefix)?;
|
||||
let pair = (word1, prefix, prox);
|
||||
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
|
||||
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(prefix_fst);
|
||||
|
||||
// We finally write the word prefix pair proximity docids into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
word_prefix_pair_proximity_docids_sorter,
|
||||
words_pairs_proximities_docids_merge,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
261
milli/src/update/words_level_positions.rs
Normal file
261
milli/src/update/words_level_positions.rs
Normal file
@ -0,0 +1,261 @@
|
||||
use std::{cmp, str};
|
||||
use std::convert::TryFrom;
|
||||
use std::fs::File;
|
||||
use std::num::NonZeroU32;
|
||||
|
||||
use fst::automaton::{self, Automaton};
|
||||
use fst::{Streamer, IntoStreamer};
|
||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||
use heed::{BytesEncode, Error};
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::update::index_documents::{
|
||||
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
|
||||
word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
|
||||
};
|
||||
use crate::{Index, TreeLevel};
|
||||
|
||||
pub struct WordsLevelPositions<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
level_group_size: NonZeroU32,
|
||||
min_level_size: NonZeroU32,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
|
||||
WordsLevelPositions {
|
||||
wtxn,
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
level_group_size: NonZeroU32::new(4).unwrap(),
|
||||
min_level_size: NonZeroU32::new(5).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
|
||||
self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the minimum level size to `value`, stored as given, and returns
/// `self` for chaining.
pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
    let this = self;
    this.min_level_size = value;
    this
}
|
||||
|
||||
pub fn execute(self) -> anyhow::Result<()> {
|
||||
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
|
||||
|
||||
let entries = compute_positions_levels(
|
||||
self.wtxn,
|
||||
self.index.word_docids.remap_data_type::<DecodeIgnore>(),
|
||||
self.index.word_level_position_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
)?;
|
||||
|
||||
// The previously computed entries also defines the level 0 entries
|
||||
// so we can clear the database and append all of these entries.
|
||||
self.index.word_level_position_docids.clear(self.wtxn)?;
|
||||
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_level_position_docids.as_polymorph(),
|
||||
entries,
|
||||
|_, _| anyhow::bail!("invalid word level position merging"),
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
// We compute the word prefix level positions database.
|
||||
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||
|
||||
let mut word_prefix_level_positions_docids_sorter = create_sorter(
|
||||
word_prefix_level_positions_docids_merge,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We insert the word prefix level positions where the level is equal to 0 and
|
||||
// corresponds to the word-prefix level positions where the prefixes appears
|
||||
// in the prefix FST previously constructed.
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.iter(self.wtxn)? {
|
||||
let ((word, level, left, right), data) = result?;
|
||||
if level == TreeLevel::min_value() {
|
||||
let automaton = automaton::Str::new(word).starts_with();
|
||||
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||
while let Some(prefix) = matching_prefixes.next() {
|
||||
let prefix = str::from_utf8(prefix)?;
|
||||
let key = (prefix, level, left, right);
|
||||
let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
|
||||
word_prefix_level_positions_docids_sorter.insert(bytes, data)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We finally write all the word prefix level positions docids with
|
||||
// a level equal to 0 into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||
word_prefix_level_positions_docids_sorter,
|
||||
word_prefix_level_positions_docids_merge,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
let entries = compute_positions_levels(
|
||||
self.wtxn,
|
||||
self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(),
|
||||
self.index.word_prefix_level_position_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
)?;
|
||||
|
||||
// The previously computed entries also defines the level 0 entries
|
||||
// so we can clear the database and append all of these entries.
|
||||
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||
entries,
|
||||
|_, _| anyhow::bail!("invalid word prefix level position merging"),
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the next number after or equal to `x` that is divisible by `d`.
///
/// Works for any non-zero `d`, not only for powers of two. As `0` is divisible by
/// every `d`, `next_divisible(0, d)` is `0`.
///
/// # Panics
///
/// Panics if `d` is zero. The addition overflows when the result does not fit
/// in a `u32` (which panics in builds with overflow checks enabled).
fn next_divisible(x: u32, d: u32) -> u32 {
    match x % d {
        // `x` is already a multiple of `d`.
        0 => x,
        // Otherwise move up by what is missing to reach the next multiple.
        remainder => x + (d - remainder),
    }
}
|
||||
|
||||
/// Returns the previous number before or equal to `x` that is divisible by `d`,
/// which is `0` for every `x` smaller than `d`.
///
/// Works for any non-zero `d`, not only for powers of two.
///
/// # Panics
///
/// Panics if `d` is zero.
fn previous_divisible(x: u32, d: u32) -> u32 {
    // Removing the remainder rounds down to a multiple of `d`; this can neither
    // underflow nor overflow as `x % d <= x`.
    x - x % d
}
|
||||
|
||||
/// Generates all the words positions levels based on the levels zero (including the level zero).
|
||||
fn compute_positions_levels(
|
||||
rtxn: &heed::RoTxn,
|
||||
words_db: heed::Database<Str, DecodeIgnore>,
|
||||
words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||
compression_type: CompressionType,
|
||||
compression_level: Option<u32>,
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroU32,
|
||||
min_level_size: NonZeroU32,
|
||||
) -> anyhow::Result<Reader<FileFuse>>
|
||||
{
|
||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||
// therefore we write the facet levels entries into a grenad file before transfering them.
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(compression_type, compression_level, file)
|
||||
})?;
|
||||
|
||||
for result in words_db.iter(rtxn)? {
|
||||
let (word, ()) = result?;
|
||||
|
||||
let level_0_range = {
|
||||
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||
let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value());
|
||||
left..=right
|
||||
};
|
||||
|
||||
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
||||
.range(rtxn, &level_0_range)?
|
||||
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
|
||||
|
||||
// Groups sizes are always a power of the original level_group_size and therefore a group
|
||||
// always maps groups of the previous level and never splits previous levels groups in half.
|
||||
let group_size_iter = (1u8..)
|
||||
.map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32)))
|
||||
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
|
||||
|
||||
// As specified in the documentation, we also write the level 0 entries.
|
||||
for result in words_positions_db.range(rtxn, &level_0_range)? {
|
||||
let ((word, level, left, right), docids) = result?;
|
||||
write_level_entry(&mut writer, word, level, left, right, &docids)?;
|
||||
}
|
||||
|
||||
for (level, group_size) in group_size_iter {
|
||||
let mut left = 0;
|
||||
let mut right = 0;
|
||||
let mut group_docids = RoaringBitmap::new();
|
||||
|
||||
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
|
||||
let ((_word, _level, value, _right), docids) = result?;
|
||||
|
||||
if i == 0 {
|
||||
left = previous_divisible(value, group_size);
|
||||
right = left + (group_size - 1);
|
||||
}
|
||||
|
||||
if value > right {
|
||||
// we found the first bound of the next group, we must store the left
|
||||
// and right bounds associated with the docids.
|
||||
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
|
||||
|
||||
// We save the left bound for the new group and also reset the docids.
|
||||
group_docids = RoaringBitmap::new();
|
||||
left = previous_divisible(value, group_size);
|
||||
right = left + (group_size - 1);
|
||||
}
|
||||
|
||||
// The right bound is always the bound we run through.
|
||||
group_docids.union_with(&docids);
|
||||
}
|
||||
|
||||
if !group_docids.is_empty() {
|
||||
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer, shrink_size)
|
||||
}
|
||||
|
||||
fn write_level_entry(
|
||||
writer: &mut Writer<File>,
|
||||
word: &str,
|
||||
level: TreeLevel,
|
||||
left: u32,
|
||||
right: u32,
|
||||
ids: &RoaringBitmap,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
let key = (word, level, left, right);
|
||||
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
|
||||
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
|
||||
writer.insert(&key, &data)?;
|
||||
Ok(())
|
||||
}
|
@ -1,196 +0,0 @@
|
||||
use std::iter::FromIterator;
|
||||
use std::str;
|
||||
|
||||
use chrono::Utc;
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, Streamer, IntoStreamer};
|
||||
use grenad::CompressionType;
|
||||
use heed::BytesEncode;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use crate::heed_codec::StrStrU8Codec;
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
|
||||
use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
|
||||
use crate::{Index, SmallString32};
|
||||
|
||||
pub struct WordsPrefixes<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
threshold: f64,
|
||||
max_prefix_length: usize,
|
||||
_update_id: u64,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64,
|
||||
) -> WordsPrefixes<'t, 'u, 'i>
|
||||
{
|
||||
WordsPrefixes {
|
||||
wtxn,
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
threshold: 0.1 / 100.0, // .01%
|
||||
max_prefix_length: 4,
|
||||
_update_id: update_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
||||
///
|
||||
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
|
||||
/// to these bounds otherwise.
|
||||
pub fn threshold(&mut self, value: f64) -> &mut Self {
|
||||
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the maximum length of prefixes in bytes.
|
||||
///
|
||||
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
|
||||
/// to these bounds, otherwise.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
|
||||
self
|
||||
}
|
||||
|
||||
pub fn execute(self) -> anyhow::Result<()> {
|
||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
||||
// Clear the words prefixes datastructures.
|
||||
self.index.word_prefix_docids.clear(self.wtxn)?;
|
||||
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
|
||||
let words_fst = self.index.words_fst(&self.wtxn)?;
|
||||
let number_of_words = words_fst.len();
|
||||
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
|
||||
|
||||
// It is forbidden to keep a mutable reference into the database
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
word_docids_merge,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
||||
for n in 1..=self.max_prefix_length {
|
||||
|
||||
let mut current_prefix = SmallString32::new();
|
||||
let mut current_prefix_count = 0;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
let mut stream = words_fst.stream();
|
||||
while let Some(bytes) = stream.next() {
|
||||
// We try to get the first n bytes out of this string but we only want
|
||||
// to split at valid characters bounds. If we try to split in the middle of
|
||||
// a character we ignore this word and go to the next one.
|
||||
let word = str::from_utf8(bytes)?;
|
||||
let prefix = match word.get(..n) {
|
||||
Some(prefix) => prefix,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// This is the first iteration of the loop,
|
||||
// or the current word doesn't starts with the current prefix.
|
||||
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
|
||||
current_prefix = SmallString32::from(prefix);
|
||||
current_prefix_count = 0;
|
||||
}
|
||||
|
||||
current_prefix_count += 1;
|
||||
|
||||
// There is enough words corresponding to this prefix to add it to the cache.
|
||||
if current_prefix_count == min_number_of_words {
|
||||
builder.insert(prefix)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We construct the final set for prefixes of size n.
|
||||
prefix_fsts.push(builder.into_set());
|
||||
}
|
||||
|
||||
// We merge all of the previously computed prefixes into on final set.
|
||||
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(op.r#union())?;
|
||||
let prefix_fst = builder.into_set();
|
||||
|
||||
// We iterate over all the prefixes and retrieve the corresponding docids.
|
||||
let mut prefix_stream = prefix_fst.stream();
|
||||
while let Some(bytes) = prefix_stream.next() {
|
||||
let prefix = str::from_utf8(bytes)?;
|
||||
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.prefix_iter(self.wtxn, prefix)? {
|
||||
let (_word, data) = result?;
|
||||
prefix_docids_sorter.insert(prefix, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the words prefixes FST in the dtabase.
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
|
||||
|
||||
// We finally write the word prefix docids into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_docids.as_polymorph(),
|
||||
prefix_docids_sorter,
|
||||
word_docids_merge,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
// We compute the word prefix pair proximity database.
|
||||
|
||||
// Here we create a sorter akin to the previous one.
|
||||
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
|
||||
words_pairs_proximities_docids_merge,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We insert all the word pairs corresponding to the word-prefix pairs
|
||||
// where the prefixes appears in the prefix FST previously constructed.
|
||||
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.iter(self.wtxn)? {
|
||||
let ((word1, word2, prox), data) = result?;
|
||||
let automaton = Str::new(word2).starts_with();
|
||||
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||
while let Some(prefix) = matching_prefixes.next() {
|
||||
let prefix = str::from_utf8(prefix)?;
|
||||
let pair = (word1, prefix, prox);
|
||||
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
|
||||
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We finally write the word prefix pair proximity docids into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
word_prefix_pair_proximity_docids_sorter,
|
||||
words_pairs_proximities_docids_merge,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
104
milli/src/update/words_prefixes_fst.rs
Normal file
104
milli/src/update/words_prefixes_fst.rs
Normal file
@ -0,0 +1,104 @@
|
||||
use std::iter::FromIterator;
|
||||
use std::str;
|
||||
|
||||
use fst::Streamer;
|
||||
use crate::{Index, SmallString32};
|
||||
|
||||
pub struct WordsPrefixesFst<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
threshold: f64,
|
||||
max_prefix_length: usize,
|
||||
_update_id: u64,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
update_id: u64,
|
||||
) -> WordsPrefixesFst<'t, 'u, 'i>
|
||||
{
|
||||
WordsPrefixesFst {
|
||||
wtxn,
|
||||
index,
|
||||
threshold: 0.1 / 100.0, // .01%
|
||||
max_prefix_length: 4,
|
||||
_update_id: update_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
||||
///
|
||||
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
|
||||
/// to these bounds otherwise.
|
||||
pub fn threshold(&mut self, value: f64) -> &mut Self {
|
||||
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the maximum length of prefixes in bytes.
|
||||
///
|
||||
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
|
||||
/// to these bounds, otherwise.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
|
||||
self
|
||||
}
|
||||
|
||||
pub fn execute(self) -> anyhow::Result<()> {
|
||||
let words_fst = self.index.words_fst(&self.wtxn)?;
|
||||
let number_of_words = words_fst.len();
|
||||
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
|
||||
|
||||
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
||||
for n in 1..=self.max_prefix_length {
|
||||
|
||||
let mut current_prefix = SmallString32::new();
|
||||
let mut current_prefix_count = 0;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
let mut stream = words_fst.stream();
|
||||
while let Some(bytes) = stream.next() {
|
||||
// We try to get the first n bytes out of this string but we only want
|
||||
// to split at valid characters bounds. If we try to split in the middle of
|
||||
// a character we ignore this word and go to the next one.
|
||||
let word = str::from_utf8(bytes)?;
|
||||
let prefix = match word.get(..n) {
|
||||
Some(prefix) => prefix,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// This is the first iteration of the loop,
|
||||
// or the current word doesn't starts with the current prefix.
|
||||
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
|
||||
current_prefix = SmallString32::from(prefix);
|
||||
current_prefix_count = 0;
|
||||
}
|
||||
|
||||
current_prefix_count += 1;
|
||||
|
||||
// There is enough words corresponding to this prefix to add it to the cache.
|
||||
if current_prefix_count == min_number_of_words {
|
||||
builder.insert(prefix)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We construct the final set for prefixes of size n.
|
||||
prefix_fsts.push(builder.into_set());
|
||||
}
|
||||
|
||||
// We merge all of the previously computed prefixes into on final set.
|
||||
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(op.r#union())?;
|
||||
let prefix_fst = builder.into_set();
|
||||
|
||||
// Set the words prefixes FST in the dtabase.
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user