Merge pull request #122 from meilisearch/attribute-criterion

Introduce the Attribute criterion
This commit is contained in:
Clément Renault 2021-04-28 14:34:50 +02:00 committed by GitHub
commit 5a10de1b9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 2105 additions and 810 deletions

11
Cargo.lock generated
View File

@ -122,6 +122,12 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "big_s"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8"
[[package]]
name = "bincode"
version = "1.3.1"
@ -1251,6 +1257,7 @@ name = "milli"
version = "0.1.1"
dependencies = [
"anyhow",
"big_s",
"bstr",
"byteorder",
"chrono",
@ -1957,9 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1"
[[package]]
name = "roaring"
version = "0.6.5"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f"
checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447"
dependencies = [
"bytemuck",
"byteorder",

View File

@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::Display;
use std::fs::{create_dir_all, File};
use std::net::SocketAddr;
use std::num::NonZeroUsize;
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
@ -228,7 +228,6 @@ enum UpdateMeta {
ClearDocuments,
Settings(Settings),
Facets(Facets),
WordsPrefixes(WordsPrefixes),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -281,6 +280,14 @@ struct WordsPrefixes {
max_prefix_length: Option<usize>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
struct WordsLevelPositions {
level_group_size: Option<NonZeroU32>,
min_level_size: Option<NonZeroU32>,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let opt = Opt::from_args();
@ -479,21 +486,6 @@ async fn main() -> anyhow::Result<()> {
Err(e) => Err(e)
}
}
UpdateMeta::WordsPrefixes(settings) => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned);
if let Some(value) = settings.threshold {
builder.threshold(value);
}
if let Some(value) = settings.max_prefix_length {
builder.max_prefix_length(value);
}
match builder.execute() {
Ok(()) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e)
}
}
};
let meta = match result {
@ -910,19 +902,6 @@ async fn main() -> anyhow::Result<()> {
warp::reply()
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let change_words_prefixes_route = warp::filters::method::post()
.and(warp::path!("words-prefixes"))
.and(warp::body::json())
.map(move |settings: WordsPrefixes| {
let meta = UpdateMeta::WordsPrefixes(settings);
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
eprintln!("update {} registered", update_id);
warp::reply()
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let abort_update_id_route = warp::filters::method::delete()
@ -997,7 +976,6 @@ async fn main() -> anyhow::Result<()> {
.or(clearing_route)
.or(change_settings_route)
.or(change_facet_levels_route)
.or(change_words_prefixes_route)
.or(update_ws_route);
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;

View File

@ -11,7 +11,7 @@ csv = "1.1.5"
heed = "0.10.6"
jemallocator = "0.3.2"
milli = { path = "../milli" }
roaring = "0.6.5"
roaring = "0.6.6"
serde_json = "1.0.62"
stderrlog = "0.5.1"
structopt = { version = "0.3.21", default-features = false }

View File

@ -5,7 +5,7 @@ use std::{str, io, fmt};
use anyhow::Context;
use byte_unit::Byte;
use heed::EnvOpenOptions;
use milli::Index;
use milli::{Index, TreeLevel};
use structopt::StructOpt;
use Command::*;
@ -19,9 +19,11 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids";
const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids";
const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids";
const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values";
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids";
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values";
const DOCUMENTS_DB_NAME: &str = "documents";
const ALL_DATABASE_NAMES: &[&str] = &[
@ -31,8 +33,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[
DOCID_WORD_POSITIONS_DB_NAME,
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
FACET_FIELD_ID_VALUE_DOCIDS_NAME,
FIELD_ID_DOCID_FACET_VALUES_NAME,
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_VALUES_DB_NAME,
DOCUMENTS_DB_NAME,
];
@ -114,6 +118,27 @@ enum Command {
field_name: String,
},
/// Outputs a CSV with the documents ids along with the word level positions where it appears.
WordsLevelPositionsDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// Words appearing in the documents.
words: Vec<String>,
},
/// Outputs a CSV with the documents ids along with
/// the word prefix level positions where it appears.
WordPrefixesLevelPositionsDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// Prefixes of words appearing in the documents.
prefixes: Vec<String>,
},
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
DocidsWordsPositions {
/// Display the whole positions in detail.
@ -221,6 +246,12 @@ fn main() -> anyhow::Result<()> {
FacetValuesDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, field_name)
},
WordsLevelPositionsDocids { full_display, words } => {
words_level_positions_docids(&index, &rtxn, !full_display, words)
},
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
},
DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
},
@ -319,9 +350,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values: _,
documents,
documents
} = index;
let main_name = "main";
@ -330,6 +363,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let docid_word_positions_name = "docid_word_positions";
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids";
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
let documents_name = "documents";
@ -386,6 +421,20 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); }
}
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
if heap.len() > limit { heap.pop(); }
}
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
if heap.len() > limit { heap.pop(); }
}
let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields {
@ -524,6 +573,84 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
Ok(wtr.flush()?)
}
fn words_level_positions_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
words: Vec<String>,
) -> anyhow::Result<()>
{
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
for word in words.iter().map(AsRef::as_ref) {
let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
left..=right
};
for result in index.word_level_position_docids.range(rtxn, &range)? {
let ((w, level, left, right), docids) = result?;
let count = docids.len().to_string();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
let position_range = if level == TreeLevel::min_value() {
format!("{:?}", left)
} else {
format!("{:?}", left..=right)
};
let level = level.to_string();
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
}
}
Ok(wtr.flush()?)
}
fn word_prefixes_level_positions_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
prefixes: Vec<String>,
) -> anyhow::Result<()>
{
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
for word in prefixes.iter().map(AsRef::as_ref) {
let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
left..=right
};
for result in index.word_prefix_level_position_docids.range(rtxn, &range)? {
let ((w, level, left, right), docids) = result?;
let count = docids.len().to_string();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
let position_range = if level == TreeLevel::min_value() {
format!("{:?}", left)
} else {
format!("{:?}", left..=right)
};
let level = level.to_string();
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
}
}
Ok(wtr.flush()?)
}
fn docids_words_positions(
index: &Index,
rtxn: &heed::RoTxn,
@ -715,6 +842,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
use heed::types::ByteSlice;
let Index {
env: _,
main,
word_docids,
word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
} = index;
let names = if names.is_empty() {
ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect()
} else {
@ -723,30 +865,35 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
for name in names {
let database = match name.as_str() {
MAIN_DB_NAME => &index.main,
WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(),
WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(),
FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(),
DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
MAIN_DB_NAME => &main,
WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(),
WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(),
DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(),
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(),
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(),
DOCUMENTS_DB_NAME => documents.as_polymorph(),
unknown => anyhow::bail!("unknown database {:?}", unknown),
};
let mut key_size: u64 = 0;
let mut val_size: u64 = 0;
let mut number_entries: u64 = 0;
for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
let (k, v) = result?;
key_size += k.len() as u64;
val_size += v.len() as u64;
number_entries += 1;
}
println!("The {} database weigh:", name);
println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true));
println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true));
println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true));
println!("\tnumber of entries: {}", number_entries);
}
Ok(())

View File

@ -27,7 +27,7 @@ once_cell = "1.5.2"
ordered-float = "2.1.1"
rayon = "1.5.0"
regex = "1.4.3"
roaring = "0.6.5"
roaring = "0.6.6"
serde = { version = "1.0.123", features = ["derive"] }
serde_json = { version = "1.0.62", features = ["preserve_order"] }
slice-group-by = "0.2.6"
@ -52,13 +52,11 @@ logging_timer = "1.0.0"
tinytemplate = "=1.1.0"
[dev-dependencies]
big_s = "1.0.2"
criterion = "0.3.4"
maplit = "1.0.2"
rand = "0.8.3"
[build-dependencies]
fst = "0.4.5"
[features]
default = []

View File

@ -2,6 +2,7 @@ mod beu32_str_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_level_position_codec;
mod str_str_u8_codec;
pub mod facet;
@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -0,0 +1,45 @@
use std::borrow::Cow;
use std::convert::{TryFrom, TryInto};
use std::mem::size_of;
use std::str;
use crate::TreeLevel;
pub struct StrLevelPositionCodec;
impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
type DItem = (&'a str, TreeLevel, u32, u32);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
if bytes.len() < footer_len { return None }
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word).ok()?;
let (level, bytes) = bytes.split_first()?;
let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?;
let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?;
let level = TreeLevel::try_from(*level).ok()?;
Some((word, level, left, right))
}
}
impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
type EItem = (&'a str, TreeLevel, u32, u32);
fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let left = left.to_be_bytes();
let right = right.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
bytes.extend_from_slice(word.as_bytes());
bytes.push((*level).into());
bytes.extend_from_slice(&left[..]);
bytes.extend_from_slice(&right[..]);
Some(Cow::Owned(bytes))
}
}

View File

@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution,
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@ -52,6 +52,10 @@ pub struct Index {
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that corresponds to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the globally ordered value.
@ -62,7 +66,7 @@ pub struct Index {
impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
options.max_dbs(9);
options.max_dbs(11);
let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?;
@ -71,6 +75,8 @@ impl Index {
let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
let documents = env.create_database(Some("documents"))?;
@ -94,6 +100,8 @@ impl Index {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,

View File

@ -9,6 +9,7 @@ pub mod facet;
pub mod heed_codec;
pub mod index;
pub mod proximity;
pub mod tree_level;
pub mod update;
use std::borrow::Cow;
@ -22,11 +23,12 @@ use serde_json::{Map, Value};
pub use self::criterion::{Criterion, default_criteria};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
pub use self::tree_level::TreeLevel;
pub use self::update_store::UpdateStore;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -31,32 +31,10 @@ pub struct AscDesc<'t> {
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
bucket_candidates: RoaringBitmap,
faceted_candidates: RoaringBitmap,
parent: Option<Box<dyn Criterion + 't>>,
parent: Box<dyn Criterion + 't>,
}
impl<'t> AscDesc<'t> {
pub fn initial_asc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_name: String,
) -> anyhow::Result<Self>
{
Self::initial(index, rtxn, query_tree, candidates, field_name, true)
}
pub fn initial_desc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_name: String,
) -> anyhow::Result<Self>
{
Self::initial(index, rtxn, query_tree, candidates, field_name, false)
}
pub fn asc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> {
Self::new(index, rtxn, parent, field_name, false)
}
fn initial(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_name: String,
ascending: bool,
) -> anyhow::Result<Self>
{
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?;
let candidates = match &query_tree {
Some(qt) => {
let context = CriteriaBuilder::new(rtxn, index)?;
let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?;
if let Some(candidates) = candidates {
qt_candidates.intersect_with(&candidates);
}
qt_candidates
},
None => candidates.unwrap_or(faceted_candidates.clone()),
};
Ok(AscDesc {
index,
rtxn,
field_name,
field_id,
facet_type,
ascending,
query_tree,
candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?,
faceted_candidates,
bucket_candidates: RoaringBitmap::new(),
parent: None,
})
}
fn new(
index: &'t Index,
rtxn: &'t heed::RoTxn,
@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> {
candidates: Box::new(std::iter::empty()),
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
parent,
})
}
}
@ -156,12 +93,9 @@ impl<'t> Criterion for AscDesc<'t> {
match self.candidates.next().transpose()? {
None => {
let query_tree = self.query_tree.take();
let bucket_candidates = take(&mut self.bucket_candidates);
match self.parent.as_mut() {
Some(parent) => {
match parent.next(wdcache)? {
match self.parent.next(wdcache)? {
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
self.query_tree = query_tree;
let candidates = match (&self.query_tree, candidates) {
(_, Some(mut candidates)) => {
@ -176,11 +110,22 @@ impl<'t> Criterion for AscDesc<'t> {
},
(None, None) => take(&mut self.faceted_candidates),
};
if bucket_candidates.is_empty() {
self.bucket_candidates.union_with(&candidates);
} else {
// If our parent returns candidates it means that the bucket
// candidates were already computed before and we can use them.
//
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
}
if candidates.is_empty() {
continue;
}
self.candidates = facet_ordered(
self.index,
self.rtxn,
@ -193,27 +138,11 @@ impl<'t> Criterion for AscDesc<'t> {
None => return Ok(None),
}
},
None => if query_tree.is_none() && bucket_candidates.is_empty() {
return Ok(None)
},
}
return Ok(Some(CriterionResult {
query_tree,
candidates: Some(RoaringBitmap::new()),
bucket_candidates,
}));
},
Some(candidates) => {
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(candidates),
bucket_candidates,
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
}

View File

@ -0,0 +1,737 @@
use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap};
use std::collections::{BTreeMap, HashMap, btree_map};
use std::collections::binary_heap::PeekMut;
use std::mem::take;
use roaring::RoaringBitmap;
use crate::{TreeLevel, search::build_dfa};
use crate::search::criteria::Query;
use crate::search::query_tree::{Operation, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use super::{Criterion, CriterionResult, Context, resolve_query_tree};
/// To be able to divide integers by the number of words in the query
/// we want to find a multiplier that allow us to divide by any number between 1 and 10.
/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple).
const LCM_10_FIRST_NUMBERS: u32 = 2520;
/// To compute the interval size of a level,
/// we use 4 as the exponentiation base and the level as the exponent.
const LEVEL_EXPONENTIATION_BASE: u32 = 4;
pub struct Attribute<'t> {
ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
bucket_candidates: RoaringBitmap,
parent: Box<dyn Criterion + 't>,
flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>,
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
}
impl<'t> Attribute<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Attribute {
ctx,
query_tree: None,
candidates: None,
bucket_candidates: RoaringBitmap::new(),
parent,
flattened_query_tree: None,
current_buckets: None,
}
}
}
impl<'t> Criterion for Attribute<'t> {
#[logging_timer::time("Attribute::{}")]
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
loop {
match (&self.query_tree, &mut self.candidates) {
(_, Some(candidates)) if candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(Some(qt), Some(candidates)) => {
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| {
flatten_query_tree(&qt)
});
let found_candidates = if candidates.len() < 1000 {
let current_buckets = match self.current_buckets.as_mut() {
Some(current_buckets) => current_buckets,
None => {
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
self.current_buckets.get_or_insert(new_buckets.into_iter())
},
};
match current_buckets.next() {
Some((_score, candidates)) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
}
} else {
match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? {
Some(candidates) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
}
};
candidates.difference_with(&found_candidates);
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(found_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(Some(qt), None) => {
let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?;
self.bucket_candidates |= &query_tree_candidates;
self.candidates = Some(query_tree_candidates);
},
(None, Some(_)) => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, None) => {
match self.parent.next(wdcache)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_tree = query_tree;
self.candidates = candidates;
self.bucket_candidates |= bucket_candidates;
self.flattened_query_tree = None;
self.current_buckets = None;
},
None => return Ok(None),
}
},
}
}
}
}
/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word,
/// it will begin at the first non-empty interval and will return every interval without
/// jumping over empty intervals.
struct WordLevelIterator<'t, 'q> {
inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>,
level: TreeLevel,
interval_size: u32,
word: Cow<'q, str>,
in_prefix_cache: bool,
inner_next: Option<(u32, u32, RoaringBitmap)>,
current_interval: Option<(u32, u32)>,
}
impl<'t, 'q> WordLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
match ctx.word_position_last_level(&word, in_prefix_cache)? {
Some(level) => {
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
},
None => Ok(None),
}
}
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
let level = level.min(&self.level).clone();
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
let word = self.word.clone();
let in_prefix_cache = self.in_prefix_cache;
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None})
}
fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left }
let inner_next = match self.inner_next.take() {
Some(inner_next) => Some(inner_next),
None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)),
};
match inner_next {
Some((left, right, docids)) => {
match self.current_interval {
Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
let blank_left = last_left + self.interval_size;
let blank_right = last_right + self.interval_size;
self.current_interval = Some((blank_left, blank_right));
self.inner_next = Some((left, right, docids));
Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
},
_ => {
self.current_interval = Some((left, right));
Ok(Some((left, right, docids)))
}
}
},
None => Ok(None),
}
}
}
/// QueryLevelIterator is an pseudo-Iterator for a Query,
/// It contains WordLevelIterators and is chainned with other QueryLevelIterator.
struct QueryLevelIterator<'t, 'q> {
parent: Option<Box<QueryLevelIterator<'t, 'q>>>,
inner: Vec<WordLevelIterator<'t, 'q>>,
level: TreeLevel,
accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
parent_accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
interval_to_skip: usize,
}
impl<'t, 'q> QueryLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec<Query>, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> {
let mut inner = Vec::with_capacity(queries.len());
for query in queries {
match &query.kind {
QueryKind::Exact { word, .. } => {
if !query.prefix || ctx.in_prefix_cache(&word) {
let word = Cow::Borrowed(query.kind.word());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? {
inner.push(word_level_iterator);
}
} else {
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? {
let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
inner.push(word_level_iterator);
}
}
}
},
QueryKind::Tolerant { typo, word } => {
for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? {
let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
inner.push(word_level_iterator);
}
}
}
}
}
let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone());
match highest {
Some(level) => Ok(Some(Self {
parent: None,
inner,
level,
accumulator: vec![],
parent_accumulator: vec![],
interval_to_skip: 0,
})),
None => Ok(None),
}
}
fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self {
self.parent = Some(Box::new(parent));
self
}
/// create a new QueryLevelIterator with a lower level than the current one.
fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result<Self> {
let (level, parent) = match &self.parent {
Some(parent) => {
let parent = parent.dig(ctx)?;
(parent.level.min(self.level), Some(Box::new(parent)))
},
None => (self.level.saturating_sub(1), None),
};
let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten();
let mut inner = Vec::with_capacity(self.inner.len());
for word_level_iterator in self.inner.iter() {
inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
}
Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0})
}
fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None;
let u8_level = Into::<u8>::into(level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
for wli in self.inner.iter_mut() {
let wli_u8_level = Into::<u8>::into(wli.level.clone());
let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
for _ in 0..accumulated_count {
if let Some((next_left, _, next_docids)) = wli.next()? {
accumulated = match accumulated.take(){
Some((acc_left, acc_right, mut acc_docids)) => {
acc_docids |= next_docids;
Some((acc_left, acc_right, acc_docids))
},
None => Some((next_left, next_left + interval_size, next_docids)),
};
}
}
}
Ok(accumulated)
}
/// return the next meta-interval created from inner WordLevelIterators,
/// and from eventual chainned QueryLevelIterator.
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
let parent_result = match self.parent.as_mut() {
Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
None => None,
};
match parent_result {
Some(parent_next) => {
let inner_next = self.inner_next(tree_level)?;
self.interval_to_skip += interval_to_skip(
&self.parent_accumulator,
&self.accumulator,
self.interval_to_skip,
allowed_candidates
);
self.accumulator.push(inner_next);
self.parent_accumulator.push(parent_next);
let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) {
if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
match merged_interval.as_mut() {
Some((_, _, merged_docids)) => *merged_docids |= a & b,
None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)),
}
}
}
Ok(merged_interval)
},
None => {
let level = self.level;
match self.inner_next(level)? {
Some((left, right, mut candidates)) => {
self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
candidates &= allowed_candidates;
Ok(Some((left, right, candidates)))
},
None => {
self.accumulator = vec![None];
Ok(None)
},
}
}
}
}
}
/// Count the number of interval that can be skiped when we make the cross-intersections
/// in order to compute the next meta-interval.
/// A pair of intervals is skiped when both intervals doesn't contain any allowed docids.
fn interval_to_skip(
parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
current_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
already_skiped: usize,
allowed_candidates: &RoaringBitmap,
) -> usize {
parent_accumulator.into_iter()
.zip(current_accumulator.into_iter())
.skip(already_skiped)
.take_while(|(parent, current)| {
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
skip_parent && skip_current
})
.count()
}
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
/// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates.
struct Branch<'t, 'q> {
query_level_iterator: QueryLevelIterator<'t, 'q>,
last_result: (u32, u32, RoaringBitmap),
tree_level: TreeLevel,
branch_size: u32,
}
impl<'t, 'q> Branch<'t, 'q> {
/// return the next meta-interval of the branch,
/// and update inner interval in order to be ranked by the BinaryHeap.
fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> {
let tree_level = self.query_level_iterator.level;
match self.query_level_iterator.next(allowed_candidates, tree_level)? {
Some(last_result) => {
self.last_result = last_result;
self.tree_level = tree_level;
Ok(true)
},
None => Ok(false),
}
}
/// make the current Branch iterate over smaller intervals.
fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> {
self.query_level_iterator = self.query_level_iterator.dig(ctx)?;
Ok(())
}
/// because next() method could be time consuming,
/// update inner interval in order to be ranked by the binary_heap without computing it,
/// the next() method should be called when the real interval is needed.
fn lazy_next(&mut self) {
let u8_level = Into::<u8>::into(self.tree_level.clone());
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
let (left, right, _) = self.last_result;
self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new());
}
/// return the score of the current inner interval.
fn compute_rank(&self) -> u32 {
// we compute a rank from the left interval.
let (left, _, _) = self.last_result;
left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
}
fn cmp(&self, other: &Self) -> Ordering {
let self_rank = self.compute_rank();
let other_rank = other.compute_rank();
let left_cmp = self_rank.cmp(&other_rank).reverse();
// on level: lower is better,
// we want to dig faster into levels on interesting branches.
let level_cmp = self.tree_level.cmp(&other.tree_level).reverse();
left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len()))
}
}
impl<'t, 'q> Ord for Branch<'t, 'q> {
fn cmp(&self, other: &Self) -> Ordering {
self.cmp(other)
}
}
impl<'t, 'q> PartialOrd for Branch<'t, 'q> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'t, 'q> PartialEq for Branch<'t, 'q> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<'t, 'q> Eq for Branch<'t, 'q> {}
fn initialize_query_level_iterators<'t, 'q>(
ctx: &'t dyn Context<'t>,
branches: &'q Vec<Vec<Vec<Query>>>,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> {
let mut positions = BinaryHeap::with_capacity(branches.len());
for branch in branches {
let mut branch_positions = Vec::with_capacity(branch.len());
for queries in branch {
match QueryLevelIterator::new(ctx, queries, wdcache)? {
Some(qli) => branch_positions.push(qli),
None => {
// the branch seems to be invalid, so we skip it.
branch_positions.clear();
break;
},
}
}
// QueryLevelIterator need to be sorted by level and folded in descending order.
branch_positions.sort_unstable_by_key(|qli| qli.level);
let folded_query_level_iterators = branch_positions
.into_iter()
.fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold {
Some(fold) => {
qli.parent(fold);
Some(qli)
},
None => Some(qli),
});
if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
let tree_level = folded_query_level_iterators.level;
let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?;
if let Some(last_result) = last_result {
let branch = Branch {
last_result,
tree_level,
query_level_iterator: folded_query_level_iterators,
branch_size: branch.len() as u32,
};
positions.push(branch);
}
}
}
Ok(positions)
}
fn set_compute_candidates<'t>(
ctx: &'t dyn Context<'t>,
branches: &Vec<Vec<Vec<Query>>>,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Option<RoaringBitmap>>
{
let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
let lowest_level = TreeLevel::min_value();
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
let mut allowed_candidates = allowed_candidates.clone();
while let Some(mut branch) = branches_heap.peek_mut() {
let is_lowest_level = branch.tree_level == lowest_level;
let branch_rank = branch.compute_rank();
// if current is worst than best we break to return
// candidates that correspond to the best rank
if let Some((best_rank, _)) = final_candidates {
if branch_rank > best_rank { break }
}
let _left = branch.last_result.0;
let candidates = take(&mut branch.last_result.2);
if candidates.is_empty() {
// we don't have candidates, get next interval.
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
}
else if is_lowest_level {
// we have candidates, but we can't dig deeper.
allowed_candidates -= &candidates;
final_candidates = match final_candidates.take() {
// we add current candidates to best candidates
Some((best_rank, mut best_candidates)) => {
best_candidates |= candidates;
branch.lazy_next();
Some((best_rank, best_candidates))
},
// we take current candidates as best candidates
None => {
branch.lazy_next();
Some((branch_rank, candidates))
},
};
} else {
// we have candidates, lets dig deeper in levels.
branch.dig(ctx)?;
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
}
}
Ok(final_candidates.map(|(_rank, candidates)| candidates))
}
fn linear_compute_candidates(
ctx: &dyn Context,
branches: &Vec<Vec<Vec<Query>>>,
allowed_candidates: &RoaringBitmap,
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
{
fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
let branch_len = branch.len();
let mut branch_rank = Vec::with_capacity(branch_len);
for derivates in branch {
let mut position = None;
for Query { prefix, kind } in derivates {
// find the best position of the current word in the document.
let current_position = match kind {
QueryKind::Exact { word, .. } => {
if *prefix {
word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().next()).min()
} else {
words_positions.get(word)
.map(|positions| positions.iter().next())
.flatten()
}
},
QueryKind::Tolerant { typo, word } => {
word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().next()).min()
},
};
match (position, current_position) {
(Some(p), Some(cp)) => position = Some(cmp::min(p, cp)),
(None, Some(cp)) => position = Some(cp),
_ => (),
}
}
// if a position is found, we add it to the branch score,
// otherwise the branch is considered as unfindable in this document and we break.
if let Some(position) = position {
branch_rank.push(position as u64);
} else {
branch_rank.clear();
break;
}
}
if !branch_rank.is_empty() {
branch_rank.sort_unstable();
// because several words in same query can't match all a the position 0,
// we substract the word index to the position.
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
// here we do the means of the words of the branch
min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
}
}
min_rank
}
fn word_derivations<'a>(
word: &str,
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;
match dfa.eval(document_word) {
Distance::Exact(_) => Some(positions),
Distance::AtLeast(_) => None,
}
})
}
let mut candidates = BTreeMap::new();
for docid in allowed_candidates {
let words_positions = ctx.docid_words_positions(docid)?;
let rank = compute_candidate_rank(branches, words_positions);
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
}
Ok(candidates)
}
// TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
use crate::search::criteria::Operation::{And, Or, Consecutive};
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> {
match tail.split_first() {
Some((thead, tail)) => {
let tail = and_recurse(thead, tail);
let mut out = Vec::new();
for array in recurse(head) {
for tail_array in &tail {
let mut array = array.clone();
array.extend(tail_array.iter().cloned());
out.push(array);
}
}
out
},
None => recurse(head),
}
}
fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> {
match op {
And(ops) | Consecutive(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
},
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
} else {
ops.into_iter().map(recurse).flatten().collect()
},
Operation::Query(query) => vec![vec![vec![query.clone()]]],
}
}
recurse(query_tree)
}
#[cfg(test)]
mod tests {
use big_s::S;
use crate::search::criteria::QueryKind;
use super::*;
#[test]
fn simple_flatten_query_tree() {
let query_tree = Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
Operation::Or(false, vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
]),
]),
]),
]);
let expected = vec![
vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }],
],
vec![
vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }],
vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
],
];
let result = flatten_query_tree(&query_tree);
assert_eq!(expected, result);
}
}

View File

@ -1,135 +0,0 @@
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache;
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
/// The result of a call to the fetcher.
#[derive(Debug, Clone, PartialEq)]
pub struct FetcherResult {
/// The query tree corresponding to the current bucket of the last criterion.
pub query_tree: Option<Operation>,
/// The candidates of the current bucket of the last criterion.
pub candidates: RoaringBitmap,
/// Candidates that comes from the current bucket of the initial criterion.
pub bucket_candidates: RoaringBitmap,
}
pub struct Fetcher<'t> {
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Candidates,
parent: Option<Box<dyn Criterion + 't>>,
should_get_documents_ids: bool,
wdcache: WordDerivationsCache,
}
impl<'t> Fetcher<'t> {
pub fn initial(
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
) -> Self
{
Fetcher {
ctx,
query_tree,
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
parent: None,
should_get_documents_ids: true,
wdcache: WordDerivationsCache::new(),
}
}
pub fn new(
ctx: &'t dyn Context,
parent: Box<dyn Criterion + 't>,
) -> Self
{
Fetcher {
ctx,
query_tree: None,
candidates: Candidates::default(),
parent: Some(parent),
should_get_documents_ids: true,
wdcache: WordDerivationsCache::new(),
}
}
#[logging_timer::time("Fetcher::{}")]
pub fn next(&mut self) -> anyhow::Result<Option<FetcherResult>> {
use Candidates::{Allowed, Forbidden};
loop {
debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})",
self.should_get_documents_ids, self.candidates,
);
let should_get_documents_ids = take(&mut self.should_get_documents_ids);
match &mut self.candidates {
Allowed(_) => {
let candidates = take(&mut self.candidates).into_inner();
let candidates = match &self.query_tree {
Some(qt) if should_get_documents_ids => {
let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?;
docids.intersect_with(&candidates);
docids
},
_ => candidates,
};
return Ok(Some(FetcherResult {
query_tree: self.query_tree.take(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
Forbidden(_) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next(&mut self.wdcache)? {
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
(None, None) => RoaringBitmap::new(),
};
return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates }))
},
None => if should_get_documents_ids {
let candidates = match &self.query_tree {
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?,
None => self.ctx.documents_ids()?,
};
return Ok(Some(FetcherResult {
query_tree: self.query_tree.clone(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
}
},
None => if should_get_documents_ids {
let candidates = match &self.query_tree {
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?,
None => self.ctx.documents_ids()?,
};
return Ok(Some(FetcherResult {
query_tree: self.query_tree.clone(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
}
return Ok(None);
},
}
}
}
}

View File

@ -0,0 +1,53 @@
use std::collections::HashMap;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache;
use super::{resolve_query_tree, Criterion, CriterionResult, Context};
/// The result of a call to the fetcher.
#[derive(Debug, Clone, PartialEq)]
pub struct FinalResult {
/// The query tree corresponding to the current bucket of the last criterion.
pub query_tree: Option<Operation>,
/// The candidates of the current bucket of the last criterion.
pub candidates: RoaringBitmap,
/// Candidates that comes from the current bucket of the initial criterion.
pub bucket_candidates: RoaringBitmap,
}
pub struct Final<'t> {
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
wdcache: WordDerivationsCache,
}
impl<'t> Final<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
Final { ctx, parent, wdcache: WordDerivationsCache::new() }
}
#[logging_timer::time("Final::{}")]
pub fn next(&mut self) -> anyhow::Result<Option<FinalResult>> {
loop {
debug!("Final iteration");
match self.parent.next(&mut self.wdcache)? {
Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => {
let candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
(None, None) => self.ctx.documents_ids()?,
};
bucket_candidates.union_with(&candidates);
return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }));
},
None => return Ok(None),
}
}
}
}

View File

@ -0,0 +1,28 @@
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache;
use super::{Criterion, CriterionResult};
pub struct Initial {
answer: Option<CriterionResult>
}
impl Initial {
pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial {
let answer = CriterionResult {
query_tree,
candidates: candidates.clone(),
bucket_candidates: candidates.take().unwrap_or_default(),
};
Initial { answer: Some(answer) }
}
}
impl Criterion for Initial {
#[logging_timer::time("Initial::{}")]
fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
Ok(self.answer.take())
}
}

View File

@ -4,21 +4,25 @@ use std::borrow::Cow;
use anyhow::bail;
use roaring::RoaringBitmap;
use crate::search::{word_derivations, WordDerivationsCache};
use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}};
use crate::{Index, DocumentId};
use super::query_tree::{Operation, Query, QueryKind};
use self::asc_desc::AscDesc;
use self::attribute::Attribute;
use self::r#final::Final;
use self::initial::Initial;
use self::proximity::Proximity;
use self::typo::Typo;
use self::words::Words;
use self::asc_desc::AscDesc;
use self::proximity::Proximity;
use self::fetcher::Fetcher;
mod asc_desc;
mod attribute;
mod initial;
mod proximity;
mod typo;
mod words;
mod asc_desc;
mod proximity;
pub mod fetcher;
pub mod r#final;
pub trait Criterion {
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>>;
@ -59,7 +63,8 @@ impl Default for Candidates {
Self::Forbidden(RoaringBitmap::new())
}
}
pub trait Context {
pub trait Context<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
@ -68,6 +73,8 @@ pub trait Context {
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>;
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>;
}
pub struct CriteriaBuilder<'t> {
rtxn: &'t heed::RoTxn<'t>,
@ -76,7 +83,7 @@ pub struct CriteriaBuilder<'t> {
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
}
impl<'a> Context for CriteriaBuilder<'a> {
impl<'c> Context<'c> for CriteriaBuilder<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
self.index.documents_ids(self.rtxn)
}
@ -115,6 +122,48 @@ impl<'a> Context for CriteriaBuilder<'a> {
}
Ok(words_positions)
}
fn word_position_iterator(
&self,
word: &str,
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>
{
let range = {
let left = left.unwrap_or(u32::min_value());
let right = right.unwrap_or(u32::max_value());
let left = (word, level, left, left);
let right = (word, level, right, right);
left..=right
};
let db = match in_prefix_cache {
true => self.index.word_prefix_level_position_docids,
false => self.index.word_level_position_docids,
};
Ok(Box::new(db.range(self.rtxn, &range)?))
}
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
left..=right
};
let db = match in_prefix_cache {
true => self.index.word_prefix_level_position_docids,
false => self.index.word_level_position_docids,
};
let last_level = db
.remap_data_type::<heed::types::DecodeIgnore>()
.range(self.rtxn, &range)?.last().transpose()?
.map(|((_, level, _, _), _)| level);
Ok(last_level)
}
}
impl<'t> CriteriaBuilder<'t> {
@ -126,42 +175,26 @@ impl<'t> CriteriaBuilder<'t> {
pub fn build(
&'t self,
mut query_tree: Option<Operation>,
mut facet_candidates: Option<RoaringBitmap>,
) -> anyhow::Result<Fetcher<'t>>
query_tree: Option<Operation>,
facet_candidates: Option<RoaringBitmap>,
) -> anyhow::Result<Final<'t>>
{
use crate::criterion::Criterion as Name;
let mut criterion = None as Option<Box<dyn Criterion>>;
let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box<dyn Criterion>;
for name in self.index.criteria(&self.rtxn)? {
criterion = Some(match criterion.take() {
Some(father) => match name {
Name::Typo => Box::new(Typo::new(self, father)),
Name::Words => Box::new(Words::new(self, father)),
Name::Proximity => Box::new(Proximity::new(self, father)),
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?),
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?),
_otherwise => father,
},
None => match name {
Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())),
Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())),
Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())),
Name::Asc(field) => {
Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?)
},
Name::Desc(field) => {
Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?)
},
_otherwise => continue,
},
});
criterion = match name {
Name::Typo => Box::new(Typo::new(self, criterion)),
Name::Words => Box::new(Words::new(self, criterion)),
Name::Proximity => Box::new(Proximity::new(self, criterion)),
Name::Attribute => Box::new(Attribute::new(self, criterion)),
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?),
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?),
_otherwise => criterion,
};
}
match criterion {
Some(criterion) => Ok(Fetcher::new(self, criterion)),
None => Ok(Fetcher::initial(self, query_tree, facet_candidates)),
}
Ok(Final::new(self, criterion))
}
}
@ -362,9 +395,10 @@ pub mod test {
word_prefix_docids: HashMap<String, RoaringBitmap>,
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
docid_words: HashMap<u32, Vec<String>>,
}
impl<'a> Context for TestContext<'a> {
impl<'c> Context<'c> for TestContext<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids))
}
@ -395,7 +429,24 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string())
}
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
if let Some(docid_words) = self.docid_words.get(&docid) {
Ok(docid_words
.iter()
.enumerate()
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))))
.collect()
)
} else {
Ok(HashMap::new())
}
}
fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> {
todo!()
}
fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> {
todo!()
}
}
@ -431,50 +482,58 @@ pub mod test {
s("morning") => random_postings(rng, 125),
};
let mut docid_words = HashMap::new();
for (word, docids) in word_docids.iter() {
for docid in docids {
let words = docid_words.entry(docid).or_insert(vec![]);
words.push(word.clone());
}
}
let word_prefix_docids = hashmap!{
s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
};
let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")];
let hello_world_split = (hello_world.len() / 2) as usize;
let hello_world_1 = hello_world.iter().take(hello_world_split).collect();
let hello_world_2 = hello_world.iter().skip(hello_world_split).collect();
let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")];
let hello_word_split = (hello_word.len() / 2) as usize;
let hello_word_4 = hello_word.iter().take(hello_word_split).collect();
let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect();
let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect();
let word_pair_proximity_docids = hashmap!{
(s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")],
(s("hello"), s("world"), 1) => hello_world_1,
(s("hello"), s("world"), 4) => hello_world_2,
(s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")],
(s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
(s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
(s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
(s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
(s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")],
(s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")],
(s("hello"), s("word"), 4) => hello_word_4,
(s("hello"), s("word"), 6) => hello_word_6,
(s("hello"), s("word"), 7) => hello_word_7,
(s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")],
(s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
(s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")],
(s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
let mut word_pair_proximity_docids = HashMap::new();
let mut word_prefix_pair_proximity_docids = HashMap::new();
for (lword, lcandidates) in &word_docids {
for (rword, rcandidates) in &word_docids {
if lword == rword { continue }
let candidates = lcandidates & rcandidates;
for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) {
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = docid_words.iter().position(|w| w == rword).unwrap();
let key = if lposition < rposition {
(s(lword), s(rword), (rposition - lposition) as i32)
} else {
(s(lword), s(rword), (lposition - rposition + 1) as i32)
};
let word_prefix_pair_proximity_docids = hashmap!{
(s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(),
(s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(),
(s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(),
(s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(),
(s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(),
(s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(),
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
docids.push(candidate);
}
}
}
for (pword, pcandidates) in &word_prefix_docids {
if lword.starts_with(pword) { continue }
let candidates = lcandidates & pcandidates;
for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) {
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let key = if lposition < rposition {
(s(lword), s(pword), (rposition - lposition) as i32)
} else {
(s(lword), s(pword), (lposition - rposition + 1) as i32)
};
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
docids.push(candidate);
}
}
}
}
let mut keys = word_docids.keys().collect::<Vec<_>>();
keys.sort_unstable();
@ -486,6 +545,7 @@ pub mod test {
word_prefix_docids,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
docid_words,
}
}
}

View File

@ -8,48 +8,29 @@ use log::debug;
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::{build_dfa, WordDerivationsCache};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
pub struct Proximity<'t> {
ctx: &'t dyn Context,
query_tree: Option<(usize, Operation)>,
ctx: &'t dyn Context<'t>,
/// ((max_proximity, query_tree), allowed_candidates)
state: Option<(Option<(usize, Operation)>, RoaringBitmap)>,
proximity: u8,
candidates: Candidates,
bucket_candidates: RoaringBitmap,
parent: Option<Box<dyn Criterion + 't>>,
parent: Box<dyn Criterion + 't>,
candidates_cache: Cache,
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
}
impl<'t> Proximity<'t> {
pub fn initial(
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
) -> Self
{
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Proximity {
ctx,
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)),
state: None,
proximity: 0,
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
bucket_candidates: RoaringBitmap::new(),
parent: None,
candidates_cache: Cache::new(),
plane_sweep_cache: None,
}
}
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
Proximity {
ctx,
query_tree: None,
proximity: 0,
candidates: Candidates::default(),
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
parent,
candidates_cache: Cache::new(),
plane_sweep_cache: None,
}
@ -59,27 +40,20 @@ impl<'t> Proximity<'t> {
impl<'t> Criterion for Proximity<'t> {
#[logging_timer::time("Proximity::{}")]
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden};
loop {
debug!("Proximity at iteration {} (max {:?}) ({:?})",
debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity,
self.query_tree.as_ref().map(|(mp, _)| mp),
self.candidates,
self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)),
self.state.as_ref().map(|(_, cd)| cd),
);
match (&mut self.query_tree, &mut self.candidates) {
(_, Allowed(candidates)) if candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take().map(|(_, qt)| qt),
candidates: Some(take(&mut self.candidates).into_inner()),
bucket_candidates: take(&mut self.bucket_candidates),
}));
match &mut self.state {
Some((_, candidates)) if candidates.is_empty() => {
self.state = None; // reset state
},
(Some((max_prox, query_tree)), Allowed(candidates)) => {
Some((Some((max_prox, query_tree)), candidates)) => {
if self.proximity as usize > *max_prox {
// reset state to (None, Forbidden(_))
self.query_tree = None;
self.candidates = Candidates::default();
self.state = None; // reset state
} else {
let mut new_candidates = if candidates.len() <= 1000 {
if let Some(cache) = self.plane_sweep_cache.as_mut() {
@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> {
candidates
},
None => {
// reset state to (None, Forbidden(_))
self.query_tree = None;
self.candidates = Candidates::default();
self.state = None; // reset state
continue
},
}
@ -120,83 +92,58 @@ impl<'t> Criterion for Proximity<'t> {
candidates.difference_with(&new_candidates);
self.proximity += 1;
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => new_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
bucket_candidates,
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
(Some((max_prox, query_tree)), Forbidden(candidates)) => {
if self.proximity as usize > *max_prox {
self.query_tree = None;
self.candidates = Candidates::default();
} else {
let mut new_candidates = resolve_candidates(
self.ctx,
&query_tree,
self.proximity,
&mut self.candidates_cache,
wdcache,
)?;
new_candidates.difference_with(&candidates);
candidates.union_with(&new_candidates);
self.proximity += 1;
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => new_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
bucket_candidates,
}));
}
},
(None, Allowed(_)) => {
let candidates = take(&mut self.candidates).into_inner();
Some((None, candidates)) => {
let candidates = take(candidates);
self.state = None; // reset state
return Ok(Some(CriterionResult {
query_tree: None,
candidates: Some(candidates.clone()),
bucket_candidates: candidates,
}));
},
(None, Forbidden(_)) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next(wdcache)? {
None => {
match self.parent.next(wdcache)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
let candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?,
(None, None) => RoaringBitmap::new(),
};
if bucket_candidates.is_empty() {
self.bucket_candidates.union_with(&candidates);
} else {
// If our parent returns candidates it means that the bucket
// candidates were already computed before and we can use them.
//
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
}
self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
let query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
self.state = Some((query_tree, candidates));
self.proximity = 0;
self.candidates = Candidates::Allowed(candidates);
self.plane_sweep_cache = None;
},
None => return Ok(None),
}
},
None => return Ok(None),
}
},
}
}
}

View File

@ -9,41 +9,24 @@ use crate::search::{word_derivations, WordDerivationsCache};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
pub struct Typo<'t> {
ctx: &'t dyn Context,
ctx: &'t dyn Context<'t>,
query_tree: Option<(usize, Operation)>,
number_typos: u8,
candidates: Candidates,
bucket_candidates: RoaringBitmap,
parent: Option<Box<dyn Criterion + 't>>,
parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
impl<'t> Typo<'t> {
pub fn initial(
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
) -> Self
{
Typo {
ctx,
query_tree: query_tree.map(|op| (maximum_typo(&op), op)),
number_typos: 0,
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
bucket_candidates: RoaringBitmap::new(),
parent: None,
candidates_cache: HashMap::new(),
}
}
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Typo {
ctx,
query_tree: None,
number_typos: 0,
candidates: Candidates::default(),
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
parent,
candidates_cache: HashMap::new(),
}
}
@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> {
candidates.difference_with(&new_candidates);
self.number_typos += 1;
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => new_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(new_candidates),
bucket_candidates,
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
@ -145,9 +123,14 @@ impl<'t> Criterion for Typo<'t> {
}));
},
(None, Forbidden(_)) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next(wdcache)? {
match self.parent.next(wdcache)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_tree = query_tree.map(|op| (maximum_typo(&op), op));
self.number_typos = 0;
@ -157,9 +140,6 @@ impl<'t> Criterion for Typo<'t> {
None => return Ok(None),
}
},
None => return Ok(None),
}
},
}
}
}
@ -334,8 +314,8 @@ fn resolve_candidates<'t>(
#[cfg(test)]
mod test {
use super::*;
use super::super::initial::Initial;
use super::super::test::TestContext;
#[test]
@ -345,8 +325,10 @@ mod test {
let facet_candidates = None;
let mut wdcache = WordDerivationsCache::new();
let mut criteria = Typo::initial(&context, query_tree, facet_candidates);
let parent = Initial::new(query_tree, facet_candidates);
let mut criteria = Typo::new(&context, Box::new(parent));
assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none());
assert!(criteria.next(&mut wdcache).unwrap().is_none());
}
@ -364,7 +346,8 @@ mod test {
let facet_candidates = None;
let mut wdcache = WordDerivationsCache::new();
let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates);
let parent = Initial::new(Some(query_tree), facet_candidates);
let mut criteria = Typo::new(&context, Box::new(parent));
let candidates_1 = context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
@ -413,7 +396,8 @@ mod test {
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut wdcache = WordDerivationsCache::new();
let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone()));
let parent = Initial::new(query_tree, Some(facet_candidates.clone()));
let mut criteria = Typo::new(&context, Box::new(parent));
let expected = CriterionResult {
query_tree: None,
@ -442,7 +426,8 @@ mod test {
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut wdcache = WordDerivationsCache::new();
let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone()));
let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone()));
let mut criteria = Typo::new(&context, Box::new(parent));
let candidates_1 = context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap()
@ -456,7 +441,7 @@ mod test {
]),
])),
candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: candidates_1 & &facet_candidates,
bucket_candidates: facet_candidates.clone(),
};
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1));
@ -478,7 +463,7 @@ mod test {
]),
])),
candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: candidates_2 & &facet_candidates,
bucket_candidates: RoaringBitmap::new(),
};
assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2));

View File

@ -8,38 +8,22 @@ use crate::search::query_tree::Operation;
use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache};
pub struct Words<'t> {
ctx: &'t dyn Context,
ctx: &'t dyn Context<'t>,
query_trees: Vec<Operation>,
candidates: Option<RoaringBitmap>,
bucket_candidates: RoaringBitmap,
parent: Option<Box<dyn Criterion + 't>>,
parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
impl<'t> Words<'t> {
pub fn initial(
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
) -> Self
{
Words {
ctx,
query_trees: query_tree.map(explode_query_tree).unwrap_or_default(),
candidates,
bucket_candidates: RoaringBitmap::new(),
parent: None,
candidates_cache: HashMap::default(),
}
}
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Words {
ctx,
query_trees: Vec::default(),
candidates: None,
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
parent,
candidates_cache: HashMap::default(),
}
}
@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> {
found_candidates.intersect_with(&candidates);
candidates.difference_with(&found_candidates);
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => found_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: Some(found_candidates),
bucket_candidates,
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(Some(qt), None) => {
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => RoaringBitmap::new(),
};
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: None,
bucket_candidates,
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, Some(_)) => {
@ -97,9 +71,14 @@ impl<'t> Criterion for Words<'t> {
}));
},
(None, None) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next(wdcache)? {
match self.parent.next(wdcache)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
self.candidates = candidates;
@ -108,9 +87,6 @@ impl<'t> Criterion for Words<'t> {
None => return Ok(None),
}
},
None => return Ok(None),
}
},
}
}
}

View File

@ -13,9 +13,8 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
use crate::search::criteria::fetcher::{Fetcher, FetcherResult};
use crate::{DocumentId, Index};
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId};
pub use self::facet::{
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
@ -162,14 +161,14 @@ impl<'a> Search<'a> {
&self,
mut distinct: impl for<'c> Distinct<'c>,
matching_words: MatchingWords,
mut criteria: Fetcher,
mut criteria: Final,
) -> anyhow::Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new();
let mut excluded_documents = RoaringBitmap::new();
let mut documents_ids = Vec::with_capacity(self.limit);
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? {
debug!("Number of candidates found {}", candidates.len());
let excluded = take(&mut excluded_documents);

51
milli/src/tree_level.rs Normal file
View File

@ -0,0 +1,51 @@
use std::convert::TryFrom;
use std::fmt;
/// This is just before the lowest printable character (space, sp, 32)
const MAX_VALUE: u8 = 31;
#[derive(Debug, Copy, Clone)]
pub enum Error {
LevelTooHigh(u8),
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct TreeLevel(u8);
impl TreeLevel {
pub const fn max_value() -> TreeLevel {
TreeLevel(MAX_VALUE)
}
pub const fn min_value() -> TreeLevel {
TreeLevel(0)
}
pub fn saturating_sub(&self, lhs: u8) -> TreeLevel {
TreeLevel(self.0.saturating_sub(lhs))
}
}
impl Into<u8> for TreeLevel {
fn into(self) -> u8 {
self.0
}
}
impl TryFrom<u8> for TreeLevel {
type Error = Error;
fn try_from(value: u8) -> Result<TreeLevel, Error> {
match value {
0..=MAX_VALUE => Ok(TreeLevel(value)),
_ => Err(Error::LevelTooHigh(value)),
}
}
}
impl fmt::Display for TreeLevel {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}

View File

@ -28,6 +28,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
@ -55,6 +57,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
docid_word_positions.clear(self.wtxn)?;
word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
word_level_position_docids.clear(self.wtxn)?;
word_prefix_level_position_docids.clear(self.wtxn)?;
facet_field_id_value_docids.clear(self.wtxn)?;
field_id_docid_facet_values.clear(self.wtxn)?;
documents.clear(self.wtxn)?;

View File

@ -88,6 +88,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
@ -329,6 +331,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
// We delete the documents ids that are under the word prefix level position docids.
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
Ok(self.documents_ids.len())
}
}

View File

@ -52,6 +52,14 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -
cbo_roaring_bitmap_merge(values)
}
pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}

View File

@ -2,7 +2,8 @@ use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::num::NonZeroUsize;
use std::num::{NonZeroU32, NonZeroUsize};
use std::str;
use std::sync::mpsc::sync_channel;
use std::time::Instant;
@ -13,17 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy
use heed::types::ByteSlice;
use log::{debug, info, error};
use memmap::Mmap;
use rayon::ThreadPool;
use rayon::prelude::*;
use rayon::ThreadPool;
use serde::{Serialize, Deserialize};
use crate::index::Index;
use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
docid_word_positions_merge, documents_merge, facet_field_value_docids_merge,
field_id_docid_facet_values_merge,
docid_word_positions_merge, documents_merge,
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
};
pub use self::transform::{Transform, TransformOutput};
@ -262,6 +267,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
facet_min_level_size: Option<NonZeroUsize>,
words_prefix_threshold: Option<f64>,
max_prefix_length: Option<usize>,
words_positions_level_group_size: Option<NonZeroU32>,
words_positions_min_level_size: Option<NonZeroU32>,
update_method: IndexDocumentsMethod,
update_format: UpdateFormat,
autogenerate_docids: bool,
@ -289,6 +296,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
facet_min_level_size: None,
words_prefix_threshold: None,
max_prefix_length: None,
words_positions_level_group_size: None,
words_positions_min_level_size: None,
update_method: IndexDocumentsMethod::ReplaceDocuments,
update_format: UpdateFormat::Json,
autogenerate_docids: true,
@ -402,6 +411,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
enum DatabaseType {
Main,
WordDocids,
WordLevel0PositionDocids,
FacetLevel0ValuesDocids,
}
@ -467,6 +477,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut word_docids_readers = Vec::with_capacity(readers.len());
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len());
let mut documents_readers = Vec::with_capacity(readers.len());
@ -476,6 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
word_docids,
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
facet_field_value_docids,
field_id_docid_facet_values,
documents
@ -484,6 +496,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
word_docids_readers.push(word_docids);
docid_word_positions_readers.push(docid_word_positions);
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
word_level_position_docids_readers.push(word_level_position_docids);
facet_field_value_docids_readers.push(facet_field_value_docids);
field_id_docid_facet_values_readers.push(field_id_docid_facet_values);
documents_readers.push(documents);
@ -514,6 +527,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
facet_field_value_docids_readers,
facet_field_value_docids_merge,
),
(
DatabaseType::WordLevel0PositionDocids,
word_level_position_docids_readers,
word_level_position_docids_merge,
),
]
.into_par_iter()
.for_each(|(dbtype, readers, merge)| {
@ -569,7 +587,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
let mut database_count = 0;
let total_databases = 7;
let total_databases = 8;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: 0,
@ -661,7 +679,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
)?;
},
DatabaseType::FacetLevel0ValuesDocids => {
debug!("Writing the facet values docids into LMDB on disk...");
debug!("Writing the facet level 0 values docids into LMDB on disk...");
let db = *self.index.facet_field_id_value_docids.as_polymorph();
write_into_lmdb_database(
self.wtxn,
@ -671,6 +689,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method,
)?;
},
DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph();
write_into_lmdb_database(
self.wtxn,
db,
content,
word_level_position_docids_merge,
write_method,
)?;
}
}
database_count += 1;
@ -694,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
builder.execute()?;
// Run the words prefixes update operation.
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
if let Some(value) = self.words_prefix_threshold {
builder.threshold(value);
}
@ -706,6 +732,37 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
}
builder.execute()?;
// Run the word prefix docids update operation.
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder.max_nb_chunks = self.max_nb_chunks;
builder.max_memory = self.max_memory;
builder.execute()?;
// Run the word prefix pair proximity docids update operation.
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder.max_nb_chunks = self.max_nb_chunks;
builder.max_memory = self.max_memory;
builder.execute()?;
// Run the words level positions update operation.
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
if let Some(value) = self.words_positions_level_group_size {
builder.level_group_size(value);
}
if let Some(value) = self.words_positions_min_level_size {
builder.min_level_size(value);
}
builder.execute()?;
debug_assert_eq!(database_count, total_databases);
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());

View File

@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
word_level_position_docids_merge, facet_field_value_docids_merge,
field_id_docid_facet_values_merge,
};
const LMDB_MAX_KEY_LENGTH: usize = 511;
@ -43,6 +44,7 @@ pub struct Readers {
pub word_docids: Reader<FileFuse>,
pub docid_word_positions: Reader<FileFuse>,
pub words_pairs_proximities_docids: Reader<FileFuse>,
pub word_level_position_docids: Reader<FileFuse>,
pub facet_field_value_docids: Reader<FileFuse>,
pub field_id_docid_facet_values: Reader<FileFuse>,
pub documents: Reader<FileFuse>,
@ -69,6 +71,7 @@ pub struct Store<'s, A> {
main_sorter: Sorter<MergeFn>,
word_docids_sorter: Sorter<MergeFn>,
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
word_level_position_docids_sorter: Sorter<MergeFn>,
facet_field_value_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_values_sorter: Sorter<MergeFn>,
// MTBL writers
@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
) -> anyhow::Result<Self>
{
// We divide the max memory by the number of sorter the Store have.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4));
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
let main_sorter = create_sorter(
@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks,
max_memory,
);
let word_level_position_docids_sorter = create_sorter(
word_level_position_docids_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
max_memory,
);
let facet_field_value_docids_sorter = create_sorter(
facet_field_value_docids_merge,
chunk_compression_type,
@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
main_sorter,
word_docids_sorter,
words_pairs_proximities_docids_sorter,
word_level_position_docids_sorter,
facet_field_value_docids_sorter,
field_id_docid_facet_values_sorter,
// MTBL writers
@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
words_positions.clear();
@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_word_position_docids(
writer: &mut Sorter<MergeFn>,
document_id: DocumentId,
words_positions: &HashMap<String, SmallVec32<Position>>,
) -> anyhow::Result<()>
{
let mut key_buffer = Vec::new();
let mut data_buffer = Vec::new();
for (word, positions) in words_positions {
key_buffer.clear();
key_buffer.extend_from_slice(word.as_bytes());
key_buffer.push(0); // level 0
for position in positions {
key_buffer.truncate(word.len() + 1);
let position_bytes = position.to_be_bytes();
key_buffer.extend_from_slice(position_bytes.as_bytes());
key_buffer.extend_from_slice(position_bytes.as_bytes());
data_buffer.clear();
let positions = RoaringBitmap::from_iter(Some(document_id));
// We serialize the positions into a buffer.
CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer)
.with_context(|| "could not serialize positions")?;
// that we write under the generated key into MTBL
if lmdb_key_valid_size(&key_buffer) {
writer.insert(&key_buffer, &data_buffer)?;
}
}
}
Ok(())
}
fn write_facet_field_value_docids<I>(
sorter: &mut Sorter<MergeFn>,
iter: I,
@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?;
@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?;
let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids,
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
facet_field_value_docids,
field_id_docid_facet_values,
documents,

View File

@ -6,7 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;
pub use self::words_prefixes::WordsPrefixes;
pub use self::word_prefix_docids::WordPrefixDocids;
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
pub use self::words_level_positions::WordsLevelPositions;
pub use self::words_prefixes_fst::WordsPrefixesFst;
mod available_documents_ids;
mod clear_documents;
@ -16,5 +19,7 @@ mod index_documents;
mod settings;
mod update_builder;
mod update_step;
mod words_prefixes;
mod word_prefix_docids;
mod word_prefix_pair_proximity_docids;
mod words_level_positions;
mod words_prefixes_fst;

View File

@ -2,7 +2,7 @@ use grenad::CompressionType;
use rayon::ThreadPool;
use crate::Index;
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>,
@ -135,19 +135,4 @@ impl<'a> UpdateBuilder<'a> {
builder
}
pub fn words_prefixes<'t, 'u, 'i>(
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsPrefixes<'t, 'u, 'i>
{
let mut builder = WordsPrefixes::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder
}
}

View File

@ -0,0 +1,75 @@
use std::str;
use crate::Index;
use fst::Streamer;
use grenad::CompressionType;
use heed::types::ByteSlice;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
}
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
}
}
pub fn execute(self) -> anyhow::Result<()> {
// Clear the word prefix docids database.
self.index.word_prefix_docids.clear(self.wtxn)?;
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
// It is forbidden to keep a mutable reference into the database
// and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter(
word_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We iterate over all the prefixes and retrieve the corresponding docids.
let mut prefix_stream = prefix_fst.stream();
while let Some(bytes) = prefix_stream.next() {
let prefix = str::from_utf8(bytes)?;
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?;
prefix_docids_sorter.insert(prefix, data)?;
}
}
drop(prefix_fst);
// We finally write the word prefix docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
word_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}

View File

@ -0,0 +1,89 @@
use std::str;
use fst::automaton::{Automaton, Str};
use fst::{Streamer, IntoStreamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use log::debug;
use crate::Index;
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database,
words_pairs_proximities_docids_merge,
};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
}
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
{
WordPrefixPairProximityDocids {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
}
}
pub fn execute(self) -> anyhow::Result<()> {
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
// Here we create a sorter akin to the previous one.
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert all the word pairs corresponding to the word-prefix pairs
// where the prefixes appears in the prefix FST previously constructed.
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word1, word2, prox), data) = result?;
let automaton = Str::new(word2).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let pair = (word1, prefix, prox);
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
}
}
drop(prefix_fst);
// We finally write the word prefix pair proximity docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
word_prefix_pair_proximity_docids_sorter,
words_pairs_proximities_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}

View File

@ -0,0 +1,261 @@
use std::{cmp, str};
use std::convert::TryFrom;
use std::fs::File;
use std::num::NonZeroU32;
use fst::automaton::{self, Automaton};
use fst::{Streamer, IntoStreamer};
use grenad::{CompressionType, Reader, Writer, FileFuse};
use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
};
use crate::{Index, TreeLevel};
pub struct WordsLevelPositions<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
level_group_size: NonZeroU32,
min_level_size: NonZeroU32,
}
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
WordsLevelPositions {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
level_group_size: NonZeroU32::new(4).unwrap(),
min_level_size: NonZeroU32::new(5).unwrap(),
}
}
pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
self
}
pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
self.min_level_size = value;
self
}
pub fn execute(self) -> anyhow::Result<()> {
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
let entries = compute_positions_levels(
self.wtxn,
self.index.word_docids.remap_data_type::<DecodeIgnore>(),
self.index.word_level_position_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
)?;
// The previously computed entries also defines the level 0 entries
// so we can clear the database and append all of these entries.
self.index.word_level_position_docids.clear(self.wtxn)?;
write_into_lmdb_database(
self.wtxn,
*self.index.word_level_position_docids.as_polymorph(),
entries,
|_, _| anyhow::bail!("invalid word level position merging"),
WriteMethod::Append,
)?;
// We compute the word prefix level positions database.
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
let mut word_prefix_level_positions_docids_sorter = create_sorter(
word_prefix_level_positions_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert the word prefix level positions where the level is equal to 0 and
// corresponds to the word-prefix level positions where the prefixes appears
// in the prefix FST previously constructed.
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word, level, left, right), data) = result?;
if level == TreeLevel::min_value() {
let automaton = automaton::Str::new(word).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let key = (prefix, level, left, right);
let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
word_prefix_level_positions_docids_sorter.insert(bytes, data)?;
}
}
}
// We finally write all the word prefix level positions docids with
// a level equal to 0 into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
word_prefix_level_positions_docids_sorter,
word_prefix_level_positions_docids_merge,
WriteMethod::Append,
)?;
let entries = compute_positions_levels(
self.wtxn,
self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(),
self.index.word_prefix_level_position_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
)?;
// The previously computed entries also defines the level 0 entries
// so we can clear the database and append all of these entries.
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
write_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
entries,
|_, _| anyhow::bail!("invalid word prefix level position merging"),
WriteMethod::Append,
)?;
Ok(())
}
}
/// Returns the next number after or equal to `x` that is divisible by `d`.
fn next_divisible(x: u32, d: u32) -> u32 {
(x.saturating_sub(1) | (d - 1)) + 1
}
/// Returns the previous number after or equal to `x` that is divisible by `d`,
/// saturates on zero.
fn previous_divisible(x: u32, d: u32) -> u32 {
match x.checked_sub(d - 1) {
Some(0) | None => 0,
Some(x) => next_divisible(x, d),
}
}
/// Generates all the words positions levels based on the levels zero (including the level zero).
fn compute_positions_levels(
rtxn: &heed::RoTxn,
words_db: heed::Database<Str, DecodeIgnore>,
words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
compression_type: CompressionType,
compression_level: Option<u32>,
shrink_size: Option<u64>,
level_group_size: NonZeroU32,
min_level_size: NonZeroU32,
) -> anyhow::Result<Reader<FileFuse>>
{
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transfering them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
for result in words_db.iter(rtxn)? {
let (word, ()) = result?;
let level_0_range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value());
left..=right
};
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
// Groups sizes are always a power of the original level_group_size and therefore a group
// always maps groups of the previous level and never splits previous levels groups in half.
let group_size_iter = (1u8..)
.map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32)))
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
// As specified in the documentation, we also write the level 0 entries.
for result in words_positions_db.range(rtxn, &level_0_range)? {
let ((word, level, left, right), docids) = result?;
write_level_entry(&mut writer, word, level, left, right, &docids)?;
}
for (level, group_size) in group_size_iter {
let mut left = 0;
let mut right = 0;
let mut group_docids = RoaringBitmap::new();
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
let ((_word, _level, value, _right), docids) = result?;
if i == 0 {
left = previous_divisible(value, group_size);
right = left + (group_size - 1);
}
if value > right {
// we found the first bound of the next group, we must store the left
// and right bounds associated with the docids.
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
// We save the left bound for the new group and also reset the docids.
group_docids = RoaringBitmap::new();
left = previous_divisible(value, group_size);
right = left + (group_size - 1);
}
// The right bound is always the bound we run through.
group_docids.union_with(&docids);
}
if !group_docids.is_empty() {
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
}
}
}
writer_into_reader(writer, shrink_size)
}
fn write_level_entry(
writer: &mut Writer<File>,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
ids: &RoaringBitmap,
) -> anyhow::Result<()>
{
let key = (word, level, left, right);
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
writer.insert(&key, &data)?;
Ok(())
}

View File

@ -1,196 +0,0 @@
use std::iter::FromIterator;
use std::str;
use chrono::Utc;
use fst::automaton::Str;
use fst::{Automaton, Streamer, IntoStreamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
use crate::{Index, SmallString32};
pub struct WordsPrefixes<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
threshold: f64,
max_prefix_length: usize,
_update_id: u64,
}
impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixes<'t, 'u, 'i>
{
WordsPrefixes {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
threshold: 0.1 / 100.0, // .01%
max_prefix_length: 4,
_update_id: update_id,
}
}
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
/// database. If a word prefix is supposed to match more than this number of words in the
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
///
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
/// to these bounds otherwise.
pub fn threshold(&mut self, value: f64) -> &mut Self {
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
self
}
/// Set the maximum length of prefixes in bytes.
///
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
/// to these bounds, otherwise.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
self
}
pub fn execute(self) -> anyhow::Result<()> {
self.index.set_updated_at(self.wtxn, &Utc::now())?;
// Clear the words prefixes datastructures.
self.index.word_prefix_docids.clear(self.wtxn)?;
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
let words_fst = self.index.words_fst(&self.wtxn)?;
let number_of_words = words_fst.len();
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
// It is forbidden to keep a mutable reference into the database
// and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter(
word_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();
let mut stream = words_fst.stream();
while let Some(bytes) = stream.next() {
// We try to get the first n bytes out of this string but we only want
// to split at valid characters bounds. If we try to split in the middle of
// a character we ignore this word and go to the next one.
let word = str::from_utf8(bytes)?;
let prefix = match word.get(..n) {
Some(prefix) => prefix,
None => continue,
};
// This is the first iteration of the loop,
// or the current word doesn't starts with the current prefix.
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
current_prefix = SmallString32::from(prefix);
current_prefix_count = 0;
}
current_prefix_count += 1;
// There is enough words corresponding to this prefix to add it to the cache.
if current_prefix_count == min_number_of_words {
builder.insert(prefix)?;
}
}
// We construct the final set for prefixes of size n.
prefix_fsts.push(builder.into_set());
}
// We merge all of the previously computed prefixes into on final set.
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op.r#union())?;
let prefix_fst = builder.into_set();
// We iterate over all the prefixes and retrieve the corresponding docids.
let mut prefix_stream = prefix_fst.stream();
while let Some(bytes) = prefix_stream.next() {
let prefix = str::from_utf8(bytes)?;
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?;
prefix_docids_sorter.insert(prefix, data)?;
}
}
// Set the words prefixes FST in the dtabase.
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
// We finally write the word prefix docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
word_docids_merge,
WriteMethod::Append,
)?;
// We compute the word prefix pair proximity database.
// Here we create a sorter akin to the previous one.
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert all the word pairs corresponding to the word-prefix pairs
// where the prefixes appears in the prefix FST previously constructed.
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word1, word2, prox), data) = result?;
let automaton = Str::new(word2).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let pair = (word1, prefix, prox);
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
}
}
// We finally write the word prefix pair proximity docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
word_prefix_pair_proximity_docids_sorter,
words_pairs_proximities_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}

View File

@ -0,0 +1,104 @@
use std::iter::FromIterator;
use std::str;
use fst::Streamer;
use crate::{Index, SmallString32};
pub struct WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
threshold: f64,
max_prefix_length: usize,
_update_id: u64,
}
impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixesFst<'t, 'u, 'i>
{
WordsPrefixesFst {
wtxn,
index,
threshold: 0.1 / 100.0, // .01%
max_prefix_length: 4,
_update_id: update_id,
}
}
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
/// database. If a word prefix is supposed to match more than this number of words in the
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
///
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
/// to these bounds otherwise.
pub fn threshold(&mut self, value: f64) -> &mut Self {
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
self
}
/// Set the maximum length of prefixes in bytes.
///
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
/// to these bounds, otherwise.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
self
}
pub fn execute(self) -> anyhow::Result<()> {
let words_fst = self.index.words_fst(&self.wtxn)?;
let number_of_words = words_fst.len();
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();
let mut stream = words_fst.stream();
while let Some(bytes) = stream.next() {
// We try to get the first n bytes out of this string but we only want
// to split at valid characters bounds. If we try to split in the middle of
// a character we ignore this word and go to the next one.
let word = str::from_utf8(bytes)?;
let prefix = match word.get(..n) {
Some(prefix) => prefix,
None => continue,
};
// This is the first iteration of the loop,
// or the current word doesn't starts with the current prefix.
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
current_prefix = SmallString32::from(prefix);
current_prefix_count = 0;
}
current_prefix_count += 1;
// There is enough words corresponding to this prefix to add it to the cache.
if current_prefix_count == min_number_of_words {
builder.insert(prefix)?;
}
}
// We construct the final set for prefixes of size n.
prefix_fsts.push(builder.into_set());
}
// We merge all of the previously computed prefixes into on final set.
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op.r#union())?;
let prefix_fst = builder.into_set();
// Set the words prefixes FST in the dtabase.
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
Ok(())
}
}