Merge branch 'main' into settings-customizing-tokenization

This commit is contained in:
ManyTheFish 2023-08-08 16:08:16 +02:00
commit 4a21fecf67
166 changed files with 2252 additions and 1072 deletions

View file

@ -36,6 +36,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids,
facet_id_f64_docids,
facet_id_string_docids,
facet_id_normalized_string_strings,
facet_id_string_fst,
facet_id_exists_docids,
facet_id_is_null_docids,
@ -94,6 +95,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_fid_docids.clear(self.wtxn)?;
script_language_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?;
facet_id_normalized_string_strings.clear(self.wtxn)?;
facet_id_string_fst.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?;

View file

@ -4,10 +4,9 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
use heed::{BytesDecode, BytesEncode, Database, RwIter};
use hnsw::Searcher;
use instant_distance::PointId;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use space::KnnPoints;
use time::OffsetDateTime;
use super::facet::delete::FacetsDelete;
@ -239,6 +238,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_fid_docids,
facet_id_f64_docids: _,
facet_id_string_docids: _,
facet_id_normalized_string_strings: _,
facet_id_string_fst: _,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
@ -438,24 +438,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// An ugly and slow way to remove the vectors from the HNSW
// It basically reconstructs the HNSW from scratch without editing the current one.
let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
if !current_hnsw.is_empty() {
let mut new_hnsw = Hnsw::default();
let mut searcher = Searcher::new();
let mut new_vector_id_docids = Vec::new();
if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? {
let mut points = Vec::new();
let mut docids = Vec::new();
for result in vector_id_docid.iter(self.wtxn)? {
let (vector_id, docid) = result?;
if !self.to_delete_docids.contains(docid.get()) {
let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
let vector_id = new_hnsw.insert(vector, &mut searcher);
new_vector_id_docids.push((vector_id as u32, docid));
let pid = PointId::from(vector_id.get());
let vector = current_hnsw[pid].clone();
points.push(vector);
docids.push(docid);
}
}
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
vector_id_docid.clear(self.wtxn)?;
for (vector_id, docid) in new_vector_id_docids {
vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
for (pid, docid) in pids.into_iter().zip(docids) {
vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?;
}
self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
}

View file

@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
pub const FACET_GROUP_SIZE: u8 = 4;
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::iter::FromIterator;
use heed::types::DecodeIgnore;
use charabia::normalizer::{Normalize, NormalizerOption};
use grenad::{CompressionType, SortAlgorithm};
use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
use heed::BytesEncode;
use log::debug;
use time::OffsetDateTime;
@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result, BEU16};
use crate::update::index_documents::create_sorter;
use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, BEU16};
pub mod bulk;
pub mod delete;
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
incremental_update.execute(wtxn)?;
}
// We clear the list of normalized-for-search facets
// and the previous FSTs to compute everything from scratch
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
self.index.facet_id_string_fst.clear(wtxn)?;
// As we can't use the same write transaction to read and write in two different databases
// we must create a temporary sorter that we will write into LMDB afterward.
// As multiple unnormalized facet values can become the same normalized facet value
// we must merge them together.
let mut sorter = create_sorter(
SortAlgorithm::Unstable,
merge_btreeset_string,
CompressionType::None,
None,
None,
None,
);
// We iterate on the list of original, semi-normalized, facet values
// and normalize them for search, inserting them into the sorter in any given order.
let options = NormalizerOption { lossy: true, ..Default::default() };
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, ()) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
let normalized_facet = left_bound.normalize(&options);
let set = BTreeSet::from_iter(std::iter::once(left_bound));
let key = (field_id, normalized_facet.as_ref());
let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
sorter.insert(key, val)?;
}
}
// In this loop we don't need to take care of merging the sets of original
// facet strings as the grenad sorter already merged them for us.
let mut merger_iter = sorter.into_stream_merger_iter()?;
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
self.index
.facet_id_normalized_string_strings
.remap_types::<ByteSlice, ByteSlice>()
.put(wtxn, key_bytes, btreeset_bytes)?;
}
// We compute one FST by string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
let database =
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, _) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(left_bound)?;
let ((field_id, normalized_facet), _) = result?;
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
}
}
@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
text_fsts.push((field_id, fst));
}
// We remove all of the previous FSTs that were in this database
self.index.facet_id_string_fst.clear(wtxn)?;
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;

View file

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::io;
use std::result::Result as StdResult;
@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
}
}
/// Merges values that are JSON-serialized `BTreeSet<String>`s into a single one.
///
/// When `values` holds exactly one entry it is returned as-is, without being
/// decoded. Otherwise every entry is deserialized, the sets are unioned, and
/// the union is serialized back to JSON.
///
/// The key is ignored.
///
/// # Panics
///
/// Panics if `values` is empty, if an entry cannot be deserialized as a JSON
/// set of strings, or if the merged set cannot be serialized.
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    // Fast path: nothing to merge, hand back the only value untouched.
    if let [single] = values {
        return Ok(single.clone());
    }

    // TODO improve the perf by using a `#[borrow] Cow<str>`.
    let mut decoded_sets = values
        .iter()
        .map(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes.as_ref()))
        .map(StdResult::unwrap);

    // The first set is used as the accumulator, the remaining ones are poured into it.
    let mut merged = decoded_sets.next().unwrap();
    for set in decoded_sets {
        merged.extend(set);
    }

    Ok(Cow::Owned(serde_json::to_vec(&merged).unwrap()))
}
/// Merge function that keeps the first element of `values` and ignores the others.
///
/// The key is ignored.
///
/// # Panics
///
/// Panics if `values` is empty.
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    let first = &values[0];
    Ok(first.clone())
}

View file

@ -13,9 +13,9 @@ pub use grenad_helpers::{
GrenadParameters, MergeableReader,
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
MergeFn,
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
serialize_roaring_bitmap, MergeFn,
};
use crate::MAX_WORD_LENGTH;

View file

@ -26,7 +26,7 @@ pub use self::enrich::{
};
pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};

View file

@ -9,22 +9,19 @@ use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap;
use space::KnnPoints;
use super::helpers::{
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::distance::NDotProductPoint;
use crate::error::UserError;
use crate::facet::FacetType;
use crate::index::Hnsw;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
use crate::{
lat_lng_to_xyz, normalize_vector, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result,
BEU32,
};
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -292,17 +289,20 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
Some(result) => {
let (vector_id, _) = result?;
Some(hnsw.get_point(vector_id.get() as usize).len())
}
None => None,
let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
None => Default::default(),
};
// Convert the PointIds into DocumentIds
let mut docids = Vec::new();
for pid in pids {
let docid =
index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
docids.push(docid.get());
}
let mut expected_dimensions = points.get(0).map(|p| p.len());
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
@ -318,12 +318,26 @@ pub(crate) fn write_typed_chunk_into_index(
return Err(UserError::InvalidVectorDimensions { expected, found })?;
}
let vector = normalize_vector(vector);
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
points.push(NDotProductPoint::new(vector));
docids.push(docid);
}
log::debug!("There are {} entries in the HNSW so far", hnsw.len());
index.put_vector_hnsw(wtxn, &hnsw)?;
assert_eq!(docids.len(), points.len());
let hnsw_length = points.len();
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
index.vector_id_docid.clear(wtxn)?;
for (docid, pid) in docids.into_iter().zip(pids) {
index.vector_id_docid.put(
wtxn,
&BEU32::new(pid.into_inner()),
&BEU32::new(docid),
)?;
}
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
index.put_vector_hnsw(wtxn, &new_hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();

View file

@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::{
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
MergeFn,
};
pub use self::indexer_config::IndexerConfig;
pub use self::prefix_word_pairs::{

View file

@ -466,13 +466,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let current = self.index.stop_words(self.wtxn)?;
// Apply an unlossy normalization on stop_words
let stop_words = stop_words
let stop_words: BTreeSet<String> = stop_words
.iter()
.map(|w| w.as_str().normalize(&Default::default()).into_owned());
.map(|w| w.as_str().normalize(&Default::default()).into_owned())
.collect();
// since we can't compare a BTreeSet with an FST we are going to convert the
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
let fst = fst::Set::from_iter(stop_words)?;
let fst = fst::Set::from_iter(stop_words.into_iter())?;
// Does the new FST differ from the previous one?
if current