// MeiliSearch/milli/src/update/index_documents/mod.rs

mod enrich;
mod extract;
mod helpers;
mod transform;
mod typed_chunk;

use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek};
use std::iter::FromIterator;
use std::num::NonZeroU32;
use std::result::Result as StdResult;

use crossbeam_channel::{Receiver, Sender};
use grenad::{Merger, MergerBuilder};
use heed::types::Str;
use heed::Database;
use rand::SeedableRng;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use slice_group_by::GroupBy;
use tracing::debug;
use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};

use self::enrich::enrich_documents_batch;
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
    valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::vector::EmbeddingConfigs;
use crate::{CboRoaringBitmapCodec, Index, Result};

/// Number of databases counted as already indexed once the merge step is over; the prefix
/// step starts its progress count from this value.
const MERGED_DATABASE_COUNT: usize = 7;
/// Number of progress steps reported while the prefix databases are being updated.
const PREFIX_DATABASE_COUNT: usize = 4;
/// Total reported as `total_databases` in `MergeDataIntoFinalDatabase` progress updates.
const TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;

/// Summary returned by `IndexDocuments::execute` once the update has been applied.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentAdditionResult {
    /// The number of documents that were indexed during the update
    /// (`0` when nothing was added nor deleted).
    pub indexed_documents: u64,
    /// The total number of documents in the index after the update
    pub number_of_documents: u64,
}

/// Selects how the new version of a document is combined with its previous version.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a wildcard arm when
/// matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum IndexDocumentsMethod {
    /// Replace the previous document with the new one,
    /// removing all the already known attributes.
    ReplaceDocuments,

    /// Merge the previous version of the document with the new version,
    /// replacing old attributes values with the new ones and add the new attributes.
    UpdateDocuments,
}

impl Default for IndexDocumentsMethod {
    fn default() -> Self {
        Self::ReplaceDocuments
    }
}

/// Builder that accumulates document additions and deletions, then indexes them on `execute`.
pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
    /// Write transaction used for every read and write performed by the update.
    wtxn: &'t mut heed::RwTxn<'i>,
    /// Index being updated.
    index: &'i Index,
    /// Options specific to this document update.
    config: IndexDocumentsConfig,
    /// Indexer settings (thread pool, chunk compression, memory and chunk limits, ...).
    indexer_config: &'a IndexerConfig,
    /// Accumulates the incoming documents; taken (left to `None`) by `execute`.
    transform: Option<Transform<'a, 'i>>,
    /// Callback invoked with each `UpdateIndexingStep` to report progress.
    progress: FP,
    /// Callback polled during indexing; when it returns `true` the update stops with
    /// `InternalError::AbortedIndexation`.
    should_abort: FA,
    /// Number of documents read through `add_documents` so far.
    added_documents: u64,
    /// Number of documents removed through the `remove_documents*` methods so far.
    deleted_documents: u64,
    /// Embedder configurations handed to the extraction pipeline (see `with_embedders`).
    embedders: EmbeddingConfigs,
}

/// Options of a single document update; every field left to its default keeps the
/// default behavior of the component it is forwarded to.
#[derive(Default, Debug, Clone)]
pub struct IndexDocumentsConfig {
    /// Forwarded to `WordsPrefixesFst::threshold` when set.
    pub words_prefix_threshold: Option<u32>,
    /// Forwarded to `WordsPrefixesFst::max_prefix_length` when set.
    pub max_prefix_length: Option<usize>,
    // NOTE(review): not read by the code visible in this part of the file — confirm usage.
    pub words_positions_level_group_size: Option<NonZeroU32>,
    // NOTE(review): not read by the code visible in this part of the file — confirm usage.
    pub words_positions_min_level_size: Option<NonZeroU32>,
    /// Method handed to the `Transform` to combine new documents with existing ones.
    pub update_method: IndexDocumentsMethod,
    /// Flag handed to the `Transform` and to the enrichment step of `add_documents`.
    pub autogenerate_docids: bool,
}

impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
where
    FP: Fn(UpdateIndexingStep) + Sync + Send,
    FA: Fn() -> bool + Sync + Send,
{
    pub fn new(
        wtxn: &'t mut heed::RwTxn<'i>,
        index: &'i Index,
        indexer_config: &'a IndexerConfig,
        config: IndexDocumentsConfig,
        progress: FP,
        should_abort: FA,
    ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
        let transform = Some(Transform::new(
            wtxn,
            index,
            indexer_config,
            config.update_method,
            config.autogenerate_docids,
        )?);

        Ok(IndexDocuments {
            transform,
            config,
            indexer_config,
            progress,
            should_abort,
            wtxn,
            index,
            added_documents: 0,
            deleted_documents: 0,
            embedders: Default::default(),
        })
    }

    /// Feeds a batch of documents into the builder.
    ///
    /// The builder is consumed. Since the documents are written progressively, an internal
    /// error only returns the error: the builder is dropped because it would be invalid to keep
    /// using it. A user error detected while enriching the batch leaves the builder valid, so
    /// it is handed back together with the error and more batches can still be sent.
    ///
    /// On success, returns the builder and the number of documents added by this batch.
    ///
    /// # Panics
    ///
    /// Panics if the internal transform is no longer available.
    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
    pub fn add_documents<R: Read + Seek>(
        mut self,
        reader: DocumentsBatchReader<R>,
    ) -> Result<(Self, StdResult<u64, UserError>)> {
        puffin::profile_function!();

        // An empty batch has nothing to contribute.
        if reader.is_empty() {
            return Ok((self, Ok(0)));
        }

        // Internal errors are propagated by `?` and throw the builder away, whereas user
        // errors are returned next to the still usable builder.
        let enrichment = enrich_documents_batch(
            self.wtxn,
            self.index,
            self.config.autogenerate_docids,
            reader,
        )?;
        let enriched_documents_reader = match enrichment {
            Err(user_error) => return Ok((self, Err(user_error))),
            Ok(enriched) => enriched,
        };

        let transform = self.transform.as_mut().expect("Invalid document addition state");
        let read_count = transform.read_documents(
            enriched_documents_reader,
            self.wtxn,
            &self.progress,
            &self.should_abort,
        )?;
        let indexed_documents = read_count as u64;

        self.added_documents += indexed_documents;

        Ok((self, Ok(indexed_documents)))
    }

    /// Replaces the embedder configurations of the builder and returns it for chaining.
    pub fn with_embedders(self, embedders: EmbeddingConfigs) -> Self {
        let mut builder = self;
        builder.embedders = embedders;
        builder
    }

    /// Removes a batch of documents, designated by `to_delete`, from the current builder.
    ///
    /// Returns the builder and the number of documents deleted from it. The inner result is
    /// always `Ok`.
    ///
    /// # Panics
    ///
    /// Panics if the internal transform is no longer available.
    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
    pub fn remove_documents(
        mut self,
        to_delete: Vec<String>,
    ) -> Result<(Self, StdResult<u64, UserError>)> {
        puffin::profile_function!();

        let deleted_documents = if to_delete.is_empty() {
            // Nothing to delete, the transform does not need to be involved.
            0
        } else {
            let transform = self.transform.as_mut().expect("Invalid document deletion state");
            transform.remove_documents(to_delete, self.wtxn, &self.should_abort)? as u64
        };

        self.deleted_documents += deleted_documents;

        // Maintains the invariant: the inner result of `remove_documents` is always `Ok`.
        Ok((self, Ok(deleted_documents)))
    }

    /// Removes documents from the database, designated by their internal document ids.
    ///
    /// Returns the builder and the number of documents deleted.
    ///
    /// # Warning
    ///
    /// This function is dangerous and will only work correctly if:
    ///
    /// - All the passed ids currently exist in the database
    /// - No batching using the standard `remove_documents` and `add_documents` took place
    ///
    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
    ///
    /// # Panics
    ///
    /// Panics if the internal transform is no longer available.
    #[tracing::instrument(level = "trace", skip_all, target = "indexing::details")]
    pub fn remove_documents_from_db_no_batch(
        mut self,
        to_delete: &RoaringBitmap,
    ) -> Result<(Self, u64)> {
        puffin::profile_function!();

        // Nothing to delete: hand the builder back untouched.
        if to_delete.is_empty() {
            return Ok((self, 0));
        }

        let transform = self.transform.as_mut().expect("Invalid document deletion state");
        let removed =
            transform.remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?;
        let deleted_documents = removed as u64;

        self.deleted_documents += deleted_documents;

        Ok((self, deleted_documents))
    }

    /// Indexes the documents added to, and removes the documents deleted from, the builder.
    ///
    /// When no document was added nor deleted, nothing is written and only the current number
    /// of documents of the index is reported. Otherwise the output of the transform is
    /// computed, the faceted fields and (when user-defined) the searchable fields are stored
    /// again, and the rest of the work is done by [`Self::execute_raw`].
    ///
    /// # Panics
    ///
    /// Panics if the internal transform is no longer available.
    #[tracing::instrument(
        level = "trace",
        skip_all,
        target = "indexing::documents",
        name = "index_documents"
    )]
    pub fn execute(mut self) -> Result<DocumentAdditionResult> {
        puffin::profile_function!();

        // Nothing was added nor deleted: skip the pipeline and report the current count.
        if self.added_documents == 0 && self.deleted_documents == 0 {
            let number_of_documents = self.index.number_of_documents(self.wtxn)?;
            return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
        }
        let output = self
            .transform
            .take()
            .expect("Invalid document addition state")
            .output_from_sorter(self.wtxn, &self.progress)?;

        let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
        self.index.put_faceted_fields(self.wtxn, &new_facets)?;

        // in case new fields were introduced we're going to recreate the searchable fields.
        if let Some(searchable_fields) = self.index.user_defined_searchable_fields(self.wtxn)? {
            // we can't keep references on the searchable fields while we update the index
            // thus we need to own them.
            let searchable_fields: Vec<String> =
                searchable_fields.into_iter().map(str::to_string).collect();
            self.index.put_all_searchable_fields_from_fields_ids_map(
                self.wtxn,
                &searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>(),
                &output.fields_ids_map,
            )?;
        }

        let indexed_documents = output.documents_count as u64;
        let number_of_documents = self.execute_raw(output)?;

        Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
    }

    /// Writes the transform `output` into the index.
    ///
    /// The extraction pipeline is spawned on the indexer thread pool (a default pool is built
    /// when the configuration provides none) and sends typed chunks through a channel. The
    /// receiving loop accumulates them and writes them into the index with the write
    /// transaction, reporting progress along the way. Afterwards the field distribution and
    /// the primary key are stored, the vector stores of the embedders seen during the update
    /// are built, and the prefix databases are updated by [`Self::execute_prefix_databases`].
    ///
    /// Returns the total number of documents in the index after the update.
    ///
    /// # Errors
    ///
    /// Returns `InternalError::AbortedIndexation` when `should_abort` returns `true`, and
    /// forwards any error coming from the extraction, the database writes or the prefix update.
    ///
    /// # Panics
    ///
    /// Panics if the primary key is absent from the fields ids map of `output`.
    #[tracing::instrument(
        level = "trace",
        skip_all,
        target = "indexing::details",
        name = "index_documents_raw"
    )]
    pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
    where
        FP: Fn(UpdateIndexingStep) + Sync,
        FA: Fn() -> bool + Sync,
    {
        puffin::profile_function!();

        let TransformOutput {
            primary_key,
            fields_ids_map,
            field_distribution,
            documents_count,
            original_documents,
            flattened_documents,
        } = output;

        // The fields_ids_map is put back to the store now so the rest of the transaction sees an
        // up to date field map.
        self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;

        let backup_pool;
        let pool = match self.indexer_config.thread_pool {
            Some(ref pool) => pool,
            #[cfg(not(test))]
            None => {
                // We initialize a backup pool with the default
                // settings if none have already been set.
                backup_pool = rayon::ThreadPoolBuilder::new().build()?;
                &backup_pool
            }
            #[cfg(test)]
            None => {
                // We initialize a single-threaded backup pool
                // if none have already been set.
                backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
                &backup_pool
            }
        };

        // create LMDB writer channel
        let (lmdb_writer_sx, lmdb_writer_rx): (
            Sender<Result<TypedChunk>>,
            Receiver<Result<TypedChunk>>,
        ) = crossbeam_channel::unbounded();

        // get the primary key field id
        let primary_key_id = fields_ids_map.id(&primary_key).unwrap();

        // get searchable fields for word databases
        let searchable_fields =
            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
        // get filterable fields for facet databases
        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
        // get the fid of the `_geo.lat` and `_geo.lng` fields.
        let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;

        // self.index.fields_ids_map($a)? ==>> field_id_map
        let geo_fields_ids = match field_id_map.id("_geo") {
            Some(gfid) => {
                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
                // if `_geo` is faceted then we get the `lat` and `lng`
                if is_sortable || is_filterable {
                    let field_ids = field_id_map
                        .insert("_geo.lat")
                        .zip(field_id_map.insert("_geo.lng"))
                        .ok_or(UserError::AttributeLimitReached)?;
                    Some(field_ids)
                } else {
                    None
                }
            }
            None => None,
        };

        let stop_words = self.index.stop_words(self.wtxn)?;
        let separators = self.index.allowed_separators(self.wtxn)?;
        let dictionary = self.index.dictionary(self.wtxn)?;
        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
        let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();

        let pool_params = GrenadParameters {
            chunk_compression_type: self.indexer_config.chunk_compression_type,
            chunk_compression_level: self.indexer_config.chunk_compression_level,
            max_memory: self.indexer_config.max_memory,
            max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
        };
        let documents_chunk_size = match self.indexer_config.documents_chunk_size {
            Some(chunk_size) => chunk_size,
            None => {
                let default_chunk_size = 1024 * 1024 * 4; // 4MiB
                let min_chunk_size = 1024 * 512; // 512KiB

                // compute the chunk size from the number of available threads and the input data size.
                let total_size = flattened_documents.metadata().map(|m| m.len());
                let current_num_threads = pool.current_num_threads();
                // if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
                let chunk_count = if current_num_threads > 2 {
                    (current_num_threads * 3 / 4).max(2)
                } else {
                    current_num_threads
                };
                total_size
                    .map_or(default_chunk_size, |size| (size as usize) / chunk_count)
                    .max(min_chunk_size)
            }
        };

        let original_documents = grenad::Reader::new(original_documents)?;
        let flattened_documents = grenad::Reader::new(flattened_documents)?;

        let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;

        let cloned_embedder = self.embedders.clone();

        let mut final_documents_ids = RoaringBitmap::new();
        let mut databases_seen = 0;
        let mut word_position_docids = None;
        let mut word_fid_docids = None;
        let mut word_docids = None;
        let mut exact_word_docids = None;
        let mut chunk_accumulator = ChunkAccumulator::default();
        let mut dimension = HashMap::new();
        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());

        let current_span = tracing::Span::current();

        // Run extraction pipeline in parallel.
        pool.install(|| {
            rayon::spawn(move || {
                let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
                let _enter = child_span.enter();
                puffin::profile_scope!("extract_and_send_grenad_chunks");
                // split obkv file into several chunks
                let original_chunk_iter =
                    grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);

                // split obkv file into several chunks
                let flattened_chunk_iter =
                    grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);

                let separators: Option<Vec<_>> =
                    separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
                let dictionary: Option<Vec<_>> =
                    dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
                let result = original_chunk_iter.and_then(|original_chunk| {
                    let flattened_chunk = flattened_chunk_iter?;
                    // extract all databases from the chunked obkv documents
                    extract::data_from_obkv_documents(
                        original_chunk,
                        flattened_chunk,
                        pool_params,
                        lmdb_writer_sx.clone(),
                        searchable_fields,
                        faceted_fields,
                        primary_key_id,
                        geo_fields_ids,
                        field_id_map,
                        stop_words,
                        separators.as_deref(),
                        dictionary.as_deref(),
                        max_positions_per_attributes,
                        exact_attributes,
                        proximity_precision,
                        cloned_embedder,
                    )
                });

                // Forward the extraction error to the receiving loop; a failed send is ignored.
                if let Err(e) = result {
                    let _ = lmdb_writer_sx.send(Err(e));
                }

                // needs to be dropped to avoid channel waiting lock.
                drop(lmdb_writer_sx);
            });

            (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
                databases_seen,
                total_databases: TOTAL_POSTING_DATABASE_COUNT,
            });

            loop {
                if (self.should_abort)() {
                    return Err(Error::InternalError(InternalError::AbortedIndexation));
                }

                // `recv_timeout` only needs a shared reference, the receiver is not cloned.
                match lmdb_writer_rx.recv_timeout(std::time::Duration::from_millis(500)) {
                    // Timeout or disconnection: use the time to write accumulated chunks.
                    Err(status) => {
                        if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
                            let (docids, is_merged_database) =
                                write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
                            if !docids.is_empty() {
                                final_documents_ids |= docids;
                                let documents_seen_count = final_documents_ids.len();
                                (self.progress)(UpdateIndexingStep::IndexDocuments {
                                    documents_seen: documents_seen_count as usize,
                                    total_documents: documents_count,
                                });
                                debug!(documents = documents_seen_count, total = documents_count, "Seen");
                            }
                            if is_merged_database {
                                databases_seen += 1;
                                (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
                                    databases_seen,
                                    total_databases: TOTAL_POSTING_DATABASE_COUNT,
                                });
                            }
                        // If no more chunks remain in the chunk accumulator and the channel is disconnected, break.
                        } else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
                            break;
                        } else {
                            rayon::yield_now();
                        }
                    }
                    Ok(result) => {
                        // The word related chunks are also pushed, as cloned cursors, into the
                        // mergers that are later consumed by the prefix databases update.
                        let typed_chunk = match result? {
                            TypedChunk::WordDocids {
                                word_docids_reader,
                                exact_word_docids_reader,
                                word_fid_docids_reader,
                            } => {
                                let cloneable_chunk =
                                    unsafe { as_cloneable_grenad(&word_docids_reader)? };
                                let word_docids = word_docids.get_or_insert_with(|| {
                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
                                });
                                word_docids.push(cloneable_chunk.into_cursor()?);
                                let cloneable_chunk =
                                    unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
                                let exact_word_docids =
                                    exact_word_docids.get_or_insert_with(|| {
                                        MergerBuilder::new(
                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
                                        )
                                    });
                                exact_word_docids.push(cloneable_chunk.into_cursor()?);
                                let cloneable_chunk =
                                    unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
                                let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
                                });
                                word_fid_docids.push(cloneable_chunk.into_cursor()?);
                                TypedChunk::WordDocids {
                                    word_docids_reader,
                                    exact_word_docids_reader,
                                    word_fid_docids_reader,
                                }
                            }
                            TypedChunk::WordPositionDocids(chunk) => {
                                let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
                                let word_position_docids =
                                    word_position_docids.get_or_insert_with(|| {
                                        MergerBuilder::new(
                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
                                        )
                                    });
                                word_position_docids.push(cloneable_chunk.into_cursor()?);
                                TypedChunk::WordPositionDocids(chunk)
                            }
                            TypedChunk::VectorPoints {
                                expected_dimension,
                                remove_vectors,
                                embeddings,
                                manual_vectors,
                                embedder_name,
                            } => {
                                // Remember the dimension of each embedder seen, the vector
                                // stores are built once all the chunks have been written.
                                dimension.insert(embedder_name.clone(), expected_dimension);
                                TypedChunk::VectorPoints {
                                    remove_vectors,
                                    embeddings,
                                    expected_dimension,
                                    manual_vectors,
                                    embedder_name,
                                }
                            }
                            otherwise => otherwise,
                        };

                        chunk_accumulator.insert(typed_chunk);
                    }
                }
            }

            Ok(())
        })?;

        // We write the field distribution into the main database
        self.index.put_field_distribution(self.wtxn, &field_distribution)?;

        // We write the primary key field id into the main database
        self.index.put_primary_key(self.wtxn, &primary_key)?;
        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
        // Fixed seed: the same generator state is used for every update.
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);

        for (embedder_name, dimension) in dimension {
            let wtxn = &mut *self.wtxn;
            let vector_arroy = self.index.vector_arroy;

            let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
                InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
            )?;

            pool.install(|| {
                // The embedder index fills the high byte of the writer index, `k` the low byte.
                let writer_index = (embedder_index as u16) << 8;
                for k in 0..=u8::MAX {
                    let writer =
                        arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?;
                    // Stop at the first empty writer of this embedder.
                    if writer.is_empty(wtxn)? {
                        break;
                    }
                    writer.build(wtxn, &mut rng, None)?;
                }
                Result::Ok(())
            })?;
        }

        self.execute_prefix_databases(
            word_docids.map(MergerBuilder::build),
            exact_word_docids.map(MergerBuilder::build),
            word_position_docids.map(MergerBuilder::build),
            word_fid_docids.map(MergerBuilder::build),
        )?;

        Ok(number_of_documents)
    }

    #[tracing::instrument(
        level = "trace",
        skip_all,
        target = "indexing::prefix",
        name = "index_documents_prefix_databases"
    )]
    pub fn execute_prefix_databases(
        self,
        word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
        exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
        word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
        word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
    ) -> Result<()>
    where
        FP: Fn(UpdateIndexingStep) + Sync,
        FA: Fn() -> bool + Sync,
    {
        puffin::profile_function!();

        // Merged databases are already been indexed, we start from this count;
        let mut databases_seen = MERGED_DATABASE_COUNT;

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        databases_seen += 1;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
            databases_seen,
            total_databases: TOTAL_POSTING_DATABASE_COUNT,
        });

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        let previous_words_prefixes_fst =
            self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;

        // Run the words prefixes update operation.
        let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
        if let Some(value) = self.config.words_prefix_threshold {
            builder.threshold(value);
        }
        if let Some(value) = self.config.max_prefix_length {
            builder.max_prefix_length(value);
        }
        builder.execute()?;

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        let current_prefix_fst;
        let common_prefix_fst_words_tmp;
        let common_prefix_fst_words: Vec<_>;
        let new_prefix_fst_words;
        let del_prefix_fst_words;

        {
            let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs");
            let _entered = span.enter();
            puffin::profile_scope!("compute_prefix_diffs");

            current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;

            // We retrieve the common words between the previous and new prefix word fst.
            common_prefix_fst_words_tmp = fst_stream_into_vec(
                previous_words_prefixes_fst.op().add(&current_prefix_fst).intersection(),
            );
            common_prefix_fst_words = common_prefix_fst_words_tmp
                .as_slice()
                .linear_group_by_key(|x| x.chars().next().unwrap())
                .collect();

            // We retrieve the newly added words between the previous and new prefix word fst.
            new_prefix_fst_words = fst_stream_into_vec(
                current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
            );

            // We compute the set of prefixes that are no more part of the prefix fst.
            del_prefix_fst_words = fst_stream_into_hashset(
                previous_words_prefixes_fst.op().add(&current_prefix_fst).difference(),
            );
        }

        databases_seen += 1;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
            databases_seen,
            total_databases: TOTAL_POSTING_DATABASE_COUNT,
        });

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        if let Some(word_docids) = word_docids {
            execute_word_prefix_docids(
                self.wtxn,
                word_docids,
                self.index.word_docids,
                self.index.word_prefix_docids,
                self.indexer_config,
                &new_prefix_fst_words,
                &common_prefix_fst_words,
                &del_prefix_fst_words,
            )?;
        }

        if let Some(exact_word_docids) = exact_word_docids {
            execute_word_prefix_docids(
                self.wtxn,
                exact_word_docids,
                self.index.exact_word_docids,
                self.index.exact_word_prefix_docids,
                self.indexer_config,
                &new_prefix_fst_words,
                &common_prefix_fst_words,
                &del_prefix_fst_words,
            )?;
        }

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        databases_seen += 1;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
            databases_seen,
            total_databases: TOTAL_POSTING_DATABASE_COUNT,
        });

        if let Some(word_position_docids) = word_position_docids {
            // Run the words prefix position docids update operation.
            let mut builder = WordPrefixIntegerDocids::new(
                self.wtxn,
                self.index.word_prefix_position_docids,
                self.index.word_position_docids,
            );
            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
            builder.max_memory = self.indexer_config.max_memory;

            builder.execute(
                word_position_docids,
                &new_prefix_fst_words,
                &common_prefix_fst_words,
                &del_prefix_fst_words,
            )?;
        }
        if let Some(word_fid_docids) = word_fid_docids {
            // Run the words prefix fid docids update operation.
            let mut builder = WordPrefixIntegerDocids::new(
                self.wtxn,
                self.index.word_prefix_fid_docids,
                self.index.word_fid_docids,
            );
            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
            builder.max_memory = self.indexer_config.max_memory;
            builder.execute(
                word_fid_docids,
                &new_prefix_fst_words,
                &common_prefix_fst_words,
                &del_prefix_fst_words,
            )?;
        }

        if (self.should_abort)() {
            return Err(Error::InternalError(InternalError::AbortedIndexation));
        }

        databases_seen += 1;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
            databases_seen,
            total_databases: TOTAL_POSTING_DATABASE_COUNT,
        });

        Ok(())
    }
}

/// Run the word prefix docids update operation.
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(
    level = "trace",
    skip_all,
    target = "indexing::prefix",
    name = "index_documents_word_prefix_docids"
)]
fn execute_word_prefix_docids(
    txn: &mut heed::RwTxn,
    merger: Merger<CursorClonableMmap, MergeFn>,
    word_docids_db: Database<Str, CboRoaringBitmapCodec>,
    word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
    indexer_config: &IndexerConfig,
    new_prefix_fst_words: &[String],
    common_prefix_fst_words: &[&[String]],
    del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
    puffin::profile_function!();

    let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
    builder.chunk_compression_type = indexer_config.chunk_compression_type;
    builder.chunk_compression_level = indexer_config.chunk_compression_level;
    builder.max_nb_chunks = indexer_config.max_nb_chunks;
    builder.max_memory = indexer_config.max_memory;
    builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
    Ok(())
}

#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;

    use big_s::S;
    use fst::IntoStreamer;
    use heed::RwTxn;
    use maplit::hashset;

    use super::*;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::search::TermsMatchingStrategy;
    use crate::update::Setting;
    use crate::{db_snap, Filter, Search};

    /// Re-sending documents under already known ids must never change the document count.
    #[test]
    fn simple_document_replacement() {
        let index = TempIndex::new();

        // Opens a short-lived read transaction and returns the document count the index reports.
        let stored_documents = || {
            let rtxn = index.read_txn().unwrap();
            index.number_of_documents(&rtxn).unwrap()
        };

        // Step 1: seed the index with three documents whose ids are 1, 2 and 3.
        index
            .add_documents(documents!([
                { "id": 1, "name": "kevin" },
                { "id": 2, "name": "kevina" },
                { "id": 3, "name": "benoit" }
            ]))
            .unwrap();
        assert_eq!(stored_documents(), 3);

        // Step 2: send a single document reusing the id 1; the count is expected to stay at 3.
        index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap();
        assert_eq!(stored_documents(), 3);

        // Step 3: send the three ids again with new names.
        index
            .add_documents(documents!([
                { "id": 1, "name": "updated second kevin" },
                { "id": 2, "name": "updated kevina" },
                { "id": 3, "name": "updated benoit" }
            ]))
            .unwrap();

        // Both the reported count and the number of documents that can actually be
        // iterated over must still be 3, observed through the same read transaction.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 3);
        assert_eq!(index.all_documents(&rtxn).unwrap().count(), 3);
    }

    /// With the `UpdateDocuments` method, documents sharing an id end up as a single document.
    #[test]
    fn simple_document_merge() {
        let mut index = TempIndex::new();
        // Switch the index method so that documents are merged.
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        // One batch made of three documents that all carry the id 1.
        index
            .add_documents(documents!([
                { "id": 1, "name": "kevin" },
                { "id": 1, "name": "kevina" },
                { "id": 1, "name": "benoit" }
            ]))
            .unwrap();

        {
            let rtxn = index.read_txn().unwrap();

            // Only one document must be reported...
            assert_eq!(index.number_of_documents(&rtxn).unwrap(), 1);

            // ...and only one must come back from the database, under the internal id 0.
            let stored = index.documents(&rtxn, Some(0)).unwrap();
            assert_eq!(stored.len(), 1);
            let (internal_id, document) = stored[0];
            assert_eq!(internal_id, 0);

            // Its fields must be exactly those of the last document of the batch.
            let fields: Vec<_> = document.iter().collect();
            assert_eq!(fields, vec![(0, &b"1"[..]), (1, &br#""benoit""#[..])]);
        }

        // A second batch reuses the id 1 and only brings a new `age` field.
        index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap();

        {
            let rtxn = index.read_txn().unwrap();

            // There must still be a single document.
            assert_eq!(index.number_of_documents(&rtxn).unwrap(), 1);

            let stored = index.documents(&rtxn, Some(0)).unwrap();
            assert_eq!(stored.len(), 1);
            let (internal_id, document) = stored[0];
            assert_eq!(internal_id, 0);

            // The previous fields are kept and the new field is appended after them.
            let fields: Vec<_> = document.iter().collect();
            assert_eq!(
                fields,
                vec![(0, &b"1"[..]), (1, &br#""benoit""#[..]), (2, &b"25"[..])]
            );
        }
    }

    /// Documents sent without any id must be refused and must leave the index empty.
    #[test]
    fn not_auto_generated_documents_ids() {
        let index = TempIndex::new();

        // None of these documents carries an `id` field: the addition is expected to fail.
        let outcome = index.add_documents(documents!([
            { "name": "kevin" },
            { "name": "kevina" },
            { "name": "benoit" }
        ]));
        assert!(outcome.is_err());

        // The failed addition must not have stored anything.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 0);
    }

    /// With `autogenerate_docids` enabled, documents sent without an id get a generated one
    /// that can later be used to replace the document.
    #[test]
    fn simple_auto_generated_documents_ids() {
        let mut index = TempIndex::new();
        index.index_documents_config.autogenerate_docids = true;

        // Send three documents that do not carry any id.
        index
            .add_documents(documents!([
                { "name": "kevin" },
                { "name": "kevina" },
                { "name": "benoit" }
            ]))
            .unwrap();

        // Read back the id that was generated for the "kevin" document.
        let kevin_uuid: String = {
            let rtxn = index.read_txn().unwrap();
            assert_eq!(index.number_of_documents(&rtxn).unwrap(), 3);

            let stored = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
            // Field 0 is compared against the name, field 1 is parsed as the generated id.
            let (_, kevin) =
                stored.iter().find(|(_, fields)| fields.get(0) == Some(br#""kevin""#)).unwrap();
            let generated: String = serde_json::from_slice(kevin.get(1).unwrap()).unwrap();
            generated
        };

        // Send one document carrying the generated id, to replace the previous "kevin".
        index.add_documents(documents!([ { "name": "updated kevin", "id": kevin_uuid } ])).unwrap();

        let rtxn = index.read_txn().unwrap();

        // The number of documents must not have changed.
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 3);

        // The internal ids 1, 2 and 0 are requested in that order and the updated
        // document is expected at the last position of the result.
        let stored = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
        let updated_position = stored
            .iter()
            .position(|(_, fields)| fields.get(0).unwrap() == br#""updated kevin""#)
            .unwrap();
        assert_eq!(updated_position, 2);
        let (_, updated) = stored[updated_position];

        // The stored document must match the one that was just sent.
        assert_eq!(updated.get(0), Some(&br#""updated kevin""#[..]));
        // This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes (").
        assert_eq!(updated.get(1).unwrap().len(), 36 + 2);
    }

    /// A document without id can be added, once id generation is enabled, to an index that
    /// already contains documents with explicit ids.
    #[test]
    fn reordered_auto_generated_documents_ids() {
        let mut index = TempIndex::new();

        // Start with three documents whose ids are 1, 2 and 3.
        index
            .add_documents(documents!([
                { "id": 1, "name": "kevin" },
                { "id": 2, "name": "kevina" },
                { "id": 3, "name": "benoit" }
            ]))
            .unwrap();

        {
            let rtxn = index.read_txn().unwrap();
            assert_eq!(index.number_of_documents(&rtxn).unwrap(), 3);
        }

        // Enable id generation, then send one document that does not specify any id.
        index.index_documents_config.autogenerate_docids = true;
        index.add_documents(documents!([ { "name": "new kevin" } ])).unwrap();

        // The new document must have been added next to the three existing ones.
        {
            let rtxn = index.read_txn().unwrap();
            assert_eq!(index.number_of_documents(&rtxn).unwrap(), 4);
        }
    }

    /// Adding an empty batch succeeds and leaves the index without any document.
    #[test]
    fn empty_update() {
        let index = TempIndex::new();

        // The batch contains zero documents.
        let empty_batch = documents!([]);
        index.add_documents(empty_batch).unwrap();

        // Nothing must have been stored.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 0);
    }

    /// A document with an invalid id is refused, and does not prevent later valid additions.
    #[test]
    fn invalid_documents_ids() {
        let index = TempIndex::new();

        // The id contains a space: this addition is expected to fail.
        let rejected =
            index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ]));
        assert!(rejected.is_err());

        // A numeric id is accepted.
        index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap();

        // Only the second document must be stored.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 1);
    }

    /// Values nested in objects, arrays and arrays of objects are all searchable.
    #[test]
    fn complex_documents() {
        let index = TempIndex::new();

        // Three documents, each one holding a different kind of nested value.
        index
            .add_documents(documents!([
                { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
                { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
                { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
            ]))
            .unwrap();

        let rtxn = index.read_txn().unwrap();

        // Each quoted query must return exactly the document holding the value:
        // a sub object value, a sub array value, then a value of an object in an array.
        let expectations =
            [(r#""value2""#, 0), (r#""fine""#, 1), (r#""amazing""#, 2)];
        for (query, expected_id) in expectations {
            let result = index.search(&rtxn).query(query).execute().unwrap();
            assert_eq!(result.documents_ids, vec![expected_id]);
        }
    }

    /// Partial updates applied after an initial replacement keep the number of documents
    /// unchanged; `word_docids` is snapshotted before and after the updates.
    #[test]
    fn simple_documents_replace() {
        let mut index = TempIndex::new();

        // Initial content, sent with the replace method.
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
        let library = documents!([
            { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
            { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
            { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
            { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
            { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
            { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
        ]);
        index.add_documents(library).unwrap();

        db_snap!(index, word_docids, "initial");

        // From now on documents are sent with the update method.
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        // First update: new titles for the ids 4 and 456.
        let retitled = documents!([
            {"id":4,"title":"Harry Potter and the Half-Blood Princess"},
            {"id":456,"title":"The Little Prince"}
        ]);
        index.add_documents(retitled).unwrap();

        // Second update: a new author and a `date` field for the id 2.
        let amended = documents!([
            { "id": 2, "author": "J. Austen", "date": "1813" }
        ]);
        index.add_documents(amended).unwrap();

        // All the updates targeted existing ids: there must still be 6 documents.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 6);
        assert_eq!(index.all_documents(&rtxn).unwrap().count(), 6);

        db_snap!(index, word_docids, "updated");
    }

    /// Making `_geo` filterable must succeed when only some of the documents have a `_geo`.
    #[test]
    fn mixed_geo_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        // Six documents: the first and the last have a `_geo` field, the others do not.
        let mixed = documents!([
            { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
            { "id": 456 },
            { "id": 1 },
            { "id": 1344 },
            { "id": 4 },
            { "id": 42, "_geo": { "lat": 35, "lng": 23 } }
        ]);
        index.add_documents(mixed).unwrap();

        // The settings update is only required not to fail.
        index.update_settings(|s| s.set_filterable_fields(hashset!(S("_geo")))).unwrap();
    }

    /// When `_geo` is filterable, malformed `_geo` values are refused with a precise message.
    #[test]
    fn geo_error() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        index.update_settings(|s| s.set_filterable_fields(hashset!(S("_geo")))).unwrap();

        // The latitude is missing.
        let message = index
            .add_documents(documents!([
              { "id": 0, "_geo": { "lng": 42 } }
            ]))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            r#"Could not find latitude in the document with the id: `0`. Was expecting a `_geo.lat` field."#
        );

        // The longitude is missing.
        let message = index
            .add_documents(documents!([
              { "id": 0, "_geo": { "lat": 42 } }
            ]))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            r#"Could not find longitude in the document with the id: `0`. Was expecting a `_geo.lng` field."#
        );

        // The latitude is a string that is not a number.
        let message = index
            .add_documents(documents!([
              { "id": 0, "_geo": { "lat": "lol", "lng": 42 } }
            ]))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."#
        );

        // The latitude is an array.
        let message = index
            .add_documents(documents!([
              { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } }
            ]))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."#
        );

        // The longitude is a string that is not a number.
        let message = index
            .add_documents(documents!([
              { "id": 0, "_geo": { "lat": 12, "lng": "hello" } }
            ]))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."#
        );
    }

    /// A deleted document can be inserted again, and then sent once more, without error.
    #[test]
    fn delete_documents_then_insert() {
        let index = TempIndex::new();

        // Builds the batch holding the single document that gets deleted and sent again.
        let hamlet = || {
            documents!([
                { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
            ])
        };

        index
            .add_documents(documents!([
                { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
                { "objectId": 456, "title": "Le Petit Prince",     "comment": "A french book" },
                { "objectId": 1,   "title": "Alice In Wonderland", "comment": "A weird book" },
                { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
            ]))
            .unwrap();

        // Delete only one of the documents.
        index.delete_document("30");

        // This read transaction stays open until the end of the test.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.primary_key(&rtxn).unwrap(), Some("objectId"));

        // The external id of the deleted document must not be known anymore.
        assert!(index.external_documents_ids().get(&rtxn, "30").unwrap().is_none());

        index.add_documents(hamlet()).unwrap();

        // After the insertion, the external id must be known again; this lookup
        // goes through a write transaction that is committed right after.
        let wtxn = index.write_txn().unwrap();
        assert!(index.external_documents_ids().get(&wtxn, "30").unwrap().is_some());
        wtxn.commit().unwrap();

        // Sending the very same document once more must also succeed.
        index.add_documents(hamlet()).unwrap();
    }

    /// A single document made of an id and 1000 additional fields can be indexed.
    #[test]
    fn index_more_than_256_fields() {
        let index = TempIndex::new();

        // The `id` field comes first, followed by fields named "0" to "999".
        let mut big_object = serde_json::Map::new();
        big_object.insert(S("id"), serde_json::Value::from("wow"));
        big_object
            .extend((0..1000).map(|i| (i.to_string(), serde_json::Value::from("I am a text!"))));

        // The addition is only required not to fail.
        index.add_documents(documents_batch_reader_from_objects([big_object])).unwrap();
    }

    /// Words located very far in a field are still present in `word_docids`.
    #[test]
    fn index_more_than_1000_positions_in_a_field() {
        let index = TempIndex::new_with_map_size(4096 * 100_000); // 400 MB

        // The field holds every number from 0 to `u16::MAX`, each followed by a space.
        let mut content = String::with_capacity(382101);
        (0..=u16::MAX).for_each(|i| {
            content.push_str(&i.to_string());
            content.push(' ');
        });

        index
            .add_documents(documents!({
                "id": "wow",
                "content": content
            }))
            .unwrap();

        let rtxn = index.read_txn().unwrap();

        // Sample numbers from the very beginning up to the very end of the field.
        for word in ["0", "64", "256", "1024", "32768", "65535"] {
            assert!(index.word_docids.get(&rtxn, word).unwrap().is_some());
        }
    }

    /// Documents whose values contain NUL characters, and whose field names contain
    /// `{` or `$`, can be indexed.
    #[test]
    fn index_documents_with_zeroes() {
        let index = TempIndex::new();

        // The last document is the one holding `\0` in its `title` and `genre` values.
        let batch = documents!([
            {
                "id": 2,
                "title": "Prideand Prejudice",
                "au{hor": "Jane Austin",
                "genre": "romance",
                "price$": "3.5$",
            },
            {
                "id": 456,
                "title": "Le Petit Prince",
                "au{hor": "Antoine de Saint-Exupéry",
                "genre": "adventure",
                "price$": "10.0$",
            },
            {
                "id": 1,
                "title": "Wonderland",
                "au{hor": "Lewis Carroll",
                "genre": "fantasy",
                "price$": "25.99$",
            },
            {
                "id": 4,
                "title": "Harry Potter ing fantasy\0lood Prince",
                "au{hor": "J. K. Rowling",
                "genre": "fantasy\0",
            },
        ]);

        // The addition is only required not to fail.
        index.add_documents(batch).unwrap();
    }

    /// Searchable and filterable settings can target nested fields through dotted names,
    /// without exposing the parent field itself.
    #[test]
    fn index_documents_with_nested_fields() {
        let index = TempIndex::new();

        // `nested` is absent, an object, an array mixing a string and objects, then a string.
        index
            .add_documents(documents!([
                {
                    "id": 0,
                    "title": "The zeroth document",
                },
                {
                    "id": 1,
                    "title": "The first document",
                    "nested": {
                        "object": "field",
                        "machin": "bidule",
                    },
                },
                {
                    "id": 2,
                    "title": "The second document",
                    "nested": [
                        "array",
                        {
                            "object": "field",
                        },
                        {
                            "prout": "truc",
                            "machin": "lol",
                        },
                    ],
                },
                {
                    "id": 3,
                    "title": "The third document",
                    "nested": "I lied",
                },
            ]))
            .unwrap();

        // The same three fields are made both searchable and filterable.
        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec![
                    S("title"),
                    S("nested.object"),
                    S("nested.machin"),
                ]);
                settings.set_filterable_fields(hashset!(
                    S("title"),
                    S("nested.object"),
                    S("nested.machin")
                ));
            })
            .unwrap();

        let rtxn = index.read_txn().unwrap();

        assert_eq!(
            index.faceted_fields(&rtxn).unwrap(),
            hashset!(S("title"), S("nested.object"), S("nested.machin"))
        );

        // testing the simple query search
        let mut search = crate::Search::new(&rtxn, &index);
        search.query("document");
        search.terms_matching_strategy(TermsMatchingStrategy::default());
        // all documents should be returned
        assert_eq!(search.execute().unwrap().documents_ids.len(), 4);

        // Runs `query` on the search configured above and returns the matching ids.
        let mut ids_matching = |query: &str| {
            search.query(query);
            search.execute().unwrap().documents_ids
        };

        // One distinctive title word per document.
        assert_eq!(ids_matching("zeroth"), vec![0]);
        assert_eq!(ids_matching("first"), vec![1]);
        assert_eq!(ids_matching("second"), vec![2]);
        assert_eq!(ids_matching("third"), vec![3]);

        // Values of the searchable nested fields.
        assert_eq!(ids_matching("field"), vec![1, 2]);
        assert_eq!(ids_matching("lol"), vec![2]);

        // "object" only appears as a key name, never as a value of a searchable field.
        assert!(ids_matching("object").is_empty());
        assert!(ids_matching("array").is_empty()); // nested is not searchable
        assert!(ids_matching("lied").is_empty()); // nested is not searchable

        // testing the filters
        let mut search = crate::Search::new(&rtxn, &index);

        // Filters on filterable fields, with the ids they are expected to return.
        for (expression, expected_ids) in [
            (r#"title = "The first document""#, vec![1]),
            (r#"nested.object = field"#, vec![1, 2]),
            (r#"nested.machin = bidule"#, vec![1]),
        ] {
            search.filter(crate::Filter::from_str(expression).unwrap().unwrap());
            let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
            assert_eq!(documents_ids, expected_ids);
        }

        // Filters on `nested` itself must be refused: nested is not filterable.
        for expression in [r#"nested = array"#, r#"nested = "I lied""#] {
            search.filter(crate::Filter::from_str(expression).unwrap().unwrap());
            let error = search.execute().map(|_| unreachable!()).unwrap_err();
            assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
        }
    }

    /// A dotted primary key is found whatever the way the path is split between nested
    /// objects and dotted field names.
    #[test]
    fn index_documents_with_nested_primary_key() {
        let index = TempIndex::new();

        index
            .update_settings(|settings| settings.set_primary_key("complex.nested.id".to_owned()))
            .unwrap();

        // The four documents spell the `complex.nested.id` path in four different ways.
        index
            .add_documents(documents!([
                {
                    "complex": {
                        "nested": {
                            "id": 0,
                        },
                    },
                    "title": "The zeroth document",
                },
                {
                    "complex.nested": {
                        "id": 1,
                    },
                    "title": "The first document",
                },
                {
                    "complex": {
                        "nested.id": 2,
                    },
                    "title": "The second document",
                },
                {
                    "complex.nested.id": 3,
                    "title": "The third document",
                },
            ]))
            .unwrap();

        let rtxn = index.read_txn().unwrap();

        // testing the simple query search
        let mut search = crate::Search::new(&rtxn, &index);
        search.query("document");
        search.terms_matching_strategy(TermsMatchingStrategy::default());
        // all documents should be returned
        assert_eq!(search.execute().unwrap().documents_ids.len(), 4);

        // Each distinctive title word must return the expected single document.
        for (word, expected_id) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
            search.query(word);
            let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
            assert_eq!(documents_ids, vec![expected_id]);
        }
    }

    /// With `a.b` as primary key, a document in which `a.b` is an object must be refused.
    #[test]
    fn retrieve_a_b_nested_document_id() {
        let index = TempIndex::new();

        index.update_settings(|settings| settings.set_primary_key("a.b".to_owned())).unwrap();

        // In this document `a.b` holds `{ "c": 1 }`: the addition is expected to fail.
        let outcome = index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}}));
        assert!(outcome.is_err());
    }

    /// With `a.b.c` as primary key, the id is read from the nested objects of the document.
    #[test]
    fn retrieve_a_b_c_nested_document_id() {
        let index = TempIndex::new();

        index.update_settings(|settings| settings.set_primary_key("a.b.c".to_owned())).unwrap();
        index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap();

        let rtxn = index.read_txn().unwrap();

        // Exactly one document must be stored...
        assert_eq!(index.all_documents(&rtxn).unwrap().count(), 1);
        // ...and it must be known under the external id "1".
        assert!(index.external_documents_ids().get(&rtxn, "1").unwrap().is_some());
    }

    // Checks the facet databases and the faceted fields across three successive settings:
    // `dog` filterable, filterable fields reset, then `dog.race` sortable.
    #[test]
    fn test_facets_generation() {
        let index = TempIndex::new();

        // The four documents spell the `dog.race.bernese mountain` path in four
        // different ways, mixing nested objects and dotted field names.
        index
            .add_documents(documents!([
                {
                    "id": 0,
                    "dog": {
                        "race": {
                            "bernese mountain": "zeroth",
                        },
                    },
                },
                {
                    "id": 1,
                    "dog.race": {
                        "bernese mountain": "first",
                    },
                },
                {
                    "id": 2,
                    "dog.race.bernese mountain": "second",
                },
                {
                    "id": 3,
                    "dog": {
                        "race.bernese mountain": "third"
                    },
                },
            ]))
            .unwrap();

        // Step 1: make the top-level `dog` field filterable.
        index
            .update_settings(|settings| {
                settings.set_filterable_fields(hashset!(String::from("dog")));
            })
            .unwrap();

        // The inline snapshots below are part of the test expectations: keep them untouched.
        db_snap!(index, facet_id_string_docids, @r###"
        3   0  first        1  [1, ]
        3   0  second       1  [2, ]
        3   0  third        1  [3, ]
        3   0  zeroth       1  [0, ]
        "###);
        db_snap!(index, field_id_docid_facet_strings, @r###"
        3   0    zeroth       zeroth
        3   1    first        first
        3   2    second       second
        3   3    third        third
        "###);

        let rtxn = index.read_txn().unwrap();

        // Despite its name, `hidden` holds the faceted fields reported by the index.
        let hidden = index.faceted_fields(&rtxn).unwrap();

        // `dog` and both of its dotted sub paths are expected to be faceted.
        assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain")));

        // Filtering on the full dotted path must return, for each value, the one document
        // holding it.
        for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
            let mut search = crate::Search::new(&rtxn, &index);
            let filter = format!(r#""dog.race.bernese mountain" = {s}"#);
            search.filter(crate::Filter::from_str(&filter).unwrap().unwrap());
            let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
            assert_eq!(documents_ids, vec![i]);
        }
        // Step 2: reset the settings
        index
            .update_settings(|settings| {
                settings.reset_filterable_fields();
            })
            .unwrap();

        // Both facet databases are expected to be empty after the reset.
        db_snap!(index, facet_id_string_docids, @"");
        db_snap!(index, field_id_docid_facet_strings, @"");

        // A new read transaction is opened to observe the state written by the update.
        let rtxn = index.read_txn().unwrap();

        let facets = index.faceted_fields(&rtxn).unwrap();

        // No faceted field must remain.
        assert_eq!(facets, hashset!());

        // Step 3: update the settings to test the sortable
        index
            .update_settings(|settings| {
                settings.set_sortable_fields(hashset!(S("dog.race")));
            })
            .unwrap();

        // The facet databases are expected to hold the same content as in step 1.
        db_snap!(index, facet_id_string_docids, @r###"
        3   0  first        1  [1, ]
        3   0  second       1  [2, ]
        3   0  third        1  [3, ]
        3   0  zeroth       1  [0, ]
        "###);
        db_snap!(index, field_id_docid_facet_strings, @r###"
        3   0    zeroth       zeroth
        3   1    first        first
        3   2    second       second
        3   3    third        third
        "###);

        let rtxn = index.read_txn().unwrap();

        let facets = index.faceted_fields(&rtxn).unwrap();

        // This time the top-level `dog` field is not part of the faceted fields.
        assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain")));

        // Sorting in ascending order on the full dotted path.
        let mut search = crate::Search::new(&rtxn, &index);
        search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S(
            "dog.race.bernese mountain",
        )))]);
        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
        // The expected order matches the alphabetical order of the values:
        // first (1), second (2), third (3), zeroth (0).
        assert_eq!(documents_ids, vec![1, 2, 3, 0]);
    }

    /// Sends the same four documents twice, with an empty batch in between,
    /// and checks that the index reports four documents after every step.
    #[test]
    fn index_2_times_documents_split_by_zero_document_indexation() {
        let index = TempIndex::new();

        // The identical batch is sent before and after the empty indexation.
        let four_documents = || {
            documents!([
                {"id": 0, "name": "Kerollmops", "score": 78},
                {"id": 1, "name": "ManyTheFish", "score": 75},
                {"id": 2, "name": "Ferdi", "score": 39},
                {"id": 3, "name": "Tommy", "score": 33}
            ])
        };

        index.add_documents(four_documents()).unwrap();

        // There are 4 documents after the first indexation.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 4);

        index.add_documents(documents!([])).unwrap();

        // There are still 4 documents after indexing an empty batch.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 4);

        index.add_documents(four_documents()).unwrap();

        // There are still 4 documents after sending the same batch again.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(index.number_of_documents(&rtxn).unwrap(), 4);
    }

    /// Indexes two Chinese titles and checks the resulting word entries and
    /// that searching for "化妆包" returns a single document.
    #[cfg(feature = "chinese")]
    #[test]
    fn test_meilisearch_1714() {
        let index = TempIndex::new();

        let titles = documents!([
          {"id": "123", "title": "小化妆包" },
          {"id": "456", "title": "Ipad 包" }
        ]);
        index.add_documents(titles).unwrap();

        let rtxn = index.read_txn().unwrap();

        // "huàzhuāngbāo" must only match the first document,
        // and "bāo" must only match the second one.
        for word in ["huàzhuāngbāo", "bāo"] {
            let docids = index.word_docids.get(&rtxn, word).unwrap().unwrap();
            assert_eq!(docids.len(), 1);
        }

        let mut search = crate::Search::new(&rtxn, &index);
        search.query("化妆包");
        search.terms_matching_strategy(TermsMatchingStrategy::default());

        // Exactly one document must be returned.
        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
        assert_eq!(documents_ids.len(), 1);
    }

    /// Indexing documents whose titles contain very long words
    /// must succeed without returning any error.
    #[test]
    fn text_with_too_long_words() {
        let index = TempIndex::new();

        // One word of 256 bytes, one of 512 bytes, and two words of 250 bytes each.
        let word_256 = "a".repeat(256);
        let word_512 = "b".repeat(512);
        let two_words = format!("{} {}", "c".repeat(250), "d".repeat(250));

        let batch = documents!([
          {"id": 1, "title": word_256 },
          {"id": 2, "title": word_512 },
          {"id": 3, "title": two_words },
        ]);

        index.add_documents(batch).unwrap();
    }

    /// Indexes 200 documents, then one more, that all carry the same very long
    /// `script` value; both indexations must succeed.
    #[test]
    fn text_with_too_long_keys() {
        let index = TempIndex::new();
        let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3";

        // Build a batch of 200 documents that all share the long text.
        let objects = (0..200i32).filter_map(|i| {
            let json = serde_json::json!({ "id": i, "script": script });
            if let serde_json::Value::Object(object) = json {
                Some(object)
            } else {
                None
            }
        });
        let content = documents_batch_reader_from_objects(objects);

        // Index those 200 long documents.
        index.add_documents(content).unwrap();

        // Then index a single additional long document.
        let single = documents!([
          {"id": 400, "script": script },
        ]);
        index.add_documents(single).unwrap();
    }

    /// Adds two documents in two separate indexations and checks that the
    /// external-to-internal id map never associates two keys with the same value.
    #[test]
    fn index_documents_in_multiple_transforms() {
        let index = TempIndex::new();

        let first = documents! {[{
            "id": 228142,
            "title": "asdsad",
            "state": "automated",
            "priority": "normal",
            "public_uid": "37ccf021",
            "project_id": 78207,
            "branch_id_number": 0
        }]};

        let second = documents! {[{
            "id": 228143,
            "title": "something",
            "state": "automated",
            "priority": "normal",
            "public_uid": "39c6499b",
            "project_id": 78207,
            "branch_id_number": 0
        }]};

        // Set the primary key explicitly before adding any document.
        {
            let mut wtxn = index.write_txn().unwrap();
            index.put_primary_key(&mut wtxn, "id").unwrap();
            wtxn.commit().unwrap();
        }

        index.add_documents(first).unwrap();
        index.add_documents(second).unwrap();

        let rtxn = index.read_txn().unwrap();

        let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
        let distinct_values: HashSet<_> = external_ids.values().collect();

        // As many distinct values as entries: no value is shared between two keys.
        assert_eq!(distinct_values.len(), external_ids.len());
    }

    /// Checks the content of `facet_id_exists_docids` for the `colour` and
    /// `colour.green` fields, whether the filterable setting is applied after
    /// or before the documents are added.
    #[test]
    fn index_documents_check_exists_database() {
        let content = || {
            documents!([
                { "id": 0, "colour": 0 },
                { "id": 1, "colour": [] },
                { "id": 2, "colour": {} },
                { "id": 3, "colour": null },
                { "id": 4, "colour": [1] },
                { "id": 5 },
                { "id": 6, "colour": { "green": 1 } },
                { "id": 7, "colour": { "green": { "blue": [] } } }
            ])
        };

        let check_ok = |index: &Index| {
            let rtxn = index.read_txn().unwrap();

            let facets = index.faceted_fields(&rtxn).unwrap();
            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));

            let fields = index.fields_ids_map(&rtxn).unwrap();
            let colour_id = fields.id("colour").unwrap();
            let colour_green_id = fields.id("colour.green").unwrap();

            // Every document with a `colour` key is listed; document 5 has none.
            let with_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap();
            assert_eq!(with_colour.iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6, 7]);

            let with_green =
                index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
            assert_eq!(with_green.iter().collect::<Vec<_>>(), vec![6, 7]);
        };

        let faceted_fields = hashset!(S("colour"));

        // First pass: documents, then settings. Second pass: settings, then documents.
        for documents_first in [true, false] {
            let index = TempIndex::new();
            if documents_first {
                index.add_documents(content()).unwrap();
            }
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
            if !documents_first {
                index.add_documents(content()).unwrap();
            }
            check_ok(&index);
        }
    }

    /// Checks the content of `facet_id_is_null_docids` for the `colour`,
    /// `colour.green` and `colour.green.blue` fields, whether the filterable
    /// setting is applied after or before the documents are added.
    #[test]
    fn index_documents_check_is_null_database() {
        let content = || {
            documents!([
                { "id": 0, "colour": null },
                { "id": 1, "colour": [null] }, // must not be returned
                { "id": 6, "colour": { "green": null } },
                { "id": 7, "colour": { "green": { "blue": null } } },
                { "id": 8, "colour": 0 },
                { "id": 9, "colour": [] },
                { "id": 10, "colour": {} },
                { "id": 12, "colour": [1] },
                { "id": 13 },
                { "id": 14, "colour": { "green": 1 } },
                { "id": 15, "colour": { "green": { "blue": [] } } }
            ])
        };

        let check_ok = |index: &Index| {
            let rtxn = index.read_txn().unwrap();

            let facets = index.faceted_fields(&rtxn).unwrap();
            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));

            let fields = index.fields_ids_map(&rtxn).unwrap();
            let colour_id = fields.id("colour").unwrap();
            let colour_green_id = fields.id("colour.green").unwrap();
            let colour_blue_id = fields.id("colour.green.blue").unwrap();

            let null_colour = index.facet_id_is_null_docids.get(&rtxn, &colour_id).unwrap().unwrap();
            assert_eq!(null_colour.iter().collect::<Vec<_>>(), vec![0]);

            let null_green =
                index.facet_id_is_null_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
            assert_eq!(null_green.iter().collect::<Vec<_>>(), vec![2]);

            let null_blue =
                index.facet_id_is_null_docids.get(&rtxn, &colour_blue_id).unwrap().unwrap();
            assert_eq!(null_blue.iter().collect::<Vec<_>>(), vec![3]);
        };

        let faceted_fields = hashset!(S("colour"));

        // First pass: documents, then settings. Second pass: settings, then documents.
        for documents_first in [true, false] {
            let index = TempIndex::new();
            if documents_first {
                index.add_documents(content()).unwrap();
            }
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
            if !documents_first {
                index.add_documents(content()).unwrap();
            }
            check_ok(&index);
        }
    }

    /// Checks the content of `facet_id_is_empty_docids` for the `tags`,
    /// `tags.green` and `tags.green.blue` fields, whether the filterable
    /// setting is applied after or before the documents are added.
    #[test]
    fn index_documents_check_is_empty_database() {
        let content = || {
            documents!([
                {"id": 0, "tags": null },
                {"id": 1, "tags": [null] },
                {"id": 2, "tags": [] },
                {"id": 3, "tags": ["hello","world"] },
                {"id": 4, "tags": [""] },
                {"id": 5 },
                {"id": 6, "tags": {} },
                {"id": 7, "tags": {"green": "cool"} },
                {"id": 8, "tags": {"green": ""} },
                {"id": 9, "tags": "" },
                {"id": 10, "tags": { "green": null } },
                {"id": 11, "tags": { "green": { "blue": null } } },
                {"id": 12, "tags": { "green": { "blue": [] } } }
            ])
        };

        let check_ok = |index: &Index| {
            let rtxn = index.read_txn().unwrap();

            let facets = index.faceted_fields(&rtxn).unwrap();
            assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));

            let fields = index.fields_ids_map(&rtxn).unwrap();
            let tags_id = fields.id("tags").unwrap();
            let tags_green_id = fields.id("tags.green").unwrap();
            let tags_blue_id = fields.id("tags.green.blue").unwrap();

            let empty_tags = index.facet_id_is_empty_docids.get(&rtxn, &tags_id).unwrap().unwrap();
            assert_eq!(empty_tags.iter().collect::<Vec<_>>(), vec![2, 6, 9]);

            let empty_green =
                index.facet_id_is_empty_docids.get(&rtxn, &tags_green_id).unwrap().unwrap();
            assert_eq!(empty_green.iter().collect::<Vec<_>>(), vec![8]);

            let empty_blue =
                index.facet_id_is_empty_docids.get(&rtxn, &tags_blue_id).unwrap().unwrap();
            assert_eq!(empty_blue.iter().collect::<Vec<_>>(), vec![12]);
        };

        let faceted_fields = hashset!(S("tags"));

        // First pass: documents, then settings. Second pass: settings, then documents.
        for documents_first in [true, false] {
            let index = TempIndex::new();
            if documents_first {
                index.add_documents(content()).unwrap();
            }
            index
                .update_settings(|settings| {
                    settings.set_filterable_fields(faceted_fields.clone());
                })
                .unwrap();
            if !documents_first {
                index.add_documents(content()).unwrap();
            }
            check_ok(&index);
        }
    }

    /// A negative integer id is accepted, while float ids are rejected,
    /// including `2.0`.
    #[test]
    fn primary_key_must_not_contain_floats() {
        let index = TempIndex::new_with_map_size(4096 * 100);

        // The integer id is the only one expected to be indexed successfully.
        let integer_id = documents! {[{ "id": -228142, "title": "asdsad" }]};
        index.add_documents(integer_id).unwrap();

        // Every float id must make the indexation fail.
        for float_id in [228143.56, -228143.56, 2.0] {
            let batch = documents! {[{ "id": float_id, "title": "something" }]};
            index.add_documents(batch).unwrap_err();
        }
    }

    /// Ids starting with a space, a tab, a carriage return or a line feed
    /// must all be rejected.
    #[test]
    fn primary_key_must_not_contain_whitespace() {
        let index = TempIndex::new();

        // Each pair is (id, title); every one of them must fail to be indexed.
        let invalid_documents =
            [(" 1", "asdsad"), ("\t2", "something"), ("\r3", "something"), ("\n4", "something")];

        for (id, title) in invalid_documents {
            let batch = documents! {[{ "id": id, "title": title }]};
            index.add_documents(batch).unwrap_err();
        }
    }

    /// Exercises the three outcomes of primary key inference: no candidate,
    /// several candidates, and a single candidate that gets selected.
    #[test]
    fn primary_key_inference() {
        let index = TempIndex::new();

        // No field is reported as a candidate for this document.
        let without_candidate = documents! {[{
            "title": "asdsad",
            "state": "automated",
            "priority": "normal",
            "branch_id_number": 0
        }]};
        let outcome = index.add_documents(without_candidate);
        assert!(matches!(outcome, Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))));

        // `id`, `project_id` and `public_uid` are all reported as candidates.
        let with_many_candidates = documents! {[{
            "id": 228143,
            "title": "something",
            "state": "automated",
            "priority": "normal",
            "public_uid": "39c6499b",
            "project_id": 78207,
            "branch_id_number": 0
        }]};

        let candidates = match index.add_documents(with_many_candidates) {
            Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { candidates })) => {
                candidates
            }
            _ => panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)"),
        };

        assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid")]);

        // Here `id` ends up being the primary key.
        let with_single_candidate = documents! {[{
            "video": "test.mp4",
            "id": 228143,
            "title": "something",
            "state": "automated",
            "priority": "normal",
            "public_uid_": "39c6499b",
            "project_id_": 78207,
            "branch_id_number": 0
        }]};

        index.add_documents(with_single_candidate).unwrap();

        let rtxn = index.read_txn().unwrap();
        let primary_key = index.primary_key(&rtxn).unwrap().unwrap();

        assert_eq!(primary_key, "id");
    }

    /// A 3000-byte word must be indexed without error and must be absent
    /// from the words FST afterwards.
    #[test]
    fn long_words_must_be_skipped() {
        let index = TempIndex::new();

        // This is obviously too long.
        let long_word = "lol".repeat(1000);
        let batch = documents! {[{
            "id": "1",
            "title": long_word,
        }]};
        index.add_documents(batch).unwrap();

        let rtxn = index.read_txn().unwrap();
        let is_indexed = index.words_fst(&rtxn).unwrap().contains(&long_word);
        assert!(!is_indexed);
    }

    /// Indexing a 3000-byte value in a filterable field must succeed.
    #[test]
    fn long_facet_values_must_not_crash() {
        let index = TempIndex::new();

        // Make `title` filterable before any document is added.
        index
            .update_settings(|settings| {
                settings.set_filterable_fields(hashset! { S("title") });
            })
            .unwrap();

        // This is obviously too long.
        let long_word = "lol".repeat(1000);
        let batch = documents! {[{
            "id": "1",
            "title": long_word,
        }]};

        index.add_documents(batch).unwrap();
    }

    /// Indexes titles written in several scripts and checks which documents
    /// are stored under the `(Cj, Jpn)` and `(Cj, Cmn)` script/language keys.
    #[cfg(feature = "default")]
    #[test]
    fn store_detected_script_and_language_per_document_during_indexing() {
        use charabia::{Language, Script};

        let index = TempIndex::new();
        let batch = documents!([
            { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
            { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
            { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
            { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
            { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
            { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
        ]);
        index.add_documents(batch).unwrap();

        let rtxn = index.read_txn().unwrap();

        // Fetches the documents ids stored under the given script/language pair.
        let docids_for = |script: Script, language: Language| {
            index.script_language_documents_ids(&rtxn, &(script, language)).unwrap().unwrap()
        };
        let cj_jpn_docs = docids_for(Script::Cj, Language::Jpn);
        let cj_cmn_docs = docids_for(Script::Cj, Language::Cmn);

        let expected_cj_jpn_docids = [3].iter().collect();
        assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);

        let expected_cj_cmn_docids = [1, 5].iter().collect();
        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
    }

    /// Adds three documents and removes one of them through the same
    /// `IndexDocuments` builder, before a single `execute`.
    #[test]
    fn add_and_delete_documents_in_single_transform() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let batch = documents!([
            { "id": 1, "doggo": "kevin" },
            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
            { "id": 3, "name": "jean", "age": 25 },
        ]);
        let (indexer, added_count) = indexer.add_documents(batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"3");

        let (indexer, removed_count) = indexer.remove_documents(vec![S("2")]).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"1");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 3,
            number_of_documents: 2,
        }
        "###);
        wtxn.commit().unwrap();

        // Document 2 is gone, the two others are stored as sent.
        db_snap!(index, documents, @r###"
        {"id":1,"doggo":"kevin"}
        {"id":3,"name":"jean","age":25}
        "###);
    }

    /// Adds three documents, updates two of them and removes two of them
    /// through the same `IndexDocuments` builder, before a single `execute`.
    #[test]
    fn add_update_and_delete_documents_in_single_transform() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let initial_batch = documents!([
            { "id": 1, "doggo": "kevin" },
            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
            { "id": 3, "name": "jean", "age": 25 },
        ]);
        let (indexer, added_count) = indexer.add_documents(initial_batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"3");

        let update_batch = documents!([
            { "id": 2, "catto": "jorts" },
            { "id": 3, "legs": 4 },
        ]);
        let (indexer, updated_count) = indexer.add_documents(update_batch).unwrap();
        insta::assert_display_snapshot!(updated_count.unwrap(), @"2");

        let (indexer, removed_count) = indexer.remove_documents(vec![S("1"), S("2")]).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"2");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 5,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        // Only document 3 remains, with the fields of both batches.
        db_snap!(index, documents, @r###"
        {"id":3,"name":"jean","age":25,"legs":4}
        "###);
    }

    /// Adds three documents in a first builder, then updates and removes
    /// documents in a second builder running in its own write transaction.
    #[test]
    fn add_document_and_in_another_transform_update_and_delete_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        // First transform: insert the initial batch.
        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let initial_batch = documents!([
            { "id": 1, "doggo": "kevin" },
            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
            { "id": 3, "name": "jean", "age": 25 },
        ]);
        let (indexer, added_count) = indexer.add_documents(initial_batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"3");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 3,
            number_of_documents: 3,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"doggo":"kevin"}
        {"id":2,"doggo":{"name":"bob","age":20}}
        {"id":3,"name":"jean","age":25}
        "###);

        // Second transform: update documents 2 and 3, then remove documents 1 and 2.
        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let update_batch = documents!([
            { "id": 2, "catto": "jorts" },
            { "id": 3, "legs": 4 },
        ]);
        let (indexer, updated_count) = indexer.add_documents(update_batch).unwrap();
        insta::assert_display_snapshot!(updated_count.unwrap(), @"2");

        let (indexer, removed_count) = indexer.remove_documents(vec![S("1"), S("2")]).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"2");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 2,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":3,"name":"jean","age":25,"legs":4}
        "###);
    }

    /// Requests the removal of documents on an empty index, then adds
    /// documents through the same builder; both added documents must be stored.
    #[test]
    fn delete_document_and_then_add_documents_in_the_same_transform() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        // Nothing has been added yet, so the removal reports zero documents.
        let (indexer, removed_count) = indexer.remove_documents(vec![S("1"), S("2")]).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"0");

        let batch = documents!([
            { "id": 2, "doggo": { "name": "jean", "age": 20 } },
            { "id": 3, "name": "bob", "age": 25 },
        ]);
        let (indexer, added_count) = indexer.add_documents(batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"2");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 2,
            number_of_documents: 2,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":2,"doggo":{"name":"jean","age":20}}
        {"id":3,"name":"bob","age":25}
        "###);
    }

    /// Passes duplicated ids to `remove_documents`, once before and once after
    /// adding the documents, all within the same builder.
    #[test]
    fn delete_the_same_document_multiple_time() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        // Nothing has been added yet, so the removal reports zero documents.
        let duplicated_ids = vec![S("1"), S("2"), S("1"), S("2")];
        let (indexer, removed_count) = indexer.remove_documents(duplicated_ids).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"0");

        let batch = documents!([
            { "id": 1, "doggo": "kevin" },
            { "id": 2, "doggo": { "name": "jean", "age": 20 } },
            { "id": 3, "name": "bob", "age": 25 },
        ]);
        let (indexer, added_count) = indexer.add_documents(batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"3");

        // The same duplicated list now reports two removed documents.
        let duplicated_ids = vec![S("1"), S("2"), S("1"), S("2")];
        let (indexer, removed_count) = indexer.remove_documents(duplicated_ids).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"2");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 3,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":3,"name":"bob","age":25}
        "###);
    }

    /// Adds a document in a first builder, then, in a second builder, removes
    /// it and adds a different document under the same id.
    #[test]
    fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
        let mut index = TempIndex::new();
        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

        // First transform: insert the document.
        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let initial_batch = documents!([
            { "id": 1, "doggo": "kevin" },
        ]);
        let (indexer, added_count) = indexer.add_documents(initial_batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"1");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 1,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"doggo":"kevin"}
        "###);

        // Second transform: remove the document, then add it again with another content.
        let config = index.index_documents_config.clone();
        let mut wtxn = index.write_txn().unwrap();
        let indexer =
            IndexDocuments::new(&mut wtxn, &index, &index.indexer_config, config, |_| (), || false)
                .unwrap();

        let (indexer, removed_count) = indexer.remove_documents(vec![S("1")]).unwrap();
        insta::assert_display_snapshot!(removed_count.unwrap(), @"1");

        let replacement_batch = documents!([
            { "id": 1, "catto": "jorts" },
        ]);
        let (indexer, added_count) = indexer.add_documents(replacement_batch).unwrap();
        insta::assert_display_snapshot!(added_count.unwrap(), @"1");

        let outcome = indexer.execute().unwrap();
        insta::assert_debug_snapshot!(outcome, @r###"
        DocumentAdditionResult {
            indexed_documents: 1,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        // Only the fields of the re-added document are stored.
        db_snap!(index, documents, @r###"
        {"id":1,"catto":"jorts"}
        "###);
    }

    /// Snapshots `word_fid_docids` and `word_position_docids` after a first
    /// batch, after a second batch, and after deleting two documents.
    #[test]
    fn test_word_fid_position() {
        let index = TempIndex::new();

        let first_batch = documents!([
          {"id": 0, "text": "sun flowers are looking at the sun" },
          {"id": 1, "text": "sun flowers are looking at the sun" },
          {"id": 2, "text": "the sun is shining today" },
          {
            "id": 3,
            "text": "a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a a a a a a
                a a a a a a a a a a a a a a a a a a a a a "
         }
        ]);
        index.add_documents(first_batch).unwrap();

        db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
        db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");

        let second_batch = documents!([
          {"id": 4, "text": "sun flowers are looking at the sun" },
          {"id": 5, "text2": "sun flowers are looking at the sun" },
          {"id": 6, "text": "b b b" },
          {
            "id": 7,
            "text2": "a a a a"
         }
        ]);
        index.add_documents(second_batch).unwrap();

        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");

        // Delete some of the documents, but not all of them.
        let ids_to_delete = vec![String::from("0"), String::from("3")];
        index.delete_documents(ids_to_delete);

        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }

    /// Documents may provide a different number of vectors for the same embedder;
    /// every vector must still have the same length (3 here, the configured dimension).
    #[test]
    fn test_multiple_vectors() {
        use crate::vector::settings::{EmbedderSource, EmbeddingSettings};

        let index = TempIndex::new();

        // Declare a single user-provided embedder named "manual" with 3 dimensions.
        index
            .update_settings(|settings| {
                let manual = EmbeddingSettings {
                    source: Setting::Set(EmbedderSource::UserProvided),
                    model: Setting::NotSet,
                    revision: Setting::NotSet,
                    api_key: Setting::NotSet,
                    dimensions: Setting::Set(3),
                    document_template: Setting::NotSet,
                    url: Setting::NotSet,
                    query: Setting::NotSet,
                    input_field: Setting::NotSet,
                    path_to_embeddings: Setting::NotSet,
                    embedding_object: Setting::NotSet,
                    input_type: Setting::NotSet,
                };
                let embedders = BTreeMap::from([("manual".to_string(), Setting::Set(manual))]);
                settings.set_embedder_settings(embedders);
            })
            .unwrap();

        // One batch per document: two vectors, one flat vector, then three vectors.
        let batches = [
            documents!([{"id": 0, "_vectors": { "manual": [[0, 1, 2], [3, 4, 5]] } }]),
            documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}]),
            documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]),
        ];
        for batch in batches {
            index.add_documents(batch).unwrap();
        }

        // A vector search must return all three documents.
        let rtxn = index.read_txn().unwrap();
        let query_vector = vec![0.0, 1.0, 2.0];
        let found = index.search(&rtxn).vector(query_vector).execute().unwrap();
        assert_eq!(found.documents_ids.len(), 3);
    }

    /// Replays, as three separate write transactions, the batch sequence recorded in
    /// the comment below: add external id 1, then delete 1 and add 0 in the same
    /// batch, then add 1 again. It finally checks that every id returned by a
    /// placeholder search can be fetched from the index.
    #[test]
    fn reproduce_the_bug() {
        /*
            [milli/examples/fuzz.rs:69] &batches = [
            Batch(
                [
                    AddDoc(
                        { "id": 1, "doggo": "bernese" }, => internal 0
                    ),
                ],
            ),
            Batch(
                [
                    DeleteDoc(
                        1, => delete internal 0
                    ),
                    AddDoc(
                        { "id": 0, "catto": "jorts" }, => internal 1
                    ),
                ],
            ),
            Batch(
                [
                    AddDoc(
                        { "id": 1, "catto": "jorts" }, => internal 2
                    ),
                ],
            ),
        ]
        */
        let index = TempIndex::new();

        // START OF BATCH

        println!("--- ENTERING BATCH 1");

        let mut wtxn = index.write_txn().unwrap();

        let builder = IndexDocuments::new(
            &mut wtxn,
            &index,
            &index.indexer_config,
            index.index_documents_config.clone(),
            |_| (),
            || false,
        )
        .unwrap();

        // OP

        let documents = documents!([
            { "id": 1, "doggo": "bernese" },
        ]);
        // `add_documents` consumes the builder and hands it back with the operation result.
        let (builder, added) = builder.add_documents(documents).unwrap();
        insta::assert_display_snapshot!(added.unwrap(), @"1");

        // FINISHING
        let addition = builder.execute().unwrap();
        insta::assert_debug_snapshot!(addition, @r###"
        DocumentAdditionResult {
            indexed_documents: 1,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"doggo":"bernese"}
        "###);
        db_snap!(index, external_documents_ids, @r###"
        docids:
        1                        0
        "###);

        // A first batch of documents has been inserted

        // BATCH 2

        println!("--- ENTERING BATCH 2");

        let mut wtxn = index.write_txn().unwrap();

        let builder = IndexDocuments::new(
            &mut wtxn,
            &index,
            &index.indexer_config,
            index.index_documents_config.clone(),
            |_| (),
            || false,
        )
        .unwrap();

        // Deletion and addition are queued on the same builder before `execute`.
        let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
        insta::assert_display_snapshot!(removed.unwrap(), @"1");

        let documents = documents!([
            { "id": 0, "catto": "jorts" },
        ]);
        let (builder, added) = builder.add_documents(documents).unwrap();
        insta::assert_display_snapshot!(added.unwrap(), @"1");

        let addition = builder.execute().unwrap();
        insta::assert_debug_snapshot!(addition, @r###"
        DocumentAdditionResult {
            indexed_documents: 1,
            number_of_documents: 1,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":0,"catto":"jorts"}
        "###);

        db_snap!(index, external_documents_ids, @r###"
        docids:
        0                        1
        "###);

        // BATCH 3

        println!("--- ENTERING BATCH 3");

        let mut wtxn = index.write_txn().unwrap();

        let builder = IndexDocuments::new(
            &mut wtxn,
            &index,
            &index.indexer_config,
            index.index_documents_config.clone(),
            |_| (),
            || false,
        )
        .unwrap();

        // Re-add the external id that was deleted in the previous batch.
        let documents = documents!([
            { "id": 1, "catto": "jorts" },
        ]);
        let (builder, added) = builder.add_documents(documents).unwrap();
        insta::assert_display_snapshot!(added.unwrap(), @"1");

        let addition = builder.execute().unwrap();
        insta::assert_debug_snapshot!(addition, @r###"
        DocumentAdditionResult {
            indexed_documents: 1,
            number_of_documents: 2,
        }
        "###);
        wtxn.commit().unwrap();

        db_snap!(index, documents, @r###"
        {"id":1,"catto":"jorts"}
        {"id":0,"catto":"jorts"}
        "###);

        // Ensure that all the returned ids actually exist: fetching them must not fail.
        let rtxn = index.read_txn().unwrap();
        let res = index.search(&rtxn).execute().unwrap();
        index.documents(&rtxn, res.documents_ids).unwrap();
    }

    /// Deletes the documents identified by `external_ids` inside `wtxn` and returns
    /// the internal ids they were mapped to before the deletion.
    ///
    /// Panics if one of the external ids is not known to the index.
    fn delete_documents<'t>(
        wtxn: &mut RwTxn<'t>,
        index: &'t TempIndex,
        external_ids: &[&str],
    ) -> Vec<u32> {
        // Resolve the internal ids first, while the external ids are still mapped.
        let mapping = index.external_documents_ids();
        let mut internal_ids = Vec::with_capacity(external_ids.len());
        for external_id in external_ids {
            let docid = mapping.get(wtxn, external_id).unwrap().unwrap();
            internal_ids.push(docid);
        }

        // Then run the actual deletion on owned copies of the external ids.
        let owned_ids: Vec<String> = external_ids.iter().map(|id| id.to_string()).collect();
        index.delete_documents_using_wtxn(wtxn, owned_ids);

        internal_ids
    }

    /// Adds three documents keyed by a numeric `id`, deletes all of them in the same
    /// write transaction and snapshots the resulting databases.
    #[test]
    fn delete_documents_with_numbers_as_primary_key() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
                    { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
                    { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
                ]),
            )
            .unwrap();

        // Delete every document added above, addressed by external ids "0", "1" and "2".
        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);

        wtxn.commit().unwrap();

        // All these snapshots should be empty since every document was deleted.
        db_snap!(index, documents_ids);
        db_snap!(index, word_docids);
        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, facet_id_exists_docids);

        let rtxn = index.read_txn().unwrap();

        // No field may remain in the distribution once all documents are gone.
        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
    }

    /// Adds three documents identified by the `mysuperid` field, deletes two of them
    /// in a second transaction and snapshots the resulting databases.
    #[test]
    fn delete_documents_with_strange_primary_key() {
        let index = TempIndex::new();

        // Only `name` is searchable.
        index
            .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
            .unwrap();

        // The primary key is not set explicitly; `mysuperid` is the only id-like field.
        let mut wtxn = index.write_txn().unwrap();
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "mysuperid": 0, "name": "kevin" },
                    { "mysuperid": 1, "name": "kevina" },
                    { "mysuperid": 2, "name": "benoit" }
                ]),
            )
            .unwrap();
        wtxn.commit().unwrap();

        let mut wtxn = index.write_txn().unwrap();

        // Delete only some of the documents: external ids "0" and "1".
        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);

        wtxn.commit().unwrap();

        db_snap!(index, documents_ids);
        db_snap!(index, word_docids);
        db_snap!(index, word_pair_proximity_docids);
    }

    /// After deleting the only document labelled "sign", a placeholder search filtered
    /// on `label = sign` must return nothing. The databases are snapshotted afterwards.
    #[test]
    fn filtered_placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();

        // `docid` is the primary key; both label fields are filterable.
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
                settings.set_filterable_fields(hashset! { S("label"), S("label2") });
            })
            .unwrap();

        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "docid": "1_4",  "label": ["sign"] },
                    { "docid": "1_5",  "label": ["letter"] },
                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                    { "docid": "1_39", "label": ["abstract"] },
                    { "docid": "1_40", "label": ["cartoon"] },
                    { "docid": "1_41", "label": ["art","drawing"] },
                    { "docid": "1_42", "label": ["art","pattern"] },
                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                    { "docid": "1_44", "label": ["drawing"] },
                    { "docid": "1_45", "label": ["art"] },
                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                    { "docid": "1_47", "label": ["abstract","pattern"] },
                    { "docid": "1_52", "label": ["abstract","cartoon"] },
                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                    { "docid": "1_68", "label": ["design"] },
                    { "docid": "1_69", "label": ["geometry"] },
                    { "docid": "1_70", "label2": ["geometry", 1.2] },
                    { "docid": "1_71", "label2": ["design", 2.2] },
                    { "docid": "1_72", "label2": ["geometry", 1.2] }
                ]),
            )
            .unwrap();

        // "1_4" is the only document labelled "sign"; "1_70" and "1_72" only have `label2`.
        delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);

        // Placeholder search with filter, run inside the still-open write transaction.
        let filter = Filter::from_str("label = sign").unwrap().unwrap();
        let results = index.search(&wtxn).filter(filter).execute().unwrap();
        assert!(results.documents_ids.is_empty());

        wtxn.commit().unwrap();

        db_snap!(index, word_docids);
        db_snap!(index, facet_id_f64_docids);
        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, facet_id_exists_docids);
        db_snap!(index, facet_id_string_docids);
    }

    /// A placeholder search (no query, no filter) run after a deletion must still
    /// return documents, but never the deleted one.
    #[test]
    fn placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        let corpus = documents!([
            { "docid": "1_4",  "label": ["sign"] },
            { "docid": "1_5",  "label": ["letter"] },
            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
            { "docid": "1_36", "label": ["drawing","painting","pattern"] },
            { "docid": "1_37", "label": ["art","drawing","outdoor"] },
            { "docid": "1_38", "label": ["aquarium","art","drawing"] },
            { "docid": "1_39", "label": ["abstract"] },
            { "docid": "1_40", "label": ["cartoon"] },
            { "docid": "1_41", "label": ["art","drawing"] },
            { "docid": "1_42", "label": ["art","pattern"] },
            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
            { "docid": "1_44", "label": ["drawing"] },
            { "docid": "1_45", "label": ["art"] },
            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
            { "docid": "1_47", "label": ["abstract","pattern"] },
            { "docid": "1_52", "label": ["abstract","cartoon"] },
            { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
            { "docid": "1_58", "label": ["abstract","art","cartoon"] },
            { "docid": "1_68", "label": ["design"] },
            { "docid": "1_69", "label": ["geometry"] },
            { "docid": "1_70", "label2": ["geometry", 1.2] },
            { "docid": "1_71", "label2": ["design", 2.2] },
            { "docid": "1_72", "label2": ["geometry", 1.2] }
        ]);
        index.add_documents_using_wtxn(&mut wtxn, corpus).unwrap();

        let removed = delete_documents(&mut wtxn, &index, &["1_4"]);

        // Search inside the still-open write transaction so the deletion is visible.
        let hits = index.search(&wtxn).execute().unwrap().documents_ids;
        assert!(!hits.is_empty());
        if let Some(id) = hits.iter().find(|id| removed.contains(id)) {
            panic!("The document {} was supposed to be deleted", id);
        }

        wtxn.commit().unwrap();
    }

    /// A query search for "abstract" run after a deletion must still return
    /// documents, but never the deleted ones.
    #[test]
    fn search_should_not_return_deleted_documents() {
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        let corpus = documents!([
            { "docid": "1_4",  "label": ["sign"] },
            { "docid": "1_5",  "label": ["letter"] },
            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
            { "docid": "1_36", "label": ["drawing","painting","pattern"] },
            { "docid": "1_37", "label": ["art","drawing","outdoor"] },
            { "docid": "1_38", "label": ["aquarium","art","drawing"] },
            { "docid": "1_39", "label": ["abstract"] },
            { "docid": "1_40", "label": ["cartoon"] },
            { "docid": "1_41", "label": ["art","drawing"] },
            { "docid": "1_42", "label": ["art","pattern"] },
            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
            { "docid": "1_44", "label": ["drawing"] },
            { "docid": "1_45", "label": ["art"] },
            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
            { "docid": "1_47", "label": ["abstract","pattern"] },
            { "docid": "1_52", "label": ["abstract","cartoon"] },
            { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
            { "docid": "1_58", "label": ["abstract","art","cartoon"] },
            { "docid": "1_68", "label": ["design"] },
            { "docid": "1_69", "label": ["geometry"] },
            { "docid": "1_70", "label2": ["geometry", 1.2] },
            { "docid": "1_71", "label2": ["design", 2.2] },
            { "docid": "1_72", "label2": ["geometry", 1.2] }
        ]);
        index.add_documents_using_wtxn(&mut wtxn, corpus).unwrap();

        // Both deleted documents carry the "abstract" label that is searched below.
        let removed = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);

        let hits = index.search(&wtxn).query("abstract").execute().unwrap().documents_ids;
        assert!(!hits.is_empty());
        if let Some(id) = hits.iter().find(|id| removed.contains(id)) {
            panic!("The document {} was supposed to be deleted", id);
        }

        wtxn.commit().unwrap();
    }

    /// A placeholder search restricted by a `_geoRadius` filter must not return
    /// documents that were deleted, and the facet databases are snapshotted afterwards.
    #[test]
    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        // `_geo` has to be filterable for the radius filter used below.
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("id"));
                settings.set_filterable_fields(hashset!(S("_geo")));
                settings.set_sortable_fields(hashset!(S("_geo")));
            })
            .unwrap();

        let cities = documents!([
            { "id": "1",  "city": "Lille",             "_geo": { "lat": 50.6299, "lng": 3.0569 } },
            { "id": "2",  "city": "Mons-en-Barœul",    "_geo": { "lat": 50.6415, "lng": 3.1106 } },
            { "id": "3",  "city": "Hellemmes",         "_geo": { "lat": 50.6312, "lng": 3.1106 } },
            { "id": "4",  "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
            { "id": "5",  "city": "Hem",               "_geo": { "lat": 50.6552, "lng": 3.1897 } },
            { "id": "6",  "city": "Roubaix",           "_geo": { "lat": 50.6924, "lng": 3.1763 } },
            { "id": "7",  "city": "Tourcoing",         "_geo": { "lat": 50.7263, "lng": 3.1541 } },
            { "id": "8",  "city": "Mouscron",          "_geo": { "lat": 50.7453, "lng": 3.2206 } },
            { "id": "9",  "city": "Tournai",           "_geo": { "lat": 50.6053, "lng": 3.3758 } },
            { "id": "10", "city": "Ghent",             "_geo": { "lat": 51.0537, "lng": 3.6957 } },
            { "id": "11", "city": "Brussels",          "_geo": { "lat": 50.8466, "lng": 4.3370 } },
            { "id": "12", "city": "Charleroi",         "_geo": { "lat": 50.4095, "lng": 4.4347 } },
            { "id": "13", "city": "Mons",              "_geo": { "lat": 50.4502, "lng": 3.9623 } },
            { "id": "14", "city": "Valenciennes",      "_geo": { "lat": 50.3518, "lng": 3.5326 } },
            { "id": "15", "city": "Arras",             "_geo": { "lat": 50.2844, "lng": 2.7637 } },
            { "id": "16", "city": "Cambrai",           "_geo": { "lat": 50.1793, "lng": 3.2189 } },
            { "id": "17", "city": "Bapaume",           "_geo": { "lat": 50.1112, "lng": 2.8547 } },
            { "id": "18", "city": "Amiens",            "_geo": { "lat": 49.9314, "lng": 2.2710 } },
            { "id": "19", "city": "Compiègne",         "_geo": { "lat": 49.4449, "lng": 2.7913 } },
            { "id": "20", "city": "Paris",             "_geo": { "lat": 48.9021, "lng": 2.3708 } }
        ]);
        index.add_documents_using_wtxn(&mut wtxn, cities).unwrap();

        let removed =
            delete_documents(&mut wtxn, &index, &["5", "6", "7", "12", "17", "19"]);

        // Placeholder search with geo filter, centred on the coordinates of document "6".
        let radius_filter =
            Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
        let hits = index.search(&wtxn).filter(radius_filter).execute().unwrap().documents_ids;
        assert!(!hits.is_empty());
        if let Some(id) = hits.iter().find(|id| removed.contains(id)) {
            panic!("The document {} was supposed to be deleted", id);
        }

        wtxn.commit().unwrap();

        db_snap!(index, facet_id_f64_docids);
        db_snap!(index, facet_id_string_docids);
    }

    /// Deleted documents must disappear from the document listing, from the set of
    /// internal ids and from the external-id mapping.
    #[test]
    fn get_documents_should_not_return_deleted_documents() {
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        let corpus = documents!([
            { "docid": "1_4",  "label": ["sign"] },
            { "docid": "1_5",  "label": ["letter"] },
            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
            { "docid": "1_36", "label": ["drawing","painting","pattern"] },
            { "docid": "1_37", "label": ["art","drawing","outdoor"] },
            { "docid": "1_38", "label": ["aquarium","art","drawing"] },
            { "docid": "1_39", "label": ["abstract"] },
            { "docid": "1_40", "label": ["cartoon"] },
            { "docid": "1_41", "label": ["art","drawing"] },
            { "docid": "1_42", "label": ["art","pattern"] },
            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
            { "docid": "1_44", "label": ["drawing"] },
            { "docid": "1_45", "label": ["art"] },
            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
            { "docid": "1_47", "label": ["abstract","pattern"] },
            { "docid": "1_52", "label": ["abstract","cartoon"] },
            { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
            { "docid": "1_58", "label": ["abstract","art","cartoon"] },
            { "docid": "1_68", "label": ["design"] },
            { "docid": "1_69", "label": ["geometry"] },
            { "docid": "1_70", "label2": ["geometry", 1.2] },
            { "docid": "1_71", "label2": ["design", 2.2] },
            { "docid": "1_72", "label2": ["geometry", 1.2] }
        ]);
        index.add_documents_using_wtxn(&mut wtxn, corpus).unwrap();

        let removed_external = ["1_7", "1_52"];
        let removed_internal = delete_documents(&mut wtxn, &index, &removed_external);

        // The full document listing must skip the deleted documents.
        for entry in index.all_documents(&wtxn).unwrap() {
            let (docid, _) = entry.unwrap();
            assert!(
                !removed_internal.contains(&docid),
                "The document {} was supposed to be deleted",
                docid
            );
        }

        // So must the set of internal document ids.
        for docid in index.documents_ids(&wtxn).unwrap() {
            assert!(
                !removed_internal.contains(&docid),
                "The document {} was supposed to be deleted",
                docid
            );
        }
        wtxn.commit().unwrap();

        // Once committed, the deleted external ids must no longer resolve to anything.
        let rtxn = index.read_txn().unwrap();
        let external_mapping = index.external_documents_ids();
        for external_id in removed_external {
            let lookup = external_mapping.get(&rtxn, external_id).unwrap();
            assert!(lookup.is_none(), "The document {} was supposed to be deleted", external_id);
        }
        drop(rtxn);
    }

    /// The document count and the field distribution must stop accounting for
    /// documents that were deleted.
    #[test]
    fn stats_should_not_return_deleted_documents() {
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        let corpus = documents!([
            { "docid": "1_4",  "label": ["sign"]},
            { "docid": "1_5",  "label": ["letter"]},
            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
            { "docid": "1_39", "label": ["abstract"]},
            { "docid": "1_40", "label": ["cartoon"]},
            { "docid": "1_41", "label": ["art","drawing"]},
            { "docid": "1_42", "label": ["art","pattern"]},
            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
            { "docid": "1_45", "label": ["art"]},
            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
            { "docid": "1_47", "label": ["abstract","pattern"]},
            { "docid": "1_52", "label": ["abstract","cartoon"]},
            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
            { "docid": "1_68", "label": ["design"]},
            { "docid": "1_69", "label": ["geometry"]}
        ]);
        index.add_documents_using_wtxn(&mut wtxn, corpus).unwrap();

        // "1_7" is one of the two documents with a `title`.
        delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);

        // 20 documents were added and 2 deleted.
        let document_count = index.number_of_documents(&wtxn).unwrap();
        assert_eq!(18, document_count);

        // Each field is counted once per remaining document that contains it.
        let distribution = index.field_distribution(&wtxn).unwrap();
        for (field, expected) in [("label", 18), ("title", 1), ("number", 2)] {
            assert_eq!(Some(&expected), distribution.get(field));
        }

        wtxn.commit().unwrap();
    }

    /// The per-(script, language) document ids stored by the index must drop a
    /// document once it has been deleted.
    #[test]
    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
        use charabia::{Language, Script};

        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();

        let multilingual = documents!([
            { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
            { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
            { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
            { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
            { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
            { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
        ]);
        index.add_documents_using_wtxn(&mut wtxn, multilingual).unwrap();

        let key_cmn = (Script::Cj, Language::Cmn);

        // Before the deletion, documents 1 and 5 are registered under (Cj, Cmn).
        let before =
            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
        let expected_before: RoaringBitmap = [1u32, 5].into_iter().collect();
        assert_eq!(before, expected_before);

        delete_documents(&mut wtxn, &index, &["1"]);
        wtxn.commit().unwrap();

        // After the deletion has been committed, only document 5 remains.
        let rtxn = index.read_txn().unwrap();
        let after =
            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
        let expected_after: RoaringBitmap = [5u32].into_iter().collect();
        assert_eq!(after, expected_after);
    }

    /// Deleting the only document that holds "hello" in an exact attribute must empty
    /// `exact_word_docids` while the word stays searchable through the other document.
    #[test]
    fn delete_words_exact_attributes() {
        let index = TempIndex::new();

        // Both fields are searchable; `exact` is additionally declared as an exact attribute.
        index
            .update_settings(|settings| {
                settings.set_primary_key(S("id"));
                settings.set_searchable_fields(vec![S("text"), S("exact")]);
                settings.set_exact_attributes(std::iter::once(S("exact")).collect());
            })
            .unwrap();

        let greetings = documents!([
            { "id": 0, "text": "hello" },
            { "id": 1, "exact": "hello"}
        ]);
        index.add_documents(greetings).unwrap();

        // Step 1: the same word lands in a different database depending on the field.
        db_snap!(index, word_docids, 1, @r###"
        hello            [0, ]
        "###);
        db_snap!(index, exact_word_docids, 1, @r###"
        hello            [1, ]
        "###);
        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

        let deleted_internal_ids = {
            let mut wtxn = index.write_txn().unwrap();
            let ids = delete_documents(&mut wtxn, &index, &["1"]);
            wtxn.commit().unwrap();
            ids
        };

        // Step 2: only the exact entry is gone; the words FST is unchanged.
        db_snap!(index, word_docids, 2, @r###"
        hello            [0, ]
        "###);
        db_snap!(index, exact_word_docids, 2, @"");
        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");

        let txn = index.read_txn().unwrap();
        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);

        // Searching the word must only return the surviving document.
        let documents_ids = Search::new(&txn, &index).query("hello").execute().unwrap().documents_ids;
        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
    }
}
-												Rename the validate function as an enriching function

											
										
										
											2022-06-21 11:14:14 +02:00
+								mod enrich;
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								mod extract;
 								mod helpers;
 								mod transform;
 								mod typed_chunk;
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								use std::collections::{HashMap, HashSet};
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								use std::io::{Read, Seek};
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								use std::iter::FromIterator;
-												Document and refine facet indexing algorithms

											
										
										
											2022-09-07 16:44:08 +02:00
+								use std::num::NonZeroU32;
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								use std::result::Result as StdResult;
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								use crossbeam_channel::{Receiver, Sender};
-												Compute chunk size based on the input data size and the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								use grenad::{Merger, MergerBuilder};
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								use heed::types::Str;
 								use heed::Database;
-												WIP arroy integration

											
										
										
											2023-12-07 13:33:15 +01:00
+								use rand::SeedableRng;
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								use roaring::RoaringBitmap;
-												format the whole project

											
										
										
											2021-06-16 18:33:33 +02:00
+								use serde::{Deserialize, Serialize};
-												Compute the new, common and, deleted prefix words fst once

											
										
										
											2022-01-27 11:00:18 +01:00
+								use slice_group_by::GroupBy;
-												Compute chunk size based on the input data size and the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								use tracing::debug;
 								use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
-												Introduce the searchable parameter settings to the Settings update

											
										
										
											2020-11-03 13:20:11 +01:00
-												Rename the validate function as an enriching function

											
										
										
											2022-06-21 11:14:14 +02:00
+								use self::enrich::enrich_documents_batch;
-												fix warning

											
										
										
											2024-01-02 15:19:00 +01:00
+								pub use self::enrich::{extract_finite_float_from_value, DocumentId};
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								pub use self::helpers::{
-												Change the behavior of the as_cloneable_grenad by taking a ref

											
										
										
											2022-02-16 15:40:08 +01:00
+								    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-												fix PR comments

											
										
										
											2024-02-13 15:14:03 +01:00
+								    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
 								    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
 								    valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								};
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								pub use self::transform::{Transform, TransformOutput};
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								use crate::documents::{obkv_to_object, DocumentsBatchReader};
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								use crate::error::{Error, InternalError, UserError};
-												Bring the newly created word pair proximity docids

											
										
										
											2022-01-18 14:59:51 +01:00
+								pub use crate::update::index_documents::helpers::CursorClonableMmap;
-												format the whole project

											
										
										
											2021-06-16 18:33:33 +02:00
+								use crate::update::{
-												Remove word pair proximity prefix cache and compute it at search time

											
										
										
											2023-11-08 14:16:01 +01:00
+								    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
-												format the whole project

											
										
										
											2021-06-16 18:33:33 +02:00
+								};
-												Various changes

- DistributionShift in Search object (to be set from model in embed?)
- Fix issue where embedder index wasn't computed at search time
- Accept as default embedder either the "default" one, or the only embedder when there is only one

											
										
										
											2023-12-13 15:38:44 +01:00
+								use crate::vector::EmbeddingConfigs;
-												clean PR warnings

											
										
										
											2023-10-10 11:23:16 +02:00
+								use crate::{CboRoaringBitmapCodec, Index, Result};
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								static MERGED_DATABASE_COUNT: usize = 7;
-												Remove word pair proximity prefix cache and compute it at search time

											
										
										
											2023-11-08 14:16:01 +01:00
+								static PREFIX_DATABASE_COUNT: usize = 4;
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
-												improve document addition returned metaimprove document addition
returned metaimprove document addition returned metaimprove document
addition returned metaimprove document addition returned metaimprove
document addition returned metaimprove document addition returned
metaimprove document addition returned meta

											
										
										
											2021-11-10 14:08:36 +01:00
+								#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-												return documents number on addition

											
										
										
											2020-12-30 18:43:50 +01:00
+								pub struct DocumentAdditionResult {
-												improve document addition returned metaimprove document addition
returned metaimprove document addition returned metaimprove document
addition returned metaimprove document addition returned metaimprove
document addition returned metaimprove document addition returned
metaimprove document addition returned meta

											
										
										
											2021-11-10 14:08:36 +01:00
+								    /// The number of documents that were indexed during the update
 								    pub indexed_documents: u64,
 								    /// The total number of documents in the index after the update
 								    pub number_of_documents: u64,
-												return documents number on addition

											
										
										
											2020-12-30 18:43:50 +01:00
+								}
-												derive serde for method and format

This is nicer when working with UpdateMeta struct

											
										
										
											2020-12-22 18:17:35 +01:00
+								#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-												Update the Transform struct to support JSON updates

											
										
										
											2020-10-31 16:10:15 +01:00
+								#[non_exhaustive]
-												Move the IndexDocuments update into its own module

											
										
										
											2020-10-26 11:02:44 +01:00
 								/// Selects how a new version of a document is combined with its previous version.
 								///
 								/// The `Default` implementation of this enum returns [`IndexDocumentsMethod::ReplaceDocuments`].
+								pub enum IndexDocumentsMethod {
 								    /// Replace the previous document with the new one,
 								    /// removing all the already known attributes.
 								    ReplaceDocuments,
 								    /// Merge the previous version of the document with the new version,
 								    /// replacing old attributes values with the new ones and add the new attributes.
 								    UpdateDocuments,
 								}
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								impl Default for IndexDocumentsMethod {
 								    /// The default method is [`IndexDocumentsMethod::ReplaceDocuments`].
 								    fn default() -> IndexDocumentsMethod {
 								        IndexDocumentsMethod::ReplaceDocuments
 								    }
 								}
-												Make the changes to use heed v0.20-alpha.6

											
										
										
											2023-11-22 18:21:19 +01:00
+								pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
 								    wtxn: &'t mut heed::RwTxn<'i>,
-												Move the IndexDocuments update into its own module

											
										
										
											2020-10-26 11:02:44 +01:00
+								    index: &'i Index,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    config: IndexDocumentsConfig,
 								    indexer_config: &'a IndexerConfig,
 								    transform: Option<Transform<'a, 'i>>,
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								    progress: FP,
 								    should_abort: FA,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    added_documents: u64,
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								    deleted_documents: u64,
-												Various changes

- DistributionShift in Search object (to be set from model in embed?)
- Fix issue where embedder index wasn't computed at search time
- Accept as default embedder either the "default" one, or the only embedder when there is only one

											
										
										
											2023-12-13 15:38:44 +01:00
+								    embedders: EmbeddingConfigs,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								}
 								#[derive(Default, Debug, Clone)]
 								pub struct IndexDocumentsConfig {
 								    pub words_prefix_threshold: Option<u32>,
 								    pub max_prefix_length: Option<usize>,
 								    pub words_positions_level_group_size: Option<NonZeroU32>,
 								    pub words_positions_min_level_size: Option<NonZeroU32>,
 								    pub update_method: IndexDocumentsMethod,
 								    pub autogenerate_docids: bool,
-												Move the IndexDocuments update into its own module

											
										
										
											2020-10-26 11:02:44 +01:00
+								}
-												Make the changes to use heed v0.20-alpha.6

											
										
										
											2023-11-22 18:21:19 +01:00
+								impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								where
-												Compute chunk size based on the input data size and the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								    FP: Fn(UpdateIndexingStep) + Sync + Send,
 								    FA: Fn() -> bool + Sync + Send,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								{
-												Use update_id in UpdateBuilder

Add the `update_id` to the updates. The rationale is the
following:
- It allows for better traceability of the update events, thus improved
  debugging and logging.
- The engine is now aware of what it has already processed, and can return
  it if asked. It may not make sense now, but in the future, the update
  store may not work the same way, and this information about the state
  of the engine will be desirable (distributed environment).

											
										
										
											2020-12-22 16:21:07 +01:00
+								    pub fn new(
-												Make the changes to use heed v0.20-alpha.6

											
										
										
											2023-11-22 18:21:19 +01:00
+								        wtxn: &'t mut heed::RwTxn<'i>,
-												Use update_id in UpdateBuilder

Add the `update_id` to the updates. The rationale is the
following:
- It allows for better traceability of the update events, thus improved
  debugging and logging.
- The engine is now aware of what it has already processed, and can return
  it if asked. It may not make sense now, but in the future, the update
  store may not work the same way, and this information about the state
  of the engine will be desirable (distributed environment).

											
										
										
											2020-12-22 16:21:07 +01:00
+								        index: &'i Index,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        indexer_config: &'a IndexerConfig,
 								        config: IndexDocumentsConfig,
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        progress: FP,
 								        should_abort: FA,
-												Make the changes to use heed v0.20-alpha.6

											
										
										
											2023-11-22 18:21:19 +01:00
+								    ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        let transform = Some(Transform::new(
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								            wtxn,
-												Fixing piles of clippy errors.

Most of these are calling clone when the struct supports Copy.

Many are using & and &mut on `self` when the function they are called
from already has an immutable or mutable borrow so this isn't needed.

I tried to stay away from actual changes or places where I'd have to
name fresh variables.

											
										
										
											2022-10-13 22:02:54 +02:00
+								            index,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								            indexer_config,
 								            config.update_method,
 								            config.autogenerate_docids,
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        )?);
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        Ok(IndexDocuments {
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								            transform,
 								            config,
 								            indexer_config,
 								            progress,
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								            should_abort,
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								            wtxn,
 								            index,
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								            added_documents: 0,
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								            deleted_documents: 0,
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								            embedders: Default::default(),
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        })
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								    }
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    /// Adds a batch of documents to the current builder.
 								    ///
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								    /// Since the documents are progressively added to the writer, a failure will only
 								    /// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward.
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    ///
 								    /// Returns the number of documents added to the builder.
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								    pub fn add_documents<R: Read + Seek>(
 								        mut self,
 								        reader: DocumentsBatchReader<R>,
 								    ) -> Result<(Self, StdResult<u64, UserError>)> {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        puffin::profile_function!();
-												meilisearch compatible primary key inference

											
										
										
											2021-05-06 21:16:40 +02:00
+								        // Early return when there is no document to add
-												Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits

											
										
										
											2021-08-31 11:44:15 +02:00
+								        if reader.is_empty() {
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								            return Ok((self, Ok(0)));
-												early return on empty document addition

											
										
										
											2021-05-06 18:14:16 +02:00
+								        }
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								        // We check for user errors in this validator and if there is one, we can return
 								        // the `IndexDocuments` struct as it is valid to send more documents into it.
 								        // However, if there is an internal error we throw it away!
-												Rename the validate function as an enriching function

											
										
										
											2022-06-21 11:14:14 +02:00
+								        let enriched_documents_reader = match enrich_documents_batch(
-												Fix the indexation tests

											
										
										
											2022-06-15 14:35:19 +02:00
+								            self.wtxn,
 								            self.index,
 								            self.config.autogenerate_docids,
 								            reader,
 								        )? {
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								            Ok(reader) => reader,
 								            Err(user_error) => return Ok((self, Err(user_error))),
 								        };
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        let indexed_documents =
 								            self.transform.as_mut().expect("Invalid document addition state").read_documents(
 								                enriched_documents_reader,
 								                self.wtxn,
 								                &self.progress,
 								                &self.should_abort,
 								            )? as u64;
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        self.added_documents += indexed_documents;
-												return documents number on addition

											
										
										
											2020-12-30 18:43:50 +01:00
-												Introduce the validate_documents_batch function

											
										
										
											2022-06-14 18:12:15 +02:00
+								        Ok((self, Ok(indexed_documents)))
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    }
-												Introduce the searchable parameter settings to the Settings update

											
										
										
											2020-11-03 13:20:11 +01:00
-												Various changes

- DistributionShift in Search object (to be set from model in embed?)
- Fix issue where embedder index wasn't computed at search time
- Accept as default embedder either the "default" one, or the only embedder when there is only one

											
										
										
											2023-12-13 15:38:44 +01:00
+								    pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								        self.embedders = embedders;
 								        self
 								    }
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								    /// Remove a batch of documents from the current builder.
 								    ///
 								    /// Returns the number of documents deleted from the builder.
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								    pub fn remove_documents(
 								        mut self,
 								        to_delete: Vec<String>,
 								    ) -> Result<(Self, StdResult<u64, UserError>)> {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        puffin::profile_function!();
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        // Early return when there is no document to delete
 								        if to_delete.is_empty() {
-												Stop using delete documents pipeline in batch runner

											
										
										
											2023-10-25 13:41:11 +02:00
+								            // Maintains Invariant: remove documents actually always returns Ok for the inner result
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								            return Ok((self, Ok(0)));
 								        }
 								        let deleted_documents = self
 								            .transform
 								            .as_mut()
 								            .expect("Invalid document deletion state")
 								            .remove_documents(to_delete, self.wtxn, &self.should_abort)?
 								            as u64;
 								        self.deleted_documents += deleted_documents;
-												Stop using delete documents pipeline in batch runner

											
										
										
											2023-10-25 13:41:11 +02:00
+								        // Maintains Invariant: remove documents actually always returns Ok for the inner result
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        Ok((self, Ok(deleted_documents)))
 								    }
-												Batch::remove_documents_from_db_no_batch

											
										
										
											2023-11-09 14:23:02 +01:00
+								    /// Removes documents from db using their internal document ids.
 								    ///
 								    /// # Warning
 								    ///
 								    /// This function is dangerous and will only work correctly if:
 								    ///
 								    /// - All the passed ids currently exist in the database
 								    /// - No batching using the standard `remove_documents` and `add_documents` took place
 								    ///
 								    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    #[tracing::instrument(level = "trace", skip_all, target = "indexing::details")]
-												Batch::remove_documents_from_db_no_batch

											
										
										
											2023-11-09 14:23:02 +01:00
+								    pub fn remove_documents_from_db_no_batch(
 								        mut self,
 								        to_delete: &RoaringBitmap,
 								    ) -> Result<(Self, u64)> {
 								        puffin::profile_function!();
 								        // Nothing to delete: hand the builder back untouched with a count of zero.
 								        if to_delete.is_empty() {
 								            return Ok((self, 0));
 								        }
 								        // Delegate the removal to the transform; it reports how many documents it deleted.
 								        let transform = self.transform.as_mut().expect("Invalid document deletion state");
 								        let removed = transform.remove_documents_from_db_no_batch(
 								            to_delete,
 								            self.wtxn,
 								            &self.should_abort,
 								        )? as u64;
 								        // Accumulate into the builder-wide deletion counter before giving the builder back.
 								        self.deleted_documents += removed;
 								        Ok((self, removed))
 								    }
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    // Span arguments are comma-separated, matching the other `tracing::instrument`
 								    // attributes in this impl (a comma was missing after `level`).
 								    #[tracing::instrument(
 								        level = "trace",
 								        skip_all,
 								        target = "indexing::documents",
 								        name = "index_documents"
 								    )]
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    pub fn execute(mut self) -> Result<DocumentAdditionResult> {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        puffin::profile_function!();
-												Actually execute the transform even if there are only documents to delete

											
										
										
											2023-10-25 17:32:45 +02:00
+								        if self.added_documents == 0 && self.deleted_documents == 0 {
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								            let number_of_documents = self.index.number_of_documents(self.wtxn)?;
 								            return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
 								        }
 								        let output = self
 								            .transform
 								            .take()
 								            .expect("Invalid document addition state")
 								            .output_from_sorter(self.wtxn, &self.progress)?;
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
 								        let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
 								        self.index.put_faceted_fields(self.wtxn, &new_facets)?;
-												fix the searchable fields bug when a field is nested

Update milli/src/index.rs

Co-authored-by: Clément Renault <clement@meilisearch.com>

											
										
										
											2022-05-16 15:22:52 +02:00
+								        // in case new fields were introduced we're going to recreate the searchable fields.
 								        if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? {
 								            // `faceted_fields` actually holds the user-defined searchable fields; we can't keep
 								            // references on them while we update the index thus we need to own them.
 								            let faceted_fields: Vec<String> =
 								                faceted_fields.into_iter().map(str::to_string).collect();
 								            self.index.put_all_searchable_fields_from_fields_ids_map(
 								                self.wtxn,
 								                &faceted_fields.iter().map(String::as_ref).collect::<Vec<_>>(),
 								                &output.fields_ids_map,
 								            )?;
 								        }
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        let indexed_documents = output.documents_count as u64;
 								        let number_of_documents = self.execute_raw(output)?;
-												Introduce the searchable parameter settings to the Settings update

											
										
										
											2020-11-03 13:20:11 +01:00
-												improve document addition returned metaimprove document addition
returned metaimprove document addition returned metaimprove document
addition returned metaimprove document addition returned metaimprove
document addition returned metaimprove document addition returned
metaimprove document addition returned meta

											
										
										
											2021-11-10 14:08:36 +01:00
+								        Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
 								    }
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
-												improve document addition returned metaimprove document addition
returned metaimprove document addition returned metaimprove document
addition returned metaimprove document addition returned metaimprove
document addition returned metaimprove document addition returned
metaimprove document addition returned meta

											
										
										
											2021-11-10 14:08:36 +01:00
+								    /// Returns the total number of documents in the index after the update.
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    #[tracing::instrument(
 								        level = "trace",
 								        skip_all,
-												Meilisearch: fix some wrong spans

											
										
										
											2024-02-26 16:38:17 +01:00
+								        target = "indexing::details",
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								        name = "index_documents_raw"
 								    )]
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								    pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
-												Introduce the searchable parameter settings to the Settings update

											
										
										
											2020-11-03 13:20:11 +01:00
+								    where
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        FP: Fn(UpdateIndexingStep) + Sync,
 								        FA: Fn() -> bool + Sync,
-												Introduce the searchable parameter settings to the Settings update

											
										
										
											2020-11-03 13:20:11 +01:00
+								    {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        puffin::profile_function!();
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								        let TransformOutput {
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								            primary_key,
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								            fields_ids_map,
-												rename fields_distribution in field_distribution

											
										
										
											2021-06-17 15:16:20 +02:00
+								            field_distribution,
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								            documents_count,
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								            original_documents,
 								            flattened_documents,
-												Update the Transform struct to support JSON updates

											
										
										
											2020-10-31 16:10:15 +01:00
+								        } = output;
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												Introduce an empty FilterCondition variant to support unknown fields

											
										
										
											2021-07-27 16:24:21 +02:00
+								        // The fields_ids_map is put back to the store now so the rest of the transaction sees an
 								        // up to date field map.
 								        self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								        let backup_pool;
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        let pool = match self.indexer_config.thread_pool {
 								            Some(ref pool) => pool,
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								            #[cfg(not(test))]
 								            None => {
 								                // We initialize a backup pool with the default
 								                // settings if none have already been set.
 								                backup_pool = rayon::ThreadPoolBuilder::new().build()?;
 								                &backup_pool
 								            }
 								            #[cfg(test)]
 								            None => {
 								                // We initialize a backup pool with the default
 								                // settings if none have already been set.
 								                backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
 								                &backup_pool
 								            }
 								        };
 								        // create LMDB writer channel
-												Remove unwrap sending errors in channel

											
										
										
											2021-08-24 13:01:31 +02:00
+								        let (lmdb_writer_sx, lmdb_writer_rx): (
 								            Sender<Result<TypedChunk>>,
 								            Receiver<Result<TypedChunk>>,
 								        ) = crossbeam_channel::unbounded();
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
-												improve the error handling in general and introduce the concept of reserved keywords

											
										
										
											2021-09-02 15:57:40 +02:00
+								        // get the primary key field id
-												edit the two lasts TODO comments

											
										
										
											2021-09-08 18:12:10 +02:00
+								        let primary_key_id = fields_ids_map.id(&primary_key).unwrap();
-												improve the error handling in general and introduce the concept of reserved keywords

											
										
										
											2021-09-02 15:57:40 +02:00
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								        // get searchable fields for word databases
 								        let searchable_fields =
 								            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
 								        // get filterable fields for facet databases
 								        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        // get the fid of the `_geo.lat` and `_geo.lng` fields.
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								        let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
 								        // self.index.fields_ids_map($a)? ==>> field_id_map
 								        let geo_fields_ids = match field_id_map.id("_geo") {
-												Apply suggestions from code review

Co-authored-by: Clément Renault <clement@meilisearch.com>
											
										
										
											2021-09-09 12:20:08 +02:00
+								            Some(gfid) => {
 								                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
 								                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                // if `_geo` is faceted then we get the `lat` and `lng`
-												Apply suggestions from code review

Co-authored-by: Clément Renault <clement@meilisearch.com>
											
										
										
											2021-09-09 12:20:08 +02:00
+								                if is_sortable || is_filterable {
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								                    let field_ids = field_id_map
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                        .insert("_geo.lat")
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								                        .zip(field_id_map.insert("_geo.lng"))
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                        .ok_or(UserError::AttributeLimitReached)?;
 								                    Some(field_ids)
-												Apply suggestions from code review

Co-authored-by: Clément Renault <clement@meilisearch.com>
											
										
										
											2021-09-09 12:20:08 +02:00
+								                } else {
 								                    None
 								                }
 								            }
 								            None => None,
-												only index _geo if it's set as sortable OR filterable

and only allow the filters if geo was set to filterable

											
										
										
											2021-08-30 15:47:33 +02:00
+								        };
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
-												Take stop word in account

											
										
										
											2021-08-17 12:25:07 +02:00
+								        let stop_words = self.index.stop_words(self.wtxn)?;
-												Make the search and the indexing work

											
										
										
											2023-07-24 18:35:20 +02:00
+								        let separators = self.index.allowed_separators(self.wtxn)?;
 								        let dictionary = self.index.dictionary(self.wtxn)?;
-												extract exact word docids

											
										
										
											2022-03-24 17:00:29 +01:00
+								        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
-												Implement proximityPrecision setting on milli side

											
										
										
											2023-12-06 15:49:02 +01:00
+								        let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
-												Take stop word in account

											
										
										
											2021-08-17 12:25:07 +02:00
-												bump heed

											
										
										
											2022-08-10 16:25:24 +02:00
+								        let pool_params = GrenadParameters {
 								            chunk_compression_type: self.indexer_config.chunk_compression_type,
 								            chunk_compression_level: self.indexer_config.chunk_compression_level,
 								            max_memory: self.indexer_config.max_memory,
 								            max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
 								        };
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								        let documents_chunk_size = match self.indexer_config.documents_chunk_size {
 								            Some(chunk_size) => chunk_size,
 								            None => {
 								                let default_chunk_size = 1024 * 1024 * 4; // 4MiB
 								                let min_chunk_size = 1024 * 512; // 512KiB
 								                // compute the chunk size from the number of available threads and the input data size.
 								                let total_size = flattened_documents.metadata().map(|m| m.len());
 								                let current_num_threads = pool.current_num_threads();
 								                // if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
 								                let chunk_count = if current_num_threads > 2 {
 								                    (current_num_threads * 3 / 4).max(2)
 								                } else {
 								                    current_num_threads
 								                };
 								                total_size
 								                    .map_or(default_chunk_size, |size| (size as usize) / chunk_count)
 								                    .max(min_chunk_size)
 								            }
 								        };
 								        let original_documents = grenad::Reader::new(original_documents)?;
 								        let flattened_documents = grenad::Reader::new(flattened_documents)?;
-												bump heed

											
										
										
											2022-08-10 16:25:24 +02:00
+								        let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								        let cloned_embedder = self.embedders.clone();
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								        let mut final_documents_ids = RoaringBitmap::new();
 								        let mut databases_seen = 0;
 								        let mut word_position_docids = None;
 								        let mut word_fid_docids = None;
 								        let mut word_docids = None;
 								        let mut exact_word_docids = None;
 								        let mut chunk_accumulator = ChunkAccumulator::default();
 								        let mut dimension = HashMap::new();
 								        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								        let current_span = tracing::Span::current();
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								        // Run extraction pipeline in parallel.
 								        pool.install(|| {
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								            rayon::spawn(move || {
 								                let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								            let _enter = child_span.enter();
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								            puffin::profile_scope!("extract_and_send_grenad_chunks");
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                // split obkv file into several chunks
 								                let original_chunk_iter =
 								                    grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
 								                // split obkv file into several chunks
 								                let flattened_chunk_iter =
 								                    grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
 								                let separators: Option<Vec<_>> =
 								                    separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
 								                let dictionary: Option<Vec<_>> =
 								                    dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
 								                let result = original_chunk_iter.and_then(|original_chunk| {
 								                    let flattened_chunk = flattened_chunk_iter?;
 								                    // extract all databases from the chunked obkv documents
 								                    extract::data_from_obkv_documents(
 								                        original_chunk,
 								                        flattened_chunk,
 								                        pool_params,
 								                        lmdb_writer_sx.clone(),
 								                        searchable_fields,
 								                        faceted_fields,
 								                        primary_key_id,
 								                        geo_fields_ids,
 								                        field_id_map,
 								                        stop_words,
 								                        separators.as_deref(),
 								                        dictionary.as_deref(),
 								                        max_positions_per_attributes,
 								                        exact_attributes,
 								                        proximity_precision,
 								                        cloned_embedder,
 								                    )
 								                });
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                if let Err(e) = result {
 								                    let _ = lmdb_writer_sx.send(Err(e));
 								                }
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                // needs to be dropped to avoid channel waiting lock.
 								                drop(lmdb_writer_sx);
 								            });
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								            (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								                databases_seen,
 								                total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								            });
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								            loop {
 								                if (self.should_abort)() {
 								                    return Err(Error::InternalError(InternalError::AbortedIndexation));
 								                }
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
 								                    Err(status) => {
 								                        if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
 								                            let (docids, is_merged_database) =
 								                                write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
 								                            if !docids.is_empty() {
 								                                final_documents_ids |= docids;
 								                                let documents_seen_count = final_documents_ids.len();
 								                                (self.progress)(UpdateIndexingStep::IndexDocuments {
 								                                    documents_seen: documents_seen_count as usize,
 								                                    total_documents: documents_count,
 								                                });
 								                                debug!(documents = documents_seen_count, total = documents_count, "Seen");
 								                            }
 								                            if is_merged_database {
 								                                databases_seen += 1;
 								                                (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								                                    databases_seen,
 								                                    total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								                                });
 								                            }
 								                        // If no chunks remain in the chunk accumulator and the channel is disconnected, break.
 								                        } else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
 								                            break;
-												yield in loop when the channel is not disconnected

											
										
										
											2024-02-08 18:21:27 +01:00
+								                        } else {
 								                            rayon::yield_now();
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                        }
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
+								                    }
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								                    Ok(result) => {
 								                        let typed_chunk = match result? {
 								                            TypedChunk::WordDocids {
 								                                word_docids_reader,
 								                                exact_word_docids_reader,
 								                                word_fid_docids_reader,
 								                            } => {
 								                                let cloneable_chunk =
 								                                    unsafe { as_cloneable_grenad(&word_docids_reader)? };
 								                                let word_docids = word_docids.get_or_insert_with(|| {
 								                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
 								                                });
 								                                word_docids.push(cloneable_chunk.into_cursor()?);
 								                                let cloneable_chunk =
 								                                    unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
 								                                let exact_word_docids =
 								                                    exact_word_docids.get_or_insert_with(|| {
 								                                        MergerBuilder::new(
 								                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
 								                                        )
 								                                    });
 								                                exact_word_docids.push(cloneable_chunk.into_cursor()?);
 								                                let cloneable_chunk =
 								                                    unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
 								                                let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
 								                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
 								                                });
 								                                word_fid_docids.push(cloneable_chunk.into_cursor()?);
 								                                TypedChunk::WordDocids {
 								                                    word_docids_reader,
 								                                    exact_word_docids_reader,
 								                                    word_fid_docids_reader,
 								                                }
 								                            }
 								                            TypedChunk::WordPositionDocids(chunk) => {
 								                                let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
 								                                let word_position_docids =
 								                                    word_position_docids.get_or_insert_with(|| {
 								                                        MergerBuilder::new(
 								                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
 								                                        )
 								                                    });
 								                                word_position_docids.push(cloneable_chunk.into_cursor()?);
 								                                TypedChunk::WordPositionDocids(chunk)
 								                            }
 								                            TypedChunk::VectorPoints {
 								                                expected_dimension,
 								                                remove_vectors,
 								                                embeddings,
 								                                manual_vectors,
 								                                embedder_name,
 								                            } => {
 								                                dimension.insert(embedder_name.clone(), expected_dimension);
 								                                TypedChunk::VectorPoints {
 								                                    remove_vectors,
 								                                    embeddings,
 								                                    expected_dimension,
 								                                    manual_vectors,
 								                                    embedder_name,
 								                                }
 								                            }
 								                            otherwise => otherwise,
 								                        };
 								                        chunk_accumulator.insert(typed_chunk);
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
+								                    }
 								                }
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								            }
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
 								            Ok(())
 								        })?;
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												rename fields_distribution in field_distribution

											
										
										
											2021-06-17 15:16:20 +02:00
+								        // We write the field distribution into the main database
 								        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
-												feat(index): store fields distribution in index

											
										
										
											2021-03-31 17:14:23 +02:00
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								        // We write the primary key field id into the main database
-												Fix settings bug

replace ids with str in settings

This allows for better maintainability of the settings code, since
updating the searchable attributes is now straightforward.

criterion use string

fix reindexing fieldid remaping

add tests for primary_key compute

fix tests

fix http-ui

fixup! add tests for primary_key compute

code improvements settings

update deps

fixup! code improvements settings

fixup! refactor settings updates and fix bug

fixup! Fix settings bug

fixup! Fix settings bug

fixup! Fix settings bug

Update src/update/index_documents/transform.rs

Co-authored-by: Clément Renault <clement@meilisearch.com>

fixup! Fix settings bug

											
										
										
											2021-01-20 17:27:43 +01:00
+								        self.index.put_primary_key(self.wtxn, &primary_key)?;
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
+								        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
-												Various changes

- fixed seed for arroy
- check vector dimensions as soon as it is provided to search
- don't embed whitespace

											
										
										
											2023-12-14 16:01:35 +01:00
+								        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
-												WIP multi embedders

fixed template bugs

											
										
										
											2023-12-12 21:19:48 +01:00
+								        for (embedder_name, dimension) in dimension {
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
+								            let wtxn = &mut *self.wtxn;
 								            let vector_arroy = self.index.vector_arroy;
-												WIP

- manual embedder
- multi embedders OK
- clippy + tests OK

											
										
										
											2023-12-12 23:39:01 +01:00
 								            let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
 								                InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
 								            )?;
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
+								            pool.install(|| {
-												WIP multi embedders

fixed template bugs

											
										
										
											2023-12-12 21:19:48 +01:00
+								                let writer_index = (embedder_index as u16) << 8;
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
+								                for k in 0..=u8::MAX {
-												Bump arroy to v0.2.0

											
										
										
											2024-01-16 16:45:55 +01:00
+								                    let writer =
 								                        arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?;
-												WIP

											
										
										
											2023-12-07 17:03:10 +01:00
+								                    if writer.is_empty(wtxn)? {
 								                        break;
 								                    }
 								                    writer.build(wtxn, &mut rng, None)?;
 								                }
 								                Result::Ok(())
 								            })?;
 								        }
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
+								        self.execute_prefix_databases(
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								            word_docids.map(MergerBuilder::build),
 								            exact_word_docids.map(MergerBuilder::build),
 								            word_position_docids.map(MergerBuilder::build),
 								            word_fid_docids.map(MergerBuilder::build),
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
+								        )?;
-												improve document addition returned metaimprove document addition
returned metaimprove document addition returned metaimprove document
addition returned metaimprove document addition returned metaimprove
document addition returned metaimprove document addition returned
metaimprove document addition returned meta

											
										
										
											2021-11-10 14:08:36 +01:00
-												Reactivate prefix databases

											
										
										
											2023-11-08 11:52:08 +01:00
+								        Ok(number_of_documents)
-												Plug new indexer

											
										
										
											2021-08-16 13:36:30 +02:00
+								    }
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								    #[tracing::instrument(
 								        level = "trace",
 								        skip_all,
 								        target = "indexing::prefix",
 								        name = "index_documents_prefix_databases"
 								    )]
-												Bring the newly created word pair proximity docids

											
										
										
											2022-01-18 14:59:51 +01:00
+								    pub fn execute_prefix_databases(
 								        self,
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								        word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
 								        exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
 								        word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
 								        word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
-												Bring the newly created word pair proximity docids

											
										
										
											2022-01-18 14:59:51 +01:00
+								    ) -> Result<()>
 								    where
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        FP: Fn(UpdateIndexingStep) + Sync,
 								        FA: Fn() -> bool + Sync,
-												Bring the newly created word pair proximity docids

											
										
										
											2022-01-18 14:59:51 +01:00
+								    {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        puffin::profile_function!();
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								        // Merged databases have already been indexed, so we start from this count.
 								        let mut databases_seen = MERGED_DATABASE_COUNT;
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								        databases_seen += 1;
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								            databases_seen,
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								            total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								        });
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												Retrieve the previous version of the words prefixes FST

											
										
										
											2022-01-18 14:02:24 +01:00
+								        let previous_words_prefixes_fst =
 								            self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
-												Run the words prefixes update inside of the indexing documents update

											
										
										
											2021-02-10 11:53:13 +01:00
+								        // Run the words prefixes update operation.
-												remove update_id in UpdateBuilder

											
										
										
											2021-11-03 13:12:01 +01:00
+								        let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        if let Some(value) = self.config.words_prefix_threshold {
-												Run the words prefixes update inside of the indexing documents update

											
										
										
											2021-02-10 11:53:13 +01:00
+								            builder.threshold(value);
 								        }
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        if let Some(value) = self.config.max_prefix_length {
-												Run the words prefixes update inside of the indexing documents update

											
										
										
											2021-02-10 11:53:13 +01:00
+								            builder.max_prefix_length(value);
 								        }
 								        builder.execute()?;
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        let current_prefix_fst;
 								        let common_prefix_fst_words_tmp;
 								        let common_prefix_fst_words: Vec<_>;
 								        let new_prefix_fst_words;
 								        let del_prefix_fst_words;
-												Compute the new, common and, deleted prefix words fst once

											
										
										
											2022-01-27 11:00:18 +01:00
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								        {
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								            let span = tracing::trace_span!(target: "indexing::details", "compute_prefix_diffs");
 								            let _entered = span.enter();
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								            puffin::profile_scope!("compute_prefix_diffs");
-												Compute the new, common and, deleted prefix words fst once

											
										
										
											2022-01-27 11:00:18 +01:00
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								            current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
-												Compute the new, common and, deleted prefix words fst once

											
										
										
											2022-01-27 11:00:18 +01:00
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								            // We retrieve the common words between the previous and new prefix word fst.
 								            common_prefix_fst_words_tmp = fst_stream_into_vec(
 								                previous_words_prefixes_fst.op().add(&current_prefix_fst).intersection(),
 								            );
 								            common_prefix_fst_words = common_prefix_fst_words_tmp
 								                .as_slice()
 								                .linear_group_by_key(|x| x.chars().next().unwrap())
 								                .collect();
 								            // We retrieve the newly added words between the previous and new prefix word fst.
 								            new_prefix_fst_words = fst_stream_into_vec(
 								                current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
 								            );
 								            // We compute the set of prefixes that are no longer part of the prefix fst.
 								            del_prefix_fst_words = fst_stream_into_hashset(
 								                previous_words_prefixes_fst.op().add(&current_prefix_fst).difference(),
 								            );
 								        }
-												Compute the new, common and, deleted prefix words fst once

											
										
										
											2022-01-27 11:00:18 +01:00
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								        databases_seen += 1;
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								            databases_seen,
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								            total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								        });
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												Replace the ugly unwraps by clean if let Somes

											
										
										
											2022-02-28 16:00:33 +01:00
+								        if let Some(word_docids) = word_docids {
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								            execute_word_prefix_docids(
-												refactor WordPrefixDocids to take dbs instead of indexes

											
										
										
											2022-03-25 10:20:39 +01:00
+								                self.wtxn,
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								                word_docids,
-												Fixing piles of clippy errors.

Most of these are calling clone when the struct supports Copy.

Many are using & and &mut on `self` when the function they are called
from already has an immutable or mutable borrow so this isn't needed.

I tried to stay away from actual changes or places where I'd have to
name fresh variables.

											
										
										
											2022-10-13 22:02:54 +02:00
+								                self.index.word_docids,
 								                self.index.word_prefix_docids,
 								                self.indexer_config,
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								                &new_prefix_fst_words,
 								                &common_prefix_fst_words,
 								                &del_prefix_fst_words,
 								            )?;
 								        }
 								        if let Some(exact_word_docids) = exact_word_docids {
 								            execute_word_prefix_docids(
 								                self.wtxn,
 								                exact_word_docids,
-												Fixing piles of clippy errors.

Most of these are calling clone when the struct supports Copy.

Many are using & and &mut on `self` when the function they are called
from already has an immutable or mutable borrow so this isn't needed.

I tried to stay away from actual changes or places where I'd have to
name fresh variables.

											
										
										
											2022-10-13 22:02:54 +02:00
+								                self.index.exact_word_docids,
 								                self.index.exact_word_prefix_docids,
 								                self.indexer_config,
-												Replace the ugly unwraps by clean if let Somes

											
										
										
											2022-02-28 16:00:33 +01:00
+								                &new_prefix_fst_words,
 								                &common_prefix_fst_words,
 								                &del_prefix_fst_words,
 								            )?;
 								        }
-												Compute the words prefixes at the end of an update

											
										
										
											2021-03-25 11:10:12 +01:00
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								        databases_seen += 1;
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								            databases_seen,
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								            total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								        });
-												Replace the ugly unwraps by clean if let Somes

											
										
										
											2022-02-28 16:00:33 +01:00
+								        if let Some(word_position_docids) = word_position_docids {
 								            // Run the words prefix position docids update operation.
-												Fix indexing of word_prefix_fid_docids

											
										
										
											2023-04-27 11:12:46 +02:00
+								            let mut builder = WordPrefixIntegerDocids::new(
 								                self.wtxn,
 								                self.index.word_prefix_position_docids,
 								                self.index.word_position_docids,
 								            );
-												Replace the ugly unwraps by clean if let Somes

											
										
										
											2022-02-28 16:00:33 +01:00
+								            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
 								            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
 								            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
 								            builder.max_memory = self.indexer_config.max_memory;
-												Fix indexing of word_prefix_fid_docids

											
										
										
											2023-04-27 11:12:46 +02:00
-												Replace the ugly unwraps by clean if let Somes

											
										
										
											2022-02-28 16:00:33 +01:00
+								            builder.execute(
 								                word_position_docids,
 								                &new_prefix_fst_words,
 								                &common_prefix_fst_words,
 								                &del_prefix_fst_words,
 								            )?;
-												Fix indexing of word_prefix_fid_docids

											
										
										
											2023-04-27 11:12:46 +02:00
+								        }
 								        if let Some(word_fid_docids) = word_fid_docids {
 								            // Run the words prefix fid docids update operation.
 								            let mut builder = WordPrefixIntegerDocids::new(
 								                self.wtxn,
 								                self.index.word_prefix_fid_docids,
 								                self.index.word_fid_docids,
 								            );
 								            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
 								            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
 								            builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
 								            builder.max_memory = self.indexer_config.max_memory;
 								            builder.execute(
 								                word_fid_docids,
 								                &new_prefix_fst_words,
 								                &common_prefix_fst_words,
 								                &del_prefix_fst_words,
 								            )?;
-												Expose and use the WordsLevelPositions update

											
										
										
											2021-03-17 13:55:24 +01:00
+								        }
-												Introduce an indexation abortion function when indexing documents

											
										
										
											2022-10-05 17:41:07 +02:00
+								        if (self.should_abort)() {
 								            return Err(Error::InternalError(InternalError::AbortedIndexation));
 								        }
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								        databases_seen += 1;
-												document batch support

reusable transform

rework update api

add indexer config

fix tests

review changes

Co-authored-by: Clément Renault <clement@meilisearch.com>

fmt

											
										
										
											2021-12-08 14:12:07 +01:00
+								        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 								            databases_seen,
-												Fix test and use progress callback

											
										
										
											2021-08-17 10:56:06 +02:00
+								            total_databases: TOTAL_POSTING_DATABASE_COUNT,
 								        });
-												Introduce the UpdateBuilder and use it in the HTTP routes

											
										
										
											2020-10-26 20:18:10 +01:00
+								        Ok(())
-												Move the IndexDocuments update into its own module

											
										
										
											2020-10-26 11:02:44 +01:00
+								    }
 								}
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								/// Run the word prefix docids update operation.
-												Fix cargo clippy errors

Dont apply clippy for tests for now

Fix clippy warnings of filter-parser package

parent 8352febd646ec4bcf56a44161e5c4dce0e55111f
author unvalley <38400669+unvalley@users.noreply.github.com> 1666325847 +0900
committer unvalley <kirohi.code@gmail.com> 1666791316 +0900

Update .github/workflows/rust.yml

Co-authored-by: Clémentine Urquizar - curqui <clementine@meilisearch.com>

Allow clippy lint too_many_argments

Allow clippy lint needless_collect

Allow clippy lint too_many_arguments and type_complexity

Fix for clippy warnings comparison_chains

Fix for clippy warnings vec_init_then_push

Allow clippy lint should_implement_trait

Allow clippy lint drop_non_drop

Fix lifetime clipy warnings in filter-paprser

Execute cargo fmt

Fix clippy remaining warnings

Fix clippy remaining warnings again and allow lint on each place

											
										
										
											2022-10-14 16:44:10 +02:00
+								#[allow(clippy::too_many_arguments)]
-												Add tracing to milli

											
										
										
											2024-01-23 09:42:48 +01:00
+								#[tracing::instrument(
 								    level = "trace",
 								    skip_all,
 								    target = "indexing::prefix",
 								    name = "index_documents_word_prefix_docids"
 								)]
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								fn execute_word_prefix_docids(
 								    txn: &mut heed::RwTxn,
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								    merger: Merger<CursorClonableMmap, MergeFn>,
-												Generalize usage of CboRoaringBitmap codec to ease the use

											
										
										
											2023-09-25 16:39:32 +02:00
+								    word_docids_db: Database<Str, CboRoaringBitmapCodec>,
 								    word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								    indexer_config: &IndexerConfig,
 								    new_prefix_fst_words: &[String],
 								    common_prefix_fst_words: &[&[String]],
 								    del_prefix_fst_words: &HashSet<Vec<u8>>,
 								) -> Result<()> {
-												First iteration on exposing puffin profiling

											
										
										
											2023-07-10 18:41:54 +02:00
+								    puffin::profile_function!();
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								    let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
 								    builder.chunk_compression_type = indexer_config.chunk_compression_type;
 								    builder.chunk_compression_level = indexer_config.chunk_compression_level;
 								    builder.max_nb_chunks = indexer_config.max_nb_chunks;
 								    builder.max_memory = indexer_config.max_memory;
-												Compute chunk size based on the input data size ant the number of indexing threads

											
										
										
											2024-01-22 16:23:12 +01:00
+								    builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
-												extract exact_word_prefix_docids

											
										
										
											2022-03-25 16:17:55 +01:00
+								    Ok(())
 								}
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								#[cfg(test)]
 								mod tests {
-												Tests pass

											
										
										
											2023-12-13 21:49:13 +01:00
+								    use std::collections::BTreeMap;
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
+								    use big_s::S;
-												Recover delete_documents tests that were too eagerly deleted

											
										
										
											2023-10-26 12:16:16 +02:00
+								    use fst::IntoStreamer;
 								    use heed::RwTxn;
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								    use maplit::hashset;
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
-												format the whole project

											
										
										
											2021-06-16 18:33:33 +02:00
+								    use super::*;
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								    use crate::documents::documents_batch_reader_from_objects;
 								    use crate::index::tests::TempIndex;
-												Rename TermMatchingPolicies

											
										
										
											2022-08-18 17:36:08 +02:00
+								    use crate::search::TermsMatchingStrategy;
-												Tests pass

											
										
										
											2023-12-13 21:49:13 +01:00
+								    use crate::update::Setting;
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								    use crate::{db_snap, Filter, Search};
-												format the whole project

											
										
										
											2021-06-16 18:33:33 +02:00
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								    #[test]
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								    fn simple_document_replacement() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
 								        // First we send 3 documents with ids from 1 to 3.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "id": 1, "name": "kevin" },
 								                { "id": 2, "name": "kevina" },
 								                { "id": 3, "name": "benoit" }
 								            ]))
 								            .unwrap();
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
 								        // Check that there are 3 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
 								        drop(rtxn);
 								        // Second we send 1 document with id 1, to erase the previous ones.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap();
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        // Check that there are **still** 3 documents.
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
 								        drop(rtxn);
 								        // Third we send 3 documents again to replace the existing ones.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "id": 1, "name": "updated second kevin" },
 								                { "id": 2, "name": "updated kevina" },
 								                { "id": 3, "name": "updated benoit" }
 								            ]))
 								            .unwrap();
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        // Check that there are **still** 3 documents.
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
-												Update test to showcase the bug

											
										
										
											2022-08-17 15:03:08 +02:00
+								        let count = index.all_documents(&rtxn).unwrap().count();
 								        assert_eq!(count, 3);
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								        drop(rtxn);
 								    }
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
 								    #[test]
 								    fn simple_document_merge() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
 								        // First we send 3 documents with duplicate ids and
 								        // change the index method to merge documents.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "id": 1, "name": "kevin" },
 								                { "id": 1, "name": "kevina" },
 								                { "id": 1, "name": "benoit" }
 								            ]))
 								            .unwrap();
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
 								        // Check that there is only 1 document now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 1);
 								        // Check that we get only one document from the database.
 								        let docs = index.documents(&rtxn, Some(0)).unwrap();
 								        assert_eq!(docs.len(), 1);
 								        let (id, doc) = docs[0];
 								        assert_eq!(id, 0);
 								        // Check that this document is equal to the last one sent.
 								        let mut doc_iter = doc.iter();
-												stop casting integer docids to string

											
										
										
											2021-09-28 18:35:54 +02:00
+								        assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
 								        assert_eq!(doc_iter.next(), None);
 								        drop(rtxn);
 								        // Second we send 1 document with id 1, to force it to be merged with the previous one.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap();
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
 								        // Check that there is **still** 1 document.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 1);
 								        // Check that we get only one document from the database.
-												remove tests on soft-deleted

											
										
										
											2023-10-31 16:36:18 +01:00
+								        let docs = index.documents(&rtxn, Some(0)).unwrap();
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        assert_eq!(docs.len(), 1);
 								        let (id, doc) = docs[0];
-												remove tests on soft-deleted

											
										
										
											2023-10-31 16:36:18 +01:00
+								        assert_eq!(id, 0);
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
 								        // Check that this document is equal to the last one sent.
 								        let mut doc_iter = doc.iter();
-												stop casting integer docids to string

											
										
										
											2021-09-28 18:35:54 +02:00
+								        assert_eq!(doc_iter.next(), Some((0, &b"1"[..])));
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
-												stop casting integer docids to string

											
										
										
											2021-09-28 18:35:54 +02:00
+								        assert_eq!(doc_iter.next(), Some((2, &b"25"[..])));
-												Add a test to check that merging works correctly with CSVs

											
										
										
											2020-10-30 13:46:56 +01:00
+								        assert_eq!(doc_iter.next(), None);
 								        drop(rtxn);
 								    }
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
-												Introduce a parameter to disable the engine to autogenerate docids

											
										
										
											2020-10-31 21:46:55 +01:00
+								    #[test]
-												Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits

											
										
										
											2021-08-31 11:44:15 +02:00
+								    fn not_auto_generated_documents_ids() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Introduce a parameter to disable the engine to autogenerate docids

											
										
										
											2020-10-31 21:46:55 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let result = index.add_documents(documents!([
-												Introduce a parameter to disable the engine to autogenerate docids

											
										
										
											2020-10-31 21:46:55 +01:00
+								            { "name": "kevin" },
-												Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits

											
										
										
											2021-08-31 11:44:15 +02:00
+								            { "name": "kevina" },
-												Introduce a parameter to disable the engine to autogenerate docids

											
										
										
											2020-10-31 21:46:55 +01:00
+								            { "name": "benoit" }
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        ]));
 								        assert!(result.is_err());
-												Introduce a parameter to disable the engine to autogenerate docids

											
										
										
											2020-10-31 21:46:55 +01:00
 								        // Check that there is no document.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 0);
 								        drop(rtxn);
 								    }
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								    #[test]
 								    fn simple_auto_generated_documents_ids() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.autogenerate_docids = true;
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								        // First we send 3 documents without any id, so that the ids get auto-generated.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "name": "kevin" },
 								                { "name": "kevina" },
 								                { "name": "benoit" }
 								            ]))
 								            .unwrap();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
 								        // Check that there are 3 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
 								        let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
 								        let (_id, obkv) = docs.iter().find(|(_id, kv)| kv.get(0) == Some(br#""kevin""#)).unwrap();
-												Execute cargo clippy --fix

											
										
										
											2022-10-10 15:28:03 +02:00
+								        let kevin_uuid: String = serde_json::from_slice(obkv.get(1).unwrap()).unwrap();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								        drop(rtxn);
 								        // Second we send 1 document with the generated uuid, to erase the previous ones.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([ { "name": "updated kevin", "id": kevin_uuid } ])).unwrap();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
 								        // Check that there are **still** 3 documents.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
-												Fix a bug where generated docids were not saved when indexing JSON docs

											
										
										
											2020-11-01 12:14:44 +01:00
-												Fasten the document deletion

When a document deletion occurs, instead of deleting the document we mark it as deleted
in the new “soft deleted” bitmap. It is then removed from the search, and all the other
endpoints.

											
										
										
											2022-06-13 17:59:34 +02:00
+								        // The updated document is expected to keep the internal id 0, hence it is fetched last below.
-												remove tests on soft-deleted

											
										
										
											2023-10-31 16:36:18 +01:00
+								        let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
-												Fasten the document deletion

When a document deletion occurs, instead of deleting the document we mark it as deleted
in the new “soft deleted” bitmap. It is then removed from the search, and all the other
endpoints.

											
										
										
											2022-06-13 17:59:34 +02:00
+								        let kevin_position =
 								            docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
 								        assert_eq!(kevin_position, 2);
 								        let (_, doc) = docs[kevin_position];
-												Fix a bug where generated docids were not saved when indexing JSON docs

											
										
										
											2020-11-01 12:14:44 +01:00
 								        // Check that this document is equal to the last
 								        // one sent and that an UUID has been generated.
-												Validate documents ids before accepting them

											
										
										
											2020-11-01 16:43:12 +01:00
+								        assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..]));
-												Fix a bug where generated docids were not saved when indexing JSON docs

											
										
										
											2020-11-01 12:14:44 +01:00
+								        // This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes (").
-												feat(index): introduce fields_ids_distribution

											
										
										
											2021-03-31 17:14:23 +02:00
+								        assert_eq!(doc.get(1).unwrap().len(), 36 + 2);
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
+								        drop(rtxn);
 								    }
 								    #[test]
 								    fn reordered_auto_generated_documents_ids() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
 								        // First we send 3 documents with ids from 1 to 3.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "id": 1, "name": "kevin" },
 								                { "id": 2, "name": "kevina" },
 								                { "id": 3, "name": "benoit" }
 								            ]))
 								            .unwrap();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
 								        // Check that there are 3 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 3);
 								        drop(rtxn);
 								        // Second we send 1 document without specifying the id.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.index_documents_config.autogenerate_docids = true;
 								        index.add_documents(documents!([ { "name": "new kevin" } ])).unwrap();
-												Generate a uuid v4 based document id when missing

											
										
										
											2020-10-31 12:54:43 +01:00
 								        // Check that there are 4 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 4);
 								        drop(rtxn);
 								    }
-												Update the Transform struct to support JSON updates

											
										
										
											2020-10-31 16:10:15 +01:00
 								    #[test]
-												Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits

											
										
										
											2021-08-31 11:44:15 +02:00
+								    fn empty_update() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Update the Transform struct to support JSON updates

											
										
										
											2020-10-31 16:10:15 +01:00
 								        // First we send an empty batch of documents.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([])).unwrap();
-												Update the Transform struct to support JSON updates

											
										
										
											2020-10-31 16:10:15 +01:00
 								        // Check that there are no documents.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 0);
 								        drop(rtxn);
 								    }
-												Update the Transform struct to support JSON stream updates

											
										
										
											2020-11-01 11:50:10 +01:00
-												Validate documents ids before accepting them

											
										
										
											2020-11-01 16:43:12 +01:00
+								    #[test]
 								    fn invalid_documents_ids() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Validate documents ids before accepting them

											
										
										
											2020-11-01 16:43:12 +01:00
 								        // First we send 1 document with an invalid id.
 								        // There is a space in the document id.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ])).unwrap_err();
-												Validate documents ids before accepting them

											
										
										
											2020-11-01 16:43:12 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        // Then we send 1 document with a valid id.
 								        index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap();
-												Validate documents ids before accepting them

											
										
										
											2020-11-01 16:43:12 +01:00
 								        // Check that there is 1 document now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 1);
 								        drop(rtxn);
 								    }
-												Make sure we index all kind of JSON types

											
										
										
											2020-11-06 16:15:07 +01:00
 								    #[test]
-												Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits

											
										
										
											2021-08-31 11:44:15 +02:00
+								    fn complex_documents() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Make sure we index all kind of JSON types

											
										
										
											2020-11-06 16:15:07 +01:00
 								        // First we send 3 documents, each with its own id and a different kind of nested value.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
 								                { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
 								                { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
 								            ]))
 								            .unwrap();
-												Make sure we index all kind of JSON types

											
										
										
											2020-11-06 16:15:07 +01:00
 								        // Check that the nested values of the 3 documents are searchable.
 								        let rtxn = index.read_txn().unwrap();
 								        // Search for a sub object value
 								        let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap();
 								        assert_eq!(result.documents_ids, vec![0]);
 								        // Search for a sub array value
 								        let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap();
 								        assert_eq!(result.documents_ids, vec![1]);
 								        // Search for a sub array sub object value
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap();
-												Make sure we index all kind of JSON types

											
										
										
											2020-11-06 16:15:07 +01:00
+								        assert_eq!(result.documents_ids, vec![2]);
 								        drop(rtxn);
 								    }
-												Add a test for the words level positions generation bug

											
										
										
											2021-06-23 18:35:44 +02:00
 								    #[test]
 								    fn simple_documents_replace() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-												Add a test for the words level positions generation bug

											
										
										
											2021-06-23 18:35:44 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([
-												add a tests for the indexation of the geosearch

											
										
										
											2021-08-26 13:27:32 +02:00
+								          { "id": 2,    "title": "Pride and Prejudice",                    "author": "Jane Austin",              "genre": "romance",    "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
-												Add a test for the words level positions generation bug

											
										
										
											2021-06-23 18:35:44 +02:00
+								          { "id": 456,  "title": "Le Petit Prince",                        "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
 								          { "id": 1,    "title": "Alice In Wonderland",                    "author": "Lewis Carroll",            "genre": "fantasy",    "price": 25.99 },
 								          { "id": 1344, "title": "The Hobbit",                             "author": "J. R. R. Tolkien",         "genre": "fantasy" },
 								          { "id": 4,    "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling",            "genre": "fantasy" },
-												add a tests for the indexation of the geosearch

											
										
										
											2021-08-26 13:27:32 +02:00
+								          { "id": 42,   "title": "The Hitchhiker's Guide to the Galaxy",   "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        ])).unwrap();
-												Add a test for the words level positions generation bug

											
										
										
											2021-06-23 18:35:44 +02:00
-												Add more tests and allow disabling of soft-deletion outside of tests

Also allow disabling soft-deletion in the IndexDocumentsConfig

											
										
										
											2022-12-05 10:33:31 +01:00
+								        db_snap!(index, word_docids, "initial");
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 								        index
-												Update test to showcase the bug

											
										
										
											2022-08-17 15:03:08 +02:00
+								            .add_documents(documents!([
 								                {"id":4,"title":"Harry Potter and the Half-Blood Princess"},
 								                {"id":456,"title":"The Little Prince"}
 								            ]))
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								            .unwrap();
-												Update test to showcase the bug

											
										
										
											2022-08-17 15:03:08 +02:00
 								        index
 								            .add_documents(documents!([
 								                { "id": 2, "author": "J. Austen", "date": "1813" }
 								            ]))
 								            .unwrap();
-												Fix typo in comment

											
										
										
											2022-08-17 15:09:48 +02:00
+								        // Check that there are **still** 6 documents.
-												Update test to showcase the bug

											
										
										
											2022-08-17 15:03:08 +02:00
+								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 6);
 								        let count = index.all_documents(&rtxn).unwrap().count();
 								        assert_eq!(count, 6);
-												Add more tests and allow disabling of soft-deletion outside of tests

Also allow disabling soft-deletion in the IndexDocumentsConfig

											
										
										
											2022-12-05 10:33:31 +01:00
+								        db_snap!(index, word_docids, "updated");
-												Update test to showcase the bug

											
										
										
											2022-08-17 15:03:08 +02:00
+								        drop(rtxn);
-												Add a test for the words level positions generation bug

											
										
										
											2021-06-23 18:35:44 +02:00
+								    }
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
-												fix the mixed dataset geosearch indexing bug

											
										
										
											2022-05-16 15:55:18 +02:00
+								    #[test]
 								    fn mixed_geo_documents() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-												fix the mixed dataset geosearch indexing bug

											
										
										
											2022-05-16 15:55:18 +02:00
 								        // We send 6 documents and mix the ones that have _geo and those that don't have it.
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								              { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
 								              { "id": 456 },
 								              { "id": 1 },
 								              { "id": 1344 },
 								              { "id": 4 },
 								              { "id": 42, "_geo": { "lat": 35, "lng": 23 } }
 								            ]))
 								            .unwrap();
-												fix the mixed dataset geosearch indexing bug

											
										
										
											2022-05-16 15:55:18 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(hashset!(S("_geo")));
 								            })
 								            .unwrap();
-												fix the mixed dataset geosearch indexing bug

											
										
										
											2022-05-16 15:55:18 +02:00
+								    }
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								    #[test]
 								    fn geo_error() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(hashset!(S("_geo")));
 								            })
 								            .unwrap();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let error = index
 								            .add_documents(documents!([
 								              { "id": 0, "_geo": { "lng": 42 } }
 								            ]))
 								            .unwrap_err();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        assert_eq!(
 								            &error.to_string(),
 								            r#"Could not find latitude in the document with the id: `0`. Was expecting a `_geo.lat` field."#
 								        );
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let error = index
 								            .add_documents(documents!([
 								              { "id": 0, "_geo": { "lat": 42 } }
 								            ]))
 								            .unwrap_err();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        assert_eq!(
 								            &error.to_string(),
 								            r#"Could not find longitude in the document with the id: `0`. Was expecting a `_geo.lng` field."#
 								        );
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let error = index
 								            .add_documents(documents!([
 								              { "id": 0, "_geo": { "lat": "lol", "lng": 42 } }
 								            ]))
 								            .unwrap_err();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        assert_eq!(
 								            &error.to_string(),
-												Fix the nested document id fetching function

											
										
										
											2022-07-11 17:44:08 +02:00
+								            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."#
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        );
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let error = index
 								            .add_documents(documents!([
 								              { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } }
 								            ]))
 								            .unwrap_err();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        assert_eq!(
 								            &error.to_string(),
-												Fix the nested document id fetching function

											
										
										
											2022-07-11 17:44:08 +02:00
+								            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."#
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        );
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let error = index
 								            .add_documents(documents!([
 								              { "id": 0, "_geo": { "lat": 12, "lng": "hello" } }
 								            ]))
 								            .unwrap_err();
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        assert_eq!(
 								            &error.to_string(),
-												Fix the nested document id fetching function

											
										
										
											2022-07-11 17:44:08 +02:00
+								            r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."#
-												improve geosearch error messages

											
										
										
											2022-05-02 19:19:50 +02:00
+								        );
 								    }
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
+								    #[test]
 								    fn delete_documents_then_insert() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
 								        index
 								            .add_documents(documents!([
 								                { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
 								                { "objectId": 456, "title": "Le Petit Prince",     "comment": "A french book" },
 								                { "objectId": 1,   "title": "Alice In Wonderland", "comment": "A weird book" },
 								                { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
 								            ]))
 								            .unwrap();
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
 								        // Delete not all of the documents but some of them.
-												remove more warnings and fix some tests

											
										
										
											2023-10-25 14:49:25 +02:00
+								        index.delete_document("30");
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
-												remove more warnings and fix some tests

											
										
										
											2023-10-25 14:49:25 +02:00
+								        let txn = index.read_txn().unwrap();
 								        assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
-												Fix tests compilation after changes to ExternalDocumentsIds API

											
										
										
											2023-10-30 13:34:07 +01:00
+								        let external_documents_ids = index.external_documents_ids();
 								        assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
 								            ]))
 								            .unwrap();
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let wtxn = index.write_txn().unwrap();
-												Fix tests compilation after changes to ExternalDocumentsIds API

											
										
										
											2023-10-30 13:34:07 +01:00
+								        let external_documents_ids = index.external_documents_ids();
 								        assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
+								        wtxn.commit().unwrap();
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
 								        index
 								            .add_documents(documents!([
 								                { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
 								            ]))
 								            .unwrap();
-												Add a test for when we insert a previously deleted document

											
										
										
											2021-06-30 11:23:29 +02:00
+								    }
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
 								    #[test]
 								    fn index_more_than_256_fields() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
-												Fix the tests for the new DocumentsBatchBuilder/Reader

											
										
										
											2022-06-14 16:04:27 +02:00
+								        let mut big_object = serde_json::Map::new();
 								        big_object.insert(S("id"), serde_json::Value::from("wow"));
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
+								        for i in 0..1000 {
 								            let key = i.to_string();
-												Fix the tests for the new DocumentsBatchBuilder/Reader

											
										
										
											2022-06-14 16:04:27 +02:00
+								            big_object.insert(key, serde_json::Value::from("I am a text!"));
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
+								        }
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let documents = documents_batch_reader_from_objects([big_object]);
 								        index.add_documents(documents).unwrap();
-												Add a test to check that we can index more that 256 fields

											
										
										
											2021-07-06 11:40:45 +02:00
+								    }
-												Add a test that triggers a panic when indexing zeroes

											
										
										
											2021-07-22 17:14:44 +02:00
-												Remove limit of 1000 position per attribute

Instead of using an arbitrary limit we encode the absolute position in a u32
using one strong u16 for the field id and a weak u16 for the relative position in the attribute.

											
										
										
											2021-09-22 17:48:24 +02:00
+								    #[test]
 								    fn index_more_than_1000_positions_in_a_field() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new_with_map_size(4096 * 100_000); // 400 MB
 								        let mut content = String::with_capacity(382101);
 								        for i in 0..=u16::MAX {
 								            content.push_str(&format!("{i} "));
 								        }
 								        index
 								            .add_documents(documents!({
 								                "id": "wow",
 								                "content": content
 								            }))
-												Use smartstring to store the external id in our hashmap

We need to store all the external ids (primary keys) in a hashmap,
associated with their internal ids during indexing.
The smartstring removes heap allocations / memory usage and should
improve the cache locality.

											
										
										
											2022-04-11 15:43:18 +02:00
+								            .unwrap();
-												Remove limit of 1000 position per attribute

Instead of using an arbitrary limit we encode the absolute position in a u32
using one strong u16 for the field id and a weak u16 for the relative position in the attribute.

											
										
										
											2021-09-22 17:48:24 +02:00
-												Make clippy happy

											
										
										
											2023-01-17 18:01:26 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        assert!(index.word_docids.get(&rtxn, "0").unwrap().is_some());
 								        assert!(index.word_docids.get(&rtxn, "64").unwrap().is_some());
 								        assert!(index.word_docids.get(&rtxn, "256").unwrap().is_some());
 								        assert!(index.word_docids.get(&rtxn, "1024").unwrap().is_some());
 								        assert!(index.word_docids.get(&rtxn, "32768").unwrap().is_some());
 								        assert!(index.word_docids.get(&rtxn, "65535").unwrap().is_some());
-												Remove limit of 1000 position per attribute

Instead of using an arbitrary limit we encode the absolute position in a u32
using one strong u16 for the field id and a weak u16 for the relative position in the attribute.

											
										
										
											2021-09-22 17:48:24 +02:00
+								    }
-												Add a test that triggers a panic when indexing zeroes

											
										
										
											2021-07-22 17:14:44 +02:00
+								    #[test]
 								    fn index_documents_with_zeroes() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                {
 								                    "id": 2,
 								                    "title": "Prideand Prejudice",
 								                    "au{hor": "Jane Austin",
 								                    "genre": "romance",
 								                    "price$": "3.5$",
 								                },
 								                {
 								                    "id": 456,
 								                    "title": "Le Petit Prince",
 								                    "au{hor": "Antoine de Saint-Exupéry",
 								                    "genre": "adventure",
 								                    "price$": "10.0$",
 								                },
 								                {
 								                    "id": 1,
 								                    "title": "Wonderland",
 								                    "au{hor": "Lewis Carroll",
 								                    "genre": "fantasy",
 								                    "price$": "25.99$",
 								                },
 								                {
 								                    "id": 4,
 								                    "title": "Harry Potter ing fantasy\0lood Prince",
 								                    "au{hor": "J. K. Rowling",
 								                    "genre": "fantasy\0",
 								                },
 								            ]))
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								    }
 								    #[test]
 								    fn index_documents_with_nested_fields() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                {
 								                    "id": 0,
 								                    "title": "The zeroth document",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "id": 1,
 								                    "title": "The first document",
 								                    "nested": {
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                        "object": "field",
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                        "machin": "bidule",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                    },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                },
 								                {
 								                    "id": 2,
 								                    "title": "The second document",
 								                    "nested": [
 								                        "array",
 								                        {
 								                            "object": "field",
 								                        },
 								                        {
 								                            "prout": "truc",
 								                            "machin": "lol",
 								                        },
 								                    ],
 								                },
 								                {
 								                    "id": 3,
 								                    "title": "The third document",
 								                    "nested": "I lied",
 								                },
 								            ]))
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")];
 								                settings.set_searchable_fields(searchable_fields);
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin"));
 								                settings.set_filterable_fields(faceted_fields);
 								            })
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
 								        let rtxn = index.read_txn().unwrap();
 								        let facets = index.faceted_fields(&rtxn).unwrap();
 								        assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin")));
 								        // testing the simple query search
 								        let mut search = crate::Search::new(&rtxn, &index);
 								        search.query("document");
-												replace optional_words by term_matching_strategy

											
										
										
											2022-08-22 17:37:36 +02:00
+								        search.terms_matching_strategy(TermsMatchingStrategy::default());
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        // all documents should be returned
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids.len(), 4);
 								        search.query("zeroth");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![0]);
 								        search.query("first");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1]);
 								        search.query("second");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![2]);
 								        search.query("third");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![3]);
 								        search.query("field");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1, 2]);
 								        search.query("lol");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![2]);
 								        search.query("object");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert!(documents_ids.is_empty());
 								        search.query("array");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert!(documents_ids.is_empty()); // nested is not searchable
 								        search.query("lied");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert!(documents_ids.is_empty()); // nested is not searchable
 								        // testing the filters
 								        let mut search = crate::Search::new(&rtxn, &index);
 								        search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap());
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1]);
 								        search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap());
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1, 2]);
 								        search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap());
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1]);
 								        search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap());
 								        let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
 								        assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
 								        search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap());
 								        let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
 								        assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
 								    }
 								    #[test]
 								    fn index_documents_with_nested_primary_key() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_primary_key("complex.nested.id".to_owned());
 								            })
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                {
 								                    "complex": {
 								                        "nested": {
 								                            "id": 0,
 								                        },
-												Only flatten the required fields

apply review comments

Co-authored-by: Kerollmops <kero@meilisearch.com>

											
										
										
											2022-04-25 14:09:52 +02:00
+								                    },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                    "title": "The zeroth document",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "complex.nested": {
 								                        "id": 1,
 								                    },
 								                    "title": "The first document",
 								                },
 								                {
 								                    "complex": {
 								                        "nested.id": 2,
 								                    },
 								                    "title": "The second document",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "complex.nested.id": 3,
 								                    "title": "The third document",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								            ]))
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
 								        let rtxn = index.read_txn().unwrap();
 								        // testing the simple query search
 								        let mut search = crate::Search::new(&rtxn, &index);
 								        search.query("document");
-												replace optional_words by term_matching_strategy

											
										
										
											2022-08-22 17:37:36 +02:00
+								        search.terms_matching_strategy(TermsMatchingStrategy::default());
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        // all documents should be returned
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids.len(), 4);
 								        search.query("zeroth");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![0]);
 								        search.query("first");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1]);
 								        search.query("second");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![2]);
 								        search.query("third");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![3]);
 								    }
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
+								    #[test]
 								    fn retrieve_a_b_nested_document_id() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_primary_key("a.b".to_owned());
 								            })
 								            .unwrap();
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
 								        // There must be an issue with the primary key not present in the given document
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap_err();
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
+								    }
 								    #[test]
 								    fn retrieve_a_b_c_nested_document_id() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_primary_key("a.b.c".to_owned());
 								            })
 								            .unwrap();
 								        index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap();
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
 								        let rtxn = index.read_txn().unwrap();
-												Change the original document sorter key from the internal docid to a concatenation of the internal and the external docid

											
										
										
											2023-10-31 16:46:16 +01:00
+								        let all_documents_count = index.all_documents(&rtxn).unwrap().count();
 								        assert_eq!(all_documents_count, 1);
-												Fix tests compilation after changes to ExternalDocumentsIds API

											
										
										
											2023-10-30 13:34:07 +01:00
+								        let external_documents_ids = index.external_documents_ids();
 								        assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
-												Add some tests to check for the nested documents ids

											
										
										
											2022-07-12 12:42:06 +02:00
+								    }
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								    #[test]
 								    fn test_facets_generation() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                {
 								                    "id": 0,
 								                    "dog": {
 								                        "race": {
 								                            "bernese mountain": "zeroth",
 								                        },
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                    },
 								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "id": 1,
 								                    "dog.race": {
 								                        "bernese mountain": "first",
 								                    },
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "id": 2,
 								                    "dog.race.bernese mountain": "second",
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								                },
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								                {
 								                    "id": 3,
 								                    "dog": {
 								                        "race.bernese mountain": "third"
 								                    },
 								                },
 								            ]))
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(hashset!(String::from("dog")));
 								            })
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Fix some facet indexing bugs

											
										
										
											2022-09-01 09:51:43 +02:00
+								        db_snap!(index, facet_id_string_docids, @r###"
 0  first        1  [1, ]
 0  second       1  [2, ]
 0  third        1  [3, ]
 0  zeroth       1  [0, ]
 								        "###);
 								        db_snap!(index, field_id_docid_facet_strings, @r###"
 0    zeroth       zeroth
 1    first        first
 2    second       second
 3    third        third
 								        "###);
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        let hidden = index.faceted_fields(&rtxn).unwrap();
 								        assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain")));
 								        for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
 								            let mut search = crate::Search::new(&rtxn, &index);
 								            let filter = format!(r#""dog.race.bernese mountain" = {s}"#);
 								            search.filter(crate::Filter::from_str(&filter).unwrap().unwrap());
 								            let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								            assert_eq!(documents_ids, vec![i]);
 								        }
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        // Reset the settings
 								        index
 								            .update_settings(|settings| {
 								                settings.reset_filterable_fields();
 								            })
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Fix some facet indexing bugs

											
										
										
											2022-09-01 09:51:43 +02:00
+								        db_snap!(index, facet_id_string_docids, @"");
 								        db_snap!(index, field_id_docid_facet_strings, @"");
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        let facets = index.faceted_fields(&rtxn).unwrap();
 								        assert_eq!(facets, hashset!());
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        // update the settings to test the sortable
 								        index
 								            .update_settings(|settings| {
 								                settings.set_sortable_fields(hashset!(S("dog.race")));
 								            })
 								            .unwrap();
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
-												Fix some facet indexing bugs

											
										
										
											2022-09-01 09:51:43 +02:00
+								        db_snap!(index, facet_id_string_docids, @r###"
 0  first        1  [1, ]
 0  second       1  [2, ]
 0  third        1  [3, ]
 0  zeroth       1  [0, ]
 								        "###);
 								        db_snap!(index, field_id_docid_facet_strings, @r###"
 0    zeroth       zeroth
 1    first        first
 2    second       second
 3    third        third
 								        "###);
-												nested fields

											
										
										
											2022-03-23 17:28:41 +01:00
+								        let rtxn = index.read_txn().unwrap();
 								        let facets = index.faceted_fields(&rtxn).unwrap();
 								        assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain")));
 								        let mut search = crate::Search::new(&rtxn, &index);
 								        search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S(
 								            "dog.race.bernese mountain",
 								        )))]);
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids, vec![1, 2, 3, 0]);
-												Add a test that triggers a panic when indexing zeroes

											
										
										
											2021-07-22 17:14:44 +02:00
+								    }
-												Add test checking the bug reported in meilisearch issue 1716

											
										
										
											2021-09-23 15:55:39 +02:00
 								    #[test]
 								    fn index_2_times_documents_split_by_zero_document_indexation() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
 								        index
 								            .add_documents(documents!([
 								                {"id": 0, "name": "Kerollmops", "score": 78},
 								                {"id": 1, "name": "ManyTheFish", "score": 75},
 								                {"id": 2, "name": "Ferdi", "score": 39},
 								                {"id": 3, "name": "Tommy", "score": 33}
 								            ]))
 								            .unwrap();
-												Add test checking the bug reported in meilisearch issue 1716

											
										
										
											2021-09-23 15:55:39 +02:00
 								        // Check that there are 4 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 4);
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(documents!([])).unwrap();
-												Add test checking the bug reported in meilisearch issue 1716

											
										
										
											2021-09-23 15:55:39 +02:00
 								        // Check that there are 4 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 4);
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								                {"id": 0, "name": "Kerollmops", "score": 78},
 								                {"id": 1, "name": "ManyTheFish", "score": 75},
 								                {"id": 2, "name": "Ferdi", "score": 39},
 								                {"id": 3, "name": "Tommy", "score": 33}
 								            ]))
 								            .unwrap();
-												Add test checking the bug reported in meilisearch issue 1716

											
										
										
											2021-09-23 15:55:39 +02:00
 								        // Check that there are 4 documents now.
 								        let rtxn = index.read_txn().unwrap();
 								        let count = index.number_of_documents(&rtxn).unwrap();
 								        assert_eq!(count, 4);
 								    }
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
-												Allow to disable specialized tokenizations (again)

In PR #2773, I added the `chinese`, `hebrew`, `japanese` and `thai`
feature flags to allow meilisearch to be built without huge specialized
tokenizations that took up 90% of the meilisearch binary size.
Unfortunately, due to some recent changes, this doesn't work anymore.
The problem lies in excessive use of the `default` feature flag, which
infects the dependency graph.

Instead of adding `default-features = false` here and there, it's easier
and more future-proof to not declare `default` in `milli` and
`meilisearch-types`. I've renamed it to `all-tokenizers`, which also
makes it a bit clearer what it's about.

											
										
										
											2023-04-24 00:26:08 +02:00
+								    #[cfg(feature = "chinese")]
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
+								    #[test]
 								    fn test_meilisearch_1714() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index
 								            .add_documents(documents!([
 								              {"id": "123", "title": "小化妆包" },
 								              {"id": "456", "title": "Ipad 包" }
 								            ]))
 								            .unwrap();
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
 								        let rtxn = index.read_txn().unwrap();
 								        // Only the first document should match.
-												Fix #1714 test

											
										
										
											2022-12-13 16:21:31 +01:00
+								        let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
+								        assert_eq!(count, 1);
 								        // Only the second document should match.
-												Fix #1714 test

											
										
										
											2022-12-13 16:21:31 +01:00
+								        let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
+								        assert_eq!(count, 1);
 								        let mut search = crate::Search::new(&rtxn, &index);
 								        search.query("化妆包");
-												replace optional_words by term_matching_strategy

											
										
										
											2022-08-22 17:37:36 +02:00
+								        search.terms_matching_strategy(TermsMatchingStrategy::default());
-												Add failing test related to Meilisearch#1714

											
										
										
											2021-09-28 12:05:11 +02:00
 								        // only 1 document should be returned
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        assert_eq!(documents_ids.len(), 1);
 								    }
-												Add a test to make sure that long words are handled

											
										
										
											2022-04-21 13:45:28 +02:00
 								    /// We try to index documents with words that are too long here,
 								    /// it should not return any error.
 								    #[test]
 								    fn text_with_too_long_words() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
 								        index
 								            .add_documents(documents!([
 								              {"id": 1, "title": "a".repeat(256) },
 								              {"id": 2, "title": "b".repeat(512) },
 								              {"id": 3, "title": format!("{} {}", "c".repeat(250), "d".repeat(250)) },
 								            ]))
 								            .unwrap();
-												Add a test to make sure that long words are handled

											
										
										
											2022-04-21 13:45:28 +02:00
+								    }
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
 								    #[test]
 								    fn text_with_too_long_keys() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
+								        let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3";
 								        // Create 200 documents with a long text
 								        let content = {
-												Fix the tests for the new DocumentsBatchBuilder/Reader

											
										
										
											2022-06-14 16:04:27 +02:00
+								            let documents_iter = (0..200i32)
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
+								                .map(|i| serde_json::json!({ "id": i, "script": script }))
-												Fix the tests for the new DocumentsBatchBuilder/Reader

											
										
										
											2022-06-14 16:04:27 +02:00
+								                .filter_map(|json| match json {
 								                    serde_json::Value::Object(object) => Some(object),
 								                    _ => None,
 								                });
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								            documents_batch_reader_from_objects(documents_iter)
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
+								        };
 								        // Index those 200 long documents
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(content).unwrap();
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        // Index one long document
 								        index
 								            .add_documents(documents!([
 								              {"id": 400, "script": script },
 								            ]))
 								            .unwrap();
-												Add a test for long keys in LMDB

											
										
										
											2022-05-03 10:03:13 +02:00
+								    }
-												add failing test

											
										
										
											2022-06-07 12:24:06 +02:00
 								    #[test]
 								    fn index_documents_in_multiple_transforms() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												add failing test

											
										
										
											2022-06-07 12:24:06 +02:00
 								        let doc1 = documents! {[{
 								            "id": 228142,
 								            "title": "asdsad",
 								            "state": "automated",
 								            "priority": "normal",
 								            "public_uid": "37ccf021",
 								            "project_id": 78207,
 								            "branch_id_number": 0
 								        }]};
 								        let doc2 = documents! {[{
 								            "id": 228143,
 								            "title": "something",
 								            "state": "automated",
 								            "priority": "normal",
 								            "public_uid": "39c6499b",
 								            "project_id": 78207,
 								            "branch_id_number": 0
 								        }]};
-												Fix existing tests

											
										
										
											2022-12-20 11:21:29 +01:00
+								        {
 								            let mut wtxn = index.write_txn().unwrap();
 								            index.put_primary_key(&mut wtxn, "id").unwrap();
 								            wtxn.commit().unwrap();
 								        }
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(doc1).unwrap();
 								        index.add_documents(doc2).unwrap();
-												add failing test

											
										
										
											2022-06-07 12:24:06 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let wtxn = index.read_txn().unwrap();
-												add failing test

											
										
										
											2022-06-07 12:24:06 +02:00
-												Fix tests compilation after changes to ExternalDocumentsIds API

											
										
										
											2023-10-30 13:34:07 +01:00
+								        let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
-												add failing test

											
										
										
											2022-06-07 12:24:06 +02:00
+								        let ids = map.values().collect::<HashSet<_>>();
 								        assert_eq!(ids.len(), map.len());
 								    }
-												Make sur that we do not accept floats as documents ids

											
										
										
											2022-06-15 16:06:52 +02:00
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
+								    #[test]
 								    fn index_documents_check_exists_database() {
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								        let content = || {
 								            documents!([
 								                {
 								                    "id": 0,
 								                    "colour": 0,
 								                },
 								                {
 								                    "id": 1,
 								                    "colour": []
 								                },
 								                {
 								                    "id": 2,
 								                    "colour": {}
 								                },
 								                {
 								                    "id": 3,
 								                    "colour": null
 								                },
 								                {
 								                    "id": 4,
 								                    "colour": [1]
 								                },
 								                {
 								                    "id": 5
 								                },
 								                {
 								                    "id": 6,
 								                    "colour": {
 								                        "green": 1
 								                    }
 								                },
 								                {
 								                    "id": 7,
 								                    "colour": {
 								                        "green": {
 								                            "blue": []
 								                        }
 								                    }
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
+								                }
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								            ])
 								        };
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								        let check_ok = |index: &Index| {
 								            let rtxn = index.read_txn().unwrap();
 								            let facets = index.faceted_fields(&rtxn).unwrap();
 								            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								            let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
 								            let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								            let bitmap_colour =
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								                index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap();
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								            assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6, 7]);
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								            let bitmap_colour_green =
 								                index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								            assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6, 7]);
 								        };
-												Simplify unit tests in facet/filter.rs

											
										
										
											2022-08-04 10:46:10 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let faceted_fields = hashset!(S("colour"));
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
 								        index.add_documents(content()).unwrap();
-												Simplify unit tests in facet/filter.rs

											
										
										
											2022-08-04 10:46:10 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								        check_ok(&index);
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Simplify unit tests in facet/filter.rs

											
										
										
											2022-08-04 10:46:10 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(content()).unwrap();
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								        check_ok(&index);
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
+								    }
-												Merge branch 'filter/field-exist'

											
										
										
											2022-07-21 14:51:41 +02:00
-												Add one more test for the NULL operator

											
										
										
											2023-03-09 13:53:37 +01:00
+								    #[test]
 								    fn index_documents_check_is_null_database() {
 								        let content = || {
 								            documents!([
 								                {
 								                    "id": 0,
 								                    "colour": null,
 								                },
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								                {
 								                    "id": 1,
 								                    "colour": [null], // must not be returned
 								                },
-												Add one more test for the NULL operator

											
										
										
											2023-03-09 13:53:37 +01:00
+								                {
 								                    "id": 6,
 								                    "colour": {
 								                        "green": null
 								                    }
 								                },
 								                {
 								                    "id": 7,
 								                    "colour": {
 								                        "green": {
 								                            "blue": null
 								                        }
 								                    }
 								                },
 								                {
 								                    "id": 8,
 								                    "colour": 0,
 								                },
 								                {
 								                    "id": 9,
 								                    "colour": []
 								                },
 								                {
 								                    "id": 10,
 								                    "colour": {}
 								                },
 								                {
 								                    "id": 12,
 								                    "colour": [1]
 								                },
 								                {
 								                    "id": 13
 								                },
 								                {
 								                    "id": 14,
 								                    "colour": {
 								                        "green": 1
 								                    }
 								                },
 								                {
 								                    "id": 15,
 								                    "colour": {
 								                        "green": {
 								                            "blue": []
 								                        }
 								                    }
 								                }
 								            ])
 								        };
 								        let check_ok = |index: &Index| {
 								            let rtxn = index.read_txn().unwrap();
 								            let facets = index.faceted_fields(&rtxn).unwrap();
 								            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
 								            let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
 								            let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
 								            let colour_blue_id =
 								                index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap();
 								            let bitmap_null_colour =
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								                index.facet_id_is_null_docids.get(&rtxn, &colour_id).unwrap().unwrap();
-												Add one more test for the NULL operator

											
										
										
											2023-03-09 13:53:37 +01:00
+								            assert_eq!(bitmap_null_colour.into_iter().collect::<Vec<_>>(), vec![0]);
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								            let bitmap_colour_green =
 								                index.facet_id_is_null_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								            assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![2]);
-												Add one more test for the NULL operator

											
										
										
											2023-03-09 13:53:37 +01:00
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								            let bitmap_colour_blue =
 								                index.facet_id_is_null_docids.get(&rtxn, &colour_blue_id).unwrap().unwrap();
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								            assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![3]);
-												Add one more test for the NULL operator

											
										
										
											2023-03-09 13:53:37 +01:00
+								        };
 								        let faceted_fields = hashset!(S("colour"));
 								        let index = TempIndex::new();
 								        index.add_documents(content()).unwrap();
 								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
 								        check_ok(&index);
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								        let index = TempIndex::new();
 								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
 								        index.add_documents(content()).unwrap();
 								        check_ok(&index);
 								    }
 								    #[test]
 								    fn index_documents_check_is_empty_database() {
 								        let content = || {
 								            documents!([
 								                {"id": 0, "tags": null },
 								                {"id": 1, "tags": [null] },
 								                {"id": 2, "tags": [] },
 								                {"id": 3, "tags": ["hello","world"] },
 								                {"id": 4, "tags": [""] },
 								                {"id": 5 },
 								                {"id": 6, "tags": {} },
 								                {"id": 7, "tags": {"green": "cool"} },
 								                {"id": 8, "tags": {"green": ""} },
 								                {"id": 9, "tags": "" },
 								                {"id": 10, "tags": { "green": null } },
 								                {"id": 11, "tags": { "green": { "blue": null } } },
 								                {"id": 12, "tags": { "green": { "blue": [] } } }
 								            ])
 								        };
 								        let check_ok = |index: &Index| {
 								            let rtxn = index.read_txn().unwrap();
 								            let facets = index.faceted_fields(&rtxn).unwrap();
 								            assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));
 								            let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap();
 								            let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap();
 								            let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap();
 								            let bitmap_empty_tags =
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								                index.facet_id_is_empty_docids.get(&rtxn, &tags_id).unwrap().unwrap();
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								            assert_eq!(bitmap_empty_tags.into_iter().collect::<Vec<_>>(), vec![2, 6, 9]);
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								            let bitmap_tags_green =
 								                index.facet_id_is_empty_docids.get(&rtxn, &tags_green_id).unwrap().unwrap();
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								            assert_eq!(bitmap_tags_green.into_iter().collect::<Vec<_>>(), vec![8]);
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								            let bitmap_tags_blue =
 								                index.facet_id_is_empty_docids.get(&rtxn, &tags_blue_id).unwrap().unwrap();
-												Improve the testing of the filters

											
										
										
											2023-03-15 14:57:17 +01:00
+								            assert_eq!(bitmap_tags_blue.into_iter().collect::<Vec<_>>(), vec![12]);
 								        };
 								        let faceted_fields = hashset!(S("tags"));
 								        let index = TempIndex::new();
 								        index.add_documents(content()).unwrap();
 								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
 								        check_ok(&index);
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												Simplify unit tests in facet/filter.rs

											
										
										
											2022-08-04 10:46:10 +02:00
+								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(faceted_fields.clone());
 								            })
 								            .unwrap();
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(content()).unwrap();
-												Refactor index_documents_check_exists_database tests

											
										
										
											2022-06-16 08:41:33 +02:00
+								        check_ok(&index);
-												Add a database containing the docids where each field exists

											
										
										
											2022-07-19 09:30:19 +02:00
+								    }
-												Merge branch 'filter/field-exist'

											
										
										
											2022-07-21 14:51:41 +02:00
-												Make sur that we do not accept floats as documents ids

											
										
										
											2022-06-15 16:06:52 +02:00
+								    #[test]
 								    fn primary_key_must_not_contain_floats() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new_with_map_size(4096 * 100);
-												Make sur that we do not accept floats as documents ids

											
										
										
											2022-06-15 16:06:52 +02:00
 								        let doc1 = documents! {[{
 								            "id": -228142,
 								            "title": "asdsad",
 								        }]};
 								        let doc2 = documents! {[{
 								            "id": 228143.56,
 								            "title": "something",
 								        }]};
 								        let doc3 = documents! {[{
 								            "id": -228143.56,
 								            "title": "something",
 								        }]};
 								        let doc4 = documents! {[{
 								            "id": 2.0,
 								            "title": "something",
 								        }]};
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(doc1).unwrap();
 								        index.add_documents(doc2).unwrap_err();
 								        index.add_documents(doc3).unwrap_err();
 								        index.add_documents(doc4).unwrap_err();
-												Make sur that we do not accept floats as documents ids

											
										
										
											2022-06-15 16:06:52 +02:00
+								    }
-												fix: Remove whitespace trimming during document id validation

fix #592

											
										
										
											2022-08-03 11:38:40 +02:00
 								    #[test]
 								    fn primary_key_must_not_contain_whitespace() {
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        let index = TempIndex::new();
-												fix: Remove whitespace trimming during document id validation

fix #592

											
										
										
											2022-08-03 11:38:40 +02:00
 								        let doc1 = documents! {[{
 								            "id": " 1",
 								            "title": "asdsad",
 								        }]};
 								        let doc2 = documents! {[{
 								            "id": "\t2",
 								            "title": "something",
 								        }]};
 								        let doc3 = documents! {[{
 								            "id": "\r3",
 								            "title": "something",
 								        }]};
 								        let doc4 = documents! {[{
 								            "id": "\n4",
 								            "title": "something",
 								        }]};
-												Simplify indexing tests

											
										
										
											2022-08-02 15:13:06 +02:00
+								        index.add_documents(doc1).unwrap_err();
 								        index.add_documents(doc2).unwrap_err();
 								        index.add_documents(doc3).unwrap_err();
 								        index.add_documents(doc4).unwrap_err();
-												fix: Remove whitespace trimming during document id validation

fix #592

											
										
										
											2022-08-03 11:38:40 +02:00
+								    }
-												Add a test to make sure that long words are correctly skipped

											
										
										
											2022-09-07 14:11:44 +02:00
-												Add primary_key_inference test

											
										
										
											2022-12-20 11:21:42 +01:00
+								    #[test]
 								    fn primary_key_inference() {
 								        let index = TempIndex::new();
 								        let doc_no_id = documents! {[{
 								            "title": "asdsad",
 								            "state": "automated",
 								            "priority": "normal",
 								            "branch_id_number": 0
 								        }]};
 								        assert!(matches!(
 								            index.add_documents(doc_no_id),
 								            Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))
 								        ));
 								        let doc_multiple_ids = documents! {[{
 								            "id": 228143,
 								            "title": "something",
 								            "state": "automated",
 								            "priority": "normal",
 								            "public_uid": "39c6499b",
 								            "project_id": 78207,
 								            "branch_id_number": 0
 								        }]};
-												Format let-else ❤️ 🎉

											
										
										
											2023-07-03 10:20:28 +02:00
+								        let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { candidates })) =
 								            index.add_documents(doc_multiple_ids)
 								        else {
 								            panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)")
 								        };
-												Add primary_key_inference test

											
										
										
											2022-12-20 11:21:42 +01:00
 								        assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);
 								        let doc_inferable = documents! {[{
 								            "video": "test.mp4",
 								            "id": 228143,
 								            "title": "something",
 								            "state": "automated",
 								            "priority": "normal",
 								            "public_uid_": "39c6499b",
 								            "project_id_": 78207,
 								            "branch_id_number": 0
 								        }]};
 								        index.add_documents(doc_inferable).unwrap();
 								        let txn = index.read_txn().unwrap();
 								        assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id");
 								    }
-												Add a test to make sure that long words are correctly skipped

											
										
										
											2022-09-07 14:11:44 +02:00
+								    #[test]
 								    fn long_words_must_be_skipped() {
 								        let index = TempIndex::new();
 								        // this is obviously too long
 								        let long_word = "lol".repeat(1000);
 								        let doc1 = documents! {[{
 								            "id": "1",
-												Execute cargo clippy --fix

											
										
										
											2022-10-10 15:28:03 +02:00
+								            "title": long_word,
-												Add a test to make sure that long words are correctly skipped

											
										
										
											2022-09-07 14:11:44 +02:00
+								        }]};
 								        index.add_documents(doc1).unwrap();
 								        let rtxn = index.read_txn().unwrap();
 								        let words_fst = index.words_fst(&rtxn).unwrap();
 								        assert!(!words_fst.contains(&long_word));
 								    }
-												Add test for indexing a document with a long facet value

											
										
										
											2022-11-16 15:19:55 +01:00
 								    /// Adds a document whose filterable `title` is `"lol"` repeated 1000 times.
 								    /// The test passes as long as neither the settings update nor the
 								    /// document addition returns an error.
 								    #[test]
 								    fn long_facet_values_must_not_crash() {
 								        let index = TempIndex::new();
 								        // Make `title` filterable before the document is added.
 								        index
 								            .update_settings(|settings| {
 								                settings.set_filterable_fields(hashset! { S("title") });
 								            })
 								            .unwrap();
 								        // 3 bytes repeated 1000 times: a 3000-byte value, obviously too long.
 								        let oversized_title = "lol".repeat(1000);
 								        let document = documents! {[{
 								            "id": "1",
 								            "title": oversized_title,
 								        }]};
 								        index.add_documents(document).unwrap();
 								    }
-												Add tests for checking that detected script and language associated with document(s) were stored during indexing

											
										
										
											2022-10-14 21:05:53 +02:00
 								    #[cfg(feature = "default")]
 								    #[test]
 								    fn store_detected_script_and_language_per_document_during_indexing() {
 								        use charabia::{Language, Script};
 								        let index = TempIndex::new();
 								        index
 								            .add_documents(documents!([
 								                { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
 								                { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
 								                { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
 								                { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
 								                { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
 								                { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
 								            ]))
 								            .unwrap();
 								        let rtxn = index.read_txn().unwrap();
 								        let key_jpn = (Script::Cj, Language::Jpn);
 								        let key_cmn = (Script::Cj, Language::Cmn);
 								        let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
 								        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
-												Fix tests

											
										
										
											2022-10-19 14:13:10 +02:00
+								        let expected_cj_jpn_docids = [3].iter().collect();
-												Add tests for checking that detected script and language associated with document(s) were stored during indexing

											
										
										
											2022-10-14 21:05:53 +02:00
+								        assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
-												Format code

											
										
										
											2022-10-19 14:18:11 +02:00
+								        let expected_cj_cmn_docids = [1, 5].iter().collect();
-												Add tests for checking that detected script and language associated with document(s) were stored during indexing

											
										
										
											2022-10-14 21:05:53 +02:00
+								        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
 								    }
-												Merge branch 'main' into enhance-language-detection

											
										
										
											2023-02-20 18:14:34 +01:00
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								    #[test]
 								    fn add_and_delete_documents_in_single_transform() {
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
 								            { "id": 1, "doggo": "kevin" },
 								            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
 								            { "id": 3, "name": "jean", "age": 25 },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"3");
 								        let (builder, removed) = builder.remove_documents(vec![S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"1");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 3,
 								            number_of_documents: 2,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"doggo":"kevin"}
 								        {"id":3,"name":"jean","age":25}
 								        "###);
 								    }
 								    #[test]
 								    fn add_update_and_delete_documents_in_single_transform() {
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
 								            { "id": 1, "doggo": "kevin" },
 								            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
 								            { "id": 3, "name": "jean", "age": 25 },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"3");
 								        let documents = documents!([
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								            { "id": 2, "catto": "jorts" },
 								            { "id": 3, "legs": 4 },
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"2");
 								        let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"2");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 5,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        {"id":3,"name":"jean","age":25,"legs":4}
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        "###);
 								    }
 								    #[test]
 								    fn add_document_and_in_another_transform_update_and_delete_documents() {
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
 								            { "id": 1, "doggo": "kevin" },
 								            { "id": 2, "doggo": { "name": "bob", "age": 20 } },
 								            { "id": 3, "name": "jean", "age": 25 },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"3");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 3,
 								            number_of_documents: 3,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"doggo":"kevin"}
 								        {"id":2,"doggo":{"name":"bob","age":20}}
 								        {"id":3,"name":"jean","age":25}
 								        "###);
 								        // A first batch of documents has been inserted
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								            { "id": 2, "catto": "jorts" },
 								            { "id": 3, "legs": 4 },
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"2");
 								        let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"2");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 2,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        {"id":3,"name":"jean","age":25,"legs":4}
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        "###);
 								    }
 								    #[test]
 								    fn delete_document_and_then_add_documents_in_the_same_transform() {
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												provide a new method on the transform to remove documents

											
										
										
											2023-02-08 16:06:09 +01:00
+								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"0");
 								        let documents = documents!([
 								            { "id": 2, "doggo": { "name": "jean", "age": 20 } },
 								            { "id": 3, "name": "bob", "age": 25 },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"2");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 2,
 								            number_of_documents: 2,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":2,"doggo":{"name":"jean","age":20}}
 								        {"id":3,"name":"bob","age":25}
 								        "###);
 								    }
-												add a test to ensure we handle correctly a deletion of multiple time the same document

											
										
										
											2023-02-08 21:03:34 +01:00
 								    #[test]
 								    fn delete_the_same_document_multiple_time() {
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
+								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-												add a test to ensure we handle correctly a deletion of multiple time the same document

											
										
										
											2023-02-08 21:03:34 +01:00
+								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let (builder, removed) =
 								            builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"0");
 								        let documents = documents!([
 								            { "id": 1, "doggo": "kevin" },
 								            { "id": 2, "doggo": { "name": "jean", "age": 20 } },
 								            { "id": 3, "name": "bob", "age": 25 },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"3");
 								        let (builder, removed) =
 								            builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"2");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 3,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":3,"name":"bob","age":25}
 								        "###);
 								    }
-												Fix a bug when you update a document that was already present in the db, deleted and then inserted again in the same transform

											
										
										
											2023-02-14 19:09:40 +01:00
 								    /// Adds a document in a first transaction, then, in a second transaction, removes it
 								    /// and re-adds a document with the same external id through a single builder.
 								    ///
 								    /// The index uses the `UpdateDocuments` method, yet the snapshots expect the final
 								    /// document to contain only the re-added fields: the `doggo` field of the removed
 								    /// version is not present in the result.
 								    #[test]
 								    fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
 								        let mut index = TempIndex::new();
 								        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 								        // First transaction: insert the initial version of document 1.
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
 								            { "id": 1, "doggo": "kevin" },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"1");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 1,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"doggo":"kevin"}
 								        "###);
 								        // A first batch of documents has been inserted
 								        // Second transaction: remove document 1, then add it back with different
 								        // fields, both operations going through the same builder before `execute`.
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        // The removal is expected to report one removed document.
 								        let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"1");
 								        let documents = documents!([
 								            { "id": 1, "catto": "jorts" },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"1");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 1,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        // Only the re-added version must remain: no `doggo` field is expected.
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"catto":"jorts"}
 								        "###);
 								    }
-												Fix indexing of word_position_docid and fid

											
										
										
											2023-04-05 14:55:02 +02:00
 								    #[test]
 								    fn test_word_fid_position() {
 								        let index = TempIndex::new();
 								        index
 								            .add_documents(documents!([
 								              {"id": 0, "text": "sun flowers are looking at the sun" },
 								              {"id": 1, "text": "sun flowers are looking at the sun" },
 								              {"id": 2, "text": "the sun is shining today" },
 								              {
 								                "id": 3,
 								                "text": "a a a a a a a a a a a a a a a a a
-												Remove docid_word_positions_db + fix deletion bug

That would happen when a word was deleted from all exact attributes
but not all regular attributes.

											
										
										
											2023-06-07 10:02:21 +02:00
+								                a a a a a a a a a a a a a a a a a a a a a a a a a a
 								                a a a a a a a a a a a a a a a a a a a a a a a a a a
 								                a a a a a a a a a a a a a a a a a a a a a a a a a a
 								                a a a a a a a a a a a a a a a a a a a a a a a a a a
 								                a a a a a a a a a a a a a a a a a a a a a a a a a a
-												Fix indexing of word_position_docid and fid

											
										
										
											2023-04-05 14:55:02 +02:00
+								                a a a a a a a a a a a a a a a a a a a a a "
 								             }
 								            ]))
 								            .unwrap();
 								        db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
 								        db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");
 								        index
 								            .add_documents(documents!([
 								              {"id": 4, "text": "sun flowers are looking at the sun" },
 								              {"id": 5, "text2": "sun flowers are looking at the sun" },
 								              {"id": 6, "text": "b b b" },
 								              {
 								                "id": 7,
 								                "text2": "a a a a"
 								             }
 								            ]))
 								            .unwrap();
 								        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
 								        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
 								        // Delete only some of the documents, not all of them.
-												remove more warnings and fix some tests

											
										
										
											2023-10-25 14:49:25 +02:00
+								        index.delete_documents(vec!["0".into(), "3".into()]);
-												Fix indexing of word_position_docid and fid

											
										
										
											2023-04-05 14:55:02 +02:00
 								        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
 								        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
 								    }
-												fix the addition + deletion bug

											
										
										
											2023-05-17 18:19:43 +02:00
-												Add a test case scenario

											
										
										
											2023-09-06 11:49:27 +02:00
+								    /// Index multiple different number of vectors in documents.
 								    /// Vectors must be of the same length.
 								    #[test]
 								    fn test_multiple_vectors() {
-												Fix tests

											
										
										
											2023-12-20 17:06:50 +01:00
+								        use crate::vector::settings::EmbeddingSettings;
-												Add a test case scenario

											
										
										
											2023-09-06 11:49:27 +02:00
+								        let index = TempIndex::new();
-												Tests pass

											
										
										
											2023-12-13 21:49:13 +01:00
+								        index
 								            .update_settings(|settings| {
 								                let mut embedders = BTreeMap::default();
 								                embedders.insert(
 								                    "manual".to_string(),
 								                    Setting::Set(EmbeddingSettings {
-												Fix tests

											
										
										
											2023-12-20 17:06:50 +01:00
+								                        source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided),
 								                        model: Setting::NotSet,
 								                        revision: Setting::NotSet,
 								                        api_key: Setting::NotSet,
 								                        dimensions: Setting::Set(3),
-												Tests pass

											
										
										
											2023-12-13 21:49:13 +01:00
+								                        document_template: Setting::NotSet,
-												Expose REST embedder to the API

											
										
										
											2024-03-25 10:05:38 +01:00
+								                        url: Setting::NotSet,
 								                        query: Setting::NotSet,
 								                        input_field: Setting::NotSet,
 								                        path_to_embeddings: Setting::NotSet,
 								                        embedding_object: Setting::NotSet,
 								                        input_type: Setting::NotSet,
-												Tests pass

											
										
										
											2023-12-13 21:49:13 +01:00
+								                    }),
 								                );
 								                settings.set_embedder_settings(embedders);
 								            })
 								            .unwrap();
 								        index
 								            .add_documents(
 								                documents!([{"id": 0, "_vectors": { "manual": [[0, 1, 2], [3, 4, 5]] } }]),
 								            )
 								            .unwrap();
 								        index.add_documents(documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}])).unwrap();
-												Add a test case scenario

											
										
										
											2023-09-06 11:49:27 +02:00
+								        index
-												Fix tests

											
										
										
											2023-12-20 17:06:50 +01:00
+								               .add_documents(
 								                   documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]),
 								               )
 								               .unwrap();
-												Add a test case scenario

											
										
										
											2023-09-06 11:49:27 +02:00
 								        let rtxn = index.read_txn().unwrap();
-												Small commit to add hybrid search and autoembedding

											
										
										
											2023-11-15 15:46:37 +01:00
+								        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap();
-												Add a test case scenario

											
										
										
											2023-09-06 11:49:27 +02:00
+								        assert_eq!(res.documents_ids.len(), 3);
 								    }
-												fix the addition + deletion bug

											
										
										
											2023-05-17 18:19:43 +02:00
+								    #[test]
 								    fn reproduce_the_bug() {
 								        /*
 								            [milli/examples/fuzz.rs:69] &batches = [
 								            Batch(
 								                [
 								                    AddDoc(
 								                        { "id": 1, "doggo": "bernese" }, => internal 0
 								                    ),
 								                ],
 								            ),
 								            Batch(
 								                [
 								                    DeleteDoc(
 , => delete internal 0
 								                    ),
 								                    AddDoc(
 								                        { "id": 0, "catto": "jorts" }, => internal 1
 								                    ),
 								                ],
 								            ),
 								            Batch(
 								                [
 								                    AddDoc(
 								                        { "id": 1, "catto": "jorts" }, => internal 2
 								                    ),
 								                ],
 								            ),
 								        ]
 								        */
-												remove more warnings and fix some tests

											
										
										
											2023-10-25 14:49:25 +02:00
+								        let index = TempIndex::new();
-												fix the addition + deletion bug

											
										
										
											2023-05-17 18:19:43 +02:00
 								        // START OF BATCH
 								        println!("--- ENTERING BATCH 1");
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        // OP
 								        let documents = documents!([
 								            { "id": 1, "doggo": "bernese" },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"1");
 								        // FINISHING
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 1,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"doggo":"bernese"}
 								        "###);
 								        db_snap!(index, external_documents_ids, @r###"
-												Fix some snapshots

											
										
										
											2023-10-25 18:02:43 +02:00
+								        docids:
-												fix the addition + deletion bug

											
										
										
											2023-05-17 18:19:43 +02:00
+0
 								        "###);
 								        // A first batch of documents has been inserted
 								        // BATCH 2
 								        println!("--- ENTERING BATCH 2");
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
 								        insta::assert_display_snapshot!(removed.unwrap(), @"1");
 								        let documents = documents!([
 								            { "id": 0, "catto": "jorts" },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"1");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 1,
 								            number_of_documents: 1,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":0,"catto":"jorts"}
 								        "###);
 								        db_snap!(index, external_documents_ids, @r###"
-												Fix some snapshots

											
										
										
											2023-10-25 18:02:43 +02:00
+								        docids:
-												fix the addition + deletion bug

											
										
										
											2023-05-17 18:19:43 +02:00
+1
 								        "###);
 								        // BATCH 3
 								        println!("--- ENTERING BATCH 3");
 								        let mut wtxn = index.write_txn().unwrap();
 								        let builder = IndexDocuments::new(
 								            &mut wtxn,
 								            &index,
 								            &index.indexer_config,
 								            index.index_documents_config.clone(),
 								            |_| (),
 								            || false,
 								        )
 								        .unwrap();
 								        let documents = documents!([
 								            { "id": 1, "catto": "jorts" },
 								        ]);
 								        let (builder, added) = builder.add_documents(documents).unwrap();
 								        insta::assert_display_snapshot!(added.unwrap(), @"1");
 								        let addition = builder.execute().unwrap();
 								        insta::assert_debug_snapshot!(addition, @r###"
 								        DocumentAdditionResult {
 								            indexed_documents: 1,
 								            number_of_documents: 2,
 								        }
 								        "###);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents, @r###"
 								        {"id":1,"catto":"jorts"}
 								        {"id":0,"catto":"jorts"}
 								        "###);
 								        // Ensuring all the returned IDs actually exists
 								        let rtxn = index.read_txn().unwrap();
 								        let res = index.search(&rtxn).execute().unwrap();
 								        index.documents(&rtxn, res.documents_ids).unwrap();
 								    }
-												Recover delete_documents tests that were too eagerly deleted

											
										
										
											2023-10-26 12:16:16 +02:00
 								    fn delete_documents<'t>(
-												Fix the tests

											
										
										
											2023-11-23 12:07:35 +01:00
+								        wtxn: &mut RwTxn<'t>,
-												Recover delete_documents tests that were too eagerly deleted

											
										
										
											2023-10-26 12:16:16 +02:00
+								        index: &'t TempIndex,
 								        external_ids: &[&str],
 								    ) -> Vec<u32> {
-												Fix tests compilation after changes to ExternalDocumentsIds API

											
										
										
											2023-10-30 13:34:07 +01:00
+								        let external_document_ids = index.external_documents_ids();
-												Recover delete_documents tests that were too eagerly deleted

											
										
										
											2023-10-26 12:16:16 +02:00
+								        let ids_to_delete: Vec<u32> = external_ids
 								            .iter()
-												Fix clippy issues

											
										
										
											2023-11-06 11:19:31 +01:00
+								            .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
-												Recover delete_documents tests that were too eagerly deleted

											
										
										
											2023-10-26 12:16:16 +02:00
+								            .collect();
 								        // Delete some documents.
 								        index.delete_documents_using_wtxn(
 								            wtxn,
 								            external_ids.iter().map(ToString::to_string).collect(),
 								        );
 								        ids_to_delete
 								    }
 								    /// Adds three documents keyed by a numeric `id`, deletes all of them in the same
 								    /// write transaction and snapshots the resulting databases.
 								    #[test]
 								    fn delete_documents_with_numbers_as_primary_key() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        let batch = documents!([
 								            { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
 								            { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
 								            { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();
 								        // Delete every document that was just added: external ids 0, 1, and 2.
 								        let every_external_id = vec![S("0"), S("1"), S("2")];
 								        index.delete_documents_using_wtxn(&mut wtxn, every_external_id);
 								        wtxn.commit().unwrap();
 								        // All these snapshots should be empty since the database was cleared
 								        db_snap!(index, documents_ids);
 								        db_snap!(index, word_docids);
 								        db_snap!(index, word_pair_proximity_docids);
 								        db_snap!(index, facet_id_exists_docids);
 								        // No field may remain in the distribution either.
 								        let rtxn = index.read_txn().unwrap();
 								        let distribution = index.field_distribution(&rtxn).unwrap();
 								        assert!(distribution.is_empty());
 								    }
 								    /// Adds three documents whose identifier field is `mysuperid`, deletes two of them
 								    /// in a second transaction and snapshots the resulting databases.
 								    #[test]
 								    fn delete_documents_with_strange_primary_key() {
 								        let index = TempIndex::new();
 								        // Only `name` is searchable.
 								        index
 								            .update_settings(|settings| {
 								                settings.set_searchable_fields(vec![S("name")]);
 								            })
 								            .unwrap();
 								        // First transaction: insert the documents.
 								        let mut wtxn = index.write_txn().unwrap();
 								        let batch = documents!([
 								            { "mysuperid": 0, "name": "kevin" },
 								            { "mysuperid": 1, "name": "kevina" },
 								            { "mysuperid": 2, "name": "benoit" }
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();
 								        wtxn.commit().unwrap();
 								        // Second transaction: delete not all of the documents but some of them.
 								        let mut wtxn = index.write_txn().unwrap();
 								        let some_external_ids = vec![S("0"), S("1")];
 								        index.delete_documents_using_wtxn(&mut wtxn, some_external_ids);
 								        wtxn.commit().unwrap();
 								        db_snap!(index, documents_ids);
 								        db_snap!(index, word_docids);
 								        db_snap!(index, word_pair_proximity_docids);
 								    }
 								    /// Deletes the only document labelled `sign` and checks that a filtered
 								    /// placeholder search on `label = sign` returns nothing, then snapshots the
 								    /// word and facet databases.
 								    #[test]
 								    fn filtered_placeholder_search_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("docid"));
 								                settings.set_filterable_fields(hashset! { S("label"), S("label2") });
 								            })
 								            .unwrap();
 								        index
 								            .add_documents_using_wtxn(
 								                &mut wtxn,
 								                documents!([
 								                    { "docid": "1_4",  "label": ["sign"] },
 								                    { "docid": "1_5",  "label": ["letter"] },
 								                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
 								                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
 								                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
 								                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
 								                    { "docid": "1_39", "label": ["abstract"] },
 								                    { "docid": "1_40", "label": ["cartoon"] },
 								                    { "docid": "1_41", "label": ["art","drawing"] },
 								                    { "docid": "1_42", "label": ["art","pattern"] },
 								                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
 								                    { "docid": "1_44", "label": ["drawing"] },
 								                    { "docid": "1_45", "label": ["art"] },
 								                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
 								                    { "docid": "1_47", "label": ["abstract","pattern"] },
 								                    { "docid": "1_52", "label": ["abstract","cartoon"] },
 								                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
 								                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
 								                    { "docid": "1_68", "label": ["design"] },
 								                    { "docid": "1_69", "label": ["geometry"] },
 								                    { "docid": "1_70", "label2": ["geometry", 1.2] },
 								                    { "docid": "1_71", "label2": ["design", 2.2] },
 								                    { "docid": "1_72", "label2": ["geometry", 1.2] }
 								                ]),
 								            )
 								            .unwrap();
 								        // "1_4" is the only document carrying the `sign` label.
 								        delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
 								        // Placeholder search with filter
 								        let filter = Filter::from_str("label = sign").unwrap().unwrap();
 								        let results = index.search(&wtxn).filter(filter).execute().unwrap();
 								        assert!(results.documents_ids.is_empty());
 								        wtxn.commit().unwrap();
 								        db_snap!(index, word_docids);
 								        db_snap!(index, facet_id_f64_docids);
 								        db_snap!(index, word_pair_proximity_docids);
 								        db_snap!(index, facet_id_exists_docids);
 								        db_snap!(index, facet_id_string_docids);
 								    }
 								    /// Deletes one document and checks that a placeholder search (no query, no filter)
 								    /// still returns documents, none of which is the deleted one.
 								    #[test]
 								    fn placeholder_search_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("docid"));
 								            })
 								            .unwrap();
 								        let batch = documents!([
 								            { "docid": "1_4",  "label": ["sign"] },
 								            { "docid": "1_5",  "label": ["letter"] },
 								            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
 								            { "docid": "1_36", "label": ["drawing","painting","pattern"] },
 								            { "docid": "1_37", "label": ["art","drawing","outdoor"] },
 								            { "docid": "1_38", "label": ["aquarium","art","drawing"] },
 								            { "docid": "1_39", "label": ["abstract"] },
 								            { "docid": "1_40", "label": ["cartoon"] },
 								            { "docid": "1_41", "label": ["art","drawing"] },
 								            { "docid": "1_42", "label": ["art","pattern"] },
 								            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
 								            { "docid": "1_44", "label": ["drawing"] },
 								            { "docid": "1_45", "label": ["art"] },
 								            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
 								            { "docid": "1_47", "label": ["abstract","pattern"] },
 								            { "docid": "1_52", "label": ["abstract","cartoon"] },
 								            { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
 								            { "docid": "1_58", "label": ["abstract","art","cartoon"] },
 								            { "docid": "1_68", "label": ["design"] },
 								            { "docid": "1_69", "label": ["geometry"] },
 								            { "docid": "1_70", "label2": ["geometry", 1.2] },
 								            { "docid": "1_71", "label2": ["design", 2.2] },
 								            { "docid": "1_72", "label2": ["geometry", 1.2] }
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();
 								        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
 								        // Placeholder search
 								        let found = index.search(&wtxn).execute().unwrap();
 								        assert!(!found.documents_ids.is_empty());
 								        for id in &found.documents_ids {
 								            assert!(
 								                !deleted_internal_ids.contains(id),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        wtxn.commit().unwrap();
 								    }
 								    /// Deletes two documents labelled `abstract` and checks that searching for
 								    /// "abstract" still returns documents, none of which is a deleted one.
 								    #[test]
 								    fn search_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("docid"));
 								            })
 								            .unwrap();
 								        let batch = documents!([
 								            { "docid": "1_4",  "label": ["sign"] },
 								            { "docid": "1_5",  "label": ["letter"] },
 								            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
 								            { "docid": "1_36", "label": ["drawing","painting","pattern"] },
 								            { "docid": "1_37", "label": ["art","drawing","outdoor"] },
 								            { "docid": "1_38", "label": ["aquarium","art","drawing"] },
 								            { "docid": "1_39", "label": ["abstract"] },
 								            { "docid": "1_40", "label": ["cartoon"] },
 								            { "docid": "1_41", "label": ["art","drawing"] },
 								            { "docid": "1_42", "label": ["art","pattern"] },
 								            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
 								            { "docid": "1_44", "label": ["drawing"] },
 								            { "docid": "1_45", "label": ["art"] },
 								            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
 								            { "docid": "1_47", "label": ["abstract","pattern"] },
 								            { "docid": "1_52", "label": ["abstract","cartoon"] },
 								            { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
 								            { "docid": "1_58", "label": ["abstract","art","cartoon"] },
 								            { "docid": "1_68", "label": ["design"] },
 								            { "docid": "1_69", "label": ["geometry"] },
 								            { "docid": "1_70", "label2": ["geometry", 1.2] },
 								            { "docid": "1_71", "label2": ["design", 2.2] },
 								            { "docid": "1_72", "label2": ["geometry", 1.2] }
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();
 								        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
 								        // search for abstract
 								        let found = index.search(&wtxn).query("abstract").execute().unwrap();
 								        assert!(!found.documents_ids.is_empty());
 								        for id in &found.documents_ids {
 								            assert!(
 								                !deleted_internal_ids.contains(id),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        wtxn.commit().unwrap();
 								    }
 								    /// Deletes several geo-located documents and checks that a placeholder search
 								    /// restricted by a `_geoRadius` filter still returns documents, none of which is a
 								    /// deleted one, then snapshots the facet databases.
 								    #[test]
 								    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        // `_geo` must be filterable and sortable for the test below.
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("id"));
 								                settings.set_filterable_fields(hashset!(S("_geo")));
 								                settings.set_sortable_fields(hashset!(S("_geo")));
 								            })
 								            .unwrap();
 								        let cities = documents!([
 								            { "id": "1",  "city": "Lille",             "_geo": { "lat": 50.6299, "lng": 3.0569 } },
 								            { "id": "2",  "city": "Mons-en-Barœul",    "_geo": { "lat": 50.6415, "lng": 3.1106 } },
 								            { "id": "3",  "city": "Hellemmes",         "_geo": { "lat": 50.6312, "lng": 3.1106 } },
 								            { "id": "4",  "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
 								            { "id": "5",  "city": "Hem",               "_geo": { "lat": 50.6552, "lng": 3.1897 } },
 								            { "id": "6",  "city": "Roubaix",           "_geo": { "lat": 50.6924, "lng": 3.1763 } },
 								            { "id": "7",  "city": "Tourcoing",         "_geo": { "lat": 50.7263, "lng": 3.1541 } },
 								            { "id": "8",  "city": "Mouscron",          "_geo": { "lat": 50.7453, "lng": 3.2206 } },
 								            { "id": "9",  "city": "Tournai",           "_geo": { "lat": 50.6053, "lng": 3.3758 } },
 								            { "id": "10", "city": "Ghent",             "_geo": { "lat": 51.0537, "lng": 3.6957 } },
 								            { "id": "11", "city": "Brussels",          "_geo": { "lat": 50.8466, "lng": 4.3370 } },
 								            { "id": "12", "city": "Charleroi",         "_geo": { "lat": 50.4095, "lng": 4.4347 } },
 								            { "id": "13", "city": "Mons",              "_geo": { "lat": 50.4502, "lng": 3.9623 } },
 								            { "id": "14", "city": "Valenciennes",      "_geo": { "lat": 50.3518, "lng": 3.5326 } },
 								            { "id": "15", "city": "Arras",             "_geo": { "lat": 50.2844, "lng": 2.7637 } },
 								            { "id": "16", "city": "Cambrai",           "_geo": { "lat": 50.1793, "lng": 3.2189 } },
 								            { "id": "17", "city": "Bapaume",           "_geo": { "lat": 50.1112, "lng": 2.8547 } },
 								            { "id": "18", "city": "Amiens",            "_geo": { "lat": 49.9314, "lng": 2.2710 } },
 								            { "id": "19", "city": "Compiègne",         "_geo": { "lat": 49.4449, "lng": 2.7913 } },
 								            { "id": "20", "city": "Paris",             "_geo": { "lat": 48.9021, "lng": 2.3708 } }
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, cities).unwrap();
 								        let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
 								        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
 								        // Placeholder search with geo filter
 								        let geo_filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
 								        let found = index.search(&wtxn).filter(geo_filter).execute().unwrap();
 								        assert!(!found.documents_ids.is_empty());
 								        for id in &found.documents_ids {
 								            assert!(
 								                !deleted_internal_ids.contains(id),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        wtxn.commit().unwrap();
 								        db_snap!(index, facet_id_f64_docids);
 								        db_snap!(index, facet_id_string_docids);
 								    }
 								    /// Deletes two documents and checks that they are no longer reachable through
 								    /// `all_documents`, `documents_ids`, or the external-to-internal id mapping.
 								    #[test]
 								    fn get_documents_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("docid"));
 								            })
 								            .unwrap();
 								        index
 								            .add_documents_using_wtxn(
 								                &mut wtxn,
 								                documents!([
 								                    { "docid": "1_4",  "label": ["sign"] },
 								                    { "docid": "1_5",  "label": ["letter"] },
 								                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
 								                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
 								                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
 								                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
 								                    { "docid": "1_39", "label": ["abstract"] },
 								                    { "docid": "1_40", "label": ["cartoon"] },
 								                    { "docid": "1_41", "label": ["art","drawing"] },
 								                    { "docid": "1_42", "label": ["art","pattern"] },
 								                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
 								                    { "docid": "1_44", "label": ["drawing"] },
 								                    { "docid": "1_45", "label": ["art"] },
 								                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
 								                    { "docid": "1_47", "label": ["abstract","pattern"] },
 								                    { "docid": "1_52", "label": ["abstract","cartoon"] },
 								                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
 								                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
 								                    { "docid": "1_68", "label": ["design"] },
 								                    { "docid": "1_69", "label": ["geometry"] },
 								                    { "docid": "1_70", "label2": ["geometry", 1.2] },
 								                    { "docid": "1_71", "label2": ["design", 2.2] },
 								                    { "docid": "1_72", "label2": ["geometry", 1.2] }
 								                ]),
 								            )
 								            .unwrap();
 								        let deleted_external_ids = ["1_7", "1_52"];
 								        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
 								        // list all documents
 								        let results = index.all_documents(&wtxn).unwrap();
 								        for result in results {
 								            let (id, _) = result.unwrap();
 								            assert!(
 								                !deleted_internal_ids.contains(&id),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        // list internal document ids
 								        let results = index.documents_ids(&wtxn).unwrap();
 								        for id in results {
 								            assert!(
 								                !deleted_internal_ids.contains(&id),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        wtxn.commit().unwrap();
 								        // The remaining checks run on a fresh read transaction, after the commit.
 								        let rtxn = index.read_txn().unwrap();
 								        // get internal docids from deleted external document ids
 								        let results = index.external_documents_ids();
 								        for id in deleted_external_ids {
 								            assert!(
 								                results.get(&rtxn, id).unwrap().is_none(),
 								                "The document {} was supposed to be deleted",
 								                id
 								            );
 								        }
 								        drop(rtxn);
 								    }
 								    /// Adds twenty documents, deletes two of them and checks the document count and
 								    /// the per-field distribution reported by the index.
 								    #[test]
 								    fn stats_should_not_return_deleted_documents() {
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        index
 								            .update_settings_using_wtxn(&mut wtxn, |settings| {
 								                settings.set_primary_key(S("docid"));
 								            })
 								            .unwrap();
 								        let batch = documents!([
 								            { "docid": "1_4",  "label": ["sign"]},
 								            { "docid": "1_5",  "label": ["letter"]},
 								            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
 								            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
 								            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
 								            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
 								            { "docid": "1_39", "label": ["abstract"]},
 								            { "docid": "1_40", "label": ["cartoon"]},
 								            { "docid": "1_41", "label": ["art","drawing"]},
 								            { "docid": "1_42", "label": ["art","pattern"]},
 								            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
 								            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
 								            { "docid": "1_45", "label": ["art"]},
 								            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
 								            { "docid": "1_47", "label": ["abstract","pattern"]},
 								            { "docid": "1_52", "label": ["abstract","cartoon"]},
 								            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
 								            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
 								            { "docid": "1_68", "label": ["design"]},
 								            { "docid": "1_69", "label": ["geometry"]}
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, batch).unwrap();
 								        // "1_7" (the only "Mickey Mouse" title) and "1_52" are removed.
 								        delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
 								        // count internal documents
 								        let document_count = index.number_of_documents(&wtxn).unwrap();
 								        assert_eq!(18, document_count);
 								        // count field distribution
 								        let distribution = index.field_distribution(&wtxn).unwrap();
 								        assert_eq!(Some(&18), distribution.get("label"));
 								        assert_eq!(Some(&1), distribution.get("title"));
 								        assert_eq!(Some(&2), distribution.get("number"));
 								        wtxn.commit().unwrap();
 								    }
 								    /// Checks that the document ids stored for the (`Script::Cj`, `Language::Cmn`) pair
 								    /// no longer contain a document once it has been deleted.
 								    #[test]
 								    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
 								        use charabia::{Language, Script};
 								        let index = TempIndex::new();
 								        let mut wtxn = index.write_txn().unwrap();
 								        let multilingual_batch = documents!([
 								            { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
 								            { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
 								            { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
 								            { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
 								            { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
 								            { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
 								        ]);
 								        index.add_documents_using_wtxn(&mut wtxn, multilingual_batch).unwrap();
 								        let key_cmn = (Script::Cj, Language::Cmn);
 								        // Before the deletion, documents 1 and 5 are expected for this pair.
 								        let mut expected_before_deletion = RoaringBitmap::new();
 								        expected_before_deletion.push(1);
 								        expected_before_deletion.push(5);
 								        let stored_before_deletion =
 								            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
 								        assert_eq!(stored_before_deletion, expected_before_deletion);
 								        delete_documents(&mut wtxn, &index, &["1"]);
 								        wtxn.commit().unwrap();
 								        // After the deletion, only document 5 is expected to remain.
 								        let rtxn = index.read_txn().unwrap();
 								        let mut expected_after_deletion = RoaringBitmap::new();
 								        expected_after_deletion.push(5);
 								        let stored_after_deletion =
 								            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
 								        assert_eq!(stored_after_deletion, expected_after_deletion);
 								    }
 								    /// Indexes the word "hello" once in a regular attribute and once in an exact
 								    /// attribute, deletes the document holding the exact one, and checks the word
 								    /// databases, the words FST and the search results before and after.
 								    #[test]
 								    fn delete_words_exact_attributes() {
 								        let index = TempIndex::new();
 								        index
 								            .update_settings(|settings| {
 								                settings.set_primary_key(S("id"));
 								                settings.set_searchable_fields(vec![S("text"), S("exact")]);
 								                settings.set_exact_attributes(std::iter::once(S("exact")).collect());
 								            })
 								            .unwrap();
 								        let batch = documents!([
 								            { "id": 0, "text": "hello" },
 								            { "id": 1, "exact": "hello"}
 								        ]);
 								        index.add_documents(batch).unwrap();
 								        // State right after the addition.
 								        db_snap!(index, word_docids, 1, @r###"
 								        hello            [0, ]
 								        "###);
 								        db_snap!(index, exact_word_docids, 1, @r###"
 								        hello            [1, ]
 								        "###);
 								        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
 								        // Delete the document that only has the exact attribute.
 								        let mut wtxn = index.write_txn().unwrap();
 								        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
 								        wtxn.commit().unwrap();
 								        // State after the deletion.
 								        db_snap!(index, word_docids, 2, @r###"
 								        hello            [0, ]
 								        "###);
 								        db_snap!(index, exact_word_docids, 2, @"");
 								        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
 								        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
 								        let txn = index.read_txn().unwrap();
 								        let words_fst = index.words_fst(&txn).unwrap();
 								        let words = words_fst.into_stream().into_strs().unwrap();
 								        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
 								        // Searching "hello" must only return the remaining document.
 								        let mut search = Search::new(&txn, &index);
 								        search.query("hello");
 								        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
 								        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
 								    }
-												Fix a documents indexing bug and add a test

											
										
										
											2020-10-30 12:14:25 +01:00
+								}