From da48506f151adf79d398d89bf8ac24e249cc0338 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 7 Mar 2023 18:35:26 +0100 Subject: [PATCH 01/11] Rerun extraction when language detection might have failed --- .../extract/extract_docid_word_positions.rs | 177 ++++++++++++++---- 1 file changed, 143 insertions(+), 34 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..5a103f1e0 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -3,12 +3,14 @@ use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder}; +use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; +use obkv::KvReader; use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; +use crate::update::index_documents::MergeFn; use crate::{ absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, }; @@ -33,7 +35,7 @@ pub fn extract_docid_word_positions( let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); - let mut script_language_pair = HashMap::new(); + let mut script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, concat_u32s_array, @@ -45,11 +47,11 @@ pub fn extract_docid_word_positions( let mut key_buffer = Vec::new(); let mut field_buffer = String::new(); - let mut builder = TokenizerBuilder::new(); + let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { - builder.stop_words(stop_words); + tokenizer_builder.stop_words(stop_words); } - let tokenizer = builder.build(); + let tokenizer = tokenizer_builder.build(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{
@@ -57,49 +59,120 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
             .try_into()
             .map(u32::from_be_bytes)
             .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-        let obkv = obkv::KvReader::<FieldId>::new(value);
+        let obkv = KvReader::<FieldId>::new(value);
         documents_ids.push(document_id);
         key_buffer.clear();
         key_buffer.extend_from_slice(&document_id.to_be_bytes());
 
-        for (field_id, field_bytes) in obkv.iter() {
-            if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
-                let value =
-                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
-                field_buffer.clear();
-                if let Some(field) = json_to_string(&value, &mut field_buffer) {
-                    let tokens = process_tokens(tokenizer.tokenize(field))
-                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
+        let mut script_language_word_count = HashMap::new();
 
-                    for (index, token) in tokens {
-                        if let Some(language) = token.language {
-                            let script = token.script;
-                            let entry = script_language_pair
-                                .entry((script, language))
-                                .or_insert_with(RoaringBitmap::new);
-                            entry.push(document_id);
-                        }
-                        let token = token.lemma().trim();
-                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                            key_buffer.truncate(mem::size_of::<u32>());
-                            key_buffer.extend_from_slice(token.as_bytes());
+        extract_tokens_from_document(
+            &obkv,
+            searchable_fields,
+            &tokenizer,
+            max_positions_per_attributes,
+            &mut key_buffer,
+            &mut field_buffer,
+            &mut script_language_word_count,
+            &mut docid_word_positions_sorter,
+        )?;
 
-                            let position: u16 = index
-                                .try_into()
-                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                            let position = absolute_from_relative_position(field_id, position);
-                            docid_word_positions_sorter
-                                .insert(&key_buffer, position.to_ne_bytes())?;
+        // if we detect a potetial mistake in the language detection,
+        // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
+        // context: https://github.com/meilisearch/meilisearch/issues/3565
+        if script_language_word_count.values().any(potential_language_detection_error) {
+            // build an allow list with the most frequent detected languages in the document.
+            let script_language: HashMap<_, _> =
+                script_language_word_count.iter().filter_map(most_frequent_languages).collect();
+
+            // if the allow list is empty, meaning that no Language is considered frequent,
+            // then we don't rerun the extraction.
+            if !script_language.is_empty() {
+                // build a new temporar tokenizer including the allow list.
+                let mut tokenizer_builder = TokenizerBuilder::new();
+                if let Some(stop_words) = stop_words {
+                    tokenizer_builder.stop_words(stop_words);
+                }
+                tokenizer_builder.allow_list(&script_language);
+                let tokenizer = tokenizer_builder.build();
+
+                script_language_word_count.clear();
+
+                // rerun the extraction.
+                extract_tokens_from_document(
+                    &obkv,
+                    searchable_fields,
+                    &tokenizer,
+                    max_positions_per_attributes,
+                    &mut key_buffer,
+                    &mut field_buffer,
+                    &mut script_language_word_count,
+                    &mut docid_word_positions_sorter,
+                )?;
+            }
+        }
+
+        for (script, languages_frequency) in script_language_word_count {
+            for (language, _) in languages_frequency {
+                let entry = script_language_docids
+                    .entry((script, language))
+                    .or_insert_with(RoaringBitmap::new);
+                entry.push(document_id);
+            }
+        }
+    }
+
+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_docids))
+}
+
+fn extract_tokens_from_document<T: AsRef<[u8]>>(
+    obkv: &KvReader<FieldId>,
+    searchable_fields: &Option<HashSet<FieldId>>,
+    tokenizer: &Tokenizer<T>,
+    max_positions_per_attributes: u32,
+    key_buffer: &mut Vec<u8>,
+    field_buffer: &mut String,
+    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
+    docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    for (field_id, field_bytes) in obkv.iter() {
+        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+            let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+            field_buffer.clear();
+            if let Some(field) = json_to_string(&value, field_buffer) {
+                let tokens = process_tokens(tokenizer.tokenize(field))
+                    .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
+
+                for (index, token) in tokens {
+                    // if a language has been detected for the token, we update the counter.
+                    if let Some(language) = token.language {
+                        let script = token.script;
+                        let entry =
+                            script_language_word_count.entry(script).or_insert_with(Vec::new);
+                        match entry.iter_mut().find(|(l, _)| *l == language) {
+                            Some((_, n)) => *n += 1,
+                            None => entry.push((language, 1)),
                         }
                     }
+                    let token = token.lemma().trim();
+                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                        key_buffer.truncate(mem::size_of::<u32>());
+                        key_buffer.extend_from_slice(token.as_bytes());
+
+                        let position: u16 = index
+                            .try_into()
+                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+                        let position = absolute_from_relative_position(field_id, position);
+                        docid_word_positions_sorter.insert(&key_buffer, position.to_ne_bytes())?;
+                    }
                 }
             }
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (documents_ids, reader, script_language_pair))
+    Ok(())
 }
 
 /// Transform a JSON value into a string that can be indexed.
@@ -183,3 +256,39 @@ fn process_tokens<'a>(
         })
         .filter(|(_, t)| t.is_word())
 }
+
+fn potential_language_detection_error(languages_frequency: &Vec<(Language, usize)>) -> bool {
+    if languages_frequency.len() > 1 {
+        let threshold = compute_laguage_frequency_threshold(languages_frequency);
+        languages_frequency.iter().any(|(_, c)| *c <= threshold)
+    } else {
+        false
+    }
+}
+
+fn most_frequent_languages(
+    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
+) -> Option<(Script, Vec<Language>)> {
+    if languages_frequency.len() > 1 {
+        let threshold = compute_laguage_frequency_threshold(languages_frequency);
+
+        let languages: Vec<_> = languages_frequency
+            .iter()
+            .filter(|(_, c)| *c > threshold)
+            .map(|(l, _)| l.clone())
+            .collect();
+
+        if languages.is_empty() {
+            None
+        } else {
+            Some((script.clone(), languages))
+        }
+    } else {
+        None
+    }
+}
+
+fn compute_laguage_frequency_threshold(languages_frequency: &Vec<(Language, usize)>) -> usize {
+    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
+    total / 20 // 5% is a completely arbitrar value.
+} From 37d4551e8e196ad849a91eeef9b86c73bb5c88f8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 7 Mar 2023 19:38:01 +0100 Subject: [PATCH 02/11] Add a threshold filtering the Languages allowed to be detected at search time --- milli/src/index.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a4048dfb0..7a473c0b4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1211,11 +1211,22 @@ impl Index { let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; let mut script_language: HashMap> = HashMap::new(); + let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); + let mut total = 0; for sl in self.script_language_docids.iter(rtxn)? { let ((script, language), docids) = sl?; // keep only Languages that contains at least 1 document. - if !soft_deleted_documents.is_superset(&docids) { + let remaining_documents_count = (docids - &soft_deleted_documents).len(); + total += remaining_documents_count; + if remaining_documents_count > 0 { + script_language_doc_count.push((script, language, remaining_documents_count)); + } + } + + let threshold = total / 20; // 5% (arbitrar) + for (script, language, count) in script_language_doc_count { + if count > threshold { if let Some(languages) = script_language.get_mut(&script) { (*languages).push(language); } else { From 3092cf0448f2e7edeece7becc44d51c1adfe0a14 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Mar 2023 10:53:42 +0100 Subject: [PATCH 03/11] Fix clippy errors --- meilisearch/src/routes/indexes/mod.rs | 2 ++ .../extract/extract_docid_word_positions.rs | 12 +++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index c5c168786..487511da0 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -100,6 +100,8 @@ pub async fn list_indexes( Ok(Some(IndexView::new(uid.to_string(), index)?)) })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. 
+    // error when trying to fix it: the trait `ExactSizeIterator` is not implemented for `Flatten>>`
+    #[allow(clippy::needless_collect)]
     let indexes: Vec<IndexView> = indexes.into_iter().flatten().collect();
     let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter());
 
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 5a103f1e0..057559462 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -127,6 +127,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
 
+#[allow(clippy::too_many_arguments)]
 fn extract_tokens_from_document<T: AsRef<[u8]>>(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
@@ -272,23 +273,20 @@ fn most_frequent_languages(
     if languages_frequency.len() > 1 {
         let threshold = compute_laguage_frequency_threshold(languages_frequency);
 
-        let languages: Vec<_> = languages_frequency
-            .iter()
-            .filter(|(_, c)| *c > threshold)
-            .map(|(l, _)| l.clone())
-            .collect();
+        let languages: Vec<_> =
+            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();
 
         if languages.is_empty() {
             None
         } else {
-            Some((script.clone(), languages))
+            Some((*script, languages))
         }
     } else {
         None
     }
 }
 
-fn compute_laguage_frequency_threshold(languages_frequency: &Vec<(Language, usize)>) -> usize {
+fn compute_laguage_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
     let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
     total / 20 // 5% is a completely arbitrar value.
 }

From 24c0775c675f668bd9d9cc24ec419262c9cca70f Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 8 Mar 2023 12:36:04 +0100
Subject: [PATCH 04/11] Change indexing threshold

---
 .../index_documents/extract/extract_docid_word_positions.rs  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 057559462..6eee90c06 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -288,5 +288,5 @@ fn most_frequent_languages(
 
 fn compute_laguage_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
     let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
-    total / 20 // 5% is a completely arbitrar value.
+    total / 10 // 10% is a completely arbitrar value.
} From 7e2fd82e41c3cb0c9d241adb24f3d6c4fd888cc5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Mar 2023 12:44:16 +0100 Subject: [PATCH 05/11] Use Language allow list in the highlighter --- meilisearch/src/search.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index c287f1ba0..ebf9ace1f 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -378,6 +378,11 @@ pub fn perform_search( let mut tokenizer_buidler = TokenizerBuilder::default(); tokenizer_buidler.create_char_map(true); + let script_lang_map = index.script_language(&rtxn)?; + if !script_lang_map.is_empty() { + tokenizer_buidler.allow_list(&script_lang_map); + } + let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_buidler.build()); formatter_builder.crop_marker(query.crop_marker); formatter_builder.highlight_prefix(query.highlight_pre_tag); From b99ef3d33644b24cccf9ff167f698b30b99625d1 Mon Sep 17 00:00:00 2001 From: curquiza Date: Tue, 7 Mar 2023 17:25:39 +0100 Subject: [PATCH 06/11] Update CI to still use ubuntu-18 --- .github/workflows/publish-binaries.yml | 40 +++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 13555cbac..76dde74d1 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -96,14 +96,12 @@ jobs: publish-macos-apple-silicon: name: Publish binary for macOS silicon - runs-on: ${{ matrix.os }} + runs-on: macos-12 needs: check-version strategy: - fail-fast: false matrix: include: - - os: macos-12 - target: aarch64-apple-darwin + - target: aarch64-apple-darwin asset_name: meilisearch-macos-apple-silicon steps: - name: Checkout repository @@ -132,21 +130,29 @@ jobs: publish-aarch64: name: Publish binary for aarch64 - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest needs: check-version + container: + # Use ubuntu-18.04 to compile with glibc 2.27 + image: ubuntu:18.04 strategy: - fail-fast: false matrix: include: - - build: aarch64 - os: ubuntu-18.04 - target: aarch64-unknown-linux-gnu - linker: gcc-aarch64-linux-gnu - use-cross: true + - target: aarch64-unknown-linux-gnu asset_name: meilisearch-linux-aarch64 steps: - name: Checkout repository uses: actions/checkout@v3 + - name: Install needed dependencies + run: | + apt-get update -y && apt upgrade -y + apt-get install -y curl build-essential gcc-aarch64-linux-gnu + - name: Set up Docker for cross compilation + run: | + apt-get install -y curl apt-transport-https ca-certificates software-properties-common + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - + add-apt-repository "deb [arch=$(dpkg --print-architecture)] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" + apt-get update -y && apt-get install -y docker-ce - name: Installing Rust toolchain uses: actions-rs/toolchain@v1 with: @@ -154,15 +160,7 @@ jobs: profile: minimal target: ${{ matrix.target }} override: true - - name: APT update - run: | - sudo apt update - - name: Install target specific tools - if: matrix.use-cross - run: | - sudo apt-get install -y ${{ matrix.linker }} - name: Configure target aarch64 GNU - if: matrix.target == 'aarch64-unknown-linux-gnu' ## Environment variable is not passed using env: ## LD gold won't work with MUSL # env: @@ -176,8 +174,10 @@ jobs: uses: actions-rs/cargo@v1 with: command: build - use-cross: ${{ matrix.use-cross }} + use-cross: true args: --release 
--target ${{ matrix.target }} + env: + CROSS_DOCKER_IN_DOCKER: true - name: List target output files run: ls -lR ./target - name: Upload the binary to release From b4b859ec8c2a25ca502af48afaf5fd5e25fcab09 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 9 Mar 2023 10:56:17 +0100 Subject: [PATCH 07/11] Fix typos --- meilisearch/src/search.rs | 8 ++++---- milli/src/index.rs | 2 +- .../extract/extract_docid_word_positions.rs | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index ebf9ace1f..7e4a7da6a 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -375,15 +375,15 @@ pub fn perform_search( &displayed_ids, ); - let mut tokenizer_buidler = TokenizerBuilder::default(); - tokenizer_buidler.create_char_map(true); + let mut tokenizer_builder = TokenizerBuilder::default(); + tokenizer_builder.create_char_map(true); let script_lang_map = index.script_language(&rtxn)?; if !script_lang_map.is_empty() { - tokenizer_buidler.allow_list(&script_lang_map); + tokenizer_builder.allow_list(&script_lang_map); } - let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_buidler.build()); + let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build()); formatter_builder.crop_marker(query.crop_marker); formatter_builder.highlight_prefix(query.highlight_pre_tag); formatter_builder.highlight_suffix(query.highlight_post_tag); diff --git a/milli/src/index.rs b/milli/src/index.rs index 7a473c0b4..20e64f984 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1224,7 +1224,7 @@ impl Index { } } - let threshold = total / 20; // 5% (arbitrar) + let threshold = total / 20; // 5% (arbitrary) for (script, language, count) in script_language_doc_count { if count > threshold { if let Some(languages) = script_language.get_mut(&script) { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6eee90c06..56b1299d5 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -89,7 +89,7 @@ pub fn extract_docid_word_positions( // if the allow list is empty, meaning that no Language is considered frequent, // then we don't rerun the extraction. if !script_language.is_empty() { - // build a new temporar tokenizer including the allow list. + // build a new temporary tokenizer including the allow list. 
let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -260,7 +260,7 @@ fn process_tokens<'a>( fn potential_language_detection_error(languages_frequency: &Vec<(Language, usize)>) -> bool { if languages_frequency.len() > 1 { - let threshold = compute_laguage_frequency_threshold(languages_frequency); + let threshold = compute_language_frequency_threshold(languages_frequency); languages_frequency.iter().any(|(_, c)| *c <= threshold) } else { false @@ -271,7 +271,7 @@ fn most_frequent_languages( (script, languages_frequency): (&Script, &Vec<(Language, usize)>), ) -> Option<(Script, Vec)> { if languages_frequency.len() > 1 { - let threshold = compute_laguage_frequency_threshold(languages_frequency); + let threshold = compute_language_frequency_threshold(languages_frequency); let languages: Vec<_> = languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect(); @@ -286,7 +286,7 @@ fn most_frequent_languages( } } -fn compute_laguage_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize { +fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize { let total: usize = languages_frequency.iter().map(|(_, c)| c).sum(); - total / 10 // 10% is a completely arbitrar value. + total / 10 // 10% is a completely arbitrary value. } From 5deea631ea3aed6d0ded1a73abc472d6d1aafc09 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 9 Mar 2023 11:19:13 +0100 Subject: [PATCH 08/11] fix clippy too many arguments --- .../extract/extract_docid_word_positions.rs | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 56b1299d5..3e1af5915 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -45,8 +45,7 @@ pub fn extract_docid_word_positions( max_memory, ); - let mut key_buffer = Vec::new(); - let mut field_buffer = String::new(); + let mut buffers = Buffers::default(); let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -62,8 +61,8 @@ pub fn extract_docid_word_positions( let obkv = KvReader::::new(value); documents_ids.push(document_id); - key_buffer.clear(); - key_buffer.extend_from_slice(&document_id.to_be_bytes()); + buffers.key_buffer.clear(); + buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); let mut script_language_word_count = HashMap::new(); @@ -72,8 +71,7 @@ pub fn extract_docid_word_positions( searchable_fields, &tokenizer, max_positions_per_attributes, - &mut key_buffer, - &mut field_buffer, + &mut buffers, &mut script_language_word_count, &mut docid_word_positions_sorter, )?; @@ -105,8 +103,7 @@ pub fn extract_docid_word_positions( searchable_fields, &tokenizer, max_positions_per_attributes, - &mut key_buffer, - &mut field_buffer, + &mut buffers, &mut script_language_word_count, &mut docid_word_positions_sorter, )?; @@ -127,22 +124,20 @@ pub fn extract_docid_word_positions( .map(|reader| (documents_ids, reader, script_language_docids)) } -#[allow(clippy::too_many_arguments)] fn extract_tokens_from_document>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, max_positions_per_attributes: u32, - key_buffer: &mut Vec, - field_buffer: &mut 
String, + buffers: &mut Buffers, script_language_word_count: &mut HashMap>, docid_word_positions_sorter: &mut grenad::Sorter, ) -> Result<()> { for (field_id, field_bytes) in obkv.iter() { if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - field_buffer.clear(); - if let Some(field) = json_to_string(&value, field_buffer) { + buffers.field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { let tokens = process_tokens(tokenizer.tokenize(field)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); @@ -159,14 +154,15 @@ fn extract_tokens_from_document>( } let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - key_buffer.truncate(mem::size_of::()); - key_buffer.extend_from_slice(token.as_bytes()); + buffers.key_buffer.truncate(mem::size_of::()); + buffers.key_buffer.extend_from_slice(token.as_bytes()); let position: u16 = index .try_into() .map_err(|_| SerializationError::InvalidNumberSerialization)?; let position = absolute_from_relative_position(field_id, position); - docid_word_positions_sorter.insert(&key_buffer, position.to_ne_bytes())?; + docid_word_positions_sorter + .insert(&buffers.key_buffer, position.to_ne_bytes())?; } } } @@ -290,3 +286,9 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize) let total: usize = languages_frequency.iter().map(|(_, c)| c).sum(); total / 10 // 10% is a completely arbitrary value. } + +#[derive(Default)] +struct Buffers { + key_buffer: Vec, + field_buffer: String, +} From dff2715ef3b7899fd53ab543deb3cf7e71d746a2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 9 Mar 2023 11:28:10 +0100 Subject: [PATCH 09/11] Try removing needless collect --- meilisearch/src/routes/indexes/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 487511da0..2b204ac65 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -101,7 +101,6 @@ pub async fn list_indexes( })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. // error when trying to fix it: the trait `ExactSizeIterator` is not implemented for `Flatten>>` - #[allow(clippy::needless_collect)] let indexes: Vec = indexes.into_iter().flatten().collect(); let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter()); From dea101e3d927775c7717ceebd1720b843096e5d3 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Thu, 9 Mar 2023 15:17:03 +0100 Subject: [PATCH 10/11] Update meilisearch/src/routes/indexes/mod.rs Co-authored-by: Louis Dureuil --- meilisearch/src/routes/indexes/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 2b204ac65..c5c168786 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -100,7 +100,6 @@ pub async fn list_indexes( Ok(Some(IndexView::new(uid.to_string(), index)?)) })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. 
-    // error when trying to fix it: the trait `ExactSizeIterator` is not implemented for `Flatten>>`
     let indexes: Vec<IndexView> = indexes.into_iter().flatten().collect();
     let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter());
 

From 2f8eb4f54adc7986c93111127a14d22bf6ac7df1 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 9 Mar 2023 15:34:36 +0100
Subject: [PATCH 11/11] last PR fixes

---
 .../extract/extract_docid_word_positions.rs  | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 3e1af5915..131b78df9 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -79,7 +79,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         // if we detect a potetial mistake in the language detection,
         // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
         // context: https://github.com/meilisearch/meilisearch/issues/3565
-        if script_language_word_count.values().any(potential_language_detection_error) {
+        if script_language_word_count
+            .values()
+            .map(Vec::as_slice)
+            .any(potential_language_detection_error)
+        {
             // build an allow list with the most frequent detected languages in the document.
             let script_language: HashMap<_, _> =
                 script_language_word_count.iter().filter_map(most_frequent_languages).collect();
@@ -254,7 +258,7 @@ fn process_tokens<'a>(
     .filter(|(_, t)| t.is_word())
 }
 
-fn potential_language_detection_error(languages_frequency: &Vec<(Language, usize)>) -> bool {
+fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
     if languages_frequency.len() > 1 {
         let threshold = compute_language_frequency_threshold(languages_frequency);
         languages_frequency.iter().any(|(_, c)| *c <= threshold)
@@ -289,6 +293,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
 
 #[derive(Default)]
 struct Buffers {
+    // the key buffer is the concatenation of the internal document id with the field id.
+    // The buffer has to be completelly cleared between documents,
+    // and the field id part must be cleared between each field.
     key_buffer: Vec<u8>,
+    // the field buffer for each fields desserialization, and must be cleared between each field.
     field_buffer: String,
 }
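Taken together, these patches implement a simple frequency cut-off: count how many tokens were attributed to each detected language, treat any language below roughly 10% of the total as a probable detection error, and rerun tokenization with the frequent languages as an allow list. The following is a minimal, self-contained sketch of that cut-off only; the `Language` enum and function names here are illustrative stand-ins, not the actual milli/charabia types used in the patches.

```rust
// Standalone sketch of the language-frequency threshold idea from the patches above.
// `Language` is a stand-in enum; the 10% cut-off mirrors the value chosen in PATCH 04.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Language {
    Eng,
    Fra,
    Cmn,
}

/// Threshold below which a detected language is considered a likely detection error.
fn frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
    total / 10 // 10% of all counted tokens, as in the final patch.
}

/// Languages frequent enough to be kept in the allow list for a second tokenization pass.
fn most_frequent_languages(languages_frequency: &[(Language, usize)]) -> Vec<Language> {
    // A single detected language can never be a detection conflict, so no allow list is built.
    if languages_frequency.len() <= 1 {
        return Vec::new();
    }
    let threshold = frequency_threshold(languages_frequency);
    languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect()
}

fn main() {
    // 190 tokens detected as English, 7 as French, 3 as Mandarin:
    // French and Mandarin fall under the 10% threshold and are dropped.
    let detected = [(Language::Eng, 190), (Language::Fra, 7), (Language::Cmn, 3)];
    assert_eq!(most_frequent_languages(&detected), vec![Language::Eng]);
    println!("allow list: {:?}", most_frequent_languages(&detected));
}
```

Running the sketch keeps only `Eng` in the allow list, which is the same effect the indexer relies on when it reruns extraction with a restricted tokenizer.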