use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;
use std::{io, mem, str};

use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use obkv::{KvReader, KvWriterU16};
use roaring::RoaringBitmap;
use serde_json::Value;

use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
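
/// For each (Script, Language) pair, the bitmaps respectively hold the ids of the
/// documents where the pair was deleted and where it was added.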
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;

/// Extracts the words and the positions where they appear,
/// prefixed by the document id.
///
/// Returns the generated internal document ids and a grenad reader
/// with the list of extracted words from the given chunk of documents.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    settings_diff: &InnerIndexSettingsDiff,
    max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
    puffin::profile_function!();

    let max_positions_per_attributes = max_positions_per_attributes
        .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
    let max_memory = indexer.max_memory_by_thread();
    let force_reindexing = settings_diff.reindex_searchable();

    // initialize destination values.
    let mut documents_ids = RoaringBitmap::new();
    let mut script_language_docids = HashMap::new();
    let mut docid_word_positions_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
        keep_latest_obkv,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
    );
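
    // Note: `keep_latest_obkv` makes this sorter keep only the most recently inserted
    // value when two entries share the same key, so reprocessing a (document, field)
    // pair simply overwrites the previous entry.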

    // initialize buffers.
    let mut del_buffers = Buffers::default();
    let mut add_buffers = Buffers::default();
    let mut key_buffer = Vec::new();
    let mut value_buffer = Vec::new();

    // initialize tokenizers.
    let old_stop_words = settings_diff.old.stop_words.as_ref();
    let old_separators: Option<Vec<_>> = settings_diff
        .old
        .allowed_separators
        .as_ref()
        .map(|s| s.iter().map(String::as_str).collect());
    let old_dictionary: Option<Vec<_>> =
        settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
    let mut del_builder = tokenizer_builder(
        old_stop_words,
        old_separators.as_deref(),
        old_dictionary.as_deref(),
        None,
    );
    let del_tokenizer = del_builder.build();

    let new_stop_words = settings_diff.new.stop_words.as_ref();
    let new_separators: Option<Vec<_>> = settings_diff
        .new
        .allowed_separators
        .as_ref()
        .map(|s| s.iter().map(String::as_str).collect());
    let new_dictionary: Option<Vec<_>> =
        settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
    let mut add_builder = tokenizer_builder(
        new_stop_words,
        new_separators.as_deref(),
        new_dictionary.as_deref(),
        None,
    );
    let add_tokenizer = add_builder.build();
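
    // Two tokenizers are kept on purpose: the deletion side is tokenized with the old
    // settings (stop words, separators, dictionary) so that removed words match what
    // was originally put in the index, while the addition side uses the new settings.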

    // iterate over documents.
    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {
        let document_id = key
            .try_into()
            .map(u32::from_be_bytes)
            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
        let obkv = KvReader::<FieldId>::new(value);

        // if the searchable fields didn't change, skip the searchable indexing for this document.
        if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
            continue;
        }

        documents_ids.push(document_id);

        // Update key buffer prefix.
        key_buffer.clear();
        key_buffer.extend_from_slice(&document_id.to_be_bytes());

        // Tokenize deletions and additions in 2 different threads.
        let (del, add): (Result<_>, Result<_>) = rayon::join(
            || {
                // deletions
                lang_safe_tokens_from_document(
                    &obkv,
                    &settings_diff.old,
                    &del_tokenizer,
                    max_positions_per_attributes,
                    DelAdd::Deletion,
                    &mut del_buffers,
                )
            },
            || {
                // additions
                lang_safe_tokens_from_document(
                    &obkv,
                    &settings_diff.new,
                    &add_tokenizer,
                    max_positions_per_attributes,
                    DelAdd::Addition,
                    &mut add_buffers,
                )
            },
        );

        let (del_obkv, del_script_language_word_count) = del?;
        let (add_obkv, add_script_language_word_count) = add?;

        // merge deletions and additions.
        // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
        value_buffer.clear();
        del_add_from_two_obkvs(
            &KvReader::<FieldId>::new(del_obkv),
            &KvReader::<FieldId>::new(add_obkv),
            &mut value_buffer,
        )?;

        // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
        let obkv = KvReader::<FieldId>::new(&value_buffer);
        for (field_id, value) in obkv.iter() {
            key_buffer.truncate(mem::size_of::<u32>());
            key_buffer.extend_from_slice(&field_id.to_be_bytes());
            docid_word_positions_sorter.insert(&key_buffer, value)?;
        }
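
        // Each sorter key built above is the 4 big-endian bytes of the document id
        // followed by the big-endian bytes of the field id (a `u16` here), e.g.
        // docid 42 and field 7 give [0, 0, 0, 42, 0, 7].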

        // update script_language_docids deletions.
        for (script, languages_frequency) in del_script_language_word_count {
            for (language, _) in languages_frequency {
                let entry = script_language_docids
                    .entry((script, language))
                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
                entry.0.push(document_id);
            }
        }

        // update script_language_docids additions.
        for (script, languages_frequency) in add_script_language_word_count {
            for (language, _) in languages_frequency {
                let entry = script_language_docids
                    .entry((script, language))
                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
                entry.1.push(document_id);
            }
        }
    }

    // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
    sorter_into_reader(docid_word_positions_sorter, indexer)
        .map(|reader| (reader, script_language_docids))
}

/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
    obkv: &KvReader<FieldId>,
    settings_diff: &InnerIndexSettingsDiff,
) -> bool {
    let searchable_fields = &settings_diff.new.searchable_fields_ids;
    for (field_id, field_bytes) in obkv.iter() {
        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
            let del_add = KvReaderDelAdd::new(field_bytes);
            match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
                // if both fields are None, check the next field.
                (None, None) => (),
                // if both contain a value and the values are the same, check the next field.
                (Some(del), Some(add)) if del == add => (),
                // otherwise the fields are different, return true.
                _otherwise => return true,
            }
        }
    }
    false
}
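
// Only fields that are searchable under the *new* settings are inspected here; when the
// set of searchable fields itself changes, `settings_diff.reindex_searchable()` sets
// `force_reindexing` in the caller, which bypasses this early-out entirely.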

/// Factorize tokenizer building.
fn tokenizer_builder<'a>(
    stop_words: Option<&'a fst::Set<Vec<u8>>>,
    allowed_separators: Option<&'a [&str]>,
    dictionary: Option<&'a [&str]>,
    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
) -> TokenizerBuilder<'a, Vec<u8>> {
    let mut tokenizer_builder = TokenizerBuilder::new();
    if let Some(stop_words) = stop_words {
        tokenizer_builder.stop_words(stop_words);
    }
    if let Some(dictionary) = dictionary {
        tokenizer_builder.words_dict(dictionary);
    }
    if let Some(separators) = allowed_separators {
        tokenizer_builder.separators(separators);
    }

    if let Some(script_language) = script_language {
        tokenizer_builder.allow_list(script_language);
    }

    tokenizer_builder
}
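
// A minimal usage sketch: `tokenizer_builder(None, None, None, None).build()` yields a
// default charabia tokenizer, while each `Some(..)` argument layers stop words, allowed
// separators, a user dictionary, or a script/language allow list on top of it.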

/// Extracts the words of a document mapped with their positions,
/// ensuring no language detection mistakes were made.
fn lang_safe_tokens_from_document<'a>(
    obkv: &KvReader<FieldId>,
    settings: &InnerIndexSettings,
    tokenizer: &Tokenizer,
    max_positions_per_attributes: u32,
    del_add: DelAdd,
    buffers: &'a mut Buffers,
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
    let mut script_language_word_count = HashMap::new();

    tokens_from_document(
        obkv,
        &settings.searchable_fields_ids,
        tokenizer,
        max_positions_per_attributes,
        del_add,
        buffers,
        &mut script_language_word_count,
    )?;

    // if we detect a potential mistake in the language detection,
    // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
    // context: https://github.com/meilisearch/meilisearch/issues/3565
    if script_language_word_count
        .values()
        .map(Vec::as_slice)
        .any(potential_language_detection_error)
    {
        // build an allow list with the most frequent detected languages in the document.
        let script_language: HashMap<_, _> =
            script_language_word_count.iter().filter_map(most_frequent_languages).collect();

        // if the allow list is empty, meaning that no Language is considered frequent,
        // then we don't rerun the extraction.
        if !script_language.is_empty() {
            // build a new temporary tokenizer including the allow list.
            let stop_words = settings.stop_words.as_ref();
            let separators: Option<Vec<_>> = settings
                .allowed_separators
                .as_ref()
                .map(|s| s.iter().map(String::as_str).collect());
            let dictionary: Option<Vec<_>> =
                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
            let mut builder = tokenizer_builder(
                stop_words,
                separators.as_deref(),
                dictionary.as_deref(),
                Some(&script_language),
            );
            let tokenizer = builder.build();

            script_language_word_count.clear();

            // rerun the extraction.
            tokens_from_document(
                obkv,
                &settings.searchable_fields_ids,
                &tokenizer,
                max_positions_per_attributes,
                del_add,
                buffers,
                &mut script_language_word_count,
            )?;
        }
    }

    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
    Ok((&buffers.obkv_buffer, script_language_word_count))
}
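
// Illustrative scenario: a document that is mostly English but has a few tokens
// misattributed to another language trips the 10% threshold above, so the extraction
// is rerun with an allow list restricted to the dominant detected languages.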

/// Extracts the words of a document mapped with their positions.
fn tokens_from_document<'a>(
    obkv: &KvReader<FieldId>,
    searchable_fields: &Option<Vec<FieldId>>,
    tokenizer: &Tokenizer,
    max_positions_per_attributes: u32,
    del_add: DelAdd,
    buffers: &'a mut Buffers,
    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
) -> Result<&'a [u8]> {
    buffers.obkv_buffer.clear();
    let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
    for (field_id, field_bytes) in obkv.iter() {
        // if field is searchable.
        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
            // extract deletion or addition only.
            if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
                // parse json.
                let value =
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;

                // prepare writing destination.
                buffers.obkv_positions_buffer.clear();
                let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);

                // convert json into a unique string.
                buffers.field_buffer.clear();
                if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
                    // create an iterator of tokens with their positions.
                    let tokens = process_tokens(tokenizer.tokenize(field))
                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

                    for (index, token) in tokens {
                        // if a language has been detected for the token, we update the counter.
                        if let Some(language) = token.language {
                            let script = token.script;
                            let entry = script_language_word_count.entry(script).or_default();
                            match entry.iter_mut().find(|(l, _)| *l == language) {
                                Some((_, n)) => *n += 1,
                                None => entry.push((language, 1)),
                            }
                        }

                        // keep a word only if it is not empty and fits in an LMDB key.
                        let token = token.lemma().trim();
                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                            let position: u16 = index
                                .try_into()
                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
                            writer.insert(position, token.as_bytes())?;
                        }
                    }

                    // write positions into document.
                    let positions = writer.into_inner()?;
                    document_writer.insert(field_id, positions)?;
                }
            }
        }
    }

    // returns a KV<FieldId, KV<u16, String>>
    Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}

/// Transform a JSON value into a string that can be indexed.
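///
/// For instance, the array `["hello", 42, null]` is flattened into `"hello. 42. "`,
/// a bare JSON string is returned as-is, and `null` or an object yields `None`.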
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
    fn inner(value: &Value, output: &mut String) -> bool {
        use std::fmt::Write;
        match value {
            Value::Null | Value::Object(_) => false,
            Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
            Value::Number(number) => write!(output, "{}", number).is_ok(),
            Value::String(string) => write!(output, "{}", string).is_ok(),
            Value::Array(array) => {
                let mut count = 0;
                for value in array {
                    if inner(value, output) {
                        output.push_str(". ");
                        count += 1;
                    }
                }
                // check that at least one value was written
                count != 0
            }
        }
    }

    if let Value::String(string) = value {
        Some(string)
    } else if inner(value, buffer) {
        Some(buffer)
    } else {
        None
    }
}

/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// otherwise we keep the standard proximity of 1 between words.
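///
/// Illustrative sketch (assuming `.` is a hard and ` ` a soft separator): in
/// "hello world. rust", "hello" gets position 0, "world" position 1, and "rust"
/// position 9, so a sentence boundary widens the gap used by proximity ranking.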
fn process_tokens<'a>(
    tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
        .skip_while(|token| token.is_separator())
        .scan((0, None), |(offset, prev_kind), mut token| {
            match token.kind {
                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                    *offset += match *prev_kind {
                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                        Some(_) => 1,
                        None => 0,
                    };
                    *prev_kind = Some(token.kind)
                }
                TokenKind::Separator(SeparatorKind::Hard) => {
                    *prev_kind = Some(token.kind);
                }
                TokenKind::Separator(SeparatorKind::Soft)
                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
                {
                    *prev_kind = Some(token.kind);
                }
                _ => token.kind = TokenKind::Unknown,
            }
            Some((*offset, token))
        })
        .filter(|(_, t)| t.is_word())
}

fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
    if languages_frequency.len() > 1 {
        let threshold = compute_language_frequency_threshold(languages_frequency);
        languages_frequency.iter().any(|(_, c)| *c <= threshold)
    } else {
        false
    }
}
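
// e.g. with frequencies [(Eng, 18), (Fra, 2)]: the total is 20, the threshold is
// 20 / 10 = 2, and Fra's count of 2 <= 2 flags a potential detection error; only Eng,
// whose count exceeds the threshold, is kept by `most_frequent_languages` below.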

fn most_frequent_languages(
    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
) -> Option<(Script, Vec<Language>)> {
    if languages_frequency.len() > 1 {
        let threshold = compute_language_frequency_threshold(languages_frequency);

        let languages: Vec<_> =
            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();

        if languages.is_empty() {
            None
        } else {
            Some((*script, languages))
        }
    } else {
        None
    }
}

fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
    total / 10 // 10% is a completely arbitrary value.
}

#[derive(Default)]
struct Buffers {
    // the field buffer for each field's deserialization; it must be cleared between fields.
    field_buffer: String,
    // buffer used to store the value data containing an obkv.
    obkv_buffer: Vec<u8>,
    // buffer used to store the value data containing an obkv of tokens with their positions.
    obkv_positions_buffer: Vec<u8>,
}