mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Extract and index data
This commit is contained in:
parent
c45d1e3610
commit
d97fb6117e
3 changed files with 37 additions and 9 deletions
|
@ -1,9 +1,9 @@
|
|||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
|
@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut script_language_pair = HashMap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
concat_u32s_array,
|
||||
|
@ -70,6 +71,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
let script = token.script;
|
||||
let language = token.language.unwrap_or_default();
|
||||
let entry = script_language_pair
|
||||
.entry((script, language))
|
||||
.or_insert_with(RoaringBitmap::new);
|
||||
entry.push(document_id);
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
|
@ -88,7 +95,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair))
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
|
|
|
@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data(
|
|||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
|
||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
searchable_fields,
|
||||
|
@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data(
|
|||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue