mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Extract and index data
This commit is contained in:
parent
c45d1e3610
commit
d97fb6117e
@ -1,9 +1,9 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::{io, mem, str};
|
use std::{io, mem, str};
|
||||||
|
|
||||||
use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
stop_words: Option<&fst::Set<&[u8]>>,
|
stop_words: Option<&fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
|
) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
|
let mut script_language_pair = HashMap::new();
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
concat_u32s_array,
|
concat_u32s_array,
|
||||||
@ -70,6 +71,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
|
let script = token.script;
|
||||||
|
let language = token.language.unwrap_or_default();
|
||||||
|
let entry = script_language_pair
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(RoaringBitmap::new);
|
||||||
|
entry.push(document_id);
|
||||||
let token = token.lemma().trim();
|
let token = token.lemma().trim();
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
key_buffer.truncate(mem::size_of::<u32>());
|
key_buffer.truncate(mem::size_of::<u32>());
|
||||||
@ -88,7 +95,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
|
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Transform a JSON value into a string that can be indexed.
|
/// Transform a JSON value into a string that can be indexed.
|
||||||
|
@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
|
let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions(
|
||||||
flattened_documents_chunk.clone(),
|
flattened_documents_chunk.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
let _ = lmdb_writer_sx
|
let _ = lmdb_writer_sx
|
||||||
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
|
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
|
||||||
|
|
||||||
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
|
||||||
|
|
||||||
Ok(docid_word_positions_chunk)
|
Ok(docid_word_positions_chunk)
|
||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use charabia::{Language, Script};
|
||||||
use grenad::MergerBuilder;
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::{BytesDecode, RwTxn};
|
||||||
@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn};
|
|||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::{
|
use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz};
|
||||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
|
|
||||||
Result,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
|
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
|
||||||
@ -38,6 +37,7 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<File>),
|
||||||
|
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||||
@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.put_geo_rtree(wtxn, &rtree)?;
|
index.put_geo_rtree(wtxn, &rtree)?;
|
||||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
}
|
}
|
||||||
|
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
for (key, value) in hash_pair {
|
||||||
|
buffer.clear();
|
||||||
|
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||||
|
Some(db_values) => {
|
||||||
|
let mut db_value_buffer = Vec::new();
|
||||||
|
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
|
||||||
|
let mut new_value_buffer = Vec::new();
|
||||||
|
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
|
||||||
|
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
|
||||||
|
let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
|
||||||
|
merged_db_values
|
||||||
|
}
|
||||||
|
None => value
|
||||||
|
};
|
||||||
|
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((RoaringBitmap::new(), is_merged_database))
|
Ok((RoaringBitmap::new(), is_merged_database))
|
||||||
|
Loading…
Reference in New Issue
Block a user