mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Fix indexing of word_position_docid and fid
This commit is contained in:
parent
d9460a76f4
commit
5440f43fd3
@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String {
|
|||||||
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
pub fn snap_word_fid_docids(index: &Index) -> String {
|
||||||
|
make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| {
|
||||||
|
&format!("{word:<16} {fid:<3} {}", display_bitmap(&b))
|
||||||
|
})
|
||||||
|
}
|
||||||
pub fn snap_field_id_word_count_docids(index: &Index) -> String {
|
pub fn snap_field_id_word_count_docids(index: &Index) -> String {
|
||||||
make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
|
make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
|
||||||
&format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
|
&format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
|
||||||
@ -477,6 +482,9 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, word_position_docids) => {{
|
($index:ident, word_position_docids) => {{
|
||||||
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
($index:ident, word_fid_docids) => {{
|
||||||
|
$crate::snapshot_tests::snap_word_fid_docids(&$index)
|
||||||
|
}};
|
||||||
($index:ident, field_id_word_count_docids) => {{
|
($index:ident, field_id_word_count_docids) => {{
|
||||||
$crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
|
$crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -0,0 +1,48 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use super::helpers::{
|
||||||
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
|
try_split_array_at, GrenadParameters,
|
||||||
|
};
|
||||||
|
use crate::error::SerializationError;
|
||||||
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
|
use crate::{relative_from_absolute_position, DocumentId, Result};
|
||||||
|
|
||||||
|
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
||||||
|
docid_word_positions: grenad::Reader<R>,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let mut word_fid_docids_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Unstable,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
|
for position in read_u32_ne_bytes(value) {
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
let (fid, _) = relative_from_absolute_position(position);
|
||||||
|
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
|
||||||
|
|
||||||
|
Ok(word_fid_docids_reader)
|
||||||
|
}
|
@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
|
|||||||
/// Returns a grenad reader with the list of extracted words at positions and
|
/// Returns a grenad reader with the list of extracted words at positions and
|
||||||
/// documents ids from the given chunk of docid word positions.
|
/// documents ids from the given chunk of docid word positions.
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
|||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
let (_fid, position) = relative_from_absolute_position(position);
|
let (_, position) = relative_from_absolute_position(position);
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
|
@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values;
|
|||||||
mod extract_fid_word_count_docids;
|
mod extract_fid_word_count_docids;
|
||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
|
mod extract_word_fid_docids;
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
|||||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
|
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_fid_and_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||||
GrenadParameters, MergeFn, MergeableReader,
|
GrenadParameters, MergeFn, MergeableReader,
|
||||||
@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_fid_and_position_docids,
|
extract_word_position_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
|
docid_word_positions_chunks,
|
||||||
|
indexer,
|
||||||
|
lmdb_writer_sx.clone(),
|
||||||
|
extract_word_fid_docids,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
TypedChunk::WordFidDocids,
|
||||||
|
"word-fid-docids",
|
||||||
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_strings_chunks,
|
docid_fid_facet_strings_chunks,
|
||||||
|
@ -2255,4 +2255,61 @@ mod tests {
|
|||||||
{"id":1,"catto":"jorts"}
|
{"id":1,"catto":"jorts"}
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_word_fid_position() {
    let index = TempIndex::new();

    // First batch: three short documents plus one document repeating the same
    // word many times in a single field.
    let first_batch = documents!([
        {"id": 0, "text": "sun flowers are looking at the sun" },
        {"id": 1, "text": "sun flowers are looking at the sun" },
        {"id": 2, "text": "the sun is shining today" },
        {
            "id": 3,
            "text": "a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a a a a a a
            a a a a a a a a a a a a a a a a a a a a a "
        }
    ]);
    index.add_documents(first_batch).unwrap();

    db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
    db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");

    // Second batch: adds documents using a second field (`text2`) on top of the
    // field already used by the first batch.
    let second_batch = documents!([
        {"id": 4, "text": "sun flowers are looking at the sun" },
        {"id": 5, "text2": "sun flowers are looking at the sun" },
        {"id": 6, "text": "b b b" },
        {
            "id": 7,
            "text2": "a a a a"
        }
    ]);
    index.add_documents(second_batch).unwrap();

    db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
    db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");

    // Hard-delete only some of the documents, not all of them.
    let mut write_txn = index.write_txn().unwrap();
    let mut deletion = DeleteDocuments::new(&mut write_txn, &index).unwrap();
    deletion.strategy(DeletionStrategy::AlwaysHard);
    for external_id in ["0", "3"] {
        deletion.delete_external_id(external_id);
    }
    let result = deletion.execute().unwrap();
    println!("{result:?}");

    write_txn.commit().unwrap();

    db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
    db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
|
|||||||
exact_word_docids_reader: grenad::Reader<File>,
|
exact_word_docids_reader: grenad::Reader<File>,
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<File>),
|
||||||
|
WordFidDocids(grenad::Reader<File>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||||
@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
|
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
||||||
|
append_entries_into_database(
|
||||||
|
word_fid_docids_iter,
|
||||||
|
&index.word_fid_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
)?;
|
||||||
|
is_merged_database = true;
|
||||||
|
}
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
||||||
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
||||||
indexer.execute(wtxn)?;
|
indexer.execute(wtxn)?;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user