mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Remove the limit of 1000 positions per attribute
Instead of using an arbitrary limit, we encode the absolute position in a u32, using the high-order (most significant) u16 for the field id and the low-order (least significant) u16 for the relative position in the attribute.
This commit is contained in:
parent
8f6b6c9042
commit
360c5ff3df
6 changed files with 91 additions and 24 deletions
|
@ -10,8 +10,7 @@ use serde_json::Value;
|
|||
|
||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::proximity::ONE_ATTRIBUTE;
|
||||
use crate::{FieldId, Result};
|
||||
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
/// Extracts the word and positions where this word appears and
|
||||
/// prefixes it by the document id.
|
||||
|
@ -63,7 +62,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
||||
let analyzed = analyzer.analyze(field);
|
||||
let tokens = process_tokens(analyzed.tokens())
|
||||
.take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
|
||||
.take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE);
|
||||
|
||||
for (index, token) in tokens {
|
||||
let token = token.text().trim();
|
||||
|
@ -71,10 +70,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(token.as_bytes());
|
||||
|
||||
let position: u32 = index
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let position = field_id as u32 * ONE_ATTRIBUTE + position;
|
||||
let position = absolute_from_relative_position(field_id, position);
|
||||
docid_word_positions_sorter
|
||||
.insert(&key_buffer, &position.to_ne_bytes())?;
|
||||
}
|
||||
|
|
|
@ -10,8 +10,7 @@ use super::helpers::{
|
|||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::extract_position;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the field id word count and the document ids where
|
||||
/// this field id with this number of words appears.
|
||||
|
@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
|
|||
}
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
let (field_id, position) = extract_position(position);
|
||||
let word_count = position + 1;
|
||||
let (field_id, position) = relative_from_absolute_position(position);
|
||||
let word_count = position as u32 + 1;
|
||||
|
||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
||||
*value = cmp::max(*value, word_count);
|
||||
|
|
|
@ -884,6 +884,44 @@ mod tests {
|
|||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_more_than_1000_positions_in_a_field() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(50 * 1024 * 1024); // 50 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let mut big_object = HashMap::new();
|
||||
big_object.insert(S("id"), "wow");
|
||||
let content: String =
|
||||
(0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
|
||||
big_object.insert("content".to_string(), &content);
|
||||
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
|
||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
builder.add_documents(big_object).unwrap();
|
||||
builder.finish().unwrap();
|
||||
cursor.set_position(0);
|
||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||
|
||||
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let mut rtxn = index.read_txn().unwrap();
|
||||
|
||||
assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some());
|
||||
assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some());
|
||||
assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some());
|
||||
assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some());
|
||||
assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some());
|
||||
assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_documents_with_zeroes() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue