mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Remove limit of 1000 positions per attribute
Instead of using an arbitrary limit, we encode the absolute position in a u32, using the high 16 bits (a u16) for the field id and the low 16 bits (a u16) for the relative position in the attribute.
This commit is contained in:
parent
8f6b6c9042
commit
360c5ff3df
@ -53,9 +53,24 @@ pub type Attribute = u32;
|
|||||||
pub type DocumentId = u32;
|
pub type DocumentId = u32;
|
||||||
pub type FieldId = u16;
|
pub type FieldId = u16;
|
||||||
pub type Position = u32;
|
pub type Position = u32;
|
||||||
|
pub type RelativePosition = u16;
|
||||||
pub type FieldDistribution = BTreeMap<String, u64>;
|
pub type FieldDistribution = BTreeMap<String, u64>;
|
||||||
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>;
|
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>;
|
||||||
|
|
||||||
|
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
|
||||||
|
|
||||||
|
/// Splits an absolute word position into a field id and a relative position.
|
||||||
|
/// Returns the field id of the attribute, read from the upper 16 bits of `absolute`,
|
||||||
|
/// and the relative position in the attribute, read from the lower 16 bits.
|
||||||
|
pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
|
||||||
|
((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the absolute word position by packing the field id of the attribute into the upper 16 bits and the relative position in the attribute into the lower 16 bits.
|
||||||
|
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
|
||||||
|
// `<<` binds tighter than `|`, so the field id is shifted before being OR-ed with the relative position.
(field_id as u32) << 16 | (relative as u32)
|
||||||
|
}
|
||||||
|
|
||||||
/// Transform a raw obkv store into a JSON Object.
|
/// Transform a raw obkv store into a JSON Object.
|
||||||
pub fn obkv_to_json(
|
pub fn obkv_to_json(
|
||||||
displayed_fields: &[FieldId],
|
displayed_fields: &[FieldId],
|
||||||
@ -187,4 +202,26 @@ mod tests {
|
|||||||
// the distance of hard separators is clamped to 8 anyway.
|
// the distance of hard separators is clamped to 8 anyway.
|
||||||
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
|
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that an absolute position is split into (upper 16 bits, lower 16 bits),
// including the all-zero and all-one boundary values.
#[test]
|
||||||
|
fn test_relative_position_conversion() {
|
||||||
|
assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000));
|
||||||
|
assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF));
|
||||||
|
assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000));
|
||||||
|
assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00));
|
||||||
|
assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF));
|
||||||
|
assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678));
|
||||||
|
assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks that a (field id, relative position) pair is packed into the upper and
// lower 16 bits of the absolute position, including the boundary values.
#[test]
|
||||||
|
fn test_absolute_position_conversion() {
|
||||||
|
assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000));
|
||||||
|
assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF));
|
||||||
|
assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000));
|
||||||
|
assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00));
|
||||||
|
assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF));
|
||||||
|
assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678));
|
||||||
|
assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
use std::cmp;
|
use std::cmp;
|
||||||
|
|
||||||
use crate::{Attribute, Position};
|
use crate::{relative_from_absolute_position, Position};
|
||||||
|
|
||||||
pub const ONE_ATTRIBUTE: u32 = 1000;
|
|
||||||
pub const MAX_DISTANCE: u32 = 8;
|
pub const MAX_DISTANCE: u32 = 8;
|
||||||
|
|
||||||
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||||
@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
|
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
|
||||||
let (lhs_attr, lhs_index) = extract_position(lhs);
|
let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs);
|
||||||
let (rhs_attr, rhs_index) = extract_position(rhs);
|
let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs);
|
||||||
if lhs_attr != rhs_attr {
|
if lhs_attr != rhs_attr {
|
||||||
MAX_DISTANCE
|
MAX_DISTANCE
|
||||||
} else {
|
} else {
|
||||||
index_proximity(lhs_index, rhs_index)
|
index_proximity(lhs_index as u32, rhs_index as u32)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_position(position: Position) -> (Attribute, Position) {
|
|
||||||
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn path_proximity(path: &[Position]) -> u32 {
|
pub fn path_proximity(path: &[Position]) -> u32 {
|
||||||
path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
|
path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@ use crate::search::criteria::{
|
|||||||
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
||||||
};
|
};
|
||||||
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
||||||
use crate::Result;
|
use crate::{absolute_from_relative_position, FieldId, Result};
|
||||||
|
|
||||||
pub struct Exactness<'t> {
|
pub struct Exactness<'t> {
|
||||||
ctx: &'t dyn Context<'t>,
|
ctx: &'t dyn Context<'t>,
|
||||||
@ -181,7 +181,7 @@ fn resolve_state(
|
|||||||
ctx.field_id_word_count_docids(id, query_len)?
|
ctx.field_id_word_count_docids(id, query_len)?
|
||||||
{
|
{
|
||||||
let mut attribute_candidates_array =
|
let mut attribute_candidates_array =
|
||||||
attribute_start_with_docids(ctx, id as u32, query)?;
|
attribute_start_with_docids(ctx, id, query)?;
|
||||||
attribute_candidates_array.push(attribute_allowed_docids);
|
attribute_candidates_array.push(attribute_allowed_docids);
|
||||||
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
||||||
}
|
}
|
||||||
@ -199,8 +199,7 @@ fn resolve_state(
|
|||||||
let mut candidates = RoaringBitmap::new();
|
let mut candidates = RoaringBitmap::new();
|
||||||
let attributes_ids = ctx.searchable_fields_ids()?;
|
let attributes_ids = ctx.searchable_fields_ids()?;
|
||||||
for id in attributes_ids {
|
for id in attributes_ids {
|
||||||
let attribute_candidates_array =
|
let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
|
||||||
attribute_start_with_docids(ctx, id as u32, query)?;
|
|
||||||
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
candidates |= intersection_of(attribute_candidates_array.iter().collect());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -290,12 +289,12 @@ fn resolve_state(
|
|||||||
|
|
||||||
fn attribute_start_with_docids(
|
fn attribute_start_with_docids(
|
||||||
ctx: &dyn Context,
|
ctx: &dyn Context,
|
||||||
attribute_id: u32,
|
attribute_id: FieldId,
|
||||||
query: &[ExactQueryPart],
|
query: &[ExactQueryPart],
|
||||||
) -> heed::Result<Vec<RoaringBitmap>> {
|
) -> heed::Result<Vec<RoaringBitmap>> {
|
||||||
let mut attribute_candidates_array = Vec::new();
|
let mut attribute_candidates_array = Vec::new();
|
||||||
// start from attribute first position
|
// start from attribute first position
|
||||||
let mut pos = attribute_id * 1000;
|
let mut pos = absolute_from_relative_position(attribute_id, 0);
|
||||||
for part in query {
|
for part in query {
|
||||||
use ExactQueryPart::*;
|
use ExactQueryPart::*;
|
||||||
match part {
|
match part {
|
||||||
|
@ -10,8 +10,7 @@ use serde_json::Value;
|
|||||||
|
|
||||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::{InternalError, SerializationError};
|
use crate::error::{InternalError, SerializationError};
|
||||||
use crate::proximity::ONE_ATTRIBUTE;
|
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||||
use crate::{FieldId, Result};
|
|
||||||
|
|
||||||
/// Extracts the word and positions where this word appear and
|
/// Extracts the word and positions where this word appear and
|
||||||
/// prefixes it by the document id.
|
/// prefixes it by the document id.
|
||||||
@ -63,7 +62,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||||||
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
||||||
let analyzed = analyzer.analyze(field);
|
let analyzed = analyzer.analyze(field);
|
||||||
let tokens = process_tokens(analyzed.tokens())
|
let tokens = process_tokens(analyzed.tokens())
|
||||||
.take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
|
.take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
let token = token.text().trim();
|
let token = token.text().trim();
|
||||||
@ -71,10 +70,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||||||
key_buffer.truncate(mem::size_of::<u32>());
|
key_buffer.truncate(mem::size_of::<u32>());
|
||||||
key_buffer.extend_from_slice(token.as_bytes());
|
key_buffer.extend_from_slice(token.as_bytes());
|
||||||
|
|
||||||
let position: u32 = index
|
let position: u16 = index
|
||||||
.try_into()
|
.try_into()
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
let position = field_id as u32 * ONE_ATTRIBUTE + position;
|
let position = absolute_from_relative_position(field_id, position);
|
||||||
docid_word_positions_sorter
|
docid_word_positions_sorter
|
||||||
.insert(&key_buffer, &position.to_ne_bytes())?;
|
.insert(&key_buffer, &position.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
|
@ -10,8 +10,7 @@ use super::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::proximity::extract_position;
|
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
||||||
use crate::{DocumentId, FieldId, Result};
|
|
||||||
|
|
||||||
/// Extracts the field id word count and the documents ids where
|
/// Extracts the field id word count and the documents ids where
|
||||||
/// this field id with this amount of words appear.
|
/// this field id with this amount of words appear.
|
||||||
@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
let (field_id, position) = extract_position(position);
|
let (field_id, position) = relative_from_absolute_position(position);
|
||||||
let word_count = position + 1;
|
let word_count = position as u32 + 1;
|
||||||
|
|
||||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
||||||
*value = cmp::max(*value, word_count);
|
*value = cmp::max(*value, word_count);
|
||||||
|
@ -884,6 +884,44 @@ mod tests {
|
|||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_more_than_1000_positions_in_a_field() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(50 * 1024 * 1024); // 50 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let mut big_object = HashMap::new();
|
||||||
|
big_object.insert(S("id"), "wow");
|
||||||
|
let content: String =
|
||||||
|
(0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
|
||||||
|
big_object.insert("content".to_string(), &content);
|
||||||
|
|
||||||
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
|
||||||
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
builder.add_documents(big_object).unwrap();
|
||||||
|
builder.finish().unwrap();
|
||||||
|
cursor.set_position(0);
|
||||||
|
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
|
||||||
|
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||||
|
builder.execute(content, |_, _| ()).unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let mut rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some());
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some());
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some());
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some());
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some());
|
||||||
|
assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn index_documents_with_zeroes() {
|
fn index_documents_with_zeroes() {
|
||||||
let path = tempfile::tempdir().unwrap();
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
Loading…
Reference in New Issue
Block a user