368: Remove limit of 1000 positions per attribute r=irevoire a=ManyTheFish

Instead of using an arbitrary limit, we now encode the absolute position in a u32: the high u16 holds the field id and the low u16 holds the word's relative position within the attribute.
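
For illustration, here is a minimal sketch of that packing, mirroring the `absolute_from_relative_position` and `relative_from_absolute_position` helpers introduced in this diff (the `main` wrapper exists only for the example):

```rust
// Field id in the high 16 bits, relative position in the low 16 bits.
type FieldId = u16;
type Position = u32;

fn absolute_from_relative_position(field_id: FieldId, relative: u16) -> Position {
    (field_id as u32) << 16 | (relative as u32)
}

fn relative_from_absolute_position(absolute: Position) -> (FieldId, u16) {
    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
}

fn main() {
    // A word at relative position 1500 of field 3: dropped by the old
    // `field_id * 1000 + position` scheme, trivial with bit packing.
    let pos = absolute_from_relative_position(3, 1500);
    assert_eq!(pos, 0x0003_05DC);
    assert_eq!(relative_from_absolute_position(pos), (3, 1500));
}
```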

- [x] check database size difference

Below is the database size difference for each dataset:
![Screenshot 2021-09-27 at 18:01:44](https://user-images.githubusercontent.com/6482087/134944199-bd25fed0-6c34-475c-9afc-197871e06553.png)

- [ ] check search time on a big dataset


Related to [product#202](https://github.com/meilisearch/product/issues/202)

Co-authored-by: many <maxime@meilisearch.com>
Merged as commit 3f7f24b90e by bors[bot] on 2021-10-12 08:30:33 +00:00, committed by GitHub.
10 changed files with 121 additions and 24 deletions


```diff
@@ -131,6 +131,11 @@ pub struct IndexerOpt {
     /// Number of parallel jobs for indexing, defaults to # of CPUs.
     #[structopt(long)]
     pub indexing_jobs: Option<usize>,
+
+    /// Maximum relative position in an attribute for a word to be indexed.
+    /// Any value higher than 65535 will be clamped.
+    #[structopt(long)]
+    pub max_positions_per_attributes: Option<u32>,
 }
 
 struct Highlighter<'a, A> {
@@ -346,6 +351,9 @@ async fn main() -> anyhow::Result<()> {
             if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level {
                 update_builder.chunk_compression_level(chunk_compression_level);
             }
+            if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes {
+                update_builder.max_positions_per_attributes(max_pos_per_attributes);
+            }
             update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap());
             update_builder.log_every_n(indexer_opt_cloned.log_every_n);
             update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
```


```diff
@@ -53,9 +53,24 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u16;
 pub type Position = u32;
+pub type RelativePosition = u16;
 pub type FieldDistribution = BTreeMap<String, u64>;
 pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>;
 
+pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
+
+// Convert an absolute word position into a relative position.
+// Return the field id of the attribute related to the absolute position
+// and the relative position in the attribute.
+pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
+    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
+}
+
+// Compute the absolute word position with the field id of the attribute and relative position in the attribute.
+pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
+    (field_id as u32) << 16 | (relative as u32)
+}
+
 /// Transform a raw obkv store into a JSON Object.
 pub fn obkv_to_json(
     displayed_fields: &[FieldId],
@@ -187,4 +202,26 @@ mod tests {
         // the distance of hard separators is clamped to 8 anyway.
         assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
     }
+
+    #[test]
+    fn test_relative_position_conversion() {
+        assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000));
+        assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF));
+        assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000));
+        assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00));
+        assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF));
+        assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678));
+        assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF));
+    }
+
+    #[test]
+    fn test_absolute_position_conversion() {
+        assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000));
+        assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF));
+        assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000));
+        assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00));
+        assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF));
+        assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678));
+        assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF));
+    }
 }
```


```diff
@@ -1,8 +1,7 @@
 use std::cmp;
 
-use crate::{Attribute, Position};
+use crate::{relative_from_absolute_position, Position};
 
-pub const ONE_ATTRIBUTE: u32 = 1000;
 pub const MAX_DISTANCE: u32 = 8;
 
 pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
@@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
 }
 
 pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
-    let (lhs_attr, lhs_index) = extract_position(lhs);
-    let (rhs_attr, rhs_index) = extract_position(rhs);
+    let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs);
+    let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs);
     if lhs_attr != rhs_attr {
         MAX_DISTANCE
     } else {
-        index_proximity(lhs_index, rhs_index)
+        index_proximity(lhs_index as u32, rhs_index as u32)
     }
 }
 
-pub fn extract_position(position: Position) -> (Attribute, Position) {
-    (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
-}
-
 pub fn path_proximity(path: &[Position]) -> u32 {
     path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
 }
```
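
Cross-attribute behavior is preserved by the new encoding: two positions whose high 16 bits differ belong to different attributes, so `positions_proximity` still saturates at `MAX_DISTANCE`. A standalone sketch of this, with the decoding helper re-declared so the example is self-contained:

```rust
const MAX_DISTANCE: u32 = 8;

// Same decoding as relative_from_absolute_position above.
fn decode(absolute: u32) -> (u16, u16) {
    ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16)
}

fn main() {
    let a = (2u32 << 16) | 42; // field 2, relative position 42
    let b = (7u32 << 16) | 43; // field 7, relative position 43
    let ((a_attr, _), (b_attr, _)) = (decode(a), decode(b));
    // Different attributes: the proximity saturates, exactly as before.
    let proximity = if a_attr != b_attr { MAX_DISTANCE } else { 0 };
    assert_eq!(proximity, MAX_DISTANCE);
}
```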


```diff
@@ -10,7 +10,7 @@ use crate::search::criteria::{
     resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
 };
 use crate::search::query_tree::{Operation, PrimitiveQueryPart};
-use crate::Result;
+use crate::{absolute_from_relative_position, FieldId, Result};
 
 pub struct Exactness<'t> {
     ctx: &'t dyn Context<'t>,
@@ -181,7 +181,7 @@ fn resolve_state(
                 ctx.field_id_word_count_docids(id, query_len)?
             {
                 let mut attribute_candidates_array =
-                    attribute_start_with_docids(ctx, id as u32, query)?;
+                    attribute_start_with_docids(ctx, id, query)?;
                 attribute_candidates_array.push(attribute_allowed_docids);
                 candidates |= intersection_of(attribute_candidates_array.iter().collect());
             }
@@ -199,8 +199,7 @@ fn resolve_state(
             let mut candidates = RoaringBitmap::new();
             let attributes_ids = ctx.searchable_fields_ids()?;
             for id in attributes_ids {
-                let attribute_candidates_array =
-                    attribute_start_with_docids(ctx, id as u32, query)?;
+                let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
                 candidates |= intersection_of(attribute_candidates_array.iter().collect());
             }
@@ -290,12 +289,12 @@ fn resolve_state(
 fn attribute_start_with_docids(
     ctx: &dyn Context,
-    attribute_id: u32,
+    attribute_id: FieldId,
     query: &[ExactQueryPart],
 ) -> heed::Result<Vec<RoaringBitmap>> {
     let mut attribute_candidates_array = Vec::new();
     // start from attribute first position
-    let mut pos = attribute_id * 1000;
+    let mut pos = absolute_from_relative_position(attribute_id, 0);
     for part in query {
         use ExactQueryPart::*;
         match part {
```


```diff
@@ -10,8 +10,7 @@ use serde_json::Value;
 
 use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
-use crate::proximity::ONE_ATTRIBUTE;
-use crate::{FieldId, Result};
+use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
@@ -24,7 +23,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
     indexer: GrenadParameters,
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
+    max_positions_per_attributes: Option<u32>,
 ) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
+    let max_positions_per_attributes = max_positions_per_attributes
+        .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
     let mut documents_ids = RoaringBitmap::new();
@@ -63,7 +65,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
         if let Some(field) = json_to_string(&value, &mut field_buffer) {
             let analyzed = analyzer.analyze(field);
             let tokens = process_tokens(analyzed.tokens())
-                .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
+                .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
             for (index, token) in tokens {
                 let token = token.text().trim();
@@ -71,10 +73,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
                     key_buffer.truncate(mem::size_of::<u32>());
                     key_buffer.extend_from_slice(token.as_bytes());
 
-                    let position: u32 = index
+                    let position: u16 = index
                         .try_into()
                         .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                    let position = field_id as u32 * ONE_ATTRIBUTE + position;
+                    let position = absolute_from_relative_position(field_id, position);
                     docid_word_positions_sorter
                         .insert(&key_buffer, &position.to_ne_bytes())?;
                 }
```
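
The clamping above means a caller-supplied `max_positions_per_attributes` can only lower the per-attribute limit, never raise it past the 65536 positions the encoding can hold. A minimal sketch of that rule, with the constant and the `map_or` logic copied from the hunk above:

```rust
const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

// Effective limit: the user's value capped at the encoding's maximum,
// or the maximum itself when no value is supplied.
fn effective_limit(user_limit: Option<u32>) -> u32 {
    user_limit.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE))
}

fn main() {
    assert_eq!(effective_limit(None), 65536); // default: the hard cap
    assert_eq!(effective_limit(Some(1000)), 1000); // the old limit, now opt-in
    assert_eq!(effective_limit(Some(1_000_000)), 65536); // clamped to the cap
}
```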


```diff
@@ -10,8 +10,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::extract_position;
-use crate::{DocumentId, FieldId, Result};
+use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
 
 /// Extracts the field id word count and the documents ids where
 /// this field id with this amount of words appear.
@@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
         }
 
         for position in read_u32_ne_bytes(value) {
-            let (field_id, position) = extract_position(position);
-            let word_count = position + 1;
+            let (field_id, position) = relative_from_absolute_position(position);
+            let word_count = position as u32 + 1;
 
             let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
             *value = cmp::max(*value, word_count);
```


```diff
@@ -42,6 +42,7 @@ pub(crate) fn data_from_obkv_documents(
     primary_key_id: FieldId,
     geo_field_id: Option<FieldId>,
     stop_words: Option<fst::Set<&[u8]>>,
+    max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
         .par_bridge()
@@ -55,6 +56,7 @@ pub(crate) fn data_from_obkv_documents(
                 primary_key_id,
                 geo_field_id,
                 &stop_words,
+                max_positions_per_attributes,
             )
         })
         .collect();
@@ -177,6 +179,7 @@ fn extract_documents_data(
     primary_key_id: FieldId,
     geo_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
+    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
     grenad::Reader<CursorClonableMmap>,
     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
@@ -206,6 +209,7 @@ fn extract_documents_data(
             indexer.clone(),
             searchable_fields,
             stop_words.as_ref(),
+            max_positions_per_attributes,
         )?;
 
         // send documents_ids to DB writer
```


```diff
@@ -68,6 +68,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) thread_pool: Option<&'a ThreadPool>,
+    pub(crate) max_positions_per_attributes: Option<u32>,
     facet_level_group_size: Option<NonZeroUsize>,
     facet_min_level_size: Option<NonZeroUsize>,
     words_prefix_threshold: Option<u32>,
@@ -104,6 +105,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             update_method: IndexDocumentsMethod::ReplaceDocuments,
             autogenerate_docids: false,
             update_id,
+            max_positions_per_attributes: None,
         }
     }
@@ -262,6 +264,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             primary_key_id,
             geo_field_id,
             stop_words,
+            self.max_positions_per_attributes,
         )
     });
@@ -284,6 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         chunk_compression_type: self.chunk_compression_type,
         chunk_compression_level: self.chunk_compression_level,
         thread_pool: self.thread_pool,
+        max_positions_per_attributes: self.max_positions_per_attributes,
         update_id: self.update_id,
     };
     let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
@@ -884,6 +888,44 @@ mod tests {
         wtxn.commit().unwrap();
     }
 
+    #[test]
+    fn index_more_than_1000_positions_in_a_field() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(50 * 1024 * 1024); // 50 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        let mut big_object = HashMap::new();
+        big_object.insert(S("id"), "wow");
+        let content: String =
+            (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
+        big_object.insert("content".to_string(), &content);
+
+        let mut cursor = Cursor::new(Vec::new());
+
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+        builder.add_documents(big_object).unwrap();
+        builder.finish().unwrap();
+        cursor.set_position(0);
+        let content = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        wtxn.commit().unwrap();
+
+        let mut rtxn = index.read_txn().unwrap();
+
+        assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some());
+        assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some());
+        assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some());
+        assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some());
+        assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some());
+        assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some());
+    }
+
     #[test]
     fn index_documents_with_zeroes() {
         let path = tempfile::tempdir().unwrap();
```


```diff
@@ -69,6 +69,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) thread_pool: Option<&'a ThreadPool>,
+    pub(crate) max_positions_per_attributes: Option<u32>,
     update_id: u64,
 
     searchable_fields: Setting<Vec<String>>,
@@ -108,6 +109,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             synonyms: Setting::NotSet,
             primary_key: Setting::NotSet,
             update_id,
+            max_positions_per_attributes: None,
         }
     }
@@ -237,6 +239,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         indexing_builder.chunk_compression_type = self.chunk_compression_type;
         indexing_builder.chunk_compression_level = self.chunk_compression_level;
         indexing_builder.thread_pool = self.thread_pool;
+        indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes;
         indexing_builder.execute_raw(output, &cb)?;
 
         Ok(())
```


```diff
@@ -12,6 +12,7 @@ pub struct UpdateBuilder<'a> {
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) thread_pool: Option<&'a ThreadPool>,
+    pub(crate) max_positions_per_attributes: Option<u32>,
     pub(crate) update_id: u64,
 }
@@ -25,6 +26,7 @@ impl<'a> UpdateBuilder<'a> {
             chunk_compression_type: CompressionType::None,
             chunk_compression_level: None,
             thread_pool: None,
+            max_positions_per_attributes: None,
             update_id,
         }
     }
@@ -57,6 +59,10 @@ impl<'a> UpdateBuilder<'a> {
         self.thread_pool = Some(thread_pool);
     }
 
+    pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) {
+        self.max_positions_per_attributes = Some(max_positions_per_attributes);
+    }
+
     pub fn clear_documents<'t, 'u, 'i>(
         self,
         wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -87,6 +93,7 @@ impl<'a> UpdateBuilder<'a> {
         builder.chunk_compression_type = self.chunk_compression_type;
         builder.chunk_compression_level = self.chunk_compression_level;
         builder.thread_pool = self.thread_pool;
+        builder.max_positions_per_attributes = self.max_positions_per_attributes;
 
         builder
     }
@@ -105,6 +112,7 @@ impl<'a> UpdateBuilder<'a> {
         builder.chunk_compression_type = self.chunk_compression_type;
         builder.chunk_compression_level = self.chunk_compression_level;
         builder.thread_pool = self.thread_pool;
+        builder.max_positions_per_attributes = self.max_positions_per_attributes;
 
         builder
     }
```