Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-30 00:34:26 +01:00)

Implement word count and word pair proximity extractors

parent 27308eaab1
commit 34f11e3380
New file: extract_fid_word_count_docids.rs

@@ -0,0 +1,135 @@
use std::{borrow::Cow, collections::HashMap};

use heed::RoTxn;

use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
use crate::{
    update::{
        new::{extract::cache::CboCachedSorter, DocumentChange},
        MergeDeladdCboRoaringBitmaps,
    },
    FieldId, GlobalFieldsIdsMap, Index, Result,
};

const MAX_COUNTED_WORDS: usize = 30;

pub struct FidWordCountDocidsExtractor;
impl SearchableExtractor for FidWordCountDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(vec![])
    }

    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
        unreachable!()
    }

    // This method is reimplemented to count the number of words in each field of the document
    // and to store the docids of the documents that have a number of words in a given field equal to or under MAX_COUNTED_WORDS.
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let mut key_buffer = Vec::new();
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;

                // The docids of the documents that have a number of words in a given field equal to or under MAX_COUNTED_WORDS are deleted.
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
                        // TODO manage the error
                        cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                    }
                }
            }
            DocumentChange::Update(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
                    fid_word_count
                        .entry(fid)
                        .and_modify(|(current_count, _new_count)| *current_count += 1)
                        .or_insert((1, 0));
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;

                let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
                    fid_word_count
                        .entry(fid)
                        .and_modify(|(_current_count, new_count)| *new_count += 1)
                        .or_insert((0, 1));
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;

                // Only the fields that have a change in the number of words are updated.
                for (fid, (current_count, new_count)) in fid_word_count.iter() {
                    if *current_count != *new_count {
                        if *current_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *current_count as u8, &mut key_buffer);
                            // TODO manage the error
                            cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                        }
                        if *new_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *new_count as u8, &mut key_buffer);
                            // TODO manage the error
                            cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                        }
                    }
                }
            }
            DocumentChange::Insertion(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;

                // The docids of the documents that have a number of words in a given field equal to or under MAX_COUNTED_WORDS are stored.
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
                        // TODO manage the error
                        cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                    }
                }
            }
        }

        Ok(())
    }
}

fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec<u8>) -> &[u8] {
    key_buffer.clear();
    key_buffer.extend_from_slice(&fid.to_be_bytes());
    key_buffer.push(count);
    key_buffer.as_slice()
}
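For illustration, here is a standalone sketch (not part of the commit) of the key layout that build_key produces, assuming FieldId is a u16 alias as the to_be_bytes call suggests: each (field id, word count) pair maps to a fixed three-byte key, so the docids of every document whose field holds that many words accumulate under the same sorter entry.

// Standalone sketch: same byte layout as build_key above, with FieldId assumed to be u16.
fn build_key(fid: u16, count: u8, key_buffer: &mut Vec<u8>) -> &[u8] {
    key_buffer.clear();
    key_buffer.extend_from_slice(&fid.to_be_bytes()); // 2 bytes, big-endian field id
    key_buffer.push(count);                           // 1 byte word count
    key_buffer.as_slice()
}

fn main() {
    let mut buffer = Vec::new();
    // Field id 2 with 5 counted words yields the 3-byte key [0, 2, 5].
    assert_eq!(build_key(2, 5, &mut buffer), &[0, 2, 5]);
}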
New file: extract_word_pair_proximity_docids.rs

@@ -0,0 +1,182 @@
use std::{
    borrow::Cow,
    collections::{BTreeMap, VecDeque},
};

use heed::RoTxn;
use itertools::merge_join_by;
use obkv::KvReader;

use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
use crate::{
    proximity::{index_proximity, MAX_DISTANCE},
    update::{
        new::{extract::cache::CboCachedSorter, DocumentChange},
        MergeDeladdCboRoaringBitmaps,
    },
    FieldId, GlobalFieldsIdsMap, Index, Result,
};

pub struct WordPairProximityDocidsExtractor;
impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(vec![])
    }

    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
        unreachable!()
    }

    // This method is reimplemented to extract the word pairs of the document with their proximity
    // and to store the docids under the corresponding (proximity, word1, word2) keys.
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()> {
        // TODO: mutualize those buffers
        let mut key_buffer = Vec::new();
        let mut add_word_pair_proximity = BTreeMap::new();
        let mut del_word_pair_proximity = BTreeMap::new();
        let mut word_positions: VecDeque<(String, u16)> =
            VecDeque::with_capacity(MAX_DISTANCE as usize);

        let docid = document_change.docid();
        match document_change {
            DocumentChange::Deletion(inner) => {
                let document = inner.current(rtxn, index)?.unwrap();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut del_word_pair_proximity,
                )?;
            }
            DocumentChange::Update(inner) => {
                let document = inner.current(rtxn, index)?.unwrap();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut del_word_pair_proximity,
                )?;
                let document = inner.new();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut add_word_pair_proximity,
                )?;
            }
            DocumentChange::Insertion(inner) => {
                let document = inner.new();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut add_word_pair_proximity,
                )?;
            }
        }

        use itertools::EitherOrBoth::*;
        for eob in
            merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
                d.cmp(a)
            })
        {
            match eob {
                Left(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
                    cached_sorter.insert_del_u32(key, docid).unwrap();
                }
                Right(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
                    cached_sorter.insert_add_u32(key, docid).unwrap();
                }
                Both(((w1, w2), del_prox), (_, add_prox)) => {
                    if del_prox != add_prox {
                        let key = build_key(*del_prox, w1, w2, &mut key_buffer);
                        cached_sorter.insert_del_u32(key, docid).unwrap();
                        let key = build_key(*add_prox, w1, w2, &mut key_buffer);
                        cached_sorter.insert_add_u32(key, docid).unwrap();
                    }
                }
            }
        }

        Ok(())
    }
}

fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &'a [u8] {
    key_buffer.clear();
    key_buffer.push(prox);
    key_buffer.extend_from_slice(w1.as_bytes());
    key_buffer.push(0);
    key_buffer.extend_from_slice(w2.as_bytes());
    key_buffer.as_slice()
}

fn word_positions_into_word_pair_proximity(
    word_positions: &mut VecDeque<(String, u16)>,
    word_pair_proximity: &mut BTreeMap<(String, String), u8>,
) -> Result<()> {
    let (head_word, head_position) = word_positions.pop_front().unwrap();
    for (word, position) in word_positions.iter() {
        let prox = index_proximity(head_position as u32, *position as u32) as u8;
        if prox > 0 && prox < MAX_DISTANCE as u8 {
            word_pair_proximity
                .entry((head_word.clone(), word.clone()))
                .and_modify(|p| {
                    *p = std::cmp::min(*p, prox);
                })
                .or_insert(prox);
        }
    }
    Ok(())
}

fn process_document_tokens(
    document: &KvReader<FieldId>,
    document_tokenizer: &DocumentTokenizer,
    fields_ids_map: &mut GlobalFieldsIdsMap,
    word_positions: &mut VecDeque<(String, u16)>,
    word_pair_proximity: &mut BTreeMap<(String, String), u8>,
) -> Result<()> {
    let mut token_fn = |_fid: FieldId, pos: u16, word: &str| {
        // Drain the proximity window until the head word is considered close to the word we are inserting.
        while word_positions
            .front()
            .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
        {
            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
        }

        // Insert the new word.
        word_positions.push_back((word.to_string(), pos));
        Ok(())
    };
    document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;

    while !word_positions.is_empty() {
        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
    }

    Ok(())
}
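To make the sliding proximity window in process_document_tokens and word_positions_into_word_pair_proximity easier to follow, here is a self-contained sketch of the same idea on a toy token stream; MAX_DISTANCE and index_proximity below are simplified stand-ins for the items imported from crate::proximity, not their real definitions.

use std::collections::{BTreeMap, VecDeque};

// Simplified stand-ins; the real MAX_DISTANCE and index_proximity live in crate::proximity.
const MAX_DISTANCE: u32 = 8;
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs <= rhs { (rhs - lhs).min(MAX_DISTANCE) } else { MAX_DISTANCE }
}

// Pop the head of the window and record a (head, other) pair for every word still close enough,
// keeping the smallest proximity seen for a given pair.
fn drain_head(
    window: &mut VecDeque<(String, u16)>,
    pairs: &mut BTreeMap<(String, String), u8>,
) {
    let (head_word, head_pos) = window.pop_front().unwrap();
    for (word, pos) in window.iter() {
        let prox = index_proximity(head_pos as u32, *pos as u32) as u8;
        if prox > 0 && prox < MAX_DISTANCE as u8 {
            pairs
                .entry((head_word.clone(), word.clone()))
                .and_modify(|p| *p = (*p).min(prox))
                .or_insert(prox);
        }
    }
}

fn main() {
    // Token stream as the tokenizer would emit it: (word, position).
    let tokens = [("the", 0u16), ("quick", 1), ("brown", 2), ("fox", 3)];
    let mut window: VecDeque<(String, u16)> = VecDeque::new();
    let mut pairs: BTreeMap<(String, String), u8> = BTreeMap::new();

    for (word, pos) in tokens {
        // Drain while the head word is too far from the incoming one, as token_fn does above.
        while window
            .front()
            .map_or(false, |(_, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
        {
            drain_head(&mut window, &mut pairs);
        }
        window.push_back((word.to_string(), pos));
    }
    // Flush the remaining window once the document is fully tokenized.
    while !window.is_empty() {
        drain_head(&mut window, &mut pairs);
    }

    // ("quick", "fox") gets proximity 2; ("the", "fox") gets proximity 3.
    assert_eq!(pairs[&("quick".to_string(), "fox".to_string())], 2);
    assert_eq!(pairs[&("the".to_string(), "fox".to_string())], 3);
}

The BTreeMap keeps the smallest proximity observed for each pair, which is what the and_modify call in the extractor does as well.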
Searchable extractor module (mod.rs):

@@ -1,13 +1,17 @@
+mod extract_fid_word_count_docids;
 mod extract_word_docids;
+mod extract_word_pair_proximity_docids;
 mod tokenize_document;

 use std::borrow::Cow;
 use std::fs::File;

+pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor;
 pub use extract_word_docids::{
     ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
     WordPositionDocidsExtractor,
 };
+pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
 use grenad::Merger;
 use heed::RoTxn;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
tokenize_document.rs:

@@ -3,6 +3,7 @@ use std::collections::HashMap;
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use serde_json::Value;

+use crate::proximity::MAX_DISTANCE;
 use crate::update::new::extract::perm_json_p::{
     seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
 };
@@ -43,8 +44,10 @@ impl<'a> DocumentTokenizer<'a> {
                 return Err(UserError::AttributeLimitReached.into());
             };

-            let position =
-                field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
+            let position = field_position
+                .entry(field_id)
+                .and_modify(|counter| *counter += MAX_DISTANCE)
+                .or_insert(0);
             if *position as u32 >= self.max_positions_per_attributes {
                 return Ok(());
             }
@@ -116,19 +119,19 @@ impl<'a> DocumentTokenizer<'a> {
 }

 /// take an iterator on tokens and compute their relative position depending on separator kinds
-/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
+/// if it's a `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words,
 /// else we keep the standard proximity of 1 between words.
 fn process_tokens<'a>(
-    start_offset: usize,
+    start_offset: u32,
     tokens: impl Iterator<Item = Token<'a>>,
-) -> impl Iterator<Item = (usize, Token<'a>)> {
+) -> impl Iterator<Item = (u32, Token<'a>)> {
     tokens
         .skip_while(|token| token.is_separator())
         .scan((start_offset, None), |(offset, prev_kind), mut token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                     *offset += match *prev_kind {
-                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
                         Some(_) => 1,
                         None => 0,
                     };
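The change above replaces the hard-coded 8 with MAX_DISTANCE whenever a hard separator is crossed. Below is a rough, standalone sketch of that position bookkeeping, with charabia's token types replaced by a hypothetical Kind enum and MAX_DISTANCE assumed to be 8 (the literal it replaces); it is an illustration, not the commit's code.

// Standalone sketch: Kind stands in for charabia's TokenKind / SeparatorKind.
const MAX_DISTANCE: u32 = 8; // assumed value, mirroring the literal it replaces above

#[derive(Clone, Copy)]
enum Kind {
    Word,
    SoftSeparator, // e.g. a space
    HardSeparator, // e.g. a period
}

fn relative_positions(tokens: &[(&str, Kind)], start_offset: u32) -> Vec<(u32, String)> {
    let mut positions = Vec::new();
    let mut offset = start_offset;
    let mut prev: Option<Kind> = None;
    for (lemma, kind) in tokens {
        match kind {
            Kind::Word => {
                offset += match prev {
                    Some(Kind::HardSeparator) => MAX_DISTANCE, // big jump across hard separators
                    Some(_) => 1,                              // regular word-to-word step
                    None => 0,                                 // the first word keeps the start offset
                };
                positions.push((offset, lemma.to_string()));
                prev = Some(Kind::Word);
            }
            // A hard separator keeps priority over a following soft one until the next word.
            Kind::HardSeparator => prev = Some(Kind::HardSeparator),
            Kind::SoftSeparator => {
                if !matches!(prev, Some(Kind::HardSeparator)) {
                    prev = Some(Kind::SoftSeparator);
                }
            }
        }
    }
    positions
}

fn main() {
    // "doggo doggo. doggo": a soft separator, then a hard one.
    let tokens = [
        ("doggo", Kind::Word),
        (" ", Kind::SoftSeparator),
        ("doggo", Kind::Word),
        (".", Kind::HardSeparator),
        ("doggo", Kind::Word),
    ];
    let positions = relative_positions(&tokens, 0);
    assert_eq!(positions[0].0, 0);
    assert_eq!(positions[1].0, 1);
    assert_eq!(positions[2].0, 1 + MAX_DISTANCE); // the hard separator adds MAX_DISTANCE, not 1
}

This matches the updated test snapshot below, where the third "doggo" is expected at position MAX_DISTANCE past the second one.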
@@ -246,7 +249,7 @@ mod test {
             ]: "doggo",
             [
                 2,
-                8,
+                MAX_DISTANCE,
             ]: "doggo",
             [
                 2,