mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Invert the word docid positions key order
This commit is contained in:
parent
c2405bcae2
commit
daa3673c1c
@ -13,10 +13,7 @@ use bstr::ByteSlice as _;
|
|||||||
use csv::StringRecord;
|
use csv::StringRecord;
|
||||||
use flate2::read::GzDecoder;
|
use flate2::read::GzDecoder;
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::BytesDecode;
|
use heed::{EnvOpenOptions, BytesEncode, types::*};
|
||||||
use heed::BytesEncode;
|
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
use heed::types::*;
|
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
use memmap::Mmap;
|
use memmap::Mmap;
|
||||||
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
||||||
@ -26,7 +23,7 @@ use structopt::StructOpt;
|
|||||||
|
|
||||||
use milli::heed_codec::CsvStringRecordCodec;
|
use milli::heed_codec::CsvStringRecordCodec;
|
||||||
use milli::tokenizer::{simple_tokenizer, only_words};
|
use milli::tokenizer::{simple_tokenizer, only_words};
|
||||||
use milli::{SmallVec32, Index, DocumentId, BEU32, StrBEU32Codec};
|
use milli::{SmallVec32, Index, DocumentId, BEU32};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||||
const ONE_MILLION: usize = 1_000_000;
|
const ONE_MILLION: usize = 1_000_000;
|
||||||
@ -202,12 +199,13 @@ impl Store {
|
|||||||
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|
||||||
for (word, positions) in iter {
|
// We serialize the positions into a buffer
|
||||||
key.truncate(1);
|
|
||||||
key.extend_from_slice(word.as_bytes());
|
|
||||||
// We prefix the words by the document id.
|
// We prefix the words by the document id.
|
||||||
key.extend_from_slice(&id.to_be_bytes());
|
key.extend_from_slice(&id.to_be_bytes());
|
||||||
// We serialize the positions into a buffer
|
|
||||||
|
for (word, positions) in iter {
|
||||||
|
key.truncate(1 + 4);
|
||||||
|
key.extend_from_slice(word.as_bytes());
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.reserve(positions.serialized_size());
|
buffer.reserve(positions.serialized_size());
|
||||||
positions.serialize_into(&mut buffer)?;
|
positions.serialize_into(&mut buffer)?;
|
||||||
@ -350,7 +348,7 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) ->
|
|||||||
}
|
}
|
||||||
else if key.starts_with(&[WORD_DOCID_POSITIONS_BYTE]) {
|
else if key.starts_with(&[WORD_DOCID_POSITIONS_BYTE]) {
|
||||||
// Write the postings lists
|
// Write the postings lists
|
||||||
index.word_docid_positions.as_polymorph()
|
index.docid_word_positions.as_polymorph()
|
||||||
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,26 +2,26 @@ use std::borrow::Cow;
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
pub struct StrBEU32Codec;
|
pub struct BEU32StrCodec;
|
||||||
|
|
||||||
impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
|
impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
|
||||||
type DItem = (&'a str, u32);
|
type DItem = (u32, &'a str);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (str_bytes, n_bytes) = bytes.split_at(bytes.len() - 4);
|
let (n_bytes, str_bytes) = bytes.split_at(4);
|
||||||
let s = str::from_utf8(str_bytes).ok()?;
|
|
||||||
let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||||
Some((s, n))
|
let s = str::from_utf8(str_bytes).ok()?;
|
||||||
|
Some((n, s))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
|
impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
|
||||||
type EItem = (&'a str, u32);
|
type EItem = (u32, &'a str);
|
||||||
|
|
||||||
fn bytes_encode((s, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
let mut bytes = Vec::with_capacity(s.len() + 4);
|
let mut bytes = Vec::with_capacity(s.len() + 4);
|
||||||
bytes.extend_from_slice(s.as_bytes());
|
|
||||||
bytes.extend_from_slice(&n.to_be_bytes());
|
bytes.extend_from_slice(&n.to_be_bytes());
|
||||||
|
bytes.extend_from_slice(s.as_bytes());
|
||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,7 +1,7 @@
|
|||||||
mod csv_string_record_codec;
|
mod csv_string_record_codec;
|
||||||
mod roaring_bitmap_codec;
|
mod roaring_bitmap_codec;
|
||||||
mod str_beu32_codec;
|
mod beu32_str_codec;
|
||||||
|
|
||||||
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
||||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
||||||
pub use self::str_beu32_codec::StrBEU32Codec;
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
|
@ -15,7 +15,7 @@ use heed::{PolyDatabase, Database};
|
|||||||
|
|
||||||
pub use self::search::{Search, SearchResult};
|
pub use self::search::{Search, SearchResult};
|
||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec, CsvStringRecordCodec};
|
pub use self::heed_codec::{RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec};
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
||||||
@ -38,7 +38,7 @@ pub struct Index {
|
|||||||
/// A word and all the documents ids containing the word.
|
/// A word and all the documents ids containing the word.
|
||||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
/// Maps a document id (u32) and a word to all the positions where the given word appears.
|
||||||
pub word_docid_positions: Database<StrBEU32Codec, RoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, RoaringBitmapCodec>,
|
||||||
/// Maps the document id to the document as a CSV line.
|
/// Maps the document id to the document as a CSV line.
|
||||||
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
}
|
}
|
||||||
@ -48,7 +48,7 @@ impl Index {
|
|||||||
Ok(Index {
|
Ok(Index {
|
||||||
main: env.create_poly_database(None)?,
|
main: env.create_poly_database(None)?,
|
||||||
word_docids: env.create_database(Some("word-docids"))?,
|
word_docids: env.create_database(Some("word-docids"))?,
|
||||||
word_docid_positions: env.create_database(Some("word-docid-positions"))?,
|
docid_word_positions: env.create_database(Some("docid-word-positions"))?,
|
||||||
documents: env.create_database(Some("documents"))?,
|
documents: env.create_database(Some("documents"))?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -151,7 +151,7 @@ impl<'a> Search<'a> {
|
|||||||
for (word, (_distance, docids)) in words {
|
for (word, (_distance, docids)) in words {
|
||||||
|
|
||||||
if docids.contains(candidate) {
|
if docids.contains(candidate) {
|
||||||
match index.word_docid_positions.get(rtxn, &(word, candidate))? {
|
match index.docid_word_positions.get(rtxn, &(candidate, word))? {
|
||||||
Some(positions) => union_positions.union_with(&positions),
|
Some(positions) => union_positions.union_with(&positions),
|
||||||
None => error!("position missing for candidate {} and word {}", candidate, word),
|
None => error!("position missing for candidate {} and word {}", candidate, word),
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user