Make the FieldsIdsMap serialization more stable by using a BTreeMap

This commit is contained in:
Clément Renault 2020-10-22 14:23:33 +02:00
parent 9133f38138
commit 566a7c3039
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 67 additions and 77 deletions

View File

@ -1,9 +1,9 @@
use std::collections::{HashMap, BTreeMap};
use std::collections::BTreeMap;
use serde::{Serialize, Deserialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldsIdsMap {
names_ids: HashMap<String, u8>,
names_ids: BTreeMap<String, u8>,
ids_names: BTreeMap<u8, String>,
next_id: Option<u8>,
}
@ -11,7 +11,7 @@ pub struct FieldsIdsMap {
impl FieldsIdsMap {
pub fn new() -> FieldsIdsMap {
FieldsIdsMap {
names_ids: HashMap::new(),
names_ids: BTreeMap::new(),
ids_names: BTreeMap::new(),
next_id: Some(0),
}
@ -66,6 +66,12 @@ impl FieldsIdsMap {
}
}
impl Default for FieldsIdsMap {
fn default() -> FieldsIdsMap {
FieldsIdsMap::new()
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@ -1,26 +0,0 @@
use std::borrow::Cow;
use csv::{StringRecord, Writer, ReaderBuilder};
pub struct CsvStringRecordCodec;
impl heed::BytesDecode<'_> for CsvStringRecordCodec {
    type DItem = StringRecord;

    /// Decodes a single CSV record from the raw bytes stored in the database.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        // Size the internal buffer to the payload: exactly one record is read.
        let reader = ReaderBuilder::new()
            .has_headers(false)
            .buffer_capacity(bytes.len())
            .from_reader(bytes);
        // `into_records()` yields `Option<Result<StringRecord, _>>`;
        // flatten both layers into a plain `Option`.
        reader.into_records().next()?.ok()
    }
}
impl heed::BytesEncode<'_> for CsvStringRecordCodec {
    type EItem = StringRecord;

    /// Encodes a CSV record into its raw byte representation by serializing
    /// it through an in-memory CSV writer.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        let mut writer = Writer::from_writer(Vec::new());
        writer.write_record(item).ok()?;
        // Recover the underlying Vec<u8> buffer and hand it back as owned bytes.
        let buffer = writer.into_inner().ok()?;
        Some(Cow::Owned(buffer))
    }
}

View File

@ -1,7 +1,6 @@
mod beu32_str_codec;
mod bo_roaring_bitmap_codec;
mod cbo_roaring_bitmap_codec;
mod csv_string_record_codec;
mod obkv_codec;
mod roaring_bitmap_codec;
mod str_str_u8_codec;
@ -9,7 +8,6 @@ mod str_str_u8_codec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::csv_string_record_codec::CsvStringRecordCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -1,23 +1,23 @@
use anyhow::Context;
use csv::StringRecord;
use heed::types::*;
use heed::{PolyDatabase, Database};
use roaring::RoaringBitmap;
use crate::Search;
use crate::{BEU32, DocumentId};
use crate::fields_ids_map::FieldsIdsMap;
use crate::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
BoRoaringBitmapCodec, CboRoaringBitmapCodec,
};
pub const WORDS_FST_KEY: &str = "words-fst";
pub const HEADERS_KEY: &str = "headers";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
#[derive(Clone)]
pub struct Index {
/// Contains many different types (e.g. the documents CSV headers).
/// Contains many different types (e.g. the fields ids map).
pub main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
@ -25,7 +25,7 @@ pub struct Index {
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the document id to the document as a CSV line.
/// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
@ -44,17 +44,17 @@ impl Index {
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
}
pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, FIELDS_IDS_MAP_KEY, map)
}
pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<FieldsIdsMap>> {
self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)
}
pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
match self.headers(rtxn)? {
Some(headers) => Ok(Some(headers.len())),
pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
match self.fields_ids_map(rtxn)? {
Some(map) => Ok(Some(map.len())),
None => Ok(None),
}
}

View File

@ -1,6 +1,6 @@
use std::borrow::Cow;
use anyhow::bail;
use anyhow::{bail, ensure};
use bstr::ByteSlice as _;
use fst::IntoStreamer;
use roaring::RoaringBitmap;
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
build.extend_stream(op.into_stream()).unwrap();
Ok(build.into_inner().unwrap())
},
HEADERS_KEY => {
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
FIELDS_IDS_MAP_KEY => {
ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match");
Ok(values[0].to_vec())
},
DOCUMENTS_IDS_KEY => word_docids_merge(&[], values),

View File

@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use roaring::RoaringBitmap;
use tempfile::tempfile;
use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token};
use crate::{SmallVec32, Position, DocumentId};
@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000;
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
pub struct Readers {
@ -182,10 +183,10 @@ impl Store {
Ok(())
}
fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> {
let headers = CsvStringRecordCodec::bytes_encode(headers)
.with_context(|| format!("could not encode csv record"))?;
Ok(self.main_sorter.insert(HEADERS_KEY, headers)?)
fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> {
let bytes = serde_json::to_vec(&map)?;
self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?;
Ok(())
}
fn write_document(
@ -320,7 +321,12 @@ impl Store {
// Write the headers into the store.
let headers = rdr.headers()?;
self.write_headers(&headers)?;
let mut fields_ids_map = FieldsIdsMap::new();
for header in headers.iter() {
fields_ids_map.insert(header).context("no more field id available")?;
}
self.write_fields_ids_map(&fields_ids_map)?;
let mut before = Instant::now();
let mut document_id: usize = base_document_id;

View File

@ -20,8 +20,8 @@ pub use self::index::Index;
pub use self::search::{Search, SearchResult};
pub use self::update_store::UpdateStore;
pub use self::heed_codec::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
};
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -1,8 +1,10 @@
use std::collections::HashMap;
use std::io::{self, BufRead};
use std::iter::once;
use std::path::PathBuf;
use std::time::Instant;
use anyhow::Context;
use heed::EnvOpenOptions;
use log::debug;
use structopt::StructOpt;
@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let query = result?;
let result = index.search(&rtxn).query(query).execute().unwrap();
let headers = match index.headers(&rtxn)? {
Some(headers) => headers,
None => return Ok(()),
};
let mut stdout = io::stdout();
let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default();
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
let mut wtr = csv::Writer::from_writer(io::stdout());
wtr.write_record(&headers)?;
for (_id, record) in documents {
wtr.write_record(record.iter().map(|(_, v)| v))?;
let document: anyhow::Result<HashMap<_, _>> = record.iter()
.map(|(k, v)| {
let key = fields_ids_map.name(k).context("field id not found")?;
let val = std::str::from_utf8(v)?;
Ok((key, val))
})
.collect();
let document = document?;
serde_json::to_writer(&mut stdout, &document)?;
}
wtr.flush()?;
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
}

View File

@ -382,22 +382,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
let mut documents = Vec::new();
if let Some(headers) = index.headers(&rtxn).unwrap() {
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
let mut record = record.iter()
.map(|(key_id, value)| {
let key = headers[key_id as usize].to_owned();
let value = std::str::from_utf8(value).unwrap().to_owned();
(key, value)
})
.collect();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default();
if !disable_highlighting {
highlight_record(&mut record, &found_words);
}
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
let mut record = record.iter()
.map(|(key_id, value)| {
let key = fields_ids_map.name(key_id).unwrap().to_owned();
let value = std::str::from_utf8(value).unwrap().to_owned();
(key, value)
})
.collect();
documents.push(record);
if !disable_highlighting {
highlight_record(&mut record, &found_words);
}
documents.push(record);
}
Response::builder()