Make the FieldsIdsMap serialization more stable by using a BTreeMap

This commit is contained in:
Clément Renault 2020-10-22 14:23:33 +02:00
parent 9133f38138
commit 566a7c3039
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 67 additions and 77 deletions

View File

@ -1,9 +1,9 @@
use std::collections::{HashMap, BTreeMap}; use std::collections::BTreeMap;
use serde::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldsIdsMap { pub struct FieldsIdsMap {
names_ids: HashMap<String, u8>, names_ids: BTreeMap<String, u8>,
ids_names: BTreeMap<u8, String>, ids_names: BTreeMap<u8, String>,
next_id: Option<u8>, next_id: Option<u8>,
} }
@ -11,7 +11,7 @@ pub struct FieldsIdsMap {
impl FieldsIdsMap { impl FieldsIdsMap {
pub fn new() -> FieldsIdsMap { pub fn new() -> FieldsIdsMap {
FieldsIdsMap { FieldsIdsMap {
names_ids: HashMap::new(), names_ids: BTreeMap::new(),
ids_names: BTreeMap::new(), ids_names: BTreeMap::new(),
next_id: Some(0), next_id: Some(0),
} }
@ -66,6 +66,12 @@ impl FieldsIdsMap {
} }
} }
impl Default for FieldsIdsMap {
fn default() -> FieldsIdsMap {
FieldsIdsMap::new()
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;

View File

@ -1,26 +0,0 @@
use std::borrow::Cow;
use csv::{StringRecord, Writer, ReaderBuilder};
pub struct CsvStringRecordCodec;
impl heed::BytesDecode<'_> for CsvStringRecordCodec {
type DItem = StringRecord;
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
let mut reader = ReaderBuilder::new()
.has_headers(false)
.buffer_capacity(bytes.len()) // we will just read this record
.from_reader(bytes);
reader.records().next()?.ok() // it returns an Option of a Result
}
}
impl heed::BytesEncode<'_> for CsvStringRecordCodec {
type EItem = StringRecord;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
let mut writer = Writer::from_writer(Vec::new());
writer.write_record(item).ok()?;
writer.into_inner().ok().map(Cow::Owned)
}
}

View File

@ -1,7 +1,6 @@
mod beu32_str_codec; mod beu32_str_codec;
mod bo_roaring_bitmap_codec; mod bo_roaring_bitmap_codec;
mod cbo_roaring_bitmap_codec; mod cbo_roaring_bitmap_codec;
mod csv_string_record_codec;
mod obkv_codec; mod obkv_codec;
mod roaring_bitmap_codec; mod roaring_bitmap_codec;
mod str_str_u8_codec; mod str_str_u8_codec;
@ -9,7 +8,6 @@ mod str_str_u8_codec;
pub use self::beu32_str_codec::BEU32StrCodec; pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::csv_string_record_codec::CsvStringRecordCodec;
pub use self::obkv_codec::ObkvCodec; pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
pub use self::str_str_u8_codec::StrStrU8Codec; pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -1,23 +1,23 @@
use anyhow::Context; use anyhow::Context;
use csv::StringRecord;
use heed::types::*; use heed::types::*;
use heed::{PolyDatabase, Database}; use heed::{PolyDatabase, Database};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::Search; use crate::Search;
use crate::{BEU32, DocumentId}; use crate::{BEU32, DocumentId};
use crate::fields_ids_map::FieldsIdsMap;
use crate::{ use crate::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
}; };
pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_FST_KEY: &str = "words-fst";
pub const HEADERS_KEY: &str = "headers"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
#[derive(Clone)] #[derive(Clone)]
pub struct Index { pub struct Index {
/// Contains many different types (e.g. the documents CSV headers). /// Contains many different types (e.g. the fields ids map).
pub main: PolyDatabase, pub main: PolyDatabase,
/// A word and all the documents ids containing the word. /// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>, pub word_docids: Database<Str, RoaringBitmapCodec>,
@ -25,7 +25,7 @@ pub struct Index {
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>, pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the document id to the document as a CSV line. /// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>, pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
} }
@ -44,17 +44,17 @@ impl Index {
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?) Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
} }
pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> { pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers) self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, FIELDS_IDS_MAP_KEY, map)
} }
pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> { pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<FieldsIdsMap>> {
self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY) self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)
} }
pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> { pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
match self.headers(rtxn)? { match self.fields_ids_map(rtxn)? {
Some(headers) => Ok(Some(headers.len())), Some(map) => Ok(Some(map.len())),
None => Ok(None), None => Ok(None),
} }
} }

View File

@ -1,6 +1,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use anyhow::bail; use anyhow::{bail, ensure};
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use fst::IntoStreamer; use fst::IntoStreamer;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
build.extend_stream(op.into_stream()).unwrap(); build.extend_stream(op.into_stream()).unwrap();
Ok(build.into_inner().unwrap()) Ok(build.into_inner().unwrap())
}, },
HEADERS_KEY => { FIELDS_IDS_MAP_KEY => {
assert!(values.windows(2).all(|vs| vs[0] == vs[1])); ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match");
Ok(values[0].to_vec()) Ok(values[0].to_vec())
}, },
DOCUMENTS_IDS_KEY => word_docids_merge(&[], values), DOCUMENTS_IDS_KEY => word_docids_merge(&[], values),

View File

@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use tempfile::tempfile; use tempfile::tempfile;
use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token}; use crate::tokenizer::{simple_tokenizer, only_token};
use crate::{SmallVec32, Position, DocumentId}; use crate::{SmallVec32, Position, DocumentId};
@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000;
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
pub struct Readers { pub struct Readers {
@ -182,10 +183,10 @@ impl Store {
Ok(()) Ok(())
} }
fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> { fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> {
let headers = CsvStringRecordCodec::bytes_encode(headers) let bytes = serde_json::to_vec(&map)?;
.with_context(|| format!("could not encode csv record"))?; self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?;
Ok(self.main_sorter.insert(HEADERS_KEY, headers)?) Ok(())
} }
fn write_document( fn write_document(
@ -320,7 +321,12 @@ impl Store {
// Write the headers into the store. // Write the headers into the store.
let headers = rdr.headers()?; let headers = rdr.headers()?;
self.write_headers(&headers)?;
let mut fields_ids_map = FieldsIdsMap::new();
for header in headers.iter() {
fields_ids_map.insert(header).context("no more field id available")?;
}
self.write_fields_ids_map(&fields_ids_map)?;
let mut before = Instant::now(); let mut before = Instant::now();
let mut document_id: usize = base_document_id; let mut document_id: usize = base_document_id;

View File

@ -20,8 +20,8 @@ pub use self::index::Index;
pub use self::search::{Search, SearchResult}; pub use self::search::{Search, SearchResult};
pub use self::update_store::UpdateStore; pub use self::update_store::UpdateStore;
pub use self::heed_codec::{ pub use self::heed_codec::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
}; };
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -1,8 +1,10 @@
use std::collections::HashMap;
use std::io::{self, BufRead}; use std::io::{self, BufRead};
use std::iter::once; use std::iter::once;
use std::path::PathBuf; use std::path::PathBuf;
use std::time::Instant; use std::time::Instant;
use anyhow::Context;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use log::debug; use log::debug;
use structopt::StructOpt; use structopt::StructOpt;
@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let query = result?; let query = result?;
let result = index.search(&rtxn).query(query).execute().unwrap(); let result = index.search(&rtxn).query(query).execute().unwrap();
let headers = match index.headers(&rtxn)? { let mut stdout = io::stdout();
Some(headers) => headers, let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default();
None => return Ok(()),
};
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?; let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
let mut wtr = csv::Writer::from_writer(io::stdout());
wtr.write_record(&headers)?;
for (_id, record) in documents { for (_id, record) in documents {
wtr.write_record(record.iter().map(|(_, v)| v))?; let document: anyhow::Result<HashMap<_, _>> = record.iter()
.map(|(k, v)| {
let key = fields_ids_map.name(k).context("field id not found")?;
let val = std::str::from_utf8(v)?;
Ok((key, val))
})
.collect();
let document = document?;
serde_json::to_writer(&mut stdout, &document)?;
} }
wtr.flush()?;
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
} }

View File

@ -382,11 +382,12 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let SearchResult { found_words, documents_ids } = search.execute().unwrap(); let SearchResult { found_words, documents_ids } = search.execute().unwrap();
let mut documents = Vec::new(); let mut documents = Vec::new();
if let Some(headers) = index.headers(&rtxn).unwrap() { let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default();
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
let mut record = record.iter() let mut record = record.iter()
.map(|(key_id, value)| { .map(|(key_id, value)| {
let key = headers[key_id as usize].to_owned(); let key = fields_ids_map.name(key_id).unwrap().to_owned();
let value = std::str::from_utf8(value).unwrap().to_owned(); let value = std::str::from_utf8(value).unwrap().to_owned();
(key, value) (key, value)
}) })
@ -398,7 +399,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
documents.push(record); documents.push(record);
} }
}
Response::builder() Response::builder()
.header("Content-Type", "application/json") .header("Content-Type", "application/json")