diff --git a/src/fields_ids_map.rs b/src/fields_ids_map.rs index e4a86f622..82d06e818 100644 --- a/src/fields_ids_map.rs +++ b/src/fields_ids_map.rs @@ -1,9 +1,9 @@ -use std::collections::{HashMap, BTreeMap}; +use std::collections::BTreeMap; use serde::{Serialize, Deserialize}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FieldsIdsMap { - names_ids: HashMap, + names_ids: BTreeMap, ids_names: BTreeMap, next_id: Option, } @@ -11,7 +11,7 @@ pub struct FieldsIdsMap { impl FieldsIdsMap { pub fn new() -> FieldsIdsMap { FieldsIdsMap { - names_ids: HashMap::new(), + names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0), } @@ -66,6 +66,12 @@ impl FieldsIdsMap { } } +impl Default for FieldsIdsMap { + fn default() -> FieldsIdsMap { + FieldsIdsMap::new() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/heed_codec/csv_string_record_codec.rs b/src/heed_codec/csv_string_record_codec.rs deleted file mode 100644 index d444d48a4..000000000 --- a/src/heed_codec/csv_string_record_codec.rs +++ /dev/null @@ -1,26 +0,0 @@ -use std::borrow::Cow; -use csv::{StringRecord, Writer, ReaderBuilder}; - -pub struct CsvStringRecordCodec; - -impl heed::BytesDecode<'_> for CsvStringRecordCodec { - type DItem = StringRecord; - - fn bytes_decode(bytes: &[u8]) -> Option { - let mut reader = ReaderBuilder::new() - .has_headers(false) - .buffer_capacity(bytes.len()) // we will just read this record - .from_reader(bytes); - reader.records().next()?.ok() // it return an Option of Result - } -} - -impl heed::BytesEncode<'_> for CsvStringRecordCodec { - type EItem = StringRecord; - - fn bytes_encode(item: &Self::EItem) -> Option> { - let mut writer = Writer::from_writer(Vec::new()); - writer.write_record(item).ok()?; - writer.into_inner().ok().map(Cow::Owned) - } -} diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index a6c1d9555..68739fbf1 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,7 +1,6 @@ mod beu32_str_codec; mod bo_roaring_bitmap_codec; mod cbo_roaring_bitmap_codec; -mod csv_string_record_codec; mod obkv_codec; mod roaring_bitmap_codec; mod str_str_u8_codec; @@ -9,7 +8,6 @@ mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; -pub use self::csv_string_record_codec::CsvStringRecordCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/src/index.rs b/src/index.rs index c28947597..0f44a6167 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,23 +1,23 @@ use anyhow::Context; -use csv::StringRecord; use heed::types::*; use heed::{PolyDatabase, Database}; use roaring::RoaringBitmap; use crate::Search; use crate::{BEU32, DocumentId}; +use crate::fields_ids_map::FieldsIdsMap; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, - CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, + BoRoaringBitmapCodec, CboRoaringBitmapCodec, }; pub const WORDS_FST_KEY: &str = "words-fst"; -pub const HEADERS_KEY: &str = "headers"; +pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; #[derive(Clone)] pub struct Index { - /// Contains many different types (e.g. the documents CSV headers). + /// Contains many different types (e.g. the fields ids map). pub main: PolyDatabase, /// A word and all the documents ids containing the word. pub word_docids: Database, @@ -25,7 +25,7 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, - /// Maps the document id to the document as a CSV line. + /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } @@ -44,17 +44,17 @@ impl Index { Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?) } - pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> { - self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers) + pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_IDS_MAP_KEY, map) } - pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result> { - self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY) + pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result> { + self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY) } - pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { - match self.headers(rtxn)? { - Some(headers) => Ok(Some(headers.len())), + pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { + match self.fields_ids_map(rtxn)? { + Some(map) => Ok(Some(map.len())), None => Ok(None), } } diff --git a/src/indexing/merge_function.rs b/src/indexing/merge_function.rs index ac55d62ee..3e957394f 100644 --- a/src/indexing/merge_function.rs +++ b/src/indexing/merge_function.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use anyhow::bail; +use anyhow::{bail, ensure}; use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); -const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); +const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { build.extend_stream(op.into_stream()).unwrap(); Ok(build.into_inner().unwrap()) }, - HEADERS_KEY => { - assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + FIELDS_IDS_MAP_KEY => { + ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); Ok(values[0].to_vec()) }, DOCUMENTS_IDS_KEY => word_docids_merge(&[], values), diff --git a/src/indexing/store.rs b/src/indexing/store.rs index 29e4c046d..28564a1e0 100644 --- a/src/indexing/store.rs +++ b/src/indexing/store.rs @@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use roaring::RoaringBitmap; use tempfile::tempfile; -use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; +use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::{SmallVec32, Position, DocumentId}; @@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); -const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); +const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); pub struct Readers { @@ -182,10 +183,10 @@ impl Store { Ok(()) } - fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> { - let headers = CsvStringRecordCodec::bytes_encode(headers) - .with_context(|| format!("could not encode csv record"))?; - Ok(self.main_sorter.insert(HEADERS_KEY, headers)?) + fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> { + let bytes = serde_json::to_vec(&map)?; + self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?; + Ok(()) } fn write_document( @@ -320,7 +321,12 @@ impl Store { // Write the headers into the store. let headers = rdr.headers()?; - self.write_headers(&headers)?; + + let mut fields_ids_map = FieldsIdsMap::new(); + for header in headers.iter() { + fields_ids_map.insert(header).context("no more field id available")?; + } + self.write_fields_ids_map(&fields_ids_map)?; let mut before = Instant::now(); let mut document_id: usize = base_document_id; diff --git a/src/lib.rs b/src/lib.rs index 1da9833c8..2d03b4ddf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,8 +20,8 @@ pub use self::index::Index; pub use self::search::{Search, SearchResult}; pub use self::update_store::UpdateStore; pub use self::heed_codec::{ - RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, - CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, + RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, + ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, }; pub type FastMap4 = HashMap>; diff --git a/src/subcommand/search.rs b/src/subcommand/search.rs index f7f7adaeb..74b214b5a 100644 --- a/src/subcommand/search.rs +++ b/src/subcommand/search.rs @@ -1,8 +1,10 @@ +use std::collections::HashMap; use std::io::{self, BufRead}; use std::iter::once; use std::path::PathBuf; use std::time::Instant; +use anyhow::Context; use heed::EnvOpenOptions; use log::debug; use structopt::StructOpt; @@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let query = result?; let result = index.search(&rtxn).query(query).execute().unwrap(); - let headers = match index.headers(&rtxn)? { - Some(headers) => headers, - None => return Ok(()), - }; + let mut stdout = io::stdout(); + let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default(); let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?; - let mut wtr = csv::Writer::from_writer(io::stdout()); - wtr.write_record(&headers)?; for (_id, record) in documents { - wtr.write_record(record.iter().map(|(_, v)| v))?; + let document: anyhow::Result> = record.iter() + .map(|(k, v)| { + let key = fields_ids_map.name(k).context("field id not found")?; + let val = std::str::from_utf8(v)?; + Ok((key, val)) + }) + .collect(); + + let document = document?; + serde_json::to_writer(&mut stdout, &document)?; } - wtr.flush()?; debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); } diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs index fdbc60d04..961ac2a81 100644 --- a/src/subcommand/serve.rs +++ b/src/subcommand/serve.rs @@ -382,22 +382,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let SearchResult { found_words, documents_ids } = search.execute().unwrap(); let mut documents = Vec::new(); - if let Some(headers) = index.headers(&rtxn).unwrap() { - for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { - let mut record = record.iter() - .map(|(key_id, value)| { - let key = headers[key_id as usize].to_owned(); - let value = std::str::from_utf8(value).unwrap().to_owned(); - (key, value) - }) - .collect(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default(); - if !disable_highlighting { - highlight_record(&mut record, &found_words); - } + for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { + let mut record = record.iter() + .map(|(key_id, value)| { + let key = fields_ids_map.name(key_id).unwrap().to_owned(); + let value = std::str::from_utf8(value).unwrap().to_owned(); + (key, value) + }) + .collect(); - documents.push(record); + if !disable_highlighting { + highlight_record(&mut record, &found_words); } + + documents.push(record); } Response::builder()