From 3b64735058c678a8314bf118a92a84c9ed47d11d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 28 Dec 2020 19:08:53 +0100 Subject: [PATCH] Introduce a struct to compute facets values --- http-ui/public/script.js | 6 +- http-ui/src/main.rs | 23 +++++- src/index.rs | 6 +- src/lib.rs | 2 +- src/search/facet/facet_distribution.rs | 106 +++++++++++++++++++++++++ src/search/facet/mod.rs | 10 ++- src/search/mod.rs | 23 +++--- 7 files changed, 156 insertions(+), 20 deletions(-) create mode 100644 src/search/facet/facet_distribution.rs diff --git a/http-ui/public/script.js b/http-ui/public/script.js index fb7a95cc9..a2005a9bd 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -15,18 +15,18 @@ $('#query, #facet').on('input', function () { type: "POST", url: "query", contentType: 'application/json', - data: JSON.stringify({ 'query': query, 'facetCondition': facet }), + data: JSON.stringify({ 'query': query, 'facetCondition': facet, "facetDistribution": true }), contentType: 'application/json', success: function (data, textStatus, request) { results.innerHTML = ''; let timeSpent = request.getResponseHeader('Time-Ms'); - let numberOfDocuments = data.length; + let numberOfDocuments = data.documents.length; count.innerHTML = `${numberOfDocuments}`; time.innerHTML = `${timeSpent}ms`; time.classList.remove('fade-in-out'); - for (element of data) { + for (element of data.documents) { const elem = document.createElement('li'); elem.classList.add("document"); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 366dbaa1e..fe2f71c8e 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -626,6 +626,14 @@ async fn main() -> anyhow::Result<()> { struct QueryBody { query: Option, facet_condition: Option, + facet_distribution: Option, + } + + #[derive(Debug, Serialize)] + #[serde(rename_all = "camelCase")] + struct Answer { + documents: Vec>, + facets: HashMap>, } let disable_highlighting = opt.disable_highlighting; @@ -649,7 +657,13 @@ async fn main() -> anyhow::Result<()> { } } - let SearchResult { found_words, documents_ids } = search.execute().unwrap(); + let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); + + let facets = if query.facet_distribution == Some(true) { + Some(index.facets(&rtxn).candidates(candidates).execute().unwrap()) + } else { + None + }; let mut documents = Vec::new(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -674,10 +688,15 @@ async fn main() -> anyhow::Result<()> { documents.push(object); } + let answer = Answer { + documents, + facets: facets.unwrap_or_default(), + }; + Response::builder() .header("Content-Type", "application/json") .header("Time-Ms", before_search.elapsed().as_millis().to_string()) - .body(serde_json::to_string(&documents).unwrap()) + .body(serde_json::to_string(&answer).unwrap()) }); let index_cloned = index.clone(); diff --git a/src/index.rs b/src/index.rs index 601816148..6020e332c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -9,7 +9,7 @@ use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::{default_criteria, Criterion, Search}; +use crate::{default_criteria, Criterion, Search, FacetDistribution}; use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, @@ -351,6 +351,10 @@ impl Index { Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?) } + pub fn facets<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { + FacetDistribution::new(rtxn, self) + } + pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> { Search::new(rtxn, self) } diff --git a/src/lib.rs b/src/lib.rs index 435c3be91..09a66ea65 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetCondition, SearchResult}; +pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs new file mode 100644 index 000000000..3af475c71 --- /dev/null +++ b/src/search/facet/facet_distribution.rs @@ -0,0 +1,106 @@ +use std::collections::{HashSet, HashMap}; +use std::fmt; +use std::ops::Bound::Unbounded; + +use roaring::RoaringBitmap; +use serde_json::Value; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::search::facet::FacetRange; +use crate::{Index, FieldId}; + +pub struct FacetDistribution<'a> { + facets: Option>, + candidates: Option, + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, +} + +impl<'a> FacetDistribution<'a> { + pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { + FacetDistribution { facets: None, candidates: None, rtxn, index } + } + + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { + self.candidates = Some(candidates); + self + } + + pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { + self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); + self + } + + fn facet_values(&self, field_id: FieldId, field_type: FacetType) -> heed::Result> { + let db = self.index.facet_field_id_value_docids; + let iter = match field_type { + FacetType::String => { + let iter = db + .prefix_iter(&self.rtxn, &[field_id])? + .remap_key_type::() + .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); + Box::new(iter) as Box::> + }, + FacetType::Integer => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( + self.rtxn, db, field_id, 0, Unbounded, Unbounded, + )?; + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids)))) + }, + FacetType::Float => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( + self.rtxn, db, field_id, 0, Unbounded, Unbounded, + )?; + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids)))) + }, + }; + + let mut facet_values = Vec::new(); + for result in iter { + let (value, docids) = result?; + match &self.candidates { + Some(candidates) => if !docids.is_disjoint(candidates) { + facet_values.push(value); + }, + None => facet_values.push(value), + } + } + Ok(facet_values) + } + + pub fn execute(&self) -> heed::Result>> { + let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let faceted_fields = self.index.faceted_fields(self.rtxn)?; + let fields_ids: Vec<_> = match &self.facets { + Some(names) => { + names.iter().filter_map(|n| { + let id = fields_ids_map.id(n)?; + faceted_fields.get(&id).cloned().map(|t| (id, t)) + }).collect() + }, + None => faceted_fields.iter().map(|(id, t)| (*id, *t)).collect(), + }; + + let mut facets_values = HashMap::new(); + for (fid, ftype) in fields_ids { + let facet_name = fields_ids_map.name(fid).unwrap(); + let values = self.facet_values(fid, ftype)?; + facets_values.insert(facet_name.to_string(), values); + } + + Ok(facets_values) + } +} + +impl fmt::Debug for FacetDistribution<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; + f.debug_struct("FacetDistribution") + .field("facets", facets) + .field("candidates", candidates) + .finish() + } +} diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index 41212e83e..70b5b4658 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -13,11 +13,13 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::{Index, FieldId}; pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet_distribution::FacetDistribution; mod facet_condition; +mod facet_distribution; mod parser; -struct FacetRange<'t, T: 't, KC> { +pub struct FacetRange<'t, T: 't, KC> { iter: RoRange<'t, KC, LazyDecode>, end: Bound, } @@ -27,7 +29,7 @@ where KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, T: PartialOrd + Copy + Bounded, { - fn new( + pub fn new( rtxn: &'t heed::RoTxn, db: Database, field_id: FieldId, @@ -78,7 +80,7 @@ where } } -struct FacetRevRange<'t, T: 't, KC> { +pub struct FacetRevRange<'t, T: 't, KC> { iter: RoRevRange<'t, KC, LazyDecode>, end: Bound, } @@ -88,7 +90,7 @@ where KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, T: PartialOrd + Copy + Bounded, { - fn new( + pub fn new( rtxn: &'t heed::RoTxn, db: Database, field_id: FieldId, diff --git a/src/search/mod.rs b/src/search/mod.rs index 45fd0d709..05999caed 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -20,7 +20,7 @@ use crate::mdfs::Mdfs; use crate::query_tokens::{query_tokens, QueryToken}; use crate::{Index, FieldId, DocumentId, Criterion}; -pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; // Building these factories is not free. @@ -313,22 +313,26 @@ impl<'a> Search<'a> { // there is some facet conditions we return a placeholder. let documents_ids = match order_by_facet { Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)? + self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? }, None => facet_candidates.iter().take(limit).collect(), }; - return Ok(SearchResult { documents_ids, ..Default::default() }) + return Ok(SearchResult { + documents_ids, + candidates: facet_candidates, + ..Default::default() + }) }, (None, None) => { // If the query is not set or results in no DFAs we return a placeholder. - let documents_ids = self.index.documents_ids(self.rtxn)?; + let all_docids = self.index.documents_ids(self.rtxn)?; let documents_ids = match order_by_facet { Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)? + self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? }, - None => documents_ids.iter().take(limit).collect(), + None => all_docids.iter().take(limit).collect(), }; - return Ok(SearchResult { documents_ids, ..Default::default() }) + return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) }, }; @@ -336,7 +340,7 @@ impl<'a> Search<'a> { // The mana depth first search is a revised DFS that explore // solutions in the order of their proximities. - let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates); + let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); let mut documents = Vec::new(); // We execute the Mdfs iterator until we find enough documents. @@ -364,7 +368,7 @@ impl<'a> Search<'a> { None => documents.into_iter().flatten().take(limit).collect(), }; - Ok(SearchResult { found_words, documents_ids }) + Ok(SearchResult { found_words, candidates, documents_ids }) } } @@ -383,6 +387,7 @@ impl fmt::Debug for Search<'_> { #[derive(Default)] pub struct SearchResult { pub found_words: HashSet, + pub candidates: RoaringBitmap, // TODO those documents ids should be associated with their criteria scores. pub documents_ids: Vec, }