Introduce a struct to compute facet values

This commit is contained in:
Kerollmops 2020-12-28 19:08:53 +01:00
parent 30dae0205e
commit 3b64735058
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
7 changed files with 156 additions and 20 deletions

View File

@ -15,18 +15,18 @@ $('#query, #facet').on('input', function () {
type: "POST",
url: "query",
contentType: 'application/json',
data: JSON.stringify({ 'query': query, 'facetCondition': facet }),
data: JSON.stringify({ 'query': query, 'facetCondition': facet, "facetDistribution": true }),
contentType: 'application/json',
success: function (data, textStatus, request) {
results.innerHTML = '';
let timeSpent = request.getResponseHeader('Time-Ms');
let numberOfDocuments = data.length;
let numberOfDocuments = data.documents.length;
count.innerHTML = `${numberOfDocuments}`;
time.innerHTML = `${timeSpent}ms`;
time.classList.remove('fade-in-out');
for (element of data) {
for (element of data.documents) {
const elem = document.createElement('li');
elem.classList.add("document");

View File

@ -626,6 +626,14 @@ async fn main() -> anyhow::Result<()> {
/// Body of a search request as received by the query route.
struct QueryBody {
/// Optional textual query.
query: Option<String>,
/// Optional facet condition, kept as a raw string here
/// (how it is parsed is not shown in this excerpt).
facet_condition: Option<String>,
/// When `Some(true)`, the handler also computes the facet values
/// for the candidates of the search; any other value skips that step.
facet_distribution: Option<bool>,
}
/// Payload serialized to JSON and returned as the body of the query response.
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
struct Answer {
/// The documents returned by the search, each one as a JSON object.
documents: Vec<Map<String, Value>>,
/// Facet values keyed by facet name; left empty when the facet
/// distribution was not requested.
facets: HashMap<String, Vec<Value>>,
}
let disable_highlighting = opt.disable_highlighting;
@ -649,7 +657,13 @@ async fn main() -> anyhow::Result<()> {
}
}
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap();
let facets = if query.facet_distribution == Some(true) {
Some(index.facets(&rtxn).candidates(candidates).execute().unwrap())
} else {
None
};
let mut documents = Vec::new();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
@ -674,10 +688,15 @@ async fn main() -> anyhow::Result<()> {
documents.push(object);
}
let answer = Answer {
documents,
facets: facets.unwrap_or_default(),
};
Response::builder()
.header("Content-Type", "application/json")
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
.body(serde_json::to_string(&documents).unwrap())
.body(serde_json::to_string(&answer).unwrap())
});
let index_cloned = index.clone();

View File

@ -9,7 +9,7 @@ use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
use crate::{default_criteria, Criterion, Search};
use crate::{default_criteria, Criterion, Search, FacetDistribution};
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
use crate::{
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
@ -351,6 +351,10 @@ impl Index {
Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?)
}
/// Returns a [`FacetDistribution`] builder bound to this index and to the
/// given read transaction.
pub fn facets<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
FacetDistribution::new(rtxn, self)
}
/// Returns a [`Search`] builder bound to this index and to the given
/// read transaction.
pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> {
Search::new(rtxn, self)
}

View File

@ -28,7 +28,7 @@ pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::index::Index;
pub use self::search::{Search, FacetCondition, SearchResult};
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
pub use self::update_store::UpdateStore;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -0,0 +1,106 @@
use std::collections::{HashSet, HashMap};
use std::fmt;
use std::ops::Bound::Unbounded;
use roaring::RoaringBitmap;
use serde_json::Value;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::search::facet::FacetRange;
use crate::{Index, FieldId};
/// Builder used to list the values of the faceted fields of an index,
/// optionally restricted to some facet names and to a set of candidate
/// documents.
pub struct FacetDistribution<'a> {
/// Names of the facets to report; `None` means every faceted field.
facets: Option<HashSet<String>>,
/// Documents a facet value must be associated with to be reported;
/// `None` means every stored value is reported.
candidates: Option<RoaringBitmap>,
/// Read transaction used for every database access.
rtxn: &'a heed::RoTxn<'a>,
/// Index whose facet databases are read.
index: &'a Index,
}
impl<'a> FacetDistribution<'a> {
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> {
FacetDistribution { facets: None, candidates: None, rtxn, index }
}
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
self.candidates = Some(candidates);
self
}
pub fn facets<I: IntoIterator<Item=A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self {
self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect());
self
}
fn facet_values(&self, field_id: FieldId, field_type: FacetType) -> heed::Result<Vec<Value>> {
let db = self.index.facet_field_id_value_docids;
let iter = match field_type {
FacetType::String => {
let iter = db
.prefix_iter(&self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>()
.map(|r| r.map(|((_, v), docids)| (Value::from(v), docids)));
Box::new(iter) as Box::<dyn Iterator<Item=_>>
},
FacetType::Integer => {
let db = db.remap_key_type::<FacetLevelValueI64Codec>();
let range = FacetRange::<i64, _>::new(
self.rtxn, db, field_id, 0, Unbounded, Unbounded,
)?;
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids))))
},
FacetType::Float => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
let range = FacetRange::<f64, _>::new(
self.rtxn, db, field_id, 0, Unbounded, Unbounded,
)?;
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids))))
},
};
let mut facet_values = Vec::new();
for result in iter {
let (value, docids) = result?;
match &self.candidates {
Some(candidates) => if !docids.is_disjoint(candidates) {
facet_values.push(value);
},
None => facet_values.push(value),
}
}
Ok(facet_values)
}
pub fn execute(&self) -> heed::Result<HashMap<String, Vec<Value>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
let fields_ids: Vec<_> = match &self.facets {
Some(names) => {
names.iter().filter_map(|n| {
let id = fields_ids_map.id(n)?;
faceted_fields.get(&id).cloned().map(|t| (id, t))
}).collect()
},
None => faceted_fields.iter().map(|(id, t)| (*id, *t)).collect(),
};
let mut facets_values = HashMap::new();
for (fid, ftype) in fields_ids {
let facet_name = fields_ids_map.name(fid).unwrap();
let values = self.facet_values(fid, ftype)?;
facets_values.insert(facet_name.to_string(), values);
}
Ok(facets_values)
}
}
impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self;
f.debug_struct("FacetDistribution")
.field("facets", facets)
.field("candidates", candidates)
.finish()
}
}

View File

@ -13,11 +13,13 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{Index, FieldId};
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator};
pub use self::facet_distribution::FacetDistribution;
mod facet_condition;
mod facet_distribution;
mod parser;
struct FacetRange<'t, T: 't, KC> {
pub struct FacetRange<'t, T: 't, KC> {
iter: RoRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
end: Bound<T>,
}
@ -27,7 +29,7 @@ where
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
T: PartialOrd + Copy + Bounded,
{
fn new(
pub fn new(
rtxn: &'t heed::RoTxn,
db: Database<KC, CboRoaringBitmapCodec>,
field_id: FieldId,
@ -78,7 +80,7 @@ where
}
}
struct FacetRevRange<'t, T: 't, KC> {
pub struct FacetRevRange<'t, T: 't, KC> {
iter: RoRevRange<'t, KC, LazyDecode<CboRoaringBitmapCodec>>,
end: Bound<T>,
}
@ -88,7 +90,7 @@ where
KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>,
T: PartialOrd + Copy + Bounded,
{
fn new(
pub fn new(
rtxn: &'t heed::RoTxn,
db: Database<KC, CboRoaringBitmapCodec>,
field_id: FieldId,

View File

@ -20,7 +20,7 @@ use crate::mdfs::Mdfs;
use crate::query_tokens::{query_tokens, QueryToken};
use crate::{Index, FieldId, DocumentId, Criterion};
pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator};
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::{FacetIter};
// Building these factories is not free.
@ -313,22 +313,26 @@ impl<'a> Search<'a> {
// there is some facet conditions we return a placeholder.
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)?
self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
},
None => facet_candidates.iter().take(limit).collect(),
};
return Ok(SearchResult { documents_ids, ..Default::default() })
return Ok(SearchResult {
documents_ids,
candidates: facet_candidates,
..Default::default()
})
},
(None, None) => {
// If the query is not set or results in no DFAs we return a placeholder.
let documents_ids = self.index.documents_ids(self.rtxn)?;
let all_docids = self.index.documents_ids(self.rtxn)?;
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)?
self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
},
None => documents_ids.iter().take(limit).collect(),
None => all_docids.iter().take(limit).collect(),
};
return Ok(SearchResult { documents_ids, ..Default::default() })
return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() })
},
};
@ -336,7 +340,7 @@ impl<'a> Search<'a> {
// The mana depth-first search is a revised DFS that explores
// solutions in the order of their proximities.
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates);
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
let mut documents = Vec::new();
// We execute the Mdfs iterator until we find enough documents.
@ -364,7 +368,7 @@ impl<'a> Search<'a> {
None => documents.into_iter().flatten().take(limit).collect(),
};
Ok(SearchResult { found_words, documents_ids })
Ok(SearchResult { found_words, candidates, documents_ids })
}
}
@ -383,6 +387,7 @@ impl fmt::Debug for Search<'_> {
/// Outcome of a search execution.
#[derive(Default)]
pub struct SearchResult {
/// Words reported by the search; left empty (default) by the placeholder
/// code paths. NOTE(review): presumably the query words actually matched
/// -- confirm against the search implementation.
pub found_words: HashSet<String>,
/// The documents the returned ids were selected from, before the
/// limit is applied.
pub candidates: RoaringBitmap,
// TODO those documents ids should be associated with their criteria scores.
/// Ids of the documents returned by the search.
pub documents_ids: Vec<DocumentId>,
}