Add tests checking that the detected script and language associated with each document are stored during indexing

This commit is contained in:
f3r10 2022-10-14 14:05:53 -05:00 committed by ManyTheFish
parent b216ddba63
commit a27f329e3a
2 changed files with 42 additions and 0 deletions

View File

@ -4,6 +4,7 @@ use std::fs::File;
use std::mem::size_of;
use std::path::Path;
use charabia::{Language, Script};
use heed::flags::Flags;
use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@ -1194,6 +1195,12 @@ impl Index {
/// Removes the entry stored under `main_key::PAGINATION_MAX_TOTAL_HITS` from the `main` database.
///
/// The `heed::Result<bool>` produced by the underlying `delete` call is returned unchanged
/// (presumably `true` when an entry was actually removed — confirm against heed's documentation).
pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
    let key = main_key::PAGINATION_MAX_TOTAL_HITS;
    self.main.delete::<_, Str>(txn, key)
}
/* script language docids */
/// Fetches the bitmap of document ids stored under the given `(Script, Language)` key.
///
/// Yields `None` when the lookup finds no entry for that key.
pub fn script_language_documents_ids(
    &self,
    rtxn: &RoTxn,
    key: &(Script, Language),
) -> heed::Result<Option<RoaringBitmap>> {
    let database = &self.script_language_docids;
    database.get(rtxn, key)
}
}
#[cfg(test)]

View File

@ -1907,4 +1907,39 @@ mod tests {
index.add_documents(doc1).unwrap();
}
#[cfg(feature = "default")]
#[test]
fn store_detected_script_and_language_per_document_during_indexing() {
    use charabia::{Language, Script};

    // Builds a bitmap holding exactly the given document ids, pushed in the order given.
    let bitmap_of = |ids: &[u32]| {
        let mut bitmap = RoaringBitmap::new();
        for &id in ids {
            bitmap.push(id);
        }
        bitmap
    };

    let index = TempIndex::new();

    // One document per script/language of interest, plus a mixed Latin/CJK one (id 6).
    index
        .add_documents(documents!([
            { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
            { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
            { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
            { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
            { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
            { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
        ]))
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    // Looks up the stored bitmap for a (script, language) pair; panics if the lookup
    // fails or if nothing was stored for that pair.
    let docids_for = |key: (Script, Language)| {
        index.script_language_documents_ids(&rtxn, &key).unwrap().unwrap()
    };

    // All three lookups happen before any assertion, so a missing entry is reported first.
    let thai_docs = docids_for((Script::Thai, Language::Other));
    let cj_jpn_docs = docids_for((Script::Cj, Language::Jpn));
    let cj_cmn_docs = docids_for((Script::Cj, Language::Cmn));

    // NOTE(review): the expected values below are each one less than the external `id` of the
    // matching document, which suggests they are internal ids assigned from 0 in insertion
    // order — confirm. The Latin-only and Hebrew documents (ids 1 and 3) are not checked here.
    assert_eq!(thai_docs, bitmap_of(&[4]));
    assert_eq!(cj_jpn_docs, bitmap_of(&[3]));
    assert_eq!(cj_cmn_docs, bitmap_of(&[1, 5]));
}
}