#![cfg_attr(feature = "nightly", feature(test))] pub mod automaton; pub mod database; pub mod data; pub mod rank; pub mod tokenizer; mod common_words; use std::fmt; pub use rocksdb; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; /// Represent an internally generated document unique identifier. /// /// It is used to inform the database the document you want to deserialize. /// Helpful for custom ranking. #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct DocumentId(u64); /// Represent an attribute number along with the word index /// according to the tokenizer used. /// /// It can accept up to 1024 attributes and word positions /// can be maximum 2^22. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Attribute(u32); impl Attribute { /// Construct an `Attribute` from an attribute number and /// the word position of a match according to the tokenizer used. fn new(attribute: u16, index: u32) -> Result { if attribute & 0b1111_1100_0000_0000 != 0 { return Err(AttributeError::AttributeTooBig) } if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { return Err(AttributeError::IndexTooBig) } let attribute = u32::from(attribute) << 22; Ok(Attribute(attribute | index)) } /// Construct an `Attribute` from an attribute number and /// the word position of a match according to the tokenizer used. /// /// # Panics /// /// The attribute must not be greater than 1024 /// and the word index not greater than 2^22. fn new_faillible(attribute: u16, index: u32) -> Attribute { match Attribute::new(attribute, index) { Ok(attribute) => attribute, Err(AttributeError::AttributeTooBig) => { panic!("attribute must not be greater than 1024") }, Err(AttributeError::IndexTooBig) => { panic!("attribute word index must not be greater than 2^22") }, } } #[inline] pub fn attribute(self) -> u16 { (self.0 >> 22) as u16 } #[inline] pub fn word_index(self) -> u32 { self.0 & 0b0000_0000_0011_1111_1111_1111_1111 } } impl fmt::Debug for Attribute { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Attribute") .field("attribute", &self.attribute()) .field("word_index", &self.word_index()) .finish() } } enum AttributeError { AttributeTooBig, IndexTooBig, } /// Represent a word position in bytes along with the length of it. /// /// It can represent words byte index to maximum 2^22 and /// up to words of length 1024. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct WordArea(u32); impl WordArea { /// Construct a `WordArea` from a word position in bytes /// and the length of it. /// /// # Panics /// /// The byte index must not be greater than 2^22 /// and the length not greater than 1024. fn new(byte_index: u32, length: u16) -> Result { assert!(byte_index & 0b1111_1111_1100_0000_0000_0000_0000 == 0); assert!(length & 0b1111_1100_0000_0000 == 0); if byte_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { return Err(WordAreaError::ByteIndexTooBig) } if length & 0b1111_1100_0000_0000 != 0 { return Err(WordAreaError::LengthTooBig) } let byte_index = byte_index << 10; Ok(WordArea(byte_index | u32::from(length))) } fn new_faillible(byte_index: u32, length: u16) -> WordArea { match WordArea::new(byte_index, length) { Ok(word_area) => word_area, Err(WordAreaError::ByteIndexTooBig) => { panic!("word area byte index must not be greater than 2^22") }, Err(WordAreaError::LengthTooBig) => { panic!("word area length must not be greater than 1024") }, } } #[inline] pub fn byte_index(self) -> u32 { self.0 >> 10 } #[inline] pub fn length(self) -> u16 { (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 } } impl fmt::Debug for WordArea { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("WordArea") .field("byte_index", &self.byte_index()) .field("length", &self.length()) .finish() } } enum WordAreaError { ByteIndexTooBig, LengthTooBig, } /// This structure represent the position of a word /// in a document and its attributes. /// /// This is stored in the map, generated at index time, /// extracted and interpreted at search time. #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(C)] pub struct DocIndex { /// The document identifier where the word was found. pub document_id: DocumentId, /// The attribute in the document where the word was found /// along with the index in it. pub attribute: Attribute, /// The position in bytes where the word was found /// along with the length of it. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. pub word_area: WordArea, } /// This structure represent a matching word with informations /// on the location of the word in the document. /// /// The order of the field is important because it defines /// the way these structures are ordered between themselves. /// /// The word in itself is not important. // TODO do data oriented programming ? very arrays ? #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Match { /// The word index in the query sentence. /// Same as the `attribute_index` but for the query words. /// /// Used to retrieve the automaton that match this word. pub query_index: u32, /// The distance the word has with the query word /// (i.e. the Levenshtein distance). pub distance: u8, /// The attribute in the document where the word was found /// along with the index in it. pub attribute: Attribute, /// Whether the word that match is an exact match or a prefix. pub is_exact: bool, /// The position in bytes where the word was found /// along with the length of it. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. pub word_area: WordArea, } impl Match { pub fn zero() -> Self { Match { query_index: 0, distance: 0, attribute: Attribute::new_faillible(0, 0), is_exact: false, word_area: WordArea::new_faillible(0, 0), } } pub fn max() -> Self { Match { query_index: u32::max_value(), distance: u8::max_value(), attribute: Attribute(u32::max_value()), is_exact: true, word_area: WordArea(u32::max_value()), } } } #[cfg(test)] mod tests { use super::*; use quickcheck::{quickcheck, TestResult}; use std::mem; #[test] fn docindex_mem_size() { assert_eq!(mem::size_of::(), 16); } quickcheck! { fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { return TestResult::discard() } let attribute = Attribute::new_faillible(gen_attr, gen_index); let valid_attribute = attribute.attribute() == gen_attr; let valid_index = attribute.word_index() == gen_index; TestResult::from_bool(valid_attribute && valid_index) } fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { return TestResult::discard() } let a = Attribute::new_faillible(gen_attr, gen_index); let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); TestResult::from_bool(a < b) } fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult { if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { return TestResult::discard() } let word_area = WordArea::new_faillible(gen_byte_index, gen_length); let valid_char_index = word_area.byte_index() == gen_byte_index; let valid_length = word_area.length() == gen_length; TestResult::from_bool(valid_char_index && valid_length) } fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult { if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { return TestResult::discard() } let a = WordArea::new_faillible(gen_byte_index, gen_length); let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1); TestResult::from_bool(a < b) } } }