MeiliSearch/src/lib.rs

#![cfg_attr(feature = "nightly", feature(test))]

pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;

use serde_derive::{Serialize, Deserialize};

pub use rocksdb;

pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
///
/// This is a newtype wrapping a `u64`; the inner value is not a public field.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);

/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
///
/// The derived ordering compares fields in declaration order, and
/// `#[repr(C)]` lays the fields out in memory in that same order, so the
/// fields must not be reordered casually.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The document identifier where the word was found.
    pub document_id: DocumentId,

    /// The attribute in the document where the word was found.
    pub attribute: u16,
    /// The index of the word inside that attribute.
    pub word_index: u32,

    /// The position in bytes where the word was found.
    ///
    /// Together with `char_length`, it describes the area of the original
    /// word in the indexed text without needing to run the tokenizer again.
    // NOTE(review): the docs say "bytes" while the fields are named `char_*`;
    // confirm the actual unit against the tokenizer.
    pub char_index: u32,
    /// The length of the word found at `char_index`.
    pub char_length: u16,
}

/// A matching word, together with information on where that word
/// is located in the document.
///
/// The declaration order of the fields is significant: the derived
/// `PartialOrd`/`Ord` implementations compare fields from top to bottom,
/// which defines how two `Match` values are ordered between themselves.
///
/// The matched word itself is not stored here.
// TODO: try data-oriented programming here? (e.g. arrays of fields)
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// Index of the word in the query sentence.
    /// Plays the same role as `word_index`, but for the query words.
    ///
    /// Used to retrieve the automaton that matches this word.
    pub query_index: u32,

    /// Distance between the matched word and the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute of the document in which the word was found.
    pub attribute: u16,
    /// The index of the word inside that attribute.
    pub word_index: u32,

    /// `true` when the word is an exact match, `false` when it is a prefix.
    pub is_exact: bool,

    /// The position in bytes where the word was found.
    ///
    /// Together with `char_length`, it describes the area of the original
    /// word in the indexed text without needing to run the tokenizer again.
    pub char_index: u32,
    /// The length of the word found at `char_index`.
    pub char_length: u16,
}

impl Match {
    /// Builds the smallest `Match` with respect to the derived ordering:
    /// every numeric field holds its minimum value and `is_exact` is `false`.
    pub fn zero() -> Self {
        Self {
            query_index: u32::min_value(),
            distance: u8::min_value(),
            attribute: u16::min_value(),
            word_index: u32::min_value(),
            is_exact: false,
            char_index: u32::min_value(),
            char_length: u16::min_value(),
        }
    }

    /// Builds the greatest `Match` with respect to the derived ordering:
    /// every numeric field holds its maximum value and `is_exact` is `true`.
    pub fn max() -> Self {
        let widest_u32 = u32::max_value();
        let widest_u16 = u16::max_value();

        Self {
            query_index: widest_u32,
            distance: u8::max_value(),
            attribute: widest_u16,
            word_index: widest_u32,
            is_exact: true,
            char_index: widest_u32,
            char_length: widest_u16,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// Pins the in-memory size of `DocIndex` so that an accidental change
    /// to its fields is caught by the test suite.
    #[test]
    fn docindex_mem_size() {
        let actual_bytes = mem::size_of::<DocIndex>();
        assert_eq!(actual_bytes, 24);
    }
}
test: Add benchmarks to mesure the database 2018-12-30 13:04:02 +01:00			`#![cfg_attr(feature = "nightly", feature(test))]`

feat: Introduce the QueryBuilder struct 2018-11-27 19:11:33 +01:00			`pub mod automaton;`
feat: Introduce the Database and DatabaseView 2018-12-02 16:45:17 +01:00			`pub mod database;`
feat: Working on ops for Positive and Negative blobs 2018-11-08 12:05:59 +01:00			`pub mod data;`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`pub mod rank;`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`pub mod tokenizer;`
chore: Make the repo use examples and keep the library 2018-10-09 18:23:35 +02:00			`mod common_words;`
feat: Introduce the WriteToBytes trait 2019-02-17 16:32:43 +01:00			`mod shared_data_cursor;`
			`mod write_to_bytes;`
feat: Improve the indexing time a little bit ...by a factor of 17.6x. 2018-07-10 21:29:17 +02:00
feat: Add a new ranked attribute to the schema 2019-02-08 15:17:42 +01:00			`use serde_derive::{Serialize, Deserialize};`

feat: Reexport the internal rocksdb 2018-12-13 11:52:34 +01:00			`pub use rocksdb;`

feat: Use the new Tokenizer in the csv-indexer 2018-09-27 16:59:41 +02:00			`pub use self::tokenizer::Tokenizer;`
chore: Make the repo use examples and keep the library 2018-10-09 18:23:35 +02:00			`pub use self::common_words::CommonWords;`
feat: Make the parsing more generic over json 2018-05-13 15:12:15 +02:00
feat: Create a strong DocumentId type Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types. 2018-12-22 12:00:24 +01:00			`/// Represent an internally generated document unique identifier.`
			`///`
			`/// It is used to inform the database the document you want to deserialize.`
			`/// Helpful for custom ranking.`
feat: Add a new ranked attribute to the schema 2019-02-08 15:17:42 +01:00			`#[derive(Serialize, Deserialize)]`
feat: Create a strong DocumentId type Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types. 2018-12-22 12:00:24 +01:00			`#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]`
feat: Make the schema consider document ids 2018-12-25 12:26:38 +01:00			`pub struct DocumentId(u64);`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`/// This structure represent the position of a word`
			`/// in a document and its attributes.`
			`///`
			`/// This is stored in the map, generated at index time,`
			`/// extracted and interpreted at search time.`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
feat: Improve the indexing time a little bit ...by a factor of 17.6x. 2018-07-10 21:29:17 +02:00			`#[repr(C)]`
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`pub struct DocIndex {`
			`/// The document identifier where the word was found.`
feat: Introduce a way to distinct documents 2018-10-17 13:35:34 +02:00			`pub document_id: DocumentId,`
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`/// The attribute in the document where the word was found`
			`/// along with the index in it.`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`pub attribute: u16,`
			`pub word_index: u32,`
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`/// The position in bytes where the word was found`
			`/// along with the length of it.`
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`///`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`/// It informs on the original word area in the text indexed`
			`/// without needing to run the tokenizer again.`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`pub char_index: u32,`
			`pub char_length: u16,`
feat: Make the parsing more generic over json 2018-05-13 15:12:15 +02:00			`}`

feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`/// This structure represent a matching word with informations`
			`/// on the location of the word in the document.`
			`///`
			`/// The order of the field is important because it defines`
			`/// the way these structures are ordered between themselves.`
			`///`
			`/// The word in itself is not important.`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`// TODO do data oriented programming ? very arrays ?`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`pub struct Match {`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`/// The word index in the query sentence.`
			``/// Same as the `attribute_index` but for the query words.``
			`///`
			`/// Used to retrieve the automaton that match this word.`
			`pub query_index: u32,`

feat: Define a `DocIndex` struct 2018-05-27 11:15:05 +02:00			`/// The distance the word has with the query word`
			`/// (i.e. the Levenshtein distance).`
			`pub distance: u8,`

feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`/// The attribute in the document where the word was found`
			`/// along with the index in it.`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`pub attribute: u16,`
			`pub word_index: u32,`
feat: Implement the excat match ranking rule 2018-07-06 20:58:06 +02:00
			`/// Whether the word that match is an exact match or a prefix.`
			`pub is_exact: bool,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00
			`/// The position in bytes where the word was found`
			`/// along with the length of it.`
			`///`
			`/// It informs on the original word area in the text indexed`
			`/// without needing to run the tokenizer again.`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`pub char_index: u32,`
			`pub char_length: u16,`
feat: Make the parsing more generic over json 2018-05-13 15:12:15 +02:00			`}`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
			`impl Match {`
			`pub fn zero() -> Self {`
			`Match {`
			`query_index: 0,`
			`distance: 0,`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`attribute: 0,`
			`word_index: 0,`
feat: Implement the excat match ranking rule 2018-07-06 20:58:06 +02:00			`is_exact: false,`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`char_index: 0,`
			`char_length: 0,`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`
			`}`

			`pub fn max() -> Self {`
			`Match {`
			`query_index: u32::max_value(),`
			`distance: u8::max_value(),`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`attribute: u16::max_value(),`
			`word_index: u32::max_value(),`
feat: Implement the excat match ranking rule 2018-07-06 20:58:06 +02:00			`is_exact: true,`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`char_index: u32::max_value(),`
			`char_length: u16::max_value(),`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
			`use std::mem;`

			`#[test]`
			`fn docindex_mem_size() {`
feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:17:50 +01:00			`assert_eq!(mem::size_of::<DocIndex>(), 24);`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`}`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`