diff --git a/Cargo.toml b/Cargo.toml
index 9bb83ccef..0903eab10 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,8 @@
 [workspace]
 members = [
     "meilidb-core",
+    "meilidb-schema",
+    "meilidb-tokenizer",
 ]
 
 [profile.release]
diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index b288a7414..7f56c0903 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -19,8 +19,8 @@ siphasher = "0.3.0"
 slice-group-by = "0.2.6"
 zerocopy = "0.2.8"
 
-meilidb-schema = { path = "../../MeiliDB/meilidb-schema", version = "0.1.0" }
-meilidb-tokenizer = { path = "../../MeiliDB/meilidb-tokenizer", version = "0.1.0" }
+meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 
 [dependencies.rmp-serde]
 git = "https://github.com/3Hren/msgpack-rust.git"
diff --git a/meilidb-schema/Cargo.toml b/meilidb-schema/Cargo.toml
new file mode 100644
index 000000000..88178bc1d
--- /dev/null
+++ b/meilidb-schema/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "meilidb-schema"
+version = "0.1.0"
+authors = ["Kerollmops "]
+edition = "2018"
+
+[dependencies]
+bincode = "1.1.2"
+indexmap = { version = "1.1.0", features = ["serde-1"] }
+serde = { version = "1.0.91", features = ["derive"] }
+serde_json = { version = "1.0.39", features = ["preserve_order"] }
+toml = { version = "0.5.0", features = ["preserve_order"] }
diff --git a/meilidb-schema/src/lib.rs b/meilidb-schema/src/lib.rs
new file mode 100644
index 000000000..5109b33e1
--- /dev/null
+++ b/meilidb-schema/src/lib.rs
@@ -0,0 +1,285 @@
+use std::collections::{HashMap, BTreeMap};
+use std::{fmt, u16};
+use std::ops::BitOr;
+use std::sync::Arc;
+
+use serde::{Serialize, Deserialize};
+use indexmap::IndexMap;
+
+pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
+pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
+pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct SchemaProps {
+    #[serde(default)]
+    pub displayed: bool,
+
+    #[serde(default)]
+    pub indexed: bool,
+
+    #[serde(default)]
+    pub ranked: bool,
+}
+
+impl SchemaProps {
+    pub fn is_displayed(self) -> bool {
+        self.displayed
+    }
+
+    pub fn is_indexed(self) -> bool {
+        self.indexed
+    }
+
+    pub fn is_ranked(self) -> bool {
+        self.ranked
+    }
+}
+
+impl BitOr for SchemaProps {
+    type Output = Self;
+
+    fn bitor(self, other: Self) -> Self::Output {
+        SchemaProps {
+            displayed: self.displayed | other.displayed,
+            indexed: self.indexed | other.indexed,
+            ranked: self.ranked | other.ranked,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct SchemaBuilder {
+    identifier: String,
+    attributes: IndexMap<String, SchemaProps>,
+}
+
+impl SchemaBuilder {
+    pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
+        SchemaBuilder {
+            identifier: name.into(),
+            attributes: IndexMap::new(),
+        }
+    }
+
+    pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
+        let len = self.attributes.len();
+        if self.attributes.insert(name.into(), props).is_some() {
+            panic!("Field already inserted.")
+        }
+        SchemaAttr(len as u16)
+    }
+
+    pub fn build(self) -> Schema {
+        let mut attrs = HashMap::new();
+        let mut props = Vec::new();
+
+        for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
+            attrs.insert(name.clone(), SchemaAttr(i as u16));
+            props.push((name, prop));
+        }
+
+        let identifier = self.identifier;
+        Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Schema {
+    inner: Arc<InnerSchema>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct InnerSchema {
+    identifier: String,
+    attrs: HashMap<String, SchemaAttr>,
+    props: Vec<(String, SchemaProps)>,
+}
+
+impl Schema {
+    fn to_builder(&self) -> SchemaBuilder {
+        let identifier = self.inner.identifier.clone();
+        let attributes = self.attributes_ordered();
+        SchemaBuilder { identifier, attributes }
+    }
+
+    fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
+        let mut ordered = BTreeMap::new();
+        for (name, attr) in &self.inner.attrs {
+            let (_, props) = self.inner.props[attr.0 as usize];
+            ordered.insert(attr.0, (name, props));
+        }
+
+        let mut attributes = IndexMap::with_capacity(ordered.len());
+        for (_, (name, props)) in ordered {
+            attributes.insert(name.clone(), props);
+        }
+
+        attributes
+    }
+
+    pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
+        let (_, props) = self.inner.props[attr.0 as usize];
+        props
+    }
+
+    pub fn identifier_name(&self) -> &str {
+        &self.inner.identifier
+    }
+
+    pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
+        self.inner.attrs.get(name.as_ref()).cloned()
+    }
+
+    pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
+        let (name, _) = &self.inner.props[attr.0 as usize];
+        name
+    }
+
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
+        self.inner.props.iter()
+            .map(move |(name, prop)| {
+                let attr = self.inner.attrs.get(name).unwrap();
+                (name.as_str(), *attr, *prop)
+            })
+    }
+}
+
+impl Serialize for Schema {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where S: serde::ser::Serializer,
+    {
+        self.to_builder().serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for Schema {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where D: serde::de::Deserializer<'de>,
+    {
+        let builder = SchemaBuilder::deserialize(deserializer)?;
+        Ok(builder.build())
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct SchemaAttr(pub u16);
+
+impl SchemaAttr {
+    pub const fn new(value: u16) -> SchemaAttr {
+        SchemaAttr(value)
+    }
+
+    pub const fn min() -> SchemaAttr {
+        SchemaAttr(u16::min_value())
+    }
+
+    pub const fn max() -> SchemaAttr {
+        SchemaAttr(u16::max_value())
+    }
+
+    pub fn next(self) -> Option<SchemaAttr> {
+        self.0.checked_add(1).map(SchemaAttr)
+    }
+
+    pub fn prev(self) -> Option<SchemaAttr> {
+        self.0.checked_sub(1).map(SchemaAttr)
+    }
+}
+
+impl fmt::Display for SchemaAttr {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::error::Error;
+
+    #[test]
+    fn serialize_deserialize() -> bincode::Result<()> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let mut buffer = Vec::new();
+        bincode::serialize_into(&mut buffer, &schema)?;
+        let schema2 = bincode::deserialize_from(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let buffer = toml::to_vec(&schema)?;
+        let schema2 = toml::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        let data = r#"
+            identifier = "id"
+
+            [attributes."alpha"]
+            displayed = true
+
+            [attributes."beta"]
+            displayed = true
+            indexed = true
+
+            [attributes."gamma"]
+            indexed = true
+        "#;
+        let schema2 = toml::from_str(data)?;
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialize_deserialize_json() -> Result<(), Box<Error>> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let buffer = serde_json::to_vec(&schema)?;
+        let schema2 = serde_json::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        let data = r#"
+            {
+                "identifier": "id",
+                "attributes": {
+                    "alpha": {
+                        "displayed": true
+                    },
+                    "beta": {
+                        "displayed": true,
+                        "indexed": true
+                    },
+                    "gamma": {
+                        "indexed": true
+                    }
+                }
+            }"#;
+        let schema2 = serde_json::from_str(data)?;
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+}
diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml
new file mode 100644
index 000000000..32c9429b7
--- /dev/null
+++ b/meilidb-tokenizer/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "meilidb-tokenizer"
+version = "0.1.0"
+authors = ["Kerollmops "]
+edition = "2018"
+
+[dependencies]
+slice-group-by = "0.2.4"
diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
new file mode 100644
index 000000000..3cea72ffc
--- /dev/null
+++ b/meilidb-tokenizer/src/lib.rs
@@ -0,0 +1,295 @@
+use std::iter::Peekable;
+use slice_group_by::StrGroupBy;
+use self::SeparatorCategory::*;
+
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
+    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
+    (c >= '\u{3040}' && c <= '\u{309f}') ||
+    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
+    (c >= '\u{3100}' && c <= '\u{312f}') ||
+    (c >= '\u{3200}' && c <= '\u{32ff}') ||
+    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
+    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
+    (c >= '\u{f900}' && c <= '\u{faff}')
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum SeparatorCategory {
+    Soft,
+    Hard,
+}
+
+impl SeparatorCategory {
+    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
+        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+    }
+
+    fn to_usize(self) -> usize {
+        match self {
+            Soft => 1,
+            Hard => 8,
+        }
+    }
+}
+
+fn is_separator(c: char) -> bool {
+    classify_separator(c).is_some()
+}
+
+fn classify_separator(c: char) -> Option<SeparatorCategory> {
+    match c {
+        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
+        _ => None,
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum CharCategory {
+    Separator(SeparatorCategory),
+    Cjk,
+    Other,
+}
+
+fn classify_char(c: char) -> CharCategory {
+    if let Some(category) = classify_separator(c) {
+        CharCategory::Separator(category)
+    } else if is_cjk(c) {
+        CharCategory::Cjk
+    } else {
+        CharCategory::Other
+    }
+}
+
+fn is_str_word(s: &str) -> bool {
+    !s.chars().any(is_separator)
+}
+
+fn same_group_category(a: char, b: char) -> bool {
+    match (classify_char(a), classify_char(b)) {
+        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
+        (CharCategory::Separator(_), CharCategory::Separator(_)) => true,
+        (a, b) => a == b,
+    }
+}
+
+// fold the number of chars along with the index position
+fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
+    (n + 1, i + c.len_utf8())
+}
+
+pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+    Tokenizer::new(query).map(|t| t.word)
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct Token<'a> {
+    pub word: &'a str,
+    pub word_index: usize,
+    pub char_index: usize,
+}
+
+pub struct Tokenizer<'a> {
+    inner: &'a str,
+    word_index: usize,
+    char_index: usize,
+}
+
+impl<'a> Tokenizer<'a> {
+    pub fn new(string: &str) -> Tokenizer {
+        // skip every separator and set `char_index`
+        // to the number of char trimmed
+        let (count, index) = string.char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);
+
+        Tokenizer {
+            inner: &string[index..],
+            word_index: 0,
+            char_index: count,
+        }
+    }
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut iter = self.inner.linear_group_by(same_group_category).peekable();
+
+        while let (Some(string), next_string) = (iter.next(), iter.peek()) {
+            let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
+
+            if !is_str_word(string) {
+                self.word_index += string.chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
+                self.char_index += count;
+                self.inner = &self.inner[index..];
+                continue;
+            }
+
+            let token = Token {
+                word: string,
+                word_index: self.word_index,
+                char_index: self.char_index,
+            };
+
+            if next_string.filter(|s| is_str_word(s)).is_some() {
+                self.word_index += 1;
+            }
+
+            self.char_index += count;
+            self.inner = &self.inner[index..];
+
+            return Some(token);
+        }
+
+        self.inner = "";
+        None
+    }
+}
+
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn easy() {
+        let mut tokenizer = Tokenizer::new("salut");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard() {
+        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard_long_chars() {
+        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
😣 ,"); + + assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); + assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 })); + assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 })); + assert_eq!(tokenizer.next(), None); + } + + #[test] + fn hard_kanjis() { + let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}"); + + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 })); + assert_eq!(tokenizer.next(), None); + + let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}"); + + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 })); + assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 })); + assert_eq!(tokenizer.next(), None); + } +}