use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::Token; use crate::{DocumentId, DocIndex, Attribute, WordArea}; use hashbrown::HashSet; use serde::Serialize; use serde::ser; pub struct IndexerSerializer<'a, B> { pub tokenizer_builder: &'a B, pub update: &'a mut DocumentUpdate, pub document_id: DocumentId, pub attribute: SchemaAttr, pub stop_words: &'a HashSet, } impl<'a, B> ser::Serializer for IndexerSerializer<'a, B> where B: TokenizerBuilder { type Ok = (); type Error = SerializerError; type SerializeSeq = ser::Impossible; type SerializeTuple = ser::Impossible; type SerializeTupleStruct = ser::Impossible; type SerializeTupleVariant = ser::Impossible; type SerializeMap = ser::Impossible; type SerializeStruct = ser::Impossible; type SerializeStructVariant = ser::Impossible; forward_to_unserializable_type! { bool => serialize_bool, char => serialize_char, i8 => serialize_i8, i16 => serialize_i16, i32 => serialize_i32, i64 => serialize_i64, u8 => serialize_u8, u16 => serialize_u16, u32 => serialize_u32, u64 => serialize_u64, f32 => serialize_f32, f64 => serialize_f64, } fn serialize_str(self, v: &str) -> Result { for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { // FIXME must u32::try_from instead let attribute = match Attribute::new(self.attribute.0, word_index as u32) { Ok(attribute) => attribute, Err(_) => return Ok(()), }; // FIXME must u16/u32::try_from instead let word_area = match WordArea::new(char_index as u32, word.len() as u16) { Ok(word_area) => word_area, Err(_) => return Ok(()), }; let doc_index = DocIndex { document_id: self.document_id, attribute, word_area }; // insert the exact representation let word_lower = word.to_lowercase(); if self.stop_words.contains(&word_lower) { continue } // and the unidecoded lowercased version let word_unidecoded = unidecode::unidecode(word).to_lowercase(); if word_lower != word_unidecoded { self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); } self.update.insert_doc_index(word_lower.into_bytes(), doc_index); } Ok(()) } fn serialize_bytes(self, _v: &[u8]) -> Result { Err(SerializerError::UnserializableType { name: "&[u8]" }) } fn serialize_none(self) -> Result { Err(SerializerError::UnserializableType { name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: Serialize, { Err(SerializerError::UnserializableType { name: "Option" }) } fn serialize_unit(self) -> Result { Err(SerializerError::UnserializableType { name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { Err(SerializerError::UnserializableType { name: "unit struct" }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, _variant: &'static str ) -> Result { Err(SerializerError::UnserializableType { name: "unit variant" }) } fn serialize_newtype_struct( self, _name: &'static str, value: &T ) -> Result where T: Serialize, { value.serialize(self) } fn serialize_newtype_variant( self, _name: &'static str, _variant_index: u32, _variant: &'static str, _value: &T ) -> Result where T: Serialize, { Err(SerializerError::UnserializableType { name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { Err(SerializerError::UnserializableType { name: "seq" }) } fn serialize_tuple(self, _len: usize) -> Result { Err(SerializerError::UnserializableType { name: "tuple" }) } fn serialize_tuple_struct( self, _name: &'static str, _len: usize ) -> Result { Err(SerializerError::UnserializableType { name: "tuple struct" }) } fn serialize_tuple_variant( self, _name: &'static str, _variant_index: u32, _variant: &'static str, _len: usize ) -> Result { Err(SerializerError::UnserializableType { name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { Err(SerializerError::UnserializableType { name: "map" }) } fn serialize_struct( self, _name: &'static str, _len: usize ) -> Result { Err(SerializerError::UnserializableType { name: "struct" }) } fn serialize_struct_variant( self, _name: &'static str, _variant_index: u32, _variant: &'static str, _len: usize ) -> Result { Err(SerializerError::UnserializableType { name: "struct variant" }) } }