feat: Introduce a WordArea struct

Useful to highlight matching areas in the original text.
This commit is contained in:
Clément Renault 2018-12-23 16:46:49 +01:00
parent 62521262e8
commit b32c96cdc9
No known key found for this signature in database
GPG key ID: 0151CDAB43460DAE
14 changed files with 373 additions and 136 deletions

View file

@ -203,14 +203,15 @@ mod tests {
use super::*;
use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId;
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 };
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = PositiveBlobBuilder::memory();
@ -231,9 +232,9 @@ mod tests {
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 };
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = PositiveBlobBuilder::memory();

View file

@ -73,7 +73,7 @@ impl DocumentKeyAttr {
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
wtr.write_u16::<NativeEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
@ -95,7 +95,7 @@ impl DocumentKeyAttr {
/// Decodes the schema attribute stored in this key's byte buffer.
///
/// The value is read, in native endianness, from the bytes starting at
/// offset `4 + size_of::<u64>() + 1` and wrapped with `SchemaAttr::new`.
///
/// # Panics
///
/// Panics through `unwrap` if the read from the buffer fails.
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
// NOTE(review): this listing is a diff hunk without +/- markers. The
// `read_u32` line looks like the removed side and the `read_u16` line the
// added side — confirm against the real file before relying on either.
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
let value = (&self.0[offset..]).read_u16::<NativeEndian>().unwrap();
SchemaAttr::new(value)
}
@ -114,7 +114,7 @@ impl fmt::Debug for DocumentKeyAttr {
/// Formats the key as a `DocumentKeyAttr` debug struct exposing the decoded
/// `document_id` and the raw numeric value of the `attribute`.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
// NOTE(review): the two `attribute` field lines below appear to be the
// removed (`as_u32()`) and added (`.0`) sides of a diff hunk — confirm
// which one the real file contains.
.field("attribute", &self.attribute().as_u32())
.field("attribute", &self.attribute().0)
.finish()
}
}

View file

@ -1,6 +1,6 @@
use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write};
use std::{fmt, u32};
use std::{fmt, u16};
use std::path::Path;
use std::ops::BitOr;
use std::sync::Arc;
@ -53,7 +53,7 @@ impl SchemaBuilder {
if self.attrs.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u32)
SchemaAttr(len as u16)
}
pub fn build(self) -> Schema {
@ -61,7 +61,7 @@ impl SchemaBuilder {
let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32));
attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop));
}
@ -94,10 +94,9 @@ impl Schema {
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs {
let index = field.as_u32();
let (_, props) = self.inner.props[index as usize];
ordered.insert(index, (name, props));
for (name, attr) in &self.inner.attrs {
let (_, props) = self.inner.props[attr.0 as usize];
ordered.insert(attr.0, (name, props));
}
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
@ -109,8 +108,7 @@ impl Schema {
}
/// Returns the `SchemaProps` registered for `attr`.
///
/// The attribute's numeric value is used directly as an index into
/// `self.inner.props`, whose entries are `(name, props)` pairs.
/// NOTE(review): presumably an out-of-range attribute panics on indexing —
/// confirm the container type of `props`.
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
// NOTE(review): diff hunk without +/- markers. The `as_u32()` pair of lines
// looks like the removed side and the `attr.0` line the added side.
let index = attr.as_u32();
let (_, props) = self.inner.props[index as usize];
let (_, props) = self.inner.props[attr.0 as usize];
props
}
@ -119,26 +117,21 @@ impl Schema {
}
/// Returns the name registered for `attr`, borrowed from the schema.
///
/// The attribute's numeric value is used directly as an index into
/// `self.inner.props`, whose entries are `(name, props)` pairs.
/// NOTE(review): presumably an out-of-range attribute panics on indexing —
/// confirm the container type of `props`.
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
// NOTE(review): diff hunk without +/- markers. The `as_u32()` pair of lines
// looks like the removed side and the `attr.0` line the added side.
let index = attr.as_u32();
let (name, _) = &self.inner.props[index as usize];
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
}
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32);
pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr {
pub fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u32::MAX)
}
pub fn as_u32(&self) -> u32 {
self.0
SchemaAttr(u16::MAX)
}
}

View file

@ -9,12 +9,12 @@ use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::{TokenizerBuilder, Token};
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
pub enum NewState {
Updated { value: Vec<u8> },
@ -355,11 +355,11 @@ where B: TokenizerBuilder
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: self.attribute.as_u32() as u8,
attribute_index: index as u32,
attribute: Attribute::new(self.attribute.0, word_index as u32),
word_area: WordArea::new(char_index as u32, word.len() as u16),
};
// insert the exact representation