mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
This commit is contained in:
parent
62521262e8
commit
b32c96cdc9
14 changed files with 373 additions and 136 deletions
|
@ -203,14 +203,15 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
use std::error::Error;
|
||||
use crate::{Attribute, WordArea};
|
||||
|
||||
use crate::DocumentId;
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
|
||||
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
|
||||
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
|
||||
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
|
||||
|
@ -231,9 +232,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 };
|
||||
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 };
|
||||
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 };
|
||||
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
|
||||
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
|
||||
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
|
||||
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
|
||||
|
|
|
@ -73,7 +73,7 @@ impl DocumentKeyAttr {
|
|||
let mut wtr = Cursor::new(&mut buffer[..]);
|
||||
wtr.write_all(&raw_key).unwrap();
|
||||
wtr.write_all(b"-").unwrap();
|
||||
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
|
||||
wtr.write_u16::<NativeEndian>(attr.0).unwrap();
|
||||
|
||||
DocumentKeyAttr(buffer)
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ impl DocumentKeyAttr {
|
|||
|
||||
pub fn attribute(&self) -> SchemaAttr {
|
||||
let offset = 4 + size_of::<u64>() + 1;
|
||||
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
|
||||
let value = (&self.0[offset..]).read_u16::<NativeEndian>().unwrap();
|
||||
SchemaAttr::new(value)
|
||||
}
|
||||
|
||||
|
@ -114,7 +114,7 @@ impl fmt::Debug for DocumentKeyAttr {
|
|||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("DocumentKeyAttr")
|
||||
.field("document_id", &self.document_id())
|
||||
.field("attribute", &self.attribute().as_u32())
|
||||
.field("attribute", &self.attribute().0)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::io::{Read, Write};
|
||||
use std::{fmt, u32};
|
||||
use std::{fmt, u16};
|
||||
use std::path::Path;
|
||||
use std::ops::BitOr;
|
||||
use std::sync::Arc;
|
||||
|
@ -53,7 +53,7 @@ impl SchemaBuilder {
|
|||
if self.attrs.insert(name.into(), props).is_some() {
|
||||
panic!("Field already inserted.")
|
||||
}
|
||||
SchemaAttr(len as u32)
|
||||
SchemaAttr(len as u16)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Schema {
|
||||
|
@ -61,7 +61,7 @@ impl SchemaBuilder {
|
|||
let mut props = Vec::new();
|
||||
|
||||
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u32));
|
||||
attrs.insert(name.clone(), SchemaAttr(i as u16));
|
||||
props.push((name, prop));
|
||||
}
|
||||
|
||||
|
@ -94,10 +94,9 @@ impl Schema {
|
|||
|
||||
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
|
||||
let mut ordered = BTreeMap::new();
|
||||
for (name, field) in &self.inner.attrs {
|
||||
let index = field.as_u32();
|
||||
let (_, props) = self.inner.props[index as usize];
|
||||
ordered.insert(index, (name, props));
|
||||
for (name, attr) in &self.inner.attrs {
|
||||
let (_, props) = self.inner.props[attr.0 as usize];
|
||||
ordered.insert(attr.0, (name, props));
|
||||
}
|
||||
|
||||
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
|
||||
|
@ -109,8 +108,7 @@ impl Schema {
|
|||
}
|
||||
|
||||
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
|
||||
let index = attr.as_u32();
|
||||
let (_, props) = self.inner.props[index as usize];
|
||||
let (_, props) = self.inner.props[attr.0 as usize];
|
||||
props
|
||||
}
|
||||
|
||||
|
@ -119,26 +117,21 @@ impl Schema {
|
|||
}
|
||||
|
||||
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
|
||||
let index = attr.as_u32();
|
||||
let (name, _) = &self.inner.props[index as usize];
|
||||
let (name, _) = &self.inner.props[attr.0 as usize];
|
||||
name
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
|
||||
pub struct SchemaAttr(u32);
|
||||
pub struct SchemaAttr(pub(crate) u16);
|
||||
|
||||
impl SchemaAttr {
|
||||
pub fn new(value: u32) -> SchemaAttr {
|
||||
pub fn new(value: u16) -> SchemaAttr {
|
||||
SchemaAttr(value)
|
||||
}
|
||||
|
||||
pub fn max() -> SchemaAttr {
|
||||
SchemaAttr(u32::MAX)
|
||||
}
|
||||
|
||||
pub fn as_u32(&self) -> u32 {
|
||||
self.0
|
||||
SchemaAttr(u16::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -9,12 +9,12 @@ use serde::ser::{self, Serialize};
|
|||
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::tokenizer::{TokenizerBuilder, Token};
|
||||
use crate::database::DocumentKeyAttr;
|
||||
use crate::database::update::Update;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::database::blob::Blob;
|
||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
||||
|
||||
pub enum NewState {
|
||||
Updated { value: Vec<u8> },
|
||||
|
@ -355,11 +355,11 @@ where B: TokenizerBuilder
|
|||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for (index, word) in self.tokenizer_builder.build(v) {
|
||||
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute: self.attribute.as_u32() as u8,
|
||||
attribute_index: index as u32,
|
||||
attribute: Attribute::new(self.attribute.0, word_index as u32),
|
||||
word_area: WordArea::new(char_index as u32, word.len() as u16),
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue