MeiliSearch/meilisearch-core/src/raw_document.rs

use std::fmt;
use std::sync::Arc;

use sdset::SetBuf;
use slice_group_by::GroupBy;

use crate::{DocumentId, Highlight, TmpMatch, AttrCount};

/// A document produced by a search, together with the matches and highlights
/// that were found for it.
///
/// The per-match data is not stored inline: `matches` points into column
/// vectors shared by all the documents built in the same call to
/// `raw_documents_from`, and the accessor methods on this type return this
/// document's slice of each column.
#[derive(Clone)]
pub struct RawDocument {
    /// Identifier of the document.
    pub id: DocumentId,
    /// Shared match columns plus the window of them that belongs to this document.
    pub matches: SharedMatches,
    /// Highlights collected for this document.
    pub highlights: Vec<Highlight>,
    /// Per-attribute counts for this document. `raw_documents_from` always
    /// leaves this as `None`.
    // NOTE(review): presumably filled in lazily by the caller when a criterion
    // needs it — confirm against the query builder.
    pub fields_counts: Option<SetBuf<AttrCount>>,
}

// Every accessor below slices one of the shared match columns with this
// document's range, skipping the bounds check.
//
// Why that is sound: the fields of `SharedMatches` are private, so a range can
// only be paired with its columns inside this module. `raw_documents_from`
// builds each range from the cumulative lengths of the match groups while
// pushing exactly that many entries into every column, so `start..end` is
// always within the bounds of all five vectors.
impl RawDocument {
    /// Returns the `query_index` of each match of this document.
    pub fn query_index(&self) -> &[u32] {
        let shared = &self.matches;
        let (start, end) = (shared.range.start, shared.range.end);
        // SAFETY: `start..end` is in bounds of every column, see the note
        // at the top of this impl.
        unsafe { shared.matches.query_index.get_unchecked(start..end) }
    }

    /// Returns the `distance` of each match of this document.
    pub fn distance(&self) -> &[u8] {
        let shared = &self.matches;
        let (start, end) = (shared.range.start, shared.range.end);
        // SAFETY: `start..end` is in bounds of every column, see the note
        // at the top of this impl.
        unsafe { shared.matches.distance.get_unchecked(start..end) }
    }

    /// Returns the `attribute` of each match of this document.
    pub fn attribute(&self) -> &[u16] {
        let shared = &self.matches;
        let (start, end) = (shared.range.start, shared.range.end);
        // SAFETY: `start..end` is in bounds of every column, see the note
        // at the top of this impl.
        unsafe { shared.matches.attribute.get_unchecked(start..end) }
    }

    /// Returns the `word_index` of each match of this document.
    pub fn word_index(&self) -> &[u16] {
        let shared = &self.matches;
        let (start, end) = (shared.range.start, shared.range.end);
        // SAFETY: `start..end` is in bounds of every column, see the note
        // at the top of this impl.
        unsafe { shared.matches.word_index.get_unchecked(start..end) }
    }

    /// Returns the `is_exact` flag of each match of this document.
    pub fn is_exact(&self) -> &[bool] {
        let shared = &self.matches;
        let (start, end) = (shared.range.start, shared.range.end);
        // SAFETY: `start..end` is in bounds of every column, see the note
        // at the top of this impl.
        unsafe { shared.matches.is_exact.get_unchecked(start..end) }
    }
}

/// Hand-written `Debug` output: one right-aligned label per line followed by
/// the document id or the matching column slice.
///
/// Only the id and the five match columns are printed; `highlights` and
/// `fields_counts` are not part of the output.
impl fmt::Debug for RawDocument {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The opening and closing braces go through `write_str` so that they
        // are emitted literally instead of being parsed as format placeholders.
        f.write_str("RawDocument {\r\n")?;
        write!(f, "{:>15}: {:?},\r\n", "id", self.id)?;
        write!(f, "{:>15}: {:^5?},\r\n", "query_index", self.query_index())?;
        write!(f, "{:>15}: {:^5?},\r\n", "distance", self.distance())?;
        write!(f, "{:>15}: {:^5?},\r\n", "attribute", self.attribute())?;
        write!(f, "{:>15}: {:^5?},\r\n", "word_index", self.word_index())?;
        write!(f, "{:>15}: {:^5?},\r\n", "is_exact", self.is_exact())?;
        f.write_str("}")
    }
}

/// Builds one [`RawDocument`] per document out of flat lists of matches and
/// highlights.
///
/// Consecutive entries sharing the same `DocumentId` are grouped together in
/// both inputs, and the n-th group of `matches` is paired with the n-th group
/// of `highlights`. The match data of every group is appended to a single set
/// of column vectors shared (through an `Arc`) by all returned documents; each
/// document only remembers which window of those columns is its own.
///
/// `fields_counts` is left as `None` on every returned document.
///
/// # Panics
///
/// Panics if a paired match group and highlight group do not start with the
/// same document id.
// NOTE(review): the two group iterators are zipped, so if one input holds more
// groups than the other the surplus groups are dropped without any error —
// confirm that callers always pass inputs covering the same documents.
pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>
) -> Vec<RawDocument> {
    let mut columns = Matches::with_capacity(matches.len());
    let mut pending: Vec<(_, Range, _, _)> = Vec::new();
    // Index in `columns` at which the next document's matches begin.
    let mut offset = 0;

    let match_groups = matches.linear_group_by_key(|(id, _)| *id);
    let highlight_groups = highlights.linear_group_by_key(|(id, _)| *id);

    for (mgroup, hgroup) in match_groups.zip(highlight_groups) {
        let document_id = mgroup[0].0;
        assert_eq!(document_id, hgroup[0].0);

        let start = offset;
        let end = start + mgroup.len();
        offset = end;

        let doc_highlights = hgroup.iter().map(|(_, h)| *h).collect();
        let fields_counts = None;
        pending.push((document_id, Range { start, end }, doc_highlights, fields_counts));

        // TODO we could try to keep both data
        //  - the data oriented one and,
        //  - the raw one, the one that comes from the arguments of this function
        // This way we would be able to only produce data oriented lazily.
        //
        // For example the default first criterion is `SumOfTypos`
        // and just needs the `query_index` and the `distance` fields.
        // It would probably be good to avoid wasting time sorting other fields of documents
        // that will never ever reach the second criterion.
        columns.extend_from_slice(mgroup);
    }

    // The columns are complete: freeze them behind an `Arc` and hand every
    // document a cheap handle to them alongside its own range.
    let shared = Arc::new(columns);
    pending
        .into_iter()
        .map(|(id, range, highlights, fields_counts)| RawDocument {
            id,
            matches: SharedMatches { range, matches: Arc::clone(&shared) },
            highlights,
            fields_counts,
        })
        .collect()
}

/// Half-open window `start..end` into the shared match columns.
///
/// A small local type is used here because, unlike `std::ops::Range`, it can
/// derive `Copy`.
#[derive(Debug, Copy, Clone)]
struct Range {
    /// Index of the first match of the document (inclusive).
    start: usize,
    /// Index one past the last match of the document (exclusive).
    end: usize,
}

/// A document's view of the match columns shared between all documents built
/// together.
///
/// Both fields are private so that a range can only ever be paired with the
/// columns it was computed for; the unchecked slicing in the `RawDocument`
/// accessors relies on this.
#[derive(Clone)]
pub struct SharedMatches {
    /// Window of `matches` that belongs to the owning document.
    range: Range,
    /// Column storage shared by reference count; cloning only bumps the count.
    matches: Arc<Matches>,
}

/// Match data stored as one vector per field instead of one struct per match.
///
/// The vectors are parallel: `extend_from_slice` pushes to all five at once,
/// so index `i` in each of them describes the same match and they always have
/// the same length.
#[derive(Clone)]
struct Matches {
    /// `query_index` of each match.
    query_index: Vec<u32>,
    /// `distance` of each match.
    distance: Vec<u8>,
    /// `attribute` of each match.
    attribute: Vec<u16>,
    /// `word_index` of each match.
    word_index: Vec<u16>,
    /// `is_exact` flag of each match.
    is_exact: Vec<bool>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
        }
    }

    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
        for (_, match_) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
        }
    }
}
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`use std::fmt;`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`use std::sync::Arc;`
Use the documents_fileds_count store in the QueryBuilder 2019-10-14 18:48:32 +02:00
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`use sdset::SetBuf;`
			`use slice_group_by::GroupBy;`
Use the documents_fileds_count store in the QueryBuilder 2019-10-14 18:48:32 +02:00
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`use crate::{DocumentId, Highlight, TmpMatch, AttrCount};`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00
			`#[derive(Clone)]`
			`pub struct RawDocument {`
			`pub id: DocumentId,`
			`pub matches: SharedMatches,`
			`pub highlights: Vec<Highlight>,`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`pub fields_counts: Option<SetBuf<AttrCount>>,`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`}`

			`impl RawDocument {`
			`pub fn query_index(&self) -> &[u32] {`
			`let r = self.matches.range;`
			`// it is safe because construction/modifications`
			`// can only be done in this module`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`unsafe {`
			`&self`
			`.matches`
			`.matches`
			`.query_index`
			`.get_unchecked(r.start..r.end)`
			`}`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`}`

			`pub fn distance(&self) -> &[u8] {`
			`let r = self.matches.range;`
			`// it is safe because construction/modifications`
			`// can only be done in this module`
			`unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }`
			`}`

			`pub fn attribute(&self) -> &[u16] {`
			`let r = self.matches.range;`
			`// it is safe because construction/modifications`
			`// can only be done in this module`
			`unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }`
			`}`

			`pub fn word_index(&self) -> &[u16] {`
			`let r = self.matches.range;`
			`// it is safe because construction/modifications`
			`// can only be done in this module`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`unsafe {`
			`&self`
			`.matches`
			`.matches`
			`.word_index`
			`.get_unchecked(r.start..r.end)`
			`}`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`}`

			`pub fn is_exact(&self) -> &[bool] {`
			`let r = self.matches.range;`
			`// it is safe because construction/modifications`
			`// can only be done in this module`
			`unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }`
			`}`
			`}`

			`impl fmt::Debug for RawDocument {`
			`fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {`
			`f.write_str("RawDocument {\r\n")?;`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;`
			`f.write_fmt(format_args!(`
			`"{:>15}: {:^5?},\r\n",`
			`"query_index",`
			`self.query_index()`
			`))?;`
			`f.write_fmt(format_args!(`
			`"{:>15}: {:^5?},\r\n",`
			`"distance",`
			`self.distance()`
			`))?;`
			`f.write_fmt(format_args!(`
			`"{:>15}: {:^5?},\r\n",`
			`"attribute",`
			`self.attribute()`
			`))?;`
			`f.write_fmt(format_args!(`
			`"{:>15}: {:^5?},\r\n",`
			`"word_index",`
			`self.word_index()`
			`))?;`
			`f.write_fmt(format_args!(`
			`"{:>15}: {:^5?},\r\n",`
			`"is_exact",`
			`self.is_exact()`
			`))?;`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`f.write_str("}")?;`
			`Ok(())`
			`}`
			`}`

feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00			`pub fn raw_documents_from(`
			`matches: SetBuf<(DocumentId, TmpMatch)>,`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`highlights: SetBuf<(DocumentId, Highlight)>`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`) -> Vec<RawDocument> {`
Use the documents_fileds_count store in the QueryBuilder 2019-10-14 18:48:32 +02:00			`let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`let mut matches2 = Matches::with_capacity(matches.len());`

feat: Move the multi-word rewriting algorithm into its own function 2019-08-02 12:07:23 +02:00			`let matches = matches.linear_group_by_key(\|(id, _)\| *id);`
			`let highlights = highlights.linear_group_by_key(\|(id, _)\| *id);`
feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`for (mgroup, hgroup) in matches.zip(highlights) {`
			`assert_eq!(mgroup[0].0, hgroup[0].0);`
feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00
			`let document_id = mgroup[0].0;`
Use the documents_fileds_count store in the QueryBuilder 2019-10-14 18:48:32 +02:00			`let start = docs_ranges.last().map(\|(_, r, _, _)\| r.end).unwrap_or(0);`
feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00			`let end = start + mgroup.len();`
			`let highlights = hgroup.iter().map(\|(_, h)\| *h).collect();`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`let fields_counts = None;`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00
Use the documents_fileds_count store in the QueryBuilder 2019-10-14 18:48:32 +02:00			`docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`// TODO we could try to keep both data`
Add more debug infos 2019-11-30 16:33:48 +01:00			`// - the data oriented one and,`
			`// - the raw one, the one that comes from the arguments of this function`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`// This way we would be able to only produce data oriented lazily.`
			`//`
			// For example the default first criterion is `SumOfTypos`
			// and just needs the `query_index` and the `distance` fields.
			`// It would probably be good to avoid wasting time sorting other fields of documents`
			`// that will never ever reach the second criterion.`
feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00			`matches2.extend_from_slice(mgroup);`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`}`

			`let matches = Arc::new(matches2);`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`docs_ranges`
			`.into_iter()`
			`.map(\|(id, range, highlights, fields_counts)\| {`
Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion 2019-11-29 16:31:47 +01:00			`let matches = SharedMatches { range, matches: matches.clone() };`
			`RawDocument { id, matches, highlights, fields_counts }`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`})`
			`.collect()`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`}`

			`#[derive(Debug, Copy, Clone)]`
			`struct Range {`
			`start: usize,`
			`end: usize,`
			`}`

			`#[derive(Clone)]`
			`pub struct SharedMatches {`
			`range: Range,`
			`matches: Arc<Matches>,`
			`}`

			`#[derive(Clone)]`
			`struct Matches {`
			`query_index: Vec<u32>,`
			`distance: Vec<u8>,`
			`attribute: Vec<u16>,`
			`word_index: Vec<u16>,`
			`is_exact: Vec<bool>,`
			`}`

			`impl Matches {`
			`fn with_capacity(cap: usize) -> Matches {`
			`Matches {`
			`query_index: Vec::with_capacity(cap),`
			`distance: Vec::with_capacity(cap),`
			`attribute: Vec::with_capacity(cap),`
			`word_index: Vec::with_capacity(cap),`
			`is_exact: Vec::with_capacity(cap),`
			`}`
			`}`

feat: Separate highlights from matches to make the code easier to follow 2019-07-15 19:34:53 +02:00			`fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {`
			`for (_, match_) in matches {`
feat: Move the RawDocument type to its own module 2019-07-07 19:55:15 +02:00			`self.query_index.push(match_.query_index);`
			`self.distance.push(match_.distance);`
			`self.attribute.push(match_.attribute);`
			`self.word_index.push(match_.word_index);`
			`self.is_exact.push(match_.is_exact);`
			`}`
			`}`
			`}`