Index the word pair proximities for both orders of the pair

This commit is contained in:
Clément Renault 2020-09-22 14:49:22 +02:00
parent a58ae5eb2a
commit 31224a8425
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 5 additions and 10 deletions

View File

@ -147,8 +147,6 @@ fn compute_words_pair_proximities(
if prox > 0 && prox < 8 { distances.insert(prox); }
}
if !distances.is_empty() {
// We only store the proximities under one word pair.
let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) };
words_pair_proximities.entry((w1.as_str(), w2.as_str()))
.or_insert_with(RoaringBitmap::new)
.union_with(&distances);
@ -256,7 +254,6 @@ impl Store {
let mut buffer = Vec::new();
for ((w1, w2), proximities) in words_pair_proximities {
assert!(w1 <= w2);
key.truncate(1);
key.extend_from_slice(w1.as_bytes());
key.push(0);

View File

@ -74,7 +74,7 @@ enum Command {
/// Outputs a CSV with the proximities for the two specified words and
/// the documents ids where these relations appear.
///
/// `word1`, `word2` defines the word pair specified and sorted.
/// `word1`, `word2` defines the word pair specified *in this specific order*.
/// `proximity` defines the proximity between the two specified words.
/// `documents_ids` defines the documents ids where the relation appears.
WordPairProximitiesDocids {
@ -339,17 +339,15 @@ fn word_pair_proximities_docids(
use heed::types::ByteSlice;
use milli::RoaringBitmapCodec;
let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) };
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?;
// Create the prefix key with only the pair of words.
let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1);
prefix.extend_from_slice(w1.as_bytes());
let mut prefix = Vec::with_capacity(word1.len() + word2.len() + 1);
prefix.extend_from_slice(word1.as_bytes());
prefix.push(0);
prefix.extend_from_slice(w2.as_bytes());
prefix.extend_from_slice(word2.as_bytes());
let db = index.word_pair_proximity_docids.as_polymorph();
let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
@ -366,7 +364,7 @@ fn word_pair_proximities_docids(
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?;
wtr.write_record(&[&word1, &word2, &proximity.to_string(), &docids])?;
}
Ok(wtr.flush()?)