From 31224a842560341e4d5f64c9f3b1f9a99c121822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Sep 2020 14:49:22 +0200 Subject: [PATCH] Index the word pair proximities for both orders of the pair --- src/bin/indexer.rs | 3 --- src/bin/infos.rs | 12 +++++------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index f6494839e..767dc576d 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -147,8 +147,6 @@ fn compute_words_pair_proximities( if prox > 0 && prox < 8 { distances.insert(prox); } } if !distances.is_empty() { - // We only store the proximites under one word pair. - let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) }; words_pair_proximities.entry((w1.as_str(), w2.as_str())) .or_insert_with(RoaringBitmap::new) .union_with(&distances); @@ -256,7 +254,6 @@ impl Store { let mut buffer = Vec::new(); for ((w1, w2), proximities) in words_pair_proximities { - assert!(w1 <= w2); key.truncate(1); key.extend_from_slice(w1.as_bytes()); key.push(0); diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 1b8eb5d24..923b40c3f 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -74,7 +74,7 @@ enum Command { /// Outputs a CSV with the proximities for the two specidied words and /// the documents ids where these relations appears. /// - /// `word1`, `word2` defines the word pair specified and sorted. + /// `word1`, `word2` defines the word pair specified *in this specific order*. /// `proximity` defines the proximity between the two specified words. /// `documents_ids` defines the documents ids where the relation appears. WordPairProximitiesDocids { @@ -339,17 +339,15 @@ fn word_pair_proximities_docids( use heed::types::ByteSlice; use milli::RoaringBitmapCodec; - let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) }; - let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?; // Create the prefix key with only the pair of words. - let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1); - prefix.extend_from_slice(w1.as_bytes()); + let mut prefix = Vec::with_capacity(word1.len() + word2.len() + 1); + prefix.extend_from_slice(word1.as_bytes()); prefix.push(0); - prefix.extend_from_slice(w2.as_bytes()); + prefix.extend_from_slice(word2.as_bytes()); let db = index.word_pair_proximity_docids.as_polymorph(); let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; @@ -366,7 +364,7 @@ fn word_pair_proximities_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?; + wtr.write_record(&[&word1, &word2, &proximity.to_string(), &docids])?; } Ok(wtr.flush()?)