mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-25 22:34:28 +01:00
Make the indexer ignore certain words
This prepares for making indexing fully parallel: each thread's indexer is only aware of certain words, which avoids conflicts between threads on the postings list of each word.
This commit is contained in:
parent
a3ac2623d5
commit
2ae3f40971
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -817,6 +817,7 @@ dependencies = [
|
||||
"linked-hash-map",
|
||||
"memmap",
|
||||
"once_cell",
|
||||
"rayon",
|
||||
"roaring",
|
||||
"serde",
|
||||
"slice-group-by",
|
||||
@ -1342,10 +1343,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.3.0"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
|
||||
checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080"
|
||||
dependencies = [
|
||||
"autocfg 1.0.0",
|
||||
"crossbeam-deque",
|
||||
"either",
|
||||
"rayon-core",
|
||||
@ -1353,9 +1355,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.7.0"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
|
||||
checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-queue",
|
||||
|
@ -19,6 +19,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||
linked-hash-map = "0.5.3"
|
||||
memmap = "0.7.0"
|
||||
once_cell = "1.4.0"
|
||||
rayon = "1.3.1"
|
||||
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
|
||||
slice-group-by = "0.2.6"
|
||||
smallstr = "0.2.0"
|
||||
|
@ -77,7 +77,14 @@ where
|
||||
a
|
||||
}
|
||||
|
||||
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
|
||||
fn index_csv<R: io::Read>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
mut rdr: csv::Reader<R>,
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
thread_index: usize,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
eprintln!("Indexing into LMDB...");
|
||||
|
||||
let mut words_cache = ArcCache::<_, (RoaringBitmap, FastMap4<_, RoaringBitmap>)>::new(100_000);
|
||||
@ -100,6 +107,9 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
|
||||
let word = word.to_lowercase(); // TODO cow_to_lowercase
|
||||
let position = (attr * 1000 + pos) as u32;
|
||||
|
||||
// If this indexing process is not concerned by this word, then ignore it.
|
||||
if fxhash::hash32(&word) as usize % num_threads != thread_index { continue; }
|
||||
|
||||
match words_cache.get_mut(&word) {
|
||||
(Some(entry), evicted) => {
|
||||
let (ids, positions) = entry;
|
||||
@ -214,11 +224,11 @@ fn main() -> anyhow::Result<()> {
|
||||
match opt.csv_file {
|
||||
Some(path) => {
|
||||
let rdr = csv::Reader::from_path(path)?;
|
||||
index_csv(&mut wtxn, rdr, &index)?;
|
||||
index_csv(&mut wtxn, rdr, &index, 1, 0)?;
|
||||
},
|
||||
None => {
|
||||
let rdr = csv::Reader::from_reader(io::stdin());
|
||||
index_csv(&mut wtxn, rdr, &index)?;
|
||||
index_csv(&mut wtxn, rdr, &index, 1, 0)?;
|
||||
}
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user