mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Clean up a little bit
This commit is contained in:
parent
3a998cf39c
commit
6762c2d08f
31
src/main.rs
31
src/main.rs
@@ -116,7 +116,9 @@ impl Indexed {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// assert headers are valid
|
// assert headers are valid
|
||||||
assert_eq!(self.headers, other.headers);
|
if !self.headers.is_empty() {
|
||||||
|
assert_eq!(self.headers, other.headers);
|
||||||
|
}
|
||||||
|
|
||||||
// extend the documents
|
// extend the documents
|
||||||
self.documents.append(&mut other.documents);
|
self.documents.append(&mut other.documents);
|
||||||
@@ -130,18 +132,13 @@ impl Indexed {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn index_csv(
|
fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<Indexed> {
|
||||||
tid: usize,
|
|
||||||
mut rdr: csv::Reader<File>,
|
|
||||||
) -> anyhow::Result<Indexed>
|
|
||||||
{
|
|
||||||
const MAX_POSITION: usize = 1000;
|
const MAX_POSITION: usize = 1000;
|
||||||
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
||||||
|
|
||||||
let mut document = csv::StringRecord::new();
|
let mut document = csv::StringRecord::new();
|
||||||
let mut postings_ids = FastMap4::default();
|
let mut postings_ids = FastMap4::default();
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
let mut number_of_documents = 0;
|
|
||||||
|
|
||||||
// Write the headers into a Vec of bytes.
|
// Write the headers into a Vec of bytes.
|
||||||
let headers = rdr.headers()?;
|
let headers = rdr.headers()?;
|
||||||
@@ -168,15 +165,8 @@ fn index_csv(
|
|||||||
writer.write_byte_record(document.as_byte_record())?;
|
writer.write_byte_record(document.as_byte_record())?;
|
||||||
let document = writer.into_inner()?;
|
let document = writer.into_inner()?;
|
||||||
documents.push((document_id, document));
|
documents.push((document_id, document));
|
||||||
|
|
||||||
number_of_documents += 1;
|
|
||||||
if number_of_documents % 100000 == 0 {
|
|
||||||
eprintln!("{}, documents seen {}", tid, number_of_documents);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
eprintln!("Start collecting the words into an FST");
|
|
||||||
|
|
||||||
// We compute and store the postings list into the DB.
|
// We compute and store the postings list into the DB.
|
||||||
let mut new_words = BTreeSet::default();
|
let mut new_words = BTreeSet::default();
|
||||||
for (word, _new_ids) in &postings_ids {
|
for (word, _new_ids) in &postings_ids {
|
||||||
@@ -185,8 +175,6 @@ fn index_csv(
|
|||||||
|
|
||||||
let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?;
|
let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?;
|
||||||
|
|
||||||
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed));
|
|
||||||
|
|
||||||
Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents })
|
Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents })
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,21 +232,26 @@ fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
let res = opt.files_to_index
|
let res = opt.files_to_index
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.enumerate()
|
.try_fold(|| Indexed::default(), |acc, path| {
|
||||||
.try_fold(|| Indexed::default(), |acc, (tid, path)| {
|
|
||||||
let rdr = csv::Reader::from_path(path)?;
|
let rdr = csv::Reader::from_path(path)?;
|
||||||
let indexed = index_csv(tid, rdr)?;
|
let indexed = index_csv(rdr)?;
|
||||||
Ok(acc.merge_with(indexed)) as anyhow::Result<Indexed>
|
Ok(acc.merge_with(indexed)) as anyhow::Result<Indexed>
|
||||||
})
|
})
|
||||||
.map(|indexed| match indexed {
|
.map(|indexed| match indexed {
|
||||||
Ok(indexed) => {
|
Ok(indexed) => {
|
||||||
|
let tid = rayon::current_thread_index();
|
||||||
|
eprintln!("{:?}: A new step to write into LMDB", tid);
|
||||||
let mut wtxn = env.write_txn()?;
|
let mut wtxn = env.write_txn()?;
|
||||||
let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?;
|
let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?;
|
||||||
wtxn.commit()?;
|
wtxn.commit()?;
|
||||||
|
eprintln!("{:?}: Wrote {} documents into LMDB", tid, count);
|
||||||
Ok(count)
|
Ok(count)
|
||||||
},
|
},
|
||||||
Err(e) => Err(e),
|
Err(e) => Err(e),
|
||||||
})
|
})
|
||||||
|
.inspect(|_| {
|
||||||
|
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed))
|
||||||
|
})
|
||||||
.try_reduce(|| 0, |a, b| Ok(a + b));
|
.try_reduce(|| 0, |a, b| Ok(a + b));
|
||||||
|
|
||||||
println!("indexed {:?} documents", res);
|
println!("indexed {:?} documents", res);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user