Clean up a little bit

This commit is contained in:
Kerollmops 2020-05-31 12:29:19 +02:00
parent 3a998cf39c
commit 6762c2d08f
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -116,7 +116,9 @@ impl Indexed {
} }
// assert headers are valid // assert headers are valid
if !self.headers.is_empty() {
assert_eq!(self.headers, other.headers); assert_eq!(self.headers, other.headers);
}
// extend the documents // extend the documents
self.documents.append(&mut other.documents); self.documents.append(&mut other.documents);
@ -130,18 +132,13 @@ impl Indexed {
} }
} }
fn index_csv( fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<Indexed> {
tid: usize,
mut rdr: csv::Reader<File>,
) -> anyhow::Result<Indexed>
{
const MAX_POSITION: usize = 1000; const MAX_POSITION: usize = 1000;
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
let mut document = csv::StringRecord::new(); let mut document = csv::StringRecord::new();
let mut postings_ids = FastMap4::default(); let mut postings_ids = FastMap4::default();
let mut documents = Vec::new(); let mut documents = Vec::new();
let mut number_of_documents = 0;
// Write the headers into a Vec of bytes. // Write the headers into a Vec of bytes.
let headers = rdr.headers()?; let headers = rdr.headers()?;
@ -168,14 +165,7 @@ fn index_csv(
writer.write_byte_record(document.as_byte_record())?; writer.write_byte_record(document.as_byte_record())?;
let document = writer.into_inner()?; let document = writer.into_inner()?;
documents.push((document_id, document)); documents.push((document_id, document));
number_of_documents += 1;
if number_of_documents % 100000 == 0 {
eprintln!("{}, documents seen {}", tid, number_of_documents);
} }
}
eprintln!("Start collecting the words into an FST");
// We compute and store the postings list into the DB. // We compute and store the postings list into the DB.
let mut new_words = BTreeSet::default(); let mut new_words = BTreeSet::default();
@ -185,8 +175,6 @@ fn index_csv(
let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?; let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?;
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed));
Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents }) Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents })
} }
@ -244,21 +232,26 @@ fn main() -> anyhow::Result<()> {
let res = opt.files_to_index let res = opt.files_to_index
.into_par_iter() .into_par_iter()
.enumerate() .try_fold(|| Indexed::default(), |acc, path| {
.try_fold(|| Indexed::default(), |acc, (tid, path)| {
let rdr = csv::Reader::from_path(path)?; let rdr = csv::Reader::from_path(path)?;
let indexed = index_csv(tid, rdr)?; let indexed = index_csv(rdr)?;
Ok(acc.merge_with(indexed)) as anyhow::Result<Indexed> Ok(acc.merge_with(indexed)) as anyhow::Result<Indexed>
}) })
.map(|indexed| match indexed { .map(|indexed| match indexed {
Ok(indexed) => { Ok(indexed) => {
let tid = rayon::current_thread_index();
eprintln!("{:?}: A new step to write into LMDB", tid);
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;
let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?; let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?;
wtxn.commit()?; wtxn.commit()?;
eprintln!("{:?}: Wrote {} documents into LMDB", tid, count);
Ok(count) Ok(count)
}, },
Err(e) => Err(e), Err(e) => Err(e),
}) })
.inspect(|_| {
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed))
})
.try_reduce(|| 0, |a, b| Ok(a + b)); .try_reduce(|| 0, |a, b| Ok(a + b));
println!("indexed {:?} documents", res); println!("indexed {:?} documents", res);