feat: Reintroduce stopwords for the serializer

2025-07-03 20:07:09 +02:00 · 2019-01-06 18:03:47 +01:00 · 2019-01-06 18:03:47 +01:00 · 32f8908d71
commit 32f8908d71
parent a2f5e8aa25
7 changed files with 276 additions and 35 deletions
--- a/examples/create-database.rs
+++ b/examples/create-database.rs
@ -1,12 +1,13 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

+use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;

-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;

@ -26,6 +27,13 @@ pub struct Opt {
    /// The path to the schema.
    #[structopt(long = "schema", parse(from_os_str))]
    pub schema_path: PathBuf,
+
+    /// The path to the list of stop words (one per line).
+    #[structopt(long = "stop-words", parse(from_os_str))]
+    pub stop_words_path: Option<PathBuf>,
+
+    #[structopt(long = "update-group-size")]
+    pub update_group_size: Option<usize>,
 }

 #[derive(Serialize, Deserialize)]
@ -34,37 +42,75 @@ struct Document<'a> (
    HashMap<Cow<'a, str>, Cow<'a, str>>
 );

-fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
+/// Creates the database at `database_path` and indexes every record of the
+/// CSV file at `csv_data_path` into it.
+///
+/// Documents are accumulated into an update file which is built and ingested
+/// once the end of the CSV is reached. When `update_group_size` is `Some(n)`
+/// with `n > 0`, an update is additionally built and ingested each time the
+/// number of indexed documents reaches a multiple of `n`. A group size of `0`
+/// is treated like `None` (one single update) instead of panicking on a
+/// remainder-by-zero.
+///
+/// `stop_words` is forwarded to `update_document` for every record.
+///
+/// Records that fail to deserialize are reported on stderr and skipped; they
+/// are not counted as indexed documents. Any other error aborts the indexing
+/// and is returned to the caller.
+///
+/// NOTE(review): when the number of indexed documents is an exact multiple of
+/// the group size, the last pass of the outer loop builds and ingests an
+/// update that contains no document.
+fn index(
+    schema: Schema,
+    database_path: &Path,
+    csv_data_path: &Path,
+    update_group_size: Option<usize>,
+    stop_words: &HashSet<String>,
+) -> Result<Database, Box<Error>>
+{
    let database = Database::create(database_path, &schema)?;

-    println!("start indexing...");
-
-    let tokenizer_builder = DefaultBuilder::new();
-    let update_path = tempfile::NamedTempFile::new()?;
-    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
-
    let mut rdr = csv::Reader::from_path(csv_data_path)?;
    let mut raw_record = csv::StringRecord::new();
    let headers = rdr.headers()?.clone();

-    while rdr.read_record(&mut raw_record)? {
-        let document: Document = match raw_record.deserialize(Some(&headers)) {
-            Ok(document) => document,
-            Err(e) => {
-                eprintln!("{:?}", e);
-                continue;
-            }
-        };
+    let mut i = 0;
+    let mut end_of_file = false;
+
+    // A group size of zero cannot delimit groups and would make the
+    // remainder below panic, so it is handled as "no grouping".
+    let update_group_size = update_group_size.filter(|&size| size != 0);

-        update.update_document(&document, &tokenizer_builder)?;
+    // Each pass of this loop produces and ingests one update file.
+    while !end_of_file {
+        let tokenizer_builder = DefaultBuilder::new();
+        let update_path = tempfile::NamedTempFile::new()?;
+        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+
+        loop {
+            end_of_file = !rdr.read_record(&mut raw_record)?;
+            if end_of_file { break }
+
+            let document: Document = match raw_record.deserialize(Some(&headers)) {
+                Ok(document) => document,
+                Err(e) => {
+                    eprintln!("{:?}", e);
+                    continue;
+                }
+            };
+
+            update.update_document(&document, &tokenizer_builder, &stop_words)?;
+
+            print!("\rindexing document {}", i);
+            i += 1;
+
+            // Close the current group once a multiple of the group size
+            // has been indexed.
+            if let Some(group_size) = update_group_size {
+                if i % group_size == 0 { break }
+            }
+        }
+
+        println!();
+
+        println!("building update...");
+        let update = update.build()?;
+        println!("ingesting update...");
+        database.ingest_update_file(update)?;
    }

-    let update = update.build()?;
-    database.ingest_update_file(update)?;
-
    Ok(database)
 }

+/// Reads the stop words file at `path` and returns the set of words it lists.
+///
+/// Each line holds one word. Surrounding whitespace is trimmed, and lines
+/// that are empty once trimmed are ignored so that the empty string is never
+/// registered as a stop word.
+///
+/// # Errors
+///
+/// Returns the underlying `io::Error` if the file cannot be opened or if a
+/// line cannot be read.
+fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
+    let f = File::open(path)?;
+    let reader = BufReader::new(f);
+    let mut words = HashSet::new();
+
+    for line in reader.lines() {
+        let line = line?;
+        let word = line.trim();
+
+        // Blank lines (e.g. a trailing newline at the end of the file)
+        // carry no word.
+        if !word.is_empty() {
+            words.insert(word.to_string());
+        }
+    }
+
+    Ok(words)
+}
+
 fn main() -> Result<(), Box<Error>> {
    let _ = env_logger::init();
    let opt = Opt::from_args();
@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
        Schema::from_toml(file)?
    };

+    let stop_words = match opt.stop_words_path {
+        Some(ref path) => retrieve_stop_words(path)?,
+        None           => HashSet::new(),
+    };
+
    let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path)
+        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
    });

    if let Err(e) = result {