135: Add stop words r=curquiza a=irevoire

closes #21 

Co-authored-by: tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2021-04-08 11:29:00 +00:00 committed by GitHub
commit f881e8691e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 71 additions and 152 deletions

30
Cargo.lock generated
View File

@ -1,5 +1,7 @@
# This file is automatically @generated by Cargo. # This file is automatically @generated by Cargo.
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3
[[package]] [[package]]
name = "actix-codec" name = "actix-codec"
version = "0.3.0" version = "0.3.0"
@ -1845,7 +1847,7 @@ dependencies = [
"log", "log",
"main_error", "main_error",
"meilisearch-error", "meilisearch-error",
"meilisearch-tokenizer", "meilisearch-tokenizer 0.1.1 (git+https://github.com/meilisearch/Tokenizer.git?branch=main)",
"memmap", "memmap",
"milli", "milli",
"mime", "mime",
@ -1875,6 +1877,22 @@ dependencies = [
"vergen", "vergen",
] ]
[[package]]
name = "meilisearch-tokenizer"
version = "0.1.1"
source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]] [[package]]
name = "meilisearch-tokenizer" name = "meilisearch-tokenizer"
version = "0.1.1" version = "0.1.1"
@ -1919,7 +1937,7 @@ dependencies = [
[[package]] [[package]]
name = "milli" name = "milli"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/meilisearch/milli.git?rev=b7b23cd#b7b23cd4a8e62932c66c2ebedf9d89ddf089e299" source = "git+https://github.com/meilisearch/milli.git?rev=2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c#2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bstr", "bstr",
@ -1939,7 +1957,7 @@ dependencies = [
"linked-hash-map", "linked-hash-map",
"log", "log",
"logging_timer", "logging_timer",
"meilisearch-tokenizer", "meilisearch-tokenizer 0.1.1 (git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0)",
"memmap", "memmap",
"num-traits", "num-traits",
"obkv", "obkv",
@ -2234,8 +2252,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
[[package]] [[package]]
name = "pest" name = "pest"
version = "2.1.3" version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [ dependencies = [
"ucd-trie", "ucd-trie",
] ]
@ -2243,7 +2260,8 @@ dependencies = [
[[package]] [[package]]
name = "pest" name = "pest"
version = "2.1.3" version = "2.1.3"
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [ dependencies = [
"ucd-trie", "ucd-trie",
] ]

View File

@ -42,7 +42,7 @@ main_error = "0.1.0"
meilisearch-error = { path = "../meilisearch-error" } meilisearch-error = { path = "../meilisearch-error" }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" }
memmap = "0.7.0" memmap = "0.7.0"
milli = { git = "https://github.com/meilisearch/milli.git", rev = "b7b23cd" } milli = { git = "https://github.com/meilisearch/milli.git", rev = "2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c" }
mime = "0.3.16" mime = "0.3.16"
once_cell = "1.5.2" once_cell = "1.5.2"
parking_lot = "0.11.1" parking_lot = "0.11.1"

View File

@ -1,7 +1,7 @@
mod search; mod search;
mod updates; mod updates;
use std::collections::HashSet; use std::collections::{BTreeSet, HashSet};
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
@ -51,11 +51,24 @@ impl Index {
.map(|c| c.to_string()) .map(|c| c.to_string())
.collect(); .collect();
let stop_words = self
.stop_words(&txn)?
.map(|stop_words| -> anyhow::Result<BTreeSet<_>> {
Ok(stop_words
.stream()
.into_strs()?
.into_iter()
.collect())
})
.transpose()?
.unwrap_or_else(BTreeSet::new);
Ok(Settings { Ok(Settings {
displayed_attributes: Some(Some(displayed_attributes)), displayed_attributes: Some(Some(displayed_attributes)),
searchable_attributes: Some(Some(searchable_attributes)), searchable_attributes: Some(Some(searchable_attributes)),
attributes_for_faceting: Some(Some(faceted_attributes)), attributes_for_faceting: Some(Some(faceted_attributes)),
ranking_rules: Some(Some(criteria)), ranking_rules: Some(Some(criteria)),
stop_words: Some(Some(stop_words)),
}) })
} }

View File

@ -1,4 +1,4 @@
use std::collections::HashMap; use std::collections::{BTreeSet, HashMap};
use std::io; use std::io;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
@ -44,8 +44,12 @@ pub struct Settings {
)] )]
pub ranking_rules: Option<Option<Vec<String>>>, pub ranking_rules: Option<Option<Vec<String>>>,
// TODO we are missing the stopWords, synonyms and distinctAttribute for the GET settings #[serde(
// request default,
deserialize_with = "deserialize_some",
skip_serializing_if = "Option::is_none"
)]
pub stop_words: Option<Option<BTreeSet<String>>>,
} }
impl Settings { impl Settings {
@ -55,6 +59,7 @@ impl Settings {
searchable_attributes: Some(None), searchable_attributes: Some(None),
attributes_for_faceting: Some(None), attributes_for_faceting: Some(None),
ranking_rules: Some(None), ranking_rules: Some(None),
stop_words: Some(None),
} }
} }
} }
@ -170,6 +175,14 @@ impl Index {
} }
} }
// We transpose the settings JSON struct into a real setting update.
if let Some(ref stop_words) = settings.stop_words {
match stop_words {
Some(stop_words) => builder.set_stop_words(stop_words.clone()),
_ => builder.reset_stop_words(),
}
}
let result = builder let result = builder
.execute(|indexing_step, update_id| info!("update {}: {:?}", update_id, indexing_step)); .execute(|indexing_step, update_id| info!("update {}: {:?}", update_id, indexing_step));

View File

@ -1,43 +0,0 @@
use actix_web::{web, HttpResponse, get};
use crate::error::{Error, ResponseError};
use crate::helpers::Authentication;
use crate::make_update_delete_routes;
use crate::Data;
#[get(
"/indexes/{index_uid}/settings/attributes-for-faceting",
wrap = "Authentication::Private"
)]
async fn get(
data: web::Data<Data>,
index_uid: web::Path<String>,
) -> Result<HttpResponse, ResponseError> {
let index = data
.db
.load()
.open_index(&index_uid.as_ref())
.ok_or(Error::index_not_found(&index_uid.as_ref()))?;
let attributes_for_faceting = data.db.load().main_read::<_, _, ResponseError>(|reader| {
let schema = index.main.schema(reader)?;
let attrs = index.main.attributes_for_faceting(reader)?;
let attr_names = match (&schema, &attrs) {
(Some(schema), Some(attrs)) => attrs
.iter()
.filter_map(|&id| schema.name(id))
.map(str::to_string)
.collect(),
_ => vec![],
};
Ok(attr_names)
})?;
Ok(HttpResponse::Ok().json(attributes_for_faceting))
}
make_update_delete_routes!(
"/indexes/{index_uid}/settings/attributes-for-faceting",
Vec<String>,
attributes_for_faceting
);

View File

@ -1,25 +0,0 @@
use std::collections::HashSet;
use actix_web::{web, HttpResponse, get};
use crate::error::{Error, ResponseError};
use crate::helpers::Authentication;
use crate::make_update_delete_routes;
use crate::Data;
#[get(
"/indexes/{index_uid}/settings/displayed-attributes",
wrap = "Authentication::Private"
)]
async fn get(
data: web::Data<Data>,
index_uid: web::Path<String>,
) -> Result<HttpResponse, ResponseError> {
todo!()
}
make_update_delete_routes!(
"/indexes/{index_uid}/settings/displayed-attributes",
HashSet<String>,
displayed_attributes
);

View File

@ -91,6 +91,12 @@ make_setting_route!(
searchable_attributes searchable_attributes
); );
make_setting_route!(
"/indexes/{index_uid}/settings/stop-words",
std::collections::BTreeSet<String>,
stop_words
);
//make_setting_route!( //make_setting_route!(
//"/indexes/{index_uid}/settings/distinct-attribute", //"/indexes/{index_uid}/settings/distinct-attribute",
//String, //String,
@ -122,7 +128,8 @@ macro_rules! create_services {
create_services!( create_services!(
attributes_for_faceting, attributes_for_faceting,
displayed_attributes, displayed_attributes,
searchable_attributes searchable_attributes,
stop_words
); );
#[post("/indexes/{index_uid}/settings", wrap = "Authentication::Private")] #[post("/indexes/{index_uid}/settings", wrap = "Authentication::Private")]

View File

@ -1,34 +0,0 @@
use actix_web::{web, HttpResponse, get};
use crate::data::get_indexed_attributes;
use crate::error::{Error, ResponseError};
use crate::helpers::Authentication;
use crate::make_update_delete_routes;
use crate::Data;
#[get(
"/indexes/{index_uid}/settings/searchable-attributes",
wrap = "Authentication::Private"
)]
async fn get(
data: web::Data<Data>,
index_uid: web::Path<String>,
) -> Result<HttpResponse, ResponseError> {
let index = data
.db
.load()
.open_index(&index_uid.as_ref())
.ok_or(Error::index_not_found(&index_uid.as_ref()))?;
let reader = data.db.load().main_read_txn()?;
let schema = index.main.schema(&reader)?;
let searchable_attributes: Option<Vec<String>> = schema.as_ref().map(get_indexed_attributes);
Ok(HttpResponse::Ok().json(searchable_attributes))
}
make_update_delete_routes!(
"/indexes/{index_uid}/settings/searchable-attributes",
Vec<String>,
searchable_attributes
);

View File

@ -1,33 +0,0 @@
use std::collections::BTreeSet;
use crate::make_update_delete_routes;
use actix_web::{web, HttpResponse, get};
use crate::error::{Error, ResponseError};
use crate::helpers::Authentication;
use crate::Data;
#[get(
"/indexes/{index_uid}/settings/stop-words",
wrap = "Authentication::Private"
)]
async fn get(
data: web::Data<Data>,
index_uid: web::Path<String>,
) -> Result<HttpResponse, ResponseError> {
let index = data
.db
.load()
.open_index(&index_uid.as_ref())
.ok_or(Error::index_not_found(&index_uid.as_ref()))?;
let reader = data.db.load().main_read_txn()?;
let stop_words = index.main.stop_words(&reader)?;
Ok(HttpResponse::Ok().json(stop_words))
}
make_update_delete_routes!(
"/indexes/{index_uid}/settings/stop-words",
BTreeSet<String>,
stop_words
);

View File

@ -16,21 +16,21 @@ async fn get_settings() {
let (response, code) = index.settings().await; let (response, code) = index.settings().await;
assert_eq!(code, 200); assert_eq!(code, 200);
let settings = response.as_object().unwrap(); let settings = response.as_object().unwrap();
assert_eq!(settings.keys().len(), 4); assert_eq!(settings.keys().len(), 5);
assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["displayedAttributes"], json!(["*"]));
assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"]));
assert_eq!(settings["attributesForFaceting"], json!({})); assert_eq!(settings["attributesForFaceting"], json!({}));
assert_eq!( assert_eq!(
settings["rankingRules"], settings["rankingRules"],
json!([ json!([
"typo",
"words", "words",
"typo",
"proximity", "proximity",
"attribute", "attribute",
"wordsPosition",
"exactness" "exactness"
]) ])
); );
assert_eq!(settings["stopWords"], json!([]));
} }
#[actix_rt::test] #[actix_rt::test]
@ -78,13 +78,14 @@ async fn reset_all_settings() {
let server = Server::new().await; let server = Server::new().await;
let index = server.index("test"); let index = server.index("test");
index index
.update_settings(json!({"displayedAttributes": ["foo"], "searchableAttributes": ["bar"]})) .update_settings(json!({"displayedAttributes": ["foo"], "searchableAttributes": ["bar"], "stopWords": ["the"] }))
.await; .await;
index.wait_update_id(0).await; index.wait_update_id(0).await;
let (response, code) = index.settings().await; let (response, code) = index.settings().await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["displayedAttributes"], json!(["foo"])); assert_eq!(response["displayedAttributes"], json!(["foo"]));
assert_eq!(response["searchableAttributes"], json!(["bar"])); assert_eq!(response["searchableAttributes"], json!(["bar"]));
assert_eq!(response["stopWords"], json!(["the"]));
index.delete_settings().await; index.delete_settings().await;
index.wait_update_id(1).await; index.wait_update_id(1).await;
@ -93,6 +94,7 @@ async fn reset_all_settings() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["displayedAttributes"], json!(["*"])); assert_eq!(response["displayedAttributes"], json!(["*"]));
assert_eq!(response["searchableAttributes"], json!(["*"])); assert_eq!(response["searchableAttributes"], json!(["*"]));
assert_eq!(response["stopWords"], json!([]));
} }
#[actix_rt::test] #[actix_rt::test]
@ -166,5 +168,6 @@ macro_rules! test_setting_routes {
test_setting_routes!( test_setting_routes!(
attributes_for_faceting, attributes_for_faceting,
displayed_attributes, displayed_attributes,
searchable_attributes searchable_attributes,
stop_words
); );