diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml
index 043715224..452776e38 100644
--- a/.github/workflows/publish-apt-brew-pkg.yml
+++ b/.github/workflows/publish-apt-brew-pkg.yml
@@ -53,5 +53,6 @@ jobs:
         uses: mislav/bump-homebrew-formula-action@v2
         with:
           formula-name: meilisearch
+          formula-path: Formula/m/meilisearch.rb
       env:
         COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
diff --git a/Cargo.lock b/Cargo.lock
index 339826383..652f9daf2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1444,6 +1444,7 @@ dependencies = [
  "insta",
  "nom",
  "nom_locate",
+ "unescaper",
 ]
 
 [[package]]
@@ -4180,6 +4181,15 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
 
+[[package]]
+name = "unescaper"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
+dependencies = [
+ "thiserror",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml
index 58111ee08..d9b47f638 100644
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"
 
 [dev-dependencies]
 insta = "1.29.0"
diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs
index b9d14a271..c71a3aaee 100644
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
     MisusedGeoRadius,
     MisusedGeoBoundingBox,
     InvalidPrimary,
+    InvalidEscapedNumber,
     ExpectedEof,
     ExpectedValue(ExpectedValueKind),
     MalformedValue,
@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
                 let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                 writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
             }
+            ErrorKind::InvalidEscapedNumber => {
+                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+            }
             ErrorKind::ExpectedEof => {
                 writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
             }
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs
index 11ffbb148..5760c8865 100644
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {
 
 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
     use super::*;
 
     /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -556,14 +558,22 @@ pub mod tests {
         unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
     }
 
+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequences
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
     #[test]
     fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
         // Test equal
         insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
         insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs
index 735352fc3..63d5ac384 100644
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
         })
     })?;
 
-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }
 
 fn is_value_component(c: char) -> bool {
@@ -318,17 +335,17 @@ pub mod test {
             ("\"cha'nnel\"", "cha'nnel", false),
             ("I'm tamo", "I", false),
             // escaped thing but not quote
-            (r#""\\""#, r#"\\"#, false),
-            (r#""\\\\\\""#, r#"\\\\\\"#, false),
-            (r#""aa\\aa""#, r#"aa\\aa"#, false),
+            (r#""\\""#, r#"\"#, true),
+            (r#""\\\\\\""#, r#"\\\"#, true),
+            (r#""aa\\aa""#, r#"aa\aa"#, true),
             // with double quote
             (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
             (r#""\"\"""#, r#""""#, true),
             // with simple quote
             (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
             (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
             (r#"'\'\''"#, r#"''"#, true),
         ];
@@ -350,7 +367,14 @@ pub mod test {
                 "Filter `{}` was not supposed to be escaped",
                 input
             );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
         }
     }
diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index fb865a98b..ccdcbcbb6 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
         op: IndexOperation,
         must_create_index: bool,
     },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
     IndexCreation {
         index_uid: String,
         primary_key: Option<String>,
@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
         documents: Vec<Vec<String>>,
         tasks: Vec<Task>,
     },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
     DocumentClear {
         index_uid: String,
         tasks: Vec<Task>,
     },
@@ -155,7 +155,6 @@ impl Batch {
             | Batch::TaskDeletion(task)
             | Batch::Dump(task)
             | Batch::IndexCreation { task, .. }
-            | Batch::IndexDocumentDeletionByFilter { task, .. }
             | Batch::IndexUpdate { task, .. } => vec![task.uid],
             Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                 tasks.iter().map(|task| task.uid).collect()
@@ -167,6 +166,7 @@ impl Batch {
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
                 }
+                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                 IndexOperation::SettingsAndDocumentOperation {
                     document_import_tasks: tasks,
                     settings_tasks: other,
@@ -194,8 +194,7 @@ impl Batch {
             IndexOperation { op, .. } => Some(op.index_uid()),
             IndexCreation { index_uid, .. }
             | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. }
-            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. } => Some(index_uid),
         }
     }
 }
@@ -205,6 +204,7 @@ impl IndexOperation {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
             | IndexOperation::DocumentDeletion { index_uid, .. }
+            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
             | IndexOperation::DocumentClear { index_uid, .. }
             | IndexOperation::Settings { index_uid, .. }
             | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,9 +239,12 @@ impl IndexScheduler {
                 let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                 match &task.kind {
                     KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
-                            index_uid: index_uid.clone(),
-                            task,
+                        Ok(Some(Batch::IndexOperation {
+                            op: IndexOperation::IndexDocumentDeletionByFilter {
+                                index_uid: index_uid.clone(),
+                                task,
+                            },
+                            must_create_index: false,
                         }))
                     }
                     _ => unreachable!(),
@@ -896,51 +899,6 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
-            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let (index_uid, filter) =
-                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
-                        &task.kind
-                    {
-                        (index_uid, filter_expr)
-                    } else {
-                        unreachable!()
-                    };
-                let index = {
-                    let rtxn = self.env.read_txn()?;
-                    self.index_mapper.index(&rtxn, index_uid)?
-                };
-                let deleted_documents = delete_document_by_filter(filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
             Batch::IndexCreation { index_uid, primary_key, task } => {
                 let wtxn = self.env.write_txn()?;
                 if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1299,6 +1257,47 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
+            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let filter =
+                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
+                        &task.kind
+                    {
+                        filter_expr
+                    } else {
+                        unreachable!()
+                    };
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
             IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                 let indexer_config = self.index_mapper.indexer_config();
                 let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1498,23 +1497,22 @@
     }
 }
 
-fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
+fn delete_document_by_filter<'a>(
+    wtxn: &mut RwTxn<'a, '_>,
+    filter: &serde_json::Value,
+    index: &'a Index,
+) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
     Ok(if let Some(filter) = filter {
-        let mut wtxn = index.write_txn()?;
-
-        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
+        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
             milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                 Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
             }
             e => e.into(),
         })?;
-        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
         delete_operation.delete_documents(&candidates);
-        let deleted_documents =
-            delete_operation.execute().map(|result| result.deleted_documents)?;
-        wtxn.commit()?;
-        deleted_documents
+        delete_operation.execute().map(|result| result.deleted_documents)?
     } else {
         0
    })
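Note on the index-scheduler change above: `delete_document_by_filter` no longer opens and commits its own write transaction; it borrows the batch's `index_wtxn`, so a deletion-by-filter now commits (or aborts) together with the rest of its batch. A minimal, standalone sketch of this caller-owns-the-transaction pattern (toy types only, not the real heed/`IndexScheduler` API):

    // Toy stand-ins for heed's RwTxn and milli's Index; the point is only
    // that the operation borrows a transaction it does not own or commit.
    struct WriteTxn {
        pending: Vec<String>,
    }

    impl WriteTxn {
        fn commit(self) -> Result<(), String> {
            println!("committing {} write(s) atomically", self.pending.len());
            Ok(())
        }
    }

    // Mirrors the new signature: take `&mut WriteTxn` and never commit here;
    // the batch runner decides when everything lands.
    fn delete_by_filter(wtxn: &mut WriteTxn, filter: &str) -> Result<u64, String> {
        wtxn.pending.push(format!("delete where {filter}"));
        Ok(2) // pretend two documents matched the filter
    }

    fn main() -> Result<(), String> {
        let mut wtxn = WriteTxn { pending: Vec::new() };
        let deleted = delete_by_filter(&mut wtxn, "color = blue")?;
        // ...the other operations of the same batch would reuse `wtxn` here...
        wtxn.commit()?; // single commit point owned by the caller
        assert_eq!(deleted, 2);
        Ok(())
    }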
diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs
index c4e204c09..79e619705 100644
--- a/meilisearch/src/routes/swap_indexes.rs
+++ b/meilisearch/src/routes/swap_indexes.rs
@@ -60,8 +60,7 @@ pub async fn swap_indexes(
     }
 
     let task = KindWithContent::IndexSwap { swaps };
-
-    let task = index_scheduler.register(task)?;
-    let task: SummarizedTaskView = task.into();
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
     Ok(HttpResponse::Accepted().json(task))
 }
diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs
index 760ebbaa8..b3f04aea0 100644
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
         )
         .await;
     index.wait_task(1).await;
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 4,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 3,
+        "id": 4
+      }
+    }
+    "###);
+
     let (response, code) =
         index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
     snapshot!(code, @"202 Accepted");
@@ -188,6 +201,18 @@
         }
         "###);
 
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 2,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 2
+      }
+    }
+    "###);
+
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"
@@ -241,6 +266,18 @@
         }
         "###);
 
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 1,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 1
+      }
+    }
+    "###);
+
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"
diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs
index 43304b223..44051eb39 100644
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -1104,3 +1104,59 @@ async fn camelcased_words() {
         })
         .await;
 }
+
+#[actix_rt::test]
+async fn simple_search_with_strange_synonyms() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
+    let r = index.wait_task(0).await;
+    meili_snap::snapshot!(r["status"], @r###""succeeded""###);
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "How to train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "How & train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "to"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+}
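Note on the `swap_indexes` route above: `IndexScheduler::register` performs synchronous disk writes (the task queue lives in LMDB), so it is moved onto tokio's blocking thread pool instead of running on a reactor thread; the double `?` unwraps the `JoinError` first and the scheduler's own error second. A minimal sketch of the same pattern (hypothetical `Scheduler` type; assumes only `tokio` with the `rt-multi-thread` and `macros` features):

    use std::sync::Arc;

    // Hypothetical stand-in for the IndexScheduler: register() blocks on disk I/O.
    struct Scheduler;

    impl Scheduler {
        fn register(&self) -> Result<u32, std::io::Error> {
            // imagine a synchronous LMDB write transaction here
            Ok(42)
        }
    }

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        let scheduler = Arc::new(Scheduler);
        let worker = Arc::clone(&scheduler);
        // spawn_blocking keeps the async executor free; the first `?`
        // propagates a JoinError (panic or cancellation), the second one
        // the scheduler's own error, exactly like the `.await??` above.
        let task_id = tokio::task::spawn_blocking(move || worker.register()).await??;
        assert_eq!(task_id, 42);
        Ok(())
    }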
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 72e155b3e..5d61de0f4 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -418,19 +418,11 @@ impl<'t> Matcher<'t, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // compute the best interval to crop around.
-                    let matches = match format_options.crop {
-                        Some(crop_size) if crop_size > 0 => {
-                            self.find_best_match_interval(matches, crop_size)
-                        }
-                        _ => matches,
-                    };
-
                     // If the text has to be cropped,
                     // crop around the best interval.
                     let (byte_start, byte_end) = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
+                            let matches = self.find_best_match_interval(matches, crop_size);
                             self.crop_bounds(tokens, matches, crop_size)
                         }
                         _ => (0, self.text.len()),
@@ -450,6 +442,11 @@ impl<'t> Matcher<'t, '_> {
                     for m in matches {
                         let token = &tokens[m.token_position];
 
+                        // skip matches out of the crop window.
+                        if token.byte_start < byte_start || token.byte_end > byte_end {
+                            continue;
+                        }
+
                         if byte_index < token.byte_start {
                             formatted.push(&self.text[byte_index..token.byte_start]);
                         }
@@ -800,6 +797,37 @@ mod tests {
         );
     }
 
+    #[test]
+    fn format_highlight_crop_phrase_query() {
+        //! testing: https://github.com/meilisearch/meilisearch/issues/3975
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+            ]))
+            .unwrap();
+        let rtxn = temp_index.read_txn().unwrap();
+
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text);
+        // should return 10 words with a marker at the start as well as at the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+        );
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
+        let mut matcher = builder.build(text);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+        );
+    }
+
     #[test]
     fn smaller_crop_size() {
         //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index ac041a8b0..1c24a0fcf 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -226,9 +226,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
         .skip_while(|token| token.is_separator())
-        .scan((0, None), |(offset, prev_kind), token| {
+        .scan((0, None), |(offset, prev_kind), mut token| {
             match token.kind {
-                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                     *offset += match *prev_kind {
                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                         Some(_) => 1,
@@ -244,7 +244,7 @@ fn process_tokens<'a>(
                 {
                     *prev_kind = Some(token.kind);
                 }
-                _ => (),
+                _ => token.kind = TokenKind::Unknown,
             }
             Some((*offset, token))
         })
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 605ef8dd8..c3a023e71 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -59,7 +59,13 @@ pub(crate) fn data_from_obkv_documents(
     original_obkv_chunks
         .par_bridge()
         .map(|original_documents_chunk| {
-            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+            send_original_documents_data(
+                original_documents_chunk,
+                indexer,
+                lmdb_writer_sx.clone(),
+                vectors_field_id,
+                primary_key_id,
+            )
         })
         .collect::<Result<()>>()?;
 
@@ -76,7 +82,6 @@ pub(crate) fn data_from_obkv_documents(
                 &faceted_fields,
                 primary_key_id,
                 geo_fields_ids,
-                vectors_field_id,
                 &stop_words,
                 &allowed_separators,
                 &dictionary,
@@ -265,11 +270,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<File>>,
+    indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 
+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
     // TODO: create a custom internal error
     lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
     Ok(())
@@ -291,7 +318,6 @@ fn send_and_extract_flattened_documents_data(
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
     allowed_separators: &Option<&[&str]>,
     dictionary: &Option<&[&str]>,
@@ -322,25 +348,6 @@ fn send_and_extract_flattened_documents_data(
         });
     }
 
-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index f50b81d2d..52aa1113e 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2550,6 +2550,25 @@ mod tests {
         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
     }
 
+    /// Index documents containing different numbers of vectors.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
     #[test]
     fn reproduce_the_bug() {
         /*
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 7e52c04c1..c2c0e9084 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             tokenizer
                 .tokenize(text)
                 .filter_map(|token| {
-                    if token.is_word() {
+                    if token.is_word() && !token.lemma().is_empty() {
                         Some(token.lemma().to_string())
                     } else {
                         None
@@ -608,13 +608,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             for (word, synonyms) in user_synonyms {
                 // Normalize both the word and associated synonyms.
                 let normalized_word = normalize(&tokenizer, word);
-                let normalized_synonyms =
-                    synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
+                let normalized_synonyms: Vec<_> = synonyms
+                    .iter()
+                    .map(|synonym| normalize(&tokenizer, synonym))
+                    .filter(|synonym| !synonym.is_empty())
+                    .collect();
 
                 // Store the normalized synonyms under the normalized word,
                 // merging the possible duplicate words.
-                let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                entry.extend(normalized_synonyms);
+                if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
+                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms.into_iter());
+                }
             }
 
             // Make sure that we don't have duplicate synonyms.
@@ -1422,6 +1427,43 @@ mod tests {
         assert!(result.documents_ids.is_empty());
     }
 
+    #[test]
+    fn thai_synonyms() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        let mut wtxn = index.write_txn().unwrap();
+        // Send 2 documents.
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "name": "ยี่ปุ่น" },
+                    { "name": "ญี่ปุ่น" },
+                ]),
+            )
+            .unwrap();
+
+        // In the same transaction provide some synonyms
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_synonyms(btreemap! {
+                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
+                });
+            })
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("japanese").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let index = TempIndex::new();
diff --git a/permissive-json-pointer/src/lib.rs b/permissive-json-pointer/src/lib.rs
index 7e5b3371c..c771f3321 100644
--- a/permissive-json-pointer/src/lib.rs
+++ b/permissive-json-pointer/src/lib.rs
@@ -186,12 +186,16 @@ fn create_value(value: &Document, mut selectors: HashSet<&str>) -> Document {
                     let array = create_array(array, &sub_selectors);
                     if !array.is_empty() {
                         new_value.insert(key.to_string(), array.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Array(vec![]));
                     }
                 }
                 Value::Object(object) => {
                     let object = create_value(object, sub_selectors);
                     if !object.is_empty() {
                         new_value.insert(key.to_string(), object.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Object(Map::new()));
                     }
                 }
                 _ => (),
@@ -211,6 +215,8 @@ fn create_array(array: &[Value], selectors: &HashSet<&str>) -> Vec<Value> {
                 let array = create_array(array, selectors);
                 if !array.is_empty() {
                     res.push(array.into());
+                } else {
+                    res.push(Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {
@@ -637,6 +643,24 @@ mod tests {
         );
     }
 
+    #[test]
+    fn empty_array_object_return_empty() {
+        let value: Value = json!({
+            "array": [],
+            "object": {},
+        });
+        let value: &Document = value.as_object().unwrap();
+
+        let res: Value = select_values(value, vec!["array.name", "object.name"]).into();
+        assert_eq!(
+            res,
+            json!({
+                "array": [],
+                "object": {},
+            })
+        );
+    }
+
     #[test]
     fn all_conflict_variation() {
         let value: Value = json!({
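Note on the `permissive-json-pointer` change above: selecting into an empty array or object (e.g. `array.name` when `array` is `[]`) used to drop the key from the output entirely; the new `else` branches keep the empty container, which is what the `empty_array_object_return_empty` test pins down. A standalone sketch of that behavior using plain `serde_json` (simplified: no recursion into non-empty values, unlike the real `create_value`):

    use serde_json::{json, Map, Value};

    // Simplified version of the fixed branch: when a selector reaches a key
    // holding an empty array/object, keep the empty container in the output
    // instead of omitting the key.
    fn select_keeping_empties(doc: &Map<String, Value>, roots: &[&str]) -> Value {
        let mut out = Map::new();
        for root in roots {
            match doc.get(*root) {
                Some(Value::Array(a)) if a.is_empty() => {
                    out.insert(root.to_string(), Value::Array(vec![]));
                }
                Some(Value::Object(o)) if o.is_empty() => {
                    out.insert(root.to_string(), Value::Object(Map::new()));
                }
                // the real crate recurses into non-empty values with the
                // remaining selector components here
                _ => {}
            }
        }
        Value::Object(out)
    }

    fn main() {
        let value = json!({ "array": [], "object": {} });
        let doc = value.as_object().unwrap();
        let res = select_keeping_empties(doc, &["array", "object"]);
        assert_eq!(res, json!({ "array": [], "object": {} }));
    }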