mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-14 23:37:29 +01:00
45636d315c
3670: Fix addition deletion bug r=irevoire a=irevoire The first commit of this PR is a revert of https://github.com/meilisearch/meilisearch/pull/3667. It re-enables the auto-batching of addition and deletion of tasks. No new changes have been introduced outside of `milli`. So all the changes you see on the autobatcher have actually already been reviewed. It fixes https://github.com/meilisearch/meilisearch/issues/3440. ### What was happening? The issue was that the `external_documents_ids` generated in the `transform` were used in a very strange way that wasn’t compatible with the deletion of documents. Instead of doing a clear merge between the external document IDs of the DB and the ones returned by the transform + writing it on disk, we were doing some weird tricks with the soft-deleted to avoid writing the fst on disk as much as possible. The new algorithm may be a bit slower but is way more straightforward and doesn’t change depending on whether the soft deletion was used or not. Here is a list of the changes introduced: 1. We now make a clear distinction between the `new_external_documents_ids` coming from the transform and only held in RAM and the `external_documents_ids` coming from the DB. 2. The `new_external_documents_ids` (coming out of the transform) are now represented as an `fst`. We don't need to struggle with the hard, soft distinction + the soft_deleted => That's easier to understand. 3. When indexing documents, we merge the `external_documents_ids` coming from the DB and the `new_external_documents_ids` coming from the transform. ### Other things introduced in this PR Since we constantly have to write small, very specialized fuzzers for this kind of bug, we decided to push the one used to reproduce this bug. It's not perfect, but it's easy to improve in the future. It'll also run for as long as possible on every merge on the main branch. Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>
103 lines
2.6 KiB
TOML
103 lines
2.6 KiB
TOML
[package]
# Keys written as `<key>.workspace = true` are inherited from the workspace
# manifest rather than set here.
name = "milli"
version.workspace = true
authors.workspace = true
# NOTE(review): the edition is set explicitly for this crate and the
# workspace-inherited form is left commented out below; presumably the
# workspace edition differs from 2018 -- confirm before switching.
edition = "2018"
# edition.workspace = true
homepage.workspace = true
license.workspace = true
publish = false
readme.workspace = true
description.workspace = true

[dependencies]
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.4.0"
byteorder = "1.4.3"
# Default features are disabled here; this manifest's [features] table
# re-exports charabia's features (`charabia/default`, `charabia/chinese`, ...).
charabia = { version = "0.7.2", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.8"
csv = "1.2.1"
deserr = "0.5.0"
either = "1.8.1"
filter-parser = { path = "../filter-parser" }
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.4", default-features = false, features = [
    "tempfile",
] }
# Pulled from the meilisearch/heed git repository at a pinned tag.
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-features = false, features = [
    "lmdb",
    "sync-read-txn",
] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10"
obkv = "0.2.0"
once_cell = "1.17.1"
ordered-float = "3.6.0"
rayon = "1.7.0"
roaring = "0.10.1"
rstar = { version = "0.10.0", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.10.0"
smartstring = "1.0.1"
tempfile = "3.5.0"
thiserror = "1.0.40"
time = { version = "0.3.20", features = [
    "serde-well-known",
    "formatting",
    "parsing",
    "macros",
] }
uuid = { version = "1.3.1", features = ["v4"] }

# documents words self-join
itertools = "0.10.5"

# logging
log = "0.4.17"
logging_timer = "1.1.0"

# Crates declared for development builds only (tests, examples, benchmarks).
[dev-dependencies]
big_s = "1.0.2"
insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
mimalloc = { version = "0.1.29", default-features = false }
rand = { version = "0.8.5", features = ["small_rng"] }

[features]
# Enable charabia's default feature set.
all-tokenizations = ["charabia/default"]

# Use POSIX semaphores instead of SysV semaphores in LMDB.
# For more information on this feature, see heed's Cargo.toml.
lmdb-posix-sem = ["heed/posix-sem"]

# Allow Chinese specialized tokenization.
chinese = ["charabia/chinese"]

# Allow Hebrew specialized tokenization.
hebrew = ["charabia/hebrew"]

# Allow Japanese specialized tokenization.
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]

# Allow Korean specialized tokenization.
korean = ["charabia/korean"]

# Allow Thai specialized tokenization.
thai = ["charabia/thai"]

# Allow Greek specialized tokenization.
greek = ["charabia/greek"]