MeiliSearch/milli/Cargo.toml
bors[bot] 25123af3b8
Merge #436
436: Speed up the word prefix databases computation time r=Kerollmops a=Kerollmops

This PR depends on the fixes done in #431 and must be merged after it.

In this PR we will bring the `WordPrefixPairProximityDocids`, `WordPrefixDocids`, and `WordPrefixPositionDocids` update structures to a new era, a better era, where computing the word prefix pair proximities costs far fewer CPU cycles, an era where this update structure can use the previously computed set of new word docids from the newly indexed batch of documents.

---

The `WordPrefixPairProximityDocids` is an update structure, which means that it is an object that we feed with some parameters and which modifies the LMDB database of an index when asked to. This structure specifically computes the list of word prefix pair proximities, which corresponds to a list of pairs of words associated with a proximity (the distance between both words) where the second word is not a word but a prefix, e.g. `s`, `se`, `a`. Each word prefix pair proximity is associated with the list of document ids that contain the word and the prefix at the given proximity.

The origin of the performance issue that this struct brings is that it starts its job from the beginning: it clears the LMDB database before rewriting everything from scratch, using the other LMDB databases to achieve that. I hope you understand that this is absolutely not an optimized way of doing things.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
2022-02-16 15:41:14 +00:00

58 lines
1.6 KiB
TOML

# Package metadata for the `milli` crate.
[package]
name = "milli"
version = "0.23.0"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
# Runtime dependencies, kept in alphabetical order.
[dependencies]
bimap = { version = "0.6.1", features = ["serde"] }
bincode = "1.3.3"
bstr = "0.2.15"
byteorder = "1.4.2"
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.1"
csv = "1.1.6"
either = "1.6.1"
filter-parser = { path = "../filter-parser" }
flate2 = "1.0.20"
fst = "0.4.5"
fxhash = "0.2.1"
geoutils = "0.4.1"
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
human_format = "1.0.3"
itertools = "0.10.0"  # documents words self-join
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.4"
log = "0.4.14"  # logging
logging_timer = "1.0.0"  # logging
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
memmap2 = "0.5.0"
obkv = "0.2.0"
once_cell = "1.5.2"
ordered-float = "2.1.1"
rayon = "1.5.0"
roaring = "0.6.6"
rstar = { version = "0.9.1", features = ["serde"] }
serde = { version = "1.0.123", features = ["derive"] }
serde_json = { version = "1.0.62", features = ["preserve_order"] }
slice-group-by = "0.2.6"
smallstr = { version = "0.2.0", features = ["serde"] }
smallvec = "1.6.1"
tempfile = "3.2.0"
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
uuid = { version = "0.8.2", features = ["v4"] }
# Dependencies that Cargo uses only when building tests, examples and benchmarks.
[dev-dependencies]
big_s = "1.0.2"
maplit = "1.0.2"
rand = "0.8.3"
[features]
# The default feature set is empty.
default = []