[package]
name = "milli"
# The edition is pinned here rather than inherited from the workspace
# (see the commented-out `edition.workspace` line below).
edition = "2018"
publish = false

version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
# edition.workspace = true
license.workspace = true

[dependencies]
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
# Default features are disabled; language-specific tokenization is opted into
# through the `charabia/*` entries of the [features] table below.
charabia = { version = "0.8.10", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11"
deserr = "0.6.1"
either = { version = "1.9.0", features = ["serde"] }
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.6", default-features = false, features = [
    "rayon",
    "tempfile",
] }
heed = { version = "0.20.1", default-features = false, features = [
    "serde-json",
    "serde-bincode",
    "read-txn-no-tls",
] }
indexmap = { version = "2.1.0", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.7.1"
obkv = "0.2.1"
once_cell = "1.19.0"
ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0"
roaring = "0.10.2"
rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }
slice-group-by = "0.3.1"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.12.0"
smartstring = "1.0.1"
tempfile = "3.9.0"
thiserror = "1.0.56"
time = { version = "0.3.31", features = [
    "serde-well-known",
    "formatting",
    "parsing",
    "macros",
] }
uuid = { version = "1.6.1", features = ["v4"] }

filter-parser = { path = "../filter-parser" }

# documents words self-join
itertools = "0.11.0"

# profiling
puffin = "0.16.0"

csv = "1.3.0"
candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" }
# `default-features` (hyphenated) is the canonical Cargo key; the underscore
# spelling is a deprecated alias.
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
    "onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
    "online",
] }
tiktoken-rs = "0.5.8"
liquid = "0.26.4"
arroy = "0.3.1"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] }
url = "2.5.0"

[dev-dependencies]
mimalloc = { version = "0.1.39", default-features = false }
big_s = "1.0.2"
insta = "1.34.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }

[features]
# Convenience feature enabling the charabia language features listed below.
all-tokenizations = [
    "charabia/chinese",
    "charabia/hebrew",
    "charabia/japanese",
    "charabia/thai",
    "charabia/korean",
    "charabia/greek",
    "charabia/khmer",
    "charabia/vietnamese",
]

# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
lmdb-posix-sem = ["heed/posix-sem"]

# allow chinese specialized tokenization
chinese = ["charabia/chinese"]
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]

# allow hebrew specialized tokenization
hebrew = ["charabia/hebrew"]

# allow japanese specialized tokenization
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]

# allow korean specialized tokenization
korean = ["charabia/korean"]

# allow thai specialized tokenization
thai = ["charabia/thai"]

# allow greek specialized tokenization
greek = ["charabia/greek"]

# allow khmer specialized tokenization
khmer = ["charabia/khmer"]

# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]

# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]

# allow CUDA support (enables the `cuda` feature of candle-core)
cuda = ["candle-core/cuda"]