add a way to provide primary_key or autogenerate documents ids

This commit is contained in:
tamo 2021-05-25 17:55:45 +02:00 committed by Tamo
parent 06c414a753
commit 4536dfccd0
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
3 changed files with 338 additions and 16 deletions

346
Cargo.lock generated
View File

@ -122,6 +122,20 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "benchmarks"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes 1.0.1",
"convert_case",
"criterion",
"flate2",
"heed",
"milli",
"reqwest",
]
[[package]]
name = "big_s"
version = "1.0.2"
@ -327,6 +341,12 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cow-utils"
version = "0.1.2"
@ -506,6 +526,15 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding_rs"
version = "0.8.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065"
dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "fake-simd"
version = "0.1.2"
@ -750,12 +779,31 @@ dependencies = [
"http",
"indexmap",
"slab",
"tokio",
"tokio-util",
"tokio 0.2.25",
"tokio-util 0.3.1",
"tracing",
"tracing-futures",
]
[[package]]
name = "h2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [
"bytes 1.0.1",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http",
"indexmap",
"slab",
"tokio 1.6.0",
"tokio-util 0.6.7",
"tracing",
]
[[package]]
name = "half"
version = "1.7.1"
@ -893,6 +941,17 @@ dependencies = [
"http",
]
[[package]]
name = "http-body"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [
"bytes 1.0.1",
"http",
"pin-project-lite 0.2.6",
]
[[package]]
name = "http-ui"
version = "0.2.1"
@ -922,7 +981,7 @@ dependencies = [
"stderrlog",
"structopt",
"tempfile",
"tokio",
"tokio 0.2.25",
"warp",
]
@ -960,20 +1019,59 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.2.7",
"http",
"http-body",
"http-body 0.3.1",
"httparse",
"httpdate",
"itoa",
"pin-project 1.0.5",
"socket2",
"tokio",
"socket2 0.3.19",
"tokio 0.2.25",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1"
dependencies = [
"bytes 1.0.1",
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.3",
"http",
"http-body 0.4.2",
"httparse",
"httpdate",
"itoa",
"pin-project 1.0.5",
"socket2 0.4.0",
"tokio 1.6.0",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64"
dependencies = [
"futures-util",
"hyper 0.14.5",
"log",
"rustls",
"tokio 1.6.0",
"tokio-rustls",
"webpki",
]
[[package]]
name = "idna"
version = "0.2.2"
@ -1029,6 +1127,12 @@ dependencies = [
"libc",
]
[[package]]
name = "ipnet"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
[[package]]
name = "itertools"
version = "0.9.0"
@ -1261,7 +1365,6 @@ dependencies = [
"bstr",
"byteorder",
"chrono",
"criterion",
"crossbeam-channel",
"csv",
"either",
@ -1343,6 +1446,19 @@ dependencies = [
"winapi 0.2.8",
]
[[package]]
name = "mio"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
dependencies = [
"libc",
"log",
"miow 0.3.7",
"ntapi",
"winapi 0.3.9",
]
[[package]]
name = "mio-named-pipes"
version = "0.1.7"
@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656"
dependencies = [
"log",
"mio",
"mio 0.6.23",
"miow 0.3.7",
"winapi 0.3.9",
]
@ -1363,7 +1479,7 @@ checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0"
dependencies = [
"iovec",
"libc",
"mio",
"mio 0.6.23",
]
[[package]]
@ -1441,6 +1557,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "ntapi"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "num-integer"
version = "0.1.44"
@ -1956,12 +2081,62 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "reqwest"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
dependencies = [
"base64 0.13.0",
"bytes 1.0.1",
"encoding_rs",
"futures-core",
"futures-util",
"http",
"http-body 0.4.2",
"hyper 0.14.5",
"hyper-rustls",
"ipnet",
"js-sys",
"lazy_static",
"log",
"mime",
"percent-encoding",
"pin-project-lite 0.2.6",
"rustls",
"serde",
"serde_urlencoded 0.7.0",
"tokio 1.6.0",
"tokio-rustls",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
"winreg",
]
[[package]]
name = "retain_mut"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1"
[[package]]
name = "ring"
version = "0.16.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
dependencies = [
"cc",
"libc",
"once_cell",
"spin",
"untrusted",
"web-sys",
"winapi 0.3.9",
]
[[package]]
name = "roaring"
version = "0.6.6"
@ -1982,6 +2157,19 @@ dependencies = [
"semver",
]
[[package]]
name = "rustls"
version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7"
dependencies = [
"base64 0.13.0",
"log",
"ring",
"sct",
"webpki",
]
[[package]]
name = "ryu"
version = "1.0.5"
@ -2015,6 +2203,16 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sct"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce"
dependencies = [
"ring",
"untrusted",
]
[[package]]
name = "search"
version = "0.2.1"
@ -2108,6 +2306,18 @@ dependencies = [
"url",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9"
dependencies = [
"form_urlencoded",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "sha-1"
version = "0.8.2"
@ -2193,6 +2403,22 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "socket2"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "static_assertions"
version = "1.1.0"
@ -2386,7 +2612,7 @@ dependencies = [
"lazy_static",
"libc",
"memchr",
"mio",
"mio 0.6.23",
"mio-named-pipes",
"mio-uds",
"num_cpus",
@ -2397,6 +2623,21 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "tokio"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg",
"bytes 1.0.1",
"libc",
"memchr",
"mio 0.7.11",
"num_cpus",
"pin-project-lite 0.2.6",
]
[[package]]
name = "tokio-macros"
version = "0.2.6"
@ -2408,6 +2649,17 @@ dependencies = [
"syn 1.0.64",
]
[[package]]
name = "tokio-rustls"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6"
dependencies = [
"rustls",
"tokio 1.6.0",
"webpki",
]
[[package]]
name = "tokio-tungstenite"
version = "0.11.0"
@ -2417,7 +2669,7 @@ dependencies = [
"futures-util",
"log",
"pin-project 0.4.27",
"tokio",
"tokio 0.2.25",
"tungstenite",
]
@ -2432,7 +2684,21 @@ dependencies = [
"futures-sink",
"log",
"pin-project-lite 0.1.12",
"tokio",
"tokio 0.2.25",
]
[[package]]
name = "tokio-util"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592"
dependencies = [
"bytes 1.0.1",
"futures-core",
"futures-sink",
"log",
"pin-project-lite 0.2.6",
"tokio 1.6.0",
]
[[package]]
@ -2578,6 +2844,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "untrusted"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "url"
version = "2.2.1"
@ -2654,7 +2926,7 @@ dependencies = [
"futures",
"headers",
"http",
"hyper",
"hyper 0.13.10",
"log",
"mime",
"mime_guess",
@ -2663,8 +2935,8 @@ dependencies = [
"scoped-tls",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"serde_urlencoded 0.6.1",
"tokio 0.2.25",
"tokio-tungstenite",
"tower-service",
"tracing",
@ -2691,6 +2963,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe"
dependencies = [
"cfg-if 1.0.0",
"serde",
"serde_json",
"wasm-bindgen-macro",
]
@ -2709,6 +2983,18 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.72"
@ -2748,6 +3034,25 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki"
version = "0.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea"
dependencies = [
"ring",
"untrusted",
]
[[package]]
name = "webpki-roots"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940"
dependencies = [
"webpki",
]
[[package]]
name = "whatlang"
version = "0.9.0"
@ -2800,6 +3105,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "winreg"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "ws2_32-sys"
version = "0.2.1"

View File

@ -48,6 +48,7 @@ const BASE_CONF: Conf = Conf {
"Notstandskomitee ", // 4
],
configure: base_conf,
primary_key: Some("id"),
..Conf::BASE
};

View File

@ -56,6 +56,10 @@ pub fn base_setup(conf: &Conf) -> Index {
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap();
if let Some(primary_key) = conf.primary_key {
let mut wtxn = index.write_txn().unwrap();
index.put_primary_key(&mut wtxn, primary_key).unwrap();
}
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
@ -78,6 +82,9 @@ pub fn base_setup(conf: &Conf) -> Index {
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
if let None = conf.primary_key {
builder.enable_autogenerate_docids();
}
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(conf.dataset)