mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-22 03:07:27 +01:00
Merge #561
561: Enriched documents batch reader r=curquiza a=Kerollmops

~This PR is based on #555 and must be rebased on main after it has been merged to ease the review.~
This PR contains the work in #555 and can be merged on main as soon as it is reviewed and approved.

- [x] Create an `EnrichedDocumentsBatchReader` that contains the external document ids.
- [x] Extract the primary key name and make it accessible in the `EnrichedDocumentsBatchReader`.
- [x] Use the external id from the `EnrichedDocumentsBatchReader` in the `Transform::read_documents`.
- [x] Remove the `update_primary_key` from the _transform.rs_ file.
- [x] Really generate the auto-generated document ids.
- [x] Insert the (auto-generated) document ids in the document while processing it in `Transform::read_documents`.

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
941af58239
@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -169,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -184,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -223,11 +226,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -279,11 +283,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -294,19 +299,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -339,13 +346,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
|
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -377,12 +385,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -415,12 +424,13 @@ fn indexing_wiki(c: &mut Criterion) {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -452,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -468,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -507,11 +519,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -564,12 +577,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents =
|
let documents =
|
||||||
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv");
|
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -581,24 +595,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents =
|
let documents =
|
||||||
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv");
|
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents =
|
let documents =
|
||||||
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv");
|
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -631,12 +647,13 @@ fn indexing_movies_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -667,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -682,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -720,11 +739,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -775,12 +795,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
|||||||
// as we don't care about the time it takes.
|
// as we don't care about the time it takes.
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -791,21 +812,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json");
|
let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -861,12 +884,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -922,11 +946,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -984,12 +1009,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1021,12 +1047,13 @@ fn indexing_geo(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1058,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1074,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1113,11 +1142,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json");
|
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -7,12 +7,12 @@ use std::path::Path;
|
|||||||
|
|
||||||
use criterion::BenchmarkId;
|
use criterion::BenchmarkId;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use milli::documents::DocumentBatchReader;
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{
|
use milli::update::{
|
||||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
};
|
};
|
||||||
use milli::{Filter, Index};
|
use milli::{Filter, Index, Object};
|
||||||
use serde_json::{Map, Value};
|
use serde_json::Value;
|
||||||
|
|
||||||
pub struct Conf<'a> {
|
pub struct Conf<'a> {
|
||||||
/// where we are going to create our database.mmdb directory
|
/// where we are going to create our database.mmdb directory
|
||||||
@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
|
||||||
let documents = documents_from(conf.dataset, conf.dataset_format);
|
let documents = documents_from(conf.dataset, conf.dataset_format);
|
||||||
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
user_error.unwrap();
|
||||||
|
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -140,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl BufRead + Seek> {
|
pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||||
let reader =
|
let reader =
|
||||||
File::open(filename).expect(&format!("could not find the dataset in: {}", filename));
|
File::open(filename).expect(&format!("could not find the dataset in: {}", filename));
|
||||||
let reader = BufReader::new(reader);
|
let reader = BufReader::new(reader);
|
||||||
@ -150,39 +148,35 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<imp
|
|||||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
||||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||||
};
|
};
|
||||||
DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
|
||||||
|
|
||||||
let mut buf = String::new();
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
|
let object = result?;
|
||||||
while reader.read_line(&mut buf)? > 0 {
|
documents.append_json_object(&object)?;
|
||||||
documents.extend_from_json(&mut buf.as_bytes())?;
|
|
||||||
buf.clear();
|
|
||||||
}
|
}
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
|
||||||
|
|
||||||
documents.extend_from_json(reader)?;
|
documents.append_json_array(reader)?;
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let csv = csv::Reader::from_reader(reader);
|
||||||
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
documents.append_csv(csv)?;
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
enum AllowedType {
|
enum AllowedType {
|
||||||
@ -222,14 +216,14 @@ impl<R: Read> CSVDocumentDeserializer<R> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
|
impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
|
||||||
type Item = anyhow::Result<Map<String, Value>>;
|
type Item = anyhow::Result<Object>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
let csv_document = self.documents.next()?;
|
let csv_document = self.documents.next()?;
|
||||||
|
|
||||||
match csv_document {
|
match csv_document {
|
||||||
Ok(csv_document) => {
|
Ok(csv_document) => {
|
||||||
let mut document = Map::new();
|
let mut document = Object::new();
|
||||||
|
|
||||||
for ((field_name, field_type), value) in
|
for ((field_name, field_type), value) in
|
||||||
self.headers.iter().zip(csv_document.into_iter())
|
self.headers.iter().zip(csv_document.into_iter())
|
||||||
|
@ -8,12 +8,12 @@ use std::time::Instant;
|
|||||||
use byte_unit::Byte;
|
use byte_unit::Byte;
|
||||||
use eyre::Result;
|
use eyre::Result;
|
||||||
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
|
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
|
||||||
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::UpdateIndexingStep::{
|
use milli::update::UpdateIndexingStep::{
|
||||||
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
||||||
};
|
};
|
||||||
use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
|
use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
|
||||||
use milli::Index;
|
use milli::{Index, Object};
|
||||||
use serde_json::{Map, Value};
|
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
@ -225,9 +225,9 @@ impl Performer for DocumentAddition {
|
|||||||
DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
|
DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?;
|
let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
|
||||||
|
|
||||||
println!("Adding {} documents to the index.", reader.len());
|
println!("Adding {} documents to the index.", reader.documents_count());
|
||||||
|
|
||||||
let mut txn = index.write_txn()?;
|
let mut txn = index.write_txn()?;
|
||||||
let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() };
|
let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() };
|
||||||
@ -255,7 +255,7 @@ impl Performer for DocumentAddition {
|
|||||||
let bar = progesses.add(bar);
|
let bar = progesses.add(bar);
|
||||||
bars.push(bar);
|
bars.push(bar);
|
||||||
}
|
}
|
||||||
let mut addition = milli::update::IndexDocuments::new(
|
let addition = milli::update::IndexDocuments::new(
|
||||||
&mut txn,
|
&mut txn,
|
||||||
&index,
|
&index,
|
||||||
&config,
|
&config,
|
||||||
@ -263,7 +263,10 @@ impl Performer for DocumentAddition {
|
|||||||
|step| indexing_callback(step, &bars),
|
|step| indexing_callback(step, &bars),
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
addition.add_documents(reader)?;
|
let (addition, user_error) = addition.add_documents(reader)?;
|
||||||
|
if let Err(error) = user_error {
|
||||||
|
return Err(error.into());
|
||||||
|
}
|
||||||
|
|
||||||
std::thread::spawn(move || {
|
std::thread::spawn(move || {
|
||||||
progesses.join().unwrap();
|
progesses.join().unwrap();
|
||||||
@ -321,35 +324,32 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
|
fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let reader = BufReader::new(reader);
|
||||||
|
|
||||||
let mut buf = String::new();
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
let mut reader = BufReader::new(reader);
|
let object = result?;
|
||||||
|
documents.append_json_object(&object)?;
|
||||||
while reader.read_line(&mut buf)? > 0 {
|
|
||||||
documents.extend_from_json(&mut buf.as_bytes())?;
|
|
||||||
}
|
}
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
|
fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
|
||||||
|
|
||||||
documents.extend_from_json(reader)?;
|
documents.append_json_array(reader)?;
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let csv = csv::Reader::from_reader(reader);
|
||||||
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
documents.append_csv(csv)?;
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
#[derive(Debug, StructOpt)]
|
||||||
@ -423,7 +423,7 @@ impl Search {
|
|||||||
filter: &Option<String>,
|
filter: &Option<String>,
|
||||||
offset: &Option<usize>,
|
offset: &Option<usize>,
|
||||||
limit: &Option<usize>,
|
limit: &Option<usize>,
|
||||||
) -> Result<Vec<Map<String, Value>>> {
|
) -> Result<Vec<Object>> {
|
||||||
let txn = index.read_txn()?;
|
let txn = index.read_txn()?;
|
||||||
let mut search = index.search(&txn);
|
let mut search = index.search(&txn);
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ mod update_store;
|
|||||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::fs::{create_dir_all, File};
|
use std::fs::{create_dir_all, File};
|
||||||
use std::io::{BufRead, BufReader, Cursor, Read};
|
use std::io::{BufReader, Cursor, Read};
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@ -18,7 +18,7 @@ use either::Either;
|
|||||||
use flate2::read::GzDecoder;
|
use flate2::read::GzDecoder;
|
||||||
use futures::{stream, FutureExt, StreamExt};
|
use futures::{stream, FutureExt, StreamExt};
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use milli::documents::DocumentBatchReader;
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::tokenizer::TokenizerBuilder;
|
use milli::tokenizer::TokenizerBuilder;
|
||||||
use milli::update::UpdateIndexingStep::*;
|
use milli::update::UpdateIndexingStep::*;
|
||||||
use milli::update::{
|
use milli::update::{
|
||||||
@ -26,11 +26,11 @@ use milli::update::{
|
|||||||
};
|
};
|
||||||
use milli::{
|
use milli::{
|
||||||
obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
|
obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
|
||||||
MatcherBuilder, SearchResult, SortError,
|
MatcherBuilder, Object, SearchResult, SortError,
|
||||||
};
|
};
|
||||||
use once_cell::sync::OnceCell;
|
use once_cell::sync::OnceCell;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::{Map, Value};
|
use serde_json::Value;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use tokio::fs::File as TFile;
|
use tokio::fs::File as TFile;
|
||||||
use tokio::io::AsyncWriteExt;
|
use tokio::io::AsyncWriteExt;
|
||||||
@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn highlight_record(
|
fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet<String>) {
|
||||||
&self,
|
|
||||||
object: &mut Map<String, Value>,
|
|
||||||
attributes_to_highlight: &HashSet<String>,
|
|
||||||
) {
|
|
||||||
// TODO do we need to create a string for element that are not and needs to be highlight?
|
// TODO do we need to create a string for element that are not and needs to be highlight?
|
||||||
for (key, value) in object.iter_mut() {
|
for (key, value) in object.iter_mut() {
|
||||||
if attributes_to_highlight.contains(key) {
|
if attributes_to_highlight.contains(key) {
|
||||||
@ -378,7 +374,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut builder = milli::update::IndexDocuments::new(
|
let builder = milli::update::IndexDocuments::new(
|
||||||
&mut wtxn,
|
&mut wtxn,
|
||||||
&index_cloned,
|
&index_cloned,
|
||||||
GLOBAL_CONFIG.get().unwrap(),
|
GLOBAL_CONFIG.get().unwrap(),
|
||||||
@ -399,10 +395,10 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||||
};
|
};
|
||||||
|
|
||||||
let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?;
|
let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
|
||||||
|
|
||||||
builder.add_documents(documents)?;
|
|
||||||
|
|
||||||
|
let (builder, user_error) = builder.add_documents(documents)?;
|
||||||
|
let _count = user_error?;
|
||||||
let result = builder.execute();
|
let result = builder.execute();
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
struct Answer {
|
struct Answer {
|
||||||
documents: Vec<Map<String, Value>>,
|
documents: Vec<Object>,
|
||||||
number_of_candidates: u64,
|
number_of_candidates: u64,
|
||||||
facets: BTreeMap<String, BTreeMap<String, u64>>,
|
facets: BTreeMap<String, BTreeMap<String, u64>>,
|
||||||
}
|
}
|
||||||
@ -1032,35 +1028,33 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
let reader = BufReader::new(reader);
|
||||||
|
|
||||||
for result in BufReader::new(reader).lines() {
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
let line = result?;
|
let object = result?;
|
||||||
documents.extend_from_json(Cursor::new(line))?;
|
documents.append_json_object(&object)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.finish()?;
|
documents.into_inner().map_err(Into::into)
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
|
|
||||||
|
|
||||||
documents.extend_from_json(reader)?;
|
documents.append_json_array(reader)?;
|
||||||
documents.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
|
fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
||||||
let mut writer = Cursor::new(Vec::new());
|
let csv = csv::Reader::from_reader(reader);
|
||||||
milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
|
|
||||||
|
|
||||||
Ok(writer.into_inner())
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
documents.append_csv(csv)?;
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
|||||||
fst = "0.4.7"
|
fst = "0.4.7"
|
||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
geoutils = "0.4.1"
|
geoutils = "0.4.1"
|
||||||
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
|
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||||
json-depth-checker = { path = "../json-depth-checker" }
|
json-depth-checker = { path = "../json-depth-checker" }
|
||||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||||
|
3
milli/fuzz/.gitignore
vendored
3
milli/fuzz/.gitignore
vendored
@ -1,2 +1,5 @@
|
|||||||
|
Cargo.lock
|
||||||
|
target/
|
||||||
|
|
||||||
/corpus/
|
/corpus/
|
||||||
/artifacts/
|
/artifacts/
|
||||||
|
@ -7,10 +7,10 @@ use anyhow::{bail, Result};
|
|||||||
use arbitrary_json::ArbitraryValue;
|
use arbitrary_json::ArbitraryValue;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use libfuzzer_sys::fuzz_target;
|
use libfuzzer_sys::fuzz_target;
|
||||||
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
use milli::Index;
|
use milli::Index;
|
||||||
use serde_json::Value;
|
use serde_json::{Map, Value};
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
@ -19,21 +19,26 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|||||||
/// reads json from input and write an obkv batch to writer.
|
/// reads json from input and write an obkv batch to writer.
|
||||||
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
||||||
let writer = BufWriter::new(writer);
|
let writer = BufWriter::new(writer);
|
||||||
let mut builder = DocumentBatchBuilder::new(writer)?;
|
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||||
builder.extend_from_json(input)?;
|
|
||||||
|
|
||||||
if builder.len() == 0 {
|
let values: Vec<Object> = serde_json::from_reader(input)?;
|
||||||
|
if builder.documents_count() == 0 {
|
||||||
bail!("Empty payload");
|
bail!("Empty payload");
|
||||||
}
|
}
|
||||||
|
|
||||||
let count = builder.finish()?;
|
for object in values {
|
||||||
|
builder.append_json_object(&object)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(count)
|
let count = builder.documents_count();
|
||||||
|
let vector = builder.into_inner()?;
|
||||||
|
|
||||||
|
Ok(count as usize)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn index_documents(
|
fn index_documents(
|
||||||
index: &mut milli::Index,
|
index: &mut milli::Index,
|
||||||
documents: DocumentBatchReader<Cursor<Vec<u8>>>,
|
documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn()?;
|
let mut wtxn = index.write_txn()?;
|
||||||
@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
|
|||||||
// We ignore all malformed documents
|
// We ignore all malformed documents
|
||||||
if let Ok(_) = read_json(json.as_bytes(), &mut documents) {
|
if let Ok(_) = read_json(json.as_bytes(), &mut documents) {
|
||||||
documents.rewind().unwrap();
|
documents.rewind().unwrap();
|
||||||
let documents = DocumentBatchReader::from_reader(documents).unwrap();
|
let documents = DocumentsBatchReader::from_reader(documents).unwrap();
|
||||||
// A lot of errors can come out of milli and we don't know which ones are normal or not
|
// A lot of errors can come out of milli and we don't know which ones are normal or not
|
||||||
// so we are only going to look for the unexpected panics.
|
// so we are only going to look for the unexpected panics.
|
||||||
let _ = index_documents(&mut index, documents);
|
let _ = index_documents(&mut index, documents);
|
||||||
|
@ -1,157 +1,170 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::io::{self, Write};
|
||||||
use std::io;
|
|
||||||
use std::io::{Cursor, Write};
|
|
||||||
|
|
||||||
use byteorder::{BigEndian, WriteBytesExt};
|
use grenad::{CompressionType, WriterBuilder};
|
||||||
use serde::Deserializer;
|
use serde::de::Deserializer;
|
||||||
use serde_json::Value;
|
use serde_json::{to_writer, Value};
|
||||||
|
|
||||||
use super::serde_impl::DocumentVisitor;
|
use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
|
||||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
use crate::documents::serde_impl::DocumentVisitor;
|
||||||
use crate::FieldId;
|
use crate::Object;
|
||||||
|
|
||||||
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
|
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
|
||||||
/// format used by milli.
|
/// format used by milli.
|
||||||
///
|
///
|
||||||
/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
|
/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
|
||||||
/// iterate over the documents.
|
/// to iterate over the documents.
|
||||||
///
|
///
|
||||||
/// ## example:
|
/// ## example:
|
||||||
/// ```
|
/// ```
|
||||||
/// use milli::documents::DocumentBatchBuilder;
|
|
||||||
/// use serde_json::json;
|
/// use serde_json::json;
|
||||||
/// use std::io::Cursor;
|
/// use milli::documents::DocumentsBatchBuilder;
|
||||||
///
|
///
|
||||||
/// let json = r##"{"id": 1, "name": "foo"}"##;
|
/// let json = json!({ "id": 1, "name": "foo" });
|
||||||
/// let mut writer = Cursor::new(Vec::new());
|
///
|
||||||
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
|
/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
|
/// builder.append_json_object(json.as_object().unwrap()).unwrap();
|
||||||
/// builder.finish().unwrap();
|
/// let _vector = builder.into_inner().unwrap();
|
||||||
/// ```
|
/// ```
|
||||||
pub struct DocumentBatchBuilder<W> {
|
pub struct DocumentsBatchBuilder<W> {
|
||||||
inner: ByteCounter<W>,
|
/// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
|
||||||
index: DocumentsBatchIndex,
|
writer: grenad::Writer<W>,
|
||||||
|
/// A map that creates the relation between field ids and field names.
|
||||||
|
fields_index: DocumentsBatchIndex,
|
||||||
|
/// The number of documents that were added to this builder,
|
||||||
|
/// it doesn't take the primary key of the documents into account at this point.
|
||||||
|
documents_count: u32,
|
||||||
|
|
||||||
|
/// A buffer to store a temporary obkv buffer and avoid reallocating.
|
||||||
obkv_buffer: Vec<u8>,
|
obkv_buffer: Vec<u8>,
|
||||||
|
/// A buffer to serialize the values and avoid reallocating,
|
||||||
|
/// serialized values are stored in an obkv.
|
||||||
value_buffer: Vec<u8>,
|
value_buffer: Vec<u8>,
|
||||||
values: BTreeMap<FieldId, Value>,
|
|
||||||
count: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
impl<W: Write> DocumentsBatchBuilder<W> {
|
||||||
pub fn new(writer: W) -> Result<Self, Error> {
|
pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
|
||||||
let index = DocumentsBatchIndex::default();
|
DocumentsBatchBuilder {
|
||||||
let mut writer = ByteCounter::new(writer);
|
writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
|
||||||
// add space to write the offset of the metadata at the end of the writer
|
fields_index: DocumentsBatchIndex::default(),
|
||||||
writer.write_u64::<BigEndian>(0)?;
|
documents_count: 0,
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
inner: writer,
|
|
||||||
index,
|
|
||||||
obkv_buffer: Vec::new(),
|
obkv_buffer: Vec::new(),
|
||||||
value_buffer: Vec::new(),
|
value_buffer: Vec::new(),
|
||||||
values: BTreeMap::new(),
|
}
|
||||||
count: 0,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the number of documents that have been written to the builder.
|
/// Returns the number of documents inserted into this builder.
|
||||||
pub fn len(&self) -> usize {
|
pub fn documents_count(&self) -> u32 {
|
||||||
self.count
|
self.documents_count
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This method must be called after the document addition is terminated. It will put the
|
/// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
|
||||||
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> {
|
||||||
/// file.
|
// Make sure that we insert the fields ids in order as the obkv writer has this requirement.
|
||||||
pub fn finish(self) -> Result<usize, Error> {
|
let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect();
|
||||||
let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
|
fields_ids.sort_unstable();
|
||||||
|
|
||||||
let meta = DocumentsMetadata { count, index };
|
self.obkv_buffer.clear();
|
||||||
|
let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
|
||||||
|
for field_id in fields_ids {
|
||||||
|
let key = self.fields_index.name(field_id).unwrap();
|
||||||
|
self.value_buffer.clear();
|
||||||
|
to_writer(&mut self.value_buffer, &object[key])?;
|
||||||
|
writer.insert(field_id, &self.value_buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
bincode::serialize_into(&mut writer, &meta)?;
|
let internal_id = self.documents_count.to_be_bytes();
|
||||||
|
let document_bytes = writer.into_inner()?;
|
||||||
|
self.writer.insert(internal_id, &document_bytes)?;
|
||||||
|
self.documents_count += 1;
|
||||||
|
|
||||||
writer.seek(io::SeekFrom::Start(0))?;
|
Ok(())
|
||||||
writer.write_u64::<BigEndian>(offset as u64)?;
|
|
||||||
|
|
||||||
writer.flush()?;
|
|
||||||
|
|
||||||
Ok(count)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extends the builder with json documents from a reader.
|
/// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly.
|
||||||
pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
|
pub fn append_json_array<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
|
||||||
let mut de = serde_json::Deserializer::from_reader(reader);
|
let mut de = serde_json::Deserializer::from_reader(reader);
|
||||||
|
let mut visitor = DocumentVisitor::new(self);
|
||||||
let mut visitor = DocumentVisitor {
|
de.deserialize_any(&mut visitor)?
|
||||||
inner: &mut self.inner,
|
|
||||||
index: &mut self.index,
|
|
||||||
obkv_buffer: &mut self.obkv_buffer,
|
|
||||||
value_buffer: &mut self.value_buffer,
|
|
||||||
values: &mut self.values,
|
|
||||||
count: &mut self.count,
|
|
||||||
};
|
|
||||||
|
|
||||||
de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a builder from a reader of CSV documents.
|
/// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
|
||||||
///
|
pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
|
||||||
/// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
|
// Make sure that we insert the fields ids in order as the obkv writer has this requirement.
|
||||||
/// optimisations, and extending from another CSV is not allowed.
|
let mut typed_fields_ids: Vec<_> = reader
|
||||||
pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
|
|
||||||
let mut this = Self::new(writer)?;
|
|
||||||
// Ensure that this is the first and only addition made with this builder
|
|
||||||
debug_assert!(this.index.is_empty());
|
|
||||||
|
|
||||||
let mut records = csv::Reader::from_reader(reader);
|
|
||||||
|
|
||||||
let headers = records
|
|
||||||
.headers()?
|
.headers()?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(parse_csv_header)
|
.map(parse_csv_header)
|
||||||
.map(|(k, t)| (this.index.insert(&k), t))
|
.map(|(k, t)| (self.fields_index.insert(k), t))
|
||||||
.collect::<BTreeMap<_, _>>();
|
.enumerate()
|
||||||
|
.collect();
|
||||||
|
// Make sure that we insert the fields ids in order as the obkv writer has this requirement.
|
||||||
|
typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
|
||||||
|
|
||||||
for (i, record) in records.into_records().enumerate() {
|
let mut record = csv::StringRecord::new();
|
||||||
let record = record?;
|
let mut line = 0;
|
||||||
this.obkv_buffer.clear();
|
while reader.read_record(&mut record)? {
|
||||||
let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
|
// We increment here and not at the end of the while loop to take
|
||||||
for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
|
// the header offset into account.
|
||||||
let value = match ty {
|
line += 1;
|
||||||
|
|
||||||
|
self.obkv_buffer.clear();
|
||||||
|
let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
|
||||||
|
|
||||||
|
for (i, (field_id, type_)) in typed_fields_ids.iter() {
|
||||||
|
self.value_buffer.clear();
|
||||||
|
|
||||||
|
let value = &record[*i];
|
||||||
|
match type_ {
|
||||||
AllowedType::Number => {
|
AllowedType::Number => {
|
||||||
if value.trim().is_empty() {
|
if value.trim().is_empty() {
|
||||||
Value::Null
|
to_writer(&mut self.value_buffer, &Value::Null)?;
|
||||||
} else {
|
} else {
|
||||||
value.trim().parse::<f64>().map(Value::from).map_err(|error| {
|
match value.trim().parse::<f64>() {
|
||||||
Error::ParseFloat {
|
Ok(float) => {
|
||||||
error,
|
to_writer(&mut self.value_buffer, &float)?;
|
||||||
// +1 for the header offset.
|
|
||||||
line: i + 1,
|
|
||||||
value: value.to_string(),
|
|
||||||
}
|
}
|
||||||
})?
|
Err(error) => {
|
||||||
|
return Err(Error::ParseFloat {
|
||||||
|
error,
|
||||||
|
line,
|
||||||
|
value: value.to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
AllowedType::String => {
|
AllowedType::String => {
|
||||||
if value.is_empty() {
|
if value.is_empty() {
|
||||||
Value::Null
|
to_writer(&mut self.value_buffer, &Value::Null)?;
|
||||||
} else {
|
} else {
|
||||||
Value::String(value.to_string())
|
to_writer(&mut self.value_buffer, value)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
this.value_buffer.clear();
|
// We insert into the obkv writer the value buffer that has been filled just above.
|
||||||
serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
|
writer.insert(*field_id, &self.value_buffer)?;
|
||||||
writer.insert(*fid, &this.value_buffer)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
|
let internal_id = self.documents_count.to_be_bytes();
|
||||||
this.inner.write_all(&this.obkv_buffer)?;
|
let document_bytes = writer.into_inner()?;
|
||||||
|
self.writer.insert(internal_id, &document_bytes)?;
|
||||||
this.count += 1;
|
self.documents_count += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(this)
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`.
|
||||||
|
pub fn into_inner(mut self) -> io::Result<W> {
|
||||||
|
let DocumentsBatchBuilder { mut writer, fields_index, .. } = self;
|
||||||
|
|
||||||
|
// We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer.
|
||||||
|
self.value_buffer.clear();
|
||||||
|
to_writer(&mut self.value_buffer, &fields_index)?;
|
||||||
|
writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?;
|
||||||
|
|
||||||
|
writer.into_inner()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,16 +174,16 @@ enum AllowedType {
|
|||||||
Number,
|
Number,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
fn parse_csv_header(header: &str) -> (&str, AllowedType) {
|
||||||
// if there are several separators we only split on the last one.
|
// if there are several separators we only split on the last one.
|
||||||
match header.rsplit_once(':') {
|
match header.rsplit_once(':') {
|
||||||
Some((field_name, field_type)) => match field_type {
|
Some((field_name, field_type)) => match field_type {
|
||||||
"string" => (field_name.to_string(), AllowedType::String),
|
"string" => (field_name, AllowedType::String),
|
||||||
"number" => (field_name.to_string(), AllowedType::Number),
|
"number" => (field_name, AllowedType::Number),
|
||||||
// if the pattern isn't recognized, we keep the whole field.
|
// if the pattern isn't recognized, we keep the whole field.
|
||||||
_otherwise => (header.to_string(), AllowedType::String),
|
_otherwise => (header, AllowedType::String),
|
||||||
},
|
},
|
||||||
None => (header.to_string(), AllowedType::String),
|
None => (header, AllowedType::String),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,35 +191,20 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
|||||||
mod test {
|
mod test {
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
use serde_json::{json, Map};
|
use serde_json::json;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::documents::DocumentBatchReader;
|
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
|
|
||||||
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
|
|
||||||
let mut map = Map::new();
|
|
||||||
|
|
||||||
for (fid, value) in obkv.iter() {
|
|
||||||
let field_name = index.name(fid).unwrap().clone();
|
|
||||||
let value: Value = serde_json::from_slice(value).unwrap();
|
|
||||||
|
|
||||||
map.insert(field_name, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
Value::Object(map)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn add_single_documents_json() {
|
fn add_single_documents_json() {
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
|
|
||||||
let json = serde_json::json!({
|
let json = serde_json::json!({
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"field": "hello!",
|
"field": "hello!",
|
||||||
});
|
});
|
||||||
|
|
||||||
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_json_object(json.as_object().unwrap()).unwrap();
|
||||||
|
|
||||||
let json = serde_json::json!({
|
let json = serde_json::json!({
|
||||||
"blabla": false,
|
"blabla": false,
|
||||||
@ -214,100 +212,64 @@ mod test {
|
|||||||
"id": 1,
|
"id": 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
builder.append_json_object(json.as_object().unwrap()).unwrap();
|
||||||
|
|
||||||
assert_eq!(builder.len(), 2);
|
assert_eq!(builder.documents_count(), 2);
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
|
.unwrap()
|
||||||
cursor.set_position(0);
|
.into_cursor_and_fields_index();
|
||||||
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
|
||||||
|
|
||||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
|
||||||
assert_eq!(index.len(), 3);
|
assert_eq!(index.len(), 3);
|
||||||
|
|
||||||
|
let document = cursor.next_document().unwrap().unwrap();
|
||||||
assert_eq!(document.iter().count(), 2);
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
let document = cursor.next_document().unwrap().unwrap();
|
||||||
assert_eq!(index.len(), 3);
|
|
||||||
assert_eq!(document.iter().count(), 3);
|
assert_eq!(document.iter().count(), 3);
|
||||||
|
|
||||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
assert!(cursor.next_document().unwrap().is_none());
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn add_documents_seq_json() {
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
|
|
||||||
let json = serde_json::json!([{
|
|
||||||
"id": 1,
|
|
||||||
"field": "hello!",
|
|
||||||
},{
|
|
||||||
"blabla": false,
|
|
||||||
"field": "hello!",
|
|
||||||
"id": 1,
|
|
||||||
}
|
|
||||||
]);
|
|
||||||
|
|
||||||
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(builder.len(), 2);
|
|
||||||
|
|
||||||
builder.finish().unwrap();
|
|
||||||
|
|
||||||
cursor.set_position(0);
|
|
||||||
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
|
||||||
|
|
||||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
|
||||||
assert_eq!(index.len(), 3);
|
|
||||||
assert_eq!(document.iter().count(), 2);
|
|
||||||
|
|
||||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
|
||||||
assert_eq!(index.len(), 3);
|
|
||||||
assert_eq!(document.iter().count(), 3);
|
|
||||||
|
|
||||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn add_documents_csv() {
|
fn add_documents_csv() {
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let csv = "id:number,field:string\n1,hello!\n2,blabla";
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_csv(csv).unwrap();
|
||||||
|
assert_eq!(builder.documents_count(), 2);
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
let builder =
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
|
.unwrap()
|
||||||
builder.finish().unwrap();
|
.into_cursor_and_fields_index();
|
||||||
|
|
||||||
cursor.set_position(0);
|
|
||||||
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
|
||||||
|
|
||||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
|
||||||
assert_eq!(index.len(), 2);
|
assert_eq!(index.len(), 2);
|
||||||
|
|
||||||
|
let document = cursor.next_document().unwrap().unwrap();
|
||||||
assert_eq!(document.iter().count(), 2);
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
|
let document = cursor.next_document().unwrap().unwrap();
|
||||||
assert_eq!(document.iter().count(), 2);
|
assert_eq!(document.iter().count(), 2);
|
||||||
|
|
||||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
assert!(cursor.next_document().unwrap().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_csv_document() {
|
fn simple_csv_document() {
|
||||||
let documents = r#"city,country,pop
|
let csv_content = r#"city,country,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -318,22 +280,25 @@ mod test {
|
|||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
assert!(cursor.next_document().unwrap().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn coma_in_field() {
|
fn coma_in_field() {
|
||||||
let documents = r#"city,country,pop
|
let csv_content = r#"city,country,pop
|
||||||
"Boston","United, States","4628910""#;
|
"Boston","United, States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -347,17 +312,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn quote_in_field() {
|
fn quote_in_field() {
|
||||||
let documents = r#"city,country,pop
|
let csv_content = r#"city,country,pop
|
||||||
"Boston","United"" States","4628910""#;
|
"Boston","United"" States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -371,17 +339,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn integer_in_field() {
|
fn integer_in_field() {
|
||||||
let documents = r#"city,country,pop:number
|
let csv_content = r#"city,country,pop:number
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -395,17 +366,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn float_in_field() {
|
fn float_in_field() {
|
||||||
let documents = r#"city,country,pop:number
|
let csv_content = r#"city,country,pop:number
|
||||||
"Boston","United States","4628910.01""#;
|
"Boston","United States","4628910.01""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -419,17 +393,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn several_colon_in_header() {
|
fn several_colon_in_header() {
|
||||||
let documents = r#"city:love:string,country:state,pop
|
let csv_content = r#"city:love:string,country:state,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -443,17 +420,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ending_by_colon_in_header() {
|
fn ending_by_colon_in_header() {
|
||||||
let documents = r#"city:,country,pop
|
let csv_content = r#"city:,country,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -467,17 +447,20 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn starting_by_colon_in_header() {
|
fn starting_by_colon_in_header() {
|
||||||
let documents = r#":city,country,pop
|
let csv_content = r#":city,country,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -492,32 +475,37 @@ mod test {
|
|||||||
#[ignore]
|
#[ignore]
|
||||||
#[test]
|
#[test]
|
||||||
fn starting_by_colon_in_header2() {
|
fn starting_by_colon_in_header2() {
|
||||||
let documents = r#":string,country,pop
|
let csv_content = r#":string,country,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
|
||||||
|
|
||||||
assert!(reader.next_document_with_index().is_err());
|
assert!(cursor.next_document().is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn double_colon_in_header() {
|
fn double_colon_in_header() {
|
||||||
let documents = r#"city::string,country,pop
|
let csv_content = r#"city::string,country,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
builder.append_csv(csv).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.finish()
|
.into_cursor_and_fields_index();
|
||||||
.unwrap();
|
|
||||||
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||||
let val = obkv_to_value(&doc, index);
|
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
val,
|
val,
|
||||||
@ -531,34 +519,32 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn bad_type_in_header() {
|
fn bad_type_in_header() {
|
||||||
let documents = r#"city,country:number,pop
|
let csv_content = r#"city,country:number,pop
|
||||||
"Boston","United States","4628910""#;
|
"Boston","United States","4628910""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
assert!(
|
assert!(builder.append_csv(csv).is_err());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn bad_column_count1() {
|
fn bad_column_count1() {
|
||||||
let documents = r#"city,country,pop
|
let csv_content = r#"city,country,pop
|
||||||
"Boston","United States","4628910", "too much""#;
|
"Boston","United States","4628910", "too much
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content"#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
assert!(
|
assert!(builder.append_csv(csv).is_err());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn bad_column_count2() {
|
fn bad_column_count2() {
|
||||||
let documents = r#"city,country,pop
|
let csv_content = r#"city,country,pop
|
||||||
"Boston","United States""#;
|
"Boston","United States""#;
|
||||||
|
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
assert!(
|
assert!(builder.append_csv(csv).is_err());
|
||||||
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
109
milli/src/documents/enriched.rs
Normal file
109
milli/src/documents/enriched.rs
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::{io, str};
|
||||||
|
|
||||||
|
use obkv::KvReader;
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||||
|
Error,
|
||||||
|
};
|
||||||
|
use crate::update::DocumentId;
|
||||||
|
use crate::FieldId;
|
||||||
|
|
||||||
|
/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have
|
||||||
|
/// been created with a `DocumentsBatchBuilder` and, for the enriched data,
|
||||||
|
/// a simple `grenad::Reader<File>`.
|
||||||
|
///
|
||||||
|
/// The documents are returned in the form of `obkv::KvReader` where each field is identified with a
|
||||||
|
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
||||||
|
pub struct EnrichedDocumentsBatchReader<R> {
|
||||||
|
documents: DocumentsBatchReader<R>,
|
||||||
|
primary_key: String,
|
||||||
|
external_ids: grenad::ReaderCursor<File>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
||||||
|
pub fn new(
|
||||||
|
documents: DocumentsBatchReader<R>,
|
||||||
|
primary_key: String,
|
||||||
|
external_ids: grenad::Reader<File>,
|
||||||
|
) -> Result<Self, Error> {
|
||||||
|
if documents.documents_count() as u64 == external_ids.len() {
|
||||||
|
Ok(EnrichedDocumentsBatchReader {
|
||||||
|
documents,
|
||||||
|
primary_key,
|
||||||
|
external_ids: external_ids.into_cursor()?,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
Err(Error::InvalidEnrichedData)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn documents_count(&self) -> u32 {
|
||||||
|
self.documents.documents_count()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn primary_key(&self) -> &str {
|
||||||
|
&self.primary_key
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.documents.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
|
||||||
|
self.documents.documents_batch_index()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This method returns a forward cursor over the enriched documents.
|
||||||
|
pub fn into_cursor_and_fields_index(
|
||||||
|
self,
|
||||||
|
) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
|
||||||
|
let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
|
||||||
|
let (documents, fields_index) = documents.into_cursor_and_fields_index();
|
||||||
|
external_ids.reset();
|
||||||
|
(EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct EnrichedDocument<'a> {
|
||||||
|
pub document: KvReader<'a, FieldId>,
|
||||||
|
pub document_id: DocumentId,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct EnrichedDocumentsBatchCursor<R> {
|
||||||
|
documents: DocumentsBatchCursor<R>,
|
||||||
|
primary_key: String,
|
||||||
|
external_ids: grenad::ReaderCursor<File>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R> EnrichedDocumentsBatchCursor<R> {
|
||||||
|
pub fn primary_key(&self) -> &str {
|
||||||
|
&self.primary_key
|
||||||
|
}
|
||||||
|
/// Resets the cursor to be able to read from the start again.
|
||||||
|
pub fn reset(&mut self) {
|
||||||
|
self.documents.reset();
|
||||||
|
self.external_ids.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
|
||||||
|
/// Returns the next document, starting from the first one. Subsequent calls to
|
||||||
|
/// `next_enriched_document` advance the document reader until all the documents have been read.
|
||||||
|
pub fn next_enriched_document(
|
||||||
|
&mut self,
|
||||||
|
) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> {
|
||||||
|
let document = self.documents.next_document()?;
|
||||||
|
let document_id = match self.external_ids.move_on_next()? {
|
||||||
|
Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?,
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
match document.zip(document_id) {
|
||||||
|
Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })),
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,24 +1,41 @@
|
|||||||
mod builder;
|
mod builder;
|
||||||
/// The documents module defines an intermediary document format that milli uses for indexation, and
|
mod enriched;
|
||||||
/// provides an API to easily build and read such documents.
|
|
||||||
///
|
|
||||||
/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
|
|
||||||
/// later be read by milli using the `DocumentBatchReader` interface.
|
|
||||||
mod reader;
|
mod reader;
|
||||||
mod serde_impl;
|
mod serde_impl;
|
||||||
|
|
||||||
use std::fmt::{self, Debug};
|
use std::fmt::{self, Debug};
|
||||||
use std::io;
|
use std::io;
|
||||||
|
use std::str::Utf8Error;
|
||||||
|
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
pub use builder::DocumentBatchBuilder;
|
pub use builder::DocumentsBatchBuilder;
|
||||||
pub use reader::DocumentBatchReader;
|
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
||||||
|
use obkv::KvReader;
|
||||||
|
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::FieldId;
|
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
||||||
|
use crate::{FieldId, Object, Result};
|
||||||
|
|
||||||
|
/// The key that is used to store the `DocumentsBatchIndex` data structure,
|
||||||
|
/// it is the absolute last key of the list.
|
||||||
|
const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
|
||||||
|
|
||||||
|
/// Helper function to convert an obkv reader into a JSON object.
|
||||||
|
pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
|
||||||
|
obkv.iter()
|
||||||
|
.map(|(field_id, value)| {
|
||||||
|
let field_name = index.name(field_id).ok_or_else(|| {
|
||||||
|
FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" }
|
||||||
|
})?;
|
||||||
|
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
|
||||||
|
Ok((field_name.to_string(), value))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// A bidirectional map that links field ids to their name in a document batch.
|
/// A bidirectional map that links field ids to their name in a document batch.
|
||||||
#[derive(Default, Debug, Serialize, Deserialize)]
|
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||||
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
|
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
|
||||||
|
|
||||||
impl DocumentsBatchIndex {
|
impl DocumentsBatchIndex {
|
||||||
@ -46,15 +63,16 @@ impl DocumentsBatchIndex {
|
|||||||
self.0.iter()
|
self.0.iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn name(&self, id: FieldId) -> Option<&String> {
|
pub fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
self.0.get_by_left(&id)
|
self.0.get_by_left(&id).map(AsRef::as_ref)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn recreate_json(
|
pub fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
&self,
|
self.0.get_by_right(name).cloned()
|
||||||
document: &obkv::KvReaderU16,
|
}
|
||||||
) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> {
|
|
||||||
let mut map = serde_json::Map::new();
|
pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> {
|
||||||
|
let mut map = Object::new();
|
||||||
|
|
||||||
for (k, v) in document.iter() {
|
for (k, v) in document.iter() {
|
||||||
// TODO: TAMO: update the error type
|
// TODO: TAMO: update the error type
|
||||||
@ -69,50 +87,22 @@ impl DocumentsBatchIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
struct DocumentsMetadata {
|
|
||||||
count: usize,
|
|
||||||
index: DocumentsBatchIndex,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct ByteCounter<W> {
|
|
||||||
count: usize,
|
|
||||||
writer: W,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<W> ByteCounter<W> {
|
|
||||||
fn new(writer: W) -> Self {
|
|
||||||
Self { count: 0, writer }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<W: io::Write> io::Write for ByteCounter<W> {
|
|
||||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
||||||
let count = self.writer.write(buf)?;
|
|
||||||
self.count += count;
|
|
||||||
Ok(count)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush(&mut self) -> io::Result<()> {
|
|
||||||
self.writer.flush()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
|
ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
|
||||||
InvalidDocumentFormat,
|
InvalidDocumentFormat,
|
||||||
Custom(String),
|
InvalidEnrichedData,
|
||||||
JsonError(serde_json::Error),
|
InvalidUtf8(Utf8Error),
|
||||||
CsvError(csv::Error),
|
Csv(csv::Error),
|
||||||
Serialize(bincode::Error),
|
Json(serde_json::Error),
|
||||||
|
Serialize(serde_json::Error),
|
||||||
|
Grenad(grenad::Error),
|
||||||
Io(io::Error),
|
Io(io::Error),
|
||||||
DocumentTooLarge,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<csv::Error> for Error {
|
impl From<csv::Error> for Error {
|
||||||
fn from(e: csv::Error) -> Self {
|
fn from(e: csv::Error) -> Self {
|
||||||
Self::CsvError(e)
|
Self::Csv(e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,15 +112,21 @@ impl From<io::Error> for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<bincode::Error> for Error {
|
impl From<serde_json::Error> for Error {
|
||||||
fn from(other: bincode::Error) -> Self {
|
fn from(other: serde_json::Error) -> Self {
|
||||||
Self::Serialize(other)
|
Self::Json(other)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<serde_json::Error> for Error {
|
impl From<grenad::Error> for Error {
|
||||||
fn from(other: serde_json::Error) -> Self {
|
fn from(other: grenad::Error) -> Self {
|
||||||
Self::JsonError(other)
|
Self::Grenad(other)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<Utf8Error> for Error {
|
||||||
|
fn from(other: Utf8Error) -> Self {
|
||||||
|
Self::InvalidUtf8(other)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,13 +136,16 @@ impl fmt::Display for Error {
|
|||||||
Error::ParseFloat { error, line, value } => {
|
Error::ParseFloat { error, line, value } => {
|
||||||
write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
|
write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
|
||||||
}
|
}
|
||||||
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
Error::InvalidDocumentFormat => {
|
||||||
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
f.write_str("Invalid document addition format, missing the documents batch index.")
|
||||||
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
}
|
||||||
|
Error::InvalidEnrichedData => f.write_str("Invalid enriched data."),
|
||||||
|
Error::InvalidUtf8(e) => write!(f, "{}", e),
|
||||||
Error::Io(e) => write!(f, "{}", e),
|
Error::Io(e) => write!(f, "{}", e),
|
||||||
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
|
||||||
Error::Serialize(e) => write!(f, "{}", e),
|
Error::Serialize(e) => write!(f, "{}", e),
|
||||||
Error::CsvError(e) => write!(f, "{}", e),
|
Error::Grenad(e) => write!(f, "{}", e),
|
||||||
|
Error::Csv(e) => write!(f, "{}", e),
|
||||||
|
Error::Json(e) => write!(f, "{}", e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -158,15 +157,25 @@ impl std::error::Error for Error {}
|
|||||||
macro_rules! documents {
|
macro_rules! documents {
|
||||||
($data:tt) => {{
|
($data:tt) => {{
|
||||||
let documents = serde_json::json!($data);
|
let documents = serde_json::json!($data);
|
||||||
let mut writer = std::io::Cursor::new(Vec::new());
|
let documents = match documents {
|
||||||
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
object @ serde_json::Value::Object(_) => vec![object],
|
||||||
let documents = serde_json::to_vec(&documents).unwrap();
|
serde_json::Value::Array(objects) => objects,
|
||||||
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
invalid => {
|
||||||
builder.finish().unwrap();
|
panic!("an array of objects must be specified, {:#?} is not an array", invalid)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
writer.set_position(0);
|
let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
for document in documents {
|
||||||
|
let object = match document {
|
||||||
|
serde_json::Value::Object(object) => object,
|
||||||
|
invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
|
||||||
|
};
|
||||||
|
builder.append_json_object(&object).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
|
let vector = builder.into_inner().unwrap();
|
||||||
|
crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,7 +189,7 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn create_documents_no_errors() {
|
fn create_documents_no_errors() {
|
||||||
let json = json!({
|
let value = json!({
|
||||||
"number": 1,
|
"number": 1,
|
||||||
"string": "this is a field",
|
"string": "this is a field",
|
||||||
"array": ["an", "array"],
|
"array": ["an", "array"],
|
||||||
@ -190,26 +199,18 @@ mod test {
|
|||||||
"bool": true
|
"bool": true
|
||||||
});
|
});
|
||||||
|
|
||||||
let json = serde_json::to_vec(&json).unwrap();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_json_object(value.as_object().unwrap()).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
.unwrap()
|
||||||
|
.into_cursor_and_fields_index();
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
assert_eq!(index.iter().count(), 5);
|
||||||
|
let reader = documents.next_document().unwrap().unwrap();
|
||||||
builder.extend_from_json(Cursor::new(json)).unwrap();
|
assert_eq!(reader.iter().count(), 5);
|
||||||
|
assert!(documents.next_document().unwrap().is_none());
|
||||||
builder.finish().unwrap();
|
|
||||||
|
|
||||||
let mut documents =
|
|
||||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(documents.index().iter().count(), 5);
|
|
||||||
|
|
||||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
|
||||||
|
|
||||||
assert_eq!(reader.1.iter().count(), 5);
|
|
||||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -221,101 +222,56 @@ mod test {
|
|||||||
"toto": false,
|
"toto": false,
|
||||||
});
|
});
|
||||||
|
|
||||||
let doc1 = serde_json::to_vec(&doc1).unwrap();
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let doc2 = serde_json::to_vec(&doc2).unwrap();
|
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
|
||||||
|
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector))
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
.unwrap()
|
||||||
|
.into_cursor_and_fields_index();
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
assert_eq!(index.iter().count(), 2);
|
||||||
|
let reader = documents.next_document().unwrap().unwrap();
|
||||||
builder.extend_from_json(Cursor::new(doc1)).unwrap();
|
assert_eq!(reader.iter().count(), 1);
|
||||||
builder.extend_from_json(Cursor::new(doc2)).unwrap();
|
assert!(documents.next_document().unwrap().is_some());
|
||||||
|
assert!(documents.next_document().unwrap().is_none());
|
||||||
builder.finish().unwrap();
|
|
||||||
|
|
||||||
let mut documents =
|
|
||||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(documents.index().iter().count(), 2);
|
|
||||||
|
|
||||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
|
||||||
|
|
||||||
assert_eq!(reader.1.iter().count(), 1);
|
|
||||||
assert!(documents.next_document_with_index().unwrap().is_some());
|
|
||||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn add_documents_array() {
|
|
||||||
let docs = json!([
|
|
||||||
{ "toto": false },
|
|
||||||
{ "tata": "hello" },
|
|
||||||
]);
|
|
||||||
|
|
||||||
let docs = serde_json::to_vec(&docs).unwrap();
|
|
||||||
|
|
||||||
let mut v = Vec::new();
|
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
|
|
||||||
builder.extend_from_json(Cursor::new(docs)).unwrap();
|
|
||||||
|
|
||||||
builder.finish().unwrap();
|
|
||||||
|
|
||||||
let mut documents =
|
|
||||||
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(documents.index().iter().count(), 2);
|
|
||||||
|
|
||||||
let reader = documents.next_document_with_index().unwrap().unwrap();
|
|
||||||
|
|
||||||
assert_eq!(reader.1.iter().count(), 1);
|
|
||||||
assert!(documents.next_document_with_index().unwrap().is_some());
|
|
||||||
assert!(documents.next_document_with_index().unwrap().is_none());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn add_invalid_document_format() {
|
|
||||||
let mut v = Vec::new();
|
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
|
|
||||||
let docs = json!([[
|
|
||||||
{ "toto": false },
|
|
||||||
{ "tata": "hello" },
|
|
||||||
]]);
|
|
||||||
|
|
||||||
let docs = serde_json::to_vec(&docs).unwrap();
|
|
||||||
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
|
|
||||||
|
|
||||||
let docs = json!("hello");
|
|
||||||
let docs = serde_json::to_vec(&docs).unwrap();
|
|
||||||
|
|
||||||
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_nested() {
|
fn test_nested() {
|
||||||
let mut docs = documents!([{
|
let docs_reader = documents!([{
|
||||||
"hello": {
|
"hello": {
|
||||||
"toto": ["hello"]
|
"toto": ["hello"]
|
||||||
}
|
}
|
||||||
}]);
|
}]);
|
||||||
|
|
||||||
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
|
let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
|
||||||
|
let doc = cursor.next_document().unwrap().unwrap();
|
||||||
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
|
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
|
||||||
assert_eq!(nested, json!({ "toto": ["hello"] }));
|
assert_eq!(nested, json!({ "toto": ["hello"] }));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn out_of_order_fields() {
|
fn out_of_order_json_fields() {
|
||||||
let _documents = documents!([
|
let _documents = documents!([
|
||||||
{"id": 1,"b": 0},
|
{"id": 1,"b": 0},
|
||||||
{"id": 2,"a": 0,"b": 0},
|
{"id": 2,"a": 0,"b": 0},
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn out_of_order_csv_fields() {
|
||||||
|
let csv1_content = "id:number,b\n1,0";
|
||||||
|
let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
|
||||||
|
|
||||||
|
let csv2_content = "id:number,a,b\n2,0,0";
|
||||||
|
let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
|
||||||
|
|
||||||
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_csv(csv1).unwrap();
|
||||||
|
builder.append_csv(csv2).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
use std::io;
|
use std::convert::TryInto;
|
||||||
use std::io::{BufReader, Read};
|
use std::{error, fmt, io};
|
||||||
use std::mem::size_of;
|
|
||||||
|
|
||||||
use byteorder::{BigEndian, ReadBytesExt};
|
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
|
|
||||||
use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
|
use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
|
/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
|
||||||
@ -13,63 +11,106 @@ use crate::FieldId;
|
|||||||
///
|
///
|
||||||
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
|
/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
|
||||||
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
|
||||||
pub struct DocumentBatchReader<R> {
|
pub struct DocumentsBatchReader<R> {
|
||||||
reader: BufReader<R>,
|
cursor: grenad::ReaderCursor<R>,
|
||||||
metadata: DocumentsMetadata,
|
fields_index: DocumentsBatchIndex,
|
||||||
buffer: Vec<u8>,
|
|
||||||
seen_documents: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
|
impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
|
||||||
|
pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self {
|
||||||
|
Self { cursor: cursor.cursor, fields_index }
|
||||||
|
}
|
||||||
|
|
||||||
/// Construct a `DocumentsReader` from a reader.
|
/// Construct a `DocumentsBatchReader` from a reader.
|
||||||
///
|
///
|
||||||
/// It first retrieves the index, then moves to the first document. Subsequent calls to
|
/// It first retrieves the index, then moves to the first document. Use the `into_cursor_and_fields_index`
|
||||||
/// `next_document` advance the document reader until all the documents have been read.
|
/// method to iterate over the documents, from the first to the last.
|
||||||
pub fn from_reader(mut reader: R) -> Result<Self, Error> {
|
pub fn from_reader(reader: R) -> Result<Self, Error> {
|
||||||
let mut buffer = Vec::new();
|
let reader = grenad::Reader::new(reader)?;
|
||||||
|
let mut cursor = reader.into_cursor()?;
|
||||||
|
|
||||||
let meta_offset = reader.read_u64::<BigEndian>()?;
|
let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
|
||||||
reader.seek(io::SeekFrom::Start(meta_offset))?;
|
Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
|
||||||
reader.read_to_end(&mut buffer)?;
|
None => return Err(Error::InvalidDocumentFormat),
|
||||||
let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
|
};
|
||||||
|
|
||||||
reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
|
Ok(DocumentsBatchReader { cursor, fields_index })
|
||||||
buffer.clear();
|
|
||||||
|
|
||||||
let reader = BufReader::new(reader);
|
|
||||||
|
|
||||||
Ok(Self { reader, metadata, buffer, seen_documents: 0 })
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
|
pub fn documents_count(&self) -> u32 {
|
||||||
/// reference to the addition index.
|
self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
|
||||||
pub fn next_document_with_index<'a>(
|
|
||||||
&'a mut self,
|
|
||||||
) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
|
|
||||||
if self.seen_documents < self.metadata.count {
|
|
||||||
let doc_len = self.reader.read_u32::<BigEndian>()?;
|
|
||||||
self.buffer.resize(doc_len as usize, 0);
|
|
||||||
self.reader.read_exact(&mut self.buffer)?;
|
|
||||||
self.seen_documents += 1;
|
|
||||||
|
|
||||||
let reader = KvReader::new(&self.buffer);
|
|
||||||
Ok(Some((&self.metadata.index, reader)))
|
|
||||||
} else {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the fields index for the documents batch.
|
|
||||||
pub fn index(&self) -> &DocumentsBatchIndex {
|
|
||||||
&self.metadata.index
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the number of documents in the reader.
|
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.metadata.count
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
self.len() == 0
|
self.cursor.len().saturating_sub(1) == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
|
||||||
|
&self.fields_index
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This method returns a forward cursor over the documents.
|
||||||
|
pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
|
||||||
|
let DocumentsBatchReader { cursor, fields_index } = self;
|
||||||
|
let mut cursor = DocumentsBatchCursor { cursor };
|
||||||
|
cursor.reset();
|
||||||
|
(cursor, fields_index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A forward cursor over the documents in a `DocumentsBatchReader`.
|
||||||
|
pub struct DocumentsBatchCursor<R> {
|
||||||
|
cursor: grenad::ReaderCursor<R>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R> DocumentsBatchCursor<R> {
|
||||||
|
/// Resets the cursor to be able to read from the start again.
|
||||||
|
pub fn reset(&mut self) {
|
||||||
|
self.cursor.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
|
||||||
|
/// Returns the next document, starting from the first one. Subsequent calls to
|
||||||
|
/// `next_document` advance the document reader until all the documents have been read.
|
||||||
|
pub fn next_document(
|
||||||
|
&mut self,
|
||||||
|
) -> Result<Option<KvReader<FieldId>>, DocumentsBatchCursorError> {
|
||||||
|
match self.cursor.move_on_next()? {
|
||||||
|
Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
|
||||||
|
Ok(Some(KvReader::new(value)))
|
||||||
|
}
|
||||||
|
_otherwise => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The possible errors returned by the `DocumentsBatchCursor` when iterating over the documents.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum DocumentsBatchCursorError {
|
||||||
|
Grenad(grenad::Error),
|
||||||
|
SerdeJson(serde_json::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<grenad::Error> for DocumentsBatchCursorError {
|
||||||
|
fn from(error: grenad::Error) -> DocumentsBatchCursorError {
|
||||||
|
DocumentsBatchCursorError::Grenad(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for DocumentsBatchCursorError {
|
||||||
|
fn from(error: serde_json::Error) -> DocumentsBatchCursorError {
|
||||||
|
DocumentsBatchCursorError::SerdeJson(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl error::Error for DocumentsBatchCursorError {}
|
||||||
|
|
||||||
|
impl fmt::Display for DocumentsBatchCursorError {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
DocumentsBatchCursorError::Grenad(e) => e.fmt(f),
|
||||||
|
DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,14 +1,11 @@
|
|||||||
use std::collections::BTreeMap;
|
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io::{Cursor, Write};
|
use std::io::Write;
|
||||||
|
|
||||||
use byteorder::WriteBytesExt;
|
|
||||||
use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
|
use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
|
||||||
use serde::Deserialize;
|
|
||||||
use serde_json::Value;
|
|
||||||
|
|
||||||
use super::{ByteCounter, DocumentsBatchIndex, Error};
|
use super::Error;
|
||||||
use crate::FieldId;
|
use crate::documents::DocumentsBatchBuilder;
|
||||||
|
use crate::Object;
|
||||||
|
|
||||||
macro_rules! tri {
|
macro_rules! tri {
|
||||||
($e:expr) => {
|
($e:expr) => {
|
||||||
@ -19,54 +16,15 @@ macro_rules! tri {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);
|
|
||||||
|
|
||||||
impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
|
|
||||||
type Value = FieldId;
|
|
||||||
|
|
||||||
fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
deserializer.deserialize_str(self)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
|
|
||||||
type Value = FieldId;
|
|
||||||
|
|
||||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
|
||||||
where
|
|
||||||
E: serde::de::Error,
|
|
||||||
{
|
|
||||||
Ok(self.0.insert(v))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
write!(f, "a string")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ValueDeserializer;
|
|
||||||
|
|
||||||
impl<'de> DeserializeSeed<'de> for ValueDeserializer {
|
|
||||||
type Value = serde_json::Value;
|
|
||||||
|
|
||||||
fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
serde_json::Value::deserialize(deserializer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct DocumentVisitor<'a, W> {
|
pub struct DocumentVisitor<'a, W> {
|
||||||
pub inner: &'a mut ByteCounter<W>,
|
inner: &'a mut DocumentsBatchBuilder<W>,
|
||||||
pub index: &'a mut DocumentsBatchIndex,
|
object: Object,
|
||||||
pub obkv_buffer: &'a mut Vec<u8>,
|
}
|
||||||
pub value_buffer: &'a mut Vec<u8>,
|
|
||||||
pub values: &'a mut BTreeMap<FieldId, Value>,
|
impl<'a, W> DocumentVisitor<'a, W> {
|
||||||
pub count: &'a mut usize,
|
pub fn new(inner: &'a mut DocumentsBatchBuilder<W>) -> Self {
|
||||||
|
DocumentVisitor { inner, object: Object::new() }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
|
impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
|
||||||
@ -88,28 +46,12 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
|
|||||||
where
|
where
|
||||||
A: MapAccess<'de>,
|
A: MapAccess<'de>,
|
||||||
{
|
{
|
||||||
while let Some((key, value)) =
|
self.object.clear();
|
||||||
map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
|
while let Some((key, value)) = map.next_entry()? {
|
||||||
{
|
self.object.insert(key, value);
|
||||||
self.values.insert(key, value);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.obkv_buffer.clear();
|
tri!(self.inner.append_json_object(&self.object));
|
||||||
let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
|
|
||||||
for (key, value) in self.values.iter() {
|
|
||||||
self.value_buffer.clear();
|
|
||||||
// This is guaranteed to work
|
|
||||||
tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
|
|
||||||
tri!(obkv.insert(*key, &self.value_buffer));
|
|
||||||
}
|
|
||||||
|
|
||||||
let reader = tri!(obkv.into_inner()).into_inner();
|
|
||||||
|
|
||||||
tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
|
|
||||||
tri!(self.inner.write_all(reader));
|
|
||||||
|
|
||||||
*self.count += 1;
|
|
||||||
self.values.clear();
|
|
||||||
|
|
||||||
Ok(Ok(()))
|
Ok(Ok(()))
|
||||||
}
|
}
|
||||||
|
@ -4,12 +4,11 @@ use std::{io, str};
|
|||||||
|
|
||||||
use heed::{Error as HeedError, MdbError};
|
use heed::{Error as HeedError, MdbError};
|
||||||
use rayon::ThreadPoolBuildError;
|
use rayon::ThreadPoolBuildError;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::Value;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
use crate::{CriterionError, DocumentId, FieldId, SortError};
|
use crate::documents::{self, DocumentsBatchCursorError};
|
||||||
|
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
|
||||||
pub type Object = Map<String, Value>;
|
|
||||||
|
|
||||||
pub fn is_reserved_keyword(keyword: &str) -> bool {
|
pub fn is_reserved_keyword(keyword: &str) -> bool {
|
||||||
["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword)
|
["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword)
|
||||||
@ -37,6 +36,8 @@ pub enum InternalError {
|
|||||||
FieldIdMappingMissingEntry { key: FieldId },
|
FieldIdMappingMissingEntry { key: FieldId },
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Fst(#[from] fst::Error),
|
Fst(#[from] fst::Error),
|
||||||
|
#[error(transparent)]
|
||||||
|
DocumentsError(#[from] documents::Error),
|
||||||
#[error("Invalid compression type have been specified to grenad.")]
|
#[error("Invalid compression type have been specified to grenad.")]
|
||||||
GrenadInvalidCompressionType,
|
GrenadInvalidCompressionType,
|
||||||
#[error("Invalid grenad file with an invalid version format.")]
|
#[error("Invalid grenad file with an invalid version format.")]
|
||||||
@ -123,6 +124,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
|||||||
MaxDatabaseSizeReached,
|
MaxDatabaseSizeReached,
|
||||||
#[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
|
#[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
|
||||||
MissingDocumentId { primary_key: String, document: Object },
|
MissingDocumentId { primary_key: String, document: Object },
|
||||||
|
#[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
|
||||||
|
TooManyDocumentIds { primary_key: String, document: Object },
|
||||||
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
|
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
|
||||||
MissingPrimaryKey,
|
MissingPrimaryKey,
|
||||||
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
|
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
|
||||||
@ -141,13 +144,19 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
|||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum GeoError {
|
pub enum GeoError {
|
||||||
|
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
|
||||||
|
NotAnObject { document_id: Value, value: Value },
|
||||||
|
#[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
|
||||||
|
MissingLatitudeAndLongitude { document_id: Value },
|
||||||
#[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
|
#[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
|
||||||
MissingLatitude { document_id: Value },
|
MissingLatitude { document_id: Value },
|
||||||
#[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")]
|
#[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")]
|
||||||
MissingLongitude { document_id: Value },
|
MissingLongitude { document_id: Value },
|
||||||
#[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")]
|
#[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")]
|
||||||
|
BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value },
|
||||||
|
#[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")]
|
||||||
BadLatitude { document_id: Value, value: Value },
|
BadLatitude { document_id: Value, value: Value },
|
||||||
#[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")]
|
#[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")]
|
||||||
BadLongitude { document_id: Value, value: Value },
|
BadLongitude { document_id: Value, value: Value },
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,6 +187,7 @@ macro_rules! error_from_sub_error {
|
|||||||
error_from_sub_error! {
|
error_from_sub_error! {
|
||||||
FieldIdMapMissingEntry => InternalError,
|
FieldIdMapMissingEntry => InternalError,
|
||||||
fst::Error => InternalError,
|
fst::Error => InternalError,
|
||||||
|
documents::Error => InternalError,
|
||||||
str::Utf8Error => InternalError,
|
str::Utf8Error => InternalError,
|
||||||
ThreadPoolBuildError => InternalError,
|
ThreadPoolBuildError => InternalError,
|
||||||
SerializationError => InternalError,
|
SerializationError => InternalError,
|
||||||
@ -203,6 +213,15 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<DocumentsBatchCursorError> for Error {
|
||||||
|
fn from(error: DocumentsBatchCursorError) -> Error {
|
||||||
|
match error {
|
||||||
|
DocumentsBatchCursorError::Grenad(e) => Error::from(e),
|
||||||
|
DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<Infallible> for Error {
|
impl From<Infallible> for Error {
|
||||||
fn from(_error: Infallible) -> Error {
|
fn from(_error: Infallible) -> Error {
|
||||||
unreachable!()
|
unreachable!()
|
||||||
|
@ -1212,10 +1212,11 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1234,7 +1235,7 @@ pub(crate) mod tests {
|
|||||||
// we add all the documents a second time. we are supposed to get the same
|
// we add all the documents a second time. we are supposed to get the same
|
||||||
// field_distribution in the end
|
// field_distribution in the end
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let content = documents!([
|
let content = documents!([
|
||||||
@ -1242,7 +1243,8 @@ pub(crate) mod tests {
|
|||||||
{ "id": 2, "name": "bob", "age": 20 },
|
{ "id": 2, "name": "bob", "age": 20 },
|
||||||
{ "id": 2, "name": "bob", "age": 20 },
|
{ "id": 2, "name": "bob", "age": 20 },
|
||||||
]);
|
]);
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1265,10 +1267,11 @@ pub(crate) mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1333,10 +1336,11 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1390,10 +1394,11 @@ pub(crate) mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault;
|
|||||||
pub use filter_parser::{Condition, FilterCondition};
|
pub use filter_parser::{Condition, FilterCondition};
|
||||||
use fxhash::{FxHasher32, FxHasher64};
|
use fxhash::{FxHasher32, FxHasher64};
|
||||||
pub use grenad::CompressionType;
|
pub use grenad::CompressionType;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::Value;
|
||||||
pub use {charabia as tokenizer, heed};
|
pub use {charabia as tokenizer, heed};
|
||||||
|
|
||||||
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
||||||
@ -43,20 +43,21 @@ pub use self::search::{
|
|||||||
|
|
||||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||||
|
|
||||||
|
pub type Attribute = u32;
|
||||||
|
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
||||||
|
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||||
|
pub type DocumentId = u32;
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
||||||
|
pub type FieldDistribution = BTreeMap<String, u64>;
|
||||||
|
pub type FieldId = u16;
|
||||||
|
pub type Object = serde_json::Map<String, serde_json::Value>;
|
||||||
|
pub type Position = u32;
|
||||||
|
pub type RelativePosition = u16;
|
||||||
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
|
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
|
||||||
pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
|
pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
|
||||||
pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
|
pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
|
||||||
pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
|
pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
|
||||||
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
|
||||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
|
||||||
pub type Attribute = u32;
|
|
||||||
pub type DocumentId = u32;
|
|
||||||
pub type FieldId = u16;
|
|
||||||
pub type Position = u32;
|
|
||||||
pub type RelativePosition = u16;
|
|
||||||
pub type FieldDistribution = BTreeMap<String, u64>;
|
|
||||||
|
|
||||||
/// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata
|
/// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata
|
||||||
/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point
|
/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point
|
||||||
@ -82,7 +83,7 @@ pub fn obkv_to_json(
|
|||||||
displayed_fields: &[FieldId],
|
displayed_fields: &[FieldId],
|
||||||
fields_ids_map: &FieldsIdsMap,
|
fields_ids_map: &FieldsIdsMap,
|
||||||
obkv: obkv::KvReaderU16,
|
obkv: obkv::KvReaderU16,
|
||||||
) -> Result<Map<String, Value>> {
|
) -> Result<Object> {
|
||||||
displayed_fields
|
displayed_fields
|
||||||
.iter()
|
.iter()
|
||||||
.copied()
|
.copied()
|
||||||
|
@ -35,7 +35,7 @@ mod test {
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{json, Value};
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::index::Index;
|
use crate::index::Index;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
@ -43,14 +43,11 @@ mod test {
|
|||||||
};
|
};
|
||||||
use crate::{DocumentId, FieldId, BEU32};
|
use crate::{DocumentId, FieldId, BEU32};
|
||||||
|
|
||||||
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
|
static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
|
||||||
|
|
||||||
fn generate_documents() -> Vec<u8> {
|
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
let num_docs = rng.gen_range(10..30);
|
let num_docs = rng.gen_range(10..30);
|
||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
let txts = ["Toto", "Titi", "Tata"];
|
let txts = ["Toto", "Titi", "Tata"];
|
||||||
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
||||||
let cat_ints = (1..10).collect::<Vec<_>>();
|
let cat_ints = (1..10).collect::<Vec<_>>();
|
||||||
@ -63,7 +60,7 @@ mod test {
|
|||||||
let mut sample_ints = cat_ints.clone();
|
let mut sample_ints = cat_ints.clone();
|
||||||
sample_ints.shuffle(&mut rng);
|
sample_ints.shuffle(&mut rng);
|
||||||
|
|
||||||
let doc = json!({
|
let json = json!({
|
||||||
"id": i,
|
"id": i,
|
||||||
"txt": txt,
|
"txt": txt,
|
||||||
"cat-int": rng.gen_range(0..3),
|
"cat-int": rng.gen_range(0..3),
|
||||||
@ -71,13 +68,16 @@ mod test {
|
|||||||
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
||||||
});
|
});
|
||||||
|
|
||||||
let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
|
let object = match json {
|
||||||
builder.extend_from_json(doc).unwrap();
|
Value::Object(object) => object,
|
||||||
|
_ => panic!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.append_json_object(&object).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.into_inner().unwrap()
|
||||||
cursor.into_inner()
|
});
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a temporary index populated with random test documents, the FieldId for the
|
/// Returns a temporary index populated with random test documents, the FieldId for the
|
||||||
/// distinct attribute, and the RoaringBitmap with the document ids.
|
/// distinct attribute, and the RoaringBitmap with the document ids.
|
||||||
@ -97,20 +97,22 @@ mod test {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut addition =
|
let addition =
|
||||||
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
|
|
||||||
let reader =
|
let reader =
|
||||||
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
addition.add_documents(reader).unwrap();
|
let (addition, user_error) = addition.add_documents(reader).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
addition.execute().unwrap();
|
addition.execute().unwrap();
|
||||||
|
|
||||||
let fields_map = index.fields_ids_map(&txn).unwrap();
|
let fields_map = index.fields_ids_map(&txn).unwrap();
|
||||||
let fid = fields_map.id(&distinct).unwrap();
|
let fid = fields_map.id(&distinct).unwrap();
|
||||||
|
|
||||||
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
|
||||||
let map = (0..documents.len() as u32).collect();
|
let map = (0..documents.documents_count() as u32).collect();
|
||||||
|
|
||||||
txn.commit().unwrap();
|
txn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -648,10 +648,11 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@ -100,9 +100,10 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// Clear all documents from the database.
|
// Clear all documents from the database.
|
||||||
|
@ -657,13 +657,13 @@ mod tests {
|
|||||||
fn insert_documents<'t, R: std::io::Read + std::io::Seek>(
|
fn insert_documents<'t, R: std::io::Read + std::io::Seek>(
|
||||||
wtxn: &mut RwTxn<'t, '_>,
|
wtxn: &mut RwTxn<'t, '_>,
|
||||||
index: &'t Index,
|
index: &'t Index,
|
||||||
documents: crate::documents::DocumentBatchReader<R>,
|
documents: crate::documents::DocumentsBatchReader<R>,
|
||||||
) {
|
) {
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -701,9 +701,10 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
||||||
@ -736,9 +737,10 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// Delete not all of the documents but some of them.
|
// Delete not all of the documents but some of them.
|
||||||
|
365
milli/src/update/index_documents/enrich.rs
Normal file
365
milli/src/update/index_documents/enrich.rs
Normal file
@ -0,0 +1,365 @@
|
|||||||
|
use std::io::{Read, Seek};
|
||||||
|
use std::result::Result as StdResult;
|
||||||
|
use std::{fmt, iter};
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
|
||||||
|
use crate::error::{GeoError, InternalError, UserError};
|
||||||
|
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||||
|
use crate::{FieldId, Index, Object, Result};
|
||||||
|
|
||||||
|
/// The symbol used to define levels in a nested primary key.
|
||||||
|
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
||||||
|
|
||||||
|
/// The default primary key name that is used when none is specified.
|
||||||
|
const DEFAULT_PRIMARY_KEY: &str = "id";
|
||||||
|
|
||||||
|
/// This function validates and enrich the documents by checking that:
|
||||||
|
/// - we can infer a primary key,
|
||||||
|
/// - all the documents id exist and are extracted,
|
||||||
|
/// - the validity of them but also,
|
||||||
|
/// - the validity of the `_geo` field depending on the settings.
|
||||||
|
pub fn enrich_documents_batch<R: Read + Seek>(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
autogenerate_docids: bool,
|
||||||
|
reader: DocumentsBatchReader<R>,
|
||||||
|
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||||
|
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||||
|
|
||||||
|
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
||||||
|
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||||
|
|
||||||
|
// The primary key *field id* that has already been set for this index or the one
|
||||||
|
// we will guess by searching for the first key that contains "id" as a substring.
|
||||||
|
let primary_key = match index.primary_key(rtxn)? {
|
||||||
|
Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
|
||||||
|
PrimaryKey::nested(primary_key)
|
||||||
|
}
|
||||||
|
Some(primary_key) => match documents_batch_index.id(primary_key) {
|
||||||
|
Some(id) => PrimaryKey::flat(primary_key, id),
|
||||||
|
None if autogenerate_docids => {
|
||||||
|
PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
return match cursor.next_document()? {
|
||||||
|
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: primary_key.to_string(),
|
||||||
|
document: obkv_to_object(&first_document, &documents_batch_index)?,
|
||||||
|
})),
|
||||||
|
None => Ok(Err(UserError::MissingPrimaryKey)),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
let guessed = documents_batch_index
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
|
||||||
|
.min_by_key(|(fid, _)| *fid);
|
||||||
|
match guessed {
|
||||||
|
Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
|
||||||
|
None if autogenerate_docids => PrimaryKey::flat(
|
||||||
|
DEFAULT_PRIMARY_KEY,
|
||||||
|
documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||||
|
),
|
||||||
|
None => return Ok(Err(UserError::MissingPrimaryKey)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// If the settings specifies that a _geo field must be used therefore we must check the
|
||||||
|
// validity of it in all the documents of this batch and this is when we return `Some`.
|
||||||
|
let geo_field_id = match documents_batch_index.id("_geo") {
|
||||||
|
Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
|
||||||
|
_otherwise => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut count = 0;
|
||||||
|
while let Some(document) = cursor.next_document()? {
|
||||||
|
let document_id = match fetch_or_generate_document_id(
|
||||||
|
&document,
|
||||||
|
&documents_batch_index,
|
||||||
|
primary_key,
|
||||||
|
autogenerate_docids,
|
||||||
|
&mut uuid_buffer,
|
||||||
|
count,
|
||||||
|
)? {
|
||||||
|
Ok(document_id) => document_id,
|
||||||
|
Err(user_error) => return Ok(Err(user_error)),
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
|
||||||
|
if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? {
|
||||||
|
return Ok(Err(UserError::from(user_error)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?;
|
||||||
|
external_ids.insert(count.to_be_bytes(), document_id)?;
|
||||||
|
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let external_ids = writer_into_reader(external_ids)?;
|
||||||
|
let primary_key_name = primary_key.name().to_string();
|
||||||
|
let reader = EnrichedDocumentsBatchReader::new(
|
||||||
|
DocumentsBatchReader::new(cursor, documents_batch_index),
|
||||||
|
primary_key_name,
|
||||||
|
external_ids,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(Ok(reader))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieve the document id after validating it, returning a `UserError`
|
||||||
|
/// if the id is invalid or can't be guessed.
|
||||||
|
fn fetch_or_generate_document_id(
|
||||||
|
document: &obkv::KvReader<FieldId>,
|
||||||
|
documents_batch_index: &DocumentsBatchIndex,
|
||||||
|
primary_key: PrimaryKey,
|
||||||
|
autogenerate_docids: bool,
|
||||||
|
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||||
|
count: u32,
|
||||||
|
) -> Result<StdResult<DocumentId, UserError>> {
|
||||||
|
match primary_key {
|
||||||
|
PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
|
||||||
|
match document.get(primary_key_id) {
|
||||||
|
Some(document_id_bytes) => {
|
||||||
|
let document_id = serde_json::from_slice(document_id_bytes)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
match validate_document_id_value(document_id)? {
|
||||||
|
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
||||||
|
Err(user_error) => Ok(Err(user_error)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None if autogenerate_docids => {
|
||||||
|
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||||
|
Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
|
||||||
|
}
|
||||||
|
None => Ok(Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: primary_key.to_string(),
|
||||||
|
document: obkv_to_object(&document, &documents_batch_index)?,
|
||||||
|
})),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nested @ PrimaryKey::Nested { .. } => {
|
||||||
|
let mut matching_documents_ids = Vec::new();
|
||||||
|
for (first_level_name, right) in nested.possible_level_names() {
|
||||||
|
if let Some(field_id) = documents_batch_index.id(first_level_name) {
|
||||||
|
if let Some(value_bytes) = document.get(field_id) {
|
||||||
|
let object = serde_json::from_slice(value_bytes)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
fetch_matching_values(object, right, &mut matching_documents_ids);
|
||||||
|
|
||||||
|
if matching_documents_ids.len() >= 2 {
|
||||||
|
return Ok(Err(UserError::TooManyDocumentIds {
|
||||||
|
primary_key: nested.name().to_string(),
|
||||||
|
document: obkv_to_object(&document, &documents_batch_index)?,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match matching_documents_ids.pop() {
|
||||||
|
Some(document_id) => match validate_document_id_value(document_id)? {
|
||||||
|
Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
|
||||||
|
Err(user_error) => Ok(Err(user_error)),
|
||||||
|
},
|
||||||
|
None => Ok(Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: nested.name().to_string(),
|
||||||
|
document: obkv_to_object(&document, &documents_batch_index)?,
|
||||||
|
})),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A type that represent the type of primary key that has been set
|
||||||
|
/// for this index, a classic flat one or a nested one.
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
enum PrimaryKey<'a> {
|
||||||
|
Flat { name: &'a str, field_id: FieldId },
|
||||||
|
Nested { name: &'a str },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrimaryKey<'_> {
|
||||||
|
fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
|
||||||
|
PrimaryKey::Flat { name, field_id }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn nested(name: &str) -> PrimaryKey {
|
||||||
|
PrimaryKey::Nested { name }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
PrimaryKey::Flat { name, .. } => name,
|
||||||
|
PrimaryKey::Nested { name } => name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||||
|
/// can have depending of the first level name and deepnes of the objects.
|
||||||
|
fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||||
|
let name = self.name();
|
||||||
|
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
||||||
|
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
||||||
|
.chain(iter::once((name, "")))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||||
|
///
|
||||||
|
/// In case the document id has been auto-generated, the document nth is kept to help
|
||||||
|
/// users debug if there is an issue with the document itself.
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub enum DocumentId {
|
||||||
|
Retrieved { value: String },
|
||||||
|
Generated { value: String, document_nth: u32 },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocumentId {
|
||||||
|
fn retrieved(value: String) -> DocumentId {
|
||||||
|
DocumentId::Retrieved { value }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn generated(value: String, document_nth: u32) -> DocumentId {
|
||||||
|
DocumentId::Generated { value, document_nth }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn debug(&self) -> String {
|
||||||
|
format!("{:?}", self)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_generated(&self) -> bool {
|
||||||
|
matches!(self, DocumentId::Generated { .. })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn value(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
DocumentId::Retrieved { value } => value,
|
||||||
|
DocumentId::Generated { value, .. } => value,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for DocumentId {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
DocumentId::Retrieved { value } => write!(f, "{:?}", value),
|
||||||
|
DocumentId::Generated { value, document_nth } => {
|
||||||
|
write!(f, "{{{:?}}} of the {}nth document", value, document_nth)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn starts_with(selector: &str, key: &str) -> bool {
|
||||||
|
selector.strip_prefix(key).map_or(false, |tail| {
|
||||||
|
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
||||||
|
otherwise => output.push(otherwise),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn fetch_matching_values_in_object(
|
||||||
|
object: Object,
|
||||||
|
selector: &str,
|
||||||
|
base_key: &str,
|
||||||
|
output: &mut Vec<Value>,
|
||||||
|
) {
|
||||||
|
for (key, value) in object {
|
||||||
|
let base_key = if base_key.is_empty() {
|
||||||
|
key.to_string()
|
||||||
|
} else {
|
||||||
|
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
||||||
|
};
|
||||||
|
|
||||||
|
if starts_with(selector, &base_key) {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => {
|
||||||
|
fetch_matching_values_in_object(object, selector, &base_key, output)
|
||||||
|
}
|
||||||
|
value => output.push(value),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a trimmed version of the document id or `None` if it is invalid.
///
/// A valid document id is not empty once trimmed and is only composed of
/// ASCII alphanumeric characters, hyphens (`-`) and underscores (`_`).
pub fn validate_document_id(document_id: &str) -> Option<&str> {
    let trimmed = document_id.trim();
    if trimmed.is_empty() {
        return None;
    }

    let is_allowed = |c: char| c.is_ascii_alphanumeric() || c == '-' || c == '_';
    if trimmed.chars().all(is_allowed) {
        Some(trimmed)
    } else {
        None
    }
}
|
||||||
|
|
||||||
|
/// Parses a Json encoded document id and validate it, returning a user error when it is one.
|
||||||
|
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
||||||
|
match document_id {
|
||||||
|
Value::String(string) => match validate_document_id(&string) {
|
||||||
|
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
||||||
|
Some(s) => Ok(Ok(s.to_string())),
|
||||||
|
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
||||||
|
},
|
||||||
|
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
||||||
|
content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||||
|
/// in the `Err` variant if it failed.
|
||||||
|
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||||
|
let number = match value {
|
||||||
|
Value::Number(ref n) => match n.as_f64() {
|
||||||
|
Some(number) => number,
|
||||||
|
None => return Err(value),
|
||||||
|
},
|
||||||
|
Value::String(ref s) => match s.parse::<f64>() {
|
||||||
|
Ok(number) => number,
|
||||||
|
Err(_) => return Err(value),
|
||||||
|
},
|
||||||
|
value => return Err(value),
|
||||||
|
};
|
||||||
|
|
||||||
|
if number.is_finite() {
|
||||||
|
Ok(number)
|
||||||
|
} else {
|
||||||
|
Err(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
|
||||||
|
use GeoError::*;
|
||||||
|
let debug_id = || Value::from(id.debug());
|
||||||
|
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
|
||||||
|
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
|
||||||
|
(Some(lat), Some(lng)) => {
|
||||||
|
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
|
||||||
|
(Ok(_), Ok(_)) => Ok(Ok(())),
|
||||||
|
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
|
||||||
|
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
|
||||||
|
(Err(lat), Err(lng)) => {
|
||||||
|
Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })),
|
||||||
|
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
|
||||||
|
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
|
||||||
|
},
|
||||||
|
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
|
||||||
|
}
|
||||||
|
}
|
@ -1,12 +1,12 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::result::Result as StdResult;
|
|
||||||
|
|
||||||
use concat_arrays::concat_arrays;
|
use concat_arrays::concat_arrays;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::error::GeoError;
|
use crate::error::GeoError;
|
||||||
|
use crate::update::index_documents::extract_finite_float_from_value;
|
||||||
use crate::{FieldId, InternalError, Result};
|
use crate::{FieldId, InternalError, Result};
|
||||||
|
|
||||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||||
@ -29,9 +29,9 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
// since we only needs the primary key when we throw an error we create this getter to
|
// since we only need the primary key when we throw an error we create this getter to
|
||||||
// lazily get it when needed
|
// lazily get it when needed
|
||||||
let primary_key = || -> Value {
|
let document_id = || -> Value {
|
||||||
let primary_key = obkv.get(primary_key_id).unwrap();
|
let document_id = obkv.get(primary_key_id).unwrap();
|
||||||
serde_json::from_slice(primary_key).unwrap()
|
serde_json::from_slice(document_id).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
// first we get the two fields
|
// first we get the two fields
|
||||||
@ -40,32 +40,24 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
if let Some((lat, lng)) = lat.zip(lng) {
|
if let Some((lat, lng)) = lat.zip(lng) {
|
||||||
// then we extract the values
|
// then we extract the values
|
||||||
let lat = extract_float_from_value(
|
let lat = extract_finite_float_from_value(
|
||||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||||
)
|
)
|
||||||
.map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?;
|
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||||
|
|
||||||
let lng = extract_float_from_value(
|
let lng = extract_finite_float_from_value(
|
||||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||||
)
|
)
|
||||||
.map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?;
|
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||||
|
|
||||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||||
writer.insert(docid_bytes, bytes)?;
|
writer.insert(docid_bytes, bytes)?;
|
||||||
} else if lat.is_none() && lng.is_some() {
|
} else if lat.is_none() && lng.is_some() {
|
||||||
return Err(GeoError::MissingLatitude { document_id: primary_key() })?;
|
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
|
||||||
} else if lat.is_some() && lng.is_none() {
|
} else if lat.is_some() && lng.is_none() {
|
||||||
return Err(GeoError::MissingLongitude { document_id: primary_key() })?;
|
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(writer_into_reader(writer)?)
|
Ok(writer_into_reader(writer)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
|
|
||||||
match value {
|
|
||||||
Value::Number(ref n) => n.as_f64().ok_or(value),
|
|
||||||
Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
|
|
||||||
value => Err(value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
mod enrich;
|
||||||
mod extract;
|
mod extract;
|
||||||
mod helpers;
|
mod helpers;
|
||||||
mod transform;
|
mod transform;
|
||||||
@ -7,6 +8,7 @@ use std::collections::HashSet;
|
|||||||
use std::io::{Cursor, Read, Seek};
|
use std::io::{Cursor, Read, Seek};
|
||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
use crossbeam_channel::{Receiver, Sender};
|
use crossbeam_channel::{Receiver, Sender};
|
||||||
use heed::types::Str;
|
use heed::types::Str;
|
||||||
@ -17,6 +19,11 @@ use serde::{Deserialize, Serialize};
|
|||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||||
|
|
||||||
|
use self::enrich::enrich_documents_batch;
|
||||||
|
pub use self::enrich::{
|
||||||
|
extract_finite_float_from_value, validate_document_id, validate_document_id_value,
|
||||||
|
validate_geo_from_json, DocumentId,
|
||||||
|
};
|
||||||
pub use self::helpers::{
|
pub use self::helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||||
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||||
@ -25,13 +32,14 @@ pub use self::helpers::{
|
|||||||
};
|
};
|
||||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
use crate::documents::DocumentBatchReader;
|
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
|
use crate::error::UserError;
|
||||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
|
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
|
||||||
WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
||||||
};
|
};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec, UserError};
|
use crate::{Index, Result, RoaringBitmapCodec};
|
||||||
|
|
||||||
static MERGED_DATABASE_COUNT: usize = 7;
|
static MERGED_DATABASE_COUNT: usize = 7;
|
||||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
static PREFIX_DATABASE_COUNT: usize = 5;
|
||||||
@ -117,29 +125,42 @@ where
|
|||||||
|
|
||||||
/// Adds a batch of documents to the current builder.
|
/// Adds a batch of documents to the current builder.
|
||||||
///
|
///
|
||||||
/// Since the documents are progressively added to the writer, a failure will cause a stale
|
/// Since the documents are progressively added to the writer, a failure will cause only
|
||||||
/// builder, and the builder must be discarded.
|
/// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward.
|
||||||
///
|
///
|
||||||
/// Returns the number of documents added to the builder.
|
/// Returns the number of documents added to the builder.
|
||||||
pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64>
|
pub fn add_documents<R: Read + Seek>(
|
||||||
where
|
mut self,
|
||||||
R: Read + Seek,
|
reader: DocumentsBatchReader<R>,
|
||||||
{
|
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||||
// Early return when there is no document to add
|
// Early return when there is no document to add
|
||||||
if reader.is_empty() {
|
if reader.is_empty() {
|
||||||
return Ok(0);
|
return Ok((self, Ok(0)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We check for user errors in this validator and if there is one, we can return
|
||||||
|
// the `IndexDocument` struct as it is valid to send more documents into it.
|
||||||
|
// However, if there is an internal error we throw it away!
|
||||||
|
let enriched_documents_reader = match enrich_documents_batch(
|
||||||
|
self.wtxn,
|
||||||
|
self.index,
|
||||||
|
self.config.autogenerate_docids,
|
||||||
|
reader,
|
||||||
|
)? {
|
||||||
|
Ok(reader) => reader,
|
||||||
|
Err(user_error) => return Ok((self, Err(user_error))),
|
||||||
|
};
|
||||||
|
|
||||||
let indexed_documents = self
|
let indexed_documents = self
|
||||||
.transform
|
.transform
|
||||||
.as_mut()
|
.as_mut()
|
||||||
.expect("Invalid document addition state")
|
.expect("Invalid document addition state")
|
||||||
.read_documents(reader, self.wtxn, &self.progress)?
|
.read_documents(enriched_documents_reader, self.wtxn, &self.progress)?
|
||||||
as u64;
|
as u64;
|
||||||
|
|
||||||
self.added_documents += indexed_documents;
|
self.added_documents += indexed_documents;
|
||||||
|
|
||||||
Ok(indexed_documents)
|
Ok((self, Ok(indexed_documents)))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[logging_timer::time("IndexDocuments::{}")]
|
#[logging_timer::time("IndexDocuments::{}")]
|
||||||
@ -590,9 +611,8 @@ mod tests {
|
|||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::documents::DocumentBatchBuilder;
|
use crate::documents::DocumentsBatchBuilder;
|
||||||
use crate::update::DeleteDocuments;
|
use crate::update::DeleteDocuments;
|
||||||
use crate::HashMap;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_document_replacement() {
|
fn simple_document_replacement() {
|
||||||
@ -611,10 +631,11 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -627,10 +648,11 @@ mod tests {
|
|||||||
// Second we send 1 document with id 1, to erase the previous ones.
|
// Second we send 1 document with id 1, to erase the previous ones.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
|
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -647,9 +669,11 @@ mod tests {
|
|||||||
{ "id": 2, "name": "updated kevina" },
|
{ "id": 2, "name": "updated kevina" },
|
||||||
{ "id": 3, "name": "updated benoit" }
|
{ "id": 3, "name": "updated benoit" }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
// Check that there is **always** 3 documents.
|
// Check that there is **always** 3 documents.
|
||||||
@ -679,10 +703,11 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::UpdateDocuments,
|
update_method: IndexDocumentsMethod::UpdateDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -707,9 +732,10 @@ mod tests {
|
|||||||
// Second we send 1 document with id 1, to force it to be merged with the previous one.
|
// Second we send 1 document with id 1, to force it to be merged with the previous one.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "id": 1, "age": 25 } ]);
|
let content = documents!([ { "id": 1, "age": 25 } ]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -750,9 +776,10 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
assert!(builder.add_documents(content).is_err());
|
let (_builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
assert!(user_error.is_err());
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
// Check that there is no document.
|
// Check that there is no document.
|
||||||
@ -779,10 +806,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -799,9 +827,10 @@ mod tests {
|
|||||||
// Second we send 1 document with the generated uuid, to erase the previous ones.
|
// Second we send 1 document with the generated uuid, to erase the previous ones.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
|
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -841,9 +870,10 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -858,9 +888,10 @@ mod tests {
|
|||||||
let content = documents!([ { "name": "new kevin" } ]);
|
let content = documents!([ { "name": "new kevin" } ]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -883,9 +914,10 @@ mod tests {
|
|||||||
let content = documents!([]);
|
let content = documents!([]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -909,19 +941,21 @@ mod tests {
|
|||||||
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
|
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(builder.add_documents(content).is_err());
|
let (_builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
assert!(user_error.is_err());
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
// First we send 1 document with a valid id.
|
// First we send 1 document with a valid id.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
// There is a space in the document id.
|
// There is a space in the document id.
|
||||||
let content = documents!([ { "id": 32, "name": "kevin" } ]);
|
let content = documents!([ { "id": 32, "name": "kevin" } ]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -948,9 +982,10 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -994,9 +1029,10 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1005,7 +1041,7 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::UpdateDocuments,
|
update_method: IndexDocumentsMethod::UpdateDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{
|
{
|
||||||
@ -1015,7 +1051,8 @@ mod tests {
|
|||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
@ -1042,9 +1079,10 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1084,10 +1122,11 @@ mod tests {
|
|||||||
{ "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
|
{ "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
|
||||||
{ "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
|
{ "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1123,10 +1162,11 @@ mod tests {
|
|||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{ "id": 0, "_geo": { "lng": 42 } }
|
{ "id": 0, "_geo": { "lng": 42 } }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
let error = builder.execute().unwrap_err();
|
let error = builder.execute().unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&error.to_string(),
|
&error.to_string(),
|
||||||
@ -1136,10 +1176,11 @@ mod tests {
|
|||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{ "id": 0, "_geo": { "lat": 42 } }
|
{ "id": 0, "_geo": { "lat": 42 } }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
let error = builder.execute().unwrap_err();
|
let error = builder.execute().unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&error.to_string(),
|
&error.to_string(),
|
||||||
@ -1149,40 +1190,43 @@ mod tests {
|
|||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{ "id": 0, "_geo": { "lat": "lol", "lng": 42 } }
|
{ "id": 0, "_geo": { "lat": "lol", "lng": 42 } }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
let error = builder.execute().unwrap_err();
|
let error = builder.execute().unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&error.to_string(),
|
&error.to_string(),
|
||||||
r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."#
|
r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."#
|
||||||
);
|
);
|
||||||
|
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{ "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } }
|
{ "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
let error = builder.execute().unwrap_err();
|
let error = builder.execute().unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&error.to_string(),
|
&error.to_string(),
|
||||||
r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."#
|
r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."#
|
||||||
);
|
);
|
||||||
|
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{ "id": 0, "_geo": { "lat": 12, "lng": "hello" } }
|
{ "id": 0, "_geo": { "lat": 12, "lng": "hello" } }
|
||||||
]);
|
]);
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
let error = builder.execute().unwrap_err();
|
let error = builder.execute().unwrap_err();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&error.to_string(),
|
&error.to_string(),
|
||||||
r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."#
|
r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."#
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1202,10 +1246,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
||||||
@ -1222,10 +1267,11 @@ mod tests {
|
|||||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||||
assert!(external_documents_ids.get("30").is_some());
|
assert!(external_documents_ids.get("30").is_some());
|
||||||
@ -1234,10 +1280,11 @@ mod tests {
|
|||||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1252,28 +1299,25 @@ mod tests {
|
|||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
let mut big_object = HashMap::new();
|
let mut big_object = serde_json::Map::new();
|
||||||
big_object.insert(S("id"), "wow");
|
big_object.insert(S("id"), serde_json::Value::from("wow"));
|
||||||
for i in 0..1000 {
|
for i in 0..1000 {
|
||||||
let key = i.to_string();
|
let key = i.to_string();
|
||||||
big_object.insert(key, "I am a text!");
|
big_object.insert(key, serde_json::Value::from("I am a text!"));
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_json_object(&big_object).unwrap();
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let vector = builder.into_inner().unwrap();
|
||||||
let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
|
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
builder.extend_from_json(big_object).unwrap();
|
|
||||||
builder.finish().unwrap();
|
|
||||||
cursor.set_position(0);
|
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1288,30 +1332,27 @@ mod tests {
|
|||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
let mut big_object = HashMap::new();
|
let mut big_object = serde_json::Map::new();
|
||||||
big_object.insert(S("id"), "wow");
|
big_object.insert(S("id"), serde_json::Value::from("wow"));
|
||||||
let content: String = (0..=u16::MAX)
|
let content: String = (0..=u16::MAX)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|p| p.to_string())
|
.map(|p| p.to_string())
|
||||||
.reduce(|a, b| a + " " + b.as_ref())
|
.reduce(|a, b| a + " " + b.as_ref())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
big_object.insert("content".to_string(), &content);
|
big_object.insert("content".to_string(), serde_json::Value::from(content));
|
||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
builder.append_json_object(&big_object).unwrap();
|
||||||
let big_object = serde_json::to_string(&big_object).unwrap();
|
let vector = builder.into_inner().unwrap();
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
|
|
||||||
builder.finish().unwrap();
|
|
||||||
cursor.set_position(0);
|
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1366,10 +1407,11 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1419,10 +1461,11 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1551,10 +1594,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1583,6 +1627,58 @@ mod tests {
|
|||||||
assert_eq!(documents_ids, vec![3]);
|
assert_eq!(documents_ids, vec![3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn retrieve_a_b_nested_document_id() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
builder.set_primary_key("a.b".to_owned());
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
|
||||||
|
let content = documents!({ "a" : { "b" : { "c" : 1 }}});
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
let (_builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
|
||||||
|
// There must be an issue with the primary key not present in the given document
|
||||||
|
user_error.unwrap_err();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn retrieve_a_b_c_nested_document_id() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
builder.set_primary_key("a.b.c".to_owned());
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
|
||||||
|
let content = documents!({ "a" : { "b" : { "c" : 1 }}});
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
|
||||||
|
assert!(external_documents_ids.get("1").is_some());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facets_generation() {
|
fn test_facets_generation() {
|
||||||
let path = tempfile::tempdir().unwrap();
|
let path = tempfile::tempdir().unwrap();
|
||||||
@ -1621,10 +1717,11 @@ mod tests {
|
|||||||
// index the documents
|
// index the documents
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1713,10 +1810,11 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1730,10 +1828,11 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1752,10 +1851,11 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1780,10 +1880,11 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1825,10 +1926,11 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
@ -1843,28 +1945,31 @@ mod tests {
|
|||||||
|
|
||||||
// Create 200 documents with a long text
|
// Create 200 documents with a long text
|
||||||
let content = {
|
let content = {
|
||||||
let documents: Vec<_> = (0..200i32)
|
let documents_iter = (0..200i32)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|i| serde_json::json!({ "id": i, "script": script }))
|
.map(|i| serde_json::json!({ "id": i, "script": script }))
|
||||||
.collect();
|
.filter_map(|json| match json {
|
||||||
|
serde_json::Value::Object(object) => Some(object),
|
||||||
|
_ => None,
|
||||||
|
});
|
||||||
|
|
||||||
let mut writer = std::io::Cursor::new(Vec::new());
|
let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
for object in documents_iter {
|
||||||
let documents = serde_json::to_vec(&documents).unwrap();
|
builder.append_json_object(&object).unwrap();
|
||||||
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
}
|
||||||
builder.finish().unwrap();
|
let vector = builder.into_inner().unwrap();
|
||||||
writer.set_position(0);
|
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap()
|
||||||
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Index those 200 long documents
|
// Index those 200 long documents
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// Create one long document
|
// Create one long document
|
||||||
@ -1875,10 +1980,11 @@ mod tests {
|
|||||||
// Index this one long document
|
// Index this one long document
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1892,7 +1998,7 @@ mod tests {
|
|||||||
let index = Index::new(options, tmp).unwrap();
|
let index = Index::new(options, tmp).unwrap();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexer_config = IndexerConfig::default();
|
let indexer_config = IndexerConfig::default();
|
||||||
let mut builder = IndexDocuments::new(
|
let builder = IndexDocuments::new(
|
||||||
&mut wtxn,
|
&mut wtxn,
|
||||||
&index,
|
&index,
|
||||||
&indexer_config,
|
&indexer_config,
|
||||||
@ -1921,8 +2027,10 @@ mod tests {
|
|||||||
"branch_id_number": 0
|
"branch_id_number": 0
|
||||||
}]};
|
}]};
|
||||||
|
|
||||||
builder.add_documents(doc1).unwrap();
|
let (builder, user_error) = builder.add_documents(doc1).unwrap();
|
||||||
builder.add_documents(doc2).unwrap();
|
user_error.unwrap();
|
||||||
|
let (builder, user_error) = builder.add_documents(doc2).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1931,4 +2039,51 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(ids.len(), map.len());
|
assert_eq!(ids.len(), map.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn primary_key_must_not_contain_floats() {
|
||||||
|
let tmp = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(4096 * 100);
|
||||||
|
let index = Index::new(options, tmp).unwrap();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let indexer_config = IndexerConfig::default();
|
||||||
|
let builder = IndexDocuments::new(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
&indexer_config,
|
||||||
|
IndexDocumentsConfig::default(),
|
||||||
|
|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let doc1 = documents! {[{
|
||||||
|
"id": -228142,
|
||||||
|
"title": "asdsad",
|
||||||
|
}]};
|
||||||
|
|
||||||
|
let doc2 = documents! {[{
|
||||||
|
"id": 228143.56,
|
||||||
|
"title": "something",
|
||||||
|
}]};
|
||||||
|
|
||||||
|
let doc3 = documents! {[{
|
||||||
|
"id": -228143.56,
|
||||||
|
"title": "something",
|
||||||
|
}]};
|
||||||
|
|
||||||
|
let doc4 = documents! {[{
|
||||||
|
"id": 2.0,
|
||||||
|
"title": "something",
|
||||||
|
}]};
|
||||||
|
|
||||||
|
let (builder, user_error) = builder.add_documents(doc1).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
let (builder, user_error) = builder.add_documents(doc2).unwrap();
|
||||||
|
assert!(user_error.is_err());
|
||||||
|
let (builder, user_error) = builder.add_documents(doc3).unwrap();
|
||||||
|
assert!(user_error.is_err());
|
||||||
|
let (_builder, user_error) = builder.add_documents(doc4).unwrap();
|
||||||
|
assert!(user_error.is_err());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,12 +9,12 @@ use heed::RoTxn;
|
|||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use obkv::{KvReader, KvWriter};
|
use obkv::{KvReader, KvWriter};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{Map, Value};
|
use serde_json::Value;
|
||||||
use smartstring::SmartString;
|
use smartstring::SmartString;
|
||||||
|
|
||||||
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
|
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
|
||||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||||
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
|
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||||
use crate::error::{Error, InternalError, UserError};
|
use crate::error::{Error, InternalError, UserError};
|
||||||
use crate::index::db_name;
|
use crate::index::db_name;
|
||||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||||
@ -23,8 +23,6 @@ use crate::{
|
|||||||
Result, BEU32,
|
Result, BEU32,
|
||||||
};
|
};
|
||||||
|
|
||||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
|
||||||
|
|
||||||
pub struct TransformOutput {
|
pub struct TransformOutput {
|
||||||
pub primary_key: String,
|
pub primary_key: String,
|
||||||
pub fields_ids_map: FieldsIdsMap,
|
pub fields_ids_map: FieldsIdsMap,
|
||||||
@ -84,18 +82,6 @@ fn create_fields_mapping(
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields.
|
|
||||||
/// It doesn't look in the subfield because we don't want to enable the
|
|
||||||
/// primary key inference on nested objects.
|
|
||||||
fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
|
|
||||||
index
|
|
||||||
.iter()
|
|
||||||
.sorted_by_key(|(k, _)| *k)
|
|
||||||
.map(|(_, v)| v)
|
|
||||||
.find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
|
|
||||||
.map(String::as_str)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, 'i> Transform<'a, 'i> {
|
impl<'a, 'i> Transform<'a, 'i> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
@ -152,7 +138,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
|
|
||||||
pub fn read_documents<R, F>(
|
pub fn read_documents<R, F>(
|
||||||
&mut self,
|
&mut self,
|
||||||
mut reader: DocumentBatchReader<R>,
|
reader: EnrichedDocumentsBatchReader<R>,
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
progress_callback: F,
|
progress_callback: F,
|
||||||
) -> Result<usize>
|
) -> Result<usize>
|
||||||
@ -160,33 +146,25 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
R: Read + Seek,
|
R: Read + Seek,
|
||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
{
|
{
|
||||||
let fields_index = reader.index();
|
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
|
||||||
|
|
||||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||||
|
|
||||||
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
|
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
|
||||||
|
|
||||||
let alternative_name = self
|
let primary_key = cursor.primary_key().to_string();
|
||||||
.index
|
let primary_key_id =
|
||||||
.primary_key(wtxn)?
|
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
|
||||||
.or_else(|| find_primary_key(fields_index))
|
|
||||||
.map(String::from);
|
|
||||||
|
|
||||||
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
|
|
||||||
self.index.primary_key(wtxn)?,
|
|
||||||
&mut self.fields_ids_map,
|
|
||||||
alternative_name,
|
|
||||||
self.autogenerate_docids,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let primary_key_id_nested = primary_key_name.contains('.');
|
|
||||||
|
|
||||||
let mut flattened_document = None;
|
|
||||||
let mut obkv_buffer = Vec::new();
|
let mut obkv_buffer = Vec::new();
|
||||||
let mut flattened_obkv_buffer = Vec::new();
|
|
||||||
let mut documents_count = 0;
|
let mut documents_count = 0;
|
||||||
let mut external_id_buffer = Vec::new();
|
let mut docid_buffer: Vec<u8> = Vec::new();
|
||||||
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
|
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
|
||||||
while let Some((addition_index, document)) = reader.next_document_with_index()? {
|
while let Some(enriched_document) = cursor.next_enriched_document()? {
|
||||||
|
let EnrichedDocument { document, document_id } = enriched_document;
|
||||||
|
|
||||||
|
// drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer
|
||||||
|
// does not keep references from the cursor between loop iterations
|
||||||
let mut field_buffer_cache = drop_and_reuse(field_buffer);
|
let mut field_buffer_cache = drop_and_reuse(field_buffer);
|
||||||
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||||
@ -194,52 +172,21 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When the document id has been auto-generated by the `enrich_documents_batch`
|
||||||
|
// we must insert this document id into the remapped document.
|
||||||
|
let external_id = document_id.value();
|
||||||
|
if document_id.is_generated() {
|
||||||
|
serde_json::to_writer(&mut docid_buffer, external_id)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer)));
|
||||||
|
}
|
||||||
|
|
||||||
for (k, v) in document.iter() {
|
for (k, v) in document.iter() {
|
||||||
let mapped_id =
|
let mapped_id =
|
||||||
*mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
|
*mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
|
||||||
field_buffer_cache.push((mapped_id, Cow::from(v)));
|
field_buffer_cache.push((mapped_id, Cow::from(v)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need to make sure that every document has a primary key. After we have remapped
|
|
||||||
// all the fields in the document, we try to find the primary key value. If we can find
|
|
||||||
// it, transform it into a string and validate it, and then update it in the
|
|
||||||
// document. If none is found, and we were told to generate missing document ids, then
|
|
||||||
// we create the missing field, and update the new document.
|
|
||||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
|
||||||
let external_id = if primary_key_id_nested {
|
|
||||||
let mut field_buffer_cache = field_buffer_cache.clone();
|
|
||||||
self.flatten_from_field_mapping(
|
|
||||||
&mapping,
|
|
||||||
&document,
|
|
||||||
&mut flattened_obkv_buffer,
|
|
||||||
&mut field_buffer_cache,
|
|
||||||
)?;
|
|
||||||
flattened_document = Some(&flattened_obkv_buffer);
|
|
||||||
let document = KvReader::new(&flattened_obkv_buffer);
|
|
||||||
|
|
||||||
update_primary_key(
|
|
||||||
document,
|
|
||||||
&addition_index,
|
|
||||||
primary_key_id,
|
|
||||||
&primary_key_name,
|
|
||||||
&mut uuid_buffer,
|
|
||||||
&mut field_buffer_cache,
|
|
||||||
&mut external_id_buffer,
|
|
||||||
self.autogenerate_docids,
|
|
||||||
)?
|
|
||||||
} else {
|
|
||||||
update_primary_key(
|
|
||||||
document,
|
|
||||||
&addition_index,
|
|
||||||
primary_key_id,
|
|
||||||
&primary_key_name,
|
|
||||||
&mut uuid_buffer,
|
|
||||||
&mut field_buffer_cache,
|
|
||||||
&mut external_id_buffer,
|
|
||||||
self.autogenerate_docids,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
|
|
||||||
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
|
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
|
||||||
// according to the document addition key order, so we sort it according to the
|
// according to the document addition key order, so we sort it according to the
|
||||||
// fieldids map keys order.
|
// fieldids map keys order.
|
||||||
@ -294,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We use the extracted/generated user id as the key for this document.
|
// We use the extracted/generated user id as the key for this document.
|
||||||
self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?;
|
self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?;
|
||||||
documents_count += 1;
|
documents_count += 1;
|
||||||
|
|
||||||
if let Some(flatten) = flattened_document {
|
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
|
||||||
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
|
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
|
||||||
} else {
|
None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?,
|
||||||
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
|
|
||||||
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
|
|
||||||
None => {
|
|
||||||
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||||
@ -313,7 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
field_buffer = drop_and_reuse(field_buffer_cache);
|
field_buffer = drop_and_reuse(field_buffer_cache);
|
||||||
external_id_buffer.clear();
|
docid_buffer.clear();
|
||||||
obkv_buffer.clear();
|
obkv_buffer.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -322,7 +263,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
|
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
|
||||||
self.index.put_primary_key(wtxn, &primary_key_name)?;
|
self.index.put_primary_key(wtxn, &primary_key)?;
|
||||||
self.documents_count += documents_count;
|
self.documents_count += documents_count;
|
||||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||||
// give it to the last transforming function which returns the TransformOutput.
|
// give it to the last transforming function which returns the TransformOutput.
|
||||||
@ -384,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
Ok(Some(buffer))
|
Ok(Some(buffer))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flatten a document from a field mapping generated by [create_fields_mapping]
|
|
||||||
fn flatten_from_field_mapping(
|
|
||||||
&mut self,
|
|
||||||
mapping: &HashMap<FieldId, FieldId>,
|
|
||||||
obkv: &KvReader<FieldId>,
|
|
||||||
output_buffer: &mut Vec<u8>,
|
|
||||||
field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
|
|
||||||
) -> Result<()> {
|
|
||||||
// store the keys and values of the json + the original obkv
|
|
||||||
let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
|
|
||||||
|
|
||||||
// if the primary_key is nested we need to flatten the document before being able to do anything
|
|
||||||
let mut doc = serde_json::Map::new();
|
|
||||||
|
|
||||||
// we recreate a json containing only the fields that needs to be flattened.
|
|
||||||
// all the raw values get inserted directly in the `key_value` vec.
|
|
||||||
for (key, value) in obkv.iter() {
|
|
||||||
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
|
||||||
let key =
|
|
||||||
mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?;
|
|
||||||
let key =
|
|
||||||
self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
|
|
||||||
field_id: *key,
|
|
||||||
process: "Flatten from field mapping.",
|
|
||||||
})?;
|
|
||||||
let value = serde_json::from_slice::<serde_json::Value>(value)
|
|
||||||
.map_err(InternalError::SerdeJson)?;
|
|
||||||
doc.insert(key.to_string(), value);
|
|
||||||
} else {
|
|
||||||
key_value.push((key, value.into()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let flattened = flatten_serde_json::flatten(&doc);
|
|
||||||
|
|
||||||
// Once we have the flattened version we insert all the new generated fields_ids
|
|
||||||
// (if any) in the fields ids map and serialize the value.
|
|
||||||
for (key, value) in flattened.into_iter() {
|
|
||||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
|
||||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
|
||||||
key_value.push((fid, value.clone().into()));
|
|
||||||
|
|
||||||
if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
|
|
||||||
field_buffer_cache.push((fid, value.into()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// we sort the key. If there was a conflict between the obkv and the new generated value the
|
|
||||||
// keys will be consecutive.
|
|
||||||
key_value.sort_unstable_by_key(|(key, _)| *key);
|
|
||||||
|
|
||||||
Self::create_obkv_from_key_value(&mut key_value, output_buffer)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generate an obkv from a slice of key / value sorted by key.
|
/// Generate an obkv from a slice of key / value sorted by key.
|
||||||
fn create_obkv_from_key_value(
|
fn create_obkv_from_key_value(
|
||||||
key_value: &mut [(FieldId, Cow<[u8]>)],
|
key_value: &mut [(FieldId, Cow<[u8]>)],
|
||||||
@ -744,50 +630,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name)
|
|
||||||
/// for the primary key according to the following rules:
|
|
||||||
/// - if primary_key is `Some`, returns the id and the name, else
|
|
||||||
/// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else
|
|
||||||
/// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and
|
|
||||||
/// returns the pair, else
|
|
||||||
/// - returns an error.
|
|
||||||
fn compute_primary_key_pair(
|
|
||||||
primary_key: Option<&str>,
|
|
||||||
fields_ids_map: &mut FieldsIdsMap,
|
|
||||||
alternative_name: Option<String>,
|
|
||||||
autogenerate_docids: bool,
|
|
||||||
) -> Result<(FieldId, String)> {
|
|
||||||
match primary_key {
|
|
||||||
Some(primary_key) => {
|
|
||||||
let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?;
|
|
||||||
Ok((id, primary_key.to_string()))
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
let name = match alternative_name {
|
|
||||||
Some(key) => key,
|
|
||||||
None => {
|
|
||||||
if !autogenerate_docids {
|
|
||||||
// If there is no primary key in the current document batch, we must
|
|
||||||
// return an error and not automatically generate any document id.
|
|
||||||
return Err(UserError::MissingPrimaryKey.into());
|
|
||||||
}
|
|
||||||
DEFAULT_PRIMARY_KEY_NAME.to_string()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
|
|
||||||
Ok((id, name))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
|
||||||
let document_id = document_id.trim();
|
|
||||||
Some(document_id).filter(|id| {
|
|
||||||
!id.is_empty()
|
|
||||||
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
|
/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
|
||||||
///
|
///
|
||||||
/// The size and alignment of T and U must match.
|
/// The size and alignment of T and U must match.
|
||||||
@ -799,63 +641,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
|
|||||||
vec.into_iter().map(|_| unreachable!()).collect()
|
vec.into_iter().map(|_| unreachable!()).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_primary_key<'a>(
|
|
||||||
document: KvReader<'a, FieldId>,
|
|
||||||
addition_index: &DocumentsBatchIndex,
|
|
||||||
primary_key_id: FieldId,
|
|
||||||
primary_key_name: &str,
|
|
||||||
uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
|
||||||
field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>,
|
|
||||||
mut external_id_buffer: &'a mut Vec<u8>,
|
|
||||||
autogenerate_docids: bool,
|
|
||||||
) -> Result<Cow<'a, str>> {
|
|
||||||
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
|
|
||||||
Some((_, bytes)) => {
|
|
||||||
let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
|
|
||||||
Value::String(string) => match validate_document_id(&string) {
|
|
||||||
Some(s) if s.len() == string.len() => string,
|
|
||||||
Some(s) => s.to_string(),
|
|
||||||
None => {
|
|
||||||
return Err(UserError::InvalidDocumentId {
|
|
||||||
document_id: Value::String(string),
|
|
||||||
}
|
|
||||||
.into())
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Value::Number(number) => number.to_string(),
|
|
||||||
content => {
|
|
||||||
return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
|
|
||||||
Ok(Cow::Owned(value))
|
|
||||||
}
|
|
||||||
None if autogenerate_docids => {
|
|
||||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
|
||||||
serde_json::to_writer(&mut external_id_buffer, &uuid)
|
|
||||||
.map_err(InternalError::SerdeJson)?;
|
|
||||||
field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into()));
|
|
||||||
Ok(Cow::Borrowed(&*uuid))
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
let mut json = Map::new();
|
|
||||||
for (key, value) in document.iter() {
|
|
||||||
let key = addition_index.name(key).cloned();
|
|
||||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
|
||||||
|
|
||||||
if let Some((k, v)) = key.zip(value) {
|
|
||||||
json.insert(k, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(UserError::MissingDocumentId {
|
|
||||||
primary_key: primary_key_name.to_string(),
|
|
||||||
document: json,
|
|
||||||
})?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TransformOutput {
|
impl TransformOutput {
|
||||||
// find and insert the new field ids
|
// find and insert the new field ids
|
||||||
pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
|
pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
|
||||||
@ -869,88 +654,3 @@ impl TransformOutput {
|
|||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
mod compute_primary_key {
|
|
||||||
use big_s::S;
|
|
||||||
|
|
||||||
use super::{compute_primary_key_pair, FieldsIdsMap};
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn should_return_primary_key_if_is_some() {
|
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
|
||||||
fields_map.insert("toto").unwrap();
|
|
||||||
let result = compute_primary_key_pair(
|
|
||||||
Some("toto"),
|
|
||||||
&mut fields_map,
|
|
||||||
Some("tata".to_string()),
|
|
||||||
false,
|
|
||||||
);
|
|
||||||
assert_eq!(result.unwrap(), (0, "toto".to_string()));
|
|
||||||
assert_eq!(fields_map.len(), 1);
|
|
||||||
|
|
||||||
// and with nested fields
|
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
|
||||||
fields_map.insert("toto.tata").unwrap();
|
|
||||||
let result = compute_primary_key_pair(
|
|
||||||
Some("toto.tata"),
|
|
||||||
&mut fields_map,
|
|
||||||
Some(S("titi")),
|
|
||||||
false,
|
|
||||||
);
|
|
||||||
assert_eq!(result.unwrap(), (0, "toto.tata".to_string()));
|
|
||||||
assert_eq!(fields_map.len(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn should_return_alternative_if_primary_is_none() {
|
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
|
||||||
let result =
|
|
||||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
|
||||||
assert_eq!(result.unwrap(), (0, S("tata")));
|
|
||||||
assert_eq!(fields_map.len(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn should_return_default_if_both_are_none() {
|
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
|
||||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
|
||||||
assert_eq!(result.unwrap(), (0, S("id")));
|
|
||||||
assert_eq!(fields_map.len(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn should_return_err_if_both_are_none_and_recompute_is_false() {
|
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
|
||||||
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
|
|
||||||
assert!(result.is_err());
|
|
||||||
assert_eq!(fields_map.len(), 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mod primary_key_inference {
|
|
||||||
use big_s::S;
|
|
||||||
use bimap::BiHashMap;
|
|
||||||
|
|
||||||
use crate::documents::DocumentsBatchIndex;
|
|
||||||
use crate::update::index_documents::transform::find_primary_key;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn primary_key_infered_on_first_field() {
|
|
||||||
// We run the test multiple times to change the order in which the fields are iterated upon.
|
|
||||||
for _ in 1..50 {
|
|
||||||
let mut map = BiHashMap::new();
|
|
||||||
map.insert(1, S("fakeId"));
|
|
||||||
map.insert(2, S("fakeId"));
|
|
||||||
map.insert(3, S("fakeId"));
|
|
||||||
map.insert(4, S("fakeId"));
|
|
||||||
map.insert(0, S("realId"));
|
|
||||||
|
|
||||||
assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments;
|
|||||||
pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult};
|
pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult};
|
||||||
pub use self::facets::Facets;
|
pub use self::facets::Facets;
|
||||||
pub use self::index_documents::{
|
pub use self::index_documents::{
|
||||||
DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||||
};
|
};
|
||||||
pub use self::indexer_config::IndexerConfig;
|
pub use self::indexer_config::IndexerConfig;
|
||||||
pub use self::settings::{Setting, Settings};
|
pub use self::settings::{Setting, Settings};
|
||||||
|
@ -735,10 +735,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -798,10 +799,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -850,10 +852,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -880,10 +883,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// In the same transaction we change the displayed fields to be only the age.
|
// In the same transaction we change the displayed fields to be only the age.
|
||||||
@ -934,10 +938,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -974,10 +979,11 @@ mod tests {
|
|||||||
|
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1016,10 +1022,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1067,10 +1074,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1110,10 +1118,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1142,10 +1151,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1172,10 +1182,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// In the same transaction we provide some stop_words
|
// In the same transaction we provide some stop_words
|
||||||
@ -1251,10 +1262,11 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
// In the same transaction provide some synonyms
|
// In the same transaction provide some synonyms
|
||||||
@ -1389,10 +1401,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -1452,10 +1465,11 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
.unwrap();
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -3,9 +3,10 @@ use std::io::Cursor;
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
use milli::{FacetDistribution, Index};
|
use milli::{FacetDistribution, Index, Object};
|
||||||
|
use serde_json::Deserializer;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_distribution_with_no_facet_values() {
|
fn test_facet_distribution_with_no_facet_values() {
|
||||||
@ -28,38 +29,33 @@ fn test_facet_distribution_with_no_facet_values() {
|
|||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
|
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
|
||||||
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
let reader = Cursor::new(
|
let reader = Cursor::new(
|
||||||
r#"[
|
r#"{
|
||||||
{
|
|
||||||
"id": 123,
|
"id": 123,
|
||||||
"title": "What a week, hu...",
|
"title": "What a week, hu...",
|
||||||
"genres": [],
|
"genres": [],
|
||||||
"tags": ["blue"]
|
"tags": ["blue"]
|
||||||
},
|
}
|
||||||
{
|
{
|
||||||
"id": 345,
|
"id": 345,
|
||||||
"title": "I am the pig!",
|
"title": "I am the pig!",
|
||||||
"tags": ["red"]
|
"tags": ["red"]
|
||||||
}
|
}"#,
|
||||||
]"#,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
|
for result in Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
|
let object = result.unwrap();
|
||||||
documents_builder.extend_from_json(doc).unwrap();
|
documents_builder.append_json_object(&object).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
documents_builder.finish().unwrap();
|
let vector = documents_builder.into_inner().unwrap();
|
||||||
|
|
||||||
cursor.set_position(0);
|
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@ -6,10 +6,11 @@ use big_s::S;
|
|||||||
use either::{Either, Left, Right};
|
use either::{Either, Left, Right};
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::{hashmap, hashset};
|
use maplit::{hashmap, hashset};
|
||||||
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
use milli::{AscDesc, Criterion, DocumentId, Index, Member};
|
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
use serde_json::Deserializer;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
mod distinct;
|
mod distinct;
|
||||||
@ -60,24 +61,21 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
|
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
|
||||||
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
let reader = Cursor::new(CONTENT.as_bytes());
|
let reader = Cursor::new(CONTENT.as_bytes());
|
||||||
|
|
||||||
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
|
for result in Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
|
let object = result.unwrap();
|
||||||
documents_builder.extend_from_json(doc).unwrap();
|
documents_builder.append_json_object(&object).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
documents_builder.finish().unwrap();
|
let vector = documents_builder.into_inner().unwrap();
|
||||||
|
|
||||||
cursor.set_position(0);
|
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@ -5,7 +5,7 @@ use big_s::S;
|
|||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
|
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
@ -390,11 +390,9 @@ fn criteria_ascdesc() {
|
|||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
|
||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
|
||||||
|
|
||||||
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
|
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
@ -412,17 +410,19 @@ fn criteria_ascdesc() {
|
|||||||
"age": age,
|
"age": age,
|
||||||
});
|
});
|
||||||
|
|
||||||
let json = Cursor::new(serde_json::to_vec(&json).unwrap());
|
let object = match json {
|
||||||
batch_builder.extend_from_json(json).unwrap();
|
serde_json::Value::Object(object) => object,
|
||||||
|
_ => panic!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
batch_builder.append_json_object(&object).unwrap();
|
||||||
});
|
});
|
||||||
|
|
||||||
batch_builder.finish().unwrap();
|
let vector = batch_builder.into_inner().unwrap();
|
||||||
|
|
||||||
cursor.set_position(0);
|
let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
|
let (builder, user_error) = builder.add_documents(reader).unwrap();
|
||||||
let reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
user_error.unwrap();
|
||||||
|
|
||||||
builder.add_documents(reader).unwrap();
|
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@ -106,35 +106,31 @@ fn test_typo_disabled_on_word() {
|
|||||||
options.map_size(4096 * 100);
|
options.map_size(4096 * 100);
|
||||||
let index = Index::new(options, tmp.path()).unwrap();
|
let index = Index::new(options, tmp.path()).unwrap();
|
||||||
|
|
||||||
let documents = json!([
|
let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
|
||||||
{
|
let doc1 = json!({
|
||||||
"id": 1usize,
|
"id": 1usize,
|
||||||
"data": "zealand",
|
"data": "zealand",
|
||||||
},
|
});
|
||||||
{
|
|
||||||
"id": 2usize,
|
|
||||||
"data": "zearand",
|
|
||||||
},
|
|
||||||
]);
|
|
||||||
|
|
||||||
let mut writer = std::io::Cursor::new(Vec::new());
|
let doc2 = json!({
|
||||||
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
"id": 2usize,
|
||||||
let documents = serde_json::to_vec(&documents).unwrap();
|
"data": "zearand",
|
||||||
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
});
|
||||||
builder.finish().unwrap();
|
|
||||||
|
|
||||||
writer.set_position(0);
|
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
|
||||||
|
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
|
||||||
|
let vector = builder.into_inner().unwrap();
|
||||||
|
|
||||||
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
|
let documents =
|
||||||
|
milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
|
||||||
|
|
||||||
let mut txn = index.write_txn().unwrap();
|
let mut txn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
|
||||||
|
|
||||||
builder.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
txn.commit().unwrap();
|
txn.commit().unwrap();
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user