From 02123a3326463dce22bfad27e91440da2ff77b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 May 2024 13:07:32 +0200 Subject: [PATCH] It works perfectly with some Rhai --- Cargo.lock | 119 +++++++++++++++--------- milli/Cargo.toml | 2 +- milli/src/update/index_documents/mod.rs | 115 ++++++++++++++--------- 3 files changed, 145 insertions(+), 91 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b22b12fdc..a1d3da421 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -262,6 +262,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "const-random", "getrandom", "once_cell", "version_check", @@ -1049,6 +1050,26 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + [[package]] name = "constant_time_eq" version = "0.1.5" @@ -1987,30 +2008,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "gc-arena" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d4a9d8c3c1ef4301b8afc383e53e102a13f9947da2181bf82828480dcc5165" -dependencies = [ - "allocator-api2", - "gc-arena-derive", - "hashbrown", - "sptr", -] - -[[package]] -name = "gc-arena-derive" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c952d28a64896b1c4ac382dcd7beeaeaabc13e8c7c7f800ea2938abd828ed30" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.60", 
- "synstructure", -] - [[package]] name = "gemm" version = "0.17.1" @@ -3535,11 +3532,11 @@ dependencies = [ "obkv", "once_cell", "ordered-float", - "piccolo", "puffin", "rand", "rand_pcg", "rayon", + "rhai", "roaring", "rstar", "serde", @@ -4039,21 +4036,6 @@ dependencies = [ "siphasher 0.3.11", ] -[[package]] -name = "piccolo" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93dd1815b42446904bb2689d1c5d7680e8c68113d5b15a5a3297ba6c7a5f84af" -dependencies = [ - "ahash", - "allocator-api2", - "anyhow", - "gc-arena", - "hashbrown", - "rand", - "thiserror", -] - [[package]] name = "pin-project" version = "1.1.4" @@ -4462,6 +4444,35 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" +[[package]] +name = "rhai" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a7d88770120601ba1e548bb6bc2a05019e54ff01b51479e38e64ec3b59d4759" +dependencies = [ + "ahash", + "bitflags 2.5.0", + "instant", + "num-traits", + "once_cell", + "rhai_codegen", + "serde", + "smallvec", + "smartstring", + "thin-vec", +] + +[[package]] +name = "rhai_codegen" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59aecf17969c04b9c0c5d21f6bc9da9fec9dd4980e64d1871443a476589d8c86" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.60", +] + [[package]] name = "ring" version = "0.17.8" @@ -4875,6 +4886,9 @@ name = "smallvec" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2593d31f82ead8df961d8bd23a64c2ccf2eb5dd34b0a34bfb4dd54011c72009e" +dependencies = [ + "serde", +] [[package]] name = "smartstring" @@ -4883,6 +4897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" dependencies = [ "autocfg", + "serde", 
"static_assertions", "version_check", ] @@ -4939,12 +4954,6 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "sptr" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b9b39299b249ad65f3b7e96443bad61c02ca5cd3589f46cb6d610a0fd6c0d6a" - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -5135,6 +5144,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "thin-vec" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38c90d48152c236a3ab59271da4f4ae63d678c5d7ad6b7714d7cb9760be5e4b" +dependencies = [ + "serde", +] + [[package]] name = "thiserror" version = "1.0.58" @@ -5213,6 +5231,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 017d8c93d..c9ff3c051 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -87,7 +87,7 @@ rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } url = "2.5.0" -piccolo = "0.3.1" +rhai = { version = "1.18.0", features = ["serde", "no_module", "no_custom_syntax"] } [dev-dependencies] mimalloc = { version = "0.1.39", default-features = false } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3072d55b5..b1e3276d1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -15,6 +15,7 @@ use grenad::{Merger, MergerBuilder}; use heed::types::Str; use heed::Database; use rand::SeedableRng; +use rhai::{Engine, Scope}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; @@ -31,7 +32,7 @@ pub use self::helpers::{ }; use 
self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::{obkv_to_object, DocumentsBatchReader}; +use crate::documents::{obkv_to_object, DocumentsBatchBuilder, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -39,7 +40,7 @@ use crate::update::{ IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::vector::EmbeddingConfigs; -use crate::{fields_ids_map, CboRoaringBitmapCodec, Index, Result}; +use crate::{CboRoaringBitmapCodec, FieldsIdsMap, Index, Object, Result}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 4; @@ -175,7 +176,7 @@ where #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")] pub fn edit_documents( - mut self, + self, documents: &RoaringBitmap, code: &str, ) -> Result<(Self, StdResult)> { @@ -184,49 +185,75 @@ where return Ok((self, Ok(0))); } - let mut lua = piccolo::Lua::core(); - let executor = lua.enter(|ctx| ctx.stash(piccolo::Executor::new(ctx))); - let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - for docid in documents { - let document = match self.index.documents.get(self.wtxn, &docid)? 
{ - Some(document) => document, - None => panic!("a document should always exists"), - }; - - lua.try_enter(|ctx| { - let closure = match piccolo::Closure::load( - ctx, - None, - ("return ".to_string() + code).as_bytes(), - ) { - Ok(closure) => closure, - Err(_) => piccolo::Closure::load(ctx, None, code.as_bytes())?, - }; - let function = piccolo::Function::Closure(closure); - - let table = piccolo::Table::new(&ctx); - table.set(ctx, "internal-id", docid)?; - table.set(ctx, "title", "hello")?; - table.set(ctx, "description", "world")?; - dbg!(&table); - ctx.set_global("doc", table)?; - - ctx.fetch(&executor).restart(ctx, function, ()); - Ok(()) - }) - .unwrap(); - - lua.execute::<()>(&executor).unwrap(); - lua.try_enter(|ctx| { - let value = ctx.get_global("doc"); - dbg!(value); - Ok(()) - }) - .unwrap(); + /// Transform every field of a raw obkv store into a Rhai map keyed by field name. + pub fn all_obkv_to_rhaimap( + obkv: obkv::KvReaderU16, + fields_ids_map: &FieldsIdsMap, + ) -> Result { + let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); + all_keys + .iter() + .copied() + .flat_map(|id| obkv.get(id).map(|value| (id, value))) + .map(|(id, value)| { + let name = fields_ids_map.name(id).ok_or( + crate::error::FieldIdMapMissingEntry::FieldId { + field_id: id, + process: "allobkv_to_rhaimap", + }, + )?; + let value = serde_json::from_slice(value) + .map_err(crate::error::InternalError::SerdeJson)?; + Ok((name.into(), value)) + }) + .collect() } - Ok((self, Ok(documents.len()))) + fn rhaimap_to_object(map: rhai::Map) -> Object { + let mut output = Object::new(); + for (key, value) in map { + let value = serde_json::to_value(&value).unwrap(); + output.insert(key.into(), value); + } + output + } + + let engine = Engine::new(); + let ast = engine.compile(code).unwrap(); + let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + let primary_key_id = fields_ids_map.id(primary_key).unwrap(); + let mut 
documents_batch_builder = tempfile::tempfile().map(DocumentsBatchBuilder::new)?; + + for docid in documents { + let (document, document_id) = match self.index.documents.get(self.wtxn, &docid)? { + Some(obkv) => { + let document_id_bytes = obkv.get(primary_key_id).unwrap(); + let document_id: serde_json::Value = + serde_json::from_slice(document_id_bytes).unwrap(); + let document = all_obkv_to_rhaimap(obkv, &fields_ids_map)?; + (document, document_id) + } + None => panic!("documents must exist"), + }; + + let mut scope = Scope::new(); + scope.push("doc", document); + + let new_document = engine.eval_ast_with_scope::(&mut scope, &ast).unwrap(); + let new_document = rhaimap_to_object(new_document); + + assert_eq!( + document_id, new_document[primary_key], + "you cannot change the document id when editing documents" + ); + documents_batch_builder.append_json_object(&new_document)?; + } + + let file = documents_batch_builder.into_inner()?; + let reader = DocumentsBatchReader::from_reader(file)?; + + self.add_documents(reader) } pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {