diff --git a/.gitignore b/.gitignore index c38aa51d3..c0747b6e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,7 @@ /target -/Cargo.lock -meilidb/Cargo.lock -meilidb-core/Cargo.lock -**/*.rs.bk +Cargo.lock **/*.csv **/*.json_lines -**/*.rdb +**/*.rs.bk +/*.mdb +/query-history.txt diff --git a/Cargo.toml b/Cargo.toml index 84a45aa9f..0903eab10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,6 @@ [workspace] members = [ - "meilidb", "meilidb-core", - "meilidb-data", "meilidb-schema", "meilidb-tokenizer", ] diff --git a/LICENSE b/LICENSE index 4589babfa..25a8574b1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,13 @@ -MIT License +“Commons Clause” License Condition v1.0 -Copyright (c) 2018 Clément Renault +The Software is provided to you by the Licensor under the License, as defined below, subject to the following condition. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Without limiting other conditions in the License, the grant of rights under the License will not include, and the License does not grant to you, the right to Sell the Software. -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. +For purposes of the foregoing, “Sell” means practicing any or all of the rights granted to you under the License to provide to third parties, for a fee or other consideration (including without limitation fees for hosting or consulting/ support services related to the Software), a product or service whose value derives, entirely or substantially, from the functionality of the Software. 
Any license notice or attribution required by the License must also include this Commons Clause License Condition notice. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Software: MeiliDB + +License: MIT + +Licensor: MEILI SAS diff --git a/README.md b/README.md index d92653895..372389242 100644 --- a/README.md +++ b/README.md @@ -6,19 +6,19 @@ [![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)]( https://www.rust-lang.org) -A _full-text search database_ using a key-value store internally. +A _full-text search database_ based on the fast [LMDB key-value store](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). 
## Features -- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L95-L101) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents -- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L22-L29) and can apply them in any custom order -- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L146), useful for paginating results -- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L68) and [filter](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L57) returned documents based on context defined rules -- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/examples/movies/schema-movies.toml) -- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-tokenizer/src/lib.rs#L99) can index latin and kanji based languages -- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/lib.rs#L117-L120), useful to highlight matched words in results -- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L79) -- Supports run time indexing (incremental indexing) +- Provides [6 default ranking criteria](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/criterion/mod.rs#L14-L19) used to [bucket 
sort](https://en.wikipedia.org/wiki/Bucket_sort) documents +- Accepts [custom criteria](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order +- Supports [ranged queries](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/query_builder.rs#L255-L260), useful for paginating results +- Can [distinct](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/query_builder.rs#L241-L246) and [filter](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/query_builder.rs#L223-L235) returned documents based on context defined rules +- Can store complete documents or only [user schema specified fields](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-schema/src/lib.rs#L265-L279) +- The [default tokenizer](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages +- Returns [the matching text areas](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results +- Accepts query time search config like the [searchable attributes](https://github.com/Kerollmops/new-meilidb/blob/dea7e28a45dde897f97742bdd33fcf75d5673502/meilidb-core/src/query_builder.rs#L248-L252) +- Supports run time indexing (incremental indexing) @@ -64,19 +64,18 @@ Currently MeiliDB do not provide an http server but you can run these two exampl It creates an index named _movies_ and insert _19 700_ (in batches of _1000_) movies into it. 
```bash -cargo run --release --example create-database -- \ - --schema examples/movies/schema-movies.toml \ - --update-group-size 1000 \ - movies.mdb \ - examples/movies/movies.csv +cargo run --release --example from_file -- \ + index example.mdb datasets/movies/movies.csv \ + --schema datasets/movies/schema-movies.toml \ + --update-group-size 1000 ``` Once this is done, you can query this database using the second binary example. ```bash -cargo run --release --example query-database -- \ - movies.mdb \ - --fetch-timeout-ms 50 \ - -n 4 \ - id title overview release_date poster +cargo run --release --example from_file -- \ + search example.mdb \ + --number 4 \ + --filter '!adult' \ + id popularity adult original_title ``` diff --git a/ci/meilidb.sh b/ci/meilidb.sh deleted file mode 100755 index 35bf88fef..000000000 --- a/ci/meilidb.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -cd "$(dirname "$0")"/.. -set -ex - -export RUSTFLAGS="-D warnings" - -cargo check --no-default-features -cargo check --bins --examples --tests -cargo test - -if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then - cargo check --no-default-features --features nightly - cargo test --features nightly -fi diff --git a/examples/movies/README.md b/datasets/movies/README.md similarity index 100% rename from examples/movies/README.md rename to datasets/movies/README.md diff --git a/examples/movies/movies.csv b/datasets/movies/movies.csv similarity index 100% rename from examples/movies/movies.csv rename to datasets/movies/movies.csv diff --git a/examples/movies/schema-movies.toml b/datasets/movies/schema-movies.toml similarity index 100% rename from examples/movies/schema-movies.toml rename to datasets/movies/schema-movies.toml diff --git a/examples/kaggle/kaggle.csv b/examples/kaggle/kaggle.csv deleted file mode 100644 index 6bf7a70d3..000000000 --- a/examples/kaggle/kaggle.csv +++ /dev/null @@ -1,122 +0,0 @@ -id,title,description,image -711158459,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black 
Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158460,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSyIS3vUasKIJEBy4Cw.jpg -711158461,Sony PlayStation 4 PS4 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/m10NZXArmiIkpkTDDkAUVvA.jpg -711158462,Sony - PlayStation 4 500GB The Last of Us Remastered Bundle - Black,,http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmAE8WZDH1l_E_PPAkg.jpg -711158463,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. 
Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158464,Sony PlayStation 4 (PS4) (Latest Model)- 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/mzvzEUIknaQclZ801YCY1ew.jpg -711158465,BRAND NEW Sony PlayStation 4 BUNDLE 500gb,,http://thumbs4.ebaystatic.com/d/l225/m/m9TQTiWcWig7SeQh9algLZg.jpg -711158466,"Sony PlayStation 4 500GB, Dualshock Wireless Control, HDMI Gaming Console Refurb","The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. 
Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs4.ebaystatic.com/d/l225/m/mTZYG5N6xWfBi4Ok03HmpMw.jpg -711158467,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console w/ 2 Controllers,,http://thumbs2.ebaystatic.com/d/l225/m/mX5Qphrygqeoi7tAH5eku2A.jpg -711158468,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console *NEW*,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. 
Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/mGjN4IrJ0O8kKD_TYMWgGgQ.jpg -711158469,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console..wth Mortal Kombat X,,http://thumbs2.ebaystatic.com/d/l225/m/mrpqSNXwlnUVKnEscE4348w.jpg -711158470,Genuine SONY PS4 Playstation 4 500GB Gaming Console - Black,,http://thumbs4.ebaystatic.com/d/l225/m/myrPBFCpb4H5rHI8NyiS2zA.jpg -711158471,[Sony] Playstation 4 PS4 Video Game Console Black - Latest Model,,http://thumbs4.ebaystatic.com/d/l225/m/mce0c7mCuv3xpjllJXx093w.jpg -711158472,Sony PlayStation 4 (Latest Model) 500 GB Jet Black Console,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. 
PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/miVSA1xPO5fCNdYzEMc8rSQ.jpg -711158473,Sony PlayStation 4 - 500 GB Jet Black Console - WITH LAST OF US REMASTERED,,http://thumbs2.ebaystatic.com/d/l225/m/mLjnOxv2GWkrkCtgsDGhJ6A.jpg -711158474,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs3.ebaystatic.com/d/l225/m/mjMittBaXmm_n4AMpETBXhQ.jpg -711158475,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/m1n1qrJ7-VGbe7xQvGdeD6Q.jpg -711158476,"Sony PlayStation 4 - 500 GB Jet Black Console (3 controllers,3 games included)",,http://thumbs3.ebaystatic.com/d/l225/m/mIoGIj9FZG7HoEVkPlnyizA.jpg -711158477,Sony PlayStation 4 500GB Console with 2 Controllers,"The PlayStation 4 system opens the door to an incredible journey through immersive new gaming worlds and a deeply connected gaming community. Step into living, breathing worlds where you are hero of your epic journey. Explore gritty urban environments, vast galactic landscapes, and fantastic historical settings brought to life on an epic scale, without limits. With an astounding launch lineup and over 180 games in development the PS4 system offers more top-tier blockbusters and inventive indie hits than any other next-gen console. The PS4 system is developer inspired, gamer focused. The PS4 system learns how you play and intuitively curates the content you use most often. Fire it up, and your PS4 system points the way to new, amazing experiences you can jump into alone or with friends. Create your own legend using a sophisticated, intuitive network built for gamers. Broadcast your gameplay live and direct to the world, complete with your commentary. Or immortalize your most epic moments and share at the press of a button. Access the best in music, movies, sports and television. PS4 system doesn t require a membership fee to access your digital entertainment subscriptions. You get the full spectrum of entertainment that matters to you on the PS4 system. 
PlayStation 4: The Best Place to Play The PlayStation 4 system provides dynamic, connected gaming, powerful graphics and speed, intelligent personalization, deeply integrated social capabilities, and innovative second-screen features. Combining unparalleled content, immersive gaming experiences, all of your favorite digital entertainment apps, and PlayStation exclusives, the PS4 system focuses on the gamers.Gamer Focused, Developer InspiredThe PS4 system focuses on the gamer, ensuring that the very best games and the most immersive experiences are possible on the platform.
Read more about the PS4 on ebay guides.
",http://thumbs2.ebaystatic.com/d/l225/m/m4fuJ5Ibrj450-TZ83FAkIQ.jpg -711158478,Sony - PlayStation 4 500GB The Last of Us Remastered Bundle - Black,,http://thumbs3.ebaystatic.com/d/l225/m/mzXSIw8Hlnff8IjXJQrXJSw.jpg -711158479,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/m-9S63CgFoUijY3ZTyNs3KA.jpg -711158480,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs1.ebaystatic.com/d/l225/m/mdF9Bisg9wXjv_R9Y_13MWw.jpg -711158481,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console*,,http://thumbs1.ebaystatic.com/d/l225/m/m4_OQHMmIOCa8uEkBepRR5A.jpg -711158482,Sony PlayStation 4 (Latest Model)- 500 GB Jet Black Console,,http://thumbs2.ebaystatic.com/d/l225/m/mZ0nR8iz-QAfLssJZMp3L5Q.jpg -711158483,[Sony] Playstation 4 PS4 1105A Video Game Console 500GB White - Latest Model,,http://thumbs4.ebaystatic.com/d/l225/m/m8iTz5cLQLNjD9D3O2jT3IQ.jpg -711158484,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs2.ebaystatic.com/d/l225/m/mrraWCpvP5YKk5rYgotVDLg.jpg -711158485,Obagi Elastiderm Eye Treatment Cream 0.5 oz / 15g Authentic NiB Sealed [5],,http://thumbs1.ebaystatic.com/d/l225/m/mJ4ekz6_bDT5G7wYtjM-qRg.jpg -711158486,Lancome Renergie Eye Anti-Wrinkle & Firming Eye Cream 0.5oz New,,http://thumbs2.ebaystatic.com/d/l225/m/mxwwyDQraZ-TEtr_Y6qRi7Q.jpg -711158487,OZ Naturals - The BEST Eye Gel - Eye Cream For Dark Circles Puffiness and,,http://thumbs2.ebaystatic.com/d/l225/m/mk2Z-hX5sT4kUxfG6g_KFpg.jpg -711158488,Elastiderm Eye Cream (0.5oz/15g),,http://thumbs3.ebaystatic.com/d/l225/m/mHxb5WUc5MtGzCT2UXgY_hg.jpg -711158489,new CLINIQUE Repairwear Laser Focus Wrinkle Correcting Eye Cream 0.17 oz/ 5 ml,,http://thumbs1.ebaystatic.com/d/l225/m/mQSX2wfrSeGy3uA8Q4SbOKw.jpg -711158490,NIB Full Size Dermalogica Multivitamin Power Firm Eye Cream,,http://thumbs4.ebaystatic.com/d/l225/m/m2hxo12e5NjXgGiKIaCvTLA.jpg -711158491,24K Gold Collagen Anti-Dark Circles Anti-Aging Bio 
Essence Repairing Eye Cream,,http://thumbs4.ebaystatic.com/d/l225/m/mt96efUK5cPAe60B9aGmgMA.jpg -711158492,Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream Full Size .5oz 15mL,,http://thumbs3.ebaystatic.com/d/l225/m/mZyV3wKejCMx9RrnC8X-eMw.jpg -711158493,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs4.ebaystatic.com/d/l225/m/m9hX_z_DFnbNCTh0VFv3KcQ.jpg -711158494,3 Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream .17 oz/5 ml Each,,http://thumbs1.ebaystatic.com/d/l225/m/mYiHsrGffCg_qgkTbUWZU1A.jpg -711158495,Lancome High Resolution Eye Cream .95 Oz Refill-3X .25 Oz Plus .20 Oz Lot,,http://thumbs1.ebaystatic.com/d/l225/m/mFuQxKoEKQ6wtk2bGxfKwow.jpg -711158496,NEW! Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream 5ml,,http://thumbs4.ebaystatic.com/d/l225/m/mLBRCDiELUnYos-vFmIcc7A.jpg -711158497,Neutrogena Rapid Wrinkle Repair Eye Cream -0.5 Oz. -New-,,http://thumbs4.ebaystatic.com/d/l225/m/mE1RWpCOxkCGuuiJBX6HiBQ.jpg -711158498,20g Snail Repair Eye Cream Natural Anti-Dark Circles Puffiness Aging Wrinkles,,http://thumbs4.ebaystatic.com/d/l225/m/mh4gBNzINDwds_r778sJRjg.jpg -711158499,Vichy-Neovadiol GF Eye & Lip Contour Cream 0.5 Fl. Oz,,http://thumbs4.ebaystatic.com/d/l225/m/m_6f0ofCm7PTzuithYuZx3w.jpg -711158500,Obagi Elastiderm Eye Cream 0.5 oz. New In Box. 100% Authentic! New Packaging!,,http://thumbs2.ebaystatic.com/d/l225/m/ma0PK-ASBXUiHERR19MyImA.jpg -711158501,NEW! 
Clinique Repairwear Laser Focus Wrinkle Correcting Eye Cream .17oz / 5ml,,http://thumbs3.ebaystatic.com/d/l225/m/m72NaXYlcXcEeqQFKWvsdZA.jpg -711158502,Kiehl's CREAMY EYE TREATMENT cream with AVOCADO 0.5 oz FULL SIZE,,http://thumbs3.ebaystatic.com/d/l225/m/mOI407HnILb_tf-RgdvfYyA.jpg -711158503,Clinique repairwear laser focus wrinkle correcting eye cream .5 oz 15ml,,http://thumbs4.ebaystatic.com/d/l225/m/mQwNVst3bYG6QXouubmLaJg.jpg -711158504,Caudalie Premier Cru The Eye Cream La Creme New Anti Aging Eye Treatment,,http://thumbs1.ebaystatic.com/d/l225/m/mM4hPTAWXeOjovNk9s_Cqag.jpg -711158505,Jeunesse Instantly Ageless -- New Box Of 50 Sachets -- Eye - Face Wrinkle Cream,,http://thumbs2.ebaystatic.com/d/l225/m/m5EfWbi6ZYs4JpYcsl0Ubaw.jpg -711158506,VELOUR SKIN EYE CREAM .5 FL OZ 15ML NEW NIP ANTI-AGING WRINKLE CREAM,,http://thumbs1.ebaystatic.com/d/l225/m/m2uEf6q1yASH8FkWqYdOv1w.jpg -711158507,Shiseido White Lucent Anti-Dark Circles/Puffiness Eye Cream 15ml/.53oz Full Size,,http://thumbs1.ebaystatic.com/d/l225/m/m_CtzoqU2Vgv4GKx8ONS6qw.jpg -711158508,Murad Resurgence Renewing Eye Cream Anti-Aging .25 oz NEW Dark Circles Wrinkle,,http://thumbs1.ebaystatic.com/d/l225/m/mhWJC10iowgUDGm4KMQKNMg.jpg -711158509,D-Link DIR-615 300Mbps Wireless-N Router 4-Port w/Firewall,,http://thumbs3.ebaystatic.com/d/l225/m/mdSBH9ROXRn3TBb8OFDT6jA.jpg -711158510,Triton MOF001 2 1/4hp dual mode precision Router. New!! 
*3 day auction*,,http://thumbs1.ebaystatic.com/d/l225/m/mozWd2SBskbDBlWAKsMlVew.jpg -711158511,Porter-Cable 3-1/4 HP Five-Speed Router 7518 - Power Tools Routers,,http://thumbs2.ebaystatic.com/d/l225/m/mpZDTXpiyesDrZh_FLMyqXQ.jpg -711158512,Linksys EA6900 AC1900 Wi-Fi Wireless Router Dual Band with Gigabit &USB 3.0 Port,,http://thumbs4.ebaystatic.com/d/l225/m/m3OfBSnHBDhhs_Ve-DSBKQw.jpg -711158513,Linksys EA6500 1300 Mbps 4-Port Gigabit Wireless AC Router,,http://thumbs1.ebaystatic.com/d/l225/m/m7cfymJPc7CLADoTiEYFzwA.jpg -711158514,Makita RT0700CX3 1-1/4 Horsepower Compact Router Kit / Trimmer NEW,,http://thumbs2.ebaystatic.com/d/l225/m/mr-F3rCxDYsLcj8hnmaRN4A.jpg -711158515,NETGEAR R6250 AC1600 Smart WiFi Dual Band Gigabit Router 802.11ac 300 1300 Mbps,,http://thumbs4.ebaystatic.com/d/l225/m/mc8Ic8Cq2lPqPnjNGAQBBCQ.jpg -711158516,NETGEAR Nighthawk AC1900 Dual Band Wi-Fi Gigabit Router (R7000) BRAND NEW SEALED,,http://thumbs3.ebaystatic.com/d/l225/m/mdL34EQi0l-Kg-DlvF6wpqA.jpg -711158517,Netgear WNDR3400 N600 Wireless Dual Band Router (WNDR3400-100),,http://thumbs4.ebaystatic.com/d/l225/m/mKr4cNk6utJXSdVYXzwrScQ.jpg -711158518,Netgear N600 300 Mbps 4-Port 10/100 Wireless N Router (WNDR3400),,http://thumbs2.ebaystatic.com/d/l225/m/mUPdyhbW9pzEm1VbqX0YudA.jpg -711158519,NETGEAR N600 WNDR3400 Wireless Dual Band Router F/S,,http://thumbs1.ebaystatic.com/d/l225/m/my55jF5kHnG9ipzFycnjooA.jpg -711158520,Netgear NIGHTHAWK AC1900 1300 Mbps 4-Port Gigabit Wireless AC Router (R7000),,http://thumbs3.ebaystatic.com/d/l225/m/mrPLRTnWx_JXLNIp5pCBnzQ.jpg -711158521,Netgear N900 450 Mbps 4-Port Gigabit Wireless N Router (WNDR4500),,http://thumbs2.ebaystatic.com/d/l225/m/mXBL01faHlHm7Ukh188t3yQ.jpg -711158522,Netgear R6300V2 AC1750 1300 Mbps 4-Port Gigabit Wireless AC Router,,http://thumbs1.ebaystatic.com/d/l225/m/mTdnFB9Z71efYJ9I5-k186w.jpg -711158523,Makita RT0701C 1-1/4 HP Compact Router With FACTORY WARRANTY!!!,,http://thumbs2.ebaystatic.com/d/l225/m/m7AA4k3MzYFJcTlBrT3DwhA.jpg 
-711158524,"CISCO LINKSYS EA4500 DUAL-BAND N9000 WIRELESS ROUTER, 802.11N, UP TO 450 MBPs",,http://thumbs4.ebaystatic.com/d/l225/m/mwfVIXD3dZYt_qpHyprd7hg.jpg -711158525,Netgear N300 v.3 300 Mbps 5-Port 10/100 Wireless N Router (WNR2000),,http://thumbs4.ebaystatic.com/d/l225/m/mopRjvnZwbsVH9euqGov5kw.jpg -711158526,Netgear Nighthawk R7000 2330 Mbps 4-Port Gigabit Wireless N Router...,,http://thumbs4.ebaystatic.com/d/l225/m/mns82UY4FfqYXPgqrpJ9Bzw.jpg -711158527,Netgear N900 450 Mbps 4-Port Gigabit Wireless N Router R4500 ~ FreE ShiPPinG ~,,http://thumbs1.ebaystatic.com/d/l225/m/m_o0mSRmySgJUuqHYDIQiuA.jpg -711158528,D-Link Wireless Router Model DIR-625,,http://thumbs2.ebaystatic.com/d/l225/m/mYPXwZMlDUjOQ3Sm3EtU37Q.jpg -711158529,D-Link DIR-657 300 Mbps 4-Port Gigabit Wireless N Router Hd Media Router 1000,"Stream multiple media content - videos, music and more to multiple devices all at the same time without lag or skipping. The HD Fuel technology in the DIR-657 lets you watch Netflix and Vudu , play your Wii or Xbox 360 online or make Skype calls all without worrying about the skipping or latency you might experience with standard routers. It does so by automatically giving extra bandwidth for video, gaming and VoIP calls using HD Fuel QoS technology. The D-Link HD Media Router 1000(DIR-657) also comes equipped with 4 Gigabit ports to provide speeds up to 10x faster than standard 10/100 ports. 
What s more, it uses 802.11n technology with multiple intelligent antennas to maximize the speed and range of your wireless signal to significantly outperform 802.11g devices.",http://thumbs1.ebaystatic.com/d/l225/m/m0xyPdWrdVKe7By4QFouVeA.jpg -711158530,D-Link DIR-860L AC1200 4-Port Cloud Router Gigabit Wireless 802.11 AC,,http://thumbs3.ebaystatic.com/d/l225/m/mk4KNj6oLm7863qCS-TqmbQ.jpg -711158531,D-Link DIR-862L Wireless AC1600 Dual Band Gigabit Router,,http://thumbs2.ebaystatic.com/d/l225/m/m6Arw8kaZ4EUbyKjHtJZLkA.jpg -711158532,LINKSYS AC1600 DUAL BAND SMART WI-FI ROUTER EA6400 BRAND NEW,,http://thumbs3.ebaystatic.com/d/l225/m/mdK7igTS7_TDD7ajfVqj-_w.jpg -711158533,Netgear AC1900 1300 Mbps 4-Port Gigabit Wireless AC Router (R7000),,http://thumbs4.ebaystatic.com/d/l225/m/mdL34EQi0l-Kg-DlvF6wpqA.jpg -711158534,Panasonic ES-LA63 Cordless Rechargeable Men's Electric Shaver,,http://thumbs3.ebaystatic.com/d/l225/m/mzKKlCxbADObevcgoNjbXRg.jpg -711158535,Panasonic ARC 5 Best Mens Shaver,,http://thumbs4.ebaystatic.com/d/l225/m/mt34Y-u0okj-SqQm8Ng_rbQ.jpg -711158536,Panasonic Es8092 Wet Dry Electric Razor Shaver Cordless,,http://thumbs3.ebaystatic.com/d/l225/m/mlIxTz1LsVjXiZz2CzDquJw.jpg -711158537,Panasonic ARC4 ES-RF31-s Rechargeable Electric Shaver Wet/dry 4 Nanotech Blade,"Made for folks who need a great shave, the Panasonic electric shaver is convenient and consistent. Featuring an ergonomic design, this Panasonic ES-RF31-S is ideal for keeping a stubble-free face, so you can retain wonderfully smooth skin. With the precision blades included on the Panasonic electric shaver, you can get smooth shaves with every use. As this men's electric shaver features a gentle shaving mechanism, you can help avoid burning sensations on tender skin. 
Make sure you consistently get multiple perfect shaves without depleting the power with the exceptional shave time typical of this Panasonic ES-RF31-S.",http://thumbs1.ebaystatic.com/d/l225/m/mi4QM99Jq4oma5WLAL0K7Wg.jpg -711158538,"Panasonic ES3831K Single Blade Travel Shaver, Black New","Strong and trustworthy, the Panasonic electric shaver is built for folks who are worried about a wonderful shave every day. This Panasonic ES3833S is just right for taming your beard, with an easy-to-maneuver design, so you can retain wonderfully soft skin. Spend as much time as you need getting a complete shave by making use of the outstanding shave time typical of the Panasonic electric shaver. Moreover, this men's electric shaver includes precision foil blades, so you can get wonderful shaves over a prolonged period. With the gentle shaving mechanism on this Panasonic ES3833S, you can help avoid burning sensations on tender skin.",http://thumbs3.ebaystatic.com/d/l225/m/mfqMoj4xDlBFXp1ZznxCGbQ.jpg -711158539,Panasonic ES8103S Arc3 Electric Shaver Wet/Dry with Nanotech Blades for Men,,http://thumbs1.ebaystatic.com/d/l225/m/myaZLqzt3I7O-3xXxsJ_4fQ.jpg -711158540,Panasonic ES8103S Arc3 Electric Shaver Wet/Dry with Nanotech Blades,,http://thumbs1.ebaystatic.com/d/l225/m/mcrO4BkjBkM78XHm-aClRGg.jpg -711158543,Panasonic ES3831K Single Blade Wet & Dry Travel Shaver - New & Sealed,,http://thumbs4.ebaystatic.com/d/l225/m/mqWDU2mHsFWAuGosMIGcIMg.jpg -711158544,Panasonic ES8103S Arc 3 E W/O POUCH & MANUAL Men's Wet/Dry Rechargeable Shaver,,http://thumbs2.ebaystatic.com/d/l225/m/mZXgTj-fQfcgAlzOGQYkqFw.jpg -711158545,PANASONIC ES3831K Pro-Curve Battery Operated Travel Wet/Dry Shaver,,http://thumbs1.ebaystatic.com/d/l225/m/m8McQMCfgdp50trM_YJ88cw.jpg -711158546,PANASONIC ARC3 ES-LT33-S WET DRY WASHABLE RECHARGEABLE MEN'S ELECTRIC SHAVER NIB,,http://thumbs1.ebaystatic.com/d/l225/m/m9yUif5xyhGfh7Ag-_fcLdA.jpg -711158547,Panasonic ES-LV81-k Arc 5 Wet & Dry Rechargeable Men's Foil Shaver 
New,,http://thumbs1.ebaystatic.com/d/l225/m/mEfZHzDoKrH4DBfU8e_K93A.jpg -711158548,"NEW Panasonic ES-RF31-S 4 Blade Men's Electric Razor Wet/Dry, Factory Sealed",,http://thumbs2.ebaystatic.com/d/l225/m/mfhMhMoDkrGtqWW_IyqVGuQ.jpg -711158549,Panasonic ES8243A E Arc4 Men's Electric Shaver Wet/Dry,"eBay item number:181670746515 - - - Seller assumes all responsibility for this listing. - - Last updated on -  Mar 23, 2015 08:55:50 PDT  - View all revisions - - - - - - Item specifics - - - -
Condition:Used - : - - -
",http://thumbs4.ebaystatic.com/d/l225/m/mcxFUwt3FrGEEPzT7cfQn7w.jpg -711158550,Panasonic ES-3833 Wet/Dry Men Shaver Razor Battery Operate Compact Travel ES3833,,http://thumbs2.ebaystatic.com/d/l225/m/mAqa9pHisKsLSk5nqMg4JJQ.jpg -711158551,Panasonic Pro-Curve ES3831K Shaver - Dry/Wet Technology - Stainless Steel Foil,,http://thumbs3.ebaystatic.com/d/l225/m/mGqD8eGIwseT5nsM53W3uRQ.jpg -711158552,Panasonic Wet and Dry Shaver - ES-RW30s ES-RW30-S,"The Panasonic electric shaver is well-suited to shielding particularly sensitive skin and providing a smooth shave. It's both trustworthy and transportable. Because this Panasonic ES-RW30-S has a gentle shaving mechanism, you can avoid irritation and raw feeling skin in particularly tender areas. The Panasonic electric shaver is ideal for ridding yourself of stubble, with its special design, so you can sustain wonderfully supple skin. The exceptional shave time featured on this men's electric shaver helps you to make sure you consistently receive many complete shaves without depleting the power. 
Plus, this Panasonic ES-RW30-S features precision blades, so you can enjoy smooth shaves for months on end.",http://thumbs1.ebaystatic.com/d/l225/m/mvPElpjXmgo0NhP-P5F8LlQ.jpg -711158553,Panasonic ES-LF51-A Arc4 Electric Shaver Wet/Dry with Flexible Pivoting Head,,http://thumbs3.ebaystatic.com/d/l225/m/mC_zAQrMQKPLHdENU7N3UjQ.jpg -711158554,Panasonic ES8103S Arc3 Men's Electric Shaver Wet/Dry with Nanotech Blades,,http://thumbs3.ebaystatic.com/d/l225/m/moBByNwPn93-g-oBBceS2kw.jpg -711158555,panasonic ARC3 shaver es8103s,,http://thumbs1.ebaystatic.com/d/l225/m/mJlAp6t6OMIOaYgKnyelIMg.jpg -711158556,Panasonic ES-534 Men's Electric Shaver New ES534 Battery Operated Compact Travel,,http://thumbs3.ebaystatic.com/d/l225/m/mDr2kpZLVSdy1KTPVYK2YUg.jpg -711158557,Panasonic Portable Shaving Machine Cclippers Washable Single Blade Shaver+Brush,,http://thumbs3.ebaystatic.com/d/l225/m/mJdzJPoOALps0Lv4WtW2b0A.jpg -711158559,Baratza Solis Maestro Conical Burr Coffee Bean Grinder Works Great Nice Cond,,http://thumbs4.ebaystatic.com/d/l225/m/mdjbD7YFR6JRq-pkeajhK7w.jpg -711158560,Proctor Silex Fresh Grind Electric Coffee Bean Grinder White,,http://thumbs4.ebaystatic.com/d/l225/m/mtXoRn5Ytmqz0GLHYmBUxpA.jpg -711158561,Cuisinart 8-oz. Supreme Grind Automatic Burr Coffee Grinder,,http://thumbs4.ebaystatic.com/d/l225/m/my_9cXPvwwRVFqo6MXWfpag.jpg diff --git a/examples/kaggle/schema-kaggle.toml b/examples/kaggle/schema-kaggle.toml deleted file mode 100644 index bd729904b..000000000 --- a/examples/kaggle/schema-kaggle.toml +++ /dev/null @@ -1,19 +0,0 @@ -# This schema has been generated ... -# The order in which the attributes are declared is important, -# it specify the attribute xxx... 
- -identifier = "id" - -[attributes.id] -displayed = true - -[attributes.title] -displayed = true -indexed = true - -[attributes.description] -displayed = true -indexed = true - -[attributes.image] -displayed = true diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index fce1ecdc8..f9dcb6d74 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -1,34 +1,55 @@ [package] name = "meilidb-core" version = "0.1.0" -authors = ["Kerollmops "] +authors = ["Kerollmops "] edition = "2018" [dependencies] -byteorder = "1.3.1" +arc-swap = "0.4.3" +bincode = "1.1.4" +byteorder = "1.3.2" +crossbeam-channel = "0.3.9" deunicode = "1.0.0" -hashbrown = "0.6.0" -lazy_static = "1.2.0" -log = "0.4.6" +env_logger = "0.7.0" +hashbrown = { version = "0.6.0", features = ["serde"] } +lmdb-rkv = "0.12.3" +log = "0.4.8" +meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } -rayon = "1.2.0" +once_cell = "1.2.0" +ordered-float = { version = "1.0.2", features = ["serde"] } +rkv = "0.10.2" sdset = "0.3.2" -serde = { version = "1.0.88", features = ["derive"] } +serde = { version = "1.0.99", features = ["derive"] } +serde_json = "1.0.40" +siphasher = "0.3.0" slice-group-by = "0.2.6" zerocopy = "0.2.8" -[dependencies.fst] -git = "https://github.com/Kerollmops/fst.git" -branch = "arc-byte-slice" +[dependencies.rmp-serde] +git = "https://github.com/3Hren/msgpack-rust.git" +rev = "40b3d48" + +[dependencies.rmpv] +git = "https://github.com/3Hren/msgpack-rust.git" +rev = "40b3d48" +features = ["with-serde"] [dependencies.levenshtein_automata] git = "https://github.com/Kerollmops/levenshtein-automata.git" branch = "arc-byte-slice" features = ["fst_automaton"] +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" + [dev-dependencies] assert_matches = "1.3" - -[features] -i128 = ["byteorder/i128"] -nightly = ["hashbrown/nightly", "slice-group-by/nightly"] 
+csv = "1.0.7" +indexmap = { version = "1.2.0", features = ["serde-1"] } +rustyline = { version = "5.0.0", default-features = false } +structopt = "0.3.2" +tempfile = "3.1.0" +termcolor = "1.0.4" +toml = "0.5.3" diff --git a/meilidb/examples/query-database.rs b/meilidb-core/examples/from_file.rs similarity index 50% rename from meilidb/examples/query-database.rs rename to meilidb-core/examples/from_file.rs index 9677eead6..ec40e13b9 100644 --- a/meilidb/examples/query-database.rs +++ b/meilidb-core/examples/from_file.rs @@ -1,45 +1,161 @@ -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; use std::error::Error; -use std::io::{self, Write}; +use std::io::Write; use std::iter::FromIterator; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::time::{Instant, Duration}; +use std::{fs, io, sync::mpsc}; -use indexmap::IndexMap; use rustyline::{Editor, Config}; +use serde::{Serialize, Deserialize}; use structopt::StructOpt; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; -use meilidb_core::Highlight; -use meilidb_data::Database; +use meilidb_core::{Highlight, Database, UpdateResult, BoxUpdateFn}; use meilidb_schema::SchemaAttr; +const INDEX_NAME: &str = "default"; + #[derive(Debug, StructOpt)] -pub struct Opt { - /// The destination where the database must be created +struct IndexCommand { + /// The destination where the database must be created. #[structopt(parse(from_os_str))] - pub database_path: PathBuf, + database_path: PathBuf, - #[structopt(long = "fetch-timeout-ms")] - pub fetch_timeout_ms: Option, + /// The csv file to index. + #[structopt(parse(from_os_str))] + csv_data_path: PathBuf, - /// Fields that must be displayed. - pub displayed_fields: Vec, + /// The path to the schema. 
+ #[structopt(long, parse(from_os_str))] + schema: PathBuf, - /// The number of returned results - #[structopt(short = "n", long = "number-results", default_value = "10")] - pub number_results: usize, - - /// The number of characters before and after the first match - #[structopt(short = "C", long = "context", default_value = "35")] - pub char_context: usize, + #[structopt(long)] + update_group_size: Option, } -type Document = IndexMap; +#[derive(Debug, StructOpt)] +struct SearchCommand { + /// The destination where the database must be created. + #[structopt(parse(from_os_str))] + database_path: PathBuf, + + /// Timeout after which the search will return results. + #[structopt(long)] + fetch_timeout_ms: Option, + + /// The number of returned results + #[structopt(short, long, default_value = "10")] + number_results: usize, + + /// The number of characters before and after the first match + #[structopt(short = "C", long, default_value = "35")] + char_context: usize, + + /// A filter string that can be `!adult` or `adult` to + /// filter documents on this specfied field + #[structopt(short, long)] + filter: Option, + + /// Fields that must be displayed. 
+ displayed_fields: Vec, +} + +#[derive(Debug, StructOpt)] +enum Command { + Index(IndexCommand), + Search(SearchCommand), +} + +impl Command { + fn path(&self) -> &Path { + match self { + Command::Index(command) => &command.database_path, + Command::Search(command) => &command.database_path, + } + } +} + +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +struct Document(indexmap::IndexMap); + +fn index_command(command: IndexCommand, database: Database) -> Result<(), Box> { + let start = Instant::now(); + + let (sender, receiver) = mpsc::sync_channel(100); + let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap(); + let index = database.open_index(INDEX_NAME, Some(Box::new(update_fn)))?; + let rkv = database.rkv.read().unwrap(); + + let schema = { + let string = fs::read_to_string(&command.schema)?; + toml::from_str(&string).unwrap() + }; + + let writer = rkv.write().unwrap(); + match index.main.schema(&writer)? { + Some(current_schema) => { + if current_schema != schema { + return Err(meilidb_core::Error::SchemaDiffer.into()) + } + writer.abort(); + }, + None => index.schema_update(writer, schema)?, + } + + let mut rdr = csv::Reader::from_path(command.csv_data_path)?; + let mut raw_record = csv::StringRecord::new(); + let headers = rdr.headers()?.clone(); + + let mut max_update_id = 0; + let mut i = 0; + let mut end_of_file = false; + + while !end_of_file { + let mut additions = index.documents_addition(); + + loop { + end_of_file = !rdr.read_record(&mut raw_record)?; + if end_of_file { break } + + let document: Document = match raw_record.deserialize(Some(&headers)) { + Ok(document) => document, + Err(e) => { + eprintln!("{:?}", e); + continue; + } + }; + + additions.update_document(document); + + print!("\rindexing document {}", i); + i += 1; + + if let Some(group_size) = command.update_group_size { + if i % group_size == 0 { break } + } + } + + println!(); + + let writer = rkv.write().unwrap(); + println!("committing 
update..."); + let update_id = additions.finalize(writer)?; + max_update_id = max_update_id.max(update_id); + println!("committed update {}", update_id); + } + + println!("Waiting for update {}", max_update_id); + for id in receiver { + if id == max_update_id { break } + } + + println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path); + + Ok(()) +} fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { let mut stdout = StandardStream::stdout(ColorChoice::Always); @@ -138,19 +254,16 @@ fn crop_text( (text, highlights) } -fn main() -> Result<(), Box> { - let _ = env_logger::init(); - let opt = Opt::from_args(); +fn search_command(command: SearchCommand, database: Database) -> Result<(), Box> { + let rkv = database.rkv.read().unwrap(); + let update_fn = None as Option::; + let index = database.open_index(INDEX_NAME, update_fn)?; + let reader = rkv.read().unwrap(); - let start = Instant::now(); - let database = Database::open(&opt.database_path)?; + let schema = index.main.schema(&reader)?; + let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?; - let index = database.open_index("test")?.unwrap(); - let schema = index.schema(); - - println!("database prepared for you in {:.2?}", start.elapsed()); - - let fields = opt.displayed_fields.iter().map(String::as_str); + let fields = command.displayed_fields.iter().map(String::as_str); let fields = HashSet::from_iter(fields); let config = Config::builder().auto_add_history(true).build(); @@ -162,14 +275,29 @@ fn main() -> Result<(), Box> { Ok(query) => { let start_total = Instant::now(); - let builder = match opt.fetch_timeout_ms { - Some(timeout_ms) => { - let timeout = Duration::from_millis(timeout_ms); - index.query_builder().with_fetch_timeout(timeout) + let documents = match command.filter { + Some(ref filter) => { + let filter = filter.as_str(); + let (positive, filter) = if filter.chars().next() == Some('!') { + (false, &filter[1..]) + } else { + (true, filter) + 
}; + + let attr = schema.attribute(&filter).expect("Could not find filtered attribute"); + + let builder = index.query_builder(); + let builder = builder.with_filter(|document_id| { + let string: String = index.document_attribute(&reader, document_id, attr).unwrap().unwrap(); + (string == "true") == positive + }); + builder.query(&reader, &query, 0..command.number_results)? }, - None => index.query_builder(), + None => { + let builder = index.query_builder(); + builder.query(&reader, &query, 0..command.number_results)? + } }; - let documents = builder.query(&query, 0..opt.number_results)?; let mut retrieve_duration = Duration::default(); @@ -179,19 +307,20 @@ fn main() -> Result<(), Box> { doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length)); let start_retrieve = Instant::now(); - let result = index.document::(Some(&fields), doc.id); + let result = index.document::(&reader, Some(&fields), doc.id); retrieve_duration += start_retrieve.elapsed(); match result { Ok(Some(document)) => { - for (name, text) in document { + println!("raw-id: {:?}", doc.id); + for (name, text) in document.0 { print!("{}: ", name); let attr = schema.attribute(&name).unwrap(); let highlights = doc.highlights.iter() .filter(|m| SchemaAttr::new(m.attribute) == attr) .cloned(); - let (text, highlights) = crop_text(&text, highlights, opt.char_context); + let (text, highlights) = crop_text(&text, highlights, command.char_context); let areas = create_highlight_areas(&text, &highlights); display_highlights(&text, &areas)?; println!(); @@ -214,7 +343,7 @@ fn main() -> Result<(), Box> { println!(); } - eprintln!("document field retrieve took {:.2?}", retrieve_duration); + eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration); eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); }, Err(err) => { @@ -225,5 +354,18 @@ fn main() -> Result<(), Box> { } readline.save_history("query-history.txt").unwrap(); + Ok(()) } + +fn 
main() -> Result<(), Box> { + env_logger::init(); + + let opt = Command::from_args(); + let database = Database::open_or_create(opt.path())?; + + match opt { + Command::Index(command) => index_command(command, database), + Command::Search(command) => search_command(command, database), + } +} diff --git a/meilidb-core/src/automaton.rs b/meilidb-core/src/automaton.rs deleted file mode 100644 index 1ab845933..000000000 --- a/meilidb-core/src/automaton.rs +++ /dev/null @@ -1,44 +0,0 @@ -use lazy_static::lazy_static; -use levenshtein_automata::{ - LevenshteinAutomatonBuilder as LevBuilder, - DFA, -}; - -lazy_static! { - static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false); - static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false); - static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false); -} - -#[derive(Copy, Clone)] -enum PrefixSetting { - Prefix, - NoPrefix, -} - -fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { - use self::PrefixSetting::{Prefix, NoPrefix}; - - match query.len() { - 0 ..= 4 => match setting { - Prefix => LEVDIST0.build_prefix_dfa(query), - NoPrefix => LEVDIST0.build_dfa(query), - }, - 5 ..= 8 => match setting { - Prefix => LEVDIST1.build_prefix_dfa(query), - NoPrefix => LEVDIST1.build_dfa(query), - }, - _ => match setting { - Prefix => LEVDIST2.build_prefix_dfa(query), - NoPrefix => LEVDIST2.build_dfa(query), - }, - } -} - -pub fn build_prefix_dfa(query: &str) -> DFA { - build_dfa_with_setting(query, PrefixSetting::Prefix) -} - -pub fn build_dfa(query: &str) -> DFA { - build_dfa_with_setting(query, PrefixSetting::NoPrefix) -} diff --git a/meilidb-core/src/automaton/dfa.rs b/meilidb-core/src/automaton/dfa.rs new file mode 100644 index 000000000..015fdd877 --- /dev/null +++ b/meilidb-core/src/automaton/dfa.rs @@ -0,0 +1,51 @@ +use once_cell::sync::OnceCell; +use levenshtein_automata::{ + LevenshteinAutomatonBuilder as LevBuilder, + DFA, +}; + +static LEVDIST0: OnceCell = OnceCell::new(); +static LEVDIST1: OnceCell 
= OnceCell::new(); +static LEVDIST2: OnceCell = OnceCell::new(); + +#[derive(Copy, Clone)] +enum PrefixSetting { + Prefix, + NoPrefix, +} + +fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { + use PrefixSetting::{Prefix, NoPrefix}; + + match query.len() { + 0 ..= 4 => { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + 5 ..= 8 => { + let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + _ => { + let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + } +} + +pub fn build_prefix_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::Prefix) +} + +pub fn build_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::NoPrefix) +} diff --git a/meilidb-core/src/automaton/mod.rs b/meilidb-core/src/automaton/mod.rs new file mode 100644 index 000000000..f1d864a9a --- /dev/null +++ b/meilidb-core/src/automaton/mod.rs @@ -0,0 +1,219 @@ +mod dfa; +mod query_enhancer; + +use std::cmp::Reverse; +use std::vec; + +use fst::{IntoStreamer, Streamer}; +use levenshtein_automata::DFA; +use meilidb_tokenizer::{split_query_string, is_cjk}; + +use crate::store; +use crate::error::MResult; + +use self::dfa::{build_dfa, build_prefix_dfa}; +use self::query_enhancer::QueryEnhancerBuilder; +pub use self::query_enhancer::QueryEnhancer; + +const NGRAMS: usize = 3; + +pub struct AutomatonProducer { + automatons: Vec>, +} + +impl AutomatonProducer { + pub fn new( + reader: &impl rkv::Readable, + query: &str, + main_store: store::Main, + synonyms_store: store::Synonyms, + ) -> MResult<(AutomatonProducer, QueryEnhancer)> + { + let (automatons, query_enhancer) = 
generate_automatons( + reader, + query, + main_store, + synonyms_store, + )?; + + Ok((AutomatonProducer { automatons }, query_enhancer)) + } + + pub fn into_iter(self) -> vec::IntoIter> { + self.automatons.into_iter() + } +} + +#[derive(Debug)] +pub struct Automaton { + pub index: usize, + pub ngram: usize, + pub query_len: usize, + pub is_exact: bool, + pub is_prefix: bool, + pub query: String, +} + +impl Automaton { + pub fn dfa(&self) -> DFA { + if self.is_prefix { + build_prefix_dfa(&self.query) + } else { + build_dfa(&self.query) + } + } + + fn exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: false, + query: query.to_string(), + } + } + + fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: true, + query: query.to_string(), + } + } + + fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: false, + is_prefix: false, + query: query.to_string(), + } + } +} + +pub fn normalize_str(string: &str) -> String { + let mut string = string.to_lowercase(); + + if !string.contains(is_cjk) { + string = deunicode::deunicode_with_tofu(&string, ""); + } + + string +} + +fn generate_automatons( + reader: &impl rkv::Readable, + query: &str, + main_store: store::Main, + synonym_store: store::Synonyms, +) -> MResult<(Vec>, QueryEnhancer)> +{ + let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); + let synonyms = match main_store.synonyms_fst(reader)? 
{ + Some(synonym) => synonym, + None => fst::Set::default(), + }; + + let mut automaton_index = 0; + let mut automatons = Vec::new(); + let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); + + // We must not declare the original words to the query enhancer + // *but* we need to push them in the automatons list first + let mut original_automatons = Vec::new(); + let mut original_words = query_words.iter().peekable(); + while let Some(word) = original_words.next() { + + let has_following_word = original_words.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + + let automaton = if not_prefix_dfa { + Automaton::exact(automaton_index, 1, word) + } else { + Automaton::prefix_exact(automaton_index, 1, word) + }; + automaton_index += 1; + original_automatons.push(automaton); + } + + automatons.push(original_automatons); + + for n in 1..=NGRAMS { + let mut ngrams = query_words.windows(n).enumerate().peekable(); + while let Some((query_index, ngram_slice)) = ngrams.next() { + + let query_range = query_index..query_index + n; + let ngram_nb_words = ngram_slice.len(); + let ngram = ngram_slice.join(" "); + + let has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + // automaton of synonyms of the ngrams + let normalized = normalize_str(&ngram); + let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }; + + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(base) = stream.next() { + + // only trigger alternatives when the last word has been typed + // i.e. "new " do not but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { continue } + + if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? 
{ + + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + Automaton::exact(automaton_index, n, synonym) + } else { + Automaton::non_exact(automaton_index, n, synonym) + }; + automaton_index += 1; + automatons.push(vec![automaton]); + } + } + } + } + + if n != 1 { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = Automaton::exact(automaton_index, n, &normalized); + automaton_index += 1; + automatons.push(vec![automaton]); + } + } + } + + // order automatons, the most important first, + // we keep the original automatons at the front. + automatons[1..].sort_unstable_by_key(|a| { + let a = a.first().unwrap(); + (Reverse(a.is_exact), Reverse(a.ngram)) + }); + + Ok((automatons, enhancer_builder.build())) +} diff --git a/meilidb-core/src/automaton/query_enhancer.rs b/meilidb-core/src/automaton/query_enhancer.rs new file mode 100644 index 000000000..165c1b094 --- /dev/null +++ b/meilidb-core/src/automaton/query_enhancer.rs @@ -0,0 +1,398 @@ +use std::ops::Range; +use std::cmp::Ordering::{Less, Greater, Equal}; + +/// Return `true` if the specified range can accept the given replacements words. +/// Returns `false` if the replacements words are already present in the original query +/// or if there is fewer replacement words than the range to replace. 
+// +// +// ## Ignored because already present in original +// +// new york city subway +// -------- ^^^^ +// / \ +// [new york city] +// +// +// ## Ignored because smaller than the original +// +// new york city subway +// ------------- +// \ / +// [new york] +// +// +// ## Accepted because bigger than the original +// +// NYC subway +// --- +// / \ +// / \ +// / \ +// / \ +// / \ +// [new york city] +// +fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool +where S: AsRef, + T: AsRef, +{ + if words.len() <= range.len() { + // there is fewer or equal replacement words + // than there is already in the replaced range + return false + } + + // retrieve the part to rewrite but with the length + // of the replacement part + let original = query.iter().skip(range.start).take(words.len()); + + // check if the original query doesn't already contain + // the replacement words + !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) +} + +type Origin = usize; +type RealLength = usize; + +struct FakeIntervalTree { + intervals: Vec<(Range, (Origin, RealLength))>, +} + +impl FakeIntervalTree { + fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { + intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); + FakeIntervalTree { intervals } + } + + fn query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { + let element = self.intervals.binary_search_by(|(r, _)| { + if point >= r.start { + if point < r.end { Equal } else { Less } + } else { Greater } + }); + + let n = match element { Ok(n) => n, Err(n) => n }; + + match self.intervals.get(n) { + Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), + _otherwise => None, + } + } +} + +pub struct QueryEnhancerBuilder<'a, S> { + query: &'a [S], + origins: Vec, + real_to_origin: Vec<(Range, (Origin, RealLength))>, +} + +impl> QueryEnhancerBuilder<'_, S> { + pub fn new(query: &[S]) -> QueryEnhancerBuilder { + // we initialize origins 
query indices based on their positions + let origins: Vec<_> = (0..query.len() + 1).collect(); + let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); + + QueryEnhancerBuilder { query, origins, real_to_origin } + } + + /// Update the final real to origin query indices mapping. + /// + /// `range` is the original words range that this `replacement` words replace + /// and `real` is the first real query index of these replacement words. + pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) + where T: AsRef, + { + // check if the range of original words + // can be rewritten with the replacement words + if rewrite_range_with(self.query, range.clone(), replacement) { + + // this range can be replaced so we need to + // modify the origins accordingly + let offset = replacement.len() - range.len(); + + let previous_padding = self.origins[range.end - 1]; + let current_offset = (self.origins[range.end] - 1) - previous_padding; + let diff = offset.saturating_sub(current_offset); + self.origins[range.end] += diff; + + for r in &mut self.origins[range.end + 1..] { + *r += diff; + } + } + + // we need to store the real number and origins relations + // this way it will be possible to know by how many + // we need to pad real query indices + let real_range = real..real + replacement.len().max(range.len()); + let real_length = replacement.len(); + self.real_to_origin.push((real_range, (range.start, real_length))); + } + + pub fn build(self) -> QueryEnhancer { + QueryEnhancer { + origins: self.origins, + real_to_origin: FakeIntervalTree::new(self.real_to_origin), + } + } +} + +pub struct QueryEnhancer { + origins: Vec, + real_to_origin: FakeIntervalTree, +} + +impl QueryEnhancer { + /// Returns the query indices to use to replace this real query index. 
+ pub fn replacement(&self, real: u32) -> Range { + let real = real as usize; + + // query the fake interval tree with the real query index + let (range, (origin, real_length)) = + self.real_to_origin + .query(real) + .expect("real has never been declared"); + + // if `real` is the end bound of the range + if (range.start + real_length - 1) == real { + let mut count = range.len(); + let mut new_origin = origin; + for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { + let len = slice[1] - slice[0]; + count = count.saturating_sub(len); + if count == 0 { new_origin = origin + i; break } + } + + let n = real - range.start; + let start = self.origins[origin]; + let end = self.origins[new_origin + 1]; + let remaining = (end - start) - n; + + Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } + + } else { + // just return the origin along with + // the real position of the word + let n = real as usize - range.start; + let origin = self.origins[origin]; + + Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn original_unmodified() { + let query = ["new", "york", "city", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york = new york city + builder.declare(0..2, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..2); // york + assert_eq!(enhancer.replacement(2), 2..3); // city + assert_eq!(enhancer.replacement(3), 3..4); // subway + assert_eq!(enhancer.replacement(4), 0..1); // new + assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + } + + #[test] + fn simple_growing() { + let query = ["new", "york", "subway"]; + // 0 1 2 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york = new york city + builder.declare(0..2, 3, 
&["new", "york", "city"]); + // ^ 3 4 5 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..3); // york + assert_eq!(enhancer.replacement(2), 3..4); // subway + assert_eq!(enhancer.replacement(3), 0..1); // new + assert_eq!(enhancer.replacement(4), 1..2); // york + assert_eq!(enhancer.replacement(5), 2..3); // city + } + + #[test] + fn same_place_growings() { + let query = ["NY", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NY = new york + builder.declare(0..1, 2, &["new", "york"]); + // ^ 2 3 + + // NY = new york city + builder.declare(0..1, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // NY = NYC + builder.declare(0..1, 7, &["NYC"]); + // ^ 7 + + // NY = new york city + builder.declare(0..1, 8, &["new", "york", "city"]); + // ^ 8 9 10 + + // subway = underground train + builder.declare(1..2, 11, &["underground", "train"]); + // ^ 11 12 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..3); // NY + assert_eq!(enhancer.replacement(1), 3..5); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..3); // york + assert_eq!(enhancer.replacement(4), 0..1); // new + assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + assert_eq!(enhancer.replacement(7), 0..3); // NYC + assert_eq!(enhancer.replacement(8), 0..1); // new + assert_eq!(enhancer.replacement(9), 1..2); // york + assert_eq!(enhancer.replacement(10), 2..3); // city + assert_eq!(enhancer.replacement(11), 3..4); // underground + assert_eq!(enhancer.replacement(12), 4..5); // train + } + + #[test] + fn bigger_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(0..1, 2, &["new", "york", "city"]); + // ^ 2 3 4 + + let enhancer = builder.build(); + + 
assert_eq!(enhancer.replacement(0), 0..3); // NYC + assert_eq!(enhancer.replacement(1), 3..4); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..2); // york + assert_eq!(enhancer.replacement(4), 2..3); // city + } + + #[test] + fn middle_query_growing() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..6); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + } + + #[test] + fn end_query_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(1..2, 2, &["underground", "train"]); + // ^ 2 3 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // NYC + assert_eq!(enhancer.replacement(1), 1..3); // subway + assert_eq!(enhancer.replacement(2), 1..2); // underground + assert_eq!(enhancer.replacement(3), 2..3); // train + } + + #[test] + fn multiple_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + 
assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + } + + #[test] + fn multiple_probable_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + // great awesome = good + builder.declare(0..2, 9, &["good"]); + // ^ 9 + + // awesome NYC = NY + builder.declare(1..3, 10, &["NY"]); + // ^^ 10 + + // NYC subway = metro + builder.declare(2..4, 11, &["metro"]); + // ^^ 11 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + assert_eq!(enhancer.replacement(9), 0..2); // good + assert_eq!(enhancer.replacement(10), 1..5); // NY + assert_eq!(enhancer.replacement(11), 2..5); // metro + } +} diff --git a/meilidb-core/src/criterion/document_id.rs b/meilidb-core/src/criterion/document_id.rs index 34d0bd7f5..15549da24 100644 --- a/meilidb-core/src/criterion/document_id.rs +++ b/meilidb-core/src/criterion/document_id.rs @@ -10,7 +10,7 @@ impl Criterion for DocumentId { lhs.id.cmp(&rhs.id) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "DocumentId" } } diff 
--git a/meilidb-core/src/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs index bde3ca733..820c35aa0 100644 --- a/meilidb-core/src/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -37,7 +37,7 @@ impl Criterion for Exact { lhs.cmp(&rhs).reverse() } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "Exact" } } diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs index 6ce42007c..ad02d3023 100644 --- a/meilidb-core/src/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -4,6 +4,7 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; +mod sort_by_attr; mod document_id; use std::cmp::Ordering; @@ -16,13 +17,14 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, + sort_by_attr::SortByAttr, document_id::DocumentId, }; pub trait Criterion: Send + Sync { fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; - fn name(&self) -> &'static str; + fn name(&self) -> &str; #[inline] fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { @@ -35,7 +37,7 @@ impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { (**self).evaluate(lhs, rhs) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { (**self).name() } @@ -49,7 +51,7 @@ impl Criterion for Box { (**self).evaluate(lhs, rhs) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { (**self).name() } diff --git a/meilidb-core/src/criterion/number_of_words.rs b/meilidb-core/src/criterion/number_of_words.rs index 43095a066..641385fb1 100644 --- a/meilidb-core/src/criterion/number_of_words.rs +++ b/meilidb-core/src/criterion/number_of_words.rs @@ -25,7 +25,7 @@ impl Criterion for NumberOfWords { lhs.cmp(&rhs).reverse() } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "NumberOfWords" } } diff --git a/meilidb/src/sort_by_attr.rs b/meilidb-core/src/criterion/sort_by_attr.rs similarity index 96% 
rename from meilidb/src/sort_by_attr.rs rename to meilidb-core/src/criterion/sort_by_attr.rs index 83577df13..c19062dd6 100644 --- a/meilidb/src/sort_by_attr.rs +++ b/meilidb-core/src/criterion/sort_by_attr.rs @@ -2,9 +2,9 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; -use meilidb_core::{criterion::Criterion, RawDocument}; -use meilidb_data::RankedMap; use meilidb_schema::{Schema, SchemaAttr}; +use crate::criterion::Criterion; +use crate::{RawDocument, RankedMap}; /// An helper struct that permit to sort documents by /// some of their stored attributes. @@ -101,7 +101,7 @@ impl<'a> Criterion for SortByAttr<'a> { } } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "SortByAttr" } } diff --git a/meilidb-core/src/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs index 6736e6caa..9fbf0dab9 100644 --- a/meilidb-core/src/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -54,7 +54,7 @@ impl Criterion for SumOfTypos { lhs.cmp(&rhs).reverse() } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "SumOfTypos" } } diff --git a/meilidb-core/src/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs index d5787ef3a..2bf052159 100644 --- a/meilidb-core/src/criterion/sum_of_words_attribute.rs +++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs @@ -36,7 +36,7 @@ impl Criterion for SumOfWordsAttribute { lhs.cmp(&rhs) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "SumOfWordsAttribute" } } diff --git a/meilidb-core/src/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs index 13f26774c..d5dd10ab7 100644 --- a/meilidb-core/src/criterion/sum_of_words_position.rs +++ b/meilidb-core/src/criterion/sum_of_words_position.rs @@ -36,7 +36,7 @@ impl Criterion for SumOfWordsPosition { lhs.cmp(&rhs) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "SumOfWordsPosition" } } diff --git 
a/meilidb-core/src/criterion/words_proximity.rs b/meilidb-core/src/criterion/words_proximity.rs index 10f167bef..ed3775b50 100644 --- a/meilidb-core/src/criterion/words_proximity.rs +++ b/meilidb-core/src/criterion/words_proximity.rs @@ -99,7 +99,7 @@ impl Criterion for WordsProximity { lhs.cmp(&rhs) } - fn name(&self) -> &'static str { + fn name(&self) -> &str { "WordsProximity" } } diff --git a/meilidb-core/src/database.rs b/meilidb-core/src/database.rs new file mode 100644 index 000000000..c74bfcc7a --- /dev/null +++ b/meilidb-core/src/database.rs @@ -0,0 +1,177 @@ +use std::collections::HashMap; +use std::path::Path; +use std::sync::{Arc, RwLock}; +use std::{fs, thread}; + +use crossbeam_channel::Receiver; +use log::{debug, error}; + +use crate::{store, update, Index, MResult}; + +pub type BoxUpdateFn = Box; +type ArcSwapFn = arc_swap::ArcSwapOption; + +pub struct Database { + pub rkv: Arc>, + main_store: rkv::SingleStore, + indexes_store: rkv::SingleStore, + indexes: RwLock, thread::JoinHandle<()>)>>, +} + +fn update_awaiter( + receiver: Receiver<()>, + rkv: Arc>, + update_fn: Arc, + index: Index, +) +{ + for () in receiver { + // consume all updates in order (oldest first) + loop { + let rkv = match rkv.read() { + Ok(rkv) => rkv, + Err(e) => { error!("rkv RwLock read failed: {}", e); break } + }; + + let mut writer = match rkv.write() { + Ok(writer) => writer, + Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break } + }; + + match update::update_task(&mut writer, index.clone()) { + Ok(Some(status)) => { + if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) } + + if let Some(ref callback) = *update_fn.load() { + (callback)(status); + } + }, + // no more updates to handle for now + Ok(None) => { debug!("no more updates"); writer.abort(); break }, + Err(e) => { error!("update task failed: {}", e); writer.abort() }, + } + } + } +} + +impl Database { + pub fn open_or_create(path: impl AsRef) -> MResult { + let manager = 
rkv::Manager::singleton(); + let mut rkv_write = manager.write().unwrap(); + + fs::create_dir_all(path.as_ref())?; + + let rkv = rkv_write + .get_or_create(path.as_ref(), |path| { + let mut builder = rkv::Rkv::environment_builder(); + builder.set_max_dbs(3000).set_map_size(10 * 1024 * 1024 * 1024); // 10GB + rkv::Rkv::from_env(path, builder) + })?; + + drop(rkv_write); + + let rkv_read = rkv.read().unwrap(); + let create_options = rkv::store::Options::create(); + let main_store = rkv_read.open_single("main", create_options)?; + let indexes_store = rkv_read.open_single("indexes", create_options)?; + + // list all indexes that needs to be opened + let mut must_open = Vec::new(); + let reader = rkv_read.read()?; + for result in indexes_store.iter_start(&reader)? { + let (key, _) = result?; + if let Ok(index_name) = std::str::from_utf8(key) { + must_open.push(index_name.to_owned()); + } + } + + drop(reader); + + // open the previously aggregated indexes + let mut indexes = HashMap::new(); + for index_name in must_open { + + let (sender, receiver) = crossbeam_channel::bounded(100); + let index = store::open(&rkv_read, &index_name, sender.clone())?; + let update_fn = Arc::new(ArcSwapFn::empty()); + + let rkv_clone = rkv.clone(); + let index_clone = index.clone(); + let update_fn_clone = update_fn.clone(); + + let handle = thread::spawn(move || { + update_awaiter(receiver, rkv_clone, update_fn_clone, index_clone) + }); + + // send an update notification to make sure that + // possible previous boot updates are consumed + sender.send(()).unwrap(); + + let result = indexes.insert(index_name, (index, update_fn, handle)); + assert!(result.is_none(), "The index should not have been already open"); + } + + drop(rkv_read); + + Ok(Database { rkv, main_store, indexes_store, indexes: RwLock::new(indexes) }) + } + + pub fn open_index( + &self, + name: impl Into, + update_fn: Option, + ) -> MResult + { + let indexes_lock = self.indexes.read().unwrap(); + let name = name.into(); + + 
match indexes_lock.get(&name) { + Some((index, old_update_fn, _)) => { + old_update_fn.swap(update_fn.map(Arc::new)); + Ok(index.clone()) + }, + None => { + drop(indexes_lock); + + let rkv_lock = self.rkv.read().unwrap(); + let (sender, receiver) = crossbeam_channel::bounded(100); + let index = store::create(&rkv_lock, &name, sender)?; + + let mut writer = rkv_lock.write()?; + let value = rkv::Value::Blob(&[]); + self.indexes_store.put(&mut writer, &name, &value)?; + + { + let mut indexes_write = self.indexes.write().unwrap(); + indexes_write.entry(name).or_insert_with(|| { + let rkv_clone = self.rkv.clone(); + let index_clone = index.clone(); + + let update_fn = update_fn.map(Arc::new); + let update_fn = Arc::new(ArcSwapFn::new(update_fn)); + let update_fn_clone = update_fn.clone(); + + let handle = thread::spawn(move || { + update_awaiter(receiver, rkv_clone, update_fn_clone, index_clone) + }); + + (index.clone(), update_fn, handle) + }); + } + + writer.commit()?; + + Ok(index) + }, + } + } + + pub fn indexes_names(&self) -> MResult> { + let indexes = self.indexes.read().unwrap(); + Ok(indexes.keys().cloned().collect()) + } + + pub fn main_store(&self) -> rkv::SingleStore { + self.main_store + } +} diff --git a/meilidb-core/src/error.rs b/meilidb-core/src/error.rs new file mode 100644 index 000000000..db83e39fd --- /dev/null +++ b/meilidb-core/src/error.rs @@ -0,0 +1,112 @@ +use std::{error, fmt, io}; +use crate::serde::{SerializerError, DeserializerError}; + +pub type MResult = Result; + +#[derive(Debug)] +pub enum Error { + Io(io::Error), + SchemaDiffer, + SchemaMissing, + WordIndexMissing, + MissingDocumentId, + Rkv(rkv::StoreError), + Fst(fst::Error), + RmpDecode(rmp_serde::decode::Error), + RmpEncode(rmp_serde::encode::Error), + Bincode(bincode::Error), + Serializer(SerializerError), + Deserializer(DeserializerError), + UnsupportedOperation(UnsupportedOperation), +} + +impl From for Error { + fn from(error: io::Error) -> Error { + Error::Io(error) + } +} + 
+impl From for Error { + fn from(error: rkv::StoreError) -> Error { + Error::Rkv(error) + } +} + +impl From for Error { + fn from(error: fst::Error) -> Error { + Error::Fst(error) + } +} + +impl From for Error { + fn from(error: rmp_serde::decode::Error) -> Error { + Error::RmpDecode(error) + } +} + +impl From for Error { + fn from(error: rmp_serde::encode::Error) -> Error { + Error::RmpEncode(error) + } +} + +impl From for Error { + fn from(error: bincode::Error) -> Error { + Error::Bincode(error) + } +} + +impl From for Error { + fn from(error: SerializerError) -> Error { + Error::Serializer(error) + } +} + +impl From for Error { + fn from(error: DeserializerError) -> Error { + Error::Deserializer(error) + } +} + +impl From for Error { + fn from(op: UnsupportedOperation) -> Error { + Error::UnsupportedOperation(op) + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + Io(e) => write!(f, "{}", e), + SchemaDiffer => write!(f, "schemas differ"), + SchemaMissing => write!(f, "this index does not have a schema"), + WordIndexMissing => write!(f, "this index does not have a word index"), + MissingDocumentId => write!(f, "document id is missing"), + Rkv(e) => write!(f, "rkv error; {}", e), + Fst(e) => write!(f, "fst error; {}", e), + RmpDecode(e) => write!(f, "rmp decode error; {}", e), + RmpEncode(e) => write!(f, "rmp encode error; {}", e), + Bincode(e) => write!(f, "bincode error; {}", e), + Serializer(e) => write!(f, "serializer error; {}", e), + Deserializer(e) => write!(f, "deserializer error; {}", e), + UnsupportedOperation(op) => write!(f, "unsupported operation; {}", op), + } + } +} + +impl error::Error for Error { } + +#[derive(Debug)] +pub enum UnsupportedOperation { + SchemaAlreadyExists, +} + +impl fmt::Display for UnsupportedOperation { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::UnsupportedOperation::*; + match self { + SchemaAlreadyExists => write!(f, 
"Cannot update index which already have a schema"), + } + } +} diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 0a7844292..83b0d9424 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -1,25 +1,31 @@ -#![feature(checked_duration_since)] - #[cfg(test)] #[macro_use] extern crate assert_matches; mod automaton; +mod database; mod distinct_map; +mod error; +mod number; mod query_builder; -mod query_enhancer; +mod ranked_map; mod raw_document; mod reordered_attrs; -mod store; +mod update; pub mod criterion; +pub mod raw_indexer; +pub mod serde; +pub mod store; -use serde::{Serialize, Deserialize}; -use zerocopy::{AsBytes, FromBytes}; - -use self::raw_document::raw_documents_from; - -pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; +pub use self::database::{Database, BoxUpdateFn}; +pub use self::error::{Error, MResult}; +pub use self::number::{Number, ParseNumberError}; +pub use self::ranked_map::RankedMap; pub use self::raw_document::RawDocument; -pub use self::store::Store; +pub use self::store::Index; +pub use self::update::{UpdateStatus, UpdateResult}; + +use zerocopy::{AsBytes, FromBytes}; +use ::serde::{Serialize, Deserialize}; /// Represent an internally generated document unique identifier. 
/// diff --git a/meilidb-data/src/number.rs b/meilidb-core/src/number.rs similarity index 100% rename from meilidb-data/src/number.rs rename to meilidb-core/src/number.rs diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index d0c117cad..7bbcf94fb 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -1,248 +1,40 @@ +use hashbrown::HashMap; use std::hash::Hash; +use std::mem; use std::ops::Range; use std::rc::Rc; use std::time::{Instant, Duration}; -use std::{mem, cmp, cmp::Reverse}; -use fst::{Streamer, IntoStreamer}; -use hashbrown::HashMap; -use levenshtein_automata::DFA; -use log::trace; -use meilidb_tokenizer::{is_cjk, split_query_string}; -use rayon::slice::ParallelSliceMut; -use rayon::iter::{ParallelIterator, ParallelBridge}; +use fst::{IntoStreamer, Streamer}; use sdset::SetBuf; use slice_group_by::{GroupBy, GroupByMut}; -use crate::automaton::{build_dfa, build_prefix_dfa}; -use crate::criterion::Criteria; +use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; -use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer}; -use crate::raw_documents_from; -use crate::reordered_attrs::ReorderedAttrs; -use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document}; +use crate::raw_document::{RawDocument, raw_documents_from}; +use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria}; +use crate::{store, MResult, reordered_attrs::ReorderedAttrs}; -const NGRAMS: usize = 3; - -struct Automaton { - index: usize, - ngram: usize, - query_len: usize, - is_exact: bool, - is_prefix: bool, - query: String, -} - -impl Automaton { - fn dfa(&self) -> DFA { - if self.is_prefix { - build_prefix_dfa(&self.query) - } else { - build_dfa(&self.query) - } - } - - fn exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - 
is_prefix: false, - query: query.to_string(), - } - } - - fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - is_prefix: true, - query: query.to_string(), - } - } - - fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: false, - is_prefix: false, - query: query.to_string(), - } - } -} - -pub fn normalize_str(string: &str) -> String { - let mut string = string.to_lowercase(); - - if !string.contains(is_cjk) { - string = deunicode::deunicode_with_tofu(&string, ""); - } - - string -} - -fn generate_automatons(query: &str, store: &S) -> Result<(Vec, QueryEnhancer), S::Error> { - let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); - let synonyms = store.synonyms()?; - - let mut automatons = Vec::new(); - let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); - - // We must not declare the original words to the query enhancer - // *but* we need to push them in the automatons list first - let mut original_words = query_words.iter().peekable(); - while let Some(word) = original_words.next() { - - let has_following_word = original_words.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - - let automaton = if not_prefix_dfa { - Automaton::exact(automatons.len(), 1, word) - } else { - Automaton::prefix_exact(automatons.len(), 1, word) - }; - automatons.push(automaton); - } - - for n in 1..=NGRAMS { - - let mut ngrams = query_words.windows(n).enumerate().peekable(); - while let Some((query_index, ngram_slice)) = ngrams.next() { - - let query_range = query_index..query_index + n; - let ngram_nb_words = ngram_slice.len(); - let ngram = ngram_slice.join(" "); - - let has_following_word = 
ngrams.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); - - // automaton of synonyms of the ngrams - let normalized = normalize_str(&ngram); - let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }; - - let mut stream = synonyms.search(&lev).into_stream(); - while let Some(base) = stream.next() { - - // only trigger alternatives when the last word has been typed - // i.e. "new " do not but "new yo" triggers alternatives to "new york" - let base = std::str::from_utf8(base).unwrap(); - let base_nb_words = split_query_string(base).count(); - if ngram_nb_words != base_nb_words { continue } - - if let Some(synonyms) = store.alternatives_to(base.as_bytes())? { - - let mut stream = synonyms.into_stream(); - while let Some(synonyms) = stream.next() { - let synonyms = std::str::from_utf8(synonyms).unwrap(); - let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); - let nb_synonym_words = synonyms_words.len(); - - let real_query_index = automatons.len(); - enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); - - for synonym in synonyms_words { - let automaton = if nb_synonym_words == 1 { - Automaton::exact(automatons.len(), n, synonym) - } else { - Automaton::non_exact(automatons.len(), n, synonym) - }; - automatons.push(automaton); - } - } - } - } - - if n != 1 { - // automaton of concatenation of query words - let concat = ngram_slice.concat(); - let normalized = normalize_str(&concat); - - let real_query_index = automatons.len(); - enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); - - let automaton = Automaton::exact(automatons.len(), n, &normalized); - automatons.push(automaton); - } - } - } - - // order automatons, the most important first, - // we keep the original automatons at the front. 
- let original_len = query_words.len(); - automatons[original_len..].sort_unstable_by_key(|a| (Reverse(a.is_exact), Reverse(a.ngram))); - - Ok((automatons, enhancer_builder.build())) -} - -pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> { - store: S, +pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> { criteria: Criteria<'c>, searchable_attrs: Option, filter: Option, - fetch_timeout: Option, -} - -impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> { - pub fn new(store: S) -> Self { - QueryBuilder::with_criteria(store, Criteria::default()) - } - - pub fn with_criteria(store: S, criteria: Criteria<'c>) -> Self { - QueryBuilder { store, criteria, searchable_attrs: None, filter: None, fetch_timeout: None } - } -} - -impl<'c, S, FI> QueryBuilder<'c, S, FI> -{ - pub fn with_filter(self, function: F) -> QueryBuilder<'c, S, F> - where F: Fn(DocumentId) -> bool, - { - QueryBuilder { - store: self.store, - criteria: self.criteria, - searchable_attrs: self.searchable_attrs, - filter: Some(function), - fetch_timeout: self.fetch_timeout, - } - } - - pub fn with_fetch_timeout(self, timeout: Duration) -> QueryBuilder<'c, S, FI> { - QueryBuilder { fetch_timeout: Some(timeout), ..self } - } - - pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'c, S, FI, F> - where F: Fn(DocumentId) -> Option, - K: Hash + Eq, - { - DistinctQueryBuilder { inner: self, function, size } - } - - pub fn add_searchable_attribute(&mut self, attribute: u16) { - let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new); - reorders.insert_attribute(attribute); - } + timeout: Duration, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + synonyms_store: store::Synonyms, } fn multiword_rewrite_matches( mut matches: Vec<(DocumentId, TmpMatch)>, query_enhancer: &QueryEnhancer, - timeout: Option, ) -> SetBuf<(DocumentId, TmpMatch)> { let mut padded_matches = Vec::with_capacity(matches.len()); // we sort the matches by word 
index to make them rewritable - let start = Instant::now(); - matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); - trace!("rewrite sort by word_index took {:.2?}", start.elapsed()); + matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); - let start = Instant::now(); // for each attribute of each document for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { @@ -322,194 +114,248 @@ fn multiword_rewrite_matches( padding += biggest; } - - // check the timeout *after* having processed at least one element - if timeout.map_or(false, |timeout| start.elapsed() > timeout) { break } } - trace!("main multiword rewrite took {:.2?}", start.elapsed()); - let start = Instant::now(); for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { document_matches.sort_unstable(); } - trace!("final rewrite sort took {:.2?}", start.elapsed()); SetBuf::new_unchecked(padded_matches) } -impl<'c, S, FI> QueryBuilder<'c, S, FI> -where S: Store + Sync, - S::Error: Send, +fn fetch_raw_documents( + reader: &impl rkv::Readable, + automatons: &[Automaton], + query_enhancer: &QueryEnhancer, + searchables: Option<&ReorderedAttrs>, + main_store: &store::Main, + postings_lists_store: &store::PostingsLists, +) -> MResult> { - fn query_all(&self, query: &str) -> Result, S::Error> { - let (automatons, query_enhancer) = generate_automatons(query, &self.store)?; - let searchables = self.searchable_attrs.as_ref(); - let store = &self.store; - let fetch_timeout = &self.fetch_timeout; + let mut matches = Vec::new(); + let mut highlights = Vec::new(); - let mut matches = Vec::new(); - let mut highlights = Vec::new(); + for automaton in automatons { + let Automaton { index, is_exact, query_len, .. } = automaton; + let dfa = automaton.dfa(); - let timeout = fetch_timeout.map(|d| d * 75 / 100); - let start = Instant::now(); + let words = match main_store.words_fst(reader)? 
{ + Some(words) => words, + None => return Ok(Vec::new()), + }; - let results: Vec<_> = automatons - .into_iter() - .par_bridge() - .map_with((store, searchables), |(store, searchables), automaton| { - let Automaton { index, is_exact, query_len, .. } = automaton; - let dfa = automaton.dfa(); + let mut stream = words.search(&dfa).into_stream(); + while let Some(input) = stream.next() { + let distance = dfa.eval(input).to_u8(); + let is_exact = *is_exact && distance == 0 && input.len() == *query_len; - let words = match store.words() { - Ok(words) => words, - Err(err) => return Some(Err(err)), - }; + let doc_indexes = match postings_lists_store.postings_list(reader, input)? { + Some(doc_indexes) => doc_indexes, + None => continue, + }; - let mut stream = words.search(&dfa).into_stream(); - let mut matches = Vec::new(); - let mut highlights = Vec::new(); + matches.reserve(doc_indexes.len()); + highlights.reserve(doc_indexes.len()); - while let Some(input) = stream.next() { - let distance = dfa.eval(input).to_u8(); - let is_exact = is_exact && distance == 0 && input.len() == query_len; - - let doc_indexes = match store.word_indexes(input) { - Ok(Some(doc_indexes)) => doc_indexes, - Ok(None) => continue, - Err(err) => return Some(Err(err)), + for di in doc_indexes.as_ref() { + let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); + if let Some(attribute) = attribute { + let match_ = TmpMatch { + query_index: *index as u32, + distance, + attribute, + word_index: di.word_index, + is_exact, }; - matches.reserve(doc_indexes.len()); - highlights.reserve(doc_indexes.len()); + let highlight = Highlight { + attribute: di.attribute, + char_index: di.char_index, + char_length: di.char_length, + }; - for di in doc_indexes.as_slice() { - let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); - if let Some(attribute) = attribute { - let match_ = TmpMatch { - query_index: index as u32, - distance, - attribute, - word_index: 
di.word_index, - is_exact, - }; - - let highlight = Highlight { - attribute: di.attribute, - char_index: di.char_index, - char_length: di.char_length, - }; - - matches.push((di.document_id, match_)); - highlights.push((di.document_id, highlight)); - } - } - - // check the timeout *after* having processed at least one element - if timeout.map_or(false, |timeout| start.elapsed() > timeout) { break } + matches.push((di.document_id, match_)); + highlights.push((di.document_id, highlight)); } - - Some(Ok((matches, highlights))) - }) - .while_some() - .collect(); - - for result in results { - let (mut rcv_matches, mut rcv_highlights) = result?; - matches.append(&mut rcv_matches); - highlights.append(&mut rcv_highlights); + } } + } - trace!("main query all took {:.2?}", start.elapsed()); - trace!("{} total matches to rewrite", matches.len()); + let matches = multiword_rewrite_matches(matches, &query_enhancer); + let highlights = { + highlights.sort_unstable_by_key(|(id, _)| *id); + SetBuf::new_unchecked(highlights) + }; - let start = Instant::now(); - let timeout = fetch_timeout.map(|d| d * 25 / 100); - let matches = multiword_rewrite_matches(matches, &query_enhancer, timeout); - trace!("multiword rewrite took {:.2?}", start.elapsed()); + Ok(raw_documents_from(matches, highlights)) +} - let start = Instant::now(); - let highlights = { - highlights.par_sort_unstable_by_key(|(id, _)| *id); - SetBuf::new_unchecked(highlights) - }; - trace!("sorting highlights took {:.2?}", start.elapsed()); +impl<'c> QueryBuilder<'c> { + pub fn new( + main: store::Main, + postings_lists: store::PostingsLists, + synonyms: store::Synonyms, + ) -> QueryBuilder<'c> + { + QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default()) + } - trace!("{} total matches to classify", matches.len()); - - let start = Instant::now(); - let raw_documents = raw_documents_from(matches, highlights); - trace!("making raw documents took {:.2?}", start.elapsed()); - - trace!("{} total documents 
to classify", raw_documents.len()); - - Ok(raw_documents) + pub fn with_criteria( + main: store::Main, + postings_lists: store::PostingsLists, + synonyms: store::Synonyms, + criteria: Criteria<'c>, + ) -> QueryBuilder<'c> + { + QueryBuilder { + criteria, + searchable_attrs: None, + filter: None, + timeout: Duration::from_millis(30), + main_store: main, + postings_lists_store: postings_lists, + synonyms_store: synonyms, + } } } -impl<'c, S, FI> QueryBuilder<'c, S, FI> -where S: Store + Sync, - S::Error: Send, - FI: Fn(DocumentId) -> bool, -{ - pub fn query(self, query: &str, range: Range) -> Result, S::Error> { +impl<'c, FI> QueryBuilder<'c, FI> { + pub fn with_filter(self, function: F) -> QueryBuilder<'c, F> + where F: Fn(DocumentId) -> bool, + { + QueryBuilder { + criteria: self.criteria, + searchable_attrs: self.searchable_attrs, + filter: Some(function), + timeout: self.timeout, + main_store: self.main_store, + postings_lists_store: self.postings_lists_store, + synonyms_store: self.synonyms_store, + } + } + + pub fn with_fetch_timeout(self, timeout: Duration) -> QueryBuilder<'c, FI> { + QueryBuilder { timeout, ..self } + } + + pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'c, FI, F> + where F: Fn(DocumentId) -> Option, + K: Hash + Eq, + { + DistinctQueryBuilder { inner: self, function, size } + } + + pub fn add_searchable_attribute(&mut self, attribute: u16) { + let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new); + reorders.insert_attribute(attribute); + } +} + +impl QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool { + pub fn query( + self, + reader: &impl rkv::Readable, + query: &str, + range: Range, + ) -> MResult> + { // We delegate the filter work to the distinct query builder, // specifying a distinct rule that has no effect. 
if self.filter.is_some() { let builder = self.with_distinct(|_| None as Option<()>, 1); - return builder.query(query, range); + return builder.query(reader, query, range); } - let start = Instant::now(); - let mut documents = self.query_all(query)?; - trace!("query_all took {:.2?}", start.elapsed()); + let start_processing = Instant::now(); + let mut raw_documents_processed = Vec::with_capacity(range.len()); - let mut groups = vec![documents.as_mut_slice()]; + let (automaton_producer, query_enhancer) = AutomatonProducer::new( + reader, + query, + self.main_store, + self.synonyms_store, + )?; - 'criteria: for criterion in self.criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut documents_seen = 0; + let mut automaton_producer = automaton_producer.into_iter(); + let mut automatons = Vec::new(); - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < range.start { - documents_seen += group.len(); - groups.push(group); - continue; - } + // aggregate automatons groups by groups after time + while let Some(auts) = automaton_producer.next() { + automatons.extend(auts); - let start = Instant::now(); - group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); - trace!("criterion {} sort took {:.2?}", criterion.name(), start.elapsed()); + // we must retrieve the documents associated + // with the current automatons + let mut raw_documents = fetch_raw_documents( + reader, + &automatons, + &query_enhancer, + self.searchable_attrs.as_ref(), + &self.main_store, + &self.postings_lists_store, + )?; - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - trace!("criterion {} produced a group of size {}", criterion.name(), group.len()); + // stop processing when time is running out + if !raw_documents_processed.is_empty() && start_processing.elapsed() > self.timeout { + break + } - documents_seen += group.len(); - 
groups.push(group); + let mut groups = vec![raw_documents.as_mut_slice()]; - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { continue 'criteria } + 'criteria: for criterion in self.criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < range.start { + documents_seen += group.len(); + groups.push(group); + continue; + } + + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + documents_seen += group.len(); + groups.push(group); + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if documents_seen >= range.end { continue 'criteria } + } } } + + // once we classified the documents related to the current + // automatons we save that as the next valid result + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + raw_documents_processed.clear(); + raw_documents_processed.extend(iter); + + // stop processing when time is running out + if start_processing.elapsed() > self.timeout { break } } - let offset = cmp::min(documents.len(), range.start); - let iter = documents.into_iter().skip(offset).take(range.len()); - Ok(iter.map(|d| Document::from_raw(d)).collect()) + // make real documents now that we know + // those must be returned + let documents = raw_documents_processed + .into_iter() + .map(|d| Document::from_raw(d)) + .collect(); + + Ok(documents) } } -pub struct DistinctQueryBuilder<'c, I, FI, FD> { - inner: QueryBuilder<'c, I, FI>, +pub struct DistinctQueryBuilder<'c, FI, FD> { + inner: QueryBuilder<'c, FI>, function: 
FD, size: usize, } -impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> -{ - pub fn with_filter(self, function: F) -> DistinctQueryBuilder<'c, I, F, FD> +impl<'c, FI, FD> DistinctQueryBuilder<'c, FI, FD> { + pub fn with_filter(self, function: F) -> DistinctQueryBuilder<'c, F, FD> where F: Fn(DocumentId) -> bool, { DistinctQueryBuilder { @@ -519,7 +365,7 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> } } - pub fn with_fetch_timeout(self, timeout: Duration) -> DistinctQueryBuilder<'c, I, FI, FD> { + pub fn with_fetch_timeout(self, timeout: Duration) -> DistinctQueryBuilder<'c, FI, FD> { DistinctQueryBuilder { inner: self.inner.with_fetch_timeout(timeout), function: self.function, @@ -532,114 +378,156 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> } } -impl<'c, S, FI, FD, K> DistinctQueryBuilder<'c, S, FI, FD> -where S: Store + Sync, - S::Error: Send, - FI: Fn(DocumentId) -> bool, +impl<'c, FI, FD, K> DistinctQueryBuilder<'c, FI, FD> +where FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, K: Hash + Eq, { - pub fn query(self, query: &str, range: Range) -> Result, S::Error> { - let start = Instant::now(); - let mut documents = self.inner.query_all(query)?; - trace!("query_all took {:.2?}", start.elapsed()); + pub fn query( + self, + reader: &impl rkv::Readable, + query: &str, + range: Range, + ) -> MResult> + { + let start_processing = Instant::now(); + let mut raw_documents_processed = Vec::new(); - let mut groups = vec![documents.as_mut_slice()]; - let mut key_cache = HashMap::new(); + let (automaton_producer, query_enhancer) = AutomatonProducer::new( + reader, + query, + self.inner.main_store, + self.inner.synonyms_store, + )?; - let mut filter_map = HashMap::new(); - // these two variables informs on the current distinct map and - // on the raw offset of the start of the group where the - // range.start bound is located according to the distinct function - let mut distinct_map = DistinctMap::new(self.size); - let mut 
distinct_raw_offset = 0; + let mut automaton_producer = automaton_producer.into_iter(); + let mut automatons = Vec::new(); - 'criteria: for criterion in self.inner.criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); - let mut documents_seen = 0; + // aggregate automatons groups by groups after time + while let Some(auts) = automaton_producer.next() { + automatons.extend(auts); - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < distinct_raw_offset { - documents_seen += group.len(); - groups.push(group); - continue; - } + // we must retrieve the documents associated + // with the current automatons + let mut raw_documents = fetch_raw_documents( + reader, + &automatons, + &query_enhancer, + self.inner.searchable_attrs.as_ref(), + &self.inner.main_store, + &self.inner.postings_lists_store, + )?; - let start = Instant::now(); - group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); - trace!("criterion {} sort took {:.2?}", criterion.name(), start.elapsed()); + // stop processing when time is running out + if !raw_documents_processed.is_empty() && start_processing.elapsed() > self.inner.timeout { + break + } - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - // we must compute the real distinguished len of this sub-group - for document in group.iter() { - let filter_accepted = match &self.inner.filter { - Some(filter) => { - let entry = filter_map.entry(document.id); - *entry.or_insert_with(|| (filter)(document.id)) - }, - None => true, - }; + let mut groups = vec![raw_documents.as_mut_slice()]; + let mut key_cache = HashMap::new(); - if filter_accepted { - let entry = key_cache.entry(document.id); - let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new)); + let mut filter_map = HashMap::new(); + // these two 
variables informs on the current distinct map and + // on the raw offset of the start of the group where the + // range.start bound is located according to the distinct function + let mut distinct_map = DistinctMap::new(self.size); + let mut distinct_raw_offset = 0; - match key.clone() { - Some(key) => buf_distinct.register(key), - None => buf_distinct.register_without_key(), + 'criteria: for criterion in self.inner.criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); + let mut documents_seen = 0; + + for group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < distinct_raw_offset { + documents_seen += group.len(); + groups.push(group); + continue; + } + + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + // we must compute the real distinguished len of this sub-group + for document in group.iter() { + let filter_accepted = match &self.inner.filter { + Some(filter) => { + let entry = filter_map.entry(document.id); + *entry.or_insert_with(|| (filter)(document.id)) + }, + None => true, }; + + if filter_accepted { + let entry = key_cache.entry(document.id); + let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new)); + + match key.clone() { + Some(key) => buf_distinct.register(key), + None => buf_distinct.register_without_key(), + }; + } + + // the requested range end is reached: stop computing distinct + if buf_distinct.len() >= range.end { break } } - // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { break } + documents_seen += group.len(); + groups.push(group); + + // if this sub-group does not overlap with the requested range + // we must update the distinct map and its start index + if buf_distinct.len() < 
range.start { + buf_distinct.transfert_to_internal(); + distinct_raw_offset = documents_seen; + } + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if buf_distinct.len() >= range.end { continue 'criteria } } - - trace!("criterion {} produced a group of size {}", criterion.name(), group.len()); - - documents_seen += group.len(); - groups.push(group); - - // if this sub-group does not overlap with the requested range - // we must update the distinct map and its start index - if buf_distinct.len() < range.start { - buf_distinct.transfert_to_internal(); - distinct_raw_offset = documents_seen; - } - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { continue 'criteria } } } - } - let mut out_documents = Vec::with_capacity(range.len()); - let mut seen = BufferedDistinctMap::new(&mut distinct_map); + // once we classified the documents related to the current + // automatons we save that as the next valid result + let mut seen = BufferedDistinctMap::new(&mut distinct_map); + raw_documents_processed.clear(); - for document in documents.into_iter().skip(distinct_raw_offset) { - let filter_accepted = match &self.inner.filter { - Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"), - None => true, - }; - - if filter_accepted { - let key = key_cache.remove(&document.id).expect("BUG: cached key not found"); - let distinct_accepted = match key { - Some(key) => seen.register(key), - None => seen.register_without_key(), + for document in raw_documents.into_iter().skip(distinct_raw_offset) { + let filter_accepted = match &self.inner.filter { + Some(_) => filter_map.remove(&document.id).unwrap(), + None => true, }; - if distinct_accepted && seen.len() > range.start { - out_documents.push(Document::from_raw(document)); - if 
out_documents.len() == range.len() { break } + if filter_accepted { + let key = key_cache.remove(&document.id).unwrap(); + let distinct_accepted = match key { + Some(key) => seen.register(key), + None => seen.register_without_key(), + }; + + if distinct_accepted && seen.len() > range.start { + raw_documents_processed.push(document); + if raw_documents_processed.len() == range.len() { break } + } } } + + // stop processing when time is running out + if start_processing.elapsed() > self.inner.timeout { break } } - Ok(out_documents) + // make real documents now that we know + // those must be returned + let documents = raw_documents_processed + .into_iter() + .map(|d| Document::from_raw(d)) + .collect(); + + Ok(documents) } } @@ -650,19 +538,14 @@ mod tests { use std::collections::{BTreeSet, HashMap}; use std::iter::FromIterator; - use sdset::SetBuf; use fst::{Set, IntoStreamer}; + use sdset::SetBuf; + use tempfile::TempDir; + use crate::automaton::normalize_str; + use crate::database::{Database, BoxUpdateFn}; use crate::DocIndex; - use crate::store::Store; - - #[derive(Default)] - struct InMemorySetStore { - set: Set, - synonyms: Set, - indexes: HashMap, SetBuf>, - alternatives: HashMap, Set>, - } + use crate::store::Index; fn set_from_stream<'f, I, S>(stream: I) -> Set where @@ -693,57 +576,6 @@ mod tests { builder.into_inner().and_then(Set::from_bytes).unwrap() } - impl InMemorySetStore { - pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { - let word = word.to_lowercase(); - let alternatives = self.alternatives.entry(word.as_bytes().to_vec()).or_default(); - let new = sdset_into_fstset(&new); - *alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union()); - - self.synonyms = insert_key(&self.synonyms, word.as_bytes()); - } - } - - impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for InMemorySetStore { - fn from_iter>(iter: I) -> Self { - let mut tree = BTreeSet::new(); - let mut map = HashMap::new(); - - for (word, indexes) in 
iter { - let word = word.to_lowercase().into_bytes(); - tree.insert(word.clone()); - map.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); - } - - InMemorySetStore { - set: Set::from_iter(tree).unwrap(), - synonyms: Set::default(), - indexes: map.into_iter().map(|(k, v)| (k, SetBuf::from_dirty(v))).collect(), - alternatives: HashMap::new(), - } - } - } - - impl Store for InMemorySetStore { - type Error = std::io::Error; - - fn words(&self) -> Result<&Set, Self::Error> { - Ok(&self.set) - } - - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - Ok(self.indexes.get(word).cloned()) - } - - fn synonyms(&self) -> Result<&Set, Self::Error> { - Ok(&self.synonyms) - } - - fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { - Ok(self.alternatives.get(word).map(|s| Set::from_bytes(s.as_fst().to_vec()).unwrap())) - } - } - const fn doc_index(document_id: u64, word_index: u16) -> DocIndex { DocIndex { document_id: DocumentId(document_id), @@ -764,16 +596,92 @@ mod tests { } } + pub struct TempDatabase { + database: Database, + index: Index, + _tempdir: TempDir, + } + + impl TempDatabase { + pub fn query_builder(&self) -> QueryBuilder { + self.index.query_builder() + } + + pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { + let rkv = self.database.rkv.read().unwrap(); + let mut writer = rkv.write().unwrap(); + + let word = word.to_lowercase(); + + let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() { + Some(alternatives) => alternatives, + None => fst::Set::default(), + }; + + let new = sdset_into_fstset(&new); + let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union()); + self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap(); + + let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() { + Some(synonyms) => synonyms, + None => fst::Set::default(), + }; + + let synonyms_fst = insert_key(&synonyms, 
word.as_bytes()); + self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap(); + + writer.commit().unwrap(); + } + } + + impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase { + fn from_iter>(iter: I) -> Self { + let tempdir = TempDir::new().unwrap(); + let database = Database::open_or_create(&tempdir).unwrap(); + let update_fn = None as Option::; + let index = database.open_index("default", update_fn).unwrap(); + + let rkv = database.rkv.read().unwrap(); + let mut writer = rkv.write().unwrap(); + + let mut words_fst = BTreeSet::new(); + let mut postings_lists = HashMap::new(); + + for (word, indexes) in iter { + let word = word.to_lowercase().into_bytes(); + words_fst.insert(word.clone()); + postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); + } + + let words_fst = Set::from_iter(words_fst).unwrap(); + + index.main.put_words_fst(&mut writer, &words_fst).unwrap(); + + for (word, postings_list) in postings_lists { + let postings_list = SetBuf::from_dirty(postings_list); + index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap(); + } + + writer.commit().unwrap(); + drop(rkv); + + TempDatabase { database, index, _tempdir: tempdir } + } + } + #[test] fn simple() { - let store = InMemorySetStore::from_iter(vec![ - ("iphone", &[doc_char_index(0, 0, 0)][..]), - ("from", &[doc_char_index(0, 1, 1)][..]), - ("apple", &[doc_char_index(0, 2, 2)][..]), + let store = TempDatabase::from_iter(vec![ + ("iphone", &[doc_char_index(0, 0, 0)][..]), + ("from", &[doc_char_index(0, 1, 1)][..]), + ("apple", &[doc_char_index(0, 2, 2)][..]), ]); - let builder = QueryBuilder::new(&store); - let results = builder.query("iphone from apple", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "iphone from apple", 0..20).unwrap(); let mut iter = results.into_iter(); 
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -788,14 +696,17 @@ mod tests { #[test] fn simple_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("hello", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "hello", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -805,8 +716,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("bonjour", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "bonjour", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -819,15 +730,18 @@ mod tests { #[test] fn prefix_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("sal", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "sal", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -837,8 +751,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("bonj", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "bonj", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -848,14 +762,14 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("sal blabla", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "sal blabla", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("bonj blabla", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "bonj blabla", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), None); @@ -863,14 +777,17 @@ mod tests { #[test] fn levenshtein_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("salutution", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "salutution", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -880,8 +797,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("saluttion", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "saluttion", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -894,7 +811,7 @@ mod tests { #[test] fn harder_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("hello", &[doc_index(0, 0)][..]), ("bonjour", &[doc_index(1, 3)]), ("salut", &[doc_index(2, 5)]), @@ -904,8 +821,11 @@ mod tests { store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("hello", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "hello", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -925,8 +845,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("bonjour", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "bonjour", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -946,8 +866,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("salut", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "salut", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -971,7 +891,7 @@ mod tests { #[test] /// Unique word has multi-word synonyms fn unique_to_multiword_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("new", &[doc_char_index(0, 0, 0)][..]), ("york", &[doc_char_index(0, 1, 1)][..]), ("city", &[doc_char_index(0, 2, 2)][..]), @@ -984,8 +904,11 @@ mod tests { store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY subway", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { @@ -1009,8 +932,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("NYC subway", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { @@ -1037,7 +960,7 @@ mod tests { #[test] fn unique_to_multiword_synonyms_words_proximity() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("new", &[doc_char_index(0, 0, 0)][..]), ("york", &[doc_char_index(0, 1, 1)][..]), ("city", &[doc_char_index(0, 2, 2)][..]), @@ -1053,8 +976,11 @@ mod tests { store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { @@ -1077,8 +1003,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "new york", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -1098,7 +1024,7 @@ mod tests { #[test] fn unique_to_multiword_synonyms_cumulative_word_index() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("NY", &[doc_char_index(0, 0, 0)][..]), ("subway", &[doc_char_index(0, 1, 1)][..]), @@ -1109,8 +1035,11 @@ mod tests { store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY subway", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1126,8 +1055,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york subway", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "new york subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -1150,7 +1079,7 @@ mod tests { #[test] /// Unique word has multi-word synonyms fn harder_unique_to_multiword_synonyms_one() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("new", &[doc_char_index(0, 0, 0)][..]), ("york", &[doc_char_index(0, 1, 1)][..]), ("city", &[doc_char_index(0, 2, 2)][..]), @@ -1166,8 +1095,11 @@ mod tests { store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY subway", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { @@ -1191,8 +1123,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("NYC subway", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { @@ -1221,7 +1153,7 @@ mod tests { #[test] /// Unique word has multi-word synonyms fn even_harder_unique_to_multiword_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("new", &[doc_char_index(0, 0, 0)][..]), ("york", &[doc_char_index(0, 1, 1)][..]), ("city", &[doc_char_index(0, 2, 2)][..]), @@ -1239,8 +1171,11 @@ mod tests { store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY subway broken", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY subway broken", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1267,8 +1202,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("NYC subway", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { @@ -1299,7 +1234,7 @@ mod tests { #[test] /// Multi-word has multi-word synonyms fn multiword_to_multiword_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("NY", &[doc_char_index(0, 0, 0)][..]), ("subway", &[doc_char_index(0, 1, 1)][..]), @@ -1319,8 +1254,11 @@ mod tests { store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ])); store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ])); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york underground train broken", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { @@ -1356,8 +1294,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york city underground train broken", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. 
}) => { @@ -1402,7 +1340,7 @@ mod tests { #[test] fn intercrossed_multiword_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("new", &[doc_index(0, 0)][..]), ("york", &[doc_index(0, 1)][..]), ("big", &[doc_index(0, 2)][..]), @@ -1412,8 +1350,11 @@ mod tests { store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ])); store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ])); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york big ", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "new york big ", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1432,7 +1373,7 @@ mod tests { }); assert_matches!(iter.next(), None); - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("NY", &[doc_index(0, 0)][..]), ("city", &[doc_index(0, 1)][..]), ("subway", &[doc_index(0, 2)][..]), @@ -1448,8 +1389,11 @@ mod tests { store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("NY subway ", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "NY subway ", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. 
}) => { @@ -1485,7 +1429,7 @@ mod tests { #[test] fn cumulative_word_indices() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("NYC", &[doc_index(0, 0)][..]), ("long", &[doc_index(0, 1)][..]), ("subway", &[doc_index(0, 2)][..]), @@ -1495,8 +1439,11 @@ mod tests { store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("new york city long subway cool ", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1515,7 +1462,7 @@ mod tests { #[test] fn deunicoded_synonyms() { - let mut store = InMemorySetStore::from_iter(vec![ + let mut store = TempDatabase::from_iter(vec![ ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex @@ -1524,8 +1471,11 @@ mod tests { store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); - let builder = QueryBuilder::new(&store); - let results = builder.query("telephone", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "telephone", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { @@ -1541,8 +1491,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("téléphone", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "téléphone", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1558,8 +1508,8 @@ mod tests { }); assert_matches!(iter.next(), None); - let builder = QueryBuilder::new(&store); - let results = builder.query("télephone", 0..20).unwrap(); + let builder = store.query_builder(); + let results = builder.query(&reader, "télephone", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { @@ -1578,13 +1528,16 @@ mod tests { #[test] fn simple_concatenation() { - let store = InMemorySetStore::from_iter(vec![ + let store = TempDatabase::from_iter(vec![ ("iphone", &[doc_index(0, 0)][..]), ("case", &[doc_index(0, 1)][..]), ]); - let builder = QueryBuilder::new(&store); - let results = builder.query("i phone case", 0..20).unwrap(); + let rkv = store.database.rkv.read().unwrap(); + let reader = rkv.read().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "i phone case", 0..20).unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { diff --git a/meilidb-data/src/ranked_map.rs b/meilidb-core/src/ranked_map.rs similarity index 95% rename from meilidb-data/src/ranked_map.rs rename to meilidb-core/src/ranked_map.rs index 96816613a..0168883ff 100644 --- a/meilidb-data/src/ranked_map.rs +++ b/meilidb-core/src/ranked_map.rs @@ -1,10 +1,9 @@ use std::io::{Read, Write}; use hashbrown::HashMap; -use meilidb_core::DocumentId; use meilidb_schema::SchemaAttr; -use crate::Number; +use crate::{DocumentId, Number}; #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>); diff --git a/meilidb-data/src/indexer.rs b/meilidb-core/src/raw_indexer.rs similarity index 95% rename from meilidb-data/src/indexer.rs rename to meilidb-core/src/raw_indexer.rs index 591ddd705..9c0399be5 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -2,14 +2,14 @@ use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; use deunicode::deunicode_with_tofu; -use meilidb_core::{DocumentId, DocIndex}; +use crate::{DocumentId, DocIndex}; use meilidb_schema::SchemaAttr; use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; use sdset::SetBuf; type Word = Vec; // TODO make it be a SmallVec -pub struct Indexer { +pub struct RawIndexer { word_limit: usize, // the maximum number of indexed words words_doc_indexes: BTreeMap>, docs_words: HashMap>, @@ -20,13 +20,13 @@ pub struct Indexed { pub docs_words: HashMap, } -impl Indexer { - pub fn new() -> Indexer { - Indexer::with_word_limit(1000) +impl RawIndexer { + pub fn new() -> RawIndexer { + RawIndexer::with_word_limit(1000) } - pub fn with_word_limit(limit: usize) -> Indexer { - Indexer { + pub fn with_word_limit(limit: usize) -> RawIndexer { + RawIndexer { word_limit: limit, words_doc_indexes: BTreeMap::new(), docs_words: HashMap::new(), @@ -168,7 +168,7 @@ mod tests { #[test] fn strange_apostrophe() { - let mut indexer = Indexer::new(); + let mut indexer = RawIndexer::new(); let 
docid = DocumentId(0); let attr = SchemaAttr(0); @@ -188,7 +188,7 @@ mod tests { #[test] fn strange_apostrophe_in_sequence() { - let mut indexer = Indexer::new(); + let mut indexer = RawIndexer::new(); let docid = DocumentId(0); let attr = SchemaAttr(0); diff --git a/meilidb-data/src/serde/convert_to_number.rs b/meilidb-core/src/serde/convert_to_number.rs similarity index 100% rename from meilidb-data/src/serde/convert_to_number.rs rename to meilidb-core/src/serde/convert_to_number.rs diff --git a/meilidb-data/src/serde/convert_to_string.rs b/meilidb-core/src/serde/convert_to_string.rs similarity index 100% rename from meilidb-data/src/serde/convert_to_string.rs rename to meilidb-core/src/serde/convert_to_string.rs diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-core/src/serde/deserializer.rs similarity index 67% rename from meilidb-data/src/serde/deserializer.rs rename to meilidb-core/src/serde/deserializer.rs index 58c09c9d5..ebf008eb7 100644 --- a/meilidb-data/src/serde/deserializer.rs +++ b/meilidb-core/src/serde/deserializer.rs @@ -2,18 +2,18 @@ use std::collections::HashSet; use std::io::Cursor; use std::{fmt, error::Error}; -use meilidb_core::DocumentId; -use meilidb_schema::SchemaAttr; +use meilidb_schema::{Schema, SchemaAttr}; use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; use rmp_serde::decode::{Error as RmpError}; use serde::{de, forward_to_deserialize_any}; -use crate::database::Index; +use crate::store::DocumentsFields; +use crate::DocumentId; #[derive(Debug)] pub enum DeserializerError { RmpError(RmpError), - RocksDbError(rocksdb::Error), + RkvError(rkv::StoreError), Custom(String), } @@ -27,7 +27,7 @@ impl fmt::Display for DeserializerError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), - DeserializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e), + DeserializerError::RkvError(e) => write!(f, 
"rkv related error: {}", e), DeserializerError::Custom(s) => f.write_str(s), } } @@ -41,19 +41,22 @@ impl From for DeserializerError { } } -impl From for DeserializerError { - fn from(error: rocksdb::Error) -> DeserializerError { - DeserializerError::RocksDbError(error) +impl From for DeserializerError { + fn from(error: rkv::StoreError) -> DeserializerError { + DeserializerError::RkvError(error) } } -pub struct Deserializer<'a> { +pub struct Deserializer<'a, R> { pub document_id: DocumentId, - pub index: &'a Index, - pub fields: Option<&'a HashSet>, + pub reader: &'a R, + pub documents_fields: DocumentsFields, + pub schema: &'a Schema, + pub attributes: Option<&'a HashSet>, } -impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> +impl<'de, 'a, 'b, R: 'a> de::Deserializer<'de> for &'b mut Deserializer<'a, R> +where R: rkv::Readable, { type Error = DeserializerError; @@ -72,15 +75,19 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> fn deserialize_map(self, visitor: V) -> Result where V: de::Visitor<'de> { - let schema = self.index.schema(); - let documents = self.index.as_ref().documents_index; + let mut error = None; - let iter = documents - .document_fields(self.document_id)? - .filter_map(|(attr, value)| { - let is_displayed = schema.props(attr).is_displayed(); - if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) { - let attribute_name = schema.attribute_name(attr); + let iter = self.documents_fields + .document_fields(self.reader, self.document_id)? 
+ .filter_map(|result| { + let (attr, value) = match result { + Ok(value) => value, + Err(e) => { error = Some(e); return None }, + }; + + let is_displayed = self.schema.props(attr).is_displayed(); + if is_displayed && self.attributes.map_or(true, |f| f.contains(&attr)) { + let attribute_name = self.schema.attribute_name(attr); Some((attribute_name, Value::new(value))) } else { None @@ -90,7 +97,10 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> let map_deserializer = de::value::MapDeserializer::new(iter); let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from); - result + match error.take() { + Some(error) => Err(error.into()), + None => result, + } } } diff --git a/meilidb-data/src/serde/extract_document_id.rs b/meilidb-core/src/serde/extract_document_id.rs similarity index 99% rename from meilidb-data/src/serde/extract_document_id.rs rename to meilidb-core/src/serde/extract_document_id.rs index 27c99b03e..da90101e2 100644 --- a/meilidb-data/src/serde/extract_document_id.rs +++ b/meilidb-core/src/serde/extract_document_id.rs @@ -1,6 +1,6 @@ use std::hash::{Hash, Hasher}; -use meilidb_core::DocumentId; +use crate::DocumentId; use serde::{ser, Serialize}; use serde_json::Value; use siphasher::sip::SipHasher; diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-core/src/serde/indexer.rs similarity index 99% rename from meilidb-data/src/serde/indexer.rs rename to meilidb-core/src/serde/indexer.rs index b06d48322..69a7ddecf 100644 --- a/meilidb-data/src/serde/indexer.rs +++ b/meilidb-core/src/serde/indexer.rs @@ -1,9 +1,9 @@ -use meilidb_core::DocumentId; use meilidb_schema::SchemaAttr; use serde::ser; use serde::Serialize; -use crate::indexer::Indexer as RawIndexer; +use crate::DocumentId; +use crate::raw_indexer::RawIndexer; use super::{SerializerError, ConvertToString}; pub struct Indexer<'a> { diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-core/src/serde/mod.rs similarity index 88% rename from 
meilidb-data/src/serde/mod.rs rename to meilidb-core/src/serde/mod.rs index cdf996e9c..e3af21f89 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-core/src/serde/mod.rs @@ -25,20 +25,19 @@ pub use self::serializer::Serializer; use std::collections::BTreeMap; use std::{fmt, error::Error}; -use meilidb_core::DocumentId; use meilidb_schema::SchemaAttr; use rmp_serde::encode::Error as RmpError; use serde_json::Error as SerdeJsonError; use serde::ser; -use crate::number::ParseNumberError; +use crate::{DocumentId, ParseNumberError}; #[derive(Debug)] pub enum SerializerError { DocumentIdNotFound, InvalidDocumentIdType, RmpError(RmpError), - RocksDbError(rocksdb::Error), + RkvError(rkv::StoreError), SerdeJsonError(SerdeJsonError), ParseNumberError(ParseNumberError), UnserializableType { type_name: &'static str }, @@ -63,16 +62,16 @@ impl fmt::Display for SerializerError { write!(f, "document identifier can only be of type string or number") }, SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), - SerializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e), + SerializerError::RkvError(e) => write!(f, "rkv related error: {}", e), SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e), SerializerError::ParseNumberError(e) => { write!(f, "error while trying to parse a number: {}", e) }, SerializerError::UnserializableType { type_name } => { - write!(f, "{} are not a serializable type", type_name) + write!(f, "{} is not a serializable type", type_name) }, SerializerError::UnindexableType { type_name } => { - write!(f, "{} are not an indexable type", type_name) + write!(f, "{} is not an indexable type", type_name) }, SerializerError::UnrankableType { type_name } => { write!(f, "{} types can not be used for ranking", type_name) @@ -102,9 +101,9 @@ impl From for SerializerError { } } -impl From for SerializerError { - fn from(error: rocksdb::Error) -> SerializerError { - SerializerError::RocksDbError(error) 
+impl From for SerializerError { + fn from(error: rkv::StoreError) -> SerializerError { + SerializerError::RkvError(error) } } diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-core/src/serde/serializer.rs similarity index 97% rename from meilidb-data/src/serde/serializer.rs rename to meilidb-core/src/serde/serializer.rs index c7bea1e18..8764ce526 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-core/src/serde/serializer.rs @@ -1,10 +1,11 @@ -use meilidb_core::DocumentId; use meilidb_schema::Schema; use serde::ser; -use crate::indexer::Indexer as RawIndexer; -use crate::ranked_map::RankedMap; -use super::{RamDocumentStore, SerializerError, ConvertToString, ConvertToNumber, Indexer}; +use crate::{DocumentId, RankedMap}; +use crate::raw_indexer::RawIndexer; +use crate::serde::RamDocumentStore; + +use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, diff --git a/meilidb-core/src/store.rs b/meilidb-core/src/store.rs deleted file mode 100644 index 6e429a1b4..000000000 --- a/meilidb-core/src/store.rs +++ /dev/null @@ -1,34 +0,0 @@ -use std::error::Error; -use fst::Set; -use sdset::SetBuf; -use crate::DocIndex; - -pub trait Store { - type Error: Error; - - fn words(&self) -> Result<&Set, Self::Error>; - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error>; - - fn synonyms(&self) -> Result<&Set, Self::Error>; - fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error>; -} - -impl Store for &'_ T where T: Store { - type Error = T::Error; - - fn words(&self) -> Result<&Set, Self::Error> { - (*self).words() - } - - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - (*self).word_indexes(word) - } - - fn synonyms(&self) -> Result<&Set, Self::Error> { - (*self).synonyms() - } - - fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { - (*self).alternatives_to(word) - } -} diff --git a/meilidb-core/src/store/docs_words.rs 
b/meilidb-core/src/store/docs_words.rs new file mode 100644 index 000000000..1254d032e --- /dev/null +++ b/meilidb-core/src/store/docs_words.rs @@ -0,0 +1,55 @@ +use std::sync::Arc; +use rkv::{Value, StoreError}; +use crate::{DocumentId, MResult}; + +#[derive(Copy, Clone)] +pub struct DocsWords { + pub(crate) docs_words: rkv::SingleStore, +} + +impl DocsWords { + pub fn put_doc_words( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + words: &fst::Set, + ) -> Result<(), rkv::StoreError> + { + let document_id_bytes = document_id.0.to_be_bytes(); + let bytes = words.as_fst().as_bytes(); + self.docs_words.put(writer, document_id_bytes, &Value::Blob(bytes)) + } + + pub fn del_doc_words( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + ) -> Result + { + let document_id_bytes = document_id.0.to_be_bytes(); + match self.docs_words.delete(writer, document_id_bytes) { + Ok(()) => Ok(true), + Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false), + Err(e) => Err(e), + } + } + + pub fn doc_words( + &self, + reader: &T, + document_id: DocumentId, + ) -> MResult> + { + let document_id_bytes = document_id.0.to_be_bytes(); + match self.docs_words.get(reader, document_id_bytes)? 
{ + Some(Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } +} diff --git a/meilidb-core/src/store/documents_fields.rs b/meilidb-core/src/store/documents_fields.rs new file mode 100644 index 000000000..804508f05 --- /dev/null +++ b/meilidb-core/src/store/documents_fields.rs @@ -0,0 +1,127 @@ +use std::convert::TryFrom; +use meilidb_schema::SchemaAttr; +use crate::DocumentId; + +#[derive(Copy, Clone)] +pub struct DocumentsFields { + pub(crate) documents_fields: rkv::SingleStore, +} + +fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] { + let document_id_bytes = document_id.0.to_be_bytes(); + let attr_bytes = attribute.0.to_be_bytes(); + + let mut key = [0u8; 10]; + key[0..8].copy_from_slice(&document_id_bytes); + key[8..10].copy_from_slice(&attr_bytes); + + key +} + +fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) { + let document_id = { + let array = TryFrom::try_from(&key[0..8]).unwrap(); + DocumentId(u64::from_be_bytes(array)) + }; + + let schema_attr = { + let array = TryFrom::try_from(&key[8..8+2]).unwrap(); + SchemaAttr(u16::from_be_bytes(array)) + }; + + (document_id, schema_attr) +} + +impl DocumentsFields { + pub fn put_document_field( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + attribute: SchemaAttr, + value: &[u8], + ) -> Result<(), rkv::StoreError> + { + let key = document_attribute_into_key(document_id, attribute); + self.documents_fields.put(writer, key, &rkv::Value::Blob(value)) + } + + pub fn del_all_document_fields( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + ) -> Result + { + let document_id_bytes = document_id.0.to_be_bytes(); + let mut keys_to_delete = Vec::new(); + + // WARN we can not delete the keys using the iterator + // so 
we store them and delete them just after + let iter = self.documents_fields.iter_from(writer, document_id_bytes)?; + for result in iter { + let (key, _) = result?; + let array = TryFrom::try_from(key).unwrap(); + let (current_document_id, _) = document_attribute_from_key(array); + if current_document_id != document_id { break } + + keys_to_delete.push(key.to_owned()); + } + + let count = keys_to_delete.len(); + for key in keys_to_delete { + self.documents_fields.delete(writer, key)?; + } + + Ok(count) + } + + pub fn document_attribute<'a>( + &self, + reader: &'a impl rkv::Readable, + document_id: DocumentId, + attribute: SchemaAttr, + ) -> Result, rkv::StoreError> + { + let key = document_attribute_into_key(document_id, attribute); + + match self.documents_fields.get(reader, key)? { + Some(rkv::Value::Blob(bytes)) => Ok(Some(bytes)), + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn document_fields<'r, T: rkv::Readable>( + &self, + reader: &'r T, + document_id: DocumentId, + ) -> Result, rkv::StoreError> + { + let document_id_bytes = document_id.0.to_be_bytes(); + let iter = self.documents_fields.iter_from(reader, document_id_bytes)?; + Ok(DocumentFieldsIter { document_id, iter }) + } +} + +pub struct DocumentFieldsIter<'r> { + document_id: DocumentId, + iter: rkv::store::single::Iter<'r>, +} + +impl<'r> Iterator for DocumentFieldsIter<'r> { + type Item = Result<(SchemaAttr, &'r [u8]), rkv::StoreError>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok((key, Some(rkv::Value::Blob(bytes))))) => { + let array = TryFrom::try_from(key).unwrap(); + let (current_document_id, attr) = document_attribute_from_key(array); + if current_document_id != self.document_id { return None; } + + Some(Ok((attr, bytes))) + }, + Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data), + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} diff --git a/meilidb-core/src/store/main.rs b/meilidb-core/src/store/main.rs new 
file mode 100644 index 000000000..2712a0efa --- /dev/null +++ b/meilidb-core/src/store/main.rs @@ -0,0 +1,154 @@ +use std::sync::Arc; +use std::convert::TryInto; + +use meilidb_schema::Schema; +use rkv::Value; +use crate::{RankedMap, MResult}; + +const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; +const RANKED_MAP_KEY: &str = "ranked-map"; +const SCHEMA_KEY: &str = "schema"; +const SYNONYMS_KEY: &str = "synonyms"; +const WORDS_KEY: &str = "words"; + +#[derive(Copy, Clone)] +pub struct Main { + pub(crate) main: rkv::SingleStore, +} + +impl Main { + pub fn put_words_fst( + &self, + writer: &mut rkv::Writer, + fst: &fst::Set, + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(fst.as_fst().as_bytes()); + self.main.put(writer, WORDS_KEY, &blob) + } + + pub fn words_fst( + &self, + reader: &impl rkv::Readable, + ) -> MResult> + { + match self.main.get(reader, WORDS_KEY)? { + Some(Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn put_schema( + &self, + writer: &mut rkv::Writer, + schema: &Schema, + ) -> MResult<()> + { + let bytes = bincode::serialize(schema)?; + let blob = Value::Blob(&bytes[..]); + self.main.put(writer, SCHEMA_KEY, &blob)?; + Ok(()) + } + + pub fn schema( + &self, + reader: &impl rkv::Readable, + ) -> MResult> + { + match self.main.get(reader, SCHEMA_KEY)? 
{ + Some(Value::Blob(bytes)) => { + let schema = bincode::deserialize_from(bytes.as_ref())?; + Ok(Some(schema)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn put_ranked_map( + &self, + writer: &mut rkv::Writer, + ranked_map: &RankedMap, + ) -> MResult<()> + { + let mut bytes = Vec::new(); + ranked_map.write_to_bin(&mut bytes)?; + let blob = Value::Blob(&bytes[..]); + self.main.put(writer, RANKED_MAP_KEY, &blob)?; + Ok(()) + } + + pub fn ranked_map( + &self, + reader: &impl rkv::Readable, + ) -> MResult> + { + match self.main.get(reader, RANKED_MAP_KEY)? { + Some(Value::Blob(bytes)) => { + let ranked_map = RankedMap::read_from_bin(bytes)?; + Ok(Some(ranked_map)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn put_synonyms_fst( + &self, + writer: &mut rkv::Writer, + fst: &fst::Set, + ) -> MResult<()> + { + let blob = rkv::Value::Blob(fst.as_fst().as_bytes()); + Ok(self.main.put(writer, SYNONYMS_KEY, &blob)?) + } + + pub fn synonyms_fst( + &self, + reader: &impl rkv::Readable, + ) -> MResult> + { + match self.main.get(reader, SYNONYMS_KEY)? { + Some(Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn put_number_of_documents u64>( + &self, + writer: &mut rkv::Writer, + f: F, + ) -> Result + { + let new = self.number_of_documents(writer).map(f)?; + self.main.put(writer, NUMBER_OF_DOCUMENTS_KEY, &Value::Blob(&new.to_be_bytes()))?; + Ok(new) + } + + pub fn number_of_documents( + &self, + reader: &impl rkv::Readable, + ) -> Result + { + match self.main.get(reader, NUMBER_OF_DOCUMENTS_KEY)? 
{ + Some(Value::Blob(bytes)) => { + let array = bytes.try_into().unwrap(); + Ok(u64::from_be_bytes(array)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(0), + } + } +} diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs new file mode 100644 index 000000000..62f28a851 --- /dev/null +++ b/meilidb-core/src/store/mod.rs @@ -0,0 +1,224 @@ +mod docs_words; +mod documents_fields; +mod main; +mod postings_lists; +mod synonyms; +mod updates; +mod updates_results; + +pub use self::docs_words::DocsWords; +pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter}; +pub use self::main::Main; +pub use self::postings_lists::PostingsLists; +pub use self::synonyms::Synonyms; +pub use self::updates::Updates; +pub use self::updates_results::UpdatesResults; + +use std::collections::HashSet; +use meilidb_schema::{Schema, SchemaAttr}; +use serde::de; +use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error}; +use crate::serde::Deserializer; + +fn aligned_to(bytes: &[u8], align: usize) -> bool { + (bytes as *const _ as *const () as usize) % align == 0 +} + +fn main_name(name: &str) -> String { + format!("store-{}", name) +} + +fn postings_lists_name(name: &str) -> String { + format!("store-{}-postings-lists", name) +} + +fn documents_fields_name(name: &str) -> String { + format!("store-{}-documents-fields", name) +} + +fn synonyms_name(name: &str) -> String { + format!("store-{}-synonyms", name) +} + +fn docs_words_name(name: &str) -> String { + format!("store-{}-docs-words", name) +} + +fn updates_name(name: &str) -> String { + format!("store-{}-updates", name) +} + +fn updates_results_name(name: &str) -> String { + format!("store-{}-updates-results", name) +} + +#[derive(Clone)] +pub struct Index { + pub main: Main, + pub postings_lists: PostingsLists, + pub documents_fields: DocumentsFields, + pub synonyms: Synonyms, + pub docs_words: DocsWords, + + pub updates: Updates, + pub updates_results: 
UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, +} + +impl Index { + pub fn document( + &self, + reader: &R, + attributes: Option<&HashSet<&str>>, + document_id: DocumentId, + ) -> MResult> + { + let schema = self.main.schema(reader)?; + let schema = schema.ok_or(Error::SchemaMissing)?; + + let attributes = match attributes { + Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(), + None => None, + }; + + let mut deserializer = Deserializer { + document_id, + reader, + documents_fields: self.documents_fields, + schema: &schema, + attributes: attributes.as_ref(), + }; + + // TODO: currently we return an error if all document fields are missing, + // returning None would have been better + Ok(T::deserialize(&mut deserializer).map(Some)?) + } + + pub fn document_attribute( + &self, + reader: &R, + document_id: DocumentId, + attribute: SchemaAttr, + ) -> MResult> + { + let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?; + match bytes { + Some(bytes) => Ok(Some(rmp_serde::from_read_ref(bytes)?)), + None => Ok(None), + } + } + + pub fn schema_update(&self, mut writer: rkv::Writer, schema: Schema) -> MResult<()> { + update::push_schema_update(&mut writer, self.updates, self.updates_results, schema)?; + writer.commit()?; + let _ = self.updates_notifier.send(()); + Ok(()) + } + + pub fn documents_addition(&self) -> update::DocumentsAddition { + update::DocumentsAddition::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + + pub fn documents_deletion(&self) -> update::DocumentsDeletion { + update::DocumentsDeletion::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + + pub fn synonyms_addition(&self) -> update::SynonymsAddition { + update::SynonymsAddition::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + + pub fn synonyms_deletion(&self) -> update::SynonymsDeletion { + 
update::SynonymsDeletion::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + + pub fn update_status( + &self, + reader: &T, + update_id: u64, + ) -> MResult + { + update::update_status( + reader, + self.updates, + self.updates_results, + update_id, + ) + } + + pub fn query_builder(&self) -> QueryBuilder { + QueryBuilder::new(self.main, self.postings_lists, self.synonyms) + } +} + +pub fn create( + env: &rkv::Rkv, + name: &str, + updates_notifier: crossbeam_channel::Sender<()>, +) -> Result +{ + open_options(env, name, rkv::StoreOptions::create(), updates_notifier) +} + +pub fn open( + env: &rkv::Rkv, + name: &str, + updates_notifier: crossbeam_channel::Sender<()>, +) -> Result +{ + let mut options = rkv::StoreOptions::default(); + options.create = false; + open_options(env, name, options, updates_notifier) +} + +fn open_options( + env: &rkv::Rkv, + name: &str, + options: rkv::StoreOptions, + updates_notifier: crossbeam_channel::Sender<()>, +) -> Result +{ + // create all the store names + let main_name = main_name(name); + let postings_lists_name = postings_lists_name(name); + let documents_fields_name = documents_fields_name(name); + let synonyms_name = synonyms_name(name); + let docs_words_name = docs_words_name(name); + let updates_name = updates_name(name); + let updates_results_name = updates_results_name(name); + + // open all the stores + let main = env.open_single(main_name.as_str(), options)?; + let postings_lists = env.open_single(postings_lists_name.as_str(), options)?; + let documents_fields = env.open_single(documents_fields_name.as_str(), options)?; + let synonyms = env.open_single(synonyms_name.as_str(), options)?; + let docs_words = env.open_single(docs_words_name.as_str(), options)?; + let updates = env.open_single(updates_name.as_str(), options)?; + let updates_results = env.open_single(updates_results_name.as_str(), options)?; + + Ok(Index { + main: Main { main }, + postings_lists: PostingsLists { 
postings_lists }, + documents_fields: DocumentsFields { documents_fields }, + synonyms: Synonyms { synonyms }, + docs_words: DocsWords { docs_words }, + updates: Updates { updates }, + updates_results: UpdatesResults { updates_results }, + updates_notifier, + }) +} diff --git a/meilidb-core/src/store/postings_lists.rs b/meilidb-core/src/store/postings_lists.rs new file mode 100644 index 000000000..68eb81cbc --- /dev/null +++ b/meilidb-core/src/store/postings_lists.rs @@ -0,0 +1,81 @@ +use std::borrow::Cow; +use std::{mem, ptr}; + +use zerocopy::{AsBytes, LayoutVerified}; +use rkv::StoreError; + +use crate::DocIndex; +use crate::store::aligned_to; + +#[derive(Copy, Clone)] +pub struct PostingsLists { + pub(crate) postings_lists: rkv::SingleStore, +} + +impl PostingsLists { + pub fn put_postings_list( + &self, + writer: &mut rkv::Writer, + word: &[u8], + words_indexes: &[DocIndex], + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(words_indexes.as_bytes()); + self.postings_lists.put(writer, word, &blob) + } + + pub fn del_postings_list( + &self, + writer: &mut rkv::Writer, + word: &[u8], + ) -> Result + { + match self.postings_lists.delete(writer, word) { + Ok(()) => Ok(true), + Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false), + Err(e) => Err(e), + } + } + + pub fn postings_list<'a>( + &self, + reader: &'a impl rkv::Readable, + word: &[u8], + ) -> Result>>, rkv::StoreError> + { + let bytes = match self.postings_lists.get(reader, word)? 
{ + Some(rkv::Value::Blob(bytes)) => bytes, + Some(value) => panic!("invalid type {:?}", value), + None => return Ok(None), + }; + + match LayoutVerified::new_slice(bytes) { + Some(layout) => { + let set = sdset::Set::new(layout.into_slice()).unwrap(); + Ok(Some(Cow::Borrowed(set))) + }, + None => { + let len = bytes.len(); + let elem_size = mem::size_of::(); + + // ensure that it is the alignment that is wrong + // and the length is valid + if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::()) { + let elems = len / elem_size; + let mut vec = Vec::::with_capacity(elems); + + unsafe { + let dst = vec.as_mut_ptr() as *mut u8; + ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len); + vec.set_len(elems); + } + + let setbuf = sdset::SetBuf::new(vec).unwrap(); + return Ok(Some(Cow::Owned(setbuf))) + } + + Ok(None) + }, + } + } +} diff --git a/meilidb-core/src/store/synonyms.rs b/meilidb-core/src/store/synonyms.rs new file mode 100644 index 000000000..c00f891ce --- /dev/null +++ b/meilidb-core/src/store/synonyms.rs @@ -0,0 +1,52 @@ +use std::sync::Arc; +use rkv::StoreError; +use crate::error::MResult; + +#[derive(Copy, Clone)] +pub struct Synonyms { + pub(crate) synonyms: rkv::SingleStore, +} + +impl Synonyms { + pub fn put_synonyms( + &self, + writer: &mut rkv::Writer, + word: &[u8], + synonyms: &fst::Set, + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(synonyms.as_fst().as_bytes()); + self.synonyms.put(writer, word, &blob) + } + + pub fn del_synonyms( + &self, + writer: &mut rkv::Writer, + word: &[u8], + ) -> Result + { + match self.synonyms.delete(writer, word) { + Ok(()) => Ok(true), + Err(StoreError::LmdbError(lmdb::Error::NotFound)) => Ok(false), + Err(e) => Err(e), + } + } + + pub fn synonyms( + &self, + reader: &impl rkv::Readable, + word: &[u8], + ) -> MResult> + { + match self.synonyms.get(reader, word)? 
{ + Some(rkv::Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } +} diff --git a/meilidb-core/src/store/updates.rs b/meilidb-core/src/store/updates.rs new file mode 100644 index 000000000..941abba6b --- /dev/null +++ b/meilidb-core/src/store/updates.rs @@ -0,0 +1,101 @@ +use std::convert::TryInto; +use rkv::Value; +use crate::{update::Update, MResult}; + +#[derive(Copy, Clone)] +pub struct Updates { + pub(crate) updates: rkv::SingleStore, +} + +impl Updates { + // TODO we should use the MDB_LAST op but + // it is not exposed by the rkv library + pub fn last_update_id<'a>( + &self, + reader: &'a impl rkv::Readable, + ) -> Result>)>, rkv::StoreError> + { + let mut last = None; + let iter = self.updates.iter_start(reader)?; + for result in iter { + let (key, data) = result?; + last = Some((key, data)); + } + + let (last_key, last_data) = match last { + Some(entry) => entry, + None => return Ok(None), + }; + + let array = last_key.try_into().unwrap(); + let number = u64::from_be_bytes(array); + + Ok(Some((number, last_data))) + } + + fn first_update_id<'a>( + &self, + reader: &'a impl rkv::Readable, + ) -> Result>)>, rkv::StoreError> + { + let mut iter = self.updates.iter_start(reader)?; + let (first_key, first_data) = match iter.next() { + Some(result) => result?, + None => return Ok(None), + }; + + let array = first_key.try_into().unwrap(); + let number = u64::from_be_bytes(array); + + Ok(Some((number, first_data))) + } + + pub fn contains( + &self, + reader: &impl rkv::Readable, + update_id: u64, + ) -> Result + { + let update_id_bytes = update_id.to_be_bytes(); + self.updates.get(reader, update_id_bytes).map(|v| v.is_some()) + } + + pub fn put_update( + &self, + writer: &mut rkv::Writer, + update_id: u64, + update: &Update, + ) -> MResult<()> + { + let 
update_id_bytes = update_id.to_be_bytes(); + let update = rmp_serde::to_vec_named(&update)?; + let blob = Value::Blob(&update); + self.updates.put(writer, update_id_bytes, &blob)?; + Ok(()) + } + + pub fn pop_front( + &self, + writer: &mut rkv::Writer, + ) -> MResult> + { + let (first_id, first_data) = match self.first_update_id(writer)? { + Some(entry) => entry, + None => return Ok(None), + }; + + match first_data { + Some(Value::Blob(bytes)) => { + let update = rmp_serde::from_read_ref(&bytes)?; + + // remove it from the database now + let first_id_bytes = first_id.to_be_bytes(); + self.updates.delete(writer, first_id_bytes)?; + + Ok(Some((first_id, update))) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } +} diff --git a/meilidb-core/src/store/updates_results.rs b/meilidb-core/src/store/updates_results.rs new file mode 100644 index 000000000..9b0c2d435 --- /dev/null +++ b/meilidb-core/src/store/updates_results.rs @@ -0,0 +1,67 @@ +use std::convert::TryInto; +use rkv::Value; +use crate::{update::UpdateResult, MResult}; + +#[derive(Copy, Clone)] +pub struct UpdatesResults { + pub(crate) updates_results: rkv::SingleStore, +} + +impl UpdatesResults { + // TODO we should use the MDB_LAST op but + // it is not exposed by the rkv library + pub fn last_update_id<'a>( + &self, + reader: &'a impl rkv::Readable, + ) -> Result>)>, rkv::StoreError> + { + let mut last = None; + let iter = self.updates_results.iter_start(reader)?; + for result in iter { + let (key, data) = result?; + last = Some((key, data)); + } + + let (last_key, last_data) = match last { + Some(entry) => entry, + None => return Ok(None), + }; + + let array = last_key.try_into().unwrap(); + let number = u64::from_be_bytes(array); + + Ok(Some((number, last_data))) + } + + pub fn put_update_result( + &self, + writer: &mut rkv::Writer, + update_id: u64, + update_result: &UpdateResult, + ) -> MResult<()> + { + let update_id_bytes = update_id.to_be_bytes(); + let 
update_result = bincode::serialize(&update_result)?; + let blob = Value::Blob(&update_result); + self.updates_results.put(writer, update_id_bytes, &blob)?; + Ok(()) + } + + pub fn update_result( + &self, + reader: &impl rkv::Readable, + update_id: u64, + ) -> MResult> + { + let update_id_bytes = update_id.to_be_bytes(); + + match self.updates_results.get(reader, update_id_bytes)? { + Some(Value::Blob(bytes)) => { + let update_result = bincode::deserialize(&bytes)?; + Ok(Some(update_result)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } +} diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs new file mode 100644 index 000000000..d0eb6a8b5 --- /dev/null +++ b/meilidb-core/src/update/documents_addition.rs @@ -0,0 +1,189 @@ +use std::collections::HashSet; + +use fst::{SetBuilder, set::OpBuilder}; +use sdset::{SetOperation, duo::Union}; +use serde::Serialize; + +use crate::raw_indexer::RawIndexer; +use crate::serde::{extract_document_id, Serializer, RamDocumentStore}; +use crate::store; +use crate::update::{Update, next_update_id, apply_documents_deletion}; +use crate::{MResult, Error, RankedMap}; + +pub struct DocumentsAddition { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + documents: Vec, +} + +impl DocumentsAddition { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + ) -> DocumentsAddition + { + DocumentsAddition { + updates_store, + updates_results_store, + updates_notifier, + documents: Vec::new(), + } + } + + pub fn update_document(&mut self, document: D) { + self.documents.push(document); + } + + pub fn finalize(self, mut writer: rkv::Writer) -> MResult + where D: serde::Serialize + { + let update_id = push_documents_addition( + &mut writer, + self.updates_store, + 
self.updates_results_store, + self.documents, + )?; + writer.commit()?; + let _ = self.updates_notifier.send(()); + + Ok(update_id) + } +} + +impl Extend for DocumentsAddition { + fn extend>(&mut self, iter: T) { + self.documents.extend(iter) + } +} + +pub fn push_documents_addition( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + addition: Vec, +) -> MResult +{ + let mut values = Vec::with_capacity(addition.len()); + for add in addition { + let vec = rmp_serde::to_vec_named(&add)?; + let add = rmp_serde::from_read(&vec[..])?; + values.push(add); + } + + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::DocumentsAddition(values); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + +pub fn apply_documents_addition( + writer: &mut rkv::Writer, + main_store: store::Main, + documents_fields_store: store::DocumentsFields, + postings_lists_store: store::PostingsLists, + docs_words_store: store::DocsWords, + mut ranked_map: RankedMap, + addition: Vec, +) -> MResult<()> +{ + let mut document_ids = HashSet::new(); + let mut document_store = RamDocumentStore::new(); + let mut indexer = RawIndexer::new(); + + let schema = match main_store.schema(writer)? { + Some(schema) => schema, + None => return Err(Error::SchemaMissing), + }; + + let identifier = schema.identifier_name(); + + for document in addition { + let document_id = match extract_document_id(identifier, &document)? { + Some(id) => id, + None => return Err(Error::MissingDocumentId), + }; + + // 1. store the document id for future deletion + document_ids.insert(document_id); + + // 2. index the document fields in ram stores + let serializer = Serializer { + schema: &schema, + document_store: &mut document_store, + indexer: &mut indexer, + ranked_map: &mut ranked_map, + document_id, + }; + + document.serialize(serializer)?; + } + + // 1. 
remove the previous documents match indexes + let documents_to_insert = document_ids.iter().cloned().collect(); + apply_documents_deletion( + writer, + main_store, + documents_fields_store, + postings_lists_store, + docs_words_store, + ranked_map.clone(), + documents_to_insert, + )?; + + // 2. insert new document attributes in the database + for ((id, attr), value) in document_store.into_inner() { + documents_fields_store.put_document_field(writer, id, attr, &value)?; + } + + let indexed = indexer.build(); + let mut delta_words_builder = SetBuilder::memory(); + + for (word, delta_set) in indexed.words_doc_indexes { + delta_words_builder.insert(&word).unwrap(); + + let set = match postings_lists_store.postings_list(writer, &word)? { + Some(set) => Union::new(&set, &delta_set).into_set_buf(), + None => delta_set, + }; + + postings_lists_store.put_postings_list(writer, &word, &set)?; + } + + for (id, words) in indexed.docs_words { + docs_words_store.put_doc_words(writer, id, &words)?; + } + + let delta_words = delta_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let words = match main_store.words_fst(writer)? 
{ + Some(words) => { + let op = OpBuilder::new() + .add(words.stream()) + .add(delta_words.stream()) + .r#union(); + + let mut words_builder = SetBuilder::memory(); + words_builder.extend_stream(op).unwrap(); + words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_words, + }; + + main_store.put_words_fst(writer, &words)?; + main_store.put_ranked_map(writer, &ranked_map)?; + + let inserted_documents_len = document_ids.len() as u64; + main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?; + + Ok(()) +} diff --git a/meilidb-data/src/database/update/documents_deletion.rs b/meilidb-core/src/update/documents_deletion.rs similarity index 51% rename from meilidb-data/src/database/update/documents_deletion.rs rename to meilidb-core/src/update/documents_deletion.rs index 99dc35334..c17452ea9 100644 --- a/meilidb-data/src/database/update/documents_deletion.rs +++ b/meilidb-core/src/update/documents_deletion.rs @@ -1,33 +1,43 @@ use std::collections::{HashMap, HashSet, BTreeSet}; -use std::sync::Arc; use fst::{SetBuilder, Streamer}; -use meilidb_core::DocumentId; +use meilidb_schema::Schema; use sdset::{SetBuf, SetOperation, duo::DifferenceByKey}; -use crate::RankedMap; +use crate::{DocumentId, RankedMap, MResult, Error}; use crate::serde::extract_document_id; +use crate::update::{Update, next_update_id}; +use crate::store; -use crate::database::{Index, Error, index::Cache}; - -pub struct DocumentsDeletion<'a> { - index: &'a Index, +pub struct DocumentsDeletion { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, documents: Vec, } -impl<'a> DocumentsDeletion<'a> { - pub fn new(index: &'a Index) -> DocumentsDeletion<'a> { - DocumentsDeletion { index, documents: Vec::new() } +impl DocumentsDeletion { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: 
crossbeam_channel::Sender<()>, + ) -> DocumentsDeletion + { + DocumentsDeletion { + updates_store, + updates_results_store, + updates_notifier, + documents: Vec::new(), + } } pub fn delete_document_by_id(&mut self, document_id: DocumentId) { self.documents.push(document_id); } - pub fn delete_document(&mut self, document: D) -> Result<(), Error> + pub fn delete_document(&mut self, schema: &Schema, document: D) -> MResult<()> where D: serde::Serialize, { - let schema = self.index.schema(); let identifier = schema.identifier_name(); let document_id = match extract_document_id(identifier, &document)? { Some(id) => id, @@ -39,32 +49,58 @@ impl<'a> DocumentsDeletion<'a> { Ok(()) } - pub fn finalize(self) -> Result { - self.index.push_documents_deletion(self.documents) + pub fn finalize(self, mut writer: rkv::Writer) -> MResult { + let update_id = push_documents_deletion( + &mut writer, + self.updates_store, + self.updates_results_store, + self.documents, + )?; + writer.commit()?; + let _ = self.updates_notifier.send(()); + + Ok(update_id) } } -impl Extend for DocumentsDeletion<'_> { +impl Extend for DocumentsDeletion { fn extend>(&mut self, iter: T) { self.documents.extend(iter) } } +pub fn push_documents_deletion( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + deletion: Vec, +) -> MResult +{ + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::DocumentsDeletion(deletion); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + pub fn apply_documents_deletion( - index: &Index, + writer: &mut rkv::Writer, + main_store: store::Main, + documents_fields_store: store::DocumentsFields, + postings_lists_store: store::PostingsLists, + docs_words_store: store::DocsWords, mut ranked_map: RankedMap, deletion: Vec, -) -> Result<(), Error> +) -> MResult<()> { - let ref_index = index.as_ref(); - let schema = index.schema(); - let 
docs_words = ref_index.docs_words_index; - let documents = ref_index.documents_index; - let main = ref_index.main_index; - let words = ref_index.words_index; - let idset = SetBuf::from_dirty(deletion); + let schema = match main_store.schema(writer)? { + Some(schema) => schema, + None => return Err(Error::SchemaMissing), + }; + // collect the ranked attributes according to the schema let ranked_attrs: Vec<_> = schema.iter() .filter_map(|(_, attr, prop)| { @@ -79,7 +115,7 @@ pub fn apply_documents_deletion( ranked_map.remove(id, *ranked_attr); } - if let Some(words) = docs_words.doc_words(id)? { + if let Some(words) = docs_words_store.doc_words(writer, id)? { let mut stream = words.stream(); while let Some(word) = stream.next() { let word = word.to_vec(); @@ -93,28 +129,32 @@ pub fn apply_documents_deletion( for (word, document_ids) in words_document_ids { let document_ids = SetBuf::from_dirty(document_ids); - if let Some(doc_indexes) = words.doc_indexes(&word)? { + if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? { let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); let doc_indexes = op.into_set_buf(); if !doc_indexes.is_empty() { - words.set_doc_indexes(&word, &doc_indexes)?; + postings_lists_store.put_postings_list(writer, &word, &doc_indexes)?; } else { - words.del_doc_indexes(&word)?; + postings_lists_store.del_postings_list(writer, &word)?; removed_words.insert(word); } } for id in document_ids { - if documents.del_all_document_fields(id)? != 0 { + if documents_fields_store.del_all_document_fields(writer, id)? != 0 { deleted_documents.insert(id); } - docs_words.del_doc_words(id)?; } } + let deleted_documents_len = deleted_documents.len() as u64; + for id in deleted_documents { + docs_words_store.del_doc_words(writer, id)?; + } + let removed_words = fst::Set::from_iter(removed_words).unwrap(); - let words = match main.words_set()? { + let words = match main_store.words_fst(writer)? 
{ Some(words_set) => { let op = fst::set::OpBuilder::new() .add(words_set.stream()) @@ -131,20 +171,10 @@ pub fn apply_documents_deletion( None => fst::Set::default(), }; - main.set_words_set(&words)?; - main.set_ranked_map(&ranked_map)?; + main_store.put_words_fst(writer, &words)?; + main_store.put_ranked_map(writer, &ranked_map)?; - let deleted_documents_len = deleted_documents.len() as u64; - let number_of_documents = main.set_number_of_documents(|old| old - deleted_documents_len)?; - - // update the "consistent" view of the Index - let cache = ref_index.cache; - let words = Arc::new(words); - let synonyms = cache.synonyms.clone(); - let schema = cache.schema.clone(); - - let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents }; - index.cache.store(Arc::new(cache)); + main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?; Ok(()) } diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs new file mode 100644 index 000000000..0025e2e28 --- /dev/null +++ b/meilidb-core/src/update/mod.rs @@ -0,0 +1,202 @@ +mod documents_addition; +mod documents_deletion; +mod schema_update; +mod synonyms_addition; +mod synonyms_deletion; + +pub use self::documents_addition::{DocumentsAddition, apply_documents_addition}; +pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion}; +pub use self::schema_update::{apply_schema_update, push_schema_update}; +pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition}; +pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion}; + +use std::time::{Duration, Instant}; +use std::collections::BTreeMap; +use std::cmp; + +use log::debug; +use serde::{Serialize, Deserialize}; + +use crate::{store, MResult, DocumentId, RankedMap}; +use meilidb_schema::Schema; + +#[derive(Debug, Serialize, Deserialize)] +pub enum Update { + SchemaUpdate(Schema), + DocumentsAddition(Vec), + DocumentsDeletion(Vec), + SynonymsAddition(BTreeMap>), 
+ SynonymsDeletion(BTreeMap>>), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum UpdateType { + SchemaUpdate { schema: Schema }, + DocumentsAddition { number: usize }, + DocumentsDeletion { number: usize }, + SynonymsAddition { number: usize }, + SynonymsDeletion { number: usize }, +} + +#[derive(Clone, Serialize, Deserialize)] +pub struct DetailedDuration { + pub main: Duration, +} + +#[derive(Clone, Serialize, Deserialize)] +pub struct UpdateResult { + pub update_id: u64, + pub update_type: UpdateType, + pub result: Result<(), String>, + pub detailed_duration: DetailedDuration, +} + +#[derive(Clone, Serialize, Deserialize)] +pub enum UpdateStatus { + Enqueued, + Processed(UpdateResult), + Unknown, +} + +pub fn update_status( + reader: &T, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + update_id: u64, +) -> MResult +{ + match updates_results_store.update_result(reader, update_id)? { + Some(result) => Ok(UpdateStatus::Processed(result)), + None => { + if updates_store.contains(reader, update_id)? { + Ok(UpdateStatus::Enqueued) + } else { + Ok(UpdateStatus::Unknown) + } + } + } +} + +pub fn next_update_id( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, +) -> MResult +{ + let last_update_id = updates_store.last_update_id(writer)?; + let last_update_id = last_update_id.map(|(n, _)| n); + + let last_update_results_id = updates_results_store.last_update_id(writer)?; + let last_update_results_id = last_update_results_id.map(|(n, _)| n); + + let max_update_id = cmp::max(last_update_id, last_update_results_id); + let new_update_id = max_update_id.map_or(0, |n| n + 1); + + Ok(new_update_id) +} + +pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult> { + let (update_id, update) = match index.updates.pop_front(writer)? 
{ + Some(value) => value, + None => return Ok(None), + }; + + debug!("Processing update number {}", update_id); + + let (update_type, result, duration) = match update { + Update::SchemaUpdate(schema) => { + let start = Instant::now(); + + let update_type = UpdateType::SchemaUpdate { schema: schema.clone() }; + let result = apply_schema_update(writer, index.main, &schema); + + (update_type, result, start.elapsed()) + }, + Update::DocumentsAddition(documents) => { + let start = Instant::now(); + + let ranked_map = match index.main.ranked_map(writer)? { + Some(ranked_map) => ranked_map, + None => RankedMap::default(), + }; + + let update_type = UpdateType::DocumentsAddition { number: documents.len() }; + + let result = apply_documents_addition( + writer, + index.main, + index.documents_fields, + index.postings_lists, + index.docs_words, + ranked_map, + documents, + ); + + (update_type, result, start.elapsed()) + }, + Update::DocumentsDeletion(documents) => { + let start = Instant::now(); + + let ranked_map = match index.main.ranked_map(writer)? 
{ + Some(ranked_map) => ranked_map, + None => RankedMap::default(), + }; + + let update_type = UpdateType::DocumentsDeletion { number: documents.len() }; + + let result = apply_documents_deletion( + writer, + index.main, + index.documents_fields, + index.postings_lists, + index.docs_words, + ranked_map, + documents, + ); + + (update_type, result, start.elapsed()) + }, + Update::SynonymsAddition(synonyms) => { + let start = Instant::now(); + + let update_type = UpdateType::SynonymsAddition { number: synonyms.len() }; + + let result = apply_synonyms_addition( + writer, + index.main, + index.synonyms, + synonyms, + ); + + (update_type, result, start.elapsed()) + }, + Update::SynonymsDeletion(synonyms) => { + let start = Instant::now(); + + let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() }; + + let result = apply_synonyms_deletion( + writer, + index.main, + index.synonyms, + synonyms, + ); + + (update_type, result, start.elapsed()) + }, + }; + + debug!("Processed update number {} {:?} {:?}", update_id, update_type, result); + + let detailed_duration = DetailedDuration { main: duration }; + let status = UpdateResult { + update_id, + update_type, + result: result.map_err(|e| e.to_string()), + detailed_duration, + }; + + index.updates_results.put_update_result(writer, update_id, &status)?; + + Ok(Some(status)) +} diff --git a/meilidb-core/src/update/schema_update.rs b/meilidb-core/src/update/schema_update.rs new file mode 100644 index 000000000..6c258c388 --- /dev/null +++ b/meilidb-core/src/update/schema_update.rs @@ -0,0 +1,31 @@ +use meilidb_schema::Schema; +use crate::{store, error::UnsupportedOperation, MResult}; +use crate::update::{Update, next_update_id}; + +pub fn apply_schema_update( + writer: &mut rkv::Writer, + main_store: store::Main, + new_schema: &Schema, +) -> MResult<()> +{ + if let Some(_) = main_store.schema(writer)? 
{ + return Err(UnsupportedOperation::SchemaAlreadyExists.into()) + } + + main_store.put_schema(writer, new_schema) +} + +pub fn push_schema_update( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + schema: Schema, +) -> MResult +{ + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::SchemaUpdate(schema); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} diff --git a/meilidb-core/src/update/synonyms_addition.rs b/meilidb-core/src/update/synonyms_addition.rs new file mode 100644 index 000000000..7ed24a3d7 --- /dev/null +++ b/meilidb-core/src/update/synonyms_addition.rs @@ -0,0 +1,119 @@ +use std::collections::BTreeMap; + +use fst::{SetBuilder, set::OpBuilder}; +use sdset::SetBuf; + +use crate::automaton::normalize_str; +use crate::update::{Update, next_update_id}; +use crate::{store, MResult}; + +pub struct SynonymsAddition { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + synonyms: BTreeMap>, +} + +impl SynonymsAddition { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + ) -> SynonymsAddition + { + SynonymsAddition { + updates_store, + updates_results_store, + updates_notifier, + synonyms: BTreeMap::new(), + } + } + + pub fn add_synonym(&mut self, synonym: S, alternatives: I) + where S: AsRef, + T: AsRef, + I: IntoIterator, + { + let synonym = normalize_str(synonym.as_ref()); + let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase()); + self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); + } + + pub fn finalize(self, mut writer: rkv::Writer) -> MResult { + let update_id = push_synonyms_addition( + &mut writer, + self.updates_store, + self.updates_results_store, + self.synonyms, + )?; + 
writer.commit()?; + let _ = self.updates_notifier.send(()); + + Ok(update_id) + } +} + +pub fn push_synonyms_addition( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + addition: BTreeMap>, +) -> MResult +{ + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::SynonymsAddition(addition); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + +pub fn apply_synonyms_addition( + writer: &mut rkv::Writer, + main_store: store::Main, + synonyms_store: store::Synonyms, + addition: BTreeMap>, +) -> MResult<()> +{ + let mut synonyms_builder = SetBuilder::memory(); + + for (word, alternatives) in addition { + synonyms_builder.insert(&word).unwrap(); + + let alternatives = { + let alternatives = SetBuf::from_dirty(alternatives); + let mut alternatives_builder = SetBuilder::memory(); + alternatives_builder.extend_iter(alternatives).unwrap(); + let bytes = alternatives_builder.into_inner().unwrap(); + fst::Set::from_bytes(bytes).unwrap() + }; + + synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?; + } + + let delta_synonyms = synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let synonyms = match main_store.synonyms_fst(writer)? 
{ + Some(synonyms) => { + let op = OpBuilder::new() + .add(synonyms.stream()) + .add(delta_synonyms.stream()) + .r#union(); + + let mut synonyms_builder = SetBuilder::memory(); + synonyms_builder.extend_stream(op).unwrap(); + synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_synonyms, + }; + + main_store.put_synonyms_fst(writer, &synonyms)?; + + Ok(()) +} diff --git a/meilidb-data/src/database/update/synonyms_deletion.rs b/meilidb-core/src/update/synonyms_deletion.rs similarity index 60% rename from meilidb-data/src/database/update/synonyms_deletion.rs rename to meilidb-core/src/update/synonyms_deletion.rs index bc036cb06..1acb553e1 100644 --- a/meilidb-data/src/database/update/synonyms_deletion.rs +++ b/meilidb-core/src/update/synonyms_deletion.rs @@ -1,21 +1,33 @@ use std::collections::BTreeMap; use std::iter::FromIterator; -use std::sync::Arc; use fst::{SetBuilder, set::OpBuilder}; -use meilidb_core::normalize_str; use sdset::SetBuf; -use crate::database::{Error, Index, index::Cache}; +use crate::automaton::normalize_str; +use crate::update::{Update, next_update_id}; +use crate::{store, MResult}; -pub struct SynonymsDeletion<'a> { - index: &'a Index, +pub struct SynonymsDeletion { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, synonyms: BTreeMap>>, } -impl<'a> SynonymsDeletion<'a> { - pub fn new(index: &'a Index) -> SynonymsDeletion<'a> { - SynonymsDeletion { index, synonyms: BTreeMap::new() } +impl SynonymsDeletion { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + ) -> SynonymsDeletion + { + SynonymsDeletion { + updates_store, + updates_results_store, + updates_notifier, + synonyms: BTreeMap::new(), + } } pub fn delete_all_alternatives_of>(&mut self, synonym: S) { @@ -37,26 +49,48 @@ impl<'a> SynonymsDeletion<'a> { } } - pub fn 
finalize(self) -> Result { - self.index.push_synonyms_deletion(self.synonyms) + pub fn finalize(self, mut writer: rkv::Writer) -> MResult { + let update_id = push_synonyms_deletion( + &mut writer, + self.updates_store, + self.updates_results_store, + self.synonyms, + )?; + writer.commit()?; + let _ = self.updates_notifier.send(()); + + Ok(update_id) } } -pub fn apply_synonyms_deletion( - index: &Index, +pub fn push_synonyms_deletion( + writer: &mut rkv::Writer, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, deletion: BTreeMap>>, -) -> Result<(), Error> +) -> MResult { - let ref_index = index.as_ref(); - let synonyms = ref_index.synonyms_index; - let main = ref_index.main_index; + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + let update = Update::SynonymsDeletion(deletion); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + +pub fn apply_synonyms_deletion( + writer: &mut rkv::Writer, + main_store: store::Main, + synonyms_store: store::Synonyms, + deletion: BTreeMap>>, +) -> MResult<()> +{ let mut delete_whole_synonym_builder = SetBuilder::memory(); for (synonym, alternatives) in deletion { match alternatives { Some(alternatives) => { - let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?; + let prev_alternatives = synonyms_store.synonyms(writer, synonym.as_bytes())?; let prev_alternatives = match prev_alternatives { Some(alternatives) => alternatives, None => continue, @@ -81,19 +115,21 @@ pub fn apply_synonyms_deletion( let len = builder.get_ref().len(); builder.extend_stream(op).unwrap(); let is_empty = len == builder.get_ref().len(); - let alternatives = builder.into_inner().unwrap(); + let bytes = builder.into_inner().unwrap(); + let alternatives = fst::Set::from_bytes(bytes).unwrap(); + (alternatives, is_empty) }; if empty_alternatives { delete_whole_synonym_builder.insert(synonym.as_bytes())?; } else { - 
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?; + synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?; } }, None => { delete_whole_synonym_builder.insert(&synonym).unwrap(); - synonyms.del_alternatives_of(synonym.as_bytes())?; + synonyms_store.del_synonyms(writer, synonym.as_bytes())?; } } } @@ -103,7 +139,7 @@ pub fn apply_synonyms_deletion( .and_then(fst::Set::from_bytes) .unwrap(); - let synonyms = match main.synonyms_set()? { + let synonyms = match main_store.synonyms_fst(writer)? { Some(synonyms) => { let op = OpBuilder::new() .add(synonyms.stream()) @@ -120,18 +156,7 @@ pub fn apply_synonyms_deletion( None => fst::Set::default(), }; - main.set_synonyms_set(&synonyms)?; - - // update the "consistent" view of the Index - let cache = ref_index.cache; - let words = Arc::new(main.words_set()?.unwrap_or_default()); - let ranked_map = cache.ranked_map.clone(); - let synonyms = Arc::new(synonyms); - let schema = cache.schema.clone(); - let number_of_documents = cache.number_of_documents; - - let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents }; - index.cache.store(Arc::new(cache)); + main_store.put_synonyms_fst(writer, &synonyms)?; Ok(()) } diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml deleted file mode 100644 index a2428eea2..000000000 --- a/meilidb-data/Cargo.toml +++ /dev/null @@ -1,41 +0,0 @@ -[package] -name = "meilidb-data" -version = "0.1.0" -authors = ["Kerollmops "] -edition = "2018" - -[dependencies] -arc-swap = "0.4.2" -bincode = "1.1.4" -crossbeam-channel = "0.3.9" -deunicode = "1.0.0" -hashbrown = { version = "0.6.0", features = ["serde"] } -log = "0.4.6" -meilidb-core = { path = "../meilidb-core", version = "0.1.0" } -meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" } -meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } -ordered-float = { version = "1.0.2", features = ["serde"] } -rocksdb = "0.12.3" -sdset = "0.3.2" -serde = { version = 
"1.0.99", features = ["derive"] } -serde_json = "1.0.40" -siphasher = "0.3.0" -zerocopy = "0.2.8" - -[dependencies.rmp-serde] -git = "https://github.com/3Hren/msgpack-rust.git" -rev = "40b3d48" - -[dependencies.rmpv] -git = "https://github.com/3Hren/msgpack-rust.git" -rev = "40b3d48" -features = ["with-serde"] - -[dependencies.fst] -git = "https://github.com/Kerollmops/fst.git" -branch = "arc-byte-slice" - -[dev-dependencies] -tempfile = "3.1.0" -maplit = "1.0.2" -big_s = "1.0.2" diff --git a/meilidb-data/src/cf_tree.rs b/meilidb-data/src/cf_tree.rs deleted file mode 100644 index bb938ff13..000000000 --- a/meilidb-data/src/cf_tree.rs +++ /dev/null @@ -1,126 +0,0 @@ -use std::sync::Arc; -use crossbeam_channel::{unbounded, Sender, Receiver}; -use rocksdb::{DBVector, IteratorMode, Direction}; -use crate::RocksDbResult; - -#[derive(Clone)] -pub struct CfTree { - index: Arc, - sender: Option>, -} - -struct CfTreeInner { - db: Arc, - name: String, -} - -impl CfTree { - pub fn create(db: Arc, name: String) -> RocksDbResult { - let mut options = rocksdb::Options::default(); - options.create_missing_column_families(true); // this doesn't work - - if db.cf_handle(&name).is_none() { - let _cf = db.create_cf(&name, &options)?; - } - - let index = Arc::new(CfTreeInner { db, name }); - - Ok(CfTree { index, sender: None }) - } - - pub fn create_with_subcription( - db: Arc, - name: String, - ) -> RocksDbResult<(CfTree, Receiver<()>)> - { - let mut options = rocksdb::Options::default(); - options.create_missing_column_families(true); // this doesn't work - - if db.cf_handle(&name).is_none() { - let _cf = db.create_cf(&name, &options)?; - } - - let index = Arc::new(CfTreeInner { db, name }); - let (sender, receiver) = unbounded(); - - Ok((CfTree { index, sender: Some(sender) }, receiver)) - } - - pub fn insert(&self, key: K, value: V) -> RocksDbResult<()> - where K: AsRef<[u8]>, - V: AsRef<[u8]>, - { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - let result = 
self.index.db.put_cf(cf, key, value); - - if let Some(sender) = &self.sender { - let _err = sender.send(()); - } - - result - } - - pub fn get(&self, key: K) -> RocksDbResult> - where K: AsRef<[u8]>, - { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - self.index.db.get_cf(cf, key) - } - - pub fn remove(&self, key: K) -> RocksDbResult<()> - where K: AsRef<[u8]> - { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - self.index.db.delete_cf(cf, key) - } - - /// Start and end key range is inclusive on both bounds. - pub fn range(&self, start: KS, end: KE) -> RocksDbResult - where KS: AsRef<[u8]>, - KE: AsRef<[u8]>, - { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - - let mut iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?; - iter.set_mode(IteratorMode::From(start.as_ref(), Direction::Forward)); - - let end_bound = Box::from(end.as_ref()); - Ok(CfIter { iter, end_bound: Some(end_bound) }) - } - - pub fn iter(&self) -> RocksDbResult { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - let iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?; - Ok(CfIter { iter, end_bound: None }) - } - - pub fn last_key(&self) -> RocksDbResult>> { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - let mut iter = self.index.db.iterator_cf(cf, IteratorMode::End)?; - Ok(iter.next().map(|(key, _)| key)) - } - - pub fn prefix_iterator

(&self, prefix: P) -> RocksDbResult - where P: AsRef<[u8]>, - { - let cf = self.index.db.cf_handle(&self.index.name).unwrap(); - self.index.db.prefix_iterator_cf(cf, prefix) - } -} - -pub struct CfIter<'a> { - iter: rocksdb::DBIterator<'a>, - end_bound: Option>, -} - -impl Iterator for CfIter<'_> { - type Item = (Box<[u8]>, Box<[u8]>); - - fn next(&mut self) -> Option { - match (self.iter.next(), &self.end_bound) { - (Some((ref key, _)), Some(end_bound)) if key > end_bound => None, - (Some(entry), _) => Some(entry), - (None, _) => None, - } - } -} diff --git a/meilidb-data/src/database/error.rs b/meilidb-data/src/database/error.rs deleted file mode 100644 index 6da64c3e0..000000000 --- a/meilidb-data/src/database/error.rs +++ /dev/null @@ -1,73 +0,0 @@ -use std::{error, fmt}; -use crate::serde::SerializerError; - -#[derive(Debug)] -pub enum Error { - SchemaDiffer, - SchemaMissing, - WordIndexMissing, - MissingDocumentId, - RocksDbError(rocksdb::Error), - FstError(fst::Error), - RmpDecodeError(rmp_serde::decode::Error), - RmpEncodeError(rmp_serde::encode::Error), - BincodeError(bincode::Error), - SerializerError(SerializerError), -} - -impl From for Error { - fn from(error: rocksdb::Error) -> Error { - Error::RocksDbError(error) - } -} - -impl From for Error { - fn from(error: fst::Error) -> Error { - Error::FstError(error) - } -} - -impl From for Error { - fn from(error: rmp_serde::decode::Error) -> Error { - Error::RmpDecodeError(error) - } -} - -impl From for Error { - fn from(error: rmp_serde::encode::Error) -> Error { - Error::RmpEncodeError(error) - } -} - -impl From for Error { - fn from(error: bincode::Error) -> Error { - Error::BincodeError(error) - } -} - -impl From for Error { - fn from(error: SerializerError) -> Error { - Error::SerializerError(error) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - match self { - SchemaDiffer => write!(f, "schemas differ"), - SchemaMissing => 
write!(f, "this index does not have a schema"), - WordIndexMissing => write!(f, "this index does not have a word index"), - MissingDocumentId => write!(f, "document id is missing"), - RocksDbError(e) => write!(f, "RocksDB error; {}", e), - FstError(e) => write!(f, "fst error; {}", e), - RmpDecodeError(e) => write!(f, "rmp decode error; {}", e), - RmpEncodeError(e) => write!(f, "rmp encode error; {}", e), - BincodeError(e) => write!(f, "bincode error; {}", e), - SerializerError(e) => write!(f, "serializer error; {}", e), - } - } -} - -impl error::Error for Error { } - diff --git a/meilidb-data/src/database/index/common_index.rs b/meilidb-data/src/database/index/common_index.rs deleted file mode 100644 index 04fe39bd3..000000000 --- a/meilidb-data/src/database/index/common_index.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::ops::Deref; -use serde::de::DeserializeOwned; -use serde::Serialize; -use super::Error; -use std::marker::PhantomData; - -#[derive(Clone)] -pub struct CommonIndex(pub crate::CfTree); - -impl Deref for CommonIndex { - type Target = crate::CfTree; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl CommonIndex { - pub fn get(&self, key: K) -> Result, Error> - where T: DeserializeOwned, - K: AsRef<[u8]>, - { - let raw = match self.0.get(key)? 
{ - Some(raw) => raw, - None => return Ok(None), - }; - let data = bincode::deserialize(&raw)?; - Ok(Some(data)) - } - - pub fn set(&self, key: K, data: &T) -> Result<(), Error> - where T: Serialize, - K: AsRef<[u8]>, - { - let raw = bincode::serialize(data)?; - self.0.insert(key, &raw)?; - Ok(()) - } - - pub fn prefix_iterator(&self, prefix: P) -> Result, Error> - where T: DeserializeOwned, - P: AsRef<[u8]>, - { - let iter = self.0.prefix_iterator(prefix)?; - Ok(SerializedIterator { iter, _marker: PhantomData }) - } -} - -pub struct SerializedIterator<'a, T> { - iter: rocksdb::DBIterator<'a>, - _marker: PhantomData, -} - -impl Iterator for SerializedIterator<'_, T> -where T: DeserializeOwned, -{ - type Item = (String, T); - - fn next(&mut self) -> Option { - let (raw_key, raw_value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return None, - }; - - let value: T = match bincode::deserialize(&raw_value) { - Ok(data) => data, - Err(_) => return None, - }; - - let key = match std::str::from_utf8(&raw_key) { - Ok(key) => key.to_string(), - Err(_) => return None, - }; - - Some((key, value)) - } -} diff --git a/meilidb-data/src/database/index/custom_settings_index.rs b/meilidb-data/src/database/index/custom_settings_index.rs deleted file mode 100644 index 3404eff5b..000000000 --- a/meilidb-data/src/database/index/custom_settings_index.rs +++ /dev/null @@ -1,89 +0,0 @@ -use serde::de::DeserializeOwned; -use serde::{Serialize, Deserialize}; -use std::collections::{HashMap, HashSet}; -use std::ops::Deref; -use super::Error; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum RankingOrdering { - Asc, - Dsc -} - -pub type StopWords = HashSet; -pub type RankingOrder = Vec; -pub type DistinctField = String; -pub type RankingRules = HashMap; - -const STOP_WORDS_KEY: &str = "stop-words"; -const RANKING_ORDER_KEY: &str = "ranking-order"; -const DISTINCT_FIELD_KEY: &str = 
"distinct-field"; -const RANKING_RULES_KEY: &str = "ranking-rules"; - -#[derive(Clone)] -pub struct CustomSettingsIndex(pub(crate) crate::CfTree); - -impl Deref for CustomSettingsIndex { - type Target = crate::CfTree; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl CustomSettingsIndex { - fn get(&self, key: K) -> Result, Error> - where K: AsRef<[u8]>, - T: DeserializeOwned, - { - let setting = self.0.get(key)?; - let raw = match setting { - Some(raw) => raw, - None => return Ok(None) - }; - - Ok(Some(bincode::deserialize(&raw)?)) - } - - fn set(&self, key: K, data: &T) -> Result<(), Error> - where K: AsRef<[u8]>, - T: Serialize, - { - let raw = bincode::serialize(data)?; - self.0.insert(key, &raw)?; - Ok(()) - } - - pub fn get_stop_words(&self) -> Result, Error> { - self.get(STOP_WORDS_KEY) - } - - pub fn get_ranking_order(&self) -> Result, Error> { - self.get(RANKING_ORDER_KEY) - } - - pub fn get_distinct_field(&self) -> Result, Error> { - self.get(DISTINCT_FIELD_KEY) - } - - pub fn get_ranking_rules(&self) -> Result, Error> { - self.get(RANKING_RULES_KEY) - } - - pub fn set_stop_words(&self, value: &StopWords) -> Result<(), Error> { - self.set(STOP_WORDS_KEY, value) - } - - pub fn set_ranking_order(&self, value: &RankingOrder) -> Result<(), Error> { - self.set(RANKING_ORDER_KEY, value) - } - - pub fn set_distinct_field(&self, value: &DistinctField) -> Result<(), Error> { - self.set(DISTINCT_FIELD_KEY, value) - } - - pub fn set_ranking_rules(&self, value: &RankingRules) -> Result<(), Error> { - self.set(RANKING_RULES_KEY, value) - } -} diff --git a/meilidb-data/src/database/index/docs_words_index.rs b/meilidb-data/src/database/index/docs_words_index.rs deleted file mode 100644 index 8763dc588..000000000 --- a/meilidb-data/src/database/index/docs_words_index.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::sync::Arc; -use meilidb_core::DocumentId; -use crate::database::Error; - -#[derive(Clone)] -pub struct DocsWordsIndex(pub crate::CfTree); - -impl 
DocsWordsIndex { - pub fn doc_words(&self, id: DocumentId) -> Result, Error> { - let key = id.0.to_be_bytes(); - match self.0.get(key)? { - Some(bytes) => { - let len = bytes.len(); - let value = Arc::from(bytes.as_ref()); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; - Ok(Some(fst::Set::from(fst))) - }, - None => Ok(None) - } - } - - pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> { - let key = id.0.to_be_bytes(); - self.0.insert(key, words.as_fst().as_bytes())?; - Ok(()) - } - - pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> { - let key = id.0.to_be_bytes(); - self.0.remove(key)?; - Ok(()) - } -} diff --git a/meilidb-data/src/database/index/documents_index.rs b/meilidb-data/src/database/index/documents_index.rs deleted file mode 100644 index 013b23f09..000000000 --- a/meilidb-data/src/database/index/documents_index.rs +++ /dev/null @@ -1,146 +0,0 @@ -use std::convert::TryInto; -use std::collections::HashMap; - -use meilidb_core::DocumentId; -use meilidb_schema::{Schema, SchemaAttr}; -use rocksdb::DBVector; - -use crate::document_attr_key::DocumentAttrKey; -use crate::RocksDbResult; - -fn document_fields_range(id: DocumentId) -> ([u8; 10], [u8; 10]) { - let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes(); - let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes(); - - (start, end) -} - -#[derive(Clone)] -pub struct DocumentsIndex(pub(crate) crate::CfTree); - -impl DocumentsIndex { - pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.get(key) - } - - pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec) -> RocksDbResult<()> { - let key = DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.insert(key, value)?; - Ok(()) - } - - pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<()> { - let key = 
DocumentAttrKey::new(id, attr).to_be_bytes(); - self.0.remove(key)?; - Ok(()) - } - - pub fn del_all_document_fields(&self, id: DocumentId) -> RocksDbResult { - let (start, end) = document_fields_range(id); - - let mut count = 0; - for (key, _) in self.0.range(start, end)? { - self.0.remove(key)?; - count += 1; - } - - Ok(count) - } - - pub fn document_fields(&self, id: DocumentId) -> RocksDbResult { - let (start, end) = document_fields_range(id); - - let iter = self.0.range(start, end)?; - Ok(DocumentFieldsIter(iter)) - } - - pub fn documents_ids(&self) -> RocksDbResult { - let iter = DocumentsKeysIter(self.0.iter()?); - Ok(DocumentsIdsIter { inner: iter, last: None }) - } - - pub fn documents_fields_repartition(&self, schema: Schema) -> RocksDbResult> { - let iter = self.0.iter()?; - let mut repartition_attributes_id = HashMap::new(); - for key in DocumentsKeysIter(iter) { - let counter = repartition_attributes_id.entry(key.attribute).or_insert(0); - *counter += 1u64; - } - let mut repartition_with_attribute_name = HashMap::new(); - for (key, val) in repartition_attributes_id { - repartition_with_attribute_name.insert(schema.attribute_name(key).to_owned(), val); - } - Ok(repartition_with_attribute_name) - } - - pub fn len(&self) -> RocksDbResult { - let mut last_document_id = None; - let mut count = 0; - - for (key, _) in self.0.iter()? 
{ - let array = key.as_ref().try_into().unwrap(); - let document_id = DocumentAttrKey::from_be_bytes(array).document_id; - - if Some(document_id) != last_document_id { - last_document_id = Some(document_id); - count += 1; - } - } - - Ok(count) - } -} - -pub struct DocumentFieldsIter<'a>(crate::CfIter<'a>); - -impl Iterator for DocumentFieldsIter<'_> { - type Item = (SchemaAttr, Box<[u8]>); - - fn next(&mut self) -> Option { - match self.0.next() { - Some((key, value)) => { - let array = key.as_ref().try_into().unwrap(); - let key = DocumentAttrKey::from_be_bytes(array); - Some((key.attribute, value)) - }, - None => None, - } - } -} - -pub struct DocumentsKeysIter<'a>(crate::CfIter<'a>); - -impl Iterator for DocumentsKeysIter<'_> { - type Item = DocumentAttrKey; - - fn next(&mut self) -> Option { - match self.0.next() { - Some((key, _)) => { - let array = key.as_ref().try_into().unwrap(); - let key = DocumentAttrKey::from_be_bytes(array); - Some(key) - }, - None => None, - } - } -} - -pub struct DocumentsIdsIter<'a> { - inner: DocumentsKeysIter<'a>, - last: Option, -} - -impl Iterator for DocumentsIdsIter<'_> { - type Item = DocumentId; - - fn next(&mut self) -> Option { - for DocumentAttrKey { document_id, .. 
} in &mut self.inner { - if self.last != Some(document_id) { - self.last = Some(document_id); - return Some(document_id) - } - } - None - } -} diff --git a/meilidb-data/src/database/index/main_index.rs b/meilidb-data/src/database/index/main_index.rs deleted file mode 100644 index b7141c80f..000000000 --- a/meilidb-data/src/database/index/main_index.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::sync::Arc; -use std::convert::TryInto; - -use meilidb_schema::Schema; - -use crate::ranked_map::RankedMap; -use crate::database::Error; - -const SCHEMA_KEY: &str = "schema"; -const WORDS_KEY: &str = "words"; -const SYNONYMS_KEY: &str = "synonyms"; -const RANKED_MAP_KEY: &str = "ranked-map"; -const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; - -#[derive(Clone)] -pub struct MainIndex(pub(crate) crate::CfTree); - -impl MainIndex { - pub fn schema(&self) -> Result, Error> { - match self.0.get(SCHEMA_KEY)? { - Some(bytes) => { - let schema = bincode::deserialize_from(bytes.as_ref())?; - Ok(Some(schema)) - }, - None => Ok(None), - } - } - - pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> { - let bytes = bincode::serialize(schema)?; - self.0.insert(SCHEMA_KEY, bytes)?; - Ok(()) - } - - pub fn words_set(&self) -> Result, Error> { - match self.0.get(WORDS_KEY)? { - Some(bytes) => { - let len = bytes.len(); - let value = Arc::from(bytes.as_ref()); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; - Ok(Some(fst::Set::from(fst))) - }, - None => Ok(None), - } - } - - pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> { - self.0.insert(WORDS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into) - } - - pub fn synonyms_set(&self) -> Result, Error> { - match self.0.get(SYNONYMS_KEY)? 
{ - Some(bytes) => { - let len = bytes.len(); - let value = Arc::from(bytes.as_ref()); - let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; - Ok(Some(fst::Set::from(fst))) - }, - None => Ok(None), - } - } - - pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> { - self.0.insert(SYNONYMS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into) - } - - pub fn ranked_map(&self) -> Result, Error> { - match self.0.get(RANKED_MAP_KEY)? { - Some(bytes) => { - let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?; - Ok(Some(ranked_map)) - }, - None => Ok(None), - } - } - - pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> { - let mut bytes = Vec::new(); - value.write_to_bin(&mut bytes)?; - self.0.insert(RANKED_MAP_KEY, bytes)?; - Ok(()) - } - - pub fn number_of_documents(&self) -> Result { - match self.0.get(NUMBER_OF_DOCUMENTS_KEY)? { - Some(bytes) => { - let array = (*bytes).try_into().unwrap(); - Ok(u64::from_be_bytes(array)) - }, - None => Ok(0), - } - } - - pub fn set_number_of_documents(&self, f: F) -> Result - where F: FnOnce(u64) -> u64, - { - let new = self.number_of_documents().map(f)?; - self.0.insert(NUMBER_OF_DOCUMENTS_KEY, new.to_be_bytes())?; - Ok(new) - } -} diff --git a/meilidb-data/src/database/index/mod.rs b/meilidb-data/src/database/index/mod.rs deleted file mode 100644 index 071b60bab..000000000 --- a/meilidb-data/src/database/index/mod.rs +++ /dev/null @@ -1,525 +0,0 @@ -use std::collections::{HashMap, HashSet, BTreeMap}; -use std::convert::TryInto; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::thread; -use std::time::{Duration, Instant}; - -use arc_swap::{ArcSwap, ArcSwapOption, Guard}; -use crossbeam_channel::Receiver; -use meilidb_core::criterion::Criteria; -use meilidb_core::{DocIndex, Store, DocumentId, QueryBuilder}; -use meilidb_schema::Schema; -use sdset::SetBuf; -use serde::{de, Serialize, Deserialize}; - -use crate::CfTree; -use 
crate::ranked_map::RankedMap; -use crate::serde::{Deserializer, DeserializerError}; - -pub use self::custom_settings_index::{CustomSettingsIndex, RankingOrdering, StopWords, RankingOrder, DistinctField, RankingRules}; -pub use self::common_index::CommonIndex; -pub use self::documents_index::DocumentsIdsIter; -use self::docs_words_index::DocsWordsIndex; -use self::documents_index::DocumentsIndex; -use self::main_index::MainIndex; -use self::synonyms_index::SynonymsIndex; -use self::words_index::WordsIndex; - -use crate::RocksDbResult; -use crate::database::{ - Error, - DocumentsAddition, DocumentsDeletion, - SynonymsAddition, SynonymsDeletion, - apply_documents_addition, apply_documents_deletion, - apply_synonyms_addition, apply_synonyms_deletion, -}; - -mod common_index; -mod custom_settings_index; -mod docs_words_index; -mod documents_index; -mod main_index; -mod synonyms_index; -mod words_index; - -#[derive(Serialize, Deserialize)] -enum Update { - DocumentsAddition(Vec), - DocumentsDeletion(Vec), - SynonymsAddition(BTreeMap>), - SynonymsDeletion(BTreeMap>>), -} - -#[derive(Clone, Serialize, Deserialize)] -pub enum UpdateType { - DocumentsAddition { number: usize }, - DocumentsDeletion { number: usize }, - SynonymsAddition { number: usize }, - SynonymsDeletion { number: usize }, -} - -#[derive(Clone, Serialize, Deserialize)] -pub struct DetailedDuration { - pub main: Duration, -} - -#[derive(Clone, Serialize, Deserialize)] -pub struct UpdateResult { - pub update_id: u64, - pub update_type: UpdateType, - pub result: Result<(), String>, - pub detailed_duration: DetailedDuration, -} - -#[derive(Clone, Serialize, Deserialize)] -pub enum UpdateStatus { - Enqueued, - Processed(UpdateResult), - Unknown, -} - -fn spawn_update_system(index: Index, subscription: Receiver<()>) -> thread::JoinHandle<()> { - thread::spawn(move || { - let mut subscription = subscription.into_iter(); - - loop { - while let Some((key, _)) = index.updates_index.iter().unwrap().next() { - let 
update_id = key.as_ref().try_into().map(u64::from_be_bytes).unwrap(); - - let updates = &index.updates_index; - let results = &index.updates_results_index; - - let update = updates.get(&key).unwrap().unwrap(); - - let (update_type, result, duration) = match rmp_serde::from_read_ref(&update).unwrap() { - Update::DocumentsAddition(documents) => { - let update_type = UpdateType::DocumentsAddition { number: documents.len() }; - let ranked_map = index.cache.load().ranked_map.clone(); - let start = Instant::now(); - let result = apply_documents_addition(&index, ranked_map, documents); - (update_type, result, start.elapsed()) - }, - Update::DocumentsDeletion(documents) => { - let update_type = UpdateType::DocumentsDeletion { number: documents.len() }; - let ranked_map = index.cache.load().ranked_map.clone(); - let start = Instant::now(); - let result = apply_documents_deletion(&index, ranked_map, documents); - (update_type, result, start.elapsed()) - }, - Update::SynonymsAddition(synonyms) => { - let update_type = UpdateType::SynonymsAddition { number: synonyms.len() }; - let start = Instant::now(); - let result = apply_synonyms_addition(&index, synonyms); - (update_type, result, start.elapsed()) - }, - Update::SynonymsDeletion(synonyms) => { - let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() }; - let start = Instant::now(); - let result = apply_synonyms_deletion(&index, synonyms); - (update_type, result, start.elapsed()) - }, - }; - - let detailed_duration = DetailedDuration { main: duration }; - let status = UpdateResult { - update_id, - update_type, - result: result.map_err(|e| e.to_string()), - detailed_duration, - }; - - if let Some(callback) = &*index.update_callback.load() { - (callback)(status.clone()); - } - - let value = bincode::serialize(&status).unwrap(); - results.insert(&key, value).unwrap(); - updates.remove(&key).unwrap(); - } - - // this subscription is just used to block - // the loop until a new update is inserted - 
subscription.next(); - } - }) -} - -fn last_update_id( - update_index: &crate::CfTree, - update_results_index: &crate::CfTree, -) -> RocksDbResult -{ - let uikey = match update_index.last_key()? { - Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()), - None => None, - }; - - let urikey = match update_results_index.last_key()? { - Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()), - None => None, - }; - - Ok(uikey.max(urikey).unwrap_or(0)) -} - -#[derive(Clone)] -pub struct IndexStats { - pub number_of_words: usize, - pub number_of_documents: u64, - pub number_attrs_in_ranked_map: usize, - pub documents_fields_repartition: HashMap, -} - -#[derive(Clone)] -pub struct Index { - pub(crate) cache: Arc>, - - // TODO this will be a snapshot in the future - main_index: MainIndex, - synonyms_index: SynonymsIndex, - words_index: WordsIndex, - docs_words_index: DocsWordsIndex, - documents_index: DocumentsIndex, - custom_settings_index: CustomSettingsIndex, - - // used by the update system - updates_id: Arc, - updates_index: crate::CfTree, - updates_results_index: crate::CfTree, - update_callback: Arc>>, -} - -pub(crate) struct Cache { - pub words: Arc, - pub synonyms: Arc, - pub schema: Schema, - pub ranked_map: RankedMap, - pub number_of_documents: u64, -} - -impl Index { - pub fn new(db: Arc, name: &str) -> Result { - Index::new_raw(db, name, None) - } - - pub fn with_schema(db: Arc, name: &str, schema: Schema) -> Result { - Index::new_raw(db, name, Some(schema)) - } - - fn new_raw(db: Arc, name: &str, schema: Option) -> Result { - let main_index = CfTree::create(db.clone(), name.to_string()).map(MainIndex)?; - let synonyms_index = CfTree::create(db.clone(), format!("{}-synonyms", name)).map(SynonymsIndex)?; - let words_index = CfTree::create(db.clone(), format!("{}-words", name)).map(WordsIndex)?; - let docs_words_index = CfTree::create(db.clone(), format!("{}-docs-words", name)).map(DocsWordsIndex)?; - let documents_index 
= CfTree::create(db.clone(), format!("{}-documents", name)).map(DocumentsIndex)?; - let custom_settings_index = CfTree::create(db.clone(), format!("{}-custom", name)).map(CustomSettingsIndex)?; - let (updates_index, subscription) = CfTree::create_with_subcription(db.clone(), format!("{}-updates", name))?; - let updates_results_index = CfTree::create(db.clone(), format!("{}-updates-results", name))?; - - let words = match main_index.words_set()? { - Some(words) => Arc::new(words), - None => Arc::new(fst::Set::default()), - }; - - let synonyms = match main_index.synonyms_set()? { - Some(synonyms) => Arc::new(synonyms), - None => Arc::new(fst::Set::default()), - }; - - let schema = match (schema, main_index.schema()?) { - (Some(ref expected), Some(ref current)) if current != expected => { - return Err(Error::SchemaDiffer) - }, - (Some(expected), Some(_)) => expected, - (Some(expected), None) => { - main_index.set_schema(&expected)?; - expected - }, - (None, Some(current)) => current, - (None, None) => return Err(Error::SchemaMissing), - }; - - let ranked_map = match main_index.ranked_map()? 
{ - Some(map) => map, - None => RankedMap::default(), - }; - - let number_of_documents = documents_index.len()?; - - let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents }; - let cache = Arc::new(ArcSwap::from_pointee(cache)); - - let last_update_id = last_update_id(&updates_index, &updates_results_index)?; - let updates_id = Arc::new(AtomicU64::new(last_update_id + 1)); - - let index = Index { - cache, - main_index, - synonyms_index, - words_index, - docs_words_index, - documents_index, - custom_settings_index, - updates_id, - updates_index, - updates_results_index, - update_callback: Arc::new(ArcSwapOption::empty()), - }; - - let _handle = spawn_update_system(index.clone(), subscription); - - Ok(index) - } - - pub fn set_update_callback(&self, callback: F) - where F: Fn(UpdateResult) + Send + Sync + 'static - { - self.update_callback.store(Some(Arc::new(Box::new(callback)))); - } - - pub fn unset_update_callback(&self) { - self.update_callback.store(None); - } - - pub fn stats(&self) -> RocksDbResult { - let cache = self.cache.load(); - let documents_fields_repartition = self.documents_index.documents_fields_repartition(cache.schema.clone())?; - Ok(IndexStats { - number_of_words: cache.words.len(), - number_of_documents: cache.number_of_documents, - number_attrs_in_ranked_map: cache.ranked_map.len(), - documents_fields_repartition, - }) - } - - pub fn query_builder(&self) -> QueryBuilder { - let ref_index = self.as_ref(); - QueryBuilder::new(ref_index) - } - - pub fn query_builder_with_criteria<'c>( - &self, - criteria: Criteria<'c>, - ) -> QueryBuilder<'c, RefIndex> - { - let ref_index = self.as_ref(); - QueryBuilder::with_criteria(ref_index, criteria) - } - - pub fn as_ref(&self) -> RefIndex { - RefIndex { - cache: self.cache.load(), - main_index: &self.main_index, - synonyms_index: &self.synonyms_index, - words_index: &self.words_index, - docs_words_index: &self.docs_words_index, - documents_index: &self.documents_index, - 
custom_settings_index: &self.custom_settings_index, - } - } - - pub fn schema(&self) -> Schema { - self.cache.load().schema.clone() - } - - pub fn ranked_map(&self) -> RankedMap { - self.cache.load().ranked_map.clone() - } - - pub fn synonyms_index(&self) -> SynonymsIndex { - self.synonyms_index.clone() - } - - pub fn synonyms_set(&self) -> Arc { - self.cache.load().synonyms.clone() - } - - pub fn custom_settings(&self) -> CustomSettingsIndex { - self.custom_settings_index.clone() - } - - pub fn number_of_documents(&self) -> u64 { - self.cache.load().number_of_documents - } - - pub fn documents_addition(&self) -> DocumentsAddition { - DocumentsAddition::new(self) - } - - pub fn documents_deletion(&self) -> DocumentsDeletion { - DocumentsDeletion::new(self) - } - - pub fn synonyms_addition(&self) -> SynonymsAddition { - SynonymsAddition::new(self) - } - - pub fn synonyms_deletion(&self) -> SynonymsDeletion { - SynonymsDeletion::new(self) - } - - pub fn current_update_id(&self) -> Result, Error> { - if let Some((key, _)) = self.updates_index.iter()?.next() { - return Ok(Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap())) - } - Ok(None) - } - - pub fn enqueued_updates_ids(&self) -> Result, Error> { - let mut updates = Vec::new(); - - for (key, _) in self.updates_index.iter()? { - let update_id = key.as_ref().try_into().map(u64::from_be_bytes).unwrap(); - updates.push(update_id); - } - - Ok(updates) - } - - pub fn update_status( - &self, - update_id: u64, - ) -> Result - { - let update_id = update_id.to_be_bytes(); - match self.updates_results_index.get(update_id)? { - Some(value) => { - let value = bincode::deserialize(&value)?; - Ok(UpdateStatus::Processed(value)) - }, - None => { - match self.updates_index.get(update_id)? 
{ - Some(_) => Ok(UpdateStatus::Enqueued), - None => Ok(UpdateStatus::Unknown), - } - } - } - } - - pub fn update_status_blocking( - &self, - update_id: u64, - ) -> Result - { - loop { - if let Some(value) = self.updates_results_index.get(&update_id.to_be_bytes())? { - let value = bincode::deserialize(&value)?; - return Ok(value) - } - std::thread::sleep(Duration::from_millis(300)); - } - } - - pub fn documents_ids(&self) -> Result { - Ok(self.documents_index.documents_ids()?) - } - - pub fn document( - &self, - fields: Option<&HashSet<&str>>, - id: DocumentId, - ) -> Result, DeserializerError> - where T: de::DeserializeOwned, - { - let schema = self.schema(); - let fields = match fields { - Some(fields) => fields.into_iter().map(|name| schema.attribute(name)).collect(), - None => None, - }; - - let mut deserializer = Deserializer { - document_id: id, - index: &self, - fields: fields.as_ref(), - }; - - // TODO: currently we return an error if all document fields are missing, - // returning None would have been better - T::deserialize(&mut deserializer).map(Some) - } -} - -impl Index { - pub(crate) fn push_documents_addition(&self, addition: Vec) -> Result - where D: serde::Serialize - { - let mut values = Vec::with_capacity(addition.len()); - for add in addition { - let vec = rmp_serde::to_vec_named(&add)?; - let add = rmp_serde::from_read(&vec[..])?; - values.push(add); - } - - let addition = Update::DocumentsAddition(values); - let update = rmp_serde::to_vec_named(&addition)?; - self.raw_push_update(update) - } - - pub(crate) fn push_documents_deletion( - &self, - deletion: Vec, - ) -> Result - { - let deletion = Update::DocumentsDeletion(deletion); - let update = rmp_serde::to_vec_named(&deletion)?; - self.raw_push_update(update) - } - - pub(crate) fn push_synonyms_addition( - &self, - addition: BTreeMap>, - ) -> Result - { - let addition = Update::SynonymsAddition(addition); - let update = rmp_serde::to_vec_named(&addition)?; - self.raw_push_update(update) - } 
- - pub(crate) fn push_synonyms_deletion( - &self, - deletion: BTreeMap>>, - ) -> Result - { - let deletion = Update::SynonymsDeletion(deletion); - let update = rmp_serde::to_vec_named(&deletion)?; - self.raw_push_update(update) - } - - fn raw_push_update(&self, raw_update: Vec) -> Result { - let update_id = self.updates_id.fetch_add(1, Ordering::SeqCst); - let update_id_array = update_id.to_be_bytes(); - self.updates_index.insert(update_id_array, raw_update)?; - Ok(update_id) - } -} - -pub struct RefIndex<'a> { - pub(crate) cache: Guard<'static, Arc>, - pub main_index: &'a MainIndex, - pub synonyms_index: &'a SynonymsIndex, - pub words_index: &'a WordsIndex, - pub docs_words_index: &'a DocsWordsIndex, - pub documents_index: &'a DocumentsIndex, - pub custom_settings_index: &'a CustomSettingsIndex, -} - -impl Store for RefIndex<'_> { - type Error = Error; - - fn words(&self) -> Result<&fst::Set, Self::Error> { - Ok(&self.cache.words) - } - - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - Ok(self.words_index.doc_indexes(word)?) - } - - fn synonyms(&self) -> Result<&fst::Set, Self::Error> { - Ok(&self.cache.synonyms) - } - - fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { - Ok(self.synonyms_index.alternatives_to(word)?) - } -} diff --git a/meilidb-data/src/database/index/synonyms_index.rs b/meilidb-data/src/database/index/synonyms_index.rs deleted file mode 100644 index ec8901c9e..000000000 --- a/meilidb-data/src/database/index/synonyms_index.rs +++ /dev/null @@ -1,21 +0,0 @@ -use crate::RocksDbResult; - -#[derive(Clone)] -pub struct SynonymsIndex(pub(crate) crate::CfTree); - -impl SynonymsIndex { - pub fn alternatives_to(&self, word: &[u8]) -> RocksDbResult> { - match self.0.get(word)? 
{ - Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())), - None => Ok(None), - } - } - - pub fn set_alternatives_to(&self, word: &[u8], value: Vec) -> RocksDbResult<()> { - self.0.insert(word, value).map(drop) - } - - pub fn del_alternatives_of(&self, word: &[u8]) -> RocksDbResult<()> { - self.0.remove(word).map(drop) - } -} diff --git a/meilidb-data/src/database/index/words_index.rs b/meilidb-data/src/database/index/words_index.rs deleted file mode 100644 index 97f9372f5..000000000 --- a/meilidb-data/src/database/index/words_index.rs +++ /dev/null @@ -1,45 +0,0 @@ -use meilidb_core::DocIndex; -use sdset::{Set, SetBuf}; -use zerocopy::{LayoutVerified, AsBytes}; -use crate::RocksDbResult; - -#[derive(Clone)] -pub struct WordsIndex(pub(crate) crate::CfTree); - -impl WordsIndex { - pub fn doc_indexes(&self, word: &[u8]) -> RocksDbResult>> { - // we must force an allocation to make the memory aligned - match self.0.get(word)? { - Some(bytes) => { - let vec = match LayoutVerified::new_slice(bytes.as_ref()) { - Some(layout) => layout.into_slice().to_vec(), - None => { - let len = bytes.as_ref().len(); - let count = len / std::mem::size_of::(); - let mut buf: Vec = Vec::with_capacity(count); - unsafe { - let src = bytes.as_ref().as_ptr(); - let dst = buf.as_mut_ptr() as *mut u8; - std::ptr::copy_nonoverlapping(src, dst, len); - buf.set_len(count); - } - buf - } - }; - - let setbuf = SetBuf::new_unchecked(vec); - - Ok(Some(setbuf)) - }, - None => Ok(None), - } - } - - pub fn set_doc_indexes(&self, word: &[u8], set: &Set) -> RocksDbResult<()> { - self.0.insert(word, set.as_bytes()).map(drop) - } - - pub fn del_doc_indexes(&self, word: &[u8]) -> RocksDbResult<()> { - self.0.remove(word).map(drop) - } -} diff --git a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs deleted file mode 100644 index c4ff406a5..000000000 --- a/meilidb-data/src/database/mod.rs +++ /dev/null @@ -1,155 +0,0 @@ -use std::collections::hash_map::Entry; -use 
std::collections::{HashSet, HashMap}; -use std::path::Path; -use std::sync::Arc; -use std::sync::RwLock; -use meilidb_schema::Schema; - -mod error; -mod index; -mod update; - -use crate::CfTree; - -pub use self::error::Error; -pub use self::index::{ - Index, CustomSettingsIndex, CommonIndex, RankingOrdering, - StopWords, RankingOrder, DistinctField, RankingRules, - UpdateType, DetailedDuration, UpdateResult, UpdateStatus -}; - -pub use self::update::DocumentsAddition; -pub use self::update::DocumentsDeletion; -pub use self::update::SynonymsAddition; -pub use self::update::SynonymsDeletion; - -use self::update::apply_documents_addition; -use self::update::apply_documents_deletion; -use self::update::apply_synonyms_addition; -use self::update::apply_synonyms_deletion; - -const INDEXES_KEY: &str = "indexes"; -const COMMON_KEY: &str = "common-index"; - -fn load_indexes(tree: &rocksdb::DB) -> Result, Error> { - match tree.get(INDEXES_KEY)? { - Some(bytes) => Ok(bincode::deserialize(&bytes)?), - None => Ok(HashSet::new()) - } -} - -pub struct Database { - cache: RwLock>, - inner: Arc, - common: Arc, -} - -impl Database { - pub fn open>(path: P) -> Result { - let cache = RwLock::new(HashMap::new()); - - let mut options = rocksdb::Options::default(); - options.create_if_missing(true); - - let cfs = rocksdb::DB::list_cf(&options, &path).unwrap_or_default(); - let inner = Arc::new(rocksdb::DB::open_cf(&options, path, cfs)?); - let common_tree = CfTree::create(inner.clone(), COMMON_KEY.to_owned())?; - let common = Arc::new(CommonIndex(common_tree)); - let indexes = load_indexes(&inner)?; - let database = Database { cache, inner, common }; - - for index in indexes { - database.open_index(&index)?; - } - - Ok(database) - } - - pub fn indexes(&self) -> Result, Error> { - load_indexes(&self.inner) - } - - fn set_indexes(&self, value: &HashSet) -> Result<(), Error> { - let bytes = bincode::serialize(value)?; - self.inner.put(INDEXES_KEY, bytes)?; - Ok(()) - } - - pub fn 
open_index(&self, name: &str) -> Result, Error> { - { - let cache = self.cache.read().unwrap(); - if let Some(index) = cache.get(name).cloned() { - return Ok(Some(index)) - } - } - - let mut cache = self.cache.write().unwrap(); - let index = match cache.entry(name.to_string()) { - Entry::Occupied(occupied) => { - occupied.get().clone() - }, - Entry::Vacant(vacant) => { - if !self.indexes()?.contains(name) { - return Ok(None) - } - - let index = Index::new(self.inner.clone(), name)?; - vacant.insert(index).clone() - }, - }; - - Ok(Some(index)) - } - - pub fn create_index(&self, name: &str, schema: Schema) -> Result { - let mut cache = self.cache.write().unwrap(); - - let index = match cache.entry(name.to_string()) { - Entry::Occupied(occupied) => { - occupied.get().clone() - }, - Entry::Vacant(vacant) => { - let index = Index::with_schema(self.inner.clone(), name, schema)?; - - let mut indexes = self.indexes()?; - indexes.insert(name.to_string()); - self.set_indexes(&indexes)?; - - vacant.insert(index).clone() - }, - }; - - Ok(index) - } - - pub fn delete_index(&self, name: &str) -> Result<(), Error> { - let mut cache = self.cache.write().unwrap(); - - self.inner.drop_cf(name)?; - let _ = self.inner.drop_cf(&format!("{}-synonyms", name)); - let _ = self.inner.drop_cf(&format!("{}-words", name)); - let _ = self.inner.drop_cf(&format!("{}-docs-words", name)); - let _ = self.inner.drop_cf(&format!("{}-documents", name)); - let _ = self.inner.drop_cf(&format!("{}-custom", name)); - let _ = self.inner.drop_cf(&format!("{}-updates", name)); - let _ = self.inner.drop_cf(&format!("{}-updates-results", name)); - cache.remove(name); - - if let Ok(mut index_list) = self.indexes() { - index_list.remove(name); - let _ = self.set_indexes(&index_list); - } - Ok(()) - } - - pub fn common_index(&self) -> Arc { - self.common.clone() - } - - pub fn checkpoint_to

(&self, path: P) -> Result<(), Error> - where P: AsRef, - { - let checkpoint = rocksdb::checkpoint::Checkpoint::new(&self.inner)?; - Ok(checkpoint.create_checkpoint(path)?) - } -} diff --git a/meilidb-data/src/database/update/documents_addition.rs b/meilidb-data/src/database/update/documents_addition.rs deleted file mode 100644 index 0ff04221c..000000000 --- a/meilidb-data/src/database/update/documents_addition.rs +++ /dev/null @@ -1,139 +0,0 @@ -use std::collections::HashSet; -use std::sync::Arc; - -use fst::{SetBuilder, set::OpBuilder}; -use sdset::{SetOperation, duo::Union}; -use serde::Serialize; - -use crate::RankedMap; -use crate::database::{Error, Index, index::Cache, apply_documents_deletion}; -use crate::indexer::Indexer; -use crate::serde::{extract_document_id, Serializer, RamDocumentStore}; - -pub struct DocumentsAddition<'a, D> { - index: &'a Index, - documents: Vec, -} - -impl<'a, D> DocumentsAddition<'a, D> { - pub fn new(index: &'a Index) -> DocumentsAddition<'a, D> { - DocumentsAddition { index, documents: Vec::new() } - } - - pub fn update_document(&mut self, document: D) { - self.documents.push(document); - } - - pub fn finalize(self) -> Result - where D: serde::Serialize - { - self.index.push_documents_addition(self.documents) - } -} - -pub fn apply_documents_addition( - index: &Index, - mut ranked_map: RankedMap, - addition: Vec, -) -> Result<(), Error> -{ - let mut document_ids = HashSet::new(); - let mut document_store = RamDocumentStore::new(); - let mut indexer = Indexer::new(); - - let schema = &index.schema(); - let identifier = schema.identifier_name(); - - for document in addition { - let document_id = match extract_document_id(identifier, &document)? { - Some(id) => id, - None => return Err(Error::MissingDocumentId), - }; - - // 1. store the document id for future deletion - document_ids.insert(document_id); - - // 2. 
index the document fields in ram stores - let serializer = Serializer { - schema, - document_store: &mut document_store, - indexer: &mut indexer, - ranked_map: &mut ranked_map, - document_id, - }; - - document.serialize(serializer)?; - } - - let ref_index = index.as_ref(); - let docs_words = ref_index.docs_words_index; - let documents = ref_index.documents_index; - let main = ref_index.main_index; - let words = ref_index.words_index; - - // 1. remove the previous documents match indexes - let documents_to_insert = document_ids.iter().cloned().collect(); - apply_documents_deletion(index, ranked_map.clone(), documents_to_insert)?; - - // 2. insert new document attributes in the database - for ((id, attr), value) in document_store.into_inner() { - documents.set_document_field(id, attr, value)?; - } - - let indexed = indexer.build(); - let mut delta_words_builder = SetBuilder::memory(); - - for (word, delta_set) in indexed.words_doc_indexes { - delta_words_builder.insert(&word).unwrap(); - - let set = match words.doc_indexes(&word)? { - Some(set) => Union::new(&set, &delta_set).into_set_buf(), - None => delta_set, - }; - - words.set_doc_indexes(&word, &set)?; - } - - for (id, words) in indexed.docs_words { - docs_words.set_doc_words(id, &words)?; - } - - let delta_words = delta_words_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap(); - - let words = match main.words_set()? 
{ - Some(words) => { - let op = OpBuilder::new() - .add(words.stream()) - .add(delta_words.stream()) - .r#union(); - - let mut words_builder = SetBuilder::memory(); - words_builder.extend_stream(op).unwrap(); - words_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap() - }, - None => delta_words, - }; - - main.set_words_set(&words)?; - main.set_ranked_map(&ranked_map)?; - - let inserted_documents_len = document_ids.len() as u64; - let number_of_documents = main.set_number_of_documents(|old| old + inserted_documents_len)?; - - // update the "consistent" view of the Index - let cache = ref_index.cache; - let words = Arc::new(words); - let synonyms = cache.synonyms.clone(); - let schema = cache.schema.clone(); - - let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents }; - index.cache.store(Arc::new(cache)); - - Ok(()) -} diff --git a/meilidb-data/src/database/update/mod.rs b/meilidb-data/src/database/update/mod.rs deleted file mode 100644 index 3d849256d..000000000 --- a/meilidb-data/src/database/update/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -mod documents_addition; -mod documents_deletion; -mod synonyms_addition; -mod synonyms_deletion; - -pub use self::documents_addition::{DocumentsAddition, apply_documents_addition}; -pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion}; -pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition}; -pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion}; diff --git a/meilidb-data/src/database/update/synonyms_addition.rs b/meilidb-data/src/database/update/synonyms_addition.rs deleted file mode 100644 index 95a650fb7..000000000 --- a/meilidb-data/src/database/update/synonyms_addition.rs +++ /dev/null @@ -1,94 +0,0 @@ -use std::collections::BTreeMap; -use std::sync::Arc; - -use fst::{SetBuilder, set::OpBuilder}; -use meilidb_core::normalize_str; -use sdset::SetBuf; - -use crate::database::{Error, Index,index::Cache}; - -pub struct 
SynonymsAddition<'a> { - index: &'a Index, - synonyms: BTreeMap>, -} - -impl<'a> SynonymsAddition<'a> { - pub fn new(index: &'a Index) -> SynonymsAddition<'a> { - SynonymsAddition { index, synonyms: BTreeMap::new() } - } - - pub fn add_synonym(&mut self, synonym: S, alternatives: I) - where S: AsRef, - T: AsRef, - I: IntoIterator, - { - let synonym = normalize_str(synonym.as_ref()); - let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase()); - self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); - } - - pub fn finalize(self) -> Result { - self.index.push_synonyms_addition(self.synonyms) - } -} - -pub fn apply_synonyms_addition( - index: &Index, - addition: BTreeMap>, -) -> Result<(), Error> -{ - let ref_index = index.as_ref(); - let synonyms = ref_index.synonyms_index; - let main = ref_index.main_index; - - let mut synonyms_builder = SetBuilder::memory(); - - for (synonym, alternatives) in addition { - synonyms_builder.insert(&synonym).unwrap(); - - let alternatives = { - let alternatives = SetBuf::from_dirty(alternatives); - let mut alternatives_builder = SetBuilder::memory(); - alternatives_builder.extend_iter(alternatives).unwrap(); - alternatives_builder.into_inner().unwrap() - }; - synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?; - } - - let delta_synonyms = synonyms_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap(); - - let synonyms = match main.synonyms_set()? 
{ - Some(synonyms) => { - let op = OpBuilder::new() - .add(synonyms.stream()) - .add(delta_synonyms.stream()) - .r#union(); - - let mut synonyms_builder = SetBuilder::memory(); - synonyms_builder.extend_stream(op).unwrap(); - synonyms_builder - .into_inner() - .and_then(fst::Set::from_bytes) - .unwrap() - }, - None => delta_synonyms, - }; - - main.set_synonyms_set(&synonyms)?; - - // update the "consistent" view of the Index - let cache = ref_index.cache; - let words = Arc::new(main.words_set()?.unwrap_or_default()); - let ranked_map = cache.ranked_map.clone(); - let synonyms = Arc::new(synonyms); - let schema = cache.schema.clone(); - let number_of_documents = cache.number_of_documents; - - let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents }; - index.cache.store(Arc::new(cache)); - - Ok(()) -} diff --git a/meilidb-data/src/document_attr_key.rs b/meilidb-data/src/document_attr_key.rs deleted file mode 100644 index 2664e6e9a..000000000 --- a/meilidb-data/src/document_attr_key.rs +++ /dev/null @@ -1,69 +0,0 @@ -use meilidb_core::DocumentId; -use meilidb_schema::SchemaAttr; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct DocumentAttrKey { - pub document_id: DocumentId, - pub attribute: SchemaAttr, -} - -impl DocumentAttrKey { - pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey { - DocumentAttrKey { document_id, attribute } - } - - pub fn to_be_bytes(self) -> [u8; 10] { - let mut output = [0u8; 10]; - - let document_id = self.document_id.0.to_be_bytes(); - let attribute = self.attribute.0.to_be_bytes(); - - unsafe { - use std::{mem::size_of, ptr::copy_nonoverlapping}; - - let output = output.as_mut_ptr(); - copy_nonoverlapping(document_id.as_ptr(), output, size_of::()); - - let output = output.add(size_of::()); - copy_nonoverlapping(attribute.as_ptr(), output, size_of::()); - } - - output - } - - pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey { - let document_id; - let 
attribute; - - unsafe { - use std::ptr::read_unaligned; - - let pointer = bytes.as_ptr() as *const _; - let document_id_bytes = read_unaligned(pointer); - document_id = u64::from_be_bytes(document_id_bytes); - - let pointer = pointer.add(1) as *const _; - let attribute_bytes = read_unaligned(pointer); - attribute = u16::from_be_bytes(attribute_bytes); - } - - DocumentAttrKey { - document_id: DocumentId(document_id), - attribute: SchemaAttr(attribute), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn to_from_be_bytes() { - let document_id = DocumentId(67578308); - let schema_attr = SchemaAttr(3456); - let x = DocumentAttrKey::new(document_id, schema_attr); - - assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes())); - } -} diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs deleted file mode 100644 index 8d313c855..000000000 --- a/meilidb-data/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -mod cf_tree; -mod database; -mod document_attr_key; -mod indexer; -mod number; -mod ranked_map; -mod serde; - -pub use self::cf_tree::{CfTree, CfIter}; -pub use self::database::{ - Database, Index, CustomSettingsIndex, RankingOrdering, - StopWords, RankingOrder, DistinctField, RankingRules, - UpdateType, DetailedDuration, UpdateResult, UpdateStatus, - Error, -}; -pub use self::number::Number; -pub use self::ranked_map::RankedMap; -pub use self::serde::{compute_document_id, extract_document_id, value_to_string}; - -pub type RocksDbResult = Result; diff --git a/meilidb-data/tests/common.rs b/meilidb-data/tests/common.rs deleted file mode 100644 index 8d0aadda7..000000000 --- a/meilidb-data/tests/common.rs +++ /dev/null @@ -1,15 +0,0 @@ -use meilidb_data::{Database}; -use meilidb_data::Index; -use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED}; - -pub fn simple_index() -> Index { - let tmp_dir = tempfile::tempdir().unwrap(); - let database = Database::open(&tmp_dir).unwrap(); - - let mut builder = SchemaBuilder::with_identifier("objectId"); 
- builder.new_attribute("objectId", DISPLAYED | INDEXED); - builder.new_attribute("title", DISPLAYED | INDEXED); - let schema = builder.build(); - - database.create_index("hello", schema).unwrap() -} diff --git a/meilidb-data/tests/custom_settings_index.rs b/meilidb-data/tests/custom_settings_index.rs deleted file mode 100644 index 9363be9d8..000000000 --- a/meilidb-data/tests/custom_settings_index.rs +++ /dev/null @@ -1,43 +0,0 @@ -#[macro_use] extern crate maplit; - -mod common; - -use big_s::S; -use meilidb_data::RankingOrdering; - -#[test] -fn stop_words() { - let index = common::simple_index(); - let stop_words = hashset!{ S("le"), S("la"), S("les"), }; - index.custom_settings().set_stop_words(&stop_words).unwrap(); - let ret_stop_words = index.custom_settings().get_stop_words().unwrap().unwrap(); - assert_eq!(ret_stop_words, stop_words); -} - -#[test] -fn ranking_order() { - let index = common::simple_index(); - let ranking_order = vec![S("SumOfTypos"), S("NumberOfWords"), S("WordsProximity"), S("SumOfWordsAttribute"), S("SumOfWordsPosition"), S("Exact"), S("DocumentId")]; - index.custom_settings().set_ranking_order(&ranking_order).unwrap(); - let ret_ranking_orderer = index.custom_settings().get_ranking_order().unwrap().unwrap(); - assert_eq!(ret_ranking_orderer, ranking_order); -} - -#[test] -fn distinct_field() { - let index = common::simple_index(); - let distinct_field = S("title"); - index.custom_settings().set_distinct_field(&distinct_field).unwrap(); - let ret_distinct_field = index.custom_settings().get_distinct_field().unwrap().unwrap(); - assert_eq!(ret_distinct_field, distinct_field); -} - -#[test] -fn ranking_rules() { - let index = common::simple_index(); - let ranking_rules = hashmap!{ S("objectId") => RankingOrdering::Asc }; - index.custom_settings().set_ranking_rules(&ranking_rules).unwrap(); - let ret_ranking_rules = index.custom_settings().get_ranking_rules().unwrap().unwrap(); - assert_eq!(ret_ranking_rules, ranking_rules); -} - diff --git 
a/meilidb-data/tests/database.rs b/meilidb-data/tests/database.rs deleted file mode 100644 index b716b2ea7..000000000 --- a/meilidb-data/tests/database.rs +++ /dev/null @@ -1,67 +0,0 @@ -#[macro_use] extern crate maplit; - -mod common; - -use std::sync::atomic::{AtomicBool, Ordering::Relaxed}; -use std::sync::Arc; - -use big_s::S; -use serde_json::json; - -#[test] -fn database_stats() { - let index = common::simple_index(); - let as_been_updated = Arc::new(AtomicBool::new(false)); - - let as_been_updated_clone = as_been_updated.clone(); - index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed)); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - let stats = index.stats().unwrap(); - let repartition = hashmap!{ - S("objectId") => 1u64, - S("title") => 1u64, - }; - assert_eq!(stats.number_of_documents, 1); - assert_eq!(stats.documents_fields_repartition, repartition); - - let doc2 = json!({ "objectId": 456, "title": "world" }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc2); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - let stats = index.stats().unwrap(); - let repartition = hashmap!{ - S("objectId") => 2u64, - S("title") => 2u64, - }; - assert_eq!(stats.number_of_documents, 2); - assert_eq!(stats.documents_fields_repartition, repartition); - - - let doc3 = json!({ "objectId": 789 }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc3); - let update_id = addition.finalize().unwrap(); - let status = 
index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - let stats = index.stats().unwrap(); - let repartition = hashmap!{ - S("objectId") => 3u64, - S("title") => 2u64, - }; - assert_eq!(stats.number_of_documents, 3); - assert_eq!(stats.documents_fields_repartition, repartition); -} diff --git a/meilidb-data/tests/index.rs b/meilidb-data/tests/index.rs deleted file mode 100644 index a8aee6c5b..000000000 --- a/meilidb-data/tests/index.rs +++ /dev/null @@ -1,148 +0,0 @@ -mod common; - -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering::Relaxed}; -use std::sync::Arc; - -use serde_json::json; - -#[test] -fn insert_delete_document() { - let index = common::simple_index(); - let as_been_updated = Arc::new(AtomicBool::new(false)); - - let as_been_updated_clone = as_been_updated.clone(); - index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed)); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - assert_eq!(index.number_of_documents(), 1); - - let docs = index.query_builder().query("hello", 0..10).unwrap(); - assert_eq!(docs.len(), 1); - assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1)); - - let mut deletion = index.documents_deletion(); - deletion.delete_document(&doc1).unwrap(); - let update_id = deletion.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - assert_eq!(index.number_of_documents(), 0); - - let docs = index.query_builder().query("hello", 0..10).unwrap(); - assert_eq!(docs.len(), 0); -} - -#[test] -fn replace_document() { - 
let index = common::simple_index(); - let as_been_updated = Arc::new(AtomicBool::new(false)); - - let as_been_updated_clone = as_been_updated.clone(); - index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed)); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - let doc2 = json!({ "objectId": 123, "title": "coucou" }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - assert_eq!(index.number_of_documents(), 1); - - let docs = index.query_builder().query("hello", 0..10).unwrap(); - assert_eq!(docs.len(), 1); - assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1)); - - let mut addition = index.documents_addition(); - addition.update_document(&doc2); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - assert!(as_been_updated.swap(false, Relaxed)); - assert!(status.result.is_ok()); - assert_eq!(index.number_of_documents(), 1); - - let docs = index.query_builder().query("hello", 0..10).unwrap(); - assert_eq!(docs.len(), 0); - - let docs = index.query_builder().query("coucou", 0..10).unwrap(); - assert_eq!(docs.len(), 1); - assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2)); -} - -#[test] -fn documents_ids() { - let index = common::simple_index(); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - let doc2 = json!({ "objectId": 456, "title": "world" }); - let doc3 = json!({ "objectId": 789 }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - addition.update_document(&doc2); - addition.update_document(&doc3); - let update_id = addition.finalize().unwrap(); - let status = index.update_status_blocking(update_id).unwrap(); - 
assert!(status.result.is_ok()); - - let documents_ids_count = index.documents_ids().unwrap().count(); - assert_eq!(documents_ids_count, 3); -} - -#[test] -fn current_update_id() { - let index = common::simple_index(); - let update_id = Arc::new(AtomicU64::new(0)); - - let update_id_cloned = update_id.clone(); - let index_cloned = index.clone(); - index.set_update_callback(move |_| { - let current_update_id = index_cloned.current_update_id().unwrap().unwrap(); - assert_eq!(current_update_id, update_id_cloned.load(Relaxed)); - }); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - update_id.store(addition.finalize().unwrap(), Relaxed); -} - -#[test] -fn nest_updates_in_queue() { - let index = common::simple_index(); - - index.set_update_callback(move |_| { - std::thread::sleep(std::time::Duration::from_secs(15)); - }); - - let doc1 = json!({ "objectId": 123, "title": "hello" }); - let doc2 = json!({ "objectId": 456, "title": "world" }); - let doc3 = json!({ "objectId": 789 }); - - let mut addition = index.documents_addition(); - addition.update_document(&doc1); - let _ = addition.finalize().unwrap(); - - let mut addition = index.documents_addition(); - addition.update_document(&doc2); - let _ = addition.finalize().unwrap(); - - let mut addition = index.documents_addition(); - addition.update_document(&doc3); - let _ = addition.finalize().unwrap(); - - let should_have_in_queue_updates = vec![1, 2, 3]; - - let in_queue_updates = index.enqueued_updates_ids().unwrap(); - assert_eq!(in_queue_updates, should_have_in_queue_updates); - -} diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml deleted file mode 100644 index 00c33f091..000000000 --- a/meilidb/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -edition = "2018" -name = "meilidb" -version = "0.3.1" -authors = ["Kerollmops "] - -[dependencies] -meilidb-core = { path = "../meilidb-core", version = "0.1.0" } 
-meilidb-data = { path = "../meilidb-data", version = "0.1.0" } -meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" } - -[dev-dependencies] -csv = "1.0.7" -diskus = "0.5.0" -env_logger = "0.6.1" -indexmap = { version = "1.1.0", features = ["serde-1"] } -jemallocator = "0.3.2" -meilidb-core = { path = "../meilidb-core", version = "0.1.0" } -quickcheck = "0.9.0" -rand = "0.7.2" -rand_xorshift = "0.2.0" -rustyline = { version = "5.0.0", default-features = false } -serde = { version = "1.0.91" , features = ["derive"] } -serde_json = "1.0.39" -structopt = "0.3.2" -sysinfo = "0.9.5" -tempfile = "3.0.7" -termcolor = "1.0.4" -toml = "0.5.3" diff --git a/meilidb/examples/create-database.rs b/meilidb/examples/create-database.rs deleted file mode 100644 index 1311000d9..000000000 --- a/meilidb/examples/create-database.rs +++ /dev/null @@ -1,215 +0,0 @@ -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -use std::collections::{HashMap, HashSet}; -use std::io::{self, BufRead, BufReader}; -use std::path::{Path, PathBuf}; -use std::time::Instant; -use std::error::Error; -use std::fs::{self, File}; - -use diskus::Walk; -use sysinfo::{SystemExt, ProcessExt}; -use serde::{Serialize, Deserialize}; -use structopt::StructOpt; - -use meilidb_data::Database; -use meilidb_schema::Schema; - -#[derive(Debug, StructOpt)] -pub struct Opt { - /// The destination where the database must be created. - #[structopt(parse(from_os_str))] - pub database_path: PathBuf, - - /// The csv file to index. - #[structopt(parse(from_os_str))] - pub csv_data_path: PathBuf, - - /// The path to the schema. - #[structopt(long = "schema", parse(from_os_str))] - pub schema_path: PathBuf, - - /// The file with the synonyms. - #[structopt(long = "synonyms", parse(from_os_str))] - pub synonyms: Option, - - /// The path to the list of stop words (one by line). 
- #[structopt(long = "stop-words", parse(from_os_str))] - pub stop_words: Option, - - #[structopt(long = "update-group-size")] - pub update_group_size: Option, -} - -#[derive(Serialize, Deserialize)] -struct Document ( - HashMap -); - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum Synonym { - OneWay(SynonymOneWay), - MultiWay { synonyms: Vec }, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct SynonymOneWay { - pub search_terms: String, - pub synonyms: Synonyms, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum Synonyms { - Multiple(Vec), - Single(String), -} - -fn read_synomys(path: &Path) -> Result, Box> { - let file = File::open(path)?; - let synonyms = serde_json::from_reader(file)?; - Ok(synonyms) -} - -fn index( - schema: Schema, - database_path: &Path, - csv_data_path: &Path, - update_group_size: Option, - stop_words: &HashSet, - synonyms: Vec, -) -> Result> -{ - let database = Database::open(database_path)?; - - let mut wtr = csv::Writer::from_path("./stats.csv").unwrap(); - wtr.write_record(&["NumberOfDocuments", "DiskUsed", "MemoryUsed"])?; - - let mut system = sysinfo::System::new(); - - let index = database.create_index("test", schema.clone())?; - - let mut synonyms_adder = index.synonyms_addition(); - for synonym in synonyms { - match synonym { - Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => { - let alternatives = match synonyms { - Synonyms::Multiple(alternatives) => alternatives, - Synonyms::Single(alternative) => vec![alternative], - }; - synonyms_adder.add_synonym(search_terms, alternatives); - }, - Synonym::MultiWay { mut synonyms } => { - for _ in 0..synonyms.len() { - if let Some((synonym, alternatives)) = synonyms.split_first() { - synonyms_adder.add_synonym(synonym, alternatives); - } - synonyms.rotate_left(1); - } - }, - } - } - synonyms_adder.finalize()?; - - let mut rdr = 
csv::Reader::from_path(csv_data_path)?; - let mut raw_record = csv::StringRecord::new(); - let headers = rdr.headers()?.clone(); - - let mut i = 0; - let mut end_of_file = false; - - while !end_of_file { - let mut update = index.documents_addition(); - - loop { - end_of_file = !rdr.read_record(&mut raw_record)?; - if end_of_file { break } - - let document: Document = match raw_record.deserialize(Some(&headers)) { - Ok(document) => document, - Err(e) => { - eprintln!("{:?}", e); - continue; - } - }; - - update.update_document(document); - - print!("\rindexing document {}", i); - i += 1; - - if let Some(group_size) = update_group_size { - if i % group_size == 0 { break } - } - } - - println!(); - - println!("committing update..."); - update.finalize()?; - - // write stats - let directory_size = Walk::new(&[database_path.to_owned()], 4).run(); - system.refresh_all(); - let pid = sysinfo::get_current_pid()?; - let memory = system.get_process(pid).unwrap().memory(); // in kb - wtr.write_record(&[i.to_string(), directory_size.to_string(), memory.to_string()])?; - wtr.flush()?; - } - - Ok(database) -} - -fn retrieve_stop_words(path: &Path) -> io::Result> { - let f = File::open(path)?; - let reader = BufReader::new(f); - let mut words = HashSet::new(); - - for line in reader.lines() { - let line = line?; - let word = line.trim().to_string(); - words.insert(word); - } - - Ok(words) -} - -fn main() -> Result<(), Box> { - let _ = env_logger::init(); - let opt = Opt::from_args(); - - let schema = { - let string = fs::read_to_string(&opt.schema_path)?; - toml::from_str(&string)? 
- }; - - let stop_words = match opt.stop_words { - Some(ref path) => retrieve_stop_words(path)?, - None => HashSet::new(), - }; - - let synonyms = match opt.synonyms { - Some(ref path) => read_synomys(path)?, - None => Vec::new(), - }; - - let start = Instant::now(); - let result = index( - schema, - &opt.database_path, - &opt.csv_data_path, - opt.update_group_size, - &stop_words, - synonyms, - ); - - if let Err(e) = result { - return Err(e.into()) - } - - println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path); - Ok(()) -} diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs deleted file mode 100644 index 89ed6b07e..000000000 --- a/meilidb/src/lib.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod sort_by_attr; - -pub use self::sort_by_attr::SortByAttr;