mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge #639
639: Reduce the size of the word_pair_proximity database r=loiclec a=loiclec # Pull Request ## What does this PR do? Fixes #634 Now, the value corresponding to the key `prox word1 word2` in the `word_pair_proximity_docids` database contains the ids of the documents in which: - `word1` is followed by `word2` - the minimum number of words between `word1` and `word2` is `prox-1` Before this PR, the `word_pair_proximity_docids` had keys with the format `word1 word2 prox` and the value contained the ids of the documents in which either: - `word1` is followed by `word2` after a minimum of `prox-1` words in between them - `word2` is followed by `word1` after a minimum of `prox-2` words As a consequence of this change, calls such as: ``` let docids = word_pair_proximity_docids.get(rtxn, (word1, word2, prox)); ``` have to be replaced with: ``` let docids1 = word_pair_proximity_docids.get(rtxn, (prox, word1, word2)) ; let docids2 = word_pair_proximity_docids.get(rtxn, (prox-1, word2, word1)) ; let docids = docids1 | docids2; ``` ## Phrase search The PR also fixes two bugs in the `resolve_phrase` function. The first bug is that a phrase containing the same word twice would always return zero documents (e.g. `"dog eats dog"`). The second bug occurs with a phrase such as `"fox is smarter than a dog"` and the document with the text: ``` fox or dog? a fox is smarter than a dog ``` In that case, the phrase search would not return the documents because: * we only have the key `fox dog 2` in `word_pair_proximity_docids` * but the implementation of `resolve_phrase` looks for `fox dog 5`, which returns 0 documents ### New implementation of `resolve_phrase` Given the phrase: ``` fox is smarter than a dog ``` We select the document ids corresponding to all of the following keys in `word_pair_proximity_docids`: - `1 fox is` - `1 is smarter` - `1 smarter than` - (etc.) - `1 fox smarter` OR `2 fox smarter` - `1 is than` OR `2 is than` - ... 
- `1 than dog` OR `2 than dog` ## Benchmark Results Indexing: ``` group indexing_main_d94339a8 indexing_word-pair-proximity-docids-refactor_2983dd8e ----- ---------------------- ----------------------------------------------------- indexing/-geo-delete-facetedNumber-facetedGeo-searchable- 1.19 40.7±11.28ms ? ?/sec 1.00 34.3±4.16ms ? ?/sec indexing/-movies-delete-facetedString-facetedNumber-searchable- 1.62 11.3±3.77ms ? ?/sec 1.00 7.0±1.56ms ? ?/sec indexing/-movies-delete-facetedString-facetedNumber-searchable-nested- 1.00 12.5±2.62ms ? ?/sec 1.07 13.4±4.24ms ? ?/sec indexing/-songs-delete-facetedString-facetedNumber-searchable- 1.26 50.2±12.63ms ? ?/sec 1.00 39.8±20.25ms ? ?/sec indexing/-wiki-delete-searchable- 1.83 269.1±16.11ms ? ?/sec 1.00 146.8±6.12ms ? ?/sec indexing/Indexing geo_point 1.00 47.2±0.46s ? ?/sec 1.00 47.3±0.56s ? ?/sec indexing/Indexing movies in three batches 1.42 12.7±0.13s ? ?/sec 1.00 9.0±0.07s ? ?/sec indexing/Indexing movies with default settings 1.40 10.2±0.07s ? ?/sec 1.00 7.3±0.06s ? ?/sec indexing/Indexing nested movies with default settings 1.22 7.8±0.11s ? ?/sec 1.00 6.4±0.13s ? ?/sec indexing/Indexing nested movies without any facets 1.24 7.3±0.07s ? ?/sec 1.00 5.9±0.06s ? ?/sec indexing/Indexing songs in three batches with default settings 1.14 47.6±0.67s ? ?/sec 1.00 41.8±0.63s ? ?/sec indexing/Indexing songs with default settings 1.13 44.1±0.74s ? ?/sec 1.00 38.9±0.76s ? ?/sec indexing/Indexing songs without any facets 1.19 42.0±0.66s ? ?/sec 1.00 35.2±0.48s ? ?/sec indexing/Indexing songs without faceted numbers 1.20 44.3±1.40s ? ?/sec 1.00 37.0±0.48s ? ?/sec indexing/Indexing wiki 1.39 862.9±9.95s ? ?/sec 1.00 622.6±27.11s ? ?/sec indexing/Indexing wiki in three batches 1.40 934.4±5.97s ? ?/sec 1.00 665.7±4.72s ? ?/sec indexing/Reindexing geo_point 1.01 15.9±0.39s ? ?/sec 1.00 15.7±0.28s ? ?/sec indexing/Reindexing movies with default settings 1.15 288.8±25.03ms ? ?/sec 1.00 250.4±2.23ms ? 
?/sec indexing/Reindexing songs with default settings 1.01 4.1±0.06s ? ?/sec 1.00 4.1±0.03s ? ?/sec indexing/Reindexing wiki 1.41 1484.7±20.59s ? ?/sec 1.00 1052.0±19.89s ? ?/sec ``` Search Wiki: <details> <pre> group search_wiki_main_d94339a8 search_wiki_word-pair-proximity-docids-refactor_2983dd8e ----- ------------------------- -------------------------------------------------------- smol-wiki-articles.csv: basic placeholder/ 1.02 25.8±0.21µs ? ?/sec 1.00 25.4±0.19µs ? ?/sec smol-wiki-articles.csv: basic with quote/"film" 1.00 441.7±2.57µs ? ?/sec 1.00 442.3±2.41µs ? ?/sec smol-wiki-articles.csv: basic with quote/"france" 1.00 357.0±2.63µs ? ?/sec 1.00 358.3±2.65µs ? ?/sec smol-wiki-articles.csv: basic with quote/"japan" 1.00 239.4±2.24µs ? ?/sec 1.00 240.2±1.82µs ? ?/sec smol-wiki-articles.csv: basic with quote/"machine" 1.00 180.3±2.40µs ? ?/sec 1.00 180.0±1.08µs ? ?/sec smol-wiki-articles.csv: basic with quote/"miles" "davis" 1.00 9.1±0.03ms ? ?/sec 1.03 9.3±0.04ms ? ?/sec smol-wiki-articles.csv: basic with quote/"mingus" 1.00 3.6±0.01ms ? ?/sec 1.03 3.7±0.02ms ? ?/sec smol-wiki-articles.csv: basic with quote/"rock" "and" "roll" 1.00 34.0±0.11ms ? ?/sec 1.03 35.1±0.13ms ? ?/sec smol-wiki-articles.csv: basic with quote/"spain" 1.00 162.0±0.88µs ? ?/sec 1.00 161.9±0.98µs ? ?/sec smol-wiki-articles.csv: basic without quote/film 1.01 164.4±1.46µs ? ?/sec 1.00 163.1±1.58µs ? ?/sec smol-wiki-articles.csv: basic without quote/france 1.00 1698.3±7.37µs ? ?/sec 1.00 1697.7±11.53µs ? ?/sec smol-wiki-articles.csv: basic without quote/japan 1.00 1154.0±23.61µs ? ?/sec 1.00 1150.7±9.27µs ? ?/sec smol-wiki-articles.csv: basic without quote/machine 1.00 524.6±3.45µs ? ?/sec 1.01 528.1±4.56µs ? ?/sec smol-wiki-articles.csv: basic without quote/miles davis 1.00 13.5±0.05ms ? ?/sec 1.02 13.8±0.05ms ? ?/sec smol-wiki-articles.csv: basic without quote/mingus 1.00 4.1±0.02ms ? ?/sec 1.03 4.2±0.01ms ? 
?/sec smol-wiki-articles.csv: basic without quote/rock and roll 1.00 49.0±0.19ms ? ?/sec 1.03 50.4±0.22ms ? ?/sec smol-wiki-articles.csv: basic without quote/spain 1.00 412.2±3.35µs ? ?/sec 1.00 412.9±2.81µs ? ?/sec smol-wiki-articles.csv: prefix search/c 1.00 383.9±2.53µs ? ?/sec 1.00 383.4±2.44µs ? ?/sec smol-wiki-articles.csv: prefix search/g 1.00 433.4±2.53µs ? ?/sec 1.00 432.8±2.52µs ? ?/sec smol-wiki-articles.csv: prefix search/j 1.00 424.3±2.05µs ? ?/sec 1.00 424.0±2.15µs ? ?/sec smol-wiki-articles.csv: prefix search/q 1.00 154.0±1.93µs ? ?/sec 1.00 153.5±1.04µs ? ?/sec smol-wiki-articles.csv: prefix search/t 1.04 658.5±91.93µs ? ?/sec 1.00 631.4±3.89µs ? ?/sec smol-wiki-articles.csv: prefix search/x 1.00 446.2±2.09µs ? ?/sec 1.00 445.6±3.13µs ? ?/sec smol-wiki-articles.csv: proximity/april paris 1.02 3.4±0.39ms ? ?/sec 1.00 3.3±0.01ms ? ?/sec smol-wiki-articles.csv: proximity/diesel engine 1.00 1022.1±17.52µs ? ?/sec 1.00 1017.7±8.16µs ? ?/sec smol-wiki-articles.csv: proximity/herald sings 1.01 1872.5±97.70µs ? ?/sec 1.00 1862.2±8.57µs ? ?/sec smol-wiki-articles.csv: proximity/tea two 1.00 295.2±34.91µs ? ?/sec 1.00 296.6±4.08µs ? ?/sec smol-wiki-articles.csv: typo/Disnaylande 1.00 3.4±0.51ms ? ?/sec 1.04 3.5±0.01ms ? ?/sec smol-wiki-articles.csv: typo/aritmetric 1.00 3.6±0.01ms ? ?/sec 1.00 3.7±0.01ms ? ?/sec smol-wiki-articles.csv: typo/linax 1.00 167.5±1.28µs ? ?/sec 1.00 167.1±2.65µs ? ?/sec smol-wiki-articles.csv: typo/migrosoft 1.01 217.9±1.84µs ? ?/sec 1.00 216.2±1.61µs ? ?/sec smol-wiki-articles.csv: typo/nympalidea 1.00 2.9±0.01ms ? ?/sec 1.10 3.1±0.01ms ? ?/sec smol-wiki-articles.csv: typo/phytogropher 1.00 3.0±0.23ms ? ?/sec 1.08 3.3±0.01ms ? ?/sec smol-wiki-articles.csv: typo/sisan 1.00 234.6±1.38µs ? ?/sec 1.01 235.8±1.67µs ? ?/sec smol-wiki-articles.csv: typo/the fronce 1.00 104.4±0.84µs ? ?/sec 1.00 103.9±0.81µs ? ?/sec smol-wiki-articles.csv: words/Abraham machin 1.02 675.5±4.74µs ? ?/sec 1.00 662.1±5.13µs ? 
?/sec smol-wiki-articles.csv: words/Idaho Bellevue pizza 1.02 1004.5±11.07µs ? ?/sec 1.00 989.5±13.08µs ? ?/sec smol-wiki-articles.csv: words/Kameya Tokujirō mingus monk 1.00 1650.8±10.92µs ? ?/sec 1.00 1643.2±10.77µs ? ?/sec smol-wiki-articles.csv: words/Ulrich Hensel meilisearch milli 1.00 5.4±0.03ms ? ?/sec 1.00 5.4±0.02ms ? ?/sec smol-wiki-articles.csv: words/the black saint and the sinner lady and the good doggo 1.00 32.9±0.10ms ? ?/sec 1.00 32.8±0.10ms ? ?/sec </pre> </details> Search songs: <details> <pre> group search_songs_main_d94339a8 search_songs_word-pair-proximity-docids-refactor_2983dd8e ----- -------------------------- --------------------------------------------------------- smol-songs.csv: asc + default/Notstandskomitee 1.00 3.0±0.01ms ? ?/sec 1.01 3.0±0.04ms ? ?/sec smol-songs.csv: asc + default/charles 1.00 2.2±0.01ms ? ?/sec 1.01 2.2±0.01ms ? ?/sec smol-songs.csv: asc + default/charles mingus 1.00 3.1±0.01ms ? ?/sec 1.01 3.1±0.01ms ? ?/sec smol-songs.csv: asc + default/david 1.00 2.9±0.01ms ? ?/sec 1.00 2.9±0.01ms ? ?/sec smol-songs.csv: asc + default/david bowie 1.00 4.5±0.02ms ? ?/sec 1.00 4.5±0.02ms ? ?/sec smol-songs.csv: asc + default/john 1.00 3.1±0.01ms ? ?/sec 1.01 3.2±0.01ms ? ?/sec smol-songs.csv: asc + default/marcus miller 1.00 5.0±0.02ms ? ?/sec 1.00 5.0±0.02ms ? ?/sec smol-songs.csv: asc + default/michael jackson 1.00 4.7±0.02ms ? ?/sec 1.00 4.7±0.02ms ? ?/sec smol-songs.csv: asc + default/tamo 1.00 1463.4±12.17µs ? ?/sec 1.01 1481.5±8.83µs ? ?/sec smol-songs.csv: asc + default/thelonious monk 1.00 4.4±0.01ms ? ?/sec 1.00 4.4±0.02ms ? ?/sec smol-songs.csv: asc/Notstandskomitee 1.01 2.6±0.01ms ? ?/sec 1.00 2.6±0.01ms ? ?/sec smol-songs.csv: asc/charles 1.00 473.6±3.70µs ? ?/sec 1.01 476.8±22.17µs ? ?/sec smol-songs.csv: asc/charles mingus 1.01 780.1±3.90µs ? ?/sec 1.00 773.6±4.60µs ? ?/sec smol-songs.csv: asc/david 1.00 757.6±4.50µs ? ?/sec 1.00 760.7±5.20µs ? ?/sec smol-songs.csv: asc/david bowie 1.00 1131.2±8.68µs ? 
?/sec 1.00 1130.7±8.36µs ? ?/sec smol-songs.csv: asc/john 1.00 668.9±6.48µs ? ?/sec 1.00 669.9±2.78µs ? ?/sec smol-songs.csv: asc/marcus miller 1.00 959.8±7.10µs ? ?/sec 1.00 958.9±4.72µs ? ?/sec smol-songs.csv: asc/michael jackson 1.01 1076.7±16.73µs ? ?/sec 1.00 1070.8±7.34µs ? ?/sec smol-songs.csv: asc/tamo 1.00 70.4±0.55µs ? ?/sec 1.00 70.5±0.51µs ? ?/sec smol-songs.csv: asc/thelonious monk 1.01 2.9±0.01ms ? ?/sec 1.00 2.9±0.01ms ? ?/sec smol-songs.csv: basic filter: <=/Notstandskomitee 1.00 162.0±0.91µs ? ?/sec 1.01 163.6±1.72µs ? ?/sec smol-songs.csv: basic filter: <=/charles 1.00 38.3±0.24µs ? ?/sec 1.01 38.7±0.31µs ? ?/sec smol-songs.csv: basic filter: <=/charles mingus 1.01 85.3±0.44µs ? ?/sec 1.00 84.6±0.47µs ? ?/sec smol-songs.csv: basic filter: <=/david 1.01 32.4±0.25µs ? ?/sec 1.00 32.1±0.24µs ? ?/sec smol-songs.csv: basic filter: <=/david bowie 1.00 68.6±0.99µs ? ?/sec 1.01 68.9±0.88µs ? ?/sec smol-songs.csv: basic filter: <=/john 1.04 26.1±0.37µs ? ?/sec 1.00 25.1±0.22µs ? ?/sec smol-songs.csv: basic filter: <=/marcus miller 1.00 76.7±0.39µs ? ?/sec 1.01 77.3±0.61µs ? ?/sec smol-songs.csv: basic filter: <=/michael jackson 1.00 95.5±0.66µs ? ?/sec 1.01 96.3±0.79µs ? ?/sec smol-songs.csv: basic filter: <=/tamo 1.03 26.2±0.36µs ? ?/sec 1.00 25.3±0.23µs ? ?/sec smol-songs.csv: basic filter: <=/thelonious monk 1.00 140.7±1.36µs ? ?/sec 1.01 142.7±0.88µs ? ?/sec smol-songs.csv: basic filter: TO/Notstandskomitee 1.00 165.4±1.25µs ? ?/sec 1.00 165.7±1.72µs ? ?/sec smol-songs.csv: basic filter: TO/charles 1.01 40.6±0.57µs ? ?/sec 1.00 40.1±0.54µs ? ?/sec smol-songs.csv: basic filter: TO/charles mingus 1.01 87.1±0.80µs ? ?/sec 1.00 86.3±0.61µs ? ?/sec smol-songs.csv: basic filter: TO/david 1.02 34.5±0.26µs ? ?/sec 1.00 33.7±0.24µs ? ?/sec smol-songs.csv: basic filter: TO/david bowie 1.00 70.6±0.38µs ? ?/sec 1.00 70.6±0.68µs ? ?/sec smol-songs.csv: basic filter: TO/john 1.02 27.5±0.77µs ? ?/sec 1.00 26.9±0.21µs ? 
?/sec smol-songs.csv: basic filter: TO/marcus miller 1.01 79.8±0.76µs ? ?/sec 1.00 79.3±1.27µs ? ?/sec smol-songs.csv: basic filter: TO/michael jackson 1.00 98.3±0.54µs ? ?/sec 1.00 98.0±0.88µs ? ?/sec smol-songs.csv: basic filter: TO/tamo 1.03 27.9±0.23µs ? ?/sec 1.00 27.1±0.32µs ? ?/sec smol-songs.csv: basic filter: TO/thelonious monk 1.00 142.5±1.36µs ? ?/sec 1.02 145.2±0.98µs ? ?/sec smol-songs.csv: basic placeholder/ 1.00 49.4±0.34µs ? ?/sec 1.00 49.3±0.45µs ? ?/sec smol-songs.csv: basic with quote/"Notstandskomitee" 1.00 190.5±1.60µs ? ?/sec 1.01 191.8±2.10µs ? ?/sec smol-songs.csv: basic with quote/"charles" 1.00 165.0±1.13µs ? ?/sec 1.01 166.0±1.39µs ? ?/sec smol-songs.csv: basic with quote/"charles" "mingus" 1.00 1149.4±15.78µs ? ?/sec 1.02 1171.1±9.95µs ? ?/sec smol-songs.csv: basic with quote/"david" 1.00 236.5±1.61µs ? ?/sec 1.00 236.9±1.73µs ? ?/sec smol-songs.csv: basic with quote/"david" "bowie" 1.00 1384.8±9.02µs ? ?/sec 1.01 1393.8±11.39µs ? ?/sec smol-songs.csv: basic with quote/"john" 1.00 358.3±4.85µs ? ?/sec 1.00 358.9±1.75µs ? ?/sec smol-songs.csv: basic with quote/"marcus" "miller" 1.00 281.4±1.79µs ? ?/sec 1.01 285.6±3.24µs ? ?/sec smol-songs.csv: basic with quote/"michael" "jackson" 1.00 1328.4±8.01µs ? ?/sec 1.00 1334.6±8.00µs ? ?/sec smol-songs.csv: basic with quote/"tamo" 1.00 528.7±3.72µs ? ?/sec 1.01 533.4±5.31µs ? ?/sec smol-songs.csv: basic with quote/"thelonious" "monk" 1.00 1223.0±7.24µs ? ?/sec 1.02 1245.7±12.04µs ? ?/sec smol-songs.csv: basic without quote/Notstandskomitee 1.00 2.8±0.01ms ? ?/sec 1.00 2.8±0.01ms ? ?/sec smol-songs.csv: basic without quote/charles 1.00 273.3±2.06µs ? ?/sec 1.01 275.9±1.76µs ? ?/sec smol-songs.csv: basic without quote/charles mingus 1.00 2.3±0.01ms ? ?/sec 1.02 2.4±0.01ms ? ?/sec smol-songs.csv: basic without quote/david 1.00 434.3±3.86µs ? ?/sec 1.01 436.7±2.47µs ? ?/sec smol-songs.csv: basic without quote/david bowie 1.00 5.6±0.02ms ? ?/sec 1.01 5.7±0.02ms ? 
?/sec smol-songs.csv: basic without quote/john 1.00 1322.5±9.98µs ? ?/sec 1.00 1321.2±17.40µs ? ?/sec smol-songs.csv: basic without quote/marcus miller 1.02 2.4±0.02ms ? ?/sec 1.00 2.4±0.01ms ? ?/sec smol-songs.csv: basic without quote/michael jackson 1.00 3.8±0.02ms ? ?/sec 1.01 3.9±0.01ms ? ?/sec smol-songs.csv: basic without quote/tamo 1.00 809.0±4.01µs ? ?/sec 1.01 819.0±6.22µs ? ?/sec smol-songs.csv: basic without quote/thelonious monk 1.00 3.8±0.02ms ? ?/sec 1.02 3.9±0.02ms ? ?/sec smol-songs.csv: big filter/Notstandskomitee 1.00 2.7±0.01ms ? ?/sec 1.01 2.8±0.01ms ? ?/sec smol-songs.csv: big filter/charles 1.00 266.5±1.34µs ? ?/sec 1.01 270.1±8.17µs ? ?/sec smol-songs.csv: big filter/charles mingus 1.00 651.0±5.40µs ? ?/sec 1.00 651.0±2.73µs ? ?/sec smol-songs.csv: big filter/david 1.00 1018.1±11.16µs ? ?/sec 1.00 1022.3±8.94µs ? ?/sec smol-songs.csv: big filter/david bowie 1.00 1912.2±11.13µs ? ?/sec 1.00 1919.8±8.30µs ? ?/sec smol-songs.csv: big filter/john 1.00 867.2±6.66µs ? ?/sec 1.01 873.3±3.44µs ? ?/sec smol-songs.csv: big filter/marcus miller 1.00 717.7±2.86µs ? ?/sec 1.01 721.5±3.89µs ? ?/sec smol-songs.csv: big filter/michael jackson 1.00 1668.4±16.76µs ? ?/sec 1.00 1667.9±10.11µs ? ?/sec smol-songs.csv: big filter/tamo 1.01 136.7±0.88µs ? ?/sec 1.00 135.5±1.22µs ? ?/sec smol-songs.csv: big filter/thelonious monk 1.03 3.1±0.02ms ? ?/sec 1.00 3.0±0.01ms ? ?/sec smol-songs.csv: desc + default/Notstandskomitee 1.00 3.0±0.01ms ? ?/sec 1.00 3.0±0.01ms ? ?/sec smol-songs.csv: desc + default/charles 1.00 1599.5±13.07µs ? ?/sec 1.01 1622.9±22.43µs ? ?/sec smol-songs.csv: desc + default/charles mingus 1.00 2.3±0.01ms ? ?/sec 1.01 2.4±0.03ms ? ?/sec smol-songs.csv: desc + default/david 1.00 5.7±0.02ms ? ?/sec 1.00 5.7±0.02ms ? ?/sec smol-songs.csv: desc + default/david bowie 1.00 9.0±0.04ms ? ?/sec 1.00 9.0±0.03ms ? ?/sec smol-songs.csv: desc + default/john 1.00 4.5±0.01ms ? ?/sec 1.00 4.5±0.02ms ? 
?/sec smol-songs.csv: desc + default/marcus miller 1.00 3.9±0.01ms ? ?/sec 1.00 3.9±0.02ms ? ?/sec smol-songs.csv: desc + default/michael jackson 1.00 6.6±0.03ms ? ?/sec 1.00 6.6±0.03ms ? ?/sec smol-songs.csv: desc + default/tamo 1.00 1472.4±10.38µs ? ?/sec 1.01 1484.2±8.07µs ? ?/sec smol-songs.csv: desc + default/thelonious monk 1.00 4.4±0.02ms ? ?/sec 1.00 4.4±0.05ms ? ?/sec smol-songs.csv: desc/Notstandskomitee 1.01 2.6±0.01ms ? ?/sec 1.00 2.6±0.01ms ? ?/sec smol-songs.csv: desc/charles 1.00 475.9±3.38µs ? ?/sec 1.00 475.9±2.64µs ? ?/sec smol-songs.csv: desc/charles mingus 1.00 775.3±4.30µs ? ?/sec 1.00 778.9±3.52µs ? ?/sec smol-songs.csv: desc/david 1.00 757.9±4.10µs ? ?/sec 1.01 763.4±3.27µs ? ?/sec smol-songs.csv: desc/david bowie 1.00 1129.0±11.87µs ? ?/sec 1.01 1135.1±8.86µs ? ?/sec smol-songs.csv: desc/john 1.00 670.2±4.38µs ? ?/sec 1.00 670.2±3.46µs ? ?/sec smol-songs.csv: desc/marcus miller 1.00 961.2±4.47µs ? ?/sec 1.00 961.9±4.03µs ? ?/sec smol-songs.csv: desc/michael jackson 1.00 1076.5±6.61µs ? ?/sec 1.00 1077.9±7.11µs ? ?/sec smol-songs.csv: desc/tamo 1.00 70.6±0.57µs ? ?/sec 1.01 71.3±0.48µs ? ?/sec smol-songs.csv: desc/thelonious monk 1.01 2.9±0.01ms ? ?/sec 1.00 2.9±0.01ms ? ?/sec smol-songs.csv: prefix search/a 1.00 1236.2±9.43µs ? ?/sec 1.00 1232.0±12.07µs ? ?/sec smol-songs.csv: prefix search/b 1.00 1090.8±9.89µs ? ?/sec 1.00 1090.8±9.43µs ? ?/sec smol-songs.csv: prefix search/i 1.00 1333.9±8.28µs ? ?/sec 1.00 1334.2±11.21µs ? ?/sec smol-songs.csv: prefix search/s 1.00 810.5±3.69µs ? ?/sec 1.00 806.6±3.50µs ? ?/sec smol-songs.csv: prefix search/x 1.00 290.5±1.88µs ? ?/sec 1.00 291.0±1.85µs ? ?/sec smol-songs.csv: proximity/7000 Danses Un Jour Dans Notre Vie 1.00 4.7±0.02ms ? ?/sec 1.00 4.7±0.02ms ? ?/sec smol-songs.csv: proximity/The Disneyland Sing-Along Chorus 1.01 5.6±0.02ms ? ?/sec 1.00 5.6±0.03ms ? ?/sec smol-songs.csv: proximity/Under Great Northern Lights 1.00 2.5±0.01ms ? ?/sec 1.00 2.5±0.01ms ? 
?/sec smol-songs.csv: proximity/black saint sinner lady 1.00 4.8±0.02ms ? ?/sec 1.00 4.8±0.02ms ? ?/sec smol-songs.csv: proximity/les dangeureuses 1960 1.00 3.2±0.01ms ? ?/sec 1.01 3.2±0.01ms ? ?/sec smol-songs.csv: typo/Arethla Franklin 1.00 388.7±5.16µs ? ?/sec 1.00 390.0±2.11µs ? ?/sec smol-songs.csv: typo/Disnaylande 1.01 2.6±0.01ms ? ?/sec 1.00 2.6±0.01ms ? ?/sec smol-songs.csv: typo/dire straights 1.00 125.9±1.22µs ? ?/sec 1.00 126.0±0.71µs ? ?/sec smol-songs.csv: typo/fear of the duck 1.00 373.7±4.25µs ? ?/sec 1.01 375.7±14.17µs ? ?/sec smol-songs.csv: typo/indochie 1.00 103.6±0.94µs ? ?/sec 1.00 103.4±0.74µs ? ?/sec smol-songs.csv: typo/indochien 1.00 155.6±1.14µs ? ?/sec 1.01 157.5±1.75µs ? ?/sec smol-songs.csv: typo/klub des loopers 1.00 160.6±2.98µs ? ?/sec 1.01 161.7±1.96µs ? ?/sec smol-songs.csv: typo/michel depech 1.00 79.4±0.54µs ? ?/sec 1.01 79.9±0.60µs ? ?/sec smol-songs.csv: typo/mongus 1.00 126.7±1.85µs ? ?/sec 1.00 126.1±0.74µs ? ?/sec smol-songs.csv: typo/stromal 1.01 132.9±0.99µs ? ?/sec 1.00 131.9±1.09µs ? ?/sec smol-songs.csv: typo/the white striper 1.00 287.8±2.88µs ? ?/sec 1.00 286.5±1.91µs ? ?/sec smol-songs.csv: typo/thelonius monk 1.00 304.2±1.49µs ? ?/sec 1.01 306.5±1.50µs ? ?/sec smol-songs.csv: words/7000 Danses / Le Baiser / je me trompe de mots 1.01 20.9±0.08ms ? ?/sec 1.00 20.7±0.07ms ? ?/sec smol-songs.csv: words/Bring Your Daughter To The Slaughter but now this is not part of the title 1.00 48.9±0.13ms ? ?/sec 1.00 48.9±0.11ms ? ?/sec smol-songs.csv: words/The Disneyland Children's Sing-Alone song 1.01 13.9±0.06ms ? ?/sec 1.00 13.8±0.07ms ? ?/sec smol-songs.csv: words/les liaisons dangeureuses 1793 1.01 3.7±0.01ms ? ?/sec 1.00 3.6±0.02ms ? ?/sec smol-songs.csv: words/seven nation mummy 1.00 1054.2±14.49µs ? ?/sec 1.00 1056.6±10.53µs ? ?/sec smol-songs.csv: words/the black saint and the sinner lady and the good doggo 1.00 58.2±0.29ms ? ?/sec 1.00 57.9±0.21ms ? 
?/sec smol-songs.csv: words/whathavenotnsuchforth and a good amount of words to pop to match the first one 1.00 66.1±0.21ms ? ?/sec 1.00 66.0±0.24ms ? ?/sec </pre> </details> Co-authored-by: Loïc Lecrenier <loic@meilisearch.com> Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
commit
d11a6e187f
@ -15,4 +15,4 @@ pub use self::roaring_bitmap_length::{
|
||||
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
||||
};
|
||||
pub use self::str_beu32_codec::StrBEU32Codec;
|
||||
pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec};
|
||||
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
@ -1,61 +1,57 @@
|
||||
use std::borrow::Cow;
|
||||
use std::str;
|
||||
|
||||
pub struct StrStrU8Codec;
|
||||
pub struct U8StrStrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
|
||||
type DItem = (&'a str, &'a str, u8);
|
||||
impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
|
||||
type DItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (n, bytes) = bytes.split_last()?;
|
||||
let (n, bytes) = bytes.split_first()?;
|
||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||
let rest = &rest[1..];
|
||||
let s2_bytes = &rest[1..];
|
||||
let s1 = str::from_utf8(s1_bytes).ok()?;
|
||||
let (_, s2_bytes) = rest.split_last()?;
|
||||
let s2 = str::from_utf8(s2_bytes).ok()?;
|
||||
Some((s1, s2, *n))
|
||||
Some((*n, s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
|
||||
type EItem = (&'a str, &'a str, u8);
|
||||
impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
|
||||
type EItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.push(*n);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
pub struct UncheckedStrStrU8Codec;
|
||||
pub struct UncheckedU8StrStrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec {
|
||||
type DItem = (&'a [u8], &'a [u8], u8);
|
||||
impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
|
||||
type DItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (n, bytes) = bytes.split_last()?;
|
||||
let (n, bytes) = bytes.split_first()?;
|
||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||
let rest = &rest[1..];
|
||||
let (_, s2_bytes) = rest.split_last()?;
|
||||
Some((s1_bytes, s2_bytes, *n))
|
||||
let s2_bytes = &rest[1..];
|
||||
Some((*n, s1_bytes, s2_bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec {
|
||||
type EItem = (&'a [u8], &'a [u8], u8);
|
||||
impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
|
||||
type EItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1);
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2);
|
||||
bytes.push(0);
|
||||
bytes.push(*n);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -21,7 +21,7 @@ use crate::{
|
||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
||||
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
||||
Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32,
|
||||
Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
|
||||
};
|
||||
|
||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||
@ -71,6 +71,7 @@ pub mod db_name {
|
||||
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
||||
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||
pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
|
||||
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||
@ -106,9 +107,11 @@ pub struct Index {
|
||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
||||
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||
pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
|
||||
pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the word and the position with the docids that corresponds to it.
|
||||
pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||
@ -138,7 +141,7 @@ impl Index {
|
||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(17);
|
||||
options.max_dbs(18);
|
||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||
|
||||
let env = options.open(path)?;
|
||||
@ -151,6 +154,8 @@ impl Index {
|
||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_prefix_pair_proximity_docids =
|
||||
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let prefix_word_pair_proximity_docids =
|
||||
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
||||
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||
@ -175,6 +180,7 @@ impl Index {
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_prefix_position_docids,
|
||||
field_id_word_count_docids,
|
||||
|
@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::heed_codec::{
|
||||
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
|
||||
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
|
||||
RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec,
|
||||
RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
|
||||
};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{
|
||||
|
@ -7,7 +7,7 @@ use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::criteria::{
|
||||
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
||||
resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
||||
};
|
||||
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
||||
use crate::{absolute_from_relative_position, FieldId, Result};
|
||||
@ -226,19 +226,7 @@ fn resolve_state(
|
||||
}
|
||||
// compute intersection on pair of words with a proximity of 0.
|
||||
Phrase(phrase) => {
|
||||
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
|
||||
for words in phrase.windows(2) {
|
||||
if let [left, right] = words {
|
||||
match ctx.word_pair_proximity_docids(left, right, 0)? {
|
||||
Some(docids) => bitmaps.push(docids),
|
||||
None => {
|
||||
bitmaps.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
candidates |= intersection_of(bitmaps.iter().collect());
|
||||
candidates |= resolve_phrase(ctx, phrase)?;
|
||||
}
|
||||
}
|
||||
parts_candidates_array.push(candidates);
|
||||
|
@ -71,6 +71,7 @@ pub trait Context<'c> {
|
||||
fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
|
||||
fn word_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
@ -83,6 +84,12 @@ pub trait Context<'c> {
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn prefix_word_pair_proximity_docids(
|
||||
&self,
|
||||
prefix: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||
fn docid_words_positions(
|
||||
@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> {
|
||||
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
|
||||
}
|
||||
|
||||
/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`].
|
||||
/// * `left, right, prox` (rightward proximity)
|
||||
/// * `right, left, prox-1` (leftward proximity)
|
||||
///
|
||||
/// ## Example
|
||||
/// For a document with the text `the good fox eats the apple`, we have:
|
||||
/// * `rightward_proximity(the, eats) = 3`
|
||||
/// * `leftward_proximity(eats, the) = 1`
|
||||
///
|
||||
/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)`
|
||||
/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing
|
||||
/// the id of this document.
|
||||
fn word_pair_overall_proximity_docids(
|
||||
ctx: &dyn Context,
|
||||
left: &str,
|
||||
right: &str,
|
||||
prox: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let rightward = ctx.word_pair_proximity_docids(left, right, prox)?;
|
||||
let leftward =
|
||||
if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None };
|
||||
if let Some(mut all) = rightward {
|
||||
if let Some(leftward) = leftward {
|
||||
all |= leftward;
|
||||
}
|
||||
Ok(Some(all))
|
||||
} else {
|
||||
Ok(leftward)
|
||||
}
|
||||
}
|
||||
|
||||
/// This function works identically to [`word_pair_overall_proximity_docids`] except that the
|
||||
/// right word is replaced by a prefix string.
|
||||
///
|
||||
/// It will return None if no documents were found or if the prefix does not exist in the
|
||||
/// `word_prefix_pair_proximity_docids` database.
|
||||
fn word_prefix_pair_overall_proximity_docids(
|
||||
ctx: &dyn Context,
|
||||
left: &str,
|
||||
prefix: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
// We retrieve the docids for the original and swapped word pairs:
|
||||
// A: word1 prefix2 proximity
|
||||
// B: prefix2 word1 proximity-1
|
||||
let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?;
|
||||
|
||||
let leftward = if proximity > 1 {
|
||||
ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if let Some(mut all) = rightward {
|
||||
if let Some(leftward) = leftward {
|
||||
all |= leftward;
|
||||
}
|
||||
Ok(Some(all))
|
||||
} else {
|
||||
Ok(leftward)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
|
||||
self.index.documents_ids(self.rtxn)
|
||||
@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
|
||||
self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right))
|
||||
}
|
||||
|
||||
fn word_prefix_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
prefix: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix))
|
||||
}
|
||||
fn prefix_word_pair_proximity_docids(
|
||||
&self,
|
||||
prefix: &str,
|
||||
right: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left, right, proximity);
|
||||
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
|
||||
self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right))
|
||||
}
|
||||
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
||||
@ -352,18 +427,31 @@ pub fn resolve_query_tree(
|
||||
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_iter = true;
|
||||
let winsize = phrase.len().min(7);
|
||||
|
||||
let winsize = phrase.len().min(3);
|
||||
for win in phrase.windows(winsize) {
|
||||
// Get all the documents with the matching distance for each word pairs.
|
||||
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
||||
for (offset, s1) in win.iter().enumerate() {
|
||||
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
|
||||
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
|
||||
Some(m) => bitmaps.push(m),
|
||||
// If there are no document for this distance, there will be no
|
||||
// results for the phrase query.
|
||||
None => return Ok(RoaringBitmap::new()),
|
||||
if dist == 0 {
|
||||
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
|
||||
Some(m) => bitmaps.push(m),
|
||||
// If there are no document for this pair, there will be no
|
||||
// results for the phrase query.
|
||||
None => return Ok(RoaringBitmap::new()),
|
||||
}
|
||||
} else {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
for dist in 0..=dist {
|
||||
if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
|
||||
bitmap |= m
|
||||
}
|
||||
}
|
||||
if bitmap.is_empty() {
|
||||
return Ok(bitmap);
|
||||
} else {
|
||||
bitmaps.push(bitmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -387,7 +475,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBit
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||
fn all_word_pair_overall_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||
ctx: &dyn Context,
|
||||
left_words: &[(T, u8)],
|
||||
right_words: &[(U, u8)],
|
||||
@ -396,9 +484,9 @@ fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _l_typo) in left_words {
|
||||
for (right, _r_typo) in right_words {
|
||||
let current_docids = ctx
|
||||
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?
|
||||
.unwrap_or_default();
|
||||
let current_docids =
|
||||
word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)?
|
||||
.unwrap_or_default();
|
||||
docids |= current_docids;
|
||||
}
|
||||
}
|
||||
@ -472,7 +560,8 @@ fn query_pair_proximity_docids(
|
||||
match (&left.kind, &right.kind) {
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||
if prefix {
|
||||
match ctx.word_prefix_pair_proximity_docids(
|
||||
match word_prefix_pair_overall_proximity_docids(
|
||||
ctx,
|
||||
left.as_str(),
|
||||
right.as_str(),
|
||||
proximity,
|
||||
@ -480,7 +569,12 @@ fn query_pair_proximity_docids(
|
||||
Some(docids) => Ok(docids),
|
||||
None => {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
all_word_pair_overall_proximity_docids(
|
||||
ctx,
|
||||
&[(left, 0)],
|
||||
&r_words,
|
||||
proximity,
|
||||
)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -495,7 +589,8 @@ fn query_pair_proximity_docids(
|
||||
if prefix {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _) in l_words {
|
||||
let current_docids = match ctx.word_prefix_pair_proximity_docids(
|
||||
let current_docids = match word_prefix_pair_overall_proximity_docids(
|
||||
ctx,
|
||||
left.as_str(),
|
||||
right.as_str(),
|
||||
proximity,
|
||||
@ -504,19 +599,24 @@ fn query_pair_proximity_docids(
|
||||
None => {
|
||||
let r_words =
|
||||
word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
all_word_pair_overall_proximity_docids(
|
||||
ctx,
|
||||
&[(left, 0)],
|
||||
&r_words,
|
||||
proximity,
|
||||
)
|
||||
}
|
||||
}?;
|
||||
docids |= current_docids;
|
||||
}
|
||||
Ok(docids)
|
||||
} else {
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||
all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||
}
|
||||
}
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
|
||||
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
}
|
||||
(
|
||||
QueryKind::Tolerant { typo: l_typo, word: left },
|
||||
@ -525,7 +625,7 @@ fn query_pair_proximity_docids(
|
||||
let l_words =
|
||||
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -552,6 +652,7 @@ pub mod test {
|
||||
exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
|
||||
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||
prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
|
||||
docid_words: HashMap<u32, Vec<String>>,
|
||||
}
|
||||
|
||||
@ -588,13 +689,22 @@ pub mod test {
|
||||
|
||||
fn word_prefix_pair_proximity_docids(
|
||||
&self,
|
||||
left: &str,
|
||||
right: &str,
|
||||
word: &str,
|
||||
prefix: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (left.to_string(), right.to_string(), proximity.into());
|
||||
let key = (word.to_string(), prefix.to_string(), proximity.into());
|
||||
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
|
||||
}
|
||||
fn prefix_word_pair_proximity_docids(
|
||||
&self,
|
||||
prefix: &str,
|
||||
word: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (prefix.to_string(), word.to_string(), proximity.into());
|
||||
Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned())
|
||||
}
|
||||
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
|
||||
&self.words_fst
|
||||
@ -708,6 +818,8 @@ pub mod test {
|
||||
|
||||
let mut word_pair_proximity_docids = HashMap::new();
|
||||
let mut word_prefix_pair_proximity_docids = HashMap::new();
|
||||
let mut prefix_word_pair_proximity_docids = HashMap::new();
|
||||
|
||||
for (lword, lcandidates) in &word_docids {
|
||||
for (rword, rcandidates) in &word_docids {
|
||||
if lword == rword {
|
||||
@ -740,15 +852,19 @@ pub mod test {
|
||||
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
|
||||
let rposition =
|
||||
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
|
||||
let key = if lposition < rposition {
|
||||
(s(lword), s(pword), (rposition - lposition) as i32)
|
||||
if lposition < rposition {
|
||||
let key = (s(lword), s(pword), (rposition - lposition) as i32);
|
||||
let docids = word_prefix_pair_proximity_docids
|
||||
.entry(key)
|
||||
.or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
} else {
|
||||
(s(lword), s(pword), (lposition - rposition + 1) as i32)
|
||||
let key = (s(lword), s(pword), (lposition - rposition) as i32);
|
||||
let docids = prefix_word_pair_proximity_docids
|
||||
.entry(key)
|
||||
.or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
};
|
||||
let docids = word_prefix_pair_proximity_docids
|
||||
.entry(key)
|
||||
.or_insert(RoaringBitmap::new());
|
||||
docids.push(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -766,6 +882,7 @@ pub mod test {
|
||||
exact_word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
docid_words,
|
||||
}
|
||||
}
|
||||
|
@ -203,7 +203,7 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
||||
right_word: &str,
|
||||
proximity: u8,
|
||||
) -> heed::Result<Option<u64>> {
|
||||
let key = (left_word, right_word, proximity);
|
||||
let key = (proximity, left_word, right_word);
|
||||
self.index
|
||||
.word_pair_proximity_docids
|
||||
.remap_data_type::<CboRoaringBitmapLenCodec>()
|
||||
|
@ -182,19 +182,28 @@ pub fn snap_docid_word_positions(index: &Index) -> String {
|
||||
}
|
||||
pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
|
||||
let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |(
|
||||
(word1, word2, proximity),
|
||||
(proximity, word1, word2),
|
||||
b,
|
||||
)| {
|
||||
&format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b))
|
||||
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
||||
});
|
||||
snap
|
||||
}
|
||||
pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
|
||||
let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
|
||||
(word1, prefix, proximity),
|
||||
(proximity, word1, prefix),
|
||||
b,
|
||||
)| {
|
||||
&format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b))
|
||||
&format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
|
||||
});
|
||||
snap
|
||||
}
|
||||
pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
|
||||
let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
|
||||
(proximity, prefix, word2),
|
||||
b,
|
||||
)| {
|
||||
&format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
|
||||
});
|
||||
snap
|
||||
}
|
||||
@ -427,6 +436,9 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, word_prefix_pair_proximity_docids) => {{
|
||||
$crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
|
||||
}};
|
||||
($index:ident, prefix_word_pair_proximity_docids) => {{
|
||||
$crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index)
|
||||
}};
|
||||
($index:ident, word_position_docids) => {{
|
||||
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
||||
}};
|
||||
|
@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_position_docids,
|
||||
@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
docid_word_positions.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
|
@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
word_pair_proximity_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_prefix_position_docids,
|
||||
facet_id_f64_docids,
|
||||
@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
|
||||
}
|
||||
|
||||
// We delete the documents ids from the word prefix pair proximity database docids
|
||||
// and remove the empty pairs too.
|
||||
let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>();
|
||||
let mut iter = db.iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&key, &docids)? };
|
||||
for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
|
||||
// We delete the documents ids from the word prefix pair proximity database docids
|
||||
// and remove the empty pairs too.
|
||||
let db = db.remap_key_type::<ByteSlice>();
|
||||
let mut iter = db.iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&key, &docids)? };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the pairs of words,
|
||||
// it is faster and use no memory to iterate over all the words pairs than
|
||||
// to compute the cartesian product of every words of the deleted documents.
|
||||
|
@ -106,17 +106,6 @@ fn document_word_positions_into_sorter(
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
|
||||
// We also compute the inverse proximity.
|
||||
let prox = prox + 1;
|
||||
if prox < MAX_DISTANCE {
|
||||
word_pair_proximity
|
||||
.entry((word.clone(), head.word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -151,11 +140,10 @@ fn document_word_positions_into_sorter(
|
||||
let mut key_buffer = Vec::new();
|
||||
for ((w1, w2), prox) in word_pair_proximity {
|
||||
key_buffer.clear();
|
||||
key_buffer.push(prox as u8);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.push(prox as u8);
|
||||
|
||||
word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
|
||||
}
|
||||
|
@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use crate::error::UserError;
|
||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||
use crate::update::{
|
||||
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
|
||||
WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
||||
self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
||||
WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, Result, RoaringBitmapCodec};
|
||||
|
||||
@ -522,12 +522,13 @@ where
|
||||
|
||||
if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
|
||||
// Run the word prefix pair proximity docids update operation.
|
||||
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
|
||||
builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
|
||||
builder.max_memory = self.indexer_config.max_memory;
|
||||
builder.execute(
|
||||
PrefixWordPairsProximityDocids::new(
|
||||
self.wtxn,
|
||||
self.index,
|
||||
self.indexer_config.chunk_compression_type,
|
||||
self.indexer_config.chunk_compression_level,
|
||||
)
|
||||
.execute(
|
||||
word_pair_proximity_docids,
|
||||
&new_prefix_fst_words,
|
||||
&common_prefix_fst_words,
|
||||
|
@ -6,10 +6,10 @@ pub use self::index_documents::{
|
||||
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
|
||||
pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
|
||||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
@ -19,9 +19,9 @@ mod delete_documents;
|
||||
mod facets;
|
||||
mod index_documents;
|
||||
mod indexer_config;
|
||||
mod prefix_word_pairs;
|
||||
mod settings;
|
||||
mod update_step;
|
||||
mod word_prefix_docids;
|
||||
mod word_prefix_pair_proximity_docids;
|
||||
mod words_prefix_position_docids;
|
||||
mod words_prefixes_fst;
|
||||
|
241
milli/src/update/prefix_word_pairs/mod.rs
Normal file
241
milli/src/update/prefix_word_pairs/mod.rs
Normal file
@ -0,0 +1,241 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
|
||||
use crate::{Index, Result};
|
||||
|
||||
mod prefix_word;
|
||||
mod word_prefix;
|
||||
|
||||
pub use prefix_word::index_prefix_word_database;
|
||||
pub use word_prefix::index_word_prefix_database;
|
||||
|
||||
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
}
|
||||
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Self {
|
||||
Self {
|
||||
wtxn,
|
||||
index,
|
||||
max_proximity: 4,
|
||||
max_prefix_length: 2,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
}
|
||||
}
|
||||
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
||||
/// database. If two words are too far from the threshold the associated documents will
|
||||
/// not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
||||
self.max_proximity = value.max(7);
|
||||
self
|
||||
}
|
||||
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
||||
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
||||
/// will not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 2.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value;
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||
pub fn execute<'a>(
|
||||
self,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &'a [String],
|
||||
common_prefix_fst_words: &[&'a [String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
index_word_prefix_database(
|
||||
self.wtxn,
|
||||
self.index.word_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids,
|
||||
self.max_proximity,
|
||||
self.max_prefix_length,
|
||||
new_word_pair_proximity_docids.clone(),
|
||||
new_prefix_fst_words,
|
||||
common_prefix_fst_words,
|
||||
del_prefix_fst_words,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
)?;
|
||||
|
||||
index_prefix_word_database(
|
||||
self.wtxn,
|
||||
self.index.word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids,
|
||||
self.max_proximity,
|
||||
self.max_prefix_length,
|
||||
new_word_pair_proximity_docids,
|
||||
new_prefix_fst_words,
|
||||
common_prefix_fst_words,
|
||||
del_prefix_fst_words,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database`
|
||||
pub fn insert_into_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
new_key: &[u8],
|
||||
new_value: &[u8],
|
||||
) -> Result<()> {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if new_key == key => {
|
||||
let val =
|
||||
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
||||
.map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
crate::error::InternalError::IndexingMergingKeys {
|
||||
process: "get-put-merge",
|
||||
}
|
||||
})?;
|
||||
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
||||
unsafe { iter.put_current(new_key, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
||||
// but it uses `append` if the database is empty, and it assumes that the values in the
|
||||
// writer don't conflict with values in the database.
|
||||
pub fn write_into_lmdb_database_without_merging(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
writer: grenad::Writer<std::fs::File>,
|
||||
) -> Result<()> {
|
||||
let file = writer.into_inner()?;
|
||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||
if database.is_empty(wtxn)? {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
// safety: the key comes from the grenad reader, not the database
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
} else {
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;

    /// Builds a document made of a single `text` field.
    fn text_document(text: &str) -> crate::Object {
        serde_json::json!({ "text": text }).as_object().unwrap().clone()
    }

    /// For every prefix, generates 50 documents whose text is the prefix followed by a
    /// distinct hexadecimal suffix.
    fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
        prefixes
            .iter()
            .flat_map(|prefix| (0..50).map(move |i| text_document(&format!("{prefix}{i:x}"))))
            .collect()
    }

    #[test]
    fn test_update() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        // Serializes a list of objects into an in-memory documents batch.
        let batch_reader_from_documents = |objects: Vec<crate::Object>| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in objects {
                builder.append_json_object(&object).unwrap();
            }
            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
        };

        // First batch: documents generated for the prefixes "a" and "be"...
        let mut first_batch = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
        // ...plus some documents where the text should populate the
        // word_prefix_pair_proximity_docids database.
        first_batch.push(text_document("At an amazing and beautiful house"));
        first_batch.push(text_document("The bell rings at 5 am"));

        index.add_documents(batch_reader_from_documents(first_batch)).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");

        // Second batch: documents generated for the prefixes "am" and "an".
        let mut second_batch = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
        second_batch.push(text_document("At an extraordinary house"));

        index.add_documents(batch_reader_from_documents(second_batch)).unwrap();

        db_snap!(index, word_pair_proximity_docids, "update");
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }
}
|
182
milli/src/update/prefix_word_pairs/prefix_word.rs
Normal file
182
milli/src/update/prefix_word_pairs/prefix_word.rs
Normal file
@ -0,0 +1,182 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesDecode;
|
||||
use log::debug;
|
||||
|
||||
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||
use crate::update::prefix_word_pairs::{
|
||||
insert_into_database, write_into_lmdb_database_without_merging,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
#[logging_timer::time]
|
||||
pub fn index_prefix_word_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Result<()> {
|
||||
let max_proximity = max_proximity - 1;
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
let common_prefixes: Vec<_> = common_prefix_fst_words
|
||||
.into_iter()
|
||||
.map(|s| s.into_iter())
|
||||
.flatten()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length)
|
||||
.collect();
|
||||
|
||||
for proximity in 1..max_proximity {
|
||||
for prefix in common_prefixes.iter() {
|
||||
let mut prefix_key = vec![];
|
||||
prefix_key.push(proximity);
|
||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
|
||||
// This is the core of the algorithm
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
proximity,
|
||||
prefix.as_bytes(),
|
||||
// the next two arguments tell how to iterate over the new word pairs
|
||||
&mut cursor,
|
||||
|cursor| {
|
||||
if let Some((key, value)) = cursor.next()? {
|
||||
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
|
||||
.ok_or(heed::Error::Decoding)?;
|
||||
Ok(Some((word2, value)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
},
|
||||
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|
||||
|key, value| {
|
||||
insert_into_database(
|
||||
wtxn,
|
||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||
key,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||
let new_prefixes: Vec<_> = new_prefix_fst_words
|
||||
.into_iter()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length)
|
||||
.collect();
|
||||
|
||||
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
||||
// element in an intermediary grenad
|
||||
let mut writer =
|
||||
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
|
||||
|
||||
for proximity in 1..max_proximity {
|
||||
for prefix in new_prefixes.iter() {
|
||||
let mut prefix_key = vec![];
|
||||
prefix_key.push(proximity);
|
||||
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||
let mut db_iter = word_pair_proximity_docids
|
||||
.as_polymorph()
|
||||
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
|
||||
.remap_key_type::<UncheckedU8StrStrCodec>();
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
proximity,
|
||||
prefix.as_bytes(),
|
||||
&mut db_iter,
|
||||
|db_iter| {
|
||||
db_iter
|
||||
.next()
|
||||
.transpose()
|
||||
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
|
||||
.map_err(|e| e.into())
|
||||
},
|
||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||
)?;
|
||||
drop(db_iter);
|
||||
}
|
||||
}
|
||||
|
||||
// and then we write the grenad into the DB
|
||||
// Since the grenad contains only new prefixes, we know in advance that none
|
||||
// of its elements already exist in the DB, thus there is no need to specify
|
||||
// how to merge conflicting elements
|
||||
write_into_lmdb_database_without_merging(
|
||||
wtxn,
|
||||
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||
writer,
|
||||
)?;
|
||||
|
||||
// All of the word prefix pairs in the database that have a w2
|
||||
// that is contained in the `suppr_pw` set must be removed as well.
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter =
|
||||
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
|
||||
///
|
||||
/// Its arguments are:
|
||||
/// - an iterator over the words following the given `prefix` with the given `proximity`
|
||||
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
|
||||
fn execute_on_word_pairs_and_prefixes<I>(
|
||||
proximity: u8,
|
||||
prefix: &[u8],
|
||||
iter: &mut I,
|
||||
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
|
||||
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
|
||||
|
||||
// Memory usage check:
|
||||
// The content of the loop will be called for each `word2` that follows a word beginning
|
||||
// with `prefix` with the given proximity.
|
||||
// In practice, I don't think the batch can ever get too big.
|
||||
while let Some((word2, docids)) = next_word2_and_docids(iter)? {
|
||||
let entry = batch.entry(word2.to_owned()).or_default();
|
||||
entry.push(Cow::Owned(docids.to_owned()));
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::with_capacity(512);
|
||||
key_buffer.push(proximity);
|
||||
key_buffer.extend_from_slice(prefix);
|
||||
key_buffer.push(0);
|
||||
|
||||
let mut value_buffer = Vec::with_capacity(65_536);
|
||||
|
||||
for (word2, docids) in batch {
|
||||
key_buffer.truncate(prefix.len() + 2);
|
||||
value_buffer.clear();
|
||||
|
||||
key_buffer.extend_from_slice(&word2);
|
||||
let data = if docids.len() > 1 {
|
||||
CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
|
||||
value_buffer.as_slice()
|
||||
} else {
|
||||
&docids[0]
|
||||
};
|
||||
insert(key_buffer.as_slice(), data)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 a [101, ]
|
||||
1 amazing a [100, ]
|
||||
1 an a [100, ]
|
||||
1 and b [100, ]
|
||||
1 and be [100, ]
|
||||
1 at a [100, ]
|
||||
1 rings a [101, ]
|
||||
1 the b [101, ]
|
||||
1 the be [101, ]
|
||||
2 amazing b [100, ]
|
||||
2 amazing be [100, ]
|
||||
2 an a [100, ]
|
||||
2 at a [100, 101, ]
|
||||
2 bell a [101, ]
|
||||
3 an b [100, ]
|
||||
3 an be [100, ]
|
||||
3 at a [100, ]
|
||||
3 rings a [101, ]
|
||||
3 the a [101, ]
|
||||
4 at b [100, ]
|
||||
4 at be [100, ]
|
||||
4 bell a [101, ]
|
||||
|
@ -0,0 +1,29 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 a 5 [101, ]
|
||||
1 a amazing [100, ]
|
||||
1 a an [100, 202, ]
|
||||
1 a and [100, ]
|
||||
1 a beautiful [100, ]
|
||||
1 a extraordinary [202, ]
|
||||
1 am and [100, ]
|
||||
1 an amazing [100, ]
|
||||
1 an beautiful [100, ]
|
||||
1 an extraordinary [202, ]
|
||||
1 b house [100, ]
|
||||
1 b rings [101, ]
|
||||
1 be house [100, ]
|
||||
1 be rings [101, ]
|
||||
2 a am [101, ]
|
||||
2 a amazing [100, ]
|
||||
2 a and [100, ]
|
||||
2 a beautiful [100, ]
|
||||
2 a extraordinary [202, ]
|
||||
2 a house [100, 202, ]
|
||||
2 am beautiful [100, ]
|
||||
2 an and [100, ]
|
||||
2 an house [100, 202, ]
|
||||
2 b at [101, ]
|
||||
2 be at [101, ]
|
||||
|
@ -0,0 +1,39 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 am [101, ]
|
||||
1 amazing and [100, ]
|
||||
1 an amazing [100, ]
|
||||
1 an extraordinary [202, ]
|
||||
1 and beautiful [100, ]
|
||||
1 at 5 [101, ]
|
||||
1 at an [100, 202, ]
|
||||
1 beautiful house [100, ]
|
||||
1 bell rings [101, ]
|
||||
1 extraordinary house [202, ]
|
||||
1 rings at [101, ]
|
||||
1 the bell [101, ]
|
||||
2 amazing beautiful [100, ]
|
||||
2 an and [100, ]
|
||||
2 an house [202, ]
|
||||
2 and house [100, ]
|
||||
2 at am [101, ]
|
||||
2 at amazing [100, ]
|
||||
2 at extraordinary [202, ]
|
||||
2 bell at [101, ]
|
||||
2 rings 5 [101, ]
|
||||
2 the rings [101, ]
|
||||
3 amazing house [100, ]
|
||||
3 an beautiful [100, ]
|
||||
3 at and [100, ]
|
||||
3 at house [202, ]
|
||||
3 bell 5 [101, ]
|
||||
3 rings am [101, ]
|
||||
3 the at [101, ]
|
||||
4 an house [100, ]
|
||||
4 at beautiful [100, ]
|
||||
4 bell am [101, ]
|
||||
4 the 5 [101, ]
|
||||
5 at house [100, ]
|
||||
5 the am [101, ]
|
||||
|
@ -0,0 +1,35 @@
|
||||
---
|
||||
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||
---
|
||||
1 5 a [101, ]
|
||||
1 5 am [101, ]
|
||||
1 amazing a [100, ]
|
||||
1 amazing an [100, ]
|
||||
1 an a [100, ]
|
||||
1 an am [100, ]
|
||||
1 and b [100, ]
|
||||
1 and be [100, ]
|
||||
1 at a [100, 202, ]
|
||||
1 at an [100, 202, ]
|
||||
1 rings a [101, ]
|
||||
1 the b [101, ]
|
||||
1 the be [101, ]
|
||||
2 amazing b [100, ]
|
||||
2 amazing be [100, ]
|
||||
2 an a [100, ]
|
||||
2 an an [100, ]
|
||||
2 at a [100, 101, ]
|
||||
2 at am [100, 101, ]
|
||||
2 bell a [101, ]
|
||||
3 an b [100, ]
|
||||
3 an be [100, ]
|
||||
3 at a [100, ]
|
||||
3 at an [100, ]
|
||||
3 rings a [101, ]
|
||||
3 rings am [101, ]
|
||||
3 the a [101, ]
|
||||
4 at b [100, ]
|
||||
4 at be [100, ]
|
||||
4 bell a [101, ]
|
||||
4 bell am [101, ]
|
||||
|
@ -1,7 +1,6 @@
|
||||
/*!
|
||||
## What is WordPrefixPairProximityDocids?
|
||||
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
||||
the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of
|
||||
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
||||
the documents which contain `word` followed by another word starting with
|
||||
`prefix` at a distance of `proximity`.
|
||||
|
||||
@ -23,127 +22,100 @@ dog
|
||||
Note that only prefixes which correspond to more than a certain number of
|
||||
different words from the database are included in this list.
|
||||
|
||||
* a sorted list of word pairs and the distance between them (i.e. proximity),
|
||||
* associated with a roaring bitmap, such as:
|
||||
* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
|
||||
associated with a roaring bitmap, such as:
|
||||
```text
|
||||
good dog 3 -> docids1: [2, 5, 6]
|
||||
good doggo 1 -> docids2: [8]
|
||||
good dogma 1 -> docids3: [7, 19, 20]
|
||||
good ghost 2 -> docids4: [1]
|
||||
horror cathedral 4 -> docids5: [1, 2]
|
||||
1 good doggo -> docids1: [8]
|
||||
1 good door -> docids2: [7, 19, 20]
|
||||
1 good ghost -> docids3: [1]
|
||||
2 good dog -> docids4: [2, 5, 6]
|
||||
2 horror cathedral -> docids5: [1, 2]
|
||||
```
|
||||
|
||||
I illustrate a simplified version of the algorithm to create the word-prefix
|
||||
pair-proximity database below:
|
||||
|
||||
1. **Outer loop:** First, we iterate over each word pair and its proximity:
|
||||
1. **Outer loop:** First, we iterate over each proximity and word pair:
|
||||
```text
|
||||
proximity: 1
|
||||
word1 : good
|
||||
word2 : dog
|
||||
proximity: 3
|
||||
word2 : doggo
|
||||
```
|
||||
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
|
||||
in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`)
|
||||
in the list of sorted prefixes. And we insert the key `prefix`
|
||||
and the value (`docids`) to a sorted map which we call the “batch”. For example,
|
||||
at the end of the first inner loop, we may have:
|
||||
```text
|
||||
Outer loop 1:
|
||||
------------------------------
|
||||
proximity: 1
|
||||
word1 : good
|
||||
word2 : dog
|
||||
proximity: 3
|
||||
word2 : doggo
|
||||
docids : docids1
|
||||
|
||||
prefixes: [d, do, dog]
|
||||
|
||||
batch: [
|
||||
(d, 3) -> [docids1]
|
||||
(do, 3) -> [docids1]
|
||||
(dog, 3) -> [docids1]
|
||||
d -> [docids1]
|
||||
do -> [docids1]
|
||||
dog -> [docids1]
|
||||
]
|
||||
```
|
||||
3. For illustration purpose, let's run through a second iteration of the outer loop:
|
||||
```text
|
||||
Outer loop 2:
|
||||
------------------------------
|
||||
word1 : good
|
||||
word2 : doggo
|
||||
proximity: 1
|
||||
word1 : good
|
||||
word2 : door
|
||||
docids : docids2
|
||||
|
||||
prefixes: [d, do, dog]
|
||||
prefixes: [d, do, doo]
|
||||
|
||||
batch: [
|
||||
(d, 1) -> [docids2]
|
||||
(d, 3) -> [docids1]
|
||||
(do, 1) -> [docids2]
|
||||
(do, 3) -> [docids1]
|
||||
(dog, 1) -> [docids2]
|
||||
(dog, 3) -> [docids1]
|
||||
]
|
||||
```
|
||||
Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some
|
||||
of the elements inserted in the second iteration of the outer loop appear
|
||||
*before* elements from the first iteration.
|
||||
|
||||
4. And a third:
|
||||
```text
|
||||
Outer loop 3:
|
||||
------------------------------
|
||||
word1 : good
|
||||
word2 : dogma
|
||||
proximity: 1
|
||||
docids : docids3
|
||||
|
||||
prefixes: [d, do, dog]
|
||||
|
||||
batch: [
|
||||
(d, 1) -> [docids2, docids3]
|
||||
(d, 3) -> [docids1]
|
||||
(do, 1) -> [docids2, docids3]
|
||||
(do, 3) -> [docids1]
|
||||
(dog, 1) -> [docids2, docids3]
|
||||
(dog, 3) -> [docids1]
|
||||
d -> [docids1, docids2]
|
||||
do -> [docids1, docids2]
|
||||
dog -> [docids1]
|
||||
doo -> [docids2]
|
||||
]
|
||||
```
|
||||
Notice that there were some conflicts which were resolved by merging the
|
||||
conflicting values together.
|
||||
conflicting values together. Also, an additional prefix was added at the
|
||||
end of the batch.
|
||||
|
||||
5. On the fourth iteration of the outer loop, we have:
|
||||
4. On the third iteration of the outer loop, we have:
|
||||
```text
|
||||
Outer loop 4:
|
||||
------------------------------
|
||||
proximity: 1
|
||||
word1 : good
|
||||
word2 : ghost
|
||||
proximity: 2
|
||||
```
|
||||
Because `word2` begins with a different letter than the previous `word2`,
|
||||
we know that:
|
||||
|
||||
1. All the prefixes of `word2` are greater than the prefixes of the previous word2
|
||||
2. And therefore, every instance of (`word2`, `prefix`) will be greater than
|
||||
any element in the batch.
|
||||
we know that all the prefixes of `word2` are greater than the prefixes of the previous `word2`.
|
||||
|
||||
Therefore, we know that we can insert every element from the batch into the
|
||||
database before proceeding any further. This operation is called
|
||||
“flushing the batch”. Flushing the batch should also be done whenever `word1`
|
||||
is different than the previous `word1`.
|
||||
“flushing the batch”. Flushing the batch should also be done whenever:
|
||||
* `proximity` is different than the previous `proximity`.
|
||||
* `word1` is different than the previous `word1`.
|
||||
* `word2` starts with a different letter than the previous `word2`.
|
||||
|
||||
6. **Flushing the batch:** to flush the batch, we look at the `word1` and
|
||||
iterate over the elements of the batch in sorted order:
|
||||
6. **Flushing the batch:** to flush the batch, we iterate over its elements:
|
||||
```text
|
||||
Flushing Batch loop 1:
|
||||
------------------------------
|
||||
word1 : good
|
||||
word2 : d
|
||||
proximity: 1
|
||||
proximity : 1
|
||||
word1 : good
|
||||
prefix : d
|
||||
|
||||
docids : [docids2, docids3]
|
||||
```
|
||||
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
|
||||
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
|
||||
roaring bitmap of all the document ids where `word1` is followed by `prefix`
|
||||
at a distance of `proximity`.
|
||||
Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids`
|
||||
Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
|
||||
into the database.
|
||||
|
||||
7. That's it! ... except...
|
||||
@ -166,7 +138,7 @@ inputs described above, which come from different places:
|
||||
|
||||
2. `word_pairs_db`, which is the list of word pairs from the database.
|
||||
This list includes all elements in `new_word_pairs` since `new_word_pairs`
|
||||
was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute`
|
||||
was added to the database prior to calling the `WordPrefix::execute`
|
||||
function.
|
||||
|
||||
To update the prefix database correctly, we call the algorithm described earlier first
|
||||
@ -184,199 +156,146 @@ Note, also, that since we read data from the database when iterating over
|
||||
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
|
||||
docids from the batch directly into the database (we would have a concurrent
|
||||
reader and writer). Therefore, when calling the algorithm on
|
||||
(`new_prefixes`, `word_pairs_db`), we insert the computed
|
||||
((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad
|
||||
`(new_prefixes, word_pairs_db)`, we insert the computed
|
||||
`((proximity, word, prefix), docids)` elements in an intermediary grenad
|
||||
Writer instead of the DB. At the end of the outer loop, we finally read from
|
||||
the grenad and insert its elements in the database.
|
||||
|
||||
|
||||
|
||||
*/
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesDecode;
|
||||
use log::debug;
|
||||
|
||||
use crate::update::index_documents::{
|
||||
create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
|
||||
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||
use crate::update::prefix_word_pairs::{
|
||||
insert_into_database, write_into_lmdb_database_without_merging,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec};
|
||||
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
#[logging_timer::time]
|
||||
pub fn index_word_prefix_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
}
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Result<()> {
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
WordPrefixPairProximityDocids {
|
||||
wtxn,
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
max_proximity: 4,
|
||||
max_prefix_length: 2,
|
||||
}
|
||||
}
|
||||
// Make a prefix trie from the common prefixes that are at most `max_prefix_length` bytes long
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
common_prefix_fst_words
|
||||
.into_iter()
|
||||
.map(|s| s.into_iter())
|
||||
.flatten()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length),
|
||||
);
|
||||
|
||||
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
||||
/// database. If two words are too far from the threshold the associated documents will
|
||||
/// not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
||||
self.max_proximity = value.max(7);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
||||
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
||||
/// will not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 2.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value;
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||
pub fn execute<'a>(
|
||||
self,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &'a [String],
|
||||
common_prefix_fst_words: &[&'a [String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
) -> Result<()> {
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
// Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
common_prefix_fst_words
|
||||
.iter()
|
||||
.flat_map(|s| s.iter())
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= self.max_prefix_length),
|
||||
);
|
||||
|
||||
// If the prefix trie is not empty, then we can iterate over all new
|
||||
// word pairs to look for new (word1, common_prefix, proximity) elements
|
||||
// to insert in the DB
|
||||
if !prefixes.is_empty() {
|
||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||
// This is the core of the algorithm
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
// the first two arguments tell how to iterate over the new word pairs
|
||||
&mut cursor,
|
||||
|cursor| {
|
||||
if let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key)
|
||||
.ok_or(heed::Error::Decoding)?;
|
||||
Ok(Some(((word1, word2, proximity), value)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
},
|
||||
&prefixes,
|
||||
self.max_proximity,
|
||||
// and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
|
||||
|key, value| {
|
||||
insert_into_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
key,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
}
|
||||
|
||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
new_prefix_fst_words
|
||||
.iter()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= self.max_prefix_length),
|
||||
);
|
||||
|
||||
if !prefixes.is_empty() {
|
||||
let mut db_iter = self
|
||||
.index
|
||||
.word_pair_proximity_docids
|
||||
.remap_key_type::<UncheckedStrStrU8Codec>()
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.iter(self.wtxn)?;
|
||||
|
||||
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
||||
// element in an intermediary grenad
|
||||
let mut writer = create_writer(
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
&mut db_iter,
|
||||
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
||||
&prefixes,
|
||||
self.max_proximity,
|
||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||
)?;
|
||||
drop(db_iter);
|
||||
|
||||
// and then we write the grenad into the DB
|
||||
// Since the grenad contains only new prefixes, we know in advance that none
|
||||
// of its elements already exist in the DB, thus there is no need to specify
|
||||
// how to merge conflicting elements
|
||||
write_into_lmdb_database_without_merging(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
writer,
|
||||
)?;
|
||||
}
|
||||
|
||||
// All of the word prefix pairs in the database that have a w2
|
||||
// that is contained in the `suppr_pw` set must be removed as well.
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter = self
|
||||
.index
|
||||
.word_prefix_pair_proximity_docids
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.iter_mut(self.wtxn)?;
|
||||
while let Some(((_, w2, _), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(w2.as_bytes()) {
|
||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||
unsafe { iter.del_current()? };
|
||||
// If the prefix trie is not empty, then we can iterate over all new
|
||||
// word pairs to look for new (proximity, word1, common_prefix) elements
|
||||
// to insert in the DB
|
||||
if !prefixes.is_empty() {
|
||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||
// This is the core of the algorithm
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
// the first two arguments tell how to iterate over the new word pairs
|
||||
&mut cursor,
|
||||
|cursor| {
|
||||
if let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (proximity, word1, word2) =
|
||||
UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
||||
Ok(Some(((proximity, word1, word2), value)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
},
|
||||
&prefixes,
|
||||
max_proximity,
|
||||
// and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
|
||||
|key, value| {
|
||||
insert_into_database(
|
||||
wtxn,
|
||||
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
key,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
}
|
||||
|
||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
new_prefix_fst_words
|
||||
.into_iter()
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length),
|
||||
);
|
||||
|
||||
if !prefixes.is_empty() {
|
||||
let mut db_iter = word_pair_proximity_docids
|
||||
.remap_key_type::<UncheckedU8StrStrCodec>()
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.iter(wtxn)?;
|
||||
|
||||
// Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
|
||||
// element in an intermediary grenad
|
||||
let mut writer =
|
||||
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
|
||||
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
&mut db_iter,
|
||||
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
||||
&prefixes,
|
||||
max_proximity,
|
||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||
)?;
|
||||
drop(db_iter);
|
||||
|
||||
// and then we write the grenad into the DB
|
||||
// Since the grenad contains only new prefixes, we know in advance that none
|
||||
// of its elements already exist in the DB, thus there is no need to specify
|
||||
// how to merge conflicting elements
|
||||
write_into_lmdb_database_without_merging(
|
||||
wtxn,
|
||||
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
writer,
|
||||
)?;
|
||||
}
|
||||
|
||||
// All of the word prefix pairs in the database that have a w2
|
||||
// that is contained in the `suppr_pw` set must be removed as well.
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter =
|
||||
word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||
while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
||||
///
|
||||
/// Its main arguments are:
|
||||
/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements
|
||||
/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
|
||||
/// 2. a prefix trie
|
||||
/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
|
||||
/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
|
||||
///
|
||||
/// For more information about what this function does, read the module documentation.
|
||||
fn execute_on_word_pairs_and_prefixes<I>(
|
||||
@ -384,7 +303,7 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
||||
mut next_word_pair_proximity: impl for<'a> FnMut(
|
||||
&'a mut I,
|
||||
) -> Result<
|
||||
Option<((&'a [u8], &'a [u8], u8), &'a [u8])>,
|
||||
Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
|
||||
>,
|
||||
prefixes: &PrefixTrieNode,
|
||||
max_proximity: u8,
|
||||
@ -402,10 +321,10 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
||||
let mut prefix_buffer = Vec::with_capacity(8);
|
||||
let mut merge_buffer = Vec::with_capacity(65_536);
|
||||
|
||||
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
|
||||
// skip this iteration if the proximity is over the threshold
|
||||
while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
|
||||
// stop indexing if the proximity is over the threshold
|
||||
if proximity > max_proximity {
|
||||
continue;
|
||||
break;
|
||||
};
|
||||
let word2_start_different_than_prev = word2[0] != prev_word2_start;
|
||||
// if there were no potential prefixes for the previous word2 based on its first letter,
|
||||
@ -415,16 +334,21 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
||||
continue;
|
||||
}
|
||||
|
||||
// if word1 is different than the previous word1 OR if the start of word2 is different
|
||||
// than the previous start of word2, then we'll need to flush the batch
|
||||
// if the proximity is different to the previous one, OR
|
||||
// if word1 is different than the previous word1, OR
|
||||
// if the start of word2 is different than the previous start of word2,
|
||||
// THEN we'll need to flush the batch
|
||||
let prox_different_than_prev = proximity != batch.proximity;
|
||||
let word1_different_than_prev = word1 != batch.word1;
|
||||
if word1_different_than_prev || word2_start_different_than_prev {
|
||||
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
|
||||
{
|
||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||
// don't forget to reset the value of batch.word1 and prev_word2_start
|
||||
if word1_different_than_prev {
|
||||
prefix_search_start.0 = 0;
|
||||
batch.word1.clear();
|
||||
batch.word1.extend_from_slice(word1);
|
||||
batch.proximity = proximity;
|
||||
}
|
||||
if word2_start_different_than_prev {
|
||||
// word2_start_different_than_prev == true
|
||||
@ -436,74 +360,70 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
||||
|
||||
if !empty_prefixes {
|
||||
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
|
||||
prefix_buffer.clear();
|
||||
prefixes.for_each_prefix_of(
|
||||
word2,
|
||||
&mut prefix_buffer,
|
||||
&prefix_search_start,
|
||||
|prefix_buffer| {
|
||||
let prefix_len = prefix_buffer.len();
|
||||
prefix_buffer.push(0);
|
||||
prefix_buffer.push(proximity);
|
||||
batch.insert(prefix_buffer, data.to_vec());
|
||||
prefix_buffer.truncate(prefix_len);
|
||||
batch.insert(&prefix_buffer, data.to_vec());
|
||||
},
|
||||
);
|
||||
prefix_buffer.clear();
|
||||
}
|
||||
}
|
||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||
Ok(())
|
||||
}
|
||||
/**
|
||||
A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
||||
A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
||||
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
||||
|
||||
It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently.
|
||||
It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
|
||||
|
||||
The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content
|
||||
The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
|
||||
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
|
||||
- key : (word1, prefix, proximity) as bytes
|
||||
- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes
|
||||
- key : (proximity, word1, prefix) as bytes
|
||||
- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
|
||||
*/
|
||||
#[derive(Default)]
|
||||
struct PrefixAndProximityBatch {
|
||||
proximity: u8,
|
||||
word1: Vec<u8>,
|
||||
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
|
||||
}
|
||||
|
||||
impl PrefixAndProximityBatch {
|
||||
/// Insert the new key and value into the batch
|
||||
///
|
||||
/// The key must either exist in the batch or be greater than all existing keys
|
||||
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
|
||||
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
||||
Ok(position) => {
|
||||
self.batch[position].1.push(Cow::Owned(new_value));
|
||||
}
|
||||
Err(position) => {
|
||||
self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)]));
|
||||
}
|
||||
match self.batch.iter_mut().find(|el| el.0 == new_key) {
|
||||
Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
|
||||
None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
|
||||
}
|
||||
}
|
||||
|
||||
/// Empties the batch, calling `insert` on each element.
|
||||
///
|
||||
/// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap.
|
||||
/// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
|
||||
fn flush(
|
||||
&mut self,
|
||||
merge_buffer: &mut Vec<u8>,
|
||||
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let PrefixAndProximityBatch { word1, batch } = self;
|
||||
let PrefixAndProximityBatch { proximity, word1, batch } = self;
|
||||
if batch.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
merge_buffer.clear();
|
||||
|
||||
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1);
|
||||
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
|
||||
buffer.push(*proximity);
|
||||
buffer.extend_from_slice(word1);
|
||||
buffer.push(0);
|
||||
|
||||
for (key, mergeable_data) in batch.drain(..) {
|
||||
buffer.truncate(word1.len() + 1);
|
||||
buffer.truncate(1 + word1.len() + 1);
|
||||
buffer.extend_from_slice(key.as_slice());
|
||||
|
||||
let data = if mergeable_data.len() > 1 {
|
||||
@ -520,61 +440,6 @@ impl PrefixAndProximityBatch {
|
||||
}
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database`
|
||||
fn insert_into_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
new_key: &[u8],
|
||||
new_value: &[u8],
|
||||
) -> Result<()> {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if new_key == key => {
|
||||
let val =
|
||||
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
||||
.map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
crate::error::InternalError::IndexingMergingKeys {
|
||||
process: "get-put-merge",
|
||||
}
|
||||
})?;
|
||||
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
||||
unsafe { iter.put_current(new_key, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
||||
// but it uses `append` if the database is empty, and it assumes that the values in the
|
||||
// writer don't conflict with values in the database.
|
||||
pub fn write_into_lmdb_database_without_merging(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
writer: grenad::Writer<std::fs::File>,
|
||||
) -> Result<()> {
|
||||
let file = writer.into_inner()?;
|
||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||
if database.is_empty(wtxn)? {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
// safety: the key comes from the grenad reader, not the database
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
} else {
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
|
||||
within a set.
|
||||
|
||||
@ -619,7 +484,7 @@ impl PrefixTrieNode {
|
||||
fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
|
||||
let byte = word[0];
|
||||
if self.children[search_start.0].1 == byte {
|
||||
true
|
||||
return true;
|
||||
} else {
|
||||
match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
|
||||
Ok(position) => {
|
||||
@ -637,7 +502,7 @@ impl PrefixTrieNode {
|
||||
fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
|
||||
let mut node = PrefixTrieNode::default();
|
||||
for prefix in prefixes {
|
||||
node.insert_sorted_prefix(prefix.as_bytes().iter());
|
||||
node.insert_sorted_prefix(prefix.as_bytes().into_iter());
|
||||
}
|
||||
node
|
||||
}
|
||||
@ -701,90 +566,10 @@ impl PrefixTrieNode {
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::Cursor;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec};
|
||||
|
||||
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
|
||||
let mut documents = Vec::new();
|
||||
for prefix in prefixes {
|
||||
for i in 0..50 {
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": format!("{prefix}{i:x}"),
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
documents
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_update() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||
index.index_documents_config.autogenerate_docids = true;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let batch_reader_from_documents = |documents| {
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
for object in documents {
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||
};
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
|
||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": "At an amazing and beautiful house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": "The bell rings at 5 am"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||
|
||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": "At an extraordinary house"
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
let documents = batch_reader_from_documents(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, word_prefix_pair_proximity_docids, "update");
|
||||
}
|
||||
use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
|
||||
|
||||
fn check_prefixes(
|
||||
trie: &PrefixTrieNode,
|
||||
@ -883,58 +668,40 @@ mod tests {
|
||||
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
|
||||
|
||||
let word_pairs = [
|
||||
// 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456)
|
||||
(("healthy", "arbre", 2), &serialised_bitmap123),
|
||||
// not inserted because 3 > max_proximity
|
||||
(("healthy", "arbre", 3), &serialised_bitmap456),
|
||||
// 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123)
|
||||
(("healthy", "arbres", 1), &serialised_bitmap123),
|
||||
// 1, 3:
|
||||
(("healthy", "arbres", 2), &serialised_bitmap456),
|
||||
// not be inserted because 3 > max_proximity
|
||||
(("healthy", "arbres", 3), &serialised_bitmap789),
|
||||
// not inserted because no prefixes for boat
|
||||
(("healthy", "boat", 1), &serialised_bitmap123),
|
||||
// not inserted because no prefixes for ca
|
||||
(("healthy", "ca", 1), &serialised_bitmap123),
|
||||
// 4: (healthy cat 1) with (bitmap456 + bitmap123)
|
||||
(("healthy", "cats", 1), &serialised_bitmap456),
|
||||
// 5: (healthy cat 2) with (bitmap789 + bitmap_ranges)
|
||||
(("healthy", "cats", 2), &serialised_bitmap789),
|
||||
// 4 + 6: (healthy catto 1) with (bitmap123)
|
||||
(("healthy", "cattos", 1), &serialised_bitmap123),
|
||||
// 5 + 7: (healthy catto 2) with (bitmap_ranges)
|
||||
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
|
||||
// 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges)
|
||||
(("jittery", "cat", 1), &serialised_bitmap123),
|
||||
// 8:
|
||||
(("jittery", "cata", 1), &serialised_bitmap456),
|
||||
// 8:
|
||||
(("jittery", "catb", 1), &serialised_bitmap789),
|
||||
// 8:
|
||||
(("jittery", "catc", 1), &serialised_bitmap_ranges),
|
||||
((1, "healthy", "arbres"), &serialised_bitmap123),
|
||||
((1, "healthy", "boat"), &serialised_bitmap123),
|
||||
((1, "healthy", "ca"), &serialised_bitmap123),
|
||||
((1, "healthy", "cats"), &serialised_bitmap456),
|
||||
((1, "healthy", "cattos"), &serialised_bitmap123),
|
||||
((1, "jittery", "cat"), &serialised_bitmap123),
|
||||
((1, "jittery", "cata"), &serialised_bitmap456),
|
||||
((1, "jittery", "catb"), &serialised_bitmap789),
|
||||
((1, "jittery", "catc"), &serialised_bitmap_ranges),
|
||||
((2, "healthy", "arbre"), &serialised_bitmap123),
|
||||
((2, "healthy", "arbres"), &serialised_bitmap456),
|
||||
((2, "healthy", "cats"), &serialised_bitmap789),
|
||||
((2, "healthy", "cattos"), &serialised_bitmap_ranges),
|
||||
((3, "healthy", "arbre"), &serialised_bitmap456),
|
||||
((3, "healthy", "arbres"), &serialised_bitmap789),
|
||||
];
|
||||
|
||||
let expected_result = [
|
||||
// first batch:
|
||||
(("healthy", "arb", 1), bitmap123.clone()),
|
||||
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
|
||||
(("healthy", "arbre", 1), bitmap123.clone()),
|
||||
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
|
||||
// second batch:
|
||||
(("healthy", "cat", 1), &bitmap456 | &bitmap123),
|
||||
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
|
||||
(("healthy", "catto", 1), bitmap123.clone()),
|
||||
(("healthy", "catto", 2), bitmap_ranges.clone()),
|
||||
// third batch
|
||||
(("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
|
||||
((1, "healthy", "arb"), bitmap123.clone()),
|
||||
((1, "healthy", "arbre"), bitmap123.clone()),
|
||||
((1, "healthy", "cat"), &bitmap456 | &bitmap123),
|
||||
((1, "healthy", "catto"), bitmap123.clone()),
|
||||
((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
|
||||
((2, "healthy", "arb"), &bitmap123 | &bitmap456),
|
||||
((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
|
||||
((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
|
||||
((2, "healthy", "catto"), bitmap_ranges.clone()),
|
||||
];
|
||||
|
||||
let mut result = vec![];
|
||||
|
||||
let mut iter =
|
||||
IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| {
|
||||
((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice())
|
||||
IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
|
||||
((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
|
||||
});
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
&mut iter,
|
||||
@ -942,17 +709,17 @@ mod tests {
|
||||
&prefixes,
|
||||
2,
|
||||
|k, v| {
|
||||
let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap();
|
||||
let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
|
||||
let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
|
||||
result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
|
||||
result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
|
||||
let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x;
|
||||
let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y;
|
||||
let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
|
||||
let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;
|
||||
|
||||
assert_eq!(actual_word1, expected_word1);
|
||||
assert_eq!(actual_prefix, expected_prefix);
|
@ -1,46 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
||||
---
|
||||
5 a 1 [101, ]
|
||||
5 a 2 [101, ]
|
||||
5 b 4 [101, ]
|
||||
5 be 4 [101, ]
|
||||
am a 3 [101, ]
|
||||
amazing a 1 [100, ]
|
||||
amazing a 2 [100, ]
|
||||
amazing a 3 [100, ]
|
||||
amazing b 2 [100, ]
|
||||
amazing be 2 [100, ]
|
||||
an a 1 [100, ]
|
||||
an a 2 [100, ]
|
||||
an b 3 [100, ]
|
||||
an be 3 [100, ]
|
||||
and a 2 [100, ]
|
||||
and a 3 [100, ]
|
||||
and a 4 [100, ]
|
||||
and b 1 [100, ]
|
||||
and be 1 [100, ]
|
||||
at a 1 [100, ]
|
||||
at a 2 [100, 101, ]
|
||||
at a 3 [100, ]
|
||||
at b 3 [101, ]
|
||||
at b 4 [100, ]
|
||||
at be 3 [101, ]
|
||||
at be 4 [100, ]
|
||||
beautiful a 2 [100, ]
|
||||
beautiful a 3 [100, ]
|
||||
beautiful a 4 [100, ]
|
||||
bell a 2 [101, ]
|
||||
bell a 4 [101, ]
|
||||
house a 3 [100, ]
|
||||
house a 4 [100, ]
|
||||
house b 2 [100, ]
|
||||
house be 2 [100, ]
|
||||
rings a 1 [101, ]
|
||||
rings a 3 [101, ]
|
||||
rings b 2 [101, ]
|
||||
rings be 2 [101, ]
|
||||
the a 3 [101, ]
|
||||
the b 1 [101, ]
|
||||
the be 1 [101, ]
|
||||
|
@ -1,4 +0,0 @@
|
||||
---
|
||||
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
||||
---
|
||||
5ed4bf83317b10962a55ade353427bdd
|
Loading…
x
Reference in New Issue
Block a user