use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind};
use sdset::SetBuf;

use crate::{DocIndex, DocumentId};
use crate::FstSetCow;

const WORD_LENGTH_LIMIT: usize = 80;

type Word = Vec<u8>; // TODO make it be a SmallVec
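
/// Raw text indexer: tokenizes text with the configured `Analyzer` and
/// accumulates, for every word, the document positions where it appears,
/// and for every document, the list of its words.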
pub struct RawIndexer<'a, A> {
    word_limit: usize, // the maximum number of indexed words
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
    analyzer: Analyzer<'a, A>,
}
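
/// Final result of an indexing pass: for every word the sorted set of
/// `DocIndex` entries, and for every document the fst set of its words.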
pub struct Indexed<'a> {
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}

impl<'a, A> RawIndexer<'a, A>
where
    A: AsRef<[u8]>,
{
    pub fn new(stop_words: &'a fst::Set<A>) -> RawIndexer<'a, A> {
        RawIndexer::with_word_limit(stop_words, 1000)
    }

    pub fn with_word_limit(stop_words: &'a fst::Set<A>, limit: usize) -> RawIndexer<'a, A> {
        RawIndexer {
            word_limit: limit,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
            analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)),
        }
    }
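
    /// Tokenizes `text` and indexes every word under the given document id
    /// and attribute position; returns the number of tokens processed.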
    pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
        let mut number_of_words = 0;

        let analyzed_text = self.analyzer.analyze(text);
        for (token_pos, (word_pos, token)) in process_tokens(analyzed_text.tokens()).enumerate() {
            let must_continue = index_token(
                token,
                word_pos,
                token_pos,
                id,
                indexed_pos,
                self.word_limit,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            number_of_words += 1;

            if !must_continue {
                break;
            }
        }

        number_of_words
    }
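
    /// Indexes a sequence of text fragments as one continuous stream of
    /// words, carrying byte and word offsets over from fragment to fragment.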
    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
    where
        I: IntoIterator<Item = &'s str>,
    {
        let mut byte_offset = 0;
        let mut word_offset = 0;

        for text in text_iter.into_iter() {
            let current_byte_offset = byte_offset;
            let current_word_offset = word_offset;

            let analyzed_text = self.analyzer.analyze(text);
            let tokens = process_tokens(analyzed_text.tokens())
                .map(|(i, mut t)| {
                    t.byte_start += current_byte_offset;
                    t.byte_end += current_byte_offset;
                    (i + current_word_offset, t)
                })
                .enumerate();

            for (token_pos, (word_pos, token)) in tokens {
                word_offset = word_pos + 1;
                byte_offset = token.byte_end + 1;
                let must_continue = index_token(
                    token,
                    word_pos,
                    token_pos,
                    id,
                    indexed_pos,
                    self.word_limit,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );
                if !must_continue {
                    break;
                }
            }
        }
    }
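
    /// Consumes the indexer and freezes the accumulated data: the document
    /// indexes of each word are deduplicated into sorted sets, and each
    /// document's words are compiled into an fst set.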
    pub fn build(self) -> Indexed<'static> {
        let words_doc_indexes = self
            .words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self
            .docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();
                words.dedup();
                let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap();
                (id, fst)
            })
            .collect();

        Indexed {
            words_doc_indexes,
            docs_words,
        }
    }
}
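
/// Attaches a relative word position to every token and keeps only word
/// tokens. The position advances by 1 between consecutive words and by 8
/// across a hard separator, so that words from different sentences are
/// treated as far apart.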
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
        .scan((0, None), |(offset, prev_kind), token| {
            match token.kind {
                TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
                    *offset += match *prev_kind {
                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                        Some(_) => 1,
                        None => 0,
                    };
                    *prev_kind = Some(token.kind)
                }
                TokenKind::Separator(SeparatorKind::Hard) => {
                    *prev_kind = Some(token.kind);
                }
                TokenKind::Separator(SeparatorKind::Soft)
                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
                {
                    *prev_kind = Some(token.kind);
                }
                _ => (),
            }
            Some((*offset, token))
        })
        .filter(|(_, t)| t.is_word())
}
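
/// Records a single token for document `id` in both maps. Returns `false`
/// when indexing of the current text should stop: the word limit was
/// reached or a position no longer fits in a `DocIndex`. Words longer than
/// `WORD_LENGTH_LIMIT` bytes are skipped but do not stop indexing.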
fn index_token(
    token: Token,
    word_pos: usize,
    token_pos: usize,
    id: DocumentId,
    indexed_pos: IndexedPos,
    word_limit: usize,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
    if token_pos >= word_limit {
        return false;
    }

    if !token.is_stopword() {
        match token_to_docindex(id, indexed_pos, &token, word_pos) {
            Some(docindex) => {
                let word = Vec::from(token.word.as_ref());
                if word.len() <= WORD_LENGTH_LIMIT {
                    words_doc_indexes
                        .entry(word.clone())
                        .or_insert_with(Vec::new)
                        .push(docindex);
                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
                }
            }
            None => return false,
        }
    }

    true
}
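
/// Converts a token into a `DocIndex`, returning `None` when any of the
/// positions no longer fits in the `u16` fields of `DocIndex`.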
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
    let word_index = u16::try_from(word_index).ok()?;
    let char_index = u16::try_from(token.byte_start).ok()?;
    let char_length = u16::try_from(token.word.len()).ok()?;

    let docindex = DocIndex {
        document_id: id,
        attribute: indexed_pos.0,
        word_index,
        char_index,
        char_length,
    };

    Some(docindex)
}

#[cfg(test)]
mod tests {
    use super::*;

    use meilisearch_schema::IndexedPos;
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
    use fst::Set;

    #[test]
    fn test_process_token() {
        let text = "為一包含一千多萬目詞的帶標記平衡語料庫";
        let stopwords = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
        let analyzer = analyzer.analyze(text);
        let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
        assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]);
    }

    #[test]
    fn strange_apostrophe() {
        let stop_words = fst::Set::default();
        let mut indexer = RawIndexer::new(&stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
    }

    #[test]
    fn strange_apostrophe_in_sequence() {
        let stop_words = fst::Set::default();
        let mut indexer = RawIndexer::new(&stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
    }

    #[test]
    fn basic_stop_words() {
        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
        let stop_words = fst::Set::from_iter(stop_words).unwrap();
        let mut indexer = RawIndexer::new(&stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
    }

    #[test]
    fn no_empty_unidecode() {
        let stop_words = fst::Set::default();
        let mut indexer = RawIndexer::new(&stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "🇯🇵";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes
            .get(&"🇯🇵".to_owned().into_bytes())
            .is_some());
    }

    #[test]
    // test sample from 807
    fn very_long_text() {
        let stop_words = fst::Set::default();
        let mut indexer = RawIndexer::new(&stop_words);

        let indexed_pos = IndexedPos(0);
        let docid = DocumentId(0);
        let text = "The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests. \n Each entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means “requests for example.com/”, while '/admin' means “requests for example.com/admin”. If multiple blocks could match an incoming request then the most-specific will apply. \n web:locations:'/':# Rules for all requests that don't otherwise match....'/sites/default/files':# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally: \n web:locations:'/':passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example: \n web:locations:'/':root:'web'passthru:'/app.php'This block will serve requests to / from the web directory in the application, and if a file doesn’t exist on disk then the request will be forwarded to the /app.php script. \n A full list of the possible subkeys for locations is below. \n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported. \n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below. \n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location. \n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). Times can be suffixed with “ms” (milliseconds), “s” (seconds), “m” (minutes), “h” (hours), “d” (days), “w” (weeks), “M” (months, 30d) or “y” (years, 365d). \n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP. \n allow: Whether to allow serving files which don’t match a rule (true or false, default: true). \n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. Responses from the application aren’t affected, to avoid overlap with the application’s own ability to include custom headers in the response. \n rules: Specific overrides";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some());
    }

    #[test]
    fn words_over_index_1000_not_indexed() {
        let stop_words = fst::Set::default();
        let mut indexer = RawIndexer::new(&stop_words);

        let indexed_pos = IndexedPos(0);
        let docid = DocumentId(0);
        let mut text = String::with_capacity(5000);
        for _ in 0..1000 {
            text.push_str("less ");
        }
        text.push_str("more");
        indexer.index_text(docid, indexed_pos, &text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&"less".to_owned().into_bytes()).is_some());
        assert!(words_doc_indexes.get(&"more".to_owned().into_bytes()).is_none());
    }
}