replace hashset with fst::Set

This commit is contained in:
mpostma 2020-11-26 15:17:49 +01:00 committed by many
parent 6527d3e492
commit 206308c1aa
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
4 changed files with 34 additions and 28 deletions

View File

@ -1,5 +1,5 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::ops::Range;
use std::time::Instant;
@ -175,7 +175,7 @@ where I: IntoIterator<Item=Operation>,
const MAX_NGRAM: usize = 3;
fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, String)> {
fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set<A>) -> Vec<(usize, String)> {
// TODO: use a shared analyzer instance instead of building a new one on every call
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
analyzer
@ -213,12 +213,7 @@ pub fn create_query_tree(
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
{
// TODO: use a shared analyzer instance
let words = split_query_string(query, ctx.stop_words
.stream()
.into_strs()
.unwrap_or_default()
.into_iter().
collect());
let words = split_query_string(query, &ctx.stop_words);
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));

View File

@ -14,11 +14,14 @@ const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO: make this a SmallVec
pub struct RawIndexer {
pub struct RawIndexer<'a, A>
where
A: AsRef<[u8]>
{
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
analyzer: Analyzer,
analyzer: Analyzer<'a, A>,
}
pub struct Indexed<'a> {
@ -26,17 +29,20 @@ pub struct Indexed<'a> {
pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}
impl RawIndexer {
pub fn new<A: AsRef<[u8]>>(stop_words: fst::Set<A>) -> RawIndexer {
impl<'a, A> RawIndexer<'a, A>
where
A: AsRef<[u8]>
{
pub fn new(stop_words: &'a fst::Set<A>) -> RawIndexer<'a, A> {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit<A: AsRef<[u8]>>(stop_words: fst::Set<A>, limit: usize) -> RawIndexer {
pub fn with_word_limit(stop_words: &'a fst::Set<A>, limit: usize) -> RawIndexer<A> {
RawIndexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words.stream().into_strs().unwrap().into_iter().collect()))
analyzer: Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)),
}
}
@ -231,7 +237,8 @@ mod tests {
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -250,7 +257,8 @@ mod tests {
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -272,7 +280,7 @@ mod tests {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -293,7 +301,8 @@ mod tests {
#[test]
fn no_empty_unidecode() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
@ -312,7 +321,8 @@ mod tests {
#[test]
// test sample from 807
fn very_long_text() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let text = " The locations block is the most powerful, and potentially most involved, section of the .platform.app.yaml file. It allows you to control how the application container responds to incoming requests at a very fine-grained level. Common patterns also vary between language containers due to the way PHP-FPM handles incoming requests.\nEach entry of the locations block is an absolute URI path (with leading /) and its value includes the configuration directives for how the web server should handle matching requests. That is, if your domain is example.com then '/' means &ldquo;requests for example.com/&rdquo;, while '/admin' means &ldquo;requests for example.com/admin&rdquo;. If multiple blocks could match an incoming request then the most-specific will apply.\nweb:locations:&#39;/&#39;:# Rules for all requests that don&#39;t otherwise match....&#39;/sites/default/files&#39;:# Rules for any requests that begin with /sites/default/files....The simplest possible locations configuration is one that simply passes all requests on to your application unconditionally:\nweb:locations:&#39;/&#39;:passthru:trueThat is, all requests to /* should be forwarded to the process started by web.commands.start above. Note that for PHP containers the passthru key must specify what PHP file the request should be forwarded to, and must also specify a docroot under which the file lives. For example:\nweb:locations:&#39;/&#39;:root:&#39;web&#39;passthru:&#39;/app.php&#39;This block will serve requests to / from the web directory in the application, and if a file doesn&rsquo;t exist on disk then the request will be forwarded to the /app.php script.\nA full list of the possible subkeys for locations is below.\n root: The folder from which to serve static assets for this location relative to the application root. The application root is the directory in which the .platform.app.yaml file is located. Typical values for this property include public or web. 
Setting it to '' is not recommended, and its behavior may vary depending on the type of application. Absolute paths are not supported.\n passthru: Whether to forward disallowed and missing resources from this location to the application and can be true, false or an absolute URI path (with leading /). The default value is false. For non-PHP applications it will generally be just true or false. In a PHP application this will typically be the front controller such as /index.php or /app.php. This entry works similar to mod_rewrite under Apache. Note: If the value of passthru does not begin with the same value as the location key it is under, the passthru may evaluate to another entry. That may be useful when you want different cache settings for different paths, for instance, but want missing files in all of them to map back to the same front controller. See the example block below.\n index: The files to consider when serving a request for a directory: an array of file names or null. (typically ['index.html']). Note that in order for this to work, access to the static files named must be allowed by the allow or rules keys for this location.\n expires: How long to allow static assets from this location to be cached (this enables the Cache-Control and Expires headers) and can be a time or -1 for no caching (default). Times can be suffixed with &ldquo;ms&rdquo; (milliseconds), &ldquo;s&rdquo; (seconds), &ldquo;m&rdquo; (minutes), &ldquo;h&rdquo; (hours), &ldquo;d&rdquo; (days), &ldquo;w&rdquo; (weeks), &ldquo;M&rdquo; (months, 30d) or &ldquo;y&rdquo; (years, 365d).\n scripts: Whether to allow loading scripts in that location (true or false). This directive is only meaningful on PHP.\n allow: Whether to allow serving files which don&rsquo;t match a rule (true or false, default: true).\n headers: Any additional headers to apply to static assets. This section is a mapping of header names to header values. 
Responses from the application aren&rsquo;t affected, to avoid overlap with the application&rsquo;s own ability to include custom headers in the response.\n rules: Specific overrides for a specific location. The key is a PCRE (regular expression) that is matched against the full request path.\n request_buffering: Most application servers do not support chunked requests (e.g. fpm, uwsgi), so Platform.sh enables request_buffering by default to handle them. That default configuration would look like this if it was present in .platform.app.yaml:\nweb:locations:&#39;/&#39;:passthru:truerequest_buffering:enabled:truemax_request_size:250mIf the application server can already efficiently handle chunked requests, the request_buffering subkey can be modified to disable it entirely (enabled: false). Additionally, applications that frequently deal with uploads greater than 250MB in size can update the max_request_size key to the application&rsquo;s needs. Note that modifications to request_buffering will need to be specified at each location where it is desired.\n ";
@ -325,7 +335,8 @@ mod tests {
#[test]
fn words_over_index_1000_not_indexed() {
let mut indexer = RawIndexer::new(fst::Set::default());
let stop_words = fst::Set::default();
let mut indexer = RawIndexer::new(&stop_words);
let indexed_pos = IndexedPos(0);
let docid = DocumentId(0);
let mut text = String::with_capacity(5000);

View File

@ -110,12 +110,12 @@ pub fn push_documents_addition<D: serde::Serialize>(
}
#[allow(clippy::too_many_arguments)]
fn index_document(
fn index_document<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
documents_fields: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
ranked_map: &mut RankedMap,
indexer: &mut RawIndexer,
indexer: &mut RawIndexer<A>,
schema: &Schema,
field_id: FieldId,
document_id: DocumentId,
@ -221,7 +221,7 @@ pub fn apply_addition(
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
// For each document in this update
for (document_id, document) in &documents_additions {
@ -316,7 +316,7 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
.unwrap();
let number_of_inserted_documents = documents_ids_to_reindex.len();
let mut indexer = RawIndexer::new(stop_words);
let mut indexer = RawIndexer::new(&stop_words);
let mut ram_store = HashMap::new();
if let Some(ref attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
@ -372,12 +372,12 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
Ok(())
}
pub fn write_documents_addition_index(
pub fn write_documents_addition_index<A: AsRef<[u8]>>(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer,
indexer: RawIndexer<A>,
) -> MResult<()>
{
let indexed = indexer.build();

View File

@ -12,8 +12,8 @@ use crate::serde::SerializerError;
use crate::store::DiscoverIds;
/// Returns the number of words indexed or `None` if the type is unindexable.
pub fn index_value(
indexer: &mut RawIndexer,
pub fn index_value<A: AsRef<[u8]>>(
indexer: &mut RawIndexer<A>,
document_id: DocumentId,
indexed_pos: IndexedPos,
value: &Value,