Switch to a JSON protocol for the front page

This commit is contained in:
Clément Renault 2020-10-21 18:26:29 +02:00
parent 5caf523fd9
commit 802e925fd7
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
10 changed files with 53 additions and 80 deletions

View file

@ -26,7 +26,7 @@ pub struct Index {
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the document id to the document as an obkv record (field index => raw field bytes).
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
impl Index {
@ -74,23 +74,15 @@ impl Index {
pub fn documents<'t>(
&self,
rtxn: &'t heed::RoTxn,
iter: impl IntoIterator<Item=DocumentId>,
) -> anyhow::Result<Vec<(DocumentId, StringRecord)>>
ids: impl IntoIterator<Item=DocumentId>,
) -> anyhow::Result<Vec<(DocumentId, obkv::KvReader<'t>)>>
{
let ids: Vec<_> = iter.into_iter().collect();
let mut content = Vec::new();
let mut documents = Vec::new();
for id in ids.iter().cloned() {
let document_content = self.documents.get(rtxn, &BEU32::new(id))?
for id in ids {
let kv = self.documents.get(rtxn, &BEU32::new(id))?
.with_context(|| format!("Could not find document {}", id))?;
content.extend_from_slice(document_content);
}
let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]);
let mut documents = Vec::with_capacity(ids.len());
for (id, result) in ids.into_iter().zip(rdr.records()) {
documents.push((id, result?));
documents.push((id, kv));
}
Ok(documents)

View file

@ -1,5 +1,5 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use std::convert::{TryFrom, TryInto};
use std::fs::File;
use std::io::Read;
use std::iter::FromIterator;
@ -204,11 +204,15 @@ impl Store {
self.insert_word_docid(word, document_id)?;
}
let record = CsvStringRecordCodec::bytes_encode(record)
.with_context(|| format!("could not encode CSV record"))?;
let mut writer = obkv::KvWriter::memory();
record.iter().enumerate().for_each(|(i, v)| {
let key = i.try_into().unwrap();
writer.insert(key, v.as_bytes()).unwrap();
});
let bytes = writer.into_inner().unwrap();
self.documents_ids.insert(document_id);
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
self.documents_writer.insert(document_id.to_be_bytes(), bytes)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
Ok(())

View file

@ -68,7 +68,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let mut wtr = csv::Writer::from_writer(io::stdout());
wtr.write_record(&headers)?;
for (_id, record) in documents {
wtr.write_record(&record)?;
wtr.write_record(record.iter().map(|(_, v)| v))?;
}
wtr.flush()?;

View file

@ -1,5 +1,6 @@
use std::collections::HashSet;
use std::fs::{File, create_dir_all};
use std::mem;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
@ -7,9 +8,10 @@ use std::sync::Arc;
use std::time::Instant;
use askama_warp::Template;
use futures::{FutureExt, StreamExt};
use futures::stream;
use futures::{FutureExt, StreamExt};
use heed::EnvOpenOptions;
use indexmap::IndexMap;
use serde::{Serialize, Deserialize};
use structopt::StructOpt;
use tokio::fs::File as TFile;
@ -56,25 +58,21 @@ pub struct Opt {
indexer: IndexerOpt,
}
// NOTE(review): this span is a diff hunk that interleaves two versions of
// `highlight_record` without +/- markers. Presumably the lines working on
// `csv::StringRecord` / `buffer` / `output_record` are the removed version and
// the lines working on `IndexMap` / `value` are the added one -- confirm
// against the commit.
//
// Both versions tokenize every field with `simple_tokenizer` and wrap each
// `TokenType::Word` token whose lowercase form is contained in `words` in
// `<mark>`...`</mark>`; every other token is copied through unchanged.
//
// `csv::StringRecord` version: builds and returns a new record, leaving the
// input untouched.
fn highlight_record(record: &csv::StringRecord, words: &HashSet<String>) -> csv::StringRecord {
let mut output_record = csv::StringRecord::new();
// Scratch buffer shared by all fields; it is cleared at the start of each field.
let mut buffer = String::new();
for field in record {
buffer.clear();
for (token_type, token) in simple_tokenizer(field) {
// `IndexMap` version: rewrites every value of `record` in place and returns nothing.
fn highlight_record(record: &mut IndexMap<String, String>, words: &HashSet<String>) {
for (_key, value) in record.iter_mut() {
// Move the current text out of the map, leaving an empty `String` in
// `value` that the loop below rebuilds token by token.
let old_value = mem::take(value);
for (token_type, token) in simple_tokenizer(&old_value) {
if token_type == TokenType::Word {
// The lookup in `words` uses the lowercase form of the token, but the
// token itself is emitted with its original casing.
let lowercase_token = token.to_lowercase();
let to_highlight = words.contains(&lowercase_token);
if to_highlight { buffer.push_str("<mark>") }
buffer.push_str(token);
if to_highlight { buffer.push_str("</mark>") }
if to_highlight { value.push_str("<mark>") }
value.push_str(token);
if to_highlight { value.push_str("</mark>") }
} else {
// Tokens that are not words are appended verbatim.
buffer.push_str(token);
value.push_str(token);
}
}
// `csv::StringRecord` version only: append the rebuilt field to the output record.
output_record.push_field(&buffer);
}
output_record
}
#[derive(Template)]
@ -327,13 +325,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
.body(include_str!("../../public/jquery-3.4.1.min.js"))
);
let dash_papaparse_route = warp::filters::method::get()
.and(warp::path!("papaparse.min.js"))
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../../public/papaparse.min.js"))
);
let dash_filesize_route = warp::filters::method::get()
.and(warp::path!("filesize.min.js"))
.map(|| Response::builder()
@ -390,32 +381,29 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
let body = match index.headers(&rtxn).unwrap() {
Some(headers) => {
let mut wtr = csv::Writer::from_writer(Vec::new());
let mut documents = Vec::new();
if let Some(headers) = index.headers(&rtxn).unwrap() {
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
let mut record = record.iter()
.map(|(key_id, value)| {
let key = headers[key_id as usize].to_owned();
let value = std::str::from_utf8(value).unwrap().to_owned();
(key, value)
})
.collect();
// We write the headers
wtr.write_record(&headers).unwrap();
let documents = index.documents(&rtxn, documents_ids).unwrap();
for (_id, record) in documents {
let record = if disable_highlighting {
record
} else {
highlight_record(&record, &found_words)
};
wtr.write_record(&record).unwrap();
if !disable_highlighting {
highlight_record(&mut record, &found_words);
}
wtr.into_inner().unwrap()
},
None => Vec::new(),
};
documents.push(record);
}
}
Response::builder()
.header("Content-Type", "text/csv")
.header("Content-Type", "application/json")
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
.body(String::from_utf8(body).unwrap())
.body(serde_json::to_string(&documents).unwrap())
});
async fn buf_stream(
@ -504,7 +492,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
.or(dash_bulma_dark_route)
.or(dash_style_route)
.or(dash_jquery_route)
.or(dash_papaparse_route)
.or(dash_filesize_route)
.or(dash_script_route)
.or(updates_script_route)