From 2d58b28f43ab45d6d0cd1e9c04f9450ec256f1d0 Mon Sep 17 00:00:00 2001
From: f3r10
Date: Wed, 19 Oct 2022 07:03:46 -0500
Subject: [PATCH] Improve script language codec

Encode a key as the script name, a single NUL byte, then the language
name, and decode it by splitting at the first NUL byte. `split_at`
leaves the separator at the start of the second half, so the decoder
skips that byte before parsing the language name.
---
 milli/src/heed_codec/script_language_codec.rs | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
index af15990ea..7e150723a 100644
--- a/milli/src/heed_codec/script_language_codec.rs
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -1,6 +1,5 @@
 use std::borrow::Cow;
-use std::mem::size_of;
 use std::str;
 
 use charabia::{Language, Script};
 
@@ -11,16 +10,11 @@ impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
     type DItem = (Script, Language);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        let footer_len = size_of::<u32>();
-
-        if bytes.len() < footer_len {
-            return None;
-        }
-
-        let (script, bytes) = bytes.split_at(bytes.len() - footer_len);
-        let script = str::from_utf8(script).ok()?;
+        let sep = bytes.iter().position(|b| *b == 0)?;
+        let (s_bytes, l_bytes) = bytes.split_at(sep);
+        let script = str::from_utf8(s_bytes).ok()?;
         let script_name = Script::from_name(script);
-        let lan = str::from_utf8(bytes).ok()?;
+        let lan = str::from_utf8(&l_bytes[1..]).ok()?; // skip the NUL separator
         let lan_name = Language::from_name(lan);
 
         Some((script_name, lan_name))
@@ -31,12 +25,13 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
     type EItem = (Script, Language);
 
     fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
-        let script_name = script.name();
-        let lan_name = lan.name();
+        let script_name = script.name().as_bytes();
+        let lan_name = lan.name().as_bytes();
 
-        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len());
-        bytes.extend_from_slice(script_name.as_bytes());
-        bytes.extend_from_slice(lan_name.as_bytes());
+        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
+        bytes.extend_from_slice(script_name);
+        bytes.push(0);
+        bytes.extend_from_slice(lan_name);
 
         Some(Cow::Owned(bytes))
     }