Browse Source

[enh] add checker

pull/2419/head
Alexandre Flament 2 months ago
parent
commit
8cbc9f2d58
11 changed files with 539 additions and 2 deletions
  1. +3
    -0
      Dockerfile
  2. +1
    -0
      requirements.txt
  3. +7
    -0
      searx/search/__init__.py
  4. +1
    -0
      searx/search/checker/__init__.py
  5. +51
    -0
      searx/search/checker/__main__.py
  6. +388
    -0
      searx/search/checker/impl.py
  7. +12
    -0
      searx/search/processors/abstract.py
  8. +44
    -0
      searx/search/processors/online.py
  9. +10
    -0
      searx/search/processors/online_currency.py
  10. +18
    -0
      searx/search/processors/online_dictionary.py
  11. +4
    -2
      utils/searx.sh

+ 3
- 0
Dockerfile View File

@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \
openssl-dev \
tar \
git \
protoc \
protobuf-dev \
&& apk add --no-cache \
ca-certificates \
su-exec \
@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \
uwsgi \
uwsgi-python3 \
brotli \
protobuf \
&& pip3 install --upgrade pip \
&& pip3 install --no-cache -r requirements.txt \
&& apk del build-dependencies \


+ 1
- 0
requirements.txt View File

@ -9,3 +9,4 @@ pygments==2.1.3
python-dateutil==2.8.1
pyyaml==5.3.1
requests[socks]==2.25.1
pycld3==0.20

+ 7
- 0
searx/search/__init__.py View File

@ -64,6 +64,9 @@ class EngineRef:
def __eq__(self, other):
    # Two EngineRef are equal when they reference the same engine in the
    # same category.  Assumes *other* is also an EngineRef — TODO confirm
    # no caller compares against other types.
    return self.name == other.name and self.category == other.category

def __hash__(self):
    # Keep __hash__ consistent with __eq__: hash over the same two fields.
    return hash((self.name, self.category))
class SearchQuery:
"""container for all the search parameters (query, language, etc...)"""
@ -108,6 +111,10 @@ class SearchQuery:
and self.timeout_limit == other.timeout_limit\
and self.external_bang == other.external_bang
def __hash__(self):
    # Hash over the same fields __eq__ compares; engineref_list is converted
    # to a tuple so the (unhashable) list can participate in the hash.
    return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
                 self.timeout_limit, self.external_bang))
class Search:
"""Search information container"""


+ 1
- 0
searx/search/checker/__init__.py View File

@ -0,0 +1 @@
from .impl import Checker

+ 51
- 0
searx/search/checker/__main__.py View File

@ -0,0 +1,51 @@
import sys
import searx.search
import searx.search.processors
import searx.search.checker
# ANSI escape sequences for colored terminal output.  When stdout is not a
# TTY (piped / redirected) every sequence is the empty string so the output
# stays plain text.
if sys.stdout.isatty():
    RESET_SEQ = "\033[0m"
    COLOR_SEQ = "\033[1;%dm"
    BOLD_SEQ = "\033[1m"
    # standard 8 ANSI foreground colors (codes 30..37)
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = [COLOR_SEQ % (30 + i) for i in range(8)]
else:
    RESET_SEQ = COLOR_SEQ = BOLD_SEQ = ""
    BLACK = RED = GREEN = YELLOW = BLUE = MAGENTA = CYAN = WHITE = ""
def iter_processor():
    """Yield ``(name, processor)`` pairs for the engines to check.

    When engine names were given on the command line, only those engines are
    yielded; otherwise every known processor is yielded.
    """
    for name, processor in searx.search.processors.items():
        # no extra argv entries means "check everything"
        if len(sys.argv) <= 1 or name in sys.argv:
            yield name, processor
def main():
    """Run the checker on every selected engine and print a colored summary."""
    # engines must be loaded before processors can be used
    searx.search.initialize()
    broken_urls = []
    for name, processor in iter_processor():
        # progress line only makes sense on an interactive terminal
        if sys.stdout.isatty():
            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
        checker = searx.search.checker.Checker(processor)
        checker.run()
        if checker.test_results.succesfull:
            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ)
        else:
            # iterating TestResults yields (test_name, error) pairs
            errors = [test_name + ': ' + error for test_name, error in checker.test_results]
            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ)
        # collect unreachable/invalid URLs across all engines
        broken_urls += checker.test_results.broken_urls
    for url in broken_urls:
        print('Error fetching', url)


if __name__ == '__main__':
    main()

+ 388
- 0
searx/search/checker/impl.py View File

@ -0,0 +1,388 @@
import typing
import types
import functools
import itertools
from time import time
from urllib.parse import urlparse
import re
import cld3
import requests.exceptions
from searx import poolrequests, logger
from searx.results import ResultContainer
from searx.search import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
HTML_TAGS = [
'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
'frame', 'frameset'
]
def get_check_no_html():
rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS]
rep += ['</' + tag + '>' for tag in HTML_TAGS]
pattern = re.compile('|'.join(rep))
def f(text):
return pattern.search(text.lower()) is None
return f
_check_no_html = get_check_no_html()
def _is_url(url):
try:
result = urlparse(url)
except ValueError:
return False
if result.scheme not in ('http', 'https'):
return False
return True
@functools.lru_cache(maxsize=8192)
def _is_url_image(image_url):
    """Return True when *image_url* points at an image.

    ``data:`` URLs are decided from their media type; http(s) URLs are
    fetched (up to two attempts on timeout) and decided from the
    Content-Type response header.  Results are cached because the same
    thumbnail URL shows up in many results.
    """
    if not isinstance(image_url, str):
        return False
    # protocol-relative URL: assume https
    if image_url.startswith('//'):
        image_url = 'https:' + image_url
    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')
    if not _is_url(image_url):
        return False
    retry = 2
    while retry > 0:
        a = time()
        try:
            poolrequests.set_timeout_for_thread(10.0, time())
            # browser-like headers: some image hosts reject obvious bots
            r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-GPC': '1',
                'Cache-Control': 'max-age=0'
            })
            return r.headers["content-type"].startswith('image/')
        except requests.exceptions.Timeout:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except requests.exceptions.RequestException:
            logger.exception('Exception for %s', image_url)
            return False
    # explicit False (the original fell off the loop and returned None)
    return False
def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
    """Project the user-visible parameters of a SearchQuery into a plain dict."""
    fields = ('query', 'lang', 'pageno', 'safesearch', 'time_range')
    return {field: getattr(search_query, field) for field in fields}
def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\
        -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
    """Split the parameters of two queries into (common, differing).

    ``common`` maps each parameter name to its shared value; ``diff`` maps
    each differing parameter name to the ``(value1, value2)`` pair.
    """
    param1 = _search_query_to_dict(sq1)
    param2 = _search_query_to_dict(sq2)
    common = {key: value for key, value in param1.items() if param2[key] == value}
    diff = {key: (value, param2[key]) for key, value in param1.items() if param2[key] != value}
    return (common, diff)
class TestResults:
    """Accumulates the outcome of every checker test run against one engine.

    Errors are grouped by test name and deduplicated; broken URLs are
    collected separately so they can be reported once at the end.
    """

    __slots__ = 'errors', 'broken_urls'

    def __init__(self):
        # test name -> list of unique error messages (insertion order kept)
        self.errors: typing.Dict[str, typing.List[str]] = {}
        # URLs that could not be fetched or validated
        self.broken_urls = []

    def add_error(self, test, message):
        """Record *message* for *test*, ignoring exact duplicates."""
        errors_for_test = self.errors.setdefault(test, [])
        if message not in errors_for_test:
            errors_for_test.append(message)

    def add_broken_url(self, url):
        """Record *url* as broken, ignoring duplicates."""
        if url not in self.broken_urls:
            self.broken_urls.append(url)

    @property
    def successful(self):
        """True when no test recorded any error."""
        return len(self.errors) == 0

    # backward-compatible misspelled alias: existing callers (e.g. the
    # checker's __main__) read ``test_results.succesfull``
    succesfull = successful

    def __iter__(self):
        """Yield ``(test_name, error_message)`` pairs, errors sorted per test."""
        for test_name, errors in self.errors.items():
            for error in sorted(errors):
                yield (test_name, error)
class ResultContainerTests:
    """Sanity checks applied to one ResultContainer obtained for one SearchQuery.

    Checks for leaked HTML in titles/contents/answers/infoboxes, validates
    image URLs, and collects the languages cld3 detects in the texts.
    All failures are recorded into the shared TestResults instance.
    """

    __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'

    def __init__(self,
                 test_results: TestResults,
                 test_name: str,
                 search_query: SearchQuery,
                 result_container: ResultContainer):
        self.test_name = test_name
        self.search_query = search_query
        self.result_container = result_container
        # language codes detected in titles/contents (fed by _add_language)
        self.languages: typing.Set[str] = set()
        self.test_results = test_results
        # set by check_basic() when the engine was unresponsive; the caller
        # then skips the remaining tests for this container
        self.stop_test = False

    @property
    def result_urls(self):
        # URLs of the ordered results; consumed by CheckerTests.unique_results
        results = self.result_container.get_ordered_results()
        return [result['url'] for result in results]

    def _record_error(self, message: str) -> None:
        # record under this container's test name in the shared TestResults
        self.test_results.add_error(self.test_name, message)

    def _add_language(self, text: str) -> typing.Optional[str]:
        # detect the language of *text*; only keep confident, reliable guesses
        r = cld3.get_language(str(text))  # pylint: disable=E1101
        if r is not None and r.probability >= 0.9 and r.is_reliable:
            self.languages.add(r.language)
        return None

    def _check_result(self, result):
        # generic checks first: no raw HTML, collect detected languages
        if not _check_no_html(result.get('title', '')):
            self._record_error('HTML in title')
        if not _check_no_html(result.get('content', '')):
            self._record_error('HTML in content')
        self._add_language(result.get('title', ''))
        self._add_language(result.get('content', ''))
        template = result.get('template', 'default.html')
        # templates with no image URL fields to validate
        if template == 'default.html':
            return
        if template == 'code.html':
            return
        if template == 'torrent.html':
            return
        if template == 'map.html':
            return
        if template == 'images.html':
            thumbnail_src = result.get('thumbnail_src')
            if thumbnail_src is not None:
                if not _is_url_image(thumbnail_src):
                    self.test_results.add_broken_url(thumbnail_src)
                    self._record_error('thumbnail_src URL is invalid')
            # NOTE(review): img_src is only checked when thumbnail_src is
            # absent — confirm that skipping img_src otherwise is intended
            elif not _is_url_image(result.get('img_src')):
                self.test_results.add_broken_url(result.get('img_src'))
                self._record_error('img_src URL is invalid')
        # NOTE(review): unlike the image checks above, a broken video
        # thumbnail is not added to broken_urls — confirm
        if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
            self._record_error('thumbnail URL is invalid')

    def _check_results(self, results: list):
        for result in results:
            self._check_result(result)

    def _check_answers(self, answers):
        for answer in answers:
            if not _check_no_html(answer):
                self._record_error('HTML in answer')

    def _check_infoboxes(self, infoboxes):
        for infobox in infoboxes:
            if not _check_no_html(infobox.get('content', '')):
                self._record_error('HTML in infobox content')
            self._add_language(infobox.get('content', ''))
            for attribute in infobox.get('attributes', {}):
                if not _check_no_html(attribute.get('value', '')):
                    self._record_error('HTML in infobox attribute value')

    def check_basic(self):
        """Always-on checks; sets stop_test when the engine did not respond."""
        if len(self.result_container.unresponsive_engines) > 0:
            # message is a tuple; [1] is the error, [2] an optional suffix
            for message in self.result_container.unresponsive_engines:
                self._record_error(message[1] + ' ' + (message[2] or ''))
            self.stop_test = True
            return
        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            self._check_results(results)
        if len(self.result_container.answers) > 0:
            self._check_answers(self.result_container.answers)
        if len(self.result_container.infoboxes) > 0:
            self._check_infoboxes(self.result_container.infoboxes)

    # --- named checks referenced from engine test descriptions -------------

    def has_infobox(self):
        """Error when the response contains no infobox."""
        if len(self.result_container.infoboxes) == 0:
            self._record_error('No infobox')

    def has_answer(self):
        """Error when the response contains no answer."""
        if len(self.result_container.answers) == 0:
            self._record_error('No answer')

    def has_language(self, lang):
        """Error when *lang* was not detected in any title/content."""
        if lang not in self.languages:
            self._record_error(lang + ' not found')

    def not_empty(self):
        """Error when there are neither results, answers nor infoboxes."""
        result_types = set()
        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            result_types.add('results')
        if len(self.result_container.answers) > 0:
            result_types.add('answers')
        if len(self.result_container.infoboxes) > 0:
            result_types.add('infoboxes')
        if len(result_types) == 0:
            self._record_error('No result')

    def one_title_contains(self, title: str):
        """Error unless at least one result title contains *title* (case-insensitive)."""
        title = title.lower()
        for result in self.result_container.get_ordered_results():
            if title in result['title'].lower():
                return
        self._record_error(('{!r} not found in the title'.format(title)))
class CheckerTests:
    """Cross-container tests: compare the ResultContainerTests obtained for
    the different SearchQuery variations of one test."""

    __slots__ = 'test_results', 'test_name', 'result_container_tests_list'

    def __init__(self,
                 test_results: TestResults,
                 test_name: str,
                 result_container_tests_list: typing.List[ResultContainerTests]):
        self.test_results = test_results
        self.test_name = test_name
        self.result_container_tests_list = result_container_tests_list

    def unique_results(self):
        """Record an error when two different query variations returned
        exactly the same URL list (only first-page results are compared)."""
        urls_list = [rct.result_urls for rct in self.result_container_tests_list]
        # guard: the original indexed urls_list[0] unconditionally, which
        # raises IndexError when no container was produced
        if len(urls_list) == 0 or len(urls_list[0]) == 0:
            return
        for i, urls_i in enumerate(urls_list):
            for j, urls_j in enumerate(urls_list):
                if i < j and urls_i == urls_j:
                    common, diff = _search_query_diff(self.result_container_tests_list[i].search_query,
                                                      self.result_container_tests_list[j].search_query)
                    common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
                    diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
                    diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
                    # typo fix: 'identitical' -> 'identical'
                    self.test_results.add_error(self.test_name,
                                                'results are identical for {} and {} ({})'
                                                .format(diff1_str, diff2_str, common_str))
class Checker:
    """Drives all tests declared by a processor's get_tests() against the
    live engine and records the outcome in a TestResults instance."""

    __slots__ = 'processor', 'tests', 'test_results'

    def __init__(self, processor: EngineProcessor):
        self.processor = processor
        # test descriptions: name -> {'matrix': ..., 'result_container': ..., 'test': ...}
        self.tests = self.processor.get_tests()
        self.test_results = TestResults()

    @property
    def engineref_list(self):
        # reference the engine through its first declared category
        engine_name = self.processor.engine_name
        engine_category = self.processor.engine.categories[0]
        return [EngineRef(engine_name, engine_category)]

    @staticmethod
    def search_query_matrix_iterator(engineref_list, matrix):
        """Yield one SearchQuery per combination of the matrix values.

        Each matrix entry maps a SearchQuery keyword to a single value or to
        a tuple/list of values; the cartesian product of all entries is
        iterated.  The 'query' entry is mandatory; all other entries are
        passed to SearchQuery as keyword arguments.
        """
        p = []
        for name, values in matrix.items():
            if isinstance(values, (tuple, list)):
                l = [(name, value) for value in values]
            else:
                # single value: wrap in a one-element list for product()
                l = [(name, values)]
            p.append(l)
        for kwargs in itertools.product(*p):
            kwargs = {k: v for k, v in kwargs}
            query = kwargs['query']
            params = dict(kwargs)
            del params['query']
            yield SearchQuery(query, engineref_list, **params)

    def call_test(self, obj, test_description):
        """Invoke one test on *obj*.

        A description is either a method name (str), a plain function, or a
        tuple/list whose first element is the method and the rest arguments.
        Unknown methods are recorded as errors instead of raising.
        """
        if isinstance(test_description, (tuple, list)):
            method, args = test_description[0], test_description[1:]
        else:
            method = test_description
            args = ()
        if isinstance(method, str) and hasattr(obj, method):
            getattr(obj, method)(*args)
        elif isinstance(method, types.FunctionType):
            method(*args)
        else:
            self.test_results.add_error(obj.test_name,
                                        'method {!r} ({}) not found for {}'
                                        .format(method, method.__class__.__name__, obj.__class__.__name__))

    def call_tests(self, obj, test_descriptions):
        # run every description of a test section against *obj*
        for test_description in test_descriptions:
            self.call_test(obj, test_description)

    def search(self, search_query: SearchQuery) -> ResultContainer:
        """Run one query through the processor and return its ResultContainer."""
        result_container = ResultContainer()
        engineref_category = search_query.engineref_list[0].category
        params = self.processor.get_params(search_query, engineref_category)
        if params is not None:
            # hard-coded 5 second timeout for checker searches
            self.processor.search(search_query.query, params, result_container, time(), 5)
        return result_container

    def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
        # run the query, then apply the always-on basic checks
        result_container = self.search(search_query)
        result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
        result_container_check.check_basic()
        return result_container_check

    def run_test(self, test_name):
        """Run one named test: execute every query of its matrix, then the
        per-container checks, then the cross-container checks."""
        test_parameters = self.tests[test_name]
        search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
        rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
        stop_test = False
        if 'result_container' in test_parameters:
            for rct in rct_list:
                stop_test = stop_test or rct.stop_test
                # skip containers whose engine did not respond
                if not rct.stop_test:
                    self.call_tests(rct, test_parameters['result_container'])
        if not stop_test:
            if 'test' in test_parameters:
                checker_tests = CheckerTests(self.test_results, test_name, rct_list)
                self.call_tests(checker_tests, test_parameters['test'])

    def run(self):
        """Run every declared test."""
        for test_name in self.tests:
            self.run_test(test_name)

+ 12
- 0
searx/search/processors/abstract.py View File

@ -37,3 +37,15 @@ class EngineProcessor:
@abstractmethod
def search(self, query, params, result_container, start_time, timeout_limit):
pass
def get_tests(self):
    """Return the checker test descriptions for this engine.

    An engine module may define ``tests`` to fully replace the defaults, or
    ``additional_tests`` to extend them; otherwise only the processor's
    defaults are used.
    """
    tests = getattr(self.engine, 'tests', None)
    if tests is None:
        # copy before update() so the engine's 'additional_tests' attribute
        # dict is not mutated as a side effect of building the test set
        tests = dict(getattr(self.engine, 'additional_tests', {}))
        tests.update(self.get_default_tests())
        return tests
    else:
        return tests
def get_default_tests(self):
    # base implementation: no default checker tests; specialized processors
    # override this to declare their own
    return {}

+ 44
- 0
searx/search/processors/online.py View File

@ -211,3 +211,47 @@ class OnlineProcessor(EngineProcessor):
# reset the suspend variables
self.engine.continuous_errors = 0
self.engine.suspend_end_time = 0
def get_default_tests(self):
    """Build the default checker test descriptions for an online engine.

    Always includes a 'simple' smoke test; adds paging / time-range /
    language / safesearch tests when the engine advertises the matching
    capability attribute.
    """
    tests = {}
    tests['simple'] = {
        # a single generic query (the original tuple ('time', 'time')
        # issued the exact same query twice)
        'matrix': {'query': 'time'},
        'result_container': ['not_empty'],
    }
    if getattr(self.engine, 'paging', False):
        tests['paging'] = {
            'matrix': {'query': 'time',
                       'pageno': (1, 2, 3)},
            'result_container': ['not_empty'],
            'test': ['unique_results']
        }
    if getattr(self.engine, 'time_range', False):
        tests['time_range'] = {
            'matrix': {'query': 'time',
                       'time_range': (None, 'day')},
            'result_container': ['not_empty'],
            'test': ['unique_results']
        }
    if getattr(self.engine, 'lang', False):
        # the checker method is named 'has_language'
        # (ResultContainerTests.has_language); the original referenced the
        # non-existent 'has_lang', so these tests always errored out
        tests['lang_fr'] = {
            'matrix': {'query': 'paris', 'lang': 'fr'},
            'result_container': ['not_empty', ('has_language', 'fr')],
        }
        tests['lang_en'] = {
            'matrix': {'query': 'paris', 'lang': 'en'},
            'result_container': ['not_empty', ('has_language', 'en')],
        }
    if getattr(self.engine, 'safesearch', False):
        tests['safesearch'] = {
            'matrix': {'query': 'porn',
                       'safesearch': (0, 2)},
            'test': ['unique_results']
        }
    return tests

+ 10
- 0
searx/search/processors/online_currency.py View File

@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor):
params['from_name'] = iso4217_to_name(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en')
return params
def get_default_tests(self):
    """Default checker test for currency engines: a conversion query is
    expected to produce an answer."""
    return {
        'currency': {
            'matrix': {'query': '1337 usd in rmb'},
            'result_container': ['has_answer'],
        },
    }

+ 18
- 0
searx/search/processors/online_dictionary.py View File

@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor):
params['query'] = query
return params
def get_default_tests(self):
    """Default checker tests for dictionary/translation engines.

    A fixed en->es translation query is expected to yield results whose
    titles mention the translated word; paging engines additionally check
    that pages differ.
    """
    matrix = {'query': 'en-es house'}
    result_container = ['not_empty', ('one_title_contains', 'house')]
    if getattr(self.engine, 'paging', False):
        matrix['pageno'] = (1, 2, 3)
        return {'translation_paging': {'matrix': matrix,
                                       'result_container': result_container,
                                       'test': ['unique_results']}}
    return {'translation': {'matrix': matrix,
                            'result_container': result_container}}

+ 4
- 2
utils/searx.sh View File

@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\
python3-dev python3-babel python3-venv
uwsgi uwsgi-plugin-python3
git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev
libprotobuf-dev protobuf-compiler
shellcheck"
BUILD_PACKAGES_debian="\
@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python
git base-devel libxml2
protobuf
shellcheck"
BUILD_PACKAGES_arch="\
@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
ShellCheck"
ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_fedora="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools
@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\
python36 python36-pip python36-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
ShellCheck"
ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_centos="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools


Loading…
Cancel
Save