From e5aa7edb7fadf99cf2529f7c55d3e359e9b76cb3 Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Wed, 28 Jul 2021 10:04:38 -0700 Subject: [PATCH 1/7] Made 3 improvements. First, set timeout to 15 seconds so it does not get stuck. Limit text returned to 2K per result. Set time filter to last 30 days for web pages indexed within last 30 days. These changes improve the information returned and answers given by ParlAI. --- search_server.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/search_server.py b/search_server.py index 34b74d6..61d8ed3 100644 --- a/search_server.py +++ b/search_server.py @@ -29,7 +29,8 @@ _STYLE_SKIP = "" _CLOSE_STYLE_GOOD = "[/]" if _STYLE_GOOD else "" _CLOSE_STYLE_SKIP = "[/]" if _STYLE_SKIP else "" - +_DEFAULT_URL_REQUEST_TIMEOUT = 15 # seconds +_TRUNCATE_TEXT_BYTES = 2048 def _parse_host(host: str) -> Tuple[str, int]: """ Parse the host string. @@ -46,7 +47,7 @@ def _get_and_parse(url: str) -> Dict[str, str]: """ Download a webpage and parse it. """ try: - resp = requests.get(url) + resp = requests.get(url, timeout=_DEFAULT_URL_REQUEST_TIMEOUT) except requests.exceptions.RequestException as e: print(f"[!] 
{e} for url {url}") return None @@ -161,6 +162,11 @@ def do_POST(self): f" {rich.markup.escape(maybe_content['url'])}" # f"Content: {len(maybe_content['content'])}", ) + + # Truncate text + if len(maybe_content) > _TRUNCATE_TEXT_BYTES: + maybe_content = maybe_content[:_TRUNCATE_TEXT_BYTES] + dupe_detection_set.add(maybe_content["content"]) content.append(maybe_content) if len(content) >= n: @@ -201,7 +207,7 @@ def search(self, q: str, n: int) -> Generator[str, None, None]: class GoogleSearchServer(SearchABC): def search(self, q: str, n: int) -> Generator[str, None, None]: - return googlesearch.search(q, num=n, stop=None, pause=_DELAY_SEARCH) + return googlesearch.search(q, num=n, stop=None, pause=_DELAY_SEARCH, tbs="qdr:m") class Application: From 14f0fba2e255c8d5ef077e8ec43b7b7e50fe5194 Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Wed, 28 Jul 2021 20:58:05 -0700 Subject: [PATCH 2/7] Added support for bing, provide a 10X faster implementation, provide much cleaner data returned, added command line args --- search_server.py | 260 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 234 insertions(+), 26 deletions(-) diff --git a/search_server.py b/search_server.py index 61d8ed3..c45b88e 100644 --- a/search_server.py +++ b/search_server.py @@ -19,7 +19,6 @@ import rich.markup import requests - print = rich.print _DEFAULT_HOST = "0.0.0.0" @@ -29,12 +28,22 @@ _STYLE_SKIP = "" _CLOSE_STYLE_GOOD = "[/]" if _STYLE_GOOD else "" _CLOSE_STYLE_SKIP = "[/]" if _STYLE_SKIP else "" -_DEFAULT_URL_REQUEST_TIMEOUT = 15 # seconds -_TRUNCATE_TEXT_BYTES = 2048 +_requests_get_timeout = 5 # seconds +_strip_html_menus = False +_max_text_bytes = None + +# To get a free Bing Subscription Key go here: +# https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api +_use_bing = False # Use Bing instead of Google Search Engine + +_use_bing_description_only = False # short but 10X faster + +# Bing Search API documentation: +# 
https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/query-parameters def _parse_host(host: str) -> Tuple[str, int]: - """ Parse the host string. - Should be in the format HOSTNAME:PORT. + """ Parse the host string. + Should be in the format HOSTNAME:PORT. Example: 0.0.0.0:8080 """ splitted = host.split(":") @@ -46,15 +55,17 @@ def _parse_host(host: str) -> Tuple[str, int]: def _get_and_parse(url: str) -> Dict[str, str]: """ Download a webpage and parse it. """ + global _requests_get_timeout + try: - resp = requests.get(url, timeout=_DEFAULT_URL_REQUEST_TIMEOUT) + resp = requests.get(url, timeout=_requests_get_timeout) except requests.exceptions.RequestException as e: print(f"[!] {e} for url {url}") return None else: resp.encoding = resp.apparent_encoding page = resp.text - + ########################################################################### # Prepare the title ########################################################################### @@ -64,7 +75,7 @@ def _get_and_parse(url: str) -> Dict[str, str]: output_dict["title"] = ( html.unescape(pre_rendered.renderContents().decode()) if pre_rendered else "" ) - + output_dict["title"] = ( output_dict["title"].replace("\n", "").replace("\r", "") ) @@ -85,6 +96,8 @@ def _get_and_parse(url: str) -> Dict[str, str]: class SearchABC(http.server.BaseHTTPRequestHandler): def do_POST(self): + global _strip_html_menus, _max_text_bytes, _use_bing, _use_bing_description_only + """ Handle POST requests from the client. 
(All requests are POST) """ ####################################################################### @@ -112,16 +125,34 @@ def do_POST(self): ####################################################################### # Search, get the pages and parse the content of the pages ####################################################################### - print(f"\n[bold]Received query:[/] {parsed}") + if _use_bing: + search_engine = "Bing" + else: + search_engine = "Google" + + print(f"\n[bold]Received query:[/] {parsed}, using {search_engine} search engine and using bing link descriptions only {_use_bing_description_only}") + n = int(parsed["n"]) q = parsed["q"] # Over query a little bit in case we find useless URLs content = [] dupe_detection_set = set() - - # Search until we have n valid entries - for url in self.search(q=q, n=n): + + urls = [] + if _use_bing: + results = self.search_bing(q, n, ["News", "Entities", "Places", "Webpages"], + _use_bing_description_only) + + if _use_bing_description_only: + content = results + else: + urls = results + else: + urls = self.search(q=q, n=n) + + # Only execute loop to fetch each URL if urls returned + for url in urls: if len(content) >= n: break @@ -141,7 +172,7 @@ def do_POST(self): else: reason_content_empty = False reason_already_seen_content = False - + reasons = dict( reason_empty_response=reason_empty_response, reason_content_empty=reason_content_empty, @@ -163,9 +194,21 @@ def do_POST(self): # f"Content: {len(maybe_content['content'])}", ) - # Truncate text - if len(maybe_content) > _TRUNCATE_TEXT_BYTES: - maybe_content = maybe_content[:_TRUNCATE_TEXT_BYTES] + # Strip out all lines starting with "* " usually menu items + if _strip_html_menus: + print("Stripping HTML menus") + new_content = "" + for line in maybe_content['content'].splitlines(): + x = re.findall("^[\s]*\\* ", line) + if not x or len(line) > 50: + new_content += line + "\n" + + maybe_content['content'] = new_content + else: + print("Not stripping HTML menus") 
+ + # Truncate text + maybe_content['content'] = maybe_content['content'][:_max_text_bytes] dupe_detection_set.add(maybe_content["content"]) content.append(maybe_content) @@ -184,12 +227,12 @@ def do_POST(self): } ) print(f" {_STYLE_SKIP}x{_CLOSE_STYLE_SKIP} Excluding an URL because `{_STYLE_SKIP}{reason_string}{_CLOSE_STYLE_SKIP}`:\n" - f" {url}") + f" {url}") ############################################################### # Prepare the answer and send it ############################################################### - content = content[:n] + content = content[:n] output = json.dumps(dict(response=content)).encode("utf-8") self.send_response(200) self.send_header("Content-type", "text/html") @@ -204,22 +247,152 @@ def search(self, q: str, n: int) -> Generator[str, None, None]: "GoogleSearch." ) + def search_bing( + self, query: str, n: int, types = ["News"], + return_content = True, promote=["News"] + ): + + global _bing_subscription_key + + assert _bing_subscription_key + + search_url = "https://api.bing.microsoft.com/v7.0/search" + print(f"n={n} responseFilter={types}") + headers = {"Ocp-Apim-Subscription-Key": _bing_subscription_key} + params = {"q": query, "textDecorations":True, + "textFormat": "HTML", "responseFilter":types, + "promote":promote, "answerCount":5} + response = requests.get(search_url, headers=headers, params=params) + response.raise_for_status() + search_results = response.json() + + items = [] + if "news" in search_results and "value" in search_results["news"]: + print(f'bing adding {len(search_results["news"]["value"])} news') + items = items + search_results["news"]["value"] + + if "webPages" in search_results and "value" in search_results["webPages"]: + print(f'bing adding {len(search_results["webPages"]["value"])} webPages') + items = items + search_results["webPages"]["value"] + + if "entities" in search_results and "value" in search_results["entities"]: + print(f'bing adding {len(search_results["entities"]["value"])} entities') + items 
= items + search_results["entities"]["value"] + + if "places" in search_results and "value" in search_results["places"]: + print(f'bing adding {len(search_results["places"]["value"])} places') + items = items + search_results["places"]["value"] + + urls = [] + contents = [] + news_count = 0 + + for item in items: + if "url" not in item: + continue + else: + url = item["url"] + + title = item["name"] + + # Remove Bing formatting characters from title + title = filter_html(title) + + if title is None or title == "": + print("No title to skipping") + continue + + if return_content: + content = title + ". " + if "snippet" in item : + snippet = filter_html(item["snippet"]) + content += snippet + print(f"Adding webpage summary with title {title} for url {url}") + contents.append({'title': title, 'url': url, 'content': content}) + + elif "description" in item: + if news_count < 3: + text = filter_html(item["description"]) + content += text + news_count += 1 + contents.append({'title': title, 'url': url, 'content': content}) + else: + print(f"Could not find descripton for item {item}") + else: + urls.append(url) + + if len(urls) == 0 and not return_content: + print(f"Warning: No Bing URLs found for query {query}") + + if return_content: + return contents + else: + return urls + +def filter_html(title): + title.replace("", "") + title = title.replace("", "") + title = title.replace("", "") + title = title.replace("
", "") + title = title.replace("\u2018", "") + title = title.replace("\u2018", "") + title = title.replace("\u00b7", "") + title = title.replace("&", "") + title = title.replace("
", "") + title = title.replace("'", "") + return title class GoogleSearchServer(SearchABC): def search(self, q: str, n: int) -> Generator[str, None, None]: - return googlesearch.search(q, num=n, stop=None, pause=_DELAY_SEARCH, tbs="qdr:m") - + return googlesearch.search(q, num=n, stop=None, pause=_DELAY_SEARCH) class Application: - def serve(self, host: str = _DEFAULT_HOST) -> NoReturn: + def serve( + self, host: str = _DEFAULT_HOST, + requests_get_timeout = _requests_get_timeout, + strip_html_menus = _strip_html_menus, + max_text_bytes = _max_text_bytes, + use_bing = _use_bing, + use_bing_description_only = _use_bing_description_only, + bing_subscription_key = None) -> NoReturn: + + global _requests_get_timeout, _strip_html_menus, _max_text_bytes + global _use_bing, _use_bing_description_only, _bing_subscription_key + """ Main entry point: Start the server. - Host is expected to be in the HOSTNAME:PORT format. - HOSTNAME can be an IP. Most of the time should be 0.0.0.0. - Port 8080 doesn't work on colab. + Arguments: + host (str): + requests_get_timeout (int): + strip_html_menus (bool): + max_text_bytes (int): + use_bing (bool): + use_bing_description_only (bool): + bing_subscription_key (str): + HOSTNAME:PORT of the server. HOSTNAME can be an IP. + Most of the time should be 0.0.0.0. Port 8080 doesn't work on colab. + Other ports also probably don't work on colab, test it out. + requests_get_timeout is seconds before each url fetch times out + strip_html_menus removes likely menus to clean up text + max_text_bytes limits the bytes returned per web page. Note, + ParlAI current defaults to 512 bytes + use_bing set to True will use Bing instead of Google + use_bing_description_only are short but 10X faster since no url gets + bing_subscription_key required to use bing. 
Can get one at: + https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api """ + hostname, port = _parse_host(host) host = f"{hostname}:{port}" + _requests_get_timeout = requests_get_timeout + _strip_html_menus = strip_html_menus + _max_text_bytes = max_text_bytes + _use_bing = use_bing + _use_bing_description_only = use_bing_description_only + _bing_subscription_key = bing_subscription_key + + self.check_and_print_cmdline_args() + with http.server.ThreadingHTTPServer( (hostname, int(port)), GoogleSearchServer ) as server: @@ -227,13 +400,40 @@ def serve(self, host: str = _DEFAULT_HOST) -> NoReturn: print(f"Host: {host}") server.serve_forever() + def check_and_print_cmdline_args( + self) -> None: + if _use_bing and _bing_subscription_key is None: + print("--bing_subscription_key required to use bing search") + print("To get one go to url:") + print("https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api") + exit() + + print("Command line args used:") + print(f" requests_get_timeout={_requests_get_timeout}") + print(f" strip_html_menus={_strip_html_menus}") + print(f" max_text_bytes={_max_text_bytes}") + print(f" use_bing={_use_bing}") + print(f" use_bing_description_only={_use_bing_description_only}") + def test_parser(self, url: str) -> None: - """ Test the webpage getter and parser. + """ Test the webpage getter and parser. Will try to download the page, then parse it, then will display the result. 
""" print(_get_and_parse(url)) - def test_server(self, query: str, n: int, host : str = _DEFAULT_HOST) -> None: + def test_server( + self, query: str, n: int, host : str = _DEFAULT_HOST, + requests_get_timeout = _requests_get_timeout, + strip_html_menus = _strip_html_menus, + max_text_bytes = _max_text_bytes, + use_bing = _use_bing, + use_bing_description_only = _use_bing_description_only, + bing_subscription_key = None + ) -> None: + + global _requests_get_timeout, _strip_html_menus, _max_text_bytes + global _use_bing, _use_bing_description_only, _bing_subscription_key + """ Creates a thin fake client to test a server that is already up. Expects a server to have already been started with `python search_server.py serve [options]`. Creates a retriever client the same way ParlAi client does it for its chat bot, then @@ -241,9 +441,17 @@ def test_server(self, query: str, n: int, host : str = _DEFAULT_HOST) -> None: """ host, port = _parse_host(host) + _requests_get_timeout = requests_get_timeout + _strip_html_menus = strip_html_menus + _max_text_bytes = max_text_bytes + _use_bing = use_bing + _use_bing_description_only = use_bing_description_only + print(f"Query: `{query}`") print(f"n: {n}") + self.check_and_print_cmdline_args() + retriever = parlai.agents.rag.retrieve_api.SearchEngineRetriever( dict( search_server=f"{host}:{port}", From be2c0b2638ae685c15e7f606d1c4de8d275e8d1e Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Thu, 29 Jul 2021 14:19:11 -0700 Subject: [PATCH 3/7] Made architecture and stylistic changes requested by JulesGM including new class for Bing Search and removing most global variables --- search_server.py | 274 +++++++++++++++++++++++++---------------------- 1 file changed, 148 insertions(+), 126 deletions(-) diff --git a/search_server.py b/search_server.py index c45b88e..f72e856 100644 --- a/search_server.py +++ b/search_server.py @@ -28,15 +28,7 @@ _STYLE_SKIP = "" _CLOSE_STYLE_GOOD = "[/]" if _STYLE_GOOD else "" _CLOSE_STYLE_SKIP = 
"[/]" if _STYLE_SKIP else "" -_requests_get_timeout = 5 # seconds -_strip_html_menus = False -_max_text_bytes = None - -# To get a free Bing Subscription Key go here: -# https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api -_use_bing = False # Use Bing instead of Google Search Engine - -_use_bing_description_only = False # short but 10X faster +_REQUESTS_GET_TIMEOUT = 5 # seconds # Bing Search API documentation: # https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/query-parameters @@ -51,14 +43,11 @@ def _parse_host(host: str) -> Tuple[str, int]: port = splitted[1] if len(splitted) > 1 else _DEFAULT_PORT return hostname, int(port) - def _get_and_parse(url: str) -> Dict[str, str]: """ Download a webpage and parse it. """ - global _requests_get_timeout - try: - resp = requests.get(url, timeout=_requests_get_timeout) + resp = requests.get(url, timeout=_REQUESTS_GET_TIMEOUT) except requests.exceptions.RequestException as e: print(f"[!] {e} for url {url}") return None @@ -93,10 +82,8 @@ def _get_and_parse(url: str) -> Dict[str, str]: return output_dict - class SearchABC(http.server.BaseHTTPRequestHandler): def do_POST(self): - global _strip_html_menus, _max_text_bytes, _use_bing, _use_bing_description_only """ Handle POST requests from the client. 
(All requests are POST) """ @@ -125,12 +112,7 @@ def do_POST(self): ####################################################################### # Search, get the pages and parse the content of the pages ####################################################################### - if _use_bing: - search_engine = "Bing" - else: - search_engine = "Google" - - print(f"\n[bold]Received query:[/] {parsed}, using {search_engine} search engine and using bing link descriptions only {_use_bing_description_only}") + print(f"\n[bold]Received query:[/] {parsed}") n = int(parsed["n"]) q = parsed["q"] @@ -140,16 +122,14 @@ def do_POST(self): dupe_detection_set = set() urls = [] - if _use_bing: - results = self.search_bing(q, n, ["News", "Entities", "Places", "Webpages"], - _use_bing_description_only) + results = self.search(q=q, n=n, + subscription_key = self.server.subscription_key, + use_description_only=self.server.use_description_only) - if _use_bing_description_only: - content = results - else: - urls = results + if self.server.use_description_only: + content = results else: - urls = self.search(q=q, n=n) + urls = results # Only execute loop to fetch each URL if urls returned for url in urls: @@ -195,20 +175,18 @@ def do_POST(self): ) # Strip out all lines starting with "* " usually menu items - if _strip_html_menus: - print("Stripping HTML menus") + if self.server.strip_html_menus: new_content = "" for line in maybe_content['content'].splitlines(): - x = re.findall("^[\s]*\\* ", line) - if not x or len(line) > 50: - new_content += line + "\n" + if line.find("*"): # Performance optimazation since regex is slow + x = re.findall("^[\s]*\\* ", line) + if not x or len(line) > 50: + new_content += line + "\n" maybe_content['content'] = new_content - else: - print("Not stripping HTML menus") # Truncate text - maybe_content['content'] = maybe_content['content'][:_max_text_bytes] + maybe_content['content'] = maybe_content['content'][:self.server.max_text_bytes] 
dupe_detection_set.add(maybe_content["content"]) content.append(maybe_content) @@ -240,26 +218,53 @@ def do_POST(self): self.end_headers() self.wfile.write(output) - def search(self, q: str, n: int) -> Generator[str, None, None]: + def search(self, + q: str, n: int, + subscription_key: str = "", + use_description_only: bool = False + ) -> Generator[str, None, None]: + return NotImplemented( "Search is an abstract base class, not meant to be directly " "instantiated. You should instantiate a derived class like " "GoogleSearch." ) - def search_bing( - self, query: str, n: int, types = ["News"], - return_content = True, promote=["News"] - ): +def filter_html(title): + title.replace("", "") + title = title.replace("", "") + title = title.replace("", "") + title = title.replace("
", "") + title = title.replace("
", "") + title = title.replace(""", "") + title = title.replace("&", "") + title = title.replace(">", "") + title = title.replace("<", "") + title = title.replace("'", "") + title = title.replace("\u2018", "") # unicode single quote + title = title.replace("\u2019", "") # unicode single quote + title = title.replace("\u8220", "") # unicode left double quote + title = title.replace("\u8221", "") # unicode right double quote + title = title.replace("\u8222", "") # unicode double low-9 quotation mark + title = title.replace("\u2013", "") # unicode dash + title = title.replace("\u00b7", "") # unicode middle dot + return title - global _bing_subscription_key +class BingSearchServer(SearchABC): + def search(self, + q: str, n: int, + subscription_key: str = None, + use_description_only: bool = False + ) -> Generator[str, None, None]: - assert _bing_subscription_key + assert subscription_key + types = ["News", "Entities", "Places", "Webpages"] + promote = ["News"] search_url = "https://api.bing.microsoft.com/v7.0/search" print(f"n={n} responseFilter={types}") - headers = {"Ocp-Apim-Subscription-Key": _bing_subscription_key} - params = {"q": query, "textDecorations":True, + headers = {"Ocp-Apim-Subscription-Key": subscription_key} + params = {"q": q, "textDecorations":True, "textFormat": "HTML", "responseFilter":types, "promote":promote, "answerCount":5} response = requests.get(search_url, headers=headers, params=params) @@ -302,7 +307,7 @@ def search_bing( print("No title to skipping") continue - if return_content: + if self.server.use_description_only: content = title + ". 
" if "snippet" in item : snippet = filter_html(item["snippet"]) @@ -319,101 +324,122 @@ def search_bing( else: print(f"Could not find descripton for item {item}") else: - urls.append(url) + if url not in urls: + urls.append(url) - if len(urls) == 0 and not return_content: - print(f"Warning: No Bing URLs found for query {query}") + if len(urls) == 0 and not use_description_only: + print(f"Warning: No Bing URLs found for query {q}") - if return_content: + if use_description_only: return contents else: return urls -def filter_html(title): - title.replace("", "") - title = title.replace("", "") - title = title.replace("", "") - title = title.replace("
", "") - title = title.replace("\u2018", "") - title = title.replace("\u2018", "") - title = title.replace("\u00b7", "") - title = title.replace("&", "") - title = title.replace("
", "") - title = title.replace("'", "") - return title - class GoogleSearchServer(SearchABC): - def search(self, q: str, n: int) -> Generator[str, None, None]: + def search(self, q: str, n: int, + subscription_key: str = None, + use_description_only: bool = False + ) -> Generator[str, None, None]: + return googlesearch.search(q, num=n, stop=None, pause=_DELAY_SEARCH) -class Application: - def serve( - self, host: str = _DEFAULT_HOST, - requests_get_timeout = _requests_get_timeout, - strip_html_menus = _strip_html_menus, - max_text_bytes = _max_text_bytes, - use_bing = _use_bing, - use_bing_description_only = _use_bing_description_only, - bing_subscription_key = None) -> NoReturn: +class SearchABCServer(http.server.ThreadingHTTPServer): + def __init__(self, + server_address, RequestHandlerClass, + max_text_bytes, strip_html_menus, + use_description_only = False, subscription_key = None + ): - global _requests_get_timeout, _strip_html_menus, _max_text_bytes - global _use_bing, _use_bing_description_only, _bing_subscription_key + self.max_text_bytes = max_text_bytes + self.strip_html_menus = strip_html_menus + self.use_description_only = use_description_only + self.subscription_key = subscription_key + super().__init__(server_address, RequestHandlerClass) + +class Application: + def serve( + self, host: str = _DEFAULT_HOST, + requests_get_timeout = _REQUESTS_GET_TIMEOUT, + strip_html_menus = False, + max_text_bytes = None, + search_engine = "Google", + use_description_only = False, + subscription_key = None + ) -> NoReturn: """ Main entry point: Start the server. Arguments: host (str): requests_get_timeout (int): strip_html_menus (bool): max_text_bytes (int): - use_bing (bool): - use_bing_description_only (bool): - bing_subscription_key (str): + search_engine (str): + use_description_only (bool): + subscription_key (str): HOSTNAME:PORT of the server. HOSTNAME can be an IP. Most of the time should be 0.0.0.0. Port 8080 doesn't work on colab. 
Other ports also probably don't work on colab, test it out. - requests_get_timeout is seconds before each url fetch times out - strip_html_menus removes likely menus to clean up text - max_text_bytes limits the bytes returned per web page. Note, - ParlAI current defaults to 512 bytes - use_bing set to True will use Bing instead of Google - use_bing_description_only are short but 10X faster since no url gets - bing_subscription_key required to use bing. Can get one at: + requests_get_timeout defaults to 5 seconds before each url fetch times out. + strip_html_menus removes likely HTML menus to clean up text. + max_text_bytes limits the bytes returned per web page. Set to no max. + Note, ParlAI current defaults to 512 byte. + search_engine set to "Google" default or "Bing" + use_description_only are short but 10X faster since no url gets + for Bing only + use_subscription_key required to use Bing only. Can get a free one at: https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api + """ + global _REQUESTS_GET_TIMEOUT + hostname, port = _parse_host(host) host = f"{hostname}:{port}" - _requests_get_timeout = requests_get_timeout - _strip_html_menus = strip_html_menus - _max_text_bytes = max_text_bytes - _use_bing = use_bing - _use_bing_description_only = use_bing_description_only - _bing_subscription_key = bing_subscription_key + _REQUESTS_GET_TIMEOUT = requests_get_timeout - self.check_and_print_cmdline_args() + self.check_and_print_cmdline_args(max_text_bytes, strip_html_menus, + search_engine, use_description_only, subscription_key) - with http.server.ThreadingHTTPServer( - (hostname, int(port)), GoogleSearchServer - ) as server: - print("Serving forever.") - print(f"Host: {host}") - server.serve_forever() + if search_engine == "Bing": + request_handler = BingSearchServer + else: + request_handler = GoogleSearchServer + + with SearchABCServer( + (hostname, int(port)), request_handler, + max_text_bytes, strip_html_menus, + use_description_only, 
subscription_key + ) as server: + print("Serving forever.") + print(f"Host: {host}") + server.serve_forever() def check_and_print_cmdline_args( - self) -> None: - if _use_bing and _bing_subscription_key is None: - print("--bing_subscription_key required to use bing search") - print("To get one go to url:") - print("https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api") - exit() + self, max_text_bytes, strip_html_menus, + search_engine, use_description_only, subscription_key + ) -> None: + + if search_engine == "Bing": + if subscription_key is None: + print("Warning: subscription_key is required for Bing Search Engine") + print("To get one go to url:") + print("https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api") + exit() + elif search_engine == "Google": + if use_description_only: + print("Warning: use_description_only is not supported for Google Search Engine") + exit() + if subscription_key is not None: + print("Warning: subscription_key is not supported for Google Search Engine") + exit() print("Command line args used:") - print(f" requests_get_timeout={_requests_get_timeout}") - print(f" strip_html_menus={_strip_html_menus}") - print(f" max_text_bytes={_max_text_bytes}") - print(f" use_bing={_use_bing}") - print(f" use_bing_description_only={_use_bing_description_only}") + print(f" requests_get_timeout={_REQUESTS_GET_TIMEOUT}") + print(f" strip_html_menus={strip_html_menus}") + print(f" max_text_bytes={max_text_bytes}") + print(f" search_engine={search_engine}") + print(f" use_description_only={use_description_only}") def test_parser(self, url: str) -> None: """ Test the webpage getter and parser. 
@@ -422,17 +448,16 @@ def test_parser(self, url: str) -> None: print(_get_and_parse(url)) def test_server( - self, query: str, n: int, host : str = _DEFAULT_HOST, - requests_get_timeout = _requests_get_timeout, - strip_html_menus = _strip_html_menus, - max_text_bytes = _max_text_bytes, - use_bing = _use_bing, - use_bing_description_only = _use_bing_description_only, - bing_subscription_key = None - ) -> None: + self, host: str = _DEFAULT_HOST, + requests_get_timeout = _REQUESTS_GET_TIMEOUT, + strip_html_menus = False, + max_text_bytes = None, + search_engine = "Google", + use_description_only = False, + subscription_key = None + ) -> NoReturn: - global _requests_get_timeout, _strip_html_menus, _max_text_bytes - global _use_bing, _use_bing_description_only, _bing_subscription_key + global _REQUESTS_GET_TIMEOUT """ Creates a thin fake client to test a server that is already up. Expects a server to have already been started with `python search_server.py serve [options]`. @@ -441,16 +466,13 @@ def test_server( """ host, port = _parse_host(host) - _requests_get_timeout = requests_get_timeout - _strip_html_menus = strip_html_menus - _max_text_bytes = max_text_bytes - _use_bing = use_bing - _use_bing_description_only = use_bing_description_only + _REQUESTS_GET_TIMEOUT = requests_get_timeout print(f"Query: `{query}`") print(f"n: {n}") - self.check_and_print_cmdline_args() + self.check_and_print_cmdline_args(max_text_bytes, strip_html_menus, + search_server, use_description_only, subscription_key) retriever = parlai.agents.rag.retrieve_api.SearchEngineRetriever( dict( From 9a7322e0492793e8d64958f3506571bd007d3873 Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Thu, 29 Jul 2021 18:24:11 -0700 Subject: [PATCH 4/7] Minor additional cleanup --- search_server.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/search_server.py b/search_server.py index f72e856..abe339c 100644 --- a/search_server.py +++ 
b/search_server.py @@ -82,7 +82,7 @@ def _get_and_parse(url: str) -> Dict[str, str]: return output_dict -class SearchABC(http.server.BaseHTTPRequestHandler): +class SearchABCRequestHandler(http.server.BaseHTTPRequestHandler): def do_POST(self): """ Handle POST requests from the client. (All requests are POST) """ @@ -183,7 +183,7 @@ def do_POST(self): if not x or len(line) > 50: new_content += line + "\n" - maybe_content['content'] = new_content + maybe_content['content'] = filter_special_chars(new_content) # Truncate text maybe_content['content'] = maybe_content['content'][:self.server.max_text_bytes] @@ -230,12 +230,7 @@ def search(self, "GoogleSearch." ) -def filter_html(title): - title.replace("", "") - title = title.replace("", "") - title = title.replace("", "") - title = title.replace("
", "") - title = title.replace("
", "") +def filter_special_chars(title): title = title.replace(""", "") title = title.replace("&", "") title = title.replace(">", "") @@ -243,14 +238,20 @@ def filter_html(title): title = title.replace("'", "") title = title.replace("\u2018", "") # unicode single quote title = title.replace("\u2019", "") # unicode single quote + title = title.replace("\u201c", "") # unicode left double quote + title = title.replace("\u201d", "") # unicode right double quote title = title.replace("\u8220", "") # unicode left double quote title = title.replace("\u8221", "") # unicode right double quote title = title.replace("\u8222", "") # unicode double low-9 quotation mark + title = title.replace("\u2022", "") # unicode bullet title = title.replace("\u2013", "") # unicode dash title = title.replace("\u00b7", "") # unicode middle dot + title = title.replace("\u00d7", "") # multiplication sign return title -class BingSearchServer(SearchABC): +class BingSearchRequestHandler(SearchABCRequestHandler): + bing_search_url = "https://api.bing.microsoft.com/v7.0/search" + def search(self, q: str, n: int, subscription_key: str = None, @@ -261,13 +262,13 @@ def search(self, types = ["News", "Entities", "Places", "Webpages"] promote = ["News"] - search_url = "https://api.bing.microsoft.com/v7.0/search" print(f"n={n} responseFilter={types}") headers = {"Ocp-Apim-Subscription-Key": subscription_key} - params = {"q": q, "textDecorations":True, + params = {"q": q, "textDecorations":False, "textFormat": "HTML", "responseFilter":types, "promote":promote, "answerCount":5} - response = requests.get(search_url, headers=headers, params=params) + response = requests.get(BingSearchRequestHandler.bing_search_url, + headers=headers, params=params) response.raise_for_status() search_results = response.json() @@ -301,7 +302,7 @@ def search(self, title = item["name"] # Remove Bing formatting characters from title - title = filter_html(title) + title = filter_special_chars(title) if title is None or title == "": 
print("No title to skipping") @@ -310,14 +311,14 @@ def search(self, if self.server.use_description_only: content = title + ". " if "snippet" in item : - snippet = filter_html(item["snippet"]) + snippet = filter_special_chars(item["snippet"]) content += snippet print(f"Adding webpage summary with title {title} for url {url}") contents.append({'title': title, 'url': url, 'content': content}) elif "description" in item: if news_count < 3: - text = filter_html(item["description"]) + text = filter_special_chars(item["description"]) content += text news_count += 1 contents.append({'title': title, 'url': url, 'content': content}) @@ -335,7 +336,7 @@ def search(self, else: return urls -class GoogleSearchServer(SearchABC): +class GoogleSearchRequestHandler(SearchABCRequestHandler): def search(self, q: str, n: int, subscription_key: str = None, use_description_only: bool = False @@ -402,9 +403,9 @@ def serve( search_engine, use_description_only, subscription_key) if search_engine == "Bing": - request_handler = BingSearchServer + request_handler = BingSearchRequestHandler else: - request_handler = GoogleSearchServer + request_handler = GoogleSearchRequestHandler with SearchABCServer( (hostname, int(port)), request_handler, @@ -457,8 +458,6 @@ def test_server( subscription_key = None ) -> NoReturn: - global _REQUESTS_GET_TIMEOUT - """ Creates a thin fake client to test a server that is already up. Expects a server to have already been started with `python search_server.py serve [options]`. 
Creates a retriever client the same way ParlAi client does it for its chat bot, then @@ -466,8 +465,6 @@ def test_server( """ host, port = _parse_host(host) - _REQUESTS_GET_TIMEOUT = requests_get_timeout - print(f"Query: `{query}`") print(f"n: {n}") From 9e8a146822e9eff7392e77c7809dcc226ea22aaa Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Thu, 29 Jul 2021 20:38:30 -0700 Subject: [PATCH 5/7] Removed test_server changes I had made --- search_server.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/search_server.py b/search_server.py index abe339c..5d22887 100644 --- a/search_server.py +++ b/search_server.py @@ -448,15 +448,7 @@ def test_parser(self, url: str) -> None: """ print(_get_and_parse(url)) - def test_server( - self, host: str = _DEFAULT_HOST, - requests_get_timeout = _REQUESTS_GET_TIMEOUT, - strip_html_menus = False, - max_text_bytes = None, - search_engine = "Google", - use_description_only = False, - subscription_key = None - ) -> NoReturn: + def test_server(self, query: str, n: int, host : str = _DEFAULT_HOST) -> None: """ Creates a thin fake client to test a server that is already up. Expects a server to have already been started with `python search_server.py serve [options]`. 
@@ -468,9 +460,6 @@ def test_server( print(f"Query: `{query}`") print(f"n: {n}") - self.check_and_print_cmdline_args(max_text_bytes, strip_html_menus, - search_server, use_description_only, subscription_key) - retriever = parlai.agents.rag.retrieve_api.SearchEngineRetriever( dict( search_server=f"{host}:{port}", From 9a6dd37caf32673e716c9493c20ff6bfc1a92a20b07 Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Fri, 30 Jul 2021 09:38:49 -0700 Subject: [PATCH 6/7] Added README and fixed a couple of bugs --- README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++-- search_server.py | 14 +++++++++----- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index eae41dd..fc20e03 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ alt="Shows lines with search results, the titles and the urls."> - Uses `html2text` to strip the markup out of the page. - Uses `beautifulsoup4` to parse the title. -- Currently only uses the `googlesearch` module to query Google for urls, but is coded -in a modular / search engine agnostic way to allow very easily add new search engine support. +- Supports both Google (default) and Bing search, but is coded in a modular / search engine agnostic +way that makes it easy to add support for new search engines. Bing search requires an API subscription key, +which can be obtained for free at: https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api Using the `googlesearch` module is very slow because it parses Google search webpages instead of querying cloud webservices. This is fine for playing with the model, but makes that searcher unusable for training or large scale inference purposes. In the paper, Bing cloud services are used, matching the results over Common Crawl instead of just downloading the page.
@@ -62,3 +63,44 @@ python search_server.py test_server --host 0.0.0.0:8080 ```bash python search_server.py test_parser www.some_url_of_your_choice.com/ ``` + +# Additional Command Line Parameters + +- requests_get_timeout - sets the timeout for URL requests to fetch the content of URLs found during search. Defaults to 5 seconds. +- strip_html_menus - removes likely HTML menus to clean up text. This returns significantly higher quality and informationally dense text. +- max_text_bytes - limits the bytes returned per web page. Defaults to no max. Note, ParlAI currently defaults to only use the first 512 bytes. +- search_engine - set to "Google" (default) or "Bing". Note, the Bing Search engine was used in the Blenderbot2 paper to achieve their results. This implementation not only uses web pages but also news, entities and places. +- use_description_only - returns short descriptions, but is 10X faster since no per-URL content fetches are performed (Bing only). It also has the advantage of being very concise, without the irrelevant HTML text normally returned. +- subscription_key - required to use Bing.
Can get a free one at: https://www.microsoft.com/en-us/bing/apis/bing-entity-search-api + +# Advanced Examples + +Google Search Engine returning more relevant information than the defaults: +```bash +python search_server.py serve --host 0.0.0.0:8080 --max_text_bytes 512 --requests_get_timeout 10 --strip_html_menus +``` + +Bing Search Engine: +```bash +python search_server.py serve --host 0.0.0.0:8080 --search_engine="Bing" --subscription_key "put your bing api subscription key here" +``` + +Bing Search Engine returning more relevant information: +```bash +python search_server.py serve --host 0.0.0.0:8080 --search_engine="Bing" --max_text_bytes=512 --requests_get_timeout 10 --strip_html_menus --subscription_key "put your bing api subscription key here" +``` + +Bing Search Engine returning very relevant concise information 10X faster: +```bash +python search_server.py serve --host 0.0.0.0:8080 --search_engine="Bing" --use_description_only --subscription_key "put your bing api subscription key here" +``` + +# Additional Command Line Example Test Calls + +```bash +curl -X POST "http://0.0.0.0:8080" -d "q=Which%20team%20does%20Tom%20Brady%20play%20for%20now&n=6" +``` + +```bash +curl -X POST "http://0.0.0.0:8080" -d "q=Where%20Are%20The%20Olympics%20Being%20Held%20in%202021&n=6" +``` diff --git a/search_server.py b/search_server.py index 5d22887..91a1a6d 100644 --- a/search_server.py +++ b/search_server.py @@ -149,14 +149,19 @@ def do_POST(self): reason_already_seen_content = ( maybe_content["content"] in dupe_detection_set ) + reason_content_forbidden = ( + maybe_content["content"] == "Forbidden" + ) else: reason_content_empty = False reason_already_seen_content = False - + reason_content_forbidden = False + reasons = dict( reason_empty_response=reason_empty_response, reason_content_empty=reason_content_empty, reason_already_seen_content=reason_already_seen_content, + reason_content_forbidden=reason_content_forbidden, ) if not any(reasons.values()):
def do_POST(self): if self.server.strip_html_menus: new_content = "" for line in maybe_content['content'].splitlines(): - if line.find("*"): # Performance optimazation since regex is slow - x = re.findall("^[\s]*\\* ", line) - if not x or len(line) > 50: - new_content += line + "\n" + x = re.findall("^[\s]*\\* ", line) + if line != "" and (not x or len(line) > 50): + new_content += line + "\n" maybe_content['content'] = filter_special_chars(new_content) From f6fb411bba2533f716c9de6fad9b0be22a9feb8d Mon Sep 17 00:00:00 2001 From: Hamilton Hitchings Date: Sat, 31 Jul 2021 17:34:08 -0700 Subject: [PATCH 7/7] Updated README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fc20e03..70c380c 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ Bing Search Engine returning more relevant information: python search_server.py serve --host 0.0.0.0:8080 --search_engine="Bing" --max_text_bytes=512 --requests_get_timeout 10 --strip_html_menus --subscription_key "put your bing api subscription key here" ``` -Bing Search Engine returning very relevant concise information 10X faster: +Bing Search Engine returning very relevant concise information 10X faster. Returns a 250 to 350 byte web page summary per URL including the web page title: ```bash python search_server.py serve --host 0.0.0.0:8080 --search_engine="Bing" --use_description_only --subscription_key "put your bing api subscription key here" ```