From b8de3486818d369c6dc84dfb13c6943b318fee65 Mon Sep 17 00:00:00 2001
From: Fabian Venturini Cabau
Date: Fri, 29 Jul 2016 17:09:52 -0300
Subject: [PATCH] Fixed BeautifulSoup warning

Fixed this warning from BeautifulSoup by passing the "lxml" parser
explicitly to every BeautifulSoup() call:

```
UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 774 of the file c:\python27\Lib\threading.py. To get rid of this warning, change code that looks like this:

 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
```
---
 extraction/techniques.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/extraction/techniques.py b/extraction/techniques.py
index f1dc18d..df51dc9 100644
--- a/extraction/techniques.py
+++ b/extraction/techniques.py
@@ -44,7 +44,7 @@ class HeadTags(Technique):
     def extract(self, html):
         "Extract data from meta, link and title tags within the head tag."
         extracted = {}
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, "lxml")
         # extract data from title tag
         title_tag = soup.find('title')
         if title_tag:
@@ -168,7 +168,7 @@ def extract(self, html):
         titles = []
         descriptions = []
         videos = []
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, "lxml")
         for article in soup.find_all('article') or []:
             title = article.find('h1')
             if title:
@@ -207,7 +207,7 @@ class SemanticTags(Technique):
     def extract(self, html):
         "Extract data from usual semantic tags."
         extracted = {}
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, "lxml")
         for tag, dest, max_to_store in self.extract_string:
             for found in soup.find_all(tag)[:max_to_store] or []: