diff --git a/extraction/techniques.py b/extraction/techniques.py index f1dc18d..df51dc9 100644 --- a/extraction/techniques.py +++ b/extraction/techniques.py @@ -44,7 +44,7 @@ class HeadTags(Technique): def extract(self, html): "Extract data from meta, link and title tags within the head tag." extracted = {} - soup = BeautifulSoup(html) + soup = BeautifulSoup(html, "lxml") # extract data from title tag title_tag = soup.find('title') if title_tag: @@ -168,7 +168,7 @@ def extract(self, html): titles = [] descriptions = [] videos = [] - soup = BeautifulSoup(html) + soup = BeautifulSoup(html, "lxml") for article in soup.find_all('article') or []: title = article.find('h1') if title: @@ -207,7 +207,7 @@ class SemanticTags(Technique): def extract(self, html): "Extract data from usual semantic tags." extracted = {} - soup = BeautifulSoup(html) + soup = BeautifulSoup(html, "lxml") for tag, dest, max_to_store in self.extract_string: for found in soup.find_all(tag)[:max_to_store] or []: