diff --git a/ExplainToMe/forms.py b/ExplainToMe/forms.py index 45da42b..9c7ed72 100644 --- a/ExplainToMe/forms.py +++ b/ExplainToMe/forms.py @@ -1,5 +1,6 @@ from flask_wtf import FlaskForm as Form from wtforms.fields.html5 import IntegerField, URLField +from wtforms.fields import SelectField from wtforms.validators import NumberRange, InputRequired from wtforms.validators import URL, Required @@ -16,5 +17,12 @@ class LinkForm(Form): 'max_sent', validators=[Required(), InputRequired(), NumberRange(min=0, max=100)], id='max_sent', - default=10, + default=10 + ) + language = SelectField( + 'language', + validators=[Required(), InputRequired()], + id='language', + choices=[('english','English'), ('chinese', 'Chinese'), ('czech', 'Czech'), ('french', 'French'), ('german', 'German'), ('japanese', 'Japanese'), ('portuguese', 'Portuguese'), ('slovak', 'Slovak'), ('spanish', 'Spanish')], + default='english' ) diff --git a/ExplainToMe/templates/base.html b/ExplainToMe/templates/base.html index c0be366..1e3ad48 100644 --- a/ExplainToMe/templates/base.html +++ b/ExplainToMe/templates/base.html @@ -1,16 +1,11 @@ -{% extends "bootstrap/base.html" %} - -{% import "bootstrap/fixes.html" as fixes %} -{% import "bootstrap/google.html" as google %} +{% extends "bootstrap/base.html" %} {% import "bootstrap/fixes.html" as fixes %} {% import "bootstrap/google.html" as google +%} - -{% block head %} - {% block meta %} - {% include "includes/meta.html" %} - {% endblock %} + + {% block head %} {% block meta %} {% include "includes/meta.html" %} {% endblock %} @@ -19,73 +14,57 @@ {% block favicon %} - <link type="image/x-icon" rel="icon" href="/favicon.ico" /> - {% endblock favicon %} + <link type="image/x-icon" rel="icon" href="/favicon.ico" /> {% endblock favicon %} {% block styles %} + <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css" rel="stylesheet" /> - {% block styles %} - <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css" rel="stylesheet" /> - <link href="https://bootswatch.com/readable/bootstrap.min.css" rel="stylesheet" /> - <link href="{{ url_for('static', filename='css/style.css') }}" rel=stylesheet /> - <link href="{{ url_for('static', filename='css/animation.css') }}" rel=stylesheet /> - {% endblock styles %} - - {{ fixes.ie8() }} - {{ google.uanalytics('UA-51412306-6') }} - -{% endblock %} + <link href="https://bootswatch.com/3/readable/bootstrap.min.css" rel="stylesheet" /> + <link href="{{ url_for('static', filename='css/style.css') }}" rel=stylesheet /> + <link href="{{ url_for('static', filename='css/animation.css') }}" rel=stylesheet /> {% endblock styles %} {{ fixes.ie8() }} {{ google.uanalytics('UA-51412306-6') }} {% endblock %} </head> <body> -{% block body %} - -{% block navbar %} -{% include 'includes/nav.html' %} -{% endblock %} - -<a id="run" name="run"></a> -<div class="intro-header"> - <div class="container"> - <div class="row"> - {% block run %}{% endblock run %} + {% block body %} {% block navbar %} {% include 'includes/nav.html' %} {% endblock %} + + <a id="run" name="run"></a> + <div class="intro-header"> + <div class="container"> + <div class="row"> + {% block run %}{% endblock run %} + </div> </div> </div> -</div> -<a id="summary" name="summary"></a> -<div class="content-section-a"> - <div class="container"> - <div class="row"> - {% block summary %}{% endblock summary %} + <a id="summary" name="summary"></a> + <div class="content-section-a"> + <div class="container"> + <div class="row"> + {% block summary %}{% endblock summary %} + </div> </div> </div> -</div> -<a id="about" name="about"></a> -<div class="banner"> - <div class="container"> - <div class="row"> - {% block about %}{% endblock about %} + <a id="about" name="about"></a> + <div class="banner"> + <div class="container"> + <div class="row"> + {% block about %}{% endblock about %} + </div> </div> </div> -</div> -<a id="contact" name="contact"> -<div class="content-section-b"> - <div class="container"> - <div class="row"> - {% block contact %}{% endblock contact %} + <a id="contact" name="contact"> + <div class="content-section-b"> + <div class="container"> + <div class="row"> + {% block contact %}{% endblock contact %} + </div> + </div> </div> - </div> -</div> - -{% include 'includes/footer.html' %} -{% block scripts %} - {{ super() }} - <script type="text/javascript" src="{{ url_for('static', filename='js/site.js') }}"></script> -{% endblock scripts %} - -{% endblock body %} + {% include 'includes/footer.html' %} {% block scripts %} {{ super() }} + <script type="text/javascript" src="{{ url_for('static', filename='js/site.js') }}"></script> + {% endblock scripts %} {% endblock body %} </body> -</html> + +</html> \ No newline at end of file diff --git a/ExplainToMe/templates/index.html b/ExplainToMe/templates/index.html index 0970baa..2321950 100644 --- a/ExplainToMe/templates/index.html +++ b/ExplainToMe/templates/index.html @@ -15,6 +15,7 @@ <h3>Automatic Web Article Summarizer</h3> {{ form.hidden_tag() }} {{ render_field(form.url, required=true) }} {{ render_field(form.max_sent, required=true) }} + {{ render_field(form.language, required=true) }} <div class="form-group"> <button id="send" type="submit" class="btn btn-success"> <span class="glyphicon glyphicon-send"></span> Send diff --git a/ExplainToMe/textrank.py b/ExplainToMe/textrank.py index 1038bb5..bd1abe5 100644 --- a/ExplainToMe/textrank.py +++ b/ExplainToMe/textrank.py @@ -12,6 +12,9 @@ from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer from sumy.utils import cached_property, get_stop_words from itertools import chain +from sumy.nlp.tokenizers import Tokenizer + +from iso639 import languages if six.PY2: str = six.text_type @@ -114,7 +117,11 @@ def run_summarizer(parser, sentences, language='english'): for sentence in summarizer(parser.document, sentences)] -def get_parser(url, tokenizer): +def get_lang(name): + return languages.inverted.get(name.title()).part1 + +def get_parser(url, language): + tokenizer = Tokenizer(language) useragent = ' '.join([ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)", "AppleWebKit/537.36 (KHTML, like Gecko)", @@ -127,13 +134,17 @@ def get_parser(url, tokenizer): extra_headers['Referer'] = r'https://t.co/T1323aaaa' # Scrape Web Page With HTMLParser and Goose and select the best scrape html_parser = HtmlParser.from_url(url, tokenizer, **extra_headers) - article = Goose({'browser_user_agent': useragent}) + article = Goose({'browser_user_agent': useragent, 'target_language': get_lang(language)}) # Goose raises IndexError when requesting unfamiliar sites. try: extract = article.extract(url=url) except: - extract = article.extract(raw_html=requests.get(url).text) + req = requests.get(url) + print(req.apparent_encoding) + extract = article.extract( + raw_html=req.content.decode(encoding=req.apparent_encoding) + ) goose_parser = PlaintextParser(extract, tokenizer) diff --git a/ExplainToMe/views/root.py b/ExplainToMe/views/root.py index 699b8e1..3fc44fa 100644 --- a/ExplainToMe/views/root.py +++ b/ExplainToMe/views/root.py @@ -23,9 +23,9 @@ def index(): form = LinkForm() if form.validate_on_submit(): - url, max_sent = request.form.get('url'), request.form.get('max_sent') - language = 'english' - parser, meta = get_parser(url, Tokenizer(language)) + url, max_sent, language = request.form.get('url'), request.form.get('max_sent'), request.form.get('language') + #language = 'english' + parser, meta = get_parser(url, language) new_meta = meta.pop('meta', {}) meta.update(new_meta) session['summary'] = run_summarizer(parser, max_sent, language=language) diff --git a/requirements.txt b/requirements.txt index e7fccb6..562209a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,11 +11,12 @@ chainmap==1.0.2 ipython>=5.3.0,<6.0.0 nltk==3.2.2 python-dateutil==2.6.0 -sumy==0.6.0 +sumy==0.7.0 requests==2.17.3 Flask-AppConfig==0.11.1 Flask-Debug==0.4.3 Flask-SSLify==0.1.5 +iso-639==0.4.5 gunicorn ipdb gevent