From 27a65babf6506925b5dc93935cf9a2f425839c11 Mon Sep 17 00:00:00 2001 From: Joker-dock <60212974+Joker-dock@users.noreply.github.com> Date: Wed, 16 Feb 2022 03:04:27 +0100 Subject: [PATCH 1/2] TEI generator file for hang.hu. --- configs/hang/hang_BASE.xml | 125 +++++++++++++++++++++++ configs/hang/hang_specific.py | 133 +++++++++++++++++++++++++ configs/hang/hang_text_tags_normal.tsv | 29 ++++++ 3 files changed, 287 insertions(+) create mode 100644 configs/hang/hang_BASE.xml create mode 100644 configs/hang/hang_specific.py create mode 100644 configs/hang/hang_text_tags_normal.tsv diff --git a/configs/hang/hang_BASE.xml b/configs/hang/hang_BASE.xml new file mode 100644 index 0000000..552d4b1 --- /dev/null +++ b/configs/hang/hang_BASE.xml @@ -0,0 +1,125 @@ + + + + + + + + + + + ELTE-DH webcrawling + + creator + ELTE-DH + + + + project director + PalkóGábor + https://orcid.org/0000-0002-4394-8577 + + + chief programmer + IndigBalázs + https://orcid.org/0000-0001-8090-3661 + + + TEI expert + FellegiZsófia + https://orcid.org/0000-0001-9199-1759 + + + programmer + Sárközi-LindnerZsófia + https://orcid.org/0000-0002-2558-0633 + + + + + ELTE-DH + http://elte-dh.hu/ + + Budapest http://www.geonames.org/3054643 + + 2020 + +

Metadata: IN COPYRIGHT - NON-COMMERCIAL USE PERMITTEDhttp://rightsstatements.org/vocab/InC-NC/1.0/

+

Text: IN COPYRIGHT http://rightsstatements.org/vocab/InC/1.0/ +

+
+ +
+ + + + Alhambra Press Bt. + 1034 Budapest, Szomolnok u. 7. I. em. 3. + + + + + Budapest + http://www.geonames.org/3054643 + +

Minden jog fenntartva © 2018-2021 - Magyar Hang

+

+

+
+ +
+
+
+ + + + + Magyar Hanghttps://hang.hu/ + hun + + In Copyright + + + + + + + WARC/1.1 + + 2021-09-02 19:50:07/2021-09-06 18:31:37 + + + + urn:uuid:5ec17312-d7ee-4f3b-801e-4557ffa4a463 + + + + + + + TEI + + + Palkó Gábor + Indig Balázs + Fellegi Zsófia + Sárközi-Lindner Zsófia + + + + + + TEI file created + +
+ + + + +
diff --git a/configs/hang/hang_specific.py b/configs/hang/hang_specific.py
new file mode 100644
index 0000000..2e4bc50
--- /dev/null
+++ b/configs/hang/hang_specific.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8, vim: expandtab:ts=4 -*
+
+import re
+
+from html2tei import parse_date, BASIC_LINK_ATTRS, decompose_listed_subtrees_and_mark_media_descendants, tei_defaultdict
+
+PORTAL_URL_PREFIX = 'https://hang.hu'
+
+ARTICLE_ROOT_PARAMS_SPEC = [(('article',), {'class': 'post-wrap post-single mt-3 icms-content'}),
+                            (('article',), {'class': 'post-wrap post-single wide mt-3 icms-content'})]
+
+
+HTML_BASICS = {'p', 'h3', 'h2', 'h4', 'h5', 'em', 'i', 'b', 'strong', 'mark', 'u', 'sub', 'sup', 'del', 'strike',
+               'ul', 'ol', 'li', 'table', 'tr', 'td', 'th', 'quote', 'figure', 'iframe'}
+
+
+def get_meta_from_articles_spec(tei_logger, url, bs):
+    """Collect the metadata of one hang.hu article page.
+
+    Every field is optional: a missing or empty element is logged and skipped, so one absent
+    element never prevents the remaining fields from being extracted.
+
+    :param tei_logger: object whose ``log(level, message)`` method receives the extraction messages
+    :param url: URL of the article; stored as ``sch:url`` and used as the prefix of every log message
+    :param bs: parsed page offering ``find()`` (presumably a BeautifulSoup object -- confirm with the caller)
+    :return: the filled ``tei_defaultdict``, or None when the page contains no ``article`` element
+    """
+    data = tei_defaultdict()
+    data['sch:url'] = url
+    article_root = bs.find('article')
+    if article_root is None:
+        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
+        return None
+
+    # Publication date: text of the second li element in the 'd-flex flex-row' list.
+    # NOTE: sch:dateModified is not extracted.
+    date_list = article_root.find('ul', class_='d-flex flex-row')
+    date_items = date_list.find_all('li') if date_list is not None else []
+    if len(date_items) > 1:
+        article_date_text = date_items[1].text
+        if article_date_text.strip():
+            parsed_date = parse_date(article_date_text, '%Y. %B %d., %H:%M')
+            if parsed_date is not None:
+                data['sch:datePublished'] = parsed_date
+            else:
+                tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
+        else:
+            tei_logger.log('WARNING', f'{url}: DATE TEXT NOT FOUND!')
+    else:
+        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
+
+    # Author: the portal's own name is recorded as the source, anything else as a list of authors.
+    author_tag = article_root.find('span', class_='author-name')
+    if author_tag is not None:
+        author_text = author_tag.text.strip()
+        if author_text == 'Magyar Hang':
+            data['sch:source'] = [author_text]
+        else:
+            # Strip each name so that 'A, B' does not yield ' B'; drop empty fragments.
+            authors = [name.strip() for name in author_text.split(',') if name.strip()]
+            if authors:
+                data['sch:author'] = authors
+            else:
+                tei_logger.log('WARNING', f'{url}: AUTHOR TEXT NOT FOUND!')
+    else:
+        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
+
+    # Title: the h1 element inside the 'entry-title' container (either of them may be missing).
+    title_container = article_root.find('div', class_='entry-title')
+    title_tag = title_container.h1 if title_container is not None else None
+    if title_tag is not None:
+        name_text = title_tag.text.strip()
+        if name_text:
+            data['sch:name'] = name_text
+        else:
+            tei_logger.log('WARNING', f'{url}: TITLE TEXT NOT FOUND IN URL!')
+    else:
+        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND IN URL!')
+
+    # Sections: link texts of the 'entry-meta' line.
+    section_line = article_root.find('div', class_='entry-meta')
+    if section_line is not None:
+        sections = [a.text for a in section_line.find_all('a')]
+        if sections:
+            data['sch:articleSection'] = sections
+        else:
+            tei_logger.log('DEBUG', f'{url}: SECTION TEXT NOT FOUND!')
+    else:
+        tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')
+
+    # Keywords: link texts of the tag widget (read from keywords_line itself, not from section_line).
+    keywords_line = article_root.find('div', class_='widget widget-tags mb-5')
+    if keywords_line is not None:
+        keywords_sections = [a.text.strip() for a in keywords_line.find_all('a')]
+        if keywords_sections:
+            data['sch:keywords'] = keywords_sections
+        else:
+            tei_logger.log('DEBUG', f'{url}: KEYWORDS TEXT NOT FOUND!')
+    else:
+        tei_logger.log('DEBUG', f'{url}: KEYWORDS TAG NOT FOUND!')
+    return data
+
+
+def excluded_tags_spec(tag):
+    """Rename a tag that is not listed in HTML_BASICS to 'else', empty its attributes and return it."""
+    # NOTE(review): indentation was lost in this copy of the patch; the attribute reset is taken to be
+    if tag.name not in HTML_BASICS:
+        tag.name = 'else'
+    tag.attrs = {}  # unconditional, i.e. applied to basic tags as well -- confirm against the repository
+    return tag
+
+
+BLOCK_RULES_SPEC = {}
+BIGRAM_RULES_SPEC = {}
+LINKS_SPEC = BASIC_LINK_ATTRS
+DECOMP = [(('div',), {'class': 'widget'}),
+          (('div',), {'class': 'widget-tamogatas-box'}),
+          (('div',), {'class': 'banner-wrapper'}),
+          (('div',), {'id': 'videoad'}),
+          (('div',), {'class': 'widget widget-tamogatas-box'}),
+          (('div',), {'class': 'banner-wrapper'}),
+          (('div',), {'class': 'img-container'}),
+          (('div',), {'id': 'pa_videoslider'}),
+          (('div',), {'id': 'widget-image'}),
+          (('div',), {'class': 'sidebar-col'}),
+          (('div',), {'class': 'entry-meta'}),
+          (('div',), {'class': 'entry-image'}),
+          (('div',), {'class': 'oygrvhab'}),
+          (('script',), {}),
+          (('iframe',), {})]
+
+MEDIA_LIST = []
+
+
+def decompose_spec(article_dec):
+    """Pass article_dec with DECOMP and MEDIA_LIST to the html2tei decompose helper and return article_dec."""
+    decompose_listed_subtrees_and_mark_media_descendants(article_dec, DECOMP, MEDIA_LIST)
+    return article_dec
+
+
+BLACKLIST_SPEC = ['https://hang.hu/kultura/reg-keszult-olyan-felkavaro-kamaszokrol-szolo-sorozat-mint-az-euforia-'
+                  '107933']  # just a picture
+
+MULTIPAGE_URL_END = re.compile(r'^\b$')  # Dummy
+
+
+def next_page_of_article_spec(_):
+    """Always return None; the argument is ignored."""
+    return None
diff --git a/configs/hang/hang_text_tags_normal.tsv b/configs/hang/hang_text_tags_normal.tsv
new file mode 100644
index 0000000..81914aa
--- /dev/null
+++ b/configs/hang/hang_text_tags_normal.tsv
@@ -0,0 +1,29 @@
+frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
+default

default default default no_example bekezdes default +default

default default default no_example cimsor default +default

default default default no_example cimsor default +default

default default default no_example cimsor default +default

default default default no_example cimsor default +default default default default no_example felkover default +default default default default no_example felkover default +default default default default no_example kiemelt default +default default default default no_example dolt default +default default default default no_example kiemelt default +default default default default no_example alsoindex default +default default default default no_example felsoindex default +default default default default no_example athuzott default +default default default default no_example alahuzott default +default default default default no_example athuzott default +default
    default default default no_example lista default +default
      default default default no_example lista default +default
    1. default default default no_example listaelem default +default default default default no_example table_text default +default default default default no_example sor default +default
      default default default no_example oszlop default +default default default default no_example oszlop default +default default default default no_example idezet default +default
      default default default no_example media_tartalom default +default