From 27a65babf6506925b5dc93935cf9a2f425839c11 Mon Sep 17 00:00:00 2001
From: Joker-dock <60212974+Joker-dock@users.noreply.github.com>
Date: Wed, 16 Feb 2022 03:04:27 +0100
Subject: [PATCH 1/2] TEI generator file for hang.hu.
---
configs/hang/hang_BASE.xml | 125 +++++++++++++++++++++++
configs/hang/hang_specific.py | 133 +++++++++++++++++++++++++
configs/hang/hang_text_tags_normal.tsv | 29 ++++++
3 files changed, 287 insertions(+)
create mode 100644 configs/hang/hang_BASE.xml
create mode 100644 configs/hang/hang_specific.py
create mode 100644 configs/hang/hang_text_tags_normal.tsv
diff --git a/configs/hang/hang_BASE.xml b/configs/hang/hang_BASE.xml
new file mode 100644
index 0000000..552d4b1
--- /dev/null
+++ b/configs/hang/hang_BASE.xml
@@ -0,0 +1,125 @@
+
+
+
+
+
+
+
+
+
+
+ ELTE-DH webcrawling
+
+ creator
+ ELTE-DH
+
+
+
+ project director
+ PalkóGábor
+ [https://orcid.org/0000-0002-4394-8577]
+
+
+ chief programmer
+ IndigBalázs
+ [https://orcid.org/0000-0001-8090-3661]
+
+
+ TEI expert
+ FellegiZsófia
+ [https://orcid.org/0000-0001-9199-1759]
+
+
+ programmer
+ Sárközi-LindnerZsófia
+ [https://orcid.org/0000-0002-2558-0633]
+
+
+
+
+ ELTE-DH
+ [http://elte-dh.hu/]
+
+ Budapest [http://www.geonames.org/3054643]
+
+ 2020
+
+ Metadata: IN COPYRIGHT - NON-COMMERCIAL USE PERMITTED[http://rightsstatements.org/vocab/InC-NC/1.0/]
+ Text: IN COPYRIGHT [http://rightsstatements.org/vocab/InC/1.0/]
+
+
+
+
+
+
+
+ Alhambra Press Bt.
+ 1034 Budapest, Szomolnok u. 7. I. em. 3.
+
+
+
+
+ Budapest
+ [http://www.geonames.org/3054643]
+
+ Minden jog fenntartva © 2018-2021 - Magyar Hang
+
+
+
+
+
+
+
+
+
+
+
+ Magyar Hanghttps://hang.hu/
+ hun
+
+ In Copyright
+
+
+
+
+
+
+ WARC/1.1
+
+ 2021-09-02 19:50:07/2021-09-06 18:31:37
+
+
+
+ urn:uuid:5ec17312-d7ee-4f3b-801e-4557ffa4a463
+
+
+
+
+
+
+ TEI
+
+
+ Palkó Gábor
+ Indig Balázs
+ Fellegi Zsófia
+ Sárközi-Lindner Zsófia
+
+
+
+
+
+ TEI file created
+
+
+
+
+
+
+
diff --git a/configs/hang/hang_specific.py b/configs/hang/hang_specific.py
new file mode 100644
index 0000000..2e4bc50
--- /dev/null
+++ b/configs/hang/hang_specific.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8, vim: expandtab:ts=4 -*-
+
+import re
+
+from html2tei import parse_date, BASIC_LINK_ATTRS, decompose_listed_subtrees_and_mark_media_descendants, tei_defaultdict
+
+# Scheme and host of the hang.hu portal.
+PORTAL_URL_PREFIX = 'https://hang.hu'
+
+# (tag names, attribute filter) pairs describing the two <article> class variants ("normal" and "wide").
+# NOTE(review): presumably consumed by the html2tei core to locate the article root -- confirm against the caller.
+ARTICLE_ROOT_PARAMS_SPEC = [(('article',), {'class': 'post-wrap post-single mt-3 icms-content'}),
+ (('article',), {'class': 'post-wrap post-single wide mt-3 icms-content'})]
+
+
+# Tag names that excluded_tags_spec() keeps; any tag whose name is not listed here is renamed to 'else'.
+HTML_BASICS = {'p', 'h3', 'h2', 'h4', 'h5', 'em', 'i', 'b', 'strong', 'mark', 'u', 'sub', 'sup', 'del', 'strike',
+ 'ul', 'ol', 'li', 'table', 'tr', 'td', 'th', 'quote', 'figure', 'iframe'}
+
+
+def get_meta_from_articles_spec(tei_logger, url, bs):
+ data = tei_defaultdict()
+ data['sch:url'] = url
+ article_root = bs.find('article')
+ if article_root is not None:
+ date_tag = article_root.find('ul', class_='d-flex flex-row')
+ if date_tag is not None:
+ date_tag = date_tag.find_all('li')[1]
+ if date_tag is not None:
+ article_date_text = date_tag.text
+ if article_date_text is not None:
+ parsed_date = parse_date(article_date_text, '%Y. %B %d., %H:%M')
+ if parsed_date is not None:
+ data['sch:datePublished'] = parsed_date
+ else:
+ tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
+ else:
+ tei_logger.log('WARNING', f'{url}: DATE TEXT NOT FOUND!')
+ else:
+ tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
+ else:
+ tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
+ # data['sch:dateModified'] = write_it
+ # else: tei_logger.log('WARNING', f'{url}: MODIFIED DATE TEXT FORMAT ERROR!')
+ author_tag = article_root.find('span', class_='author-name')
+ if author_tag is not None:
+ author_text = author_tag.text
+ if author_text is not None:
+ author_text = author_text.strip()
+ if author_text == 'Magyar Hang':
+ data['sch:source'] = [author_text]
+ else:
+ data['sch:author'] = author_text.split(',')
+ else:
+ tei_logger.log('WARNING', f'{url}: AUTHOR TEXT NOT FOUND!')
+ else:
+ tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
+ name_tag = article_root.find('div', class_='entry-title')
+ if name_tag is not None:
+ name_text = name_tag.h1.text
+ if name_text is not None:
+ data['sch:name'] = name_text.strip()
+ else:
+ tei_logger.log('WARNING', f'{url}: TITLE TEXT NOT FOUND IN URL!')
+ else:
+ tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND IN URL!')
+ section_line = article_root.find('div', class_='entry-meta')
+ if section_line is not None:
+ sections = [a.text for a in section_line.find_all('a') if a is not None]
+ if sections is not None:
+ data['sch:articleSection'] = sections
+ else:
+ tei_logger.log('DEBUG', f'{url}: SECTION TEXT NOT FOUND!')
+ else:
+ tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')
+ keywords_line = article_root.find('div', class_='widget widget-tags mb-5')
+ if keywords_line is not None:
+ keywords_sections = [a.text.strip() for a in section_line.find_all('a') if a is not None]
+ if keywords_sections is not None:
+ data['sch:keywords'] = keywords_sections
+ else:
+ tei_logger.log('DEBUG', f'{url}: KEYWORDS TEXT NOT FOUND!')
+ else:
+ tei_logger.log('DEBUG', f'{url}: KEYWORDS TAG NOT FOUND!')
+ # else: tei_logger.log('WARNING', f'{url}: TAGS NOT FOUND!')
+ return data
+ # tei_logger.log('WARNING', f'{url}: METADATA CONTAINER NOT FOUND!')
+ # tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
+ # return None
+ else:
+ tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
+ return None
+
+
+# Rename tags that are not listed in HTML_BASICS to the catch-all name 'else', reset tag attributes to an
+# empty dict and return the same (mutated) tag object.
+def excluded_tags_spec(tag):
+ # NOTE(review): the indentation of this hunk is flattened, so it cannot be told from here whether the
+ # attribute reset below runs for every tag or only for the renamed ones -- confirm against the real file.
+ if tag.name not in HTML_BASICS:
+ tag.name = 'else'
+ tag.attrs = {}
+ return tag
+
+
+BLOCK_RULES_SPEC = {}
+BIGRAM_RULES_SPEC = {}
+LINKS_SPEC = BASIC_LINK_ATTRS
+DECOMP = [(('div',), {'class': 'widget'}),
+ (('div',), {'class': 'widget-tamogatas-box'}),
+ (('div',), {'class': 'banner-wrapper'}),
+ (('div',), {'id': 'videoad'}),
+ (('div',), {'class': 'widget widget-tamogatas-box'}),
+ (('div',), {'class': 'banner-wrapper'}),
+ (('div',), {'class': 'img-container'}),
+ (('div',), {'id': 'pa_videoslider'}),
+ (('div',), {'id': 'widget-image'}),
+ (('div',), {'class': 'sidebar-col'}),
+ (('div',), {'class': 'entry-meta'}),
+ (('div',), {'class': 'entry-image'}),
+ (('div',), {'class': 'oygrvhab'}),
+ (('script',), {}),
+ (('iframe',), {})]
+
+MEDIA_LIST = []
+
+
+def decompose_spec(article_dec):
+ decompose_listed_subtrees_and_mark_media_descendants(article_dec, DECOMP, MEDIA_LIST)
+ return article_dec
+
+
+# Article URLs listed explicitly for exclusion; the single entry is a page that holds just a picture.
+# NOTE(review): presumably the html2tei core skips these URLs -- confirm against the caller.
+BLACKLIST_SPEC = ['https://hang.hu/kultura/reg-keszult-olyan-felkavaro-kamaszokrol-szolo-sorozat-mint-az-euforia-'
+ '107933'] # just a picture
+
+# Placeholder pattern marked as dummy by the author; next_page_of_article_spec() always returns None as well.
+MULTIPAGE_URL_END = re.compile(r'^\b$') # Dummy
+
+
+def next_page_of_article_spec(_):
+ return None
diff --git a/configs/hang/hang_text_tags_normal.tsv b/configs/hang/hang_text_tags_normal.tsv
new file mode 100644
index 0000000..81914aa
--- /dev/null
+++ b/configs/hang/hang_text_tags_normal.tsv
@@ -0,0 +1,29 @@
+frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
+default
default default default no_example bekezdes default
+default
default default default no_example cimsor default
+default default default default no_example cimsor default
+default default default default no_example cimsor default
+default default default default no_example cimsor default
+default default default default no_example felkover default
+default default default default no_example felkover default
+default default default default no_example kiemelt default
+default default default default no_example dolt default
+default default default default no_example kiemelt default
+default default default default no_example alsoindex default
+default default default default no_example felsoindex default
+default default default default no_example athuzott default
+default default default default no_example alahuzott default
+default default default default no_example athuzott default
+default default default default no_example lista default
+default default default default no_example lista default
+default - default default default no_example listaelem default
+default
default default default no_example table_text default
+default default default default no_example sor default
+default | default default default no_example oszlop default
+default | default default default no_example oszlop default
+default default default default no_example idezet default
+default default default default no_example media_tartalom default
+default |