Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions configs/hang/hang_BASE.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<!-- Title statement of the TEI document; the title is empty in this template.
NOTE(review): presumably filled in per article by the conversion code; confirm. -->
<titleStmt>
<title></title>
</titleStmt>
<!-- Edition statement: names the crawling edition and lists the responsible
organisation and persons, each person with an ORCID link. -->
<editionStmt>
<edition>ELTE-DH webcrawling</edition>
<respStmt>
<resp>creator</resp>
<orgName>ELTE-DH<ref type="url">http://elte-dh.hu</ref>
</orgName>
</respStmt>
<respStmt>
<resp>project director</resp>
<persName><surname>Palkó</surname><forename>Gábor</forename>
<ref>https://orcid.org/0000-0002-4394-8577</ref></persName>
</respStmt>
<respStmt>
<resp>chief programmer</resp>
<persName><surname>Indig</surname><forename>Balázs</forename>
<ref>https://orcid.org/0000-0001-8090-3661</ref></persName>
</respStmt>
<respStmt>
<resp>TEI expert</resp>
<persName><surname>Fellegi</surname><forename>Zsófia</forename>
<ref>https://orcid.org/0000-0001-9199-1759</ref></persName>
</respStmt>
<respStmt>
<resp>programmer</resp>
<persName><surname>Sárközi-Lindner</surname><forename>Zsófia</forename>
<ref>https://orcid.org/0000-0002-2558-0633</ref></persName>
</respStmt>
</editionStmt>
<!-- Publication statement of the TEI edition: publisher, place, year and two
rights statements (one for the metadata, one for the text). The PID idno is
empty in this template. -->
<publicationStmt>
<publisher>
<orgName>ELTE-DH</orgName>
<ref type="url">http://elte-dh.hu/</ref>
</publisher>
<pubPlace>Budapest <ref type="url">http://www.geonames.org/3054643</ref>
</pubPlace>
<date>2020</date>
<availability>
<p>Metadata: IN COPYRIGHT - NON-COMMERCIAL USE PERMITTED<ref type="url"
>http://rightsstatements.org/vocab/InC-NC/1.0/</ref></p>
<p>Text: IN COPYRIGHT <ref type="url"
>http://rightsstatements.org/vocab/InC/1.0/</ref>
</p>
</availability>
<idno type="PID"></idno>
</publicationStmt>
<!-- Description of the crawled source: publisher and rights of the portal, both
citing the portal's imprint page with the date 2021-12-10. The bibl title and
the final date element are empty in this template. -->
<sourceDesc>
<bibl>
<title></title>
<publisher><orgName>Alhambra Press Bt.</orgName>
<placeName>1034 Budapest, Szomolnok u. 7. I. em. 3.</placeName>
<ref type="url" source="https://hang.hu/informacio/impresszum-25069"></ref>
<date when="2021-12-10"/>
</publisher>
<pubPlace>
Budapest
<ref type="url">http://www.geonames.org/3054643</ref>
</pubPlace>
<availability><p>Minden jog fenntartva © 2018-2021 - Magyar Hang</p>
<p><ref type="url" source="https://hang.hu/informacio/impresszum-25069"></ref>
<date when="2021-12-10"/></p>
</availability>
<date></date><!-- publication date of the article goes into the 'when' attribute / unknown (filled in by the code) -->
</bibl>
</sourceDesc>
</fileDesc>
<!-- RDF description of the article itself: type, containing portal, language
and licence. rdf:about is empty in this template. -->
<xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:sch="https://schema.org" xmlns:skos="http://www.w3.org/2008/05/skos-xl#">
<rdf:RDF>
<rdf:Description rdf:about="">
<sch:type rdf:resource="https://schema.org/NewsArticle"/>
<sch:ispartOf rdf:resource="https://hang.hu/">Magyar Hang</sch:ispartOf>
<sch:inLanguage>hun</sch:inLanguage>
<!-- the remaining metadata extracted from the article is filled in here -->
<sch:license rdf:resource="http://rightsstatements.org/vocab/InC-EDU/1.0/">In Copyright</sch:license>
</rdf:Description>
</rdf:RDF>
</xenoData>
<!-- RDF description of the WARC archive: format, crawl time span and
identifiers. Several values are still to be filled in (see the FILL IN notes). -->
<xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:sch="https://schema.org">
<rdf:RDF>
<rdf:Description rdf:about=""><!-- FILL IN: about: Zenodo link, e.g. https://doi.org/10.5281/zenodo.3974489 -->
<sch:type>WARC/1.1</sch:type>
<sch:sdDatePublished>
2021-09-02 19:50:07/2021-09-06 18:31:37
</sch:sdDatePublished>
<sch:identifier rdf:about=""/><!-- FILL IN: same value as the rdf:about of the enclosing rdf:Description, e.g. https://doi.org/10.5281/zenodo.3974489 -->
<sch:identifier><!-- FILL IN: ZENODO HASH --></sch:identifier>
<sch:identifier>urn:uuid:5ec17312-d7ee-4f3b-801e-4557ffa4a463</sch:identifier> </rdf:Description>
</rdf:RDF>
</xenoData>
<!-- RDF description of the TEI file: identifier, dates, contributors (with ORCID
links) and licence. Identifier and date elements are empty in this template. -->
<xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:sch="https://schema.org">
<rdf:RDF>
<rdf:Description rdf:about="teiPid">
<sch:identifier></sch:identifier>
<sch:type>TEI</sch:type>
<sch:sdDatePublished></sch:sdDatePublished>
<sch:lastReviewed></sch:lastReviewed><!-- date of the WARC/article response (supplied by response_warc_record_gen) -->
<sch:contributor rdf:resource="https://orcid.org/0000-0002-4394-8577">Palkó Gábor</sch:contributor>
<sch:contributor rdf:resource="https://orcid.org/0000-0001-8090-3661">Indig Balázs</sch:contributor>
<sch:contributor rdf:resource="https://orcid.org/0000-0001-9199-1759">Fellegi Zsófia</sch:contributor>
<sch:contributor rdf:resource="https://orcid.org/0000-0002-2558-0633">Sárközi-Lindner Zsófia</sch:contributor>
<sch:license rdf:resource="http://rightsstatements.org/vocab/InC/1.0/"/>
</rdf:Description>
</rdf:RDF>
</xenoData>
<!-- Revision history with a single entry recording the creation of the TEI file.
NOTE(review): the placeholder is spelled 'teiPID' here but 'teiPid' in the
rdf:about of the TEI xenoData description; confirm which spelling is expected. -->
<revisionDesc>
<change source="teiPID">TEI file created</change>
</revisionDesc>
</teiHeader>
<!-- Text part of the document; the body is empty in this template. -->
<text>
<body>
</body>
</text>
</TEI>
29 changes: 29 additions & 0 deletions configs/hang/hang_notext_tags_normal.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
default <p> default default default no_example bekezdes default
default <h2> default default default no_example cimsor default
default <h3> default default default no_example cimsor default
default <h4> default default default no_example cimsor default
default <h5> default default default no_example cimsor default
default <b> default default default no_example felkover default
default <strong> default default default no_example felkover default
default <em> default default default no_example kiemelt default
default <i> default default default no_example dolt default
default <mark> default default default no_example kiemelt default
default <sub> default default default no_example alsoindex default
default <sup> default default default no_example felsoindex default
default <del> default default default no_example athuzott default
default <u> default default default no_example alahuzott default
default <strike> default default default no_example athuzott default
default <ul> default default default no_example lista default
default <ol> default default default no_example lista default
default <li> default default default no_example listaelem default
default <table> default default default no_example table_text default
default <tr> default default default no_example sor default
default <td> default default default no_example oszlop default
default <th> default default default no_example oszlop default
default <quote> default default default no_example idezet default
default <figure> default default default no_example media_tartalom default
default <iframe> default default default no_example beagyazott_tartalom default
default <else> default default default no_example default default
default <script> default default default no_example default default
default <noscript> default default default no_example default default
133 changes: 133 additions & 0 deletions configs/hang/hang_specific.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*

import re

from html2tei import parse_date, BASIC_LINK_ATTRS, decompose_listed_subtrees_and_mark_media_descendants, tei_defaultdict

# Scheme and host of the portal this configuration belongs to.
PORTAL_URL_PREFIX = 'https://hang.hu'

# (tag names, attributes) selectors for the article root; the two entries differ
# only in the extra "wide" class of the second one.
ARTICLE_ROOT_PARAMS_SPEC = [
    (('article',), {'class': 'post-wrap post-single mt-3 icms-content'}),
    (('article',), {'class': 'post-wrap post-single wide mt-3 icms-content'}),
]

# Tag names that excluded_tags_spec() keeps under their own name.
HTML_BASICS = {
    # block level text
    'p', 'h2', 'h3', 'h4', 'h5', 'quote',
    # inline formatting
    'b', 'strong', 'em', 'i', 'mark', 'u', 'sub', 'sup', 'del', 'strike',
    # lists
    'ul', 'ol', 'li',
    # tables
    'table', 'tr', 'td', 'th',
    # media and embedded content
    'figure', 'iframe',
}


def _stripped_link_texts(container):
    """Return the non-empty, whitespace-stripped texts of every <a> tag found under container."""
    stripped_texts = (a_tag.text.strip() for a_tag in container.find_all('a'))
    return [text for text in stripped_texts if len(text) > 0]


def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect the metadata of one article page into a tei_defaultdict.

    Each field is looked up independently inside the first <article> tag; a field
    that cannot be found is reported through tei_logger and left unset, it never
    aborts the extraction of the remaining fields. No modification date is
    extracted.

    :param tei_logger: logger object, used as tei_logger.log(level, message)
    :param url: URL of the article; stored under 'sch:url' and used as prefix of every log message
    :param bs: parsed page; queried with find('article')
    :return: the filled dictionary (possible keys: 'sch:url', 'sch:datePublished', 'sch:author' or
        'sch:source', 'sch:name', 'sch:articleSection', 'sch:keywords'),
        or None if the page contains no <article> tag
    """
    data = tei_defaultdict()
    data['sch:url'] = url
    article_root = bs.find('article')
    if article_root is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    # Publication date: read from the second <li> of the 'd-flex flex-row' list.
    date_list = article_root.find('ul', class_='d-flex flex-row')
    date_items = date_list.find_all('li') if date_list is not None else []
    if len(date_items) > 1:  # Guard the [1] index instead of raising IndexError on short lists
        article_date_text = date_items[1].text
        if len(article_date_text.strip()) > 0:
            # The text is handed over unmodified, exactly as before
            parsed_date = parse_date(article_date_text, '%Y. %B %d., %H:%M')
            if parsed_date is not None:
                data['sch:datePublished'] = parsed_date
            else:
                tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
        else:
            tei_logger.log('WARNING', f'{url}: DATE TEXT NOT FOUND!')
    else:
        tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

    # Author(s): the portal's own name is stored as source, anything else as a list of authors.
    author_tag = article_root.find('span', class_='author-name')
    if author_tag is not None:
        author_text = author_tag.text.strip()
        if author_text == 'Magyar Hang':
            data['sch:source'] = [author_text]
        else:
            # Comma separated names; strip each one and drop the empty pieces
            authors = [name.strip() for name in author_text.split(',') if len(name.strip()) > 0]
            if len(authors) > 0:
                data['sch:author'] = authors
            else:
                tei_logger.log('WARNING', f'{url}: AUTHOR TEXT NOT FOUND!')
    else:
        tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')

    # Title: the <h1> inside the 'entry-title' container (the <h1> itself may be missing).
    name_tag = article_root.find('div', class_='entry-title')
    if name_tag is not None:
        title_heading = name_tag.h1
        name_text = title_heading.text.strip() if title_heading is not None else ''
        if len(name_text) > 0:
            data['sch:name'] = name_text
        else:
            tei_logger.log('WARNING', f'{url}: TITLE TEXT NOT FOUND IN URL!')
    else:
        tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND IN URL!')

    # Sections: link texts of the 'entry-meta' line.
    section_line = article_root.find('div', class_='entry-meta')
    if section_line is not None:
        sections = _stripped_link_texts(section_line)
        if len(sections) > 0:
            data['sch:articleSection'] = sections
        else:
            tei_logger.log('DEBUG', f'{url}: SECTION TEXT NOT FOUND!')
    else:
        tei_logger.log('DEBUG', f'{url}: SECTION TAG NOT FOUND!')

    # Keywords: link texts of the tag widget (read from keywords_line, not from the section line).
    keywords_line = article_root.find('div', class_='widget widget-tags mb-5')
    if keywords_line is not None:
        keywords = _stripped_link_texts(keywords_line)
        if len(keywords) > 0:
            data['sch:keywords'] = keywords
        else:
            tei_logger.log('DEBUG', f'{url}: KEYWORDS TEXT NOT FOUND!')
    else:
        tei_logger.log('DEBUG', f'{url}: KEYWORDS TAG NOT FOUND!')

    return data


def excluded_tags_spec(tag):
    """Normalise one tag: rename it to 'else' unless its name is in HTML_BASICS, and drop its attributes.

    The tag object is modified and the same object is returned.
    """
    is_basic_tag = tag.name in HTML_BASICS
    if not is_basic_tag:
        tag.name = 'else'
    # NOTE(review): indentation was lost in the reviewed copy; the attributes are cleared for every tag here,
    # not only for the renamed ones. Confirm against the original file.
    tag.attrs = {}
    return tag


# No portal-specific block rules or bigram rules are defined here: both mappings are empty.
BLOCK_RULES_SPEC = {}
BIGRAM_RULES_SPEC = {}
# The link attributes are taken unchanged from BASIC_LINK_ATTRS, imported from html2tei.
LINKS_SPEC = BASIC_LINK_ATTRS
# (tag names, attributes) selectors handed to decompose_listed_subtrees_and_mark_media_descendants()
# by decompose_spec(). Every selector is listed exactly once.
DECOMP = [(('div',), {'class': 'widget'}),
          (('div',), {'class': 'widget-tamogatas-box'}),
          (('div',), {'class': 'banner-wrapper'}),
          (('div',), {'id': 'videoad'}),
          (('div',), {'class': 'widget widget-tamogatas-box'}),
          (('div',), {'class': 'img-container'}),
          (('div',), {'id': 'pa_videoslider'}),
          (('div',), {'id': 'widget-image'}),
          (('div',), {'class': 'sidebar-col'}),
          (('div',), {'class': 'entry-meta'}),
          (('div',), {'class': 'entry-image'}),
          (('div',), {'class': 'oygrvhab'}),
          (('script',), {}),
          (('iframe',), {})]

# Third argument of the same helper call; empty for this portal.
MEDIA_LIST = []


def decompose_spec(article_dec):
    """Run the html2tei decomposing helper on article_dec with this portal's DECOMP and MEDIA_LIST.

    The return value of the helper is ignored; the object passed in is what gets returned.
    """
    decompose_listed_subtrees_and_mark_media_descendants(article_dec, DECOMP, MEDIA_LIST)
    return article_dec


# Blacklisted article URLs; the only entry is an article that is just a picture.
BLACKLIST_SPEC = [
    'https://hang.hu/kultura/reg-keszult-olyan-felkavaro-kamaszokrol-szolo-sorozat-mint-az-euforia-107933',
]

# Dummy pattern: no string can satisfy it.
MULTIPAGE_URL_END = re.compile(r'^\b$')


def next_page_of_article_spec(_):
    """Return None for every input: this configuration never reports a next page."""
    return None
29 changes: 29 additions & 0 deletions configs/hang/hang_text_tags_normal.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
default <p> default default default no_example bekezdes default
default <h2> default default default no_example cimsor default
default <h3> default default default no_example cimsor default
default <h4> default default default no_example cimsor default
default <h5> default default default no_example cimsor default
default <b> default default default no_example felkover default
default <strong> default default default no_example felkover default
default <em> default default default no_example kiemelt default
default <i> default default default no_example dolt default
default <mark> default default default no_example kiemelt default
default <sub> default default default no_example alsoindex default
default <sup> default default default no_example felsoindex default
default <del> default default default no_example athuzott default
default <u> default default default no_example alahuzott default
default <strike> default default default no_example athuzott default
default <ul> default default default no_example lista default
default <ol> default default default no_example lista default
default <li> default default default no_example listaelem default
default <table> default default default no_example table_text default
default <tr> default default default no_example sor default
default <td> default default default no_example oszlop default
default <th> default default default no_example oszlop default
default <quote> default default default no_example idezet default
default <figure> default default default no_example media_tartalom default
default <iframe> default default default no_example beagyazott_tartalom default
default <else> default default default no_example default default
default <script> default default default no_example default default
default <noscript> default default default no_example default default