Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions configs/vadhajtasok/vadhajtasok_BASE.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml"
            schematypens="http://purl.oclc.org/dsdl/schematron"?>
<!--
  TEI header template for articles of the portal https://www.vadhajtasok.hu/.
  Empty elements and the PLACEHOLDER / KITÖLT (Hungarian for "fill in") markers
  still have to be completed.
-->
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title></title>
      </titleStmt>
      <editionStmt>
        <edition>ELTE-DH webcrawling</edition>
        <respStmt>
          <resp>creator</resp>
          <!-- The URL is the destination of the reference, so it is carried by @target, not @type. -->
          <orgName>ELTE-DH<ref target="http://elte-dh.hu"/></orgName>
        </respStmt>
        <respStmt>
          <resp>project director</resp>
          <persName><surname>Palkó</surname><forename>Gábor</forename>
            <ref>https://orcid.org/0000-0002-4394-8577</ref></persName>
        </respStmt>
        <respStmt>
          <resp>chief programmer</resp>
          <persName><surname>Indig</surname><forename>Balázs</forename>
            <ref>https://orcid.org/0000-0001-8090-3661</ref></persName>
        </respStmt>
        <respStmt>
          <resp>TEI expert</resp>
          <persName><surname>Fellegi</surname><forename>Zsófia</forename>
            <ref>https://orcid.org/0000-0001-9199-1759</ref></persName>
        </respStmt>
        <respStmt>
          <resp>programmer</resp>
          <persName><surname>Sárközi-Lindner</surname><forename>Zsófia</forename>
            <ref>https://orcid.org/0000-0002-2558-0633</ref></persName>
        </respStmt>
      </editionStmt>
      <publicationStmt>
        <publisher>
          <orgName>ELTE-DH</orgName>
          <ref type="url">http://elte-dh.hu/</ref>
        </publisher>
        <pubPlace>Budapest <ref type="url">http://www.geonames.org/3054643</ref>
        </pubPlace>
        <date>2020</date>
        <availability>
          <p>Metadata: IN COPYRIGHT - NON-COMMERCIAL USE PERMITTED<ref type="url"
              >http://rightsstatements.org/vocab/InC-NC/1.0/</ref></p>
          <p>Text: IN COPYRIGHT <ref type="url"
              >http://rightsstatements.org/vocab/InC/1.0/</ref>
          </p>
        </availability>
        <idno type="PID"></idno>
      </publicationStmt>
      <sourceDesc>
        <bibl>
          <title></title>
          <publisher><orgName>PLACEHOLDER<!-- KITÖLT (FILL IN): publisher as named in the imprint --></orgName>
            <placeName>PLACEHOLDER<!-- KITÖLT (FILL IN): seat of the publisher --></placeName>
            <ref type="url" source="https://www.vadhajtasok.hu/impresszum"></ref>
            <date when="2022-02-15"/>
          </publisher>
          <pubPlace>
            PLACEHOLDER<!-- KITÖLT (FILL IN): e.g. Budapest -->
            <ref type="url">PLACEHOLDER<!-- KITÖLT (FILL IN): e.g. http://www.geonames.org/3054643 --></ref>
          </pubPlace>
          <availability><p>© 2018 Vadhajtások.hu</p>
            <p>
              <ref type="url" source="https://www.vadhajtasok.hu/impresszum">
                <!-- KITÖLT (FILL IN): https://doi.org/... --></ref>
              <date when="2022-02-15"/><!-- when: date on which the imprint / legal notice page was crawled -->
            </p>
          </availability>
          <date></date>
        </bibl>
      </sourceDesc>
    </fileDesc>
    <!-- Description of the source article; rdf:about is left empty in this template. -->
    <xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
              xmlns:sch="https://schema.org" xmlns:skos="http://www.w3.org/2008/05/skos-xl#">
      <rdf:RDF>
        <rdf:Description rdf:about="">
          <sch:type rdf:resource="https://schema.org/NewsArticle"/>
          <sch:isPartOf rdf:resource="https://www.vadhajtasok.hu/">Vadhajtasok</sch:isPartOf>
          <sch:inLanguage>hun</sch:inLanguage>
          <!-- NOTE(review): publicationStmt/availability cites InC-NC for the metadata while this points to InC-EDU; confirm which statement is intended. -->
          <sch:license rdf:resource="http://rightsstatements.org/vocab/InC-EDU/1.0/">In Copyright</sch:license>
        </rdf:Description>
      </rdf:RDF>
    </xenoData>
    <!-- Description of the WARC archive. -->
    <xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
              xmlns:sch="https://schema.org">
      <rdf:RDF>
        <rdf:Description rdf:about=""><!-- KITÖLT (FILL IN): about = Zenodo link, e.g. https://doi.org/10.5281/zenodo.3974489 -->
          <sch:type>WARC/1.1</sch:type>
          <sch:sdDatePublished>2021-12-13T12:32:22/2021-12-13T23:19:44</sch:sdDatePublished>
          <sch:identifier rdf:about=""/><!-- KITÖLT (FILL IN): same value as rdf:about of the enclosing rdf:Description, e.g. https://doi.org/10.5281/zenodo.3974489 -->
          <sch:identifier><!-- KITÖLT (FILL IN): Zenodo hash --></sch:identifier>
          <sch:identifier>b0d41bc0-01bb-4845-b7bd-45f95be24f0e</sch:identifier>
        </rdf:Description>
      </rdf:RDF>
    </xenoData>
    <!-- Description of the TEI file itself. -->
    <xenoData xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
              xmlns:sch="https://schema.org">
      <rdf:RDF>
        <rdf:Description rdf:about="teiPid">
          <sch:identifier></sch:identifier>
          <sch:type>TEI</sch:type>
          <sch:sdDatePublished></sch:sdDatePublished>
          <sch:lastReviewed></sch:lastReviewed>
          <sch:contributor rdf:resource="https://orcid.org/0000-0002-4394-8577">Palkó Gábor</sch:contributor>
          <sch:contributor rdf:resource="https://orcid.org/0000-0001-8090-3661">Indig Balázs</sch:contributor>
          <sch:contributor rdf:resource="https://orcid.org/0000-0001-9199-1759">Fellegi Zsófia</sch:contributor>
          <sch:contributor rdf:resource="https://orcid.org/0000-0002-2558-0633">Sárközi-Lindner Zsófia</sch:contributor>
          <sch:license rdf:resource="http://rightsstatements.org/vocab/InC/1.0/"/>
        </rdf:Description>
      </rdf:RDF>
    </xenoData>
    <revisionDesc>
      <change source="teiPID">TEI file created</change>
    </revisionDesc>
  </teiHeader>
  <text>
    <body>
    </body>
  </text>
</TEI>
29 changes: 29 additions & 0 deletions configs/vadhajtasok/vadhajtasok_notext_tags_normal.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
default <p> default default default no_example bekezdes default
default <h2> default default default no_example cimsor default
default <h3> default default default no_example cimsor default
default <h4> default default default no_example cimsor default
default <h5> default default default no_example cimsor default
default <b> default default default no_example felkover default
default <strong> default default default no_example felkover default
default <em> default default default no_example kiemelt default
default <i> default default default no_example dolt default
default <mark> default default default no_example kiemelt default
default <sub> default default default no_example alsoindex default
default <sup> default default default no_example felsoindex default
default <del> default default default no_example athuzott default
default <u> default default default no_example alahuzott default
default <strike> default default default no_example athuzott default
default <ul> default default default no_example lista default
default <ol> default default default no_example lista default
default <li> default default default no_example listaelem default
default <table> default default default no_example table_text default
default <tr> default default default no_example sor default
default <td> default default default no_example oszlop default
default <th> default default default no_example oszlop default
default <quote> default default default no_example idezet default
default <figure> default default default no_example media_tartalom default
default <iframe> default default default no_example beagyazott_tartalom default
default <else> default default default no_example default default
default <script> default default default no_example decompose default
default <noscript> default default default no_example decompose default
102 changes: 102 additions & 0 deletions configs/vadhajtasok/vadhajtasok_specific.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

import re

from html2tei import parse_date, BASIC_LINK_ATTRS, decompose_listed_subtrees_and_mark_media_descendants, tei_defaultdict

# Root URL of the portal this configuration belongs to.
PORTAL_URL_PREFIX = 'https://www.vadhajtasok.hu/'

# (tag names, attributes) pairs for the article root; the same <div> class is the one
# get_meta_from_articles_spec looks up as the article body.
ARTICLE_ROOT_PARAMS_SPEC = [
    (('div',), {'class': 'entry-content content-article'}),
]

# Tag names excluded_tags_spec leaves untouched; every other tag name is rewritten to 'else'.
HTML_BASICS = {
    # block level text
    'p', 'h2', 'h3', 'h4', 'h5',
    # inline formatting
    'b', 'strong', 'em', 'i', 'mark', 'u', 'sub', 'sup', 'del', 'strike',
    # lists
    'ul', 'ol', 'li',
    # tables
    'table', 'tr', 'td', 'th',
    # quotations, media and embedded content
    'quote', 'figure', 'iframe',
    # scripting
    'script', 'noscript',
}


def get_meta_from_articles_spec(tei_logger, url, bs):
    """Collect the metadata of one article page into a ``tei_defaultdict``.

    The following keys are filled when the matching element is present in the page:
    ``sch:url``, ``sch:articleSection``, ``sch:datePublished``, ``sch:dateModified``,
    ``sch:name`` and ``sch:keywords``. Missing elements are reported through ``tei_logger``
    and never raise.

    :param tei_logger: logger object; only its ``log(level, message)`` method is used
    :param url: URL of the article, stored under ``sch:url`` and used as prefix of log messages
    :param bs: parsed page; only ``bs.find`` is called on it (presumably a BeautifulSoup
        object - confirm against the caller)
    :return: the filled dictionary, or ``None`` when the article body container is not found
    """
    data = tei_defaultdict()
    data['sch:url'] = url

    # Without the body container the page is not handled as an article at all.
    if bs.find('div', class_='entry-content content-article') is None:
        tei_logger.log('WARNING', f'{url}: ARTICLE BODY NOT FOUND!')
        return None

    # Publication and modification dates share the same textual format.
    date_format = '%Y. %B %d. - %H:%M'

    info_root = bs.find('div', class_='cs-entry__header-info')
    if info_root is not None:
        # Section name and publication date live in the same meta container.
        section_date_tag = info_root.find('div', class_='cs-entry__post-meta f16 meta-tiny')
        if section_date_tag is not None:
            section_main = section_date_tag.find('a', class_='meta-categories')
            if section_main is not None:
                data['sch:articleSection'] = section_main.text.strip()
            else:
                tei_logger.log('WARNING', f'{url}: SECTION TAG NOT FOUND!')

            date_tag = section_date_tag.find('div', class_='cs-meta-date',
                                             string=re.compile(r'\d{4}\. .*\d{2}:\d{2}'))
            if date_tag is not None:
                parsed_date = parse_date(date_tag.text.strip(), date_format)
                if parsed_date is not None:
                    data['sch:datePublished'] = parsed_date
                else:
                    tei_logger.log('WARNING', f'{url}: DATE TEXT FORMAT ERROR!')
            else:
                tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
        else:
            tei_logger.log('WARNING', f'{url}: SECTION & DATE TAG NOT FOUND!')

        # The modification badge is optional, so its absence is not reported.
        modified_date_tag = info_root.find('span', class_='pk-badge pk-badge-no-danger',
                                           string=re.compile(r'Frissítve! - \d{4}\. .*\d{2}:\d{2}'))
        if modified_date_tag is not None:
            modified_date_text = modified_date_tag.text.replace('Frissítve! - ', '').strip()
            parsed_modified_date = parse_date(modified_date_text, date_format)
            if parsed_modified_date is not None:
                data['sch:dateModified'] = parsed_modified_date
            else:
                tei_logger.log('WARNING', f'{url}: MODIFIED DATE TEXT FORMAT ERROR!')

        # The title text sits in a <span> inside the <h1>; either of them may be missing,
        # so both lookups are guarded before touching ``.text``.
        title = info_root.find('h1', class_='cs-entry__title')
        article_title = title.find('span') if title is not None else None
        if article_title is not None:
            data['sch:name'] = article_title.text.strip()
        else:
            tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
    else:
        # Report the missing header instead of silently returning a nearly empty record.
        tei_logger.log('WARNING', f'{url}: ARTICLE HEADER INFO NOT FOUND!')

    # Keywords: a missing list and a list without any tag link are reported the same way.
    keywords_root = bs.find('ul', class_='post-categories')
    keywords_list = []
    if keywords_root is not None:
        keywords_list = [t.text.strip() for t in keywords_root.find_all('a', class_='news-tag')]
    if len(keywords_list) > 0:
        data['sch:keywords'] = keywords_list
    else:
        tei_logger.log('DEBUG', f'{url}: TAGS NOT FOUND!')
    return data


def excluded_tags_spec(tag):
    """Normalize a tag in place and hand it back.

    A tag whose name is not listed in ``HTML_BASICS`` is renamed to ``'else'``; the attributes
    of the tag are replaced by an empty dictionary in every case.

    :param tag: object exposing writable ``name`` and ``attrs`` attributes
    :return: the very same ``tag`` object after modification
    """
    is_basic = tag.name in HTML_BASICS
    if not is_basic:
        tag.name = 'else'
    tag.attrs = {}
    return tag


# No portal specific block or bigram rule is defined: both mappings are empty.
BLOCK_RULES_SPEC = dict()
BIGRAM_RULES_SPEC = dict()
# The link attributes are taken over unchanged from html2tei.
LINKS_SPEC = BASIC_LINK_ATTRS
# Both lists hold (tag names, attributes) pairs and are handed to
# decompose_listed_subtrees_and_mark_media_descendants by decompose_spec.
DECOMP = [
    (('script',), {}),
]

# Four tags are matched by name alone, followed by the rendered Twitter embed container.
MEDIA_LIST = [((tag_name,), {}) for tag_name in ('table', 'figure', 'img', 'iframe')] + [
    (('div',), {'class': 'twitter-tweet twitter-tweet-rendered'}),
]


def decompose_spec(article_dec):
    """Run the shared html2tei helper on the article with this portal's lists.

    :param article_dec: article tree; it is passed to
        ``decompose_listed_subtrees_and_mark_media_descendants`` together with ``DECOMP``
        and ``MEDIA_LIST``
    :return: the same ``article_dec`` object that was passed in
    """
    decompose_listed_subtrees_and_mark_media_descendants(article_dec, DECOMP, MEDIA_LIST)
    return article_dec


# Nothing is listed here for this portal.
BLACKLIST_SPEC = list()

# Alternation built from a single dummy substring.
LINK_FILTER_SUBSTRINGS_SPEC = re.compile(
    '|'.join(('LINK_FILTER_DUMMY_STRING',))
)

# Dummy pattern.
MULTIPAGE_URL_END = re.compile(r'^\b$')


def next_page_of_article_spec(_):
    """Always answer ``None``; the single argument is ignored."""
    next_page = None
    return next_page
29 changes: 29 additions & 0 deletions configs/vadhajtasok/vadhajtasok_text_tags_normal.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
frequency tag average_word_count average_descendant_num immediate_texts_average_length URL_example normal_name preserved_attribute
default <p> default default default no_example bekezdes default
default <h2> default default default no_example cimsor default
default <h3> default default default no_example cimsor default
default <h4> default default default no_example cimsor default
default <h5> default default default no_example cimsor default
default <b> default default default no_example felkover default
default <strong> default default default no_example felkover default
default <em> default default default no_example kiemelt default
default <i> default default default no_example dolt default
default <mark> default default default no_example kiemelt default
default <sub> default default default no_example alsoindex default
default <sup> default default default no_example felsoindex default
default <del> default default default no_example athuzott default
default <u> default default default no_example alahuzott default
default <strike> default default default no_example athuzott default
default <ul> default default default no_example lista default
default <ol> default default default no_example lista default
default <li> default default default no_example listaelem default
default <table> default default default no_example table_text default
default <tr> default default default no_example sor default
default <td> default default default no_example oszlop default
default <th> default default default no_example oszlop default
default <quote> default default default no_example idezet default
default <figure> default default default no_example media_tartalom default
default <iframe> default default default no_example beagyazott_tartalom default
default <else> default default default no_example default default
default <script> default default default no_example decompose default
default <noscript> default default default no_example decompose default