Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions configs/alfahir/alfahir_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
'hvg.hu - barikad.hu', 'MTI, Népszabadság'
]

def author_source_norm(extracted_meta):
    """Split a raw author/source string into a list of trimmed names.

    The text is cut at commas, at slashes, and at hyphens or en dashes that
    have a single space on both sides. Pieces that are empty after trimming
    are dropped, so a hyphen inside a name (no surrounding spaces) is kept.

    :param extracted_meta: raw text to split (a str)
    :return: list of the non-empty, trimmed pieces in their original order
    """
    # Trim every piece exactly once, then keep only the non-empty ones.
    pieces = (piece.strip() for piece in re.split(',| - |/| – ', extracted_meta))
    return [piece for piece in pieces if piece]


def get_meta_from_articles_spec(tei_logger, url, bs):
data = tei_defaultdict()
Expand Down Expand Up @@ -194,7 +197,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if source_in_text_tag is not None:
source_in_text_tag_text = source_in_text_tag.find('div', class_='field--item').get_text(strip=True)
if source_in_text_tag_text is not None:
data['sch:source'] = source_in_text_tag_text
data['sch:source'] = author_source_norm(source_in_text_tag_text)
if len(data['sch:source'])> 1:
data['originalAuthorString'] = source_in_text_tag_text

# Sometimes implicitly inserted into a <p> tag
else:
Expand Down Expand Up @@ -222,7 +227,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if len(source_in_text_4) < 40:
source_text = source_in_text_4.strip()
if source_text in SOURCE_LIST: # Above code allows minimal mistakes - invalid sources are filtered
data['sch:source'] = [source_text]
data['sch:source'] = author_source_norm(source_text)
if len(data['sch:source'])> 1:
data['originalAuthorString'] = source_text
data['AuthorString_extracted_from_content'] = source_text
else:
tei_logger.log('DEBUG', f'{url}: SOURCE TAG NOT FOUND!')

Expand Down
39 changes: 33 additions & 6 deletions configs/hvg/hvg_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,24 @@
'MTI/Népszava', 'MTI/dpa/Hszinhua', 'OTS/MTI', 'BBC/MTI', 'MTI/Bors', 'MTI/Reuters/AP', 'MTI/AP',
'MTI/AFP/Reuters', 'MTI/Reuters/Hszinhua', 'MTI/Blikk', 'HVG/MTI']

# Single creator names that count as a source rather than an author: every piece
# returned by author_source_norm() is looked up in this set to decide whether it
# is stored under 'sch:source' (member) or 'sch:author' (non-member).
# NOTE(review): author_source_norm() in this file trims each piece and splits on a
# bare '-', so entries padded with whitespace ('MTI ', ' hvg.hu') or containing a
# hyphen (e.g. 'MTI-OS', 'f1-live.hu', 'mult-kor.hu') can never equal a piece it
# produces - confirm whether these entries are still needed or the splitter
# should be adjusted.
SOURCE_NORM = {'EFE', 'HavariaPress', 'dpa', 'businesstraveller', 'Kisalföld', 'D.P.', 'dehir.hu', 'f1-live.hu',
'MT Zrt.', 'MTI', 'Bankmonitor.hu', 'manna.ro', 'pecsma.hu', 'foodnetwork', 'InfoRádió', 'VG',
'TV2', 'delmagyar.hu', 'Népszava', 'OTS', 'transindex.ro', 'portfolio.hu', 'honvedelem.hu',
'HVG Extra Business', 'CNN', 'napi.hu', 'MTI-OS', 'Index', 'met.hu', 'Utinform.hu',
'nyugat.hu', 'BBC', 'kemma.hu', 'turizmus.com', 'Jobline.hu', 'AP', 'muosz.hu',
'élelmiszer online', 'MNO', 'baon.hu', 'teol.hu', 'ITAR-TASZSZ', 'Blikk', 'hirado.hu',
'HVG Extra Pszichológia', 'indohaz.hu', 'Bors', 'Számlázz.hu', 'Napi.hu', 'bankmonitor.hu',
'Hszinhua', 'MTI ', 'HVG Konferencia', 'DW', 'Inforádió', 'Zgut Edit', 'Dow Jones', 'Origo',
'Eduline', 'OS', 'Világgazdaság', 'MR1-Kossuth Rádió', 'szoljon.hu', 'hvg.hu', 'Észak-Magyarország',
'VinceBudapest', 'vendeglatasmagazin.hu', 'termekmix.hu', 'AFP', 'nso.hu', 'termekmix.com', 'benke',
'f1-live', 'BiztosDöntés.hu', 'ingatlanmenedzser.hu', 'kisalföld.hu', 'atlatszo.blog.hu', 'Travellina',
'merites.hu', 'Euronews', 'Marabu', 'sonline.hu', ' hvg.hu', 'EUrologus', 'Tények', 'Reuters',
'Magyar Nemzet', 'DPA', 'MTA', '- esel -', 'eduline.hu', 'MLF', 'HVG', 'Adozona.hu', 'mult-kor.hu',
'REUTERS', 'I.N.', 'Népszabadság', 'police.hu', 'Bank360.hu'}


def author_source_norm(extracted_meta):
    """Split a raw author/source string into a list of trimmed names.

    Delimiters are the comma, hyphen, slash, en dash, semicolon and the
    Hungarian conjunction ' és ' (with its surrounding spaces). Pieces that
    are empty after trimming are dropped.

    :param extracted_meta: raw text to split (a str)
    :return: list of the non-empty, trimmed pieces in their original order
    """
    # NOTE(review): the bare '-' delimiter also cuts hyphenated names apart
    # ('MTI-OS' becomes 'MTI' and 'OS') - confirm this is intended.
    # Trim every piece exactly once, then keep only the non-empty ones.
    pieces = (piece.strip() for piece in re.split(',|-|/|–|;| és ', extracted_meta))
    return [piece for piece in pieces if piece]

def get_meta_from_articles_spec(tei_logger, url, bs):
data = tei_defaultdict()
Expand All @@ -61,12 +79,20 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')
author_or_source_tag = article_root.find('div', class_='author-name')
if author_or_source_tag is not None:
author_or_source = author_or_source_tag.text.strip().\
replace('\r', '').replace('\n', '').replace('\t', '').replace('Követés', '')
if author_or_source in SOURCE:
data['sch:source'] = [author_or_source]
else:
data['sch:author'] = [author_or_source]
author_or_source_raw = author_or_source_tag.text.strip().\
replace('\r', '').replace('\n', '').replace('\t', '')
author_or_source = author_or_source_raw.replace('Követés', '')
authors_list = author_source_norm(author_or_source)
if author_or_source != author_or_source_raw or len(authors_list) > 1:
data['originalAuthorString'] = author_or_source_raw
authors_l, sources_l = [], []
[sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in
authors_list]
if len(authors_l) > 0:
data['sch:author'] = authors_l
if len(sources_l) > 0:
data['sch:source'] = sources_l

else:
tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
keywords_root = article_root.find('div', class_='article-tags')
Expand Down Expand Up @@ -99,6 +125,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if beg != -1 and end != -1:
author = auth_string[beg+1:end+1]
data['sch:author'] = [author]
data['originalAuthorString'] = auth_string
else:
tei_logger.log('DEBUG', f'{url} NO AUTHOR FOUND!')

Expand Down
19 changes: 10 additions & 9 deletions configs/kurucinfo/kurucinfo_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def get_meta_from_articles_spec(tei_logger, url, bs):
data = tei_defaultdict()
data['sch:url'] = url

# MISSING FROM PORTAL: data['sch:dateModified']

# ARTICLE SECTION
Expand All @@ -29,7 +29,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
tei_logger.log('WARNING', f'{url}: ARTICLE SECTION TAG PARSE ERROR!')
else:
tei_logger.log('WARNING', f'{url}: ARTICLE SECTION NOT FOUND!')

article_root = bs.find('div', {'class': 'tblot'})
if article_root is not None:

Expand All @@ -39,9 +39,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
title_text = title_tag.get_text(strip=True)
if title_text is not None:
data['sch:name'] = title_text
else:
else:
tei_logger.log('WARNING', f'{url}: TITLE TAG EMPTY!')
else:
else:
tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

# AUTHOR / https://kuruc.info/r/6/150707/
Expand All @@ -55,6 +55,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if 1 < len(split_t) <= 3 and ('(' or ')') not in tag_text and all([w[0].isupper() for w in split_t]):
data['sch:author'] = [tag_text]
data['originalAuthorString'] = [possible_author_tag.get_text(strip=True)]
data['AuthorString_extracted_from_content'] = possible_author_tag.text
else:
tei_logger.log('DEBUG', f'{url}: AUTHOR TAG EMPTY!')
else:
Expand All @@ -63,25 +64,25 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
# KEYWORDS and DATE PUBLISHED
meta_header = article_root.find('p', {'class': 'cikkdatum'})
if meta_header is not None:

# keywords
a_tags = meta_header.find_all('a', href=re.compile('/t/[0-9]'))
if len(a_tags) > 0:
data['sch:keywords'] = [t.get_text(strip=True) for t in a_tags if len(t.get_text(strip=True)) > 0]
else:
tei_logger.log('INFO', f'{url}: KEYWORDS NOT FOUND!')

# datePublished
date_published_tag = meta_header.find('span', {'itemprop': "datePublished"})
if date_published_tag is not None:
date_published_raw = date_published_tag.get_text(strip=True)
if date_published_raw is not None:
data['sch:datePublished'] = parse_date(date_published_raw, "%Y. %B %d. %H:%M")
else:
else:
tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
else:
else:
tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

else:
tei_logger.log('WARNING', f'{url}: META HEADER [datePublished, keywords] NOT FOUND!')

Expand Down
7 changes: 5 additions & 2 deletions configs/magyaridok/magyaridok_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,12 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
data['sch:author'] = [author.text.strip()]
elif source is not None:
# In case if not an author, only source (MTI)
data['sch:source'] = source.text.strip()
source_text = source.text.strip()
data['sch:source'] = [s.strip() for s in source_text.split(',')]
if len(data['sch:source']) > 1:
data['originalAuthorString'] = [source_text]
else:
tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
article_tags = []
section_line = article_root.find('span', class_='en-article-header-column')
if section_line is not None:
Expand Down
30 changes: 23 additions & 7 deletions configs/magyarnarancs/magyarnarancs_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
'narancs.hu/Republikon', 'MTI/narancs', 'narancs hu.', 'MTI/Világgazdaság/narancs.hu', 'narancs.hu - B. T.',
'transindex.ro', 'MTI-OS', 'nrancs.hu']

# Single creator names that count as a source rather than an author: every piece
# returned by author_source_norm() is looked up in this set to decide whether it
# goes to the source list (member) or the author list (non-member).
# NOTE(review): author_source_norm() in this file splits on a bare '-', so entries
# containing a hyphen (e.g. 'MTI-OS', 'narancs.hu-MTI', 'narancs.hu - B. T.') can
# never equal a piece it produces - confirm whether these entries are still needed.
SOURCE_SOLO = {'narancs.hu', 'szegeder.hu', 'MTA', 'narancs', 'HVG', 'narancs hu', 'Markó Anita', 'narancs hu.', 'Guardian', 'Telex', 'Narancs.hu', 'Fizetett tartalom', 'MTI', 'narancsfül', 'narancs.', 'transindex.ro', 'Amnesty', 'media1.hu', 'Narancsfül', 'narancsblog', 'Reuters', 'narancs. hu', 'Narancs', 'narancsszem', 'M', 'Republikon', 'nrancs.hu', 'narancs.hu-összeállítás', 'TASZ', 'narancs.hu - B. T.', 'Magyar Narancs', 'narancs.hu-MTI', 'Világgazdaság', 'Police.hu', 'OS', 'Szabad ország', 'MTI-OS', 'narancs.h', 'Narancs-összeállítás', 'MT'}

def author_source_norm(extracted_meta):
    """Split raw author/source text into a flat list of trimmed names.

    Accepts either one string or a list of strings. Each string is cut at
    commas, hyphens, slashes, en dashes, semicolons and the Hungarian
    conjunction ' és ' (with its surrounding spaces); pieces that are empty
    after trimming are dropped.

    :param extracted_meta: a str, or a list of str, holding the raw text
    :return: list of the non-empty, trimmed pieces; for list input the
        pieces of all items are concatenated in input order
    """
    # Treat a lone string as a one-item list so both input kinds share a
    # single split/trim/filter path.
    raw_items = extracted_meta if isinstance(extracted_meta, list) else [extracted_meta]
    names = []
    for raw in raw_items:
        # NOTE(review): the bare '-' delimiter also cuts hyphenated names
        # apart ('MTI-OS' becomes 'MTI' and 'OS') - confirm this is intended.
        pieces = (piece.strip() for piece in re.split(',|-|/|–|;| és ', raw))
        names.extend(piece for piece in pieces if piece)
    return names

def get_meta_from_articles_spec(tei_logger, url, bs):
data = tei_defaultdict()
Expand Down Expand Up @@ -70,20 +79,31 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
subtitle = bs.find('h3', class_='card-subtitle')
if subtitle is not None:
data['sch:alternateName'] = subtitle.text.strip()
author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
author_or_source_o = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
author_or_source = author_source_norm(author_or_source_o)
if len(author_or_source) > len(author_or_source_o):
data['originalAuthorString'] = author_or_source_o
author_list, source_list = [], []
[source_list.append(creator) if creator in SOURCE else author_list.append(creator) for creator in
[source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in
author_or_source]
if len(author_list) > 0 or len(source_list) > 0:
author_list_corr = []
if len(author_list) > 0:
for auth in author_list:
if ',' in auth:
author_list_corr.extend(one_author.strip() for one_author in auth.split('\''))
if '(' not in auth:
author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
data['originalAuthorString'] = [auth]
else:
# Szlankó Bálint (Argandab-folyóvölgy, Kandahár)
data['originalAuthorString'] = [auth]
auth = auth[0:auth.index('(')]
author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
else:
author_list_corr.append(auth)
data['sch:author'] = author_list_corr
if len(source_list) > 0:

data['sch:source'] = source_list
else:
tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
Expand All @@ -102,10 +122,6 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
def excluded_tags_spec(tag):
    """Trim the 'data-leiras' attribute of an <li> tag when it is only whitespace.

    Only a tag named 'li' whose 'data-leiras' attribute equals the exact
    whitespace string checked below is modified; every other tag is returned
    untouched.

    :param tag: tag object exposing ``name``, ``attrs`` and item access
        (presumably a BeautifulSoup Tag - verify against the caller)
    :return: the same tag object that was passed in
    """
    if tag.name == 'li' and 'data-leiras' in tag.attrs and tag['data-leiras'] == ' \r\n':
        # Replace the whitespace-only value with its stripped form.
        tag['data-leiras'] = tag['data-leiras'].strip()
    return tag


Expand Down
4 changes: 3 additions & 1 deletion configs/merce/merce_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
data['sch:author'] = authors
if len(authors) > 1:
print('Több szerző', url)
# TODO: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
else:
# TODO 1 <a href="https://avm.merce.hu/author/evatessza/" title="Udvarhelyi Tessza cikkei" class="author url fn track-act-up" rel="author">Udvarhelyi Tessza</a>
# TODO 2: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
# else: tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
# <div class="featured-tag">
"""is_section = bs.find('div', {'class': 'featured-tag'})
Expand Down
2 changes: 1 addition & 1 deletion configs/mno/mno_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if author is not None:
data['sch:author'] = [author.text.strip()]
elif source is not None:
data['sch:source'] = source.text.strip()
data['sch:source'] = [source.text.strip()]
else:
tei_logger.log('DEBUG', f'{url} AUTHOR AND SOURCE TAG NOT FOUND!')

Expand Down
20 changes: 15 additions & 5 deletions configs/p888/p888_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,24 +78,34 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if note_block_tag is not None:
author_or_source = note_block_tag.find('div', class_='text-wrap').get_text(strip=True)
if author_or_source is not None:
#print(url, note_block_tag)
if author_or_source in SOURCE or author_or_source in SOURCE_SECONDARY:
data["sch:source"] = [author_or_source]
data['originalAuthorString'] = [author_or_source]
else:
# split by: ANY OF THESE ',-–' CHARACTERS FOLLOWED BY WHITESPACE '\s' AND NOT 'a ', 'az ',
# 'A ' or 'Az '
# regex solution may be over complicated
split_list = re.split("[,\-\–]\s(?!a\s|az\s|A\s|Az\s)", author_or_source)
if len(split_list) > 0 and split_list[0] != '':
source_list, author_list = [], []
for author in split_list:
if author in SOURCE or author in SOURCE_SECONDARY:
source_list.append(author.strip())
else:
author_list.append(author.strip())
for author_ in split_list: # TODO: ez egy rovat tkp.: Olvasói vélemény
# Olvasói vélemény | Szerző: Ádám Attila
authors = re.split('/|\||-|;|–', author_)
for author in authors:
if author in SOURCE or author in SOURCE_SECONDARY:

source_list.append(author.strip())
else:
author = author.replace('Szerző:', '').replace('szerző:', '')
author_list.append(author.strip())
if len(author_list) > 0:
data['sch:author'] = author_list
if len(source_list) > 0:
data['sch:source'] = source_list
if len(author_list+source_list) > 1: # TODO: van más feltétel?
data['originalAuthorString'] = [author_or_source]
# data['AuthorString_extracted_from_content'] = author_or_source
else:
tei_logger.log('DEBUG', f'{url}: AUTHOR TAG TEXT EMPTY!')

Expand Down
9 changes: 8 additions & 1 deletion configs/vadhajtasok/vadhajtasok_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,14 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
if len(source_raw) > 0 and len(source_raw.split()) < 6 and \
(source_raw.startswith('Forrás:') or source_raw.startswith('Írta:') or
source_raw in SOURCE_2):
data['originalAuthorString'] = [source_raw]
if source_raw.startswith('Írta:'):
data['sch:author'] = source_raw.replace('Írta:', '').strip().split(',')
elif source_raw.startswith('Forrás:') or source_raw in SOURCE_2:
data['sch:source'] = [m.strip() for m in re.split(',|-|/|–| és |;', source_raw.replace('Forrás:', '').strip()) if 'Fotó:' not in m and len(m.strip())>0]
if len(data['sch:source'])> 1:
data['originalAuthorString'] = source_raw
data['AuthorString_extracted_from_content'] = source_raw

else:
tei_logger.log('DEBUG', f'{url}: SOURCE NOT FOUND!')
return data
Expand Down
19 changes: 11 additions & 8 deletions configs/valasz/valasz_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,21 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
# <span class="forras">Hírforrás: Válasz.hu</span>
author_tags = article_root.find_all('a', rel='author')
if len(author_tags) > 0:
if any(len(elem.text.strip()) == 0 for elem in author_tags):
source = article_root.find('span', class_="forras")
if source is not None:
data['sch:source'] = source.text.strip()
else:
data['sch:author'] = [a.text for a in author_tags]
else:
data['sch:author'] = [a.text.strip() for a in author_tags]
#if any(len(elem.text.strip()) == 0 for elem in author_tags):
source = article_root.find('span', class_="forras")
if source is not None:
# Hírforrás
sources = [m.strip() for m in re.split(',|/| - |;| és ', source.text.strip().replace('Hírforrás: ', '')) if len(m.strip()) > 0]
if len(sources) > 1:
data['originalAuthorString'] = [source.text.strip()]
data['sch:source'] = sources
if author_tags is None and source is None: # The following code fragment is probably redundant.
# The source and author fields can co-exist
article_source = article_root.find('span', class_='forras')
article_author2 = article_root.find('span', class_='szerzo')
if article_source is not None:
data['sch:source'] = article_source.text.strip()
data['sch:source'] = [article_source.text.strip()]
if article_author2 is not None:
data['sch:author'] = [article_author2.text.strip()]
keyword_root = bs.find('aside', class_='breadcrumb')
Expand Down
Loading