def author_source_norm(extracted_meta):
    """Split a raw author/source string into a list of clean names.

    Separators are a comma, a slash, and a hyphen or en dash that is
    surrounded by spaces.  A hyphen without surrounding spaces is NOT a
    separator, so names such as 'MTI-OS' stay in one piece.

    :param extracted_meta: the raw string extracted from the article
    :return: list of stripped, non-empty name strings (may be empty)
    """
    # Strip each piece once, then drop the pieces that were only whitespace.
    pieces = (piece.strip() for piece in re.split(r',| - |/| – ', extracted_meta))
    return [piece for piece in pieces if piece]
# Individual creator names that count as a source (agency, portal, paper)
# rather than as a personal author.
# NOTE: entries with leading/trailing whitespace ('MTI ', ' hvg.hu') can never
# match a normalised (stripped) token; they are kept so the set is unchanged.
SOURCE_NORM = {'EFE', 'HavariaPress', 'dpa', 'businesstraveller', 'Kisalföld', 'D.P.', 'dehir.hu', 'f1-live.hu',
               'MT Zrt.', 'MTI', 'Bankmonitor.hu', 'manna.ro', 'pecsma.hu', 'foodnetwork', 'InfoRádió', 'VG',
               'TV2', 'delmagyar.hu', 'Népszava', 'OTS', 'transindex.ro', 'portfolio.hu', 'honvedelem.hu',
               'HVG Extra Business', 'CNN', 'napi.hu', 'MTI-OS', 'Index', 'met.hu', 'Utinform.hu',
               'nyugat.hu', 'BBC', 'kemma.hu', 'turizmus.com', 'Jobline.hu', 'AP', 'muosz.hu',
               'élelmiszer online', 'MNO', 'baon.hu', 'teol.hu', 'ITAR-TASZSZ', 'Blikk', 'hirado.hu',
               'HVG Extra Pszichológia', 'indohaz.hu', 'Bors', 'Számlázz.hu', 'Napi.hu', 'bankmonitor.hu',
               'Hszinhua', 'MTI ', 'HVG Konferencia', 'DW', 'Inforádió', 'Zgut Edit', 'Dow Jones', 'Origo',
               'Eduline', 'OS', 'Világgazdaság', 'MR1-Kossuth Rádió', 'szoljon.hu', 'hvg.hu', 'Észak-Magyarország',
               'VinceBudapest', 'vendeglatasmagazin.hu', 'termekmix.hu', 'AFP', 'nso.hu', 'termekmix.com', 'benke',
               'f1-live', 'BiztosDöntés.hu', 'ingatlanmenedzser.hu', 'kisalföld.hu', 'atlatszo.blog.hu', 'Travellina',
               'merites.hu', 'Euronews',
               'Marabu', 'sonline.hu', ' hvg.hu', 'EUrologus', 'Tények', 'Reuters',
               'Magyar Nemzet', 'DPA', 'MTA', '- esel -', 'eduline.hu', 'MLF', 'HVG', 'Adozona.hu', 'mult-kor.hu',
               'REUTERS', 'I.N.', 'Népszabadság', 'police.hu', 'Bank360.hu'}


def author_source_norm(extracted_meta):
    """Split a raw author/source string into a list of clean creator names.

    The string is first cut at commas, slashes, en dashes, semicolons and
    the Hungarian conjunction ' és '.  A resulting chunk that is itself a
    known source (an element of SOURCE_NORM) is kept in one piece even if
    it contains a hyphen; every other chunk is additionally cut at hyphens.

    :param extracted_meta: the raw string extracted from the article
    :return: list of stripped, non-empty name strings (may be empty)
    """
    creators = []
    for chunk in re.split(r',|/|–|;| és ', extracted_meta):
        chunk = chunk.strip()
        if not chunk:
            continue
        if chunk in SOURCE_NORM:
            # Known hyphenated sources ('ITAR-TASZSZ', 'f1-live.hu', ...) must
            # survive intact, otherwise they can never be looked up again.
            creators.append(chunk)
            continue
        fragments = (fragment.strip() for fragment in chunk.split('-'))
        creators.extend(fragment for fragment in fragments if fragment)
    return creators
a/configs/kurucinfo/kurucinfo_specific.py b/configs/kurucinfo/kurucinfo_specific.py index 6d4c356..d151373 100644 --- a/configs/kurucinfo/kurucinfo_specific.py +++ b/configs/kurucinfo/kurucinfo_specific.py @@ -16,7 +16,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() data['sch:url'] = url - + # MISSING FROM PORTAL: data['sch:dateModified'] # ARTICLE SECTION @@ -29,7 +29,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): tei_logger.log('WARNING', f'{url}: ARTICLE SECTION TAG PARSE ERROR!') else: tei_logger.log('WARNING', f'{url}: ARTICLE SECTION NOT FOUND!') - + article_root = bs.find('div', {'class': 'tblot'}) if article_root is not None: @@ -39,9 +39,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): title_text = title_tag.get_text(strip=True) if title_text is not None: data['sch:name'] = title_text - else: + else: tei_logger.log('WARNING', f'{url}: TITLE TAG EMPTY!') - else: + else: tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!') # AUTHOR / https://kuruc.info/r/6/150707/ @@ -55,6 +55,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if 1 < len(split_t) <= 3 and ('(' or ')') not in tag_text and all([w[0].isupper() for w in split_t]): data['sch:author'] = [tag_text] data['originalAuthorString'] = [possible_author_tag.get_text(strip=True)] + data['AuthorString_extracted_from_content'] = possible_author_tag.text else: tei_logger.log('DEBUG', f'{url}: AUTHOR TAG EMPTY!') else: @@ -63,25 +64,25 @@ def get_meta_from_articles_spec(tei_logger, url, bs): # KEYWORDS and DATE PUBLISHED meta_header = article_root.find('p', {'class': 'cikkdatum'}) if meta_header is not None: - + # keywords a_tags = meta_header.find_all('a', href=re.compile('/t/[0-9]')) if len(a_tags) > 0: data['sch:keywords'] = [t.get_text(strip=True) for t in a_tags if len(t.get_text(strip=True)) > 0] else: tei_logger.log('INFO', f'{url}: KEYWORDS NOT FOUND!') - + # datePublished date_published_tag = meta_header.find('span', {'itemprop': 
"datePublished"}) if date_published_tag is not None: date_published_raw = date_published_tag.get_text(strip=True) if date_published_raw is not None: data['sch:datePublished'] = parse_date(date_published_raw, "%Y. %B %d. %H:%M") - else: + else: tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!') - else: + else: tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!') - + else: tei_logger.log('WARNING', f'{url}: META HEADER [datePublished, keywords] NOT FOUND!') diff --git a/configs/magyaridok/magyaridok_specific.py b/configs/magyaridok/magyaridok_specific.py index dd297dd..9975db3 100644 --- a/configs/magyaridok/magyaridok_specific.py +++ b/configs/magyaridok/magyaridok_specific.py @@ -48,9 +48,12 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data['sch:author'] = [author.text.strip()] elif source is not None: # In case if not an author, only source (MTI) - data['sch:source'] = source.text.strip() + source_text = source.text.strip() + data['sch:source'] = [s.strip() for s in source_text.split(',')] + if len(data['sch:source']) > 1: + data['originalAuthorString'] = [source_text] else: - tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!') + tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!') article_tags = [] section_line = article_root.find('span', class_='en-article-header-column') if section_line is not None: diff --git a/configs/magyarnarancs/magyarnarancs_specific.py b/configs/magyarnarancs/magyarnarancs_specific.py index ccc4f22..b77a3f9 100644 --- a/configs/magyarnarancs/magyarnarancs_specific.py +++ b/configs/magyarnarancs/magyarnarancs_specific.py @@ -22,6 +22,15 @@ 'narancs.hu/Republikon', 'MTI/narancs', 'narancs hu.', 'MTI/Világgazdaság/narancs.hu', 'narancs.hu - B. 
# Individual creator names that count as a source rather than as an author.
SOURCE_SOLO = {'narancs.hu', 'szegeder.hu', 'MTA', 'narancs', 'HVG', 'narancs hu', 'Markó Anita', 'narancs hu.',
               'Guardian', 'Telex', 'Narancs.hu', 'Fizetett tartalom', 'MTI', 'narancsfül', 'narancs.',
               'transindex.ro', 'Amnesty', 'media1.hu', 'Narancsfül', 'narancsblog', 'Reuters', 'narancs. hu',
               'Narancs', 'narancsszem', 'M', 'Republikon', 'nrancs.hu', 'narancs.hu-összeállítás', 'TASZ',
               'narancs.hu - B. T.', 'Magyar Narancs', 'narancs.hu-MTI', 'Világgazdaság', 'Police.hu', 'OS',
               'Szabad ország', 'MTI-OS', 'narancs.h', 'Narancs-összeállítás', 'MT'}


def author_source_norm(extracted_meta):
    """Split raw author/source text into a flat list of clean creator names.

    Each string is first cut at commas, slashes, en dashes, semicolons and
    the Hungarian conjunction ' és '.  A resulting chunk that is itself a
    known source (an element of SOURCE_SOLO) is kept in one piece even if
    it contains a hyphen; every other chunk is additionally cut at hyphens.

    :param extracted_meta: one raw string, or a list (any iterable) of them
    :return: list of stripped, non-empty name strings in input order
    """
    # A lone string is handled as a one-element collection, so both input
    # shapes share a single code path.
    raw_items = [extracted_meta] if isinstance(extracted_meta, str) else extracted_meta
    creators = []
    for raw_item in raw_items:
        for chunk in re.split(r',|/|–|;| és ', raw_item):
            chunk = chunk.strip()
            if not chunk:
                continue
            if chunk in SOURCE_SOLO:
                # Known hyphenated sources ('Narancs-összeállítás', ...) must
                # survive intact, otherwise they can never be looked up again.
                creators.append(chunk)
                continue
            fragments = (fragment.strip() for fragment in chunk.split('-'))
            creators.extend(fragment for fragment in fragments if fragment)
    return creators
def excluded_tags_spec(tag):
    """Clean up the 'data-leiras' attribute of list-item tags.

    When an 'li' tag carries a 'data-leiras' attribute whose value is
    exactly a space followed by CR LF, the attribute is replaced by its
    stripped form.  Every other tag is passed through untouched.

    :param tag: the tag to inspect; it is modified in place
    :return: the same tag object
    """
    if tag.name != 'li':
        return tag
    if 'data-leiras' not in tag.attrs:
        return tag
    description = tag['data-leiras']
    if description == ' \r\n':
        tag['data-leiras'] = description.strip()
    return tag