From 19d7c4f29b3a7f560ffb88037443d20cd74a00da Mon Sep 17 00:00:00 2001 From: SLZsofia Date: Wed, 22 Feb 2023 10:38:50 +0100 Subject: [PATCH 1/3] author/source/original/from_content for 7 portal --- configs/alfahir/alfahir_specific.py | 12 +++++- configs/hvg/hvg_specific.py | 39 ++++++++++++++++--- configs/kurucinfo/kurucinfo_specific.py | 19 ++++----- .../magyarnarancs/magyarnarancs_specific.py | 26 ++++++++++--- configs/p888/p888_specific.py | 20 +++++++--- configs/vadhajtasok/vadhajtasok_specific.py | 9 ++++- configs/vs/vs_specific.py | 22 ++++++++--- src/html2tei/modes/portal_article_cleaner.py | 11 +++++- 8 files changed, 123 insertions(+), 35 deletions(-) diff --git a/configs/alfahir/alfahir_specific.py b/configs/alfahir/alfahir_specific.py index 6f6e3fe..4a0d1b3 100644 --- a/configs/alfahir/alfahir_specific.py +++ b/configs/alfahir/alfahir_specific.py @@ -34,6 +34,9 @@ 'hvg.hu - barikad.hu', 'MTI, Népszabadság' ] +def author_source_norm(extracted_meta): + return [m.strip() for m in re.split(',| - |/| – ', extracted_meta) if len(m.strip())>0] + def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() @@ -194,7 +197,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if source_in_text_tag is not None: source_in_text_tag_text = source_in_text_tag.find('div', class_='field--item').get_text(strip=True) if source_in_text_tag_text is not None: - data['sch:source'] = source_in_text_tag_text + data['sch:source'] = author_source_norm(source_in_text_tag_text) + if len(data['sch:source'])> 1: + data['originalAuthorString'] = source_in_text_tag_text # Sometimes implicitly inserted into a

tag else: @@ -222,7 +227,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if len(source_in_text_4) < 40: source_text = source_in_text_4.strip() if source_text in SOURCE_LIST: # Above code allows minimal mistakes - invalid sources are filtered - data['sch:source'] = [source_text] + data['sch:source'] = author_source_norm(source_text) + if len(data['sch:source'])> 1: + data['originalAuthorString'] = source_text + data['AuthorString_extracted_from_content'] = source_text else: tei_logger.log('DEBUG', f'{url}: SOURCE TAG NOT FOUND!') diff --git a/configs/hvg/hvg_specific.py b/configs/hvg/hvg_specific.py index 487285f..1065006 100644 --- a/configs/hvg/hvg_specific.py +++ b/configs/hvg/hvg_specific.py @@ -35,6 +35,24 @@ 'MTI/Népszava', 'MTI/dpa/Hszinhua', 'OTS/MTI', 'BBC/MTI', 'MTI/Bors', 'MTI/Reuters/AP', 'MTI/AP', 'MTI/AFP/Reuters', 'MTI/Reuters/Hszinhua', 'MTI/Blikk', 'HVG/MTI'] +SOURCE_NORM = {'EFE', 'HavariaPress', 'dpa', 'businesstraveller', 'Kisalföld', 'D.P.', 'dehir.hu', 'f1-live.hu', + 'MT Zrt.', 'MTI', 'Bankmonitor.hu', 'manna.ro', 'pecsma.hu', 'foodnetwork', 'InfoRádió', 'VG', + 'TV2', 'delmagyar.hu', 'Népszava', 'OTS', 'transindex.ro', 'portfolio.hu', 'honvedelem.hu', + 'HVG Extra Business', 'CNN', 'napi.hu', 'MTI-OS', 'Index', 'met.hu', 'Utinform.hu', + 'nyugat.hu', 'BBC', 'kemma.hu', 'turizmus.com', 'Jobline.hu', 'AP', 'muosz.hu', + 'élelmiszer online', 'MNO', 'baon.hu', 'teol.hu', 'ITAR-TASZSZ', 'Blikk', 'hirado.hu', + 'HVG Extra Pszichológia', 'indohaz.hu', 'Bors', 'Számlázz.hu', 'Napi.hu', 'bankmonitor.hu', + 'Hszinhua', 'MTI ', 'HVG Konferencia', 'DW', 'Inforádió', 'Zgut Edit', 'Dow Jones', 'Origo', + 'Eduline', 'OS', 'Világgazdaság', 'MR1-Kossuth Rádió', 'szoljon.hu', 'hvg.hu', 'Észak-Magyarország', + 'VinceBudapest', 'vendeglatasmagazin.hu', 'termekmix.hu', 'AFP', 'nso.hu', 'termekmix.com', 'benke', + 'f1-live', 'BiztosDöntés.hu', 'ingatlanmenedzser.hu', 'kisalföld.hu', 'atlatszo.blog.hu', 'Travellina', + 'merites.hu', 'Euronews', 'Marabu', 'sonline.hu', ' hvg.hu', 'EUrologus', 'Tények', 'Reuters', + 'Magyar Nemzet', 'DPA', 'MTA', '- esel -', 'eduline.hu', 'MLF', 'HVG', 'Adozona.hu', 'mult-kor.hu', + 'REUTERS', 'I.N.', 'Népszabadság', 'police.hu', 'Bank360.hu'} + + +def author_source_norm(extracted_meta): + return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0] def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() @@ -61,12 +79,20 @@ def get_meta_from_articles_spec(tei_logger, url, bs): tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!') author_or_source_tag = article_root.find('div', class_='author-name') if author_or_source_tag is not None: - author_or_source = author_or_source_tag.text.strip().\ - replace('\r', '').replace('\n', '').replace('\t', '').replace('Követés', '') - if author_or_source in SOURCE: - data['sch:source'] = [author_or_source] - else: - data['sch:author'] = [author_or_source] + author_or_source_raw = author_or_source_tag.text.strip().\ + replace('\r', '').replace('\n', '').replace('\t', '') + author_or_source = author_or_source_raw.replace('Követés', '') + authors_list = author_source_norm(author_or_source) + if author_or_source != author_or_source_raw or len(authors_list) > 1: + data['originalAuthorString'] = author_or_source_raw + authors_l, sources_l = [], [] + [sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in + author_or_source] + if len(authors_l) > 0: + data['sch:author'] = authors_l + if len(sources_l) > 0: + data['sch:source'] = sources_l + else: tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!') keywords_root = article_root.find('div', class_='article-tags') @@ -99,6 +125,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if beg != -1 and end != -1: author = auth_string[beg+1:end+1] data['sch:author'] = [author] + data['originalAuthorString'] = auth_string else: tei_logger.log('DEBUG', f'{url} NO AUTHOR FOUND!') diff --git a/configs/kurucinfo/kurucinfo_specific.py b/configs/kurucinfo/kurucinfo_specific.py index 6d4c356..d151373 100644 --- a/configs/kurucinfo/kurucinfo_specific.py +++ b/configs/kurucinfo/kurucinfo_specific.py @@ -16,7 +16,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() data['sch:url'] = url - + # MISSING FROM PORTAL: data['sch:dateModified'] # ARTICLE SECTION @@ -29,7 +29,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): tei_logger.log('WARNING', f'{url}: ARTICLE SECTION TAG PARSE ERROR!') else: tei_logger.log('WARNING', f'{url}: ARTICLE SECTION NOT FOUND!') - + article_root = bs.find('div', {'class': 'tblot'}) if article_root is not None: @@ -39,9 +39,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): title_text = title_tag.get_text(strip=True) if title_text is not None: data['sch:name'] = title_text - else: + else: tei_logger.log('WARNING', f'{url}: TITLE TAG EMPTY!') - else: + else: tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!') # AUTHOR / https://kuruc.info/r/6/150707/ @@ -55,6 +55,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if 1 < len(split_t) <= 3 and ('(' or ')') not in tag_text and all([w[0].isupper() for w in split_t]): data['sch:author'] = [tag_text] data['originalAuthorString'] = [possible_author_tag.get_text(strip=True)] + data['AuthorString_extracted_from_content'] = possible_author_tag.text else: tei_logger.log('DEBUG', f'{url}: AUTHOR TAG EMPTY!') else: @@ -63,25 +64,25 @@ def get_meta_from_articles_spec(tei_logger, url, bs): # KEYWORDS and DATE PUBLISHED meta_header = article_root.find('p', {'class': 'cikkdatum'}) if meta_header is not None: - + # keywords a_tags = meta_header.find_all('a', href=re.compile('/t/[0-9]')) if len(a_tags) > 0: data['sch:keywords'] = [t.get_text(strip=True) for t in a_tags if len(t.get_text(strip=True)) > 0] else: tei_logger.log('INFO', f'{url}: KEYWORDS NOT FOUND!') - + # datePublished date_published_tag = meta_header.find('span', {'itemprop': "datePublished"}) if date_published_tag is not None: date_published_raw = date_published_tag.get_text(strip=True) if date_published_raw is not None: data['sch:datePublished'] = parse_date(date_published_raw, "%Y. %B %d. %H:%M") - else: + else: tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!') - else: + else: tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!') - + else: tei_logger.log('WARNING', f'{url}: META HEADER [datePublished, keywords] NOT FOUND!') diff --git a/configs/magyarnarancs/magyarnarancs_specific.py b/configs/magyarnarancs/magyarnarancs_specific.py index ccc4f22..2b09e33 100644 --- a/configs/magyarnarancs/magyarnarancs_specific.py +++ b/configs/magyarnarancs/magyarnarancs_specific.py @@ -22,6 +22,15 @@ 'narancs.hu/Republikon', 'MTI/narancs', 'narancs hu.', 'MTI/Világgazdaság/narancs.hu', 'narancs.hu - B. T.', 'transindex.ro', 'MTI-OS', 'nrancs.hu'] +SOURCE_SOLO = {'narancs.hu', 'szegeder.hu', 'MTA', 'narancs', 'HVG', 'narancs hu', 'Markó Anita', 'narancs hu.', 'Guardian', 'Telex', 'Narancs.hu', 'Fizetett tartalom', 'MTI', 'narancsfül', 'narancs.', 'transindex.ro', 'Amnesty', 'media1.hu', 'Narancsfül', 'narancsblog', 'Reuters', 'narancs. hu', 'Narancs', 'narancsszem', 'M', 'Republikon', 'nrancs.hu', 'narancs.hu-összeállítás', 'TASZ', 'narancs.hu - B. T.', 'Magyar Narancs', 'narancs.hu-MTI', 'Világgazdaság', 'Police.hu', 'OS', 'Szabad ország', 'MTI-OS', 'narancs.h', 'Narancs-összeállítás', 'MT'} + +def author_source_norm(extracted_meta): + ret_list = [] + if isinstance(extracted_meta, list): + for meta in extracted_meta: + ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és |', meta) if len(m.strip()) > 0]) + return ret_list + return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0] def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() @@ -71,19 +80,28 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if subtitle is not None: data['sch:alternateName'] = subtitle.text.strip() author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')] + author_or_source = author_source_norm(author_or_source) author_list, source_list = [], [] - [source_list.append(creator) if creator in SOURCE else author_list.append(creator) for creator in + [source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in author_or_source] if len(author_list) > 0 or len(source_list) > 0: author_list_corr = [] if len(author_list) > 0: for auth in author_list: if ',' in auth: - author_list_corr.extend(one_author.strip() for one_author in auth.split('\'')) + if '(' not in auth: + author_list_corr.extend(one_author.strip() for one_author in auth.split(',')) + data['originalAuthorString'] = [auth] + else: + # Szlankó Bálint (Argandab-folyóvölgy, Kandahár) + data['originalAuthorString'] = [auth] + auth = auth[0:auth.index('(')] + author_list_corr.extend(one_author.strip() for one_author in auth.split(',')) else: author_list_corr.append(auth) data['sch:author'] = author_list_corr if len(source_list) > 0: + data['sch:source'] = source_list else: tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!') @@ -102,10 +120,6 @@ def get_meta_from_articles_spec(tei_logger, url, bs): def excluded_tags_spec(tag): if tag.name == 'li' and 'data-leiras' in tag.attrs and tag['data-leiras'] == ' \r\n': tag['data-leiras'] = tag['data-leiras'].strip() - - # if tag.name not in HTML_BASICS: - # tag.name = 'else' - # tag.attrs = {} return tag diff --git a/configs/p888/p888_specific.py b/configs/p888/p888_specific.py index ddedeb3..d4026ce 100644 --- a/configs/p888/p888_specific.py +++ b/configs/p888/p888_specific.py @@ -78,8 +78,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if note_block_tag is not None: author_or_source = note_block_tag.find('div', class_='text-wrap').get_text(strip=True) if author_or_source is not None: + #print(url, note_block_tag) if author_or_source in SOURCE or author_or_source in SOURCE_SECONDARY: data["sch:source"] = [author_or_source] + data['originalAuthorString'] = [author_or_source] else: # split by: ANY OF THESE ',-–' CHARACTERS FOLLOWED BY WHITESPACE '\s' AND NOT 'a ', 'az ', # 'A ' or 'Az ' @@ -87,15 +89,23 @@ def get_meta_from_articles_spec(tei_logger, url, bs): split_list = re.split("[,\-\–]\s(?!a\s|az\s|A\s|Az\s)", author_or_source) if len(split_list) > 0 and split_list[0] != '': source_list, author_list = [], [] - for author in split_list: - if author in SOURCE or author in SOURCE_SECONDARY: - source_list.append(author.strip()) - else: - author_list.append(author.strip()) + for author_ in split_list: # TODO: ez egy rovat tkp.: Olvasói vélemény + # Olvasói vélemény | Szerző: Ádám Attila + authors = re.split('/|\|', author_) + for author in authors: + if author in SOURCE or author in SOURCE_SECONDARY: + + source_list.append(author.strip()) + else: + author = author.replace('Szerző:', '').replace('szerző:', '') + author_list.append(author.strip()) if len(author_list) > 0: data['sch:author'] = author_list if len(source_list) > 0: data['sch:source'] = source_list + if len(author_list+source_list) > 1: # TODO: van más feltétel? + data['originalAuthorString'] = [author_or_source] + # data['AuthorString_extracted_from_content'] = author_or_source else: tei_logger.log('DEBUG', f'{url}: AUTHOR TAG TEXT EMPTY!') diff --git a/configs/vadhajtasok/vadhajtasok_specific.py b/configs/vadhajtasok/vadhajtasok_specific.py index 63bfa7c..2ea9593 100644 --- a/configs/vadhajtasok/vadhajtasok_specific.py +++ b/configs/vadhajtasok/vadhajtasok_specific.py @@ -83,7 +83,14 @@ def get_meta_from_articles_spec(tei_logger, url, bs): if len(source_raw) > 0 and len(source_raw.split()) < 6 and \ (source_raw.startswith('Forrás:') or source_raw.startswith('Írta:') or source_raw in SOURCE_2): - data['originalAuthorString'] = [source_raw] + if source_raw.startswith('Írta:'): + data['sch:author'] = source_raw.replace('Írta:', '').strip().split(',') + elif source_raw.startswith('Forrás:') or source_raw in SOURCE_2: + data['sch:source'] = [m.strip() for m in re.split(',|-|/|–| és |;', source_raw.replace('Forrás:', '').strip()) if 'Fotó:' not in m and len(m.strip())>0] + if len(data['sch:source'])> 1: + data['originalAuthorString'] = source_raw + data['AuthorString_extracted_from_content'] = source_raw + else: tei_logger.log('DEBUG', f'{url}: SOURCE NOT FOUND!') return data diff --git a/configs/vs/vs_specific.py b/configs/vs/vs_specific.py index bc3ae7e..683fc84 100644 --- a/configs/vs/vs_specific.py +++ b/configs/vs/vs_specific.py @@ -33,15 +33,27 @@ def get_meta_from_articles_spec(tei_logger, url, bs): author_tag = meta_root.find('span', itemprop='author') if author_tag is not None: author_text = author_tag.text.strip() + authorlist = [] """valójában ritkán van 'és', de a köv. sor az egyszerű esetet is kezeli""" # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184 - if ' – ' in author_text: - author_text = author_text.split(' – ') + if '– ' in author_text: + authorlist = [] + sourcelist = [] + data['originalAuthorString'] = [author_text] + #print('EREDETI:', author_text.split('– '), url) + [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in author_text.split('– ') if len(au.strip()) > 0 and len(au.strip()) is not None] + if len(sourcelist) > 0: + data['sch:source'] = sourcelist + '''print(authorlist, data['sch:source'], url) + else: + print('???', author_text, url)''' elif ' és ' in author_text: - author_text = author_text.split(' és ') + data['originalAuthorString'] = [author_text] + authorlist = author_text.split(' és ') else: - author_text = [author_text] - data['sch:author'] = author_text + authorlist = [author_text] + data['sch:author'] = authorlist + else: tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!') keywords_list = [t.text.strip() for t in meta_root.find_all('a', class_='tag')] diff --git a/src/html2tei/modes/portal_article_cleaner.py b/src/html2tei/modes/portal_article_cleaner.py index 6d9672d..da0be89 100644 --- a/src/html2tei/modes/portal_article_cleaner.py +++ b/src/html2tei/modes/portal_article_cleaner.py @@ -96,9 +96,18 @@ def tei_writer(warc_date, warc_id, xml_string, meta_data, article_body_contents, 'The original contains the following strings: ' for auth in original_author_string: create_new_tag_with_string(beauty_xml, auth, 'p', note_tag_auth) - tei_change = beauty_xml.find('change', source=True) tei_change.append(note_tag_auth) + if 'AuthorString_extracted_from_content' in meta_data.keys(): + original_author_from_content = meta_data['AuthorString_extracted_from_content'] + del meta_data['AuthorString_extracted_from_content'] + note_tag_auth2 = beauty_xml.new_tag('note') + note_tag_auth2.string = 'The string referring to the author or the source was extracted from the content of' \ + ' the article, since it was not clearly annotated by the portal. ' \ + 'The original was the following: ' + create_new_tag_with_string(beauty_xml, original_author_from_content, 'p', note_tag_auth2) + tei_change = beauty_xml.find('change', source=True) + tei_change.append(note_tag_auth2) # XENODATA 1: metadata of article source xeno_meta_datas = beauty_xml.find('rdf:Description') From 92b9c2d01ae773d9e3e985811f02f1a2efb0e246 Mon Sep 17 00:00:00 2001 From: SLZsofia Date: Thu, 23 Feb 2023 19:05:44 +0100 Subject: [PATCH 2/3] inprog --- configs/hvg/hvg_specific.py | 2 +- configs/magyaridok/magyaridok_specific.py | 7 ++- .../magyarnarancs/magyarnarancs_specific.py | 8 ++-- configs/p888/p888_specific.py | 2 +- configs/valasz/valasz_specific.py | 17 ++++--- configs/vs/vs_specific.py | 44 +++++++++---------- src/html2tei/correctors/unicode_error.py | 2 +- 7 files changed, 45 insertions(+), 37 deletions(-) diff --git a/configs/hvg/hvg_specific.py b/configs/hvg/hvg_specific.py index 1065006..888b10c 100644 --- a/configs/hvg/hvg_specific.py +++ b/configs/hvg/hvg_specific.py @@ -87,7 +87,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data['originalAuthorString'] = author_or_source_raw authors_l, sources_l = [], [] [sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in - author_or_source] + authors_list] if len(authors_l) > 0: data['sch:author'] = authors_l if len(sources_l) > 0: diff --git a/configs/magyaridok/magyaridok_specific.py b/configs/magyaridok/magyaridok_specific.py index dd297dd..9975db3 100644 --- a/configs/magyaridok/magyaridok_specific.py +++ b/configs/magyaridok/magyaridok_specific.py @@ -48,9 +48,12 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data['sch:author'] = [author.text.strip()] elif source is not None: # In case if not an author, only source (MTI) - data['sch:source'] = source.text.strip() + source_text = source.text.strip() + data['sch:source'] = [s.strip() for s in source_text.split(',')] + if len(data['sch:source']) > 1: + data['originalAuthorString'] = [source_text] else: - tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!') + tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!') article_tags = [] section_line = article_root.find('span', class_='en-article-header-column') if section_line is not None: diff --git a/configs/magyarnarancs/magyarnarancs_specific.py b/configs/magyarnarancs/magyarnarancs_specific.py index 2b09e33..b77a3f9 100644 --- a/configs/magyarnarancs/magyarnarancs_specific.py +++ b/configs/magyarnarancs/magyarnarancs_specific.py @@ -28,7 +28,7 @@ def author_source_norm(extracted_meta): ret_list = [] if isinstance(extracted_meta, list): for meta in extracted_meta: - ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és |', meta) if len(m.strip()) > 0]) + ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és ', meta) if len(m.strip()) > 0]) return ret_list return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0] @@ -79,8 +79,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs): subtitle = bs.find('h3', class_='card-subtitle') if subtitle is not None: data['sch:alternateName'] = subtitle.text.strip() - author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')] - author_or_source = author_source_norm(author_or_source) + author_or_source_o = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')] + author_or_source = author_source_norm(author_or_source_o) + if len(author_or_source) > len(author_or_source_o): + data['originalAuthorString'] = author_or_source_o author_list, source_list = [], [] [source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in author_or_source] diff --git a/configs/p888/p888_specific.py b/configs/p888/p888_specific.py index d4026ce..33ed8c2 100644 --- a/configs/p888/p888_specific.py +++ b/configs/p888/p888_specific.py @@ -91,7 +91,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): source_list, author_list = [], [] for author_ in split_list: # TODO: ez egy rovat tkp.: Olvasói vélemény # Olvasói vélemény | Szerző: Ádám Attila - authors = re.split('/|\|', author_) + authors = re.split('/|\||-|;|–', author_) for author in authors: if author in SOURCE or author in SOURCE_SECONDARY: diff --git a/configs/valasz/valasz_specific.py b/configs/valasz/valasz_specific.py index 9ac2f21..3a84700 100644 --- a/configs/valasz/valasz_specific.py +++ b/configs/valasz/valasz_specific.py @@ -50,13 +50,16 @@ def get_meta_from_articles_spec(tei_logger, url, bs): # Hírforrás: Válasz.hu author_tags = article_root.find_all('a', rel='author') if len(author_tags) > 0: - if any(len(elem.text.strip()) == 0 for elem in author_tags): - source = article_root.find('span', class_="forras") - if source is not None: - data['sch:source'] = source.text.strip() - else: - data['sch:author'] = [a.text for a in author_tags] - else: + data['sch:author'] = [a.text.strip() for a in author_tags] + #if any(len(elem.text.strip()) == 0 for elem in author_tags): + source = article_root.find('span', class_="forras") + if source is not None: + # Hírforrás + sources = [m.strip() for m in re.split(',|/| - |;| és ', source.text.strip().replace('Hírforrás: ', '')) if len(m.strip()) > 0] + if len(sources) > 1: + data['originalAuthorString'] = [source.text.strip()] + data['sch:source'] = source.text.strip() + if author_tags is None and source is None: # The source and author fields can co-exist article_source = article_root.find('span', class_='forras') article_author2 = article_root.find('span', class_='szerzo') diff --git a/configs/vs/vs_specific.py b/configs/vs/vs_specific.py index 683fc84..baa2ad6 100644 --- a/configs/vs/vs_specific.py +++ b/configs/vs/vs_specific.py @@ -11,6 +11,16 @@ ARTICLE_ROOT_PARAMS_SPEC = [(('div',), {'itemprop': 'articleBody'})] +def author_source_norm(extracted_meta): + # SPEC VS.HU + ret_list = [] + if isinstance(extracted_meta, list): + for meta in extracted_meta: + ret_list.extend([m.strip() for m in re.split(',|/|– |;| és ', meta) if len(m.strip()) > 0]) + return ret_list + return [m.strip() for m in re.split(',|/|– |;| és ', extracted_meta) if len(m.strip())>0] + + def get_meta_from_articles_spec(tei_logger, url, bs): data = tei_defaultdict() data['sch:url'] = url @@ -31,31 +41,21 @@ def get_meta_from_articles_spec(tei_logger, url, bs): else: tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!') author_tag = meta_root.find('span', itemprop='author') - if author_tag is not None: + if author_tag is not None: # MTI/VS.hu": 704 author_text = author_tag.text.strip() - authorlist = [] - """valójában ritkán van 'és', de a köv. sor az egyszerű esetet is kezeli""" - # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184 - if '– ' in author_text: - authorlist = [] - sourcelist = [] - data['originalAuthorString'] = [author_text] - #print('EREDETI:', author_text.split('– '), url) - [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in author_text.split('– ') if len(au.strip()) > 0 and len(au.strip()) is not None] - if len(sourcelist) > 0: - data['sch:source'] = sourcelist - '''print(authorlist, data['sch:source'], url) - else: - print('???', author_text, url)''' - elif ' és ' in author_text: + creatorlist = author_source_norm(author_text) + if len(creatorlist) > 1: data['originalAuthorString'] = [author_text] - authorlist = author_text.split(' és ') - else: - authorlist = [author_text] - data['sch:author'] = authorlist - + # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184 + authorlist = [] + sourcelist = [] + [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in creatorlist] + if len(sourcelist) > 0: + data['sch:source'] = sourcelist + if len(authorlist) > 0: + data['sch:author'] = authorlist else: - tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!') + tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!') keywords_list = [t.text.strip() for t in meta_root.find_all('a', class_='tag')] if len(keywords_list) > 0: data['sch:articleSection'] = keywords_list[0] diff --git a/src/html2tei/correctors/unicode_error.py b/src/html2tei/correctors/unicode_error.py index 95cc3a1..c170a1d 100644 --- a/src/html2tei/correctors/unicode_error.py +++ b/src/html2tei/correctors/unicode_error.py @@ -20,7 +20,7 @@ def article_encoding_correction(article, decompose_fun): """Finds useful text in a mixture of json snippets and incorrectly encoded text. It also tries to repair damaged (ill-formed HTML) articles """ - decompose_fun(article, 'media_unwrap') + decompose_fun(article)#, 'media_unwrap') soup = BeautifulSoup('a', 'lxml') unwrap_all(article, True) ret = _extract_first_instance_of_article_text(article) From 41faa243c1abf88edca0fd231159b9c7397ed685 Mon Sep 17 00:00:00 2001 From: SLZsofia Date: Tue, 9 May 2023 09:28:37 +0200 Subject: [PATCH 3/3] 2 config update --- configs/merce/merce_specific.py | 4 +++- configs/mno/mno_specific.py | 2 +- configs/valasz/valasz_specific.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/configs/merce/merce_specific.py b/configs/merce/merce_specific.py index 0ca6337..5b6e8d5 100644 --- a/configs/merce/merce_specific.py +++ b/configs/merce/merce_specific.py @@ -54,7 +54,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): data['sch:author'] = authors if len(authors) > 1: print('Több szerző', url) - # TODO: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/ + else: + # TODO 1 + # TODO 2: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/ # else: tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!') #