From 19d7c4f29b3a7f560ffb88037443d20cd74a00da Mon Sep 17 00:00:00 2001
From: SLZsofia <zsofia.monika@gmail.com>
Date: Wed, 22 Feb 2023 10:38:50 +0100
Subject: [PATCH 1/3] author/source/original/from_content for 7 portal

---
 configs/alfahir/alfahir_specific.py           | 12 +++++-
 configs/hvg/hvg_specific.py                   | 39 ++++++++++++++++---
 configs/kurucinfo/kurucinfo_specific.py       | 19 ++++-----
 .../magyarnarancs/magyarnarancs_specific.py   | 26 ++++++++++---
 configs/p888/p888_specific.py                 | 20 +++++++---
 configs/vadhajtasok/vadhajtasok_specific.py   |  9 ++++-
 configs/vs/vs_specific.py                     | 22 ++++++++---
 src/html2tei/modes/portal_article_cleaner.py  | 11 +++++-
 8 files changed, 123 insertions(+), 35 deletions(-)
diff --git a/configs/alfahir/alfahir_specific.py b/configs/alfahir/alfahir_specific.py
index 6f6e3fe..4a0d1b3 100644
--- a/configs/alfahir/alfahir_specific.py
+++ b/configs/alfahir/alfahir_specific.py
@@ -34,6 +34,9 @@
                'hvg.hu - barikad.hu', 'MTI, Népszabadság'
                ]
 
+def author_source_norm(extracted_meta):  # raw source string -> list of trimmed, non-empty source names
+    return [name for name in map(str.strip, re.split(',| - |/| – ', extracted_meta)) if name]
+
 
 def get_meta_from_articles_spec(tei_logger, url, bs):
     data = tei_defaultdict()
@@ -194,7 +197,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             if source_in_text_tag is not None:
                 source_in_text_tag_text = source_in_text_tag.find('div', class_='field--item').get_text(strip=True)
                 if source_in_text_tag_text is not None:
-                    data['sch:source'] = source_in_text_tag_text
+                    data['sch:source'] = author_source_norm(source_in_text_tag_text)
+                    if len(data['sch:source'])> 1:
+                        data['originalAuthorString'] = [source_in_text_tag_text]  # list: the TEI writer iterates it
 
             # Sometimes implicitly inserted into a <p> tag
             else:
@@ -222,7 +227,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                             if len(source_in_text_4) < 40:
                                 source_text = source_in_text_4.strip()
                     if source_text in SOURCE_LIST:  # Above code allows minimal mistakes - invalid sources are filtered
-                        data['sch:source'] = [source_text]
+                        data['sch:source'] = author_source_norm(source_text)
+                        if len(data['sch:source'])> 1:
+                            data['originalAuthorString'] = [source_text]  # list: the TEI writer iterates it
+                        data['AuthorString_extracted_from_content'] = source_text
                 else:
                     tei_logger.log('DEBUG', f'{url}: SOURCE TAG NOT FOUND!')
 
diff --git a/configs/hvg/hvg_specific.py b/configs/hvg/hvg_specific.py
index 487285f..1065006 100644
--- a/configs/hvg/hvg_specific.py
+++ b/configs/hvg/hvg_specific.py
@@ -35,6 +35,24 @@
           'MTI/Népszava', 'MTI/dpa/Hszinhua', 'OTS/MTI', 'BBC/MTI', 'MTI/Bors', 'MTI/Reuters/AP', 'MTI/AP', 
           'MTI/AFP/Reuters', 'MTI/Reuters/Hszinhua', 'MTI/Blikk', 'HVG/MTI']
 
+SOURCE_NORM = {'EFE', 'HavariaPress', 'dpa', 'businesstraveller', 'Kisalföld', 'D.P.', 'dehir.hu', 'f1-live.hu',
+               'MT Zrt.', 'MTI', 'Bankmonitor.hu', 'manna.ro', 'pecsma.hu', 'foodnetwork', 'InfoRádió', 'VG',
+               'TV2', 'delmagyar.hu', 'Népszava', 'OTS', 'transindex.ro', 'portfolio.hu', 'honvedelem.hu',
+               'HVG Extra Business', 'CNN', 'napi.hu', 'MTI-OS', 'Index', 'met.hu', 'Utinform.hu',
+               'nyugat.hu', 'BBC', 'kemma.hu', 'turizmus.com', 'Jobline.hu', 'AP', 'muosz.hu',
+               'élelmiszer online', 'MNO', 'baon.hu', 'teol.hu', 'ITAR-TASZSZ', 'Blikk', 'hirado.hu',
+               'HVG Extra Pszichológia', 'indohaz.hu', 'Bors', 'Számlázz.hu', 'Napi.hu', 'bankmonitor.hu',
+               'Hszinhua', 'MTI ', 'HVG Konferencia', 'DW', 'Inforádió', 'Zgut Edit', 'Dow Jones', 'Origo',
+               'Eduline', 'OS', 'Világgazdaság', 'MR1-Kossuth Rádió', 'szoljon.hu', 'hvg.hu', 'Észak-Magyarország',
+               'VinceBudapest', 'vendeglatasmagazin.hu', 'termekmix.hu', 'AFP', 'nso.hu', 'termekmix.com', 'benke',
+               'f1-live', 'BiztosDöntés.hu', 'ingatlanmenedzser.hu', 'kisalföld.hu', 'atlatszo.blog.hu', 'Travellina',
+               'merites.hu', 'Euronews', 'Marabu', 'sonline.hu', ' hvg.hu', 'EUrologus', 'Tények', 'Reuters',
+               'Magyar Nemzet', 'DPA', 'MTA', '- esel -', 'eduline.hu', 'MLF', 'HVG', 'Adozona.hu', 'mult-kor.hu',
+               'REUTERS', 'I.N.', 'Népszabadság', 'police.hu', 'Bank360.hu'}
+
+
+def author_source_norm(extracted_meta):  # raw author/source string -> list of trimmed, non-empty names
+    return [name for name in map(str.strip, re.split(',|-|/|–|;| és ', extracted_meta)) if name]
 
 def get_meta_from_articles_spec(tei_logger, url, bs):
     data = tei_defaultdict()
@@ -61,12 +79,20 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')
         author_or_source_tag = article_root.find('div', class_='author-name')
         if author_or_source_tag is not None:
-            author_or_source = author_or_source_tag.text.strip().\
-                replace('\r', '').replace('\n', '').replace('\t', '').replace('Követés', '')
-            if author_or_source in SOURCE:
-                data['sch:source'] = [author_or_source]
-            else:
-                data['sch:author'] = [author_or_source]
+            author_or_source_raw = author_or_source_tag.text.strip().\
+                replace('\r', '').replace('\n', '').replace('\t', '')
+            author_or_source = author_or_source_raw.replace('Követés', '')
+            authors_list = author_source_norm(author_or_source)
+            if author_or_source != author_or_source_raw or len(authors_list) > 1:
+                data['originalAuthorString'] = author_or_source_raw
+            authors_l, sources_l = [], []
+            [sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in
+             author_or_source]
+            if len(authors_l) > 0:
+                data['sch:author'] = authors_l
+            if len(sources_l) > 0:
+                data['sch:source'] = sources_l
+
         else:
             tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
         keywords_root = article_root.find('div', class_='article-tags')
@@ -99,6 +125,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                 if beg != -1 and end != -1:
                     author = auth_string[beg+1:end+1]
                     data['sch:author'] = [author]
+                    data['originalAuthorString'] = [auth_string]  # list: the TEI writer iterates it
                 else:
                     tei_logger.log('DEBUG', f'{url} NO AUTHOR FOUND!')
 
diff --git a/configs/kurucinfo/kurucinfo_specific.py b/configs/kurucinfo/kurucinfo_specific.py
index 6d4c356..d151373 100644
--- a/configs/kurucinfo/kurucinfo_specific.py
+++ b/configs/kurucinfo/kurucinfo_specific.py
@@ -16,7 +16,7 @@
 def get_meta_from_articles_spec(tei_logger, url, bs):
     data = tei_defaultdict()
     data['sch:url'] = url
-    
+
     # MISSING FROM PORTAL: data['sch:dateModified']
 
     # ARTICLE SECTION
@@ -29,7 +29,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             tei_logger.log('WARNING', f'{url}: ARTICLE SECTION TAG PARSE ERROR!')
     else:
         tei_logger.log('WARNING', f'{url}: ARTICLE SECTION NOT FOUND!')
-    
+
     article_root = bs.find('div', {'class': 'tblot'})
     if article_root is not None:
 
@@ -39,9 +39,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             title_text = title_tag.get_text(strip=True)
             if title_text is not None:
                 data['sch:name'] = title_text
-            else: 
+            else:
                 tei_logger.log('WARNING', f'{url}: TITLE TAG EMPTY!')
-        else: 
+        else:
             tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')
 
         # AUTHOR / https://kuruc.info/r/6/150707/
@@ -55,6 +55,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                 if 1 < len(split_t) <= 3 and ('(' or ')') not in tag_text and all([w[0].isupper() for w in split_t]):
                     data['sch:author'] = [tag_text]
                     data['originalAuthorString'] = [possible_author_tag.get_text(strip=True)]
+                    data['AuthorString_extracted_from_content'] = possible_author_tag.text
                 else:
                     tei_logger.log('DEBUG', f'{url}: AUTHOR TAG EMPTY!')
             else:
@@ -63,25 +64,25 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         # KEYWORDS and DATE PUBLISHED
         meta_header = article_root.find('p', {'class': 'cikkdatum'})
         if meta_header is not None:
-            
+
             # keywords
             a_tags = meta_header.find_all('a', href=re.compile('/t/[0-9]'))
             if len(a_tags) > 0:
                 data['sch:keywords'] = [t.get_text(strip=True) for t in a_tags if len(t.get_text(strip=True)) > 0]
             else:
                 tei_logger.log('INFO', f'{url}: KEYWORDS NOT FOUND!')
-                
+
             # datePublished
             date_published_tag = meta_header.find('span', {'itemprop': "datePublished"})
             if date_published_tag is not None:
                 date_published_raw = date_published_tag.get_text(strip=True)
                 if date_published_raw is not None:
                     data['sch:datePublished'] = parse_date(date_published_raw, "%Y. %B %d. %H:%M")
-                else: 
+                else:
                     tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')
-            else: 
+            else:
                 tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')
-            
+
         else:
             tei_logger.log('WARNING', f'{url}: META HEADER [datePublished, keywords] NOT FOUND!')
 
diff --git a/configs/magyarnarancs/magyarnarancs_specific.py b/configs/magyarnarancs/magyarnarancs_specific.py
index ccc4f22..2b09e33 100644
--- a/configs/magyarnarancs/magyarnarancs_specific.py
+++ b/configs/magyarnarancs/magyarnarancs_specific.py
@@ -22,6 +22,15 @@
           'narancs.hu/Republikon', 'MTI/narancs', 'narancs hu.', 'MTI/Világgazdaság/narancs.hu', 'narancs.hu - B. T.',
           'transindex.ro', 'MTI-OS', 'nrancs.hu']
 
+SOURCE_SOLO = {'narancs.hu', 'szegeder.hu', 'MTA', 'narancs', 'HVG', 'narancs hu', 'Markó Anita', 'narancs hu.', 'Guardian', 'Telex', 'Narancs.hu', 'Fizetett tartalom', 'MTI', 'narancsfül', 'narancs.', 'transindex.ro', 'Amnesty', 'media1.hu', 'Narancsfül', 'narancsblog', 'Reuters', 'narancs. hu', 'Narancs', 'narancsszem', 'M', 'Republikon', 'nrancs.hu', 'narancs.hu-összeállítás', 'TASZ', 'narancs.hu - B. T.', 'Magyar Narancs', 'narancs.hu-MTI', 'Világgazdaság', 'Police.hu', 'OS', 'Szabad ország', 'MTI-OS', 'narancs.h', 'Narancs-összeállítás', 'MT'}
+
+def author_source_norm(extracted_meta):  # str or list of str -> flat list of trimmed, non-empty name parts
+    ret_list = []
+    if isinstance(extracted_meta, list):
+        for meta in extracted_meta:
+            ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és |', meta) if len(m.strip()) > 0])
+        return ret_list
+    return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0]
 
 def get_meta_from_articles_spec(tei_logger, url, bs):
     data = tei_defaultdict()
@@ -71,19 +80,28 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         if subtitle is not None:
             data['sch:alternateName'] = subtitle.text.strip()
         author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
+        author_or_source = author_source_norm(author_or_source)
         author_list, source_list = [], []
-        [source_list.append(creator) if creator in SOURCE else author_list.append(creator) for creator in
+        [source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in
          author_or_source]
         if len(author_list) > 0 or len(source_list) > 0:
             author_list_corr = []
             if len(author_list) > 0:
                 for auth in author_list:
                     if ',' in auth:
-                        author_list_corr.extend(one_author.strip() for one_author in auth.split('\''))
+                        if '(' not in auth:
+                            author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
+                            data['originalAuthorString'] = [auth]
+                        else:
+                            # Szlankó Bálint (Argandab-folyóvölgy, Kandahár)
+                            data['originalAuthorString'] = [auth]
+                            auth = auth[0:auth.index('(')]
+                            author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
                     else:
                         author_list_corr.append(auth)
                 data['sch:author'] = author_list_corr
             if len(source_list) > 0:
+
                 data['sch:source'] = source_list
         else:
             tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
@@ -102,10 +120,6 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
 def excluded_tags_spec(tag):
     if tag.name == 'li' and 'data-leiras' in tag.attrs and tag['data-leiras'] == ' \r\n':
         tag['data-leiras'] = tag['data-leiras'].strip()
-        
-    # if tag.name not in HTML_BASICS:
-    #     tag.name = 'else'
-    # tag.attrs = {}
     return tag
 
 
diff --git a/configs/p888/p888_specific.py b/configs/p888/p888_specific.py
index ddedeb3..d4026ce 100644
--- a/configs/p888/p888_specific.py
+++ b/configs/p888/p888_specific.py
@@ -78,8 +78,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                 if note_block_tag is not None:
                     author_or_source = note_block_tag.find('div', class_='text-wrap').get_text(strip=True)
                     if author_or_source is not None:
+                        #print(url, note_block_tag)
                         if author_or_source in SOURCE or author_or_source in SOURCE_SECONDARY:
                             data["sch:source"] = [author_or_source]
+                            data['originalAuthorString'] = [author_or_source]
                         else:
                             # split by: ANY OF THESE ',-–' CHARACTERS FOLLOWED BY WHITESPACE '\s' AND NOT 'a ', 'az ',
                             # 'A ' or 'Az '
@@ -87,15 +89,23 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                             split_list = re.split("[,\-\–]\s(?!a\s|az\s|A\s|Az\s)", author_or_source)
                             if len(split_list) > 0 and split_list[0] != '':
                                 source_list, author_list = [], []
-                                for author in split_list:
-                                    if author in SOURCE or author in SOURCE_SECONDARY:
-                                        source_list.append(author.strip())
-                                    else:
-                                        author_list.append(author.strip())
+                                for author_ in split_list:   # TODO: ez egy rovat tkp.: Olvasói vélemény
+                                    # Olvasói vélemény | Szerző: Ádám Attila
+                                    authors = re.split('/|\|', author_)
+                                    for author in authors:
+                                        if author in SOURCE or author in SOURCE_SECONDARY:
+
+                                            source_list.append(author.strip())
+                                        else:
+                                            author = author.replace('Szerző:', '').replace('szerző:', '')
+                                            author_list.append(author.strip())
                                 if len(author_list) > 0:
                                     data['sch:author'] = author_list
                                 if len(source_list) > 0:
                                     data['sch:source'] = source_list
+                                if len(author_list+source_list) > 1:    # TODO: van más feltétel?
+                                    data['originalAuthorString'] = [author_or_source]
+                        # data['AuthorString_extracted_from_content'] = author_or_source
                     else:
                         tei_logger.log('DEBUG', f'{url}: AUTHOR TAG TEXT EMPTY!')
 
diff --git a/configs/vadhajtasok/vadhajtasok_specific.py b/configs/vadhajtasok/vadhajtasok_specific.py
index 63bfa7c..2ea9593 100644
--- a/configs/vadhajtasok/vadhajtasok_specific.py
+++ b/configs/vadhajtasok/vadhajtasok_specific.py
@@ -83,7 +83,14 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             if len(source_raw) > 0 and len(source_raw.split()) < 6 and \
                     (source_raw.startswith('Forrás:') or source_raw.startswith('Írta:') or
                      source_raw in SOURCE_2):
-                data['originalAuthorString'] = [source_raw]
+                if source_raw.startswith('Írta:'):
+                    data['sch:author'] = [a.strip() for a in source_raw.replace('Írta:', '').split(',') if a.strip()]
+                elif source_raw.startswith('Forrás:') or source_raw in SOURCE_2:
+                    data['sch:source'] = [m.strip() for m in re.split(',|-|/|–| és |;', source_raw.replace('Forrás:', '').strip()) if 'Fotó:' not in m and len(m.strip())>0]
+                    if len(data['sch:source'])> 1:
+                        data['originalAuthorString'] = [source_raw]  # list: the TEI writer iterates it
+                data['AuthorString_extracted_from_content'] = source_raw
+
         else:
             tei_logger.log('DEBUG', f'{url}: SOURCE NOT FOUND!')
         return data
diff --git a/configs/vs/vs_specific.py b/configs/vs/vs_specific.py
index bc3ae7e..683fc84 100644
--- a/configs/vs/vs_specific.py
+++ b/configs/vs/vs_specific.py
@@ -33,15 +33,27 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         author_tag = meta_root.find('span', itemprop='author')
         if author_tag is not None:
             author_text = author_tag.text.strip()
+            authorlist = []
             """valójában ritkán van 'és', de a köv. sor az egyszerű esetet is kezeli"""
             # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184
-            if ' – ' in author_text:
-                author_text = author_text.split(' – ')
+            if '– ' in author_text:
+                authorlist = []
+                sourcelist = []
+                data['originalAuthorString'] = [author_text]
+                #print('EREDETI:', author_text.split('– '), url)
+                [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in author_text.split('– ') if len(au.strip()) > 0 and len(au.strip()) is not None]
+                if len(sourcelist) > 0:
+                    data['sch:source'] = sourcelist
+                    '''print(authorlist, data['sch:source'], url)
+                else:
+                    print('???', author_text,  url)'''
             elif ' és ' in author_text:
-                author_text = author_text.split(' és ')
+                data['originalAuthorString'] = [author_text]
+                authorlist = author_text.split(' és ')
             else:
-                author_text = [author_text]
-            data['sch:author'] = author_text
+                authorlist = [author_text]
+            data['sch:author'] = authorlist
+
         else:
             tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
         keywords_list = [t.text.strip() for t in meta_root.find_all('a', class_='tag')]
diff --git a/src/html2tei/modes/portal_article_cleaner.py b/src/html2tei/modes/portal_article_cleaner.py
index 6d9672d..da0be89 100644
--- a/src/html2tei/modes/portal_article_cleaner.py
+++ b/src/html2tei/modes/portal_article_cleaner.py
@@ -96,9 +96,18 @@ def tei_writer(warc_date, warc_id, xml_string, meta_data, article_body_contents,
                                'The original contains the following strings: '
         for auth in original_author_string:
             create_new_tag_with_string(beauty_xml, auth, 'p', note_tag_auth)
-
         tei_change = beauty_xml.find('change', source=True)
         tei_change.append(note_tag_auth)
+    if 'AuthorString_extracted_from_content' in meta_data.keys():
+        original_author_from_content = meta_data['AuthorString_extracted_from_content']
+        del meta_data['AuthorString_extracted_from_content']
+        note_tag_auth2 = beauty_xml.new_tag('note')
+        note_tag_auth2.string = 'The string referring to the author or the source was extracted from the content of' \
+                                ' the article, since it was not clearly annotated by the portal. ' \
+                                'The original was the following: '
+        create_new_tag_with_string(beauty_xml, original_author_from_content, 'p', note_tag_auth2)
+        tei_change = beauty_xml.find('change', source=True)
+        tei_change.append(note_tag_auth2)
 
     # XENODATA 1: metadata of article source
     xeno_meta_datas = beauty_xml.find('rdf:Description')

From 92b9c2d01ae773d9e3e985811f02f1a2efb0e246 Mon Sep 17 00:00:00 2001
From: SLZsofia <zsofia.monika@gmail.com>
Date: Thu, 23 Feb 2023 19:05:44 +0100
Subject: [PATCH 2/3] inprog

---
 configs/hvg/hvg_specific.py                   |  2 +-
 configs/magyaridok/magyaridok_specific.py     |  7 ++-
 .../magyarnarancs/magyarnarancs_specific.py   |  8 ++--
 configs/p888/p888_specific.py                 |  2 +-
 configs/valasz/valasz_specific.py             | 17 ++++---
 configs/vs/vs_specific.py                     | 44 +++++++++----------
 src/html2tei/correctors/unicode_error.py      |  2 +-
 7 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/configs/hvg/hvg_specific.py b/configs/hvg/hvg_specific.py
index 1065006..888b10c 100644
--- a/configs/hvg/hvg_specific.py
+++ b/configs/hvg/hvg_specific.py
@@ -87,7 +87,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                 data['originalAuthorString'] = author_or_source_raw
             authors_l, sources_l = [], []
             [sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in
-             author_or_source]
+             authors_list]
             if len(authors_l) > 0:
                 data['sch:author'] = authors_l
             if len(sources_l) > 0:
diff --git a/configs/magyaridok/magyaridok_specific.py b/configs/magyaridok/magyaridok_specific.py
index dd297dd..9975db3 100644
--- a/configs/magyaridok/magyaridok_specific.py
+++ b/configs/magyaridok/magyaridok_specific.py
@@ -48,9 +48,12 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             data['sch:author'] = [author.text.strip()]
         elif source is not None:
             # In case if not an author, only source (MTI)
-            data['sch:source'] = source.text.strip()
+            source_text = source.text.strip()
+            data['sch:source'] = [s.strip() for s in source_text.split(',')]
+            if len(data['sch:source']) > 1:
+                data['originalAuthorString'] = [source_text]
         else:
-            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
+            tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
         article_tags = []
         section_line = article_root.find('span', class_='en-article-header-column')
         if section_line is not None:
diff --git a/configs/magyarnarancs/magyarnarancs_specific.py b/configs/magyarnarancs/magyarnarancs_specific.py
index 2b09e33..b77a3f9 100644
--- a/configs/magyarnarancs/magyarnarancs_specific.py
+++ b/configs/magyarnarancs/magyarnarancs_specific.py
@@ -28,7 +28,7 @@ def author_source_norm(extracted_meta):
     ret_list = []
     if isinstance(extracted_meta, list):
         for meta in extracted_meta:
-            ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és |', meta) if len(m.strip()) > 0])
+            ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és ', meta) if len(m.strip()) > 0])
         return ret_list
     return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0]
 
@@ -79,8 +79,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         subtitle = bs.find('h3', class_='card-subtitle')
         if subtitle is not None:
             data['sch:alternateName'] = subtitle.text.strip()
-        author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
-        author_or_source = author_source_norm(author_or_source)
+        author_or_source_o = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
+        author_or_source = author_source_norm(author_or_source_o)
+        if len(author_or_source) > len(author_or_source_o):
+            data['originalAuthorString'] = author_or_source_o
         author_list, source_list = [], []
         [source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in
          author_or_source]
diff --git a/configs/p888/p888_specific.py b/configs/p888/p888_specific.py
index d4026ce..33ed8c2 100644
--- a/configs/p888/p888_specific.py
+++ b/configs/p888/p888_specific.py
@@ -91,7 +91,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
                                 source_list, author_list = [], []
                                 for author_ in split_list:   # TODO: ez egy rovat tkp.: Olvasói vélemény
                                     # Olvasói vélemény | Szerző: Ádám Attila
-                                    authors = re.split('/|\|', author_)
+                                    authors = re.split(r'/|\||-|;|–', author_)
                                     for author in authors:
                                         if author in SOURCE or author in SOURCE_SECONDARY:
 
diff --git a/configs/valasz/valasz_specific.py b/configs/valasz/valasz_specific.py
index 9ac2f21..3a84700 100644
--- a/configs/valasz/valasz_specific.py
+++ b/configs/valasz/valasz_specific.py
@@ -50,13 +50,16 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         #  <span class="forras">Hírforrás: Válasz.hu</span>
         author_tags = article_root.find_all('a', rel='author')
         if len(author_tags) > 0:
-            if any(len(elem.text.strip()) == 0 for elem in author_tags):
-                source = article_root.find('span', class_="forras")
-                if source is not None:
-                    data['sch:source'] = source.text.strip()
-            else:
-                data['sch:author'] = [a.text for a in author_tags]
-        else:
+            data['sch:author'] = [a.text.strip() for a in author_tags]
+            #if any(len(elem.text.strip()) == 0 for elem in author_tags):
+        source = article_root.find('span', class_="forras")
+        if source is not None:
+            # Hírforrás
+            sources = [m.strip() for m in re.split(',|/| - |;| és ', source.text.strip().replace('Hírforrás: ', '')) if len(m.strip()) > 0]
+            if len(sources) > 1:
+                data['originalAuthorString'] = [source.text.strip()]
+            data['sch:source'] = source.text.strip()
+        if author_tags is None and source is None:
             # The source and author fields can co-exist
             article_source = article_root.find('span', class_='forras')
             article_author2 = article_root.find('span', class_='szerzo')
diff --git a/configs/vs/vs_specific.py b/configs/vs/vs_specific.py
index 683fc84..baa2ad6 100644
--- a/configs/vs/vs_specific.py
+++ b/configs/vs/vs_specific.py
@@ -11,6 +11,16 @@
 ARTICLE_ROOT_PARAMS_SPEC = [(('div',), {'itemprop': 'articleBody'})]
 
 
+def author_source_norm(extracted_meta):
+    """Split raw vs.hu author/source text (a string or a list of strings) into trimmed, non-empty names."""
+    # SPEC VS.HU: a lone string is treated as a one-element list so both input kinds share one code path.
+    metas = extracted_meta if isinstance(extracted_meta, list) else [extracted_meta]
+    ret_list = []
+    for meta in metas:
+        ret_list.extend(part for part in map(str.strip, re.split(',|/|– |;| és ', meta)) if part)
+    return ret_list
+
+
 def get_meta_from_articles_spec(tei_logger, url, bs):
     data = tei_defaultdict()
     data['sch:url'] = url
@@ -31,31 +41,21 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         else:
             tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')
         author_tag = meta_root.find('span', itemprop='author')
-        if author_tag is not None:
+        if author_tag is not None:  # MTI/VS.hu": 704
             author_text = author_tag.text.strip()
-            authorlist = []
-            """valójában ritkán van 'és', de a köv. sor az egyszerű esetet is kezeli"""
-            # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184
-            if '– ' in author_text:
-                authorlist = []
-                sourcelist = []
-                data['originalAuthorString'] = [author_text]
-                #print('EREDETI:', author_text.split('– '), url)
-                [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in author_text.split('– ') if len(au.strip()) > 0 and len(au.strip()) is not None]
-                if len(sourcelist) > 0:
-                    data['sch:source'] = sourcelist
-                    '''print(authorlist, data['sch:source'], url)
-                else:
-                    print('???', author_text,  url)'''
-            elif ' és ' in author_text:
+            creatorlist = author_source_norm(author_text)
+            if len(creatorlist) > 1:
                 data['originalAuthorString'] = [author_text]
-                authorlist = author_text.split(' és ')
-            else:
-                authorlist = [author_text]
-            data['sch:author'] = authorlist
-
+            # https://vs.hu/sport/osszes/magyarorszag-spanyolorszag-percrol-percre-1211#!s184
+            authorlist = []
+            sourcelist = []
+            [sourcelist.append(au.strip()) if 'MTI' in au.strip() else authorlist.append(au.strip()) for au in creatorlist]
+            if len(sourcelist) > 0:
+                data['sch:source'] = sourcelist
+            if len(authorlist) > 0:
+                data['sch:author'] = authorlist
         else:
-            tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
+            tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
         keywords_list = [t.text.strip() for t in meta_root.find_all('a', class_='tag')]
         if len(keywords_list) > 0:
             data['sch:articleSection'] = keywords_list[0]
diff --git a/src/html2tei/correctors/unicode_error.py b/src/html2tei/correctors/unicode_error.py
index 95cc3a1..c170a1d 100644
--- a/src/html2tei/correctors/unicode_error.py
+++ b/src/html2tei/correctors/unicode_error.py
@@ -20,7 +20,7 @@ def article_encoding_correction(article, decompose_fun):
     """Finds useful text in a mixture of json snippets and incorrectly encoded text.
        It also tries to repair damaged (ill-formed HTML) articles
     """
-    decompose_fun(article, 'media_unwrap')
+    decompose_fun(article)#, 'media_unwrap')
     soup = BeautifulSoup('a', 'lxml')
     unwrap_all(article, True)
     ret = _extract_first_instance_of_article_text(article)

From 41faa243c1abf88edca0fd231159b9c7397ed685 Mon Sep 17 00:00:00 2001
From: SLZsofia <zsofia.monika@gmail.com>
Date: Tue, 9 May 2023 09:28:37 +0200
Subject: [PATCH 3/3] 2 config update

---
 configs/merce/merce_specific.py   | 4 +++-
 configs/mno/mno_specific.py       | 2 +-
 configs/valasz/valasz_specific.py | 6 +++---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/configs/merce/merce_specific.py b/configs/merce/merce_specific.py
index 0ca6337..5b6e8d5 100644
--- a/configs/merce/merce_specific.py
+++ b/configs/merce/merce_specific.py
@@ -54,7 +54,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             data['sch:author'] = authors
             if len(authors) > 1:
                 print('Több szerző', url)
-    # TODO: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
+    else:
+        pass  # TODO 1 <a href="https://avm.merce.hu/author/evatessza/" title="Udvarhelyi Tessza cikkei" class="author url fn track-act-up" rel="author">Udvarhelyi Tessza</a>
+    # TODO 2: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
     # else: tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
     # <div class="featured-tag">
     """is_section = bs.find('div', {'class': 'featured-tag'})
diff --git a/configs/mno/mno_specific.py b/configs/mno/mno_specific.py
index 6505940..185d3bf 100755
--- a/configs/mno/mno_specific.py
+++ b/configs/mno/mno_specific.py
@@ -48,7 +48,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
         if author is not None:
             data['sch:author'] = [author.text.strip()]
         elif source is not None:
-            data['sch:source'] = source.text.strip()
+            data['sch:source'] = [source.text.strip()]
         else:
             tei_logger.log('DEBUG', f'{url}  AUTHOR AND SOURCE TAG NOT FOUND!')
 
diff --git a/configs/valasz/valasz_specific.py b/configs/valasz/valasz_specific.py
index 3a84700..7aecc5e 100644
--- a/configs/valasz/valasz_specific.py
+++ b/configs/valasz/valasz_specific.py
@@ -58,13 +58,13 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
             sources = [m.strip() for m in re.split(',|/| - |;| és ', source.text.strip().replace('Hírforrás: ', '')) if len(m.strip()) > 0]
             if len(sources) > 1:
                 data['originalAuthorString'] = [source.text.strip()]
-            data['sch:source'] = source.text.strip()
-        if author_tags is None and source is None:
+            data['sch:source'] = sources
+        if len(author_tags) == 0 and source is None:  # find_all() returns a (possibly empty) list, never None
             # The source and author fields can co-exist
             article_source = article_root.find('span', class_='forras')
             article_author2 = article_root.find('span', class_='szerzo')
             if article_source is not None:
-                data['sch:source'] = article_source.text.strip()
+                data['sch:source'] = [article_source.text.strip()]
             if article_author2 is not None:
                 data['sch:author'] = [article_author2.text.strip()]
         keyword_root = bs.find('aside', class_='breadcrumb')