Original author source #89

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

sarkozizsofia wants to merge 3 commits into main from original_author_source

configs/alfahir/alfahir_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -34,6 +34,9 @@ @@
                    'hvg.hu - barikad.hu', 'MTI, Népszabadság'
                    ]
+    def author_source_norm(extracted_meta):
+        # Split the raw author/source string on ',', ' - ', '/' or ' – ', strip every piece and
+        # drop the pieces that are empty after stripping. Returns a list of strings.
+        # The dash delimiters require surrounding spaces here, so hyphenated names stay in one piece.
+        return [m.strip() for m in re.split(',| - |/| – ', extracted_meta) if len(m.strip())>0]
     def get_meta_from_articles_spec(tei_logger, url, bs):
         data = tei_defaultdict()
@@ Expand Down Expand Up / @@ -194,7 +197,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                 if source_in_text_tag is not None:
                     source_in_text_tag_text = source_in_text_tag.find('div', class_='field--item').get_text(strip=True)
                     if source_in_text_tag_text is not None:
-                        data['sch:source'] = source_in_text_tag_text
+                        data['sch:source'] = author_source_norm(source_in_text_tag_text)
+                        if len(data['sch:source'])> 1:
+                            data['originalAuthorString'] = source_in_text_tag_text
                 # Sometimes implicitly inserted into a <p> tag
                 else:
@@ Expand Down Expand Up / @@ -222,7 +227,10 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                                 if len(source_in_text_4) < 40:
                                     source_text = source_in_text_4.strip()
                         if source_text in SOURCE_LIST:  # Above code allows minimal mistakes - invalid sources are filtered
-                            data['sch:source'] = [source_text]
+                            data['sch:source'] = author_source_norm(source_text)
+                            if len(data['sch:source'])> 1:
+                                data['originalAuthorString'] = source_text
+                            data['AuthorString_extracted_from_content'] = source_text
                     else:
                         tei_logger.log('DEBUG', f'{url}: SOURCE TAG NOT FOUND!')
@@ Expand Down @@

configs/hvg/hvg_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -35,6 +35,24 @@ @@
               'MTI/Népszava', 'MTI/dpa/Hszinhua', 'OTS/MTI', 'BBC/MTI', 'MTI/Bors', 'MTI/Reuters/AP', 'MTI/AP',
               'MTI/AFP/Reuters', 'MTI/Reuters/Hszinhua', 'MTI/Blikk', 'HVG/MTI']
+    SOURCE_NORM = {'EFE', 'HavariaPress', 'dpa', 'businesstraveller', 'Kisalföld', 'D.P.', 'dehir.hu', 'f1-live.hu',
+                   'MT Zrt.', 'MTI', 'Bankmonitor.hu', 'manna.ro', 'pecsma.hu', 'foodnetwork', 'InfoRádió', 'VG',
+                   'TV2', 'delmagyar.hu', 'Népszava', 'OTS', 'transindex.ro', 'portfolio.hu', 'honvedelem.hu',
+                   'HVG Extra Business', 'CNN', 'napi.hu', 'MTI-OS', 'Index', 'met.hu', 'Utinform.hu',
+                   'nyugat.hu', 'BBC', 'kemma.hu', 'turizmus.com', 'Jobline.hu', 'AP', 'muosz.hu',
+                   'élelmiszer online', 'MNO', 'baon.hu', 'teol.hu', 'ITAR-TASZSZ', 'Blikk', 'hirado.hu',
+                   'HVG Extra Pszichológia', 'indohaz.hu', 'Bors', 'Számlázz.hu', 'Napi.hu', 'bankmonitor.hu',
+                   'Hszinhua', 'MTI ', 'HVG Konferencia', 'DW', 'Inforádió', 'Zgut Edit', 'Dow Jones', 'Origo',
+                   'Eduline', 'OS', 'Világgazdaság', 'MR1-Kossuth Rádió', 'szoljon.hu', 'hvg.hu', 'Észak-Magyarország',
+                   'VinceBudapest', 'vendeglatasmagazin.hu', 'termekmix.hu', 'AFP', 'nso.hu', 'termekmix.com', 'benke',
+                   'f1-live', 'BiztosDöntés.hu', 'ingatlanmenedzser.hu', 'kisalföld.hu', 'atlatszo.blog.hu', 'Travellina',
+                   'merites.hu', 'Euronews', 'Marabu', 'sonline.hu', ' hvg.hu', 'EUrologus', 'Tények', 'Reuters',
+                   'Magyar Nemzet', 'DPA', 'MTA', '- esel -', 'eduline.hu', 'MLF', 'HVG', 'Adozona.hu', 'mult-kor.hu',
+                   'REUTERS', 'I.N.', 'Népszabadság', 'police.hu', 'Bank360.hu'}
+    def author_source_norm(extracted_meta):
+        # Split the raw author/source string on ',', '-', '/', '–', ';' or ' és ' (Hungarian "and"),
+        # strip every piece and drop the pieces that are empty after stripping. Returns a list of strings.
+        # NOTE(review): the bare '-' alternative also cuts hyphenated names, so SOURCE_NORM entries such as
+        # 'MTI-OS' or 'ITAR-TASZSZ' can never come out of this function in one piece — confirm this is intended.
+        return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0]
     def get_meta_from_articles_spec(tei_logger, url, bs):
         data = tei_defaultdict()
@@ Expand All / @@ -61,12 +79,20 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                 tei_logger.log('WARNING', f'{url}: TITLE TAG NOT FOUND!')
             author_or_source_tag = article_root.find('div', class_='author-name')
             if author_or_source_tag is not None:
-                author_or_source = author_or_source_tag.text.strip().\
-                    replace('\r', '').replace('\n', '').replace('\t', '').replace('Követés', '')
-                if author_or_source in SOURCE:
-                    data['sch:source'] = [author_or_source]
-                else:
-                    data['sch:author'] = [author_or_source]
+                author_or_source_raw = author_or_source_tag.text.strip().\
+                    replace('\r', '').replace('\n', '').replace('\t', '')
+                author_or_source = author_or_source_raw.replace('Követés', '')
+                authors_list = author_source_norm(author_or_source)
+                if author_or_source != author_or_source_raw or len(authors_list) > 1:
+                    data['originalAuthorString'] = author_or_source_raw
+                authors_l, sources_l = [], []
+                [sources_l.append(creator) if creator in SOURCE_NORM else authors_l.append(creator) for creator in
+                 authors_list]
+                if len(authors_l) > 0:
+                    data['sch:author'] = authors_l
+                if len(sources_l) > 0:
+                    data['sch:source'] = sources_l
             else:
                 tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
             keywords_root = article_root.find('div', class_='article-tags')
@@ Expand Down Expand Up / @@ -99,6 +125,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                     if beg != -1 and end != -1:
                         author = auth_string[beg+1:end+1]
                         data['sch:author'] = [author]
+                        data['originalAuthorString'] = auth_string
                     else:
                         tei_logger.log('DEBUG', f'{url} NO AUTHOR FOUND!')
@@ Expand Down @@

configs/kurucinfo/kurucinfo_specific.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -16,7 +16,7 @@
  
    def get_meta_from_articles_spec(tei_logger, url, bs):

        data = tei_defaultdict()

        data['sch:url'] = url

        # MISSING FROM PORTAL: data['sch:dateModified']

        # ARTICLE SECTION

    @@ -29,7 +29,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
  
                tei_logger.log('WARNING', f'{url}: ARTICLE SECTION TAG PARSE ERROR!')

        else:

            tei_logger.log('WARNING', f'{url}: ARTICLE SECTION NOT FOUND!')

        article_root = bs.find('div', {'class': 'tblot'})

        if article_root is not None:

    @@ -39,9 +39,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
  
                title_text = title_tag.get_text(strip=True)

                if title_text is not None:

                    data['sch:name'] = title_text

                else: 

                else:

                    tei_logger.log('WARNING', f'{url}: TITLE TAG EMPTY!')

            else: 

            else:

                tei_logger.log('WARNING', f'{url}: TITLE NOT FOUND IN URL!')

            # AUTHOR / https://kuruc.info/r/6/150707/

    @@ -55,6 +55,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
  
                    if 1 < len(split_t) <= 3 and ('(' or ')') not in tag_text and all([w[0].isupper() for w in split_t]):

                        data['sch:author'] = [tag_text]

                        data['originalAuthorString'] = [possible_author_tag.get_text(strip=True)]

                        data['AuthorString_extracted_from_content'] = possible_author_tag.text

                    else:

                        tei_logger.log('DEBUG', f'{url}: AUTHOR TAG EMPTY!')

                else:

    @@ -63,25 +64,25 @@ def get_meta_from_articles_spec(tei_logger, url, bs):
  
            # KEYWORDS and DATE PUBLISHED

            meta_header = article_root.find('p', {'class': 'cikkdatum'})

            if meta_header is not None:

                # keywords

                a_tags = meta_header.find_all('a', href=re.compile('/t/[0-9]'))

                if len(a_tags) > 0:

                    data['sch:keywords'] = [t.get_text(strip=True) for t in a_tags if len(t.get_text(strip=True)) > 0]

                else:

                    tei_logger.log('INFO', f'{url}: KEYWORDS NOT FOUND!')

                # datePublished

                date_published_tag = meta_header.find('span', {'itemprop': "datePublished"})

                if date_published_tag is not None:

                    date_published_raw = date_published_tag.get_text(strip=True)

                    if date_published_raw is not None:

                        data['sch:datePublished'] = parse_date(date_published_raw, "%Y. %B %d. %H:%M")

                    else: 

                    else:

                        tei_logger.log('WARNING', f'{url}: DATE FORMAT ERROR!')

                else: 

                else:

                    tei_logger.log('WARNING', f'{url}: DATE TAG NOT FOUND!')

            else:

                tei_logger.log('WARNING', f'{url}: META HEADER [datePublished, keywords] NOT FOUND!')

configs/magyaridok/magyaridok_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -48,9 +48,12 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                 data['sch:author'] = [author.text.strip()]
             elif source is not None:
                 # In case if not an author, only source (MTI)
-                data['sch:source'] = source.text.strip()
+                source_text = source.text.strip()
+                data['sch:source'] = [s.strip() for s in source_text.split(',')]
+                if len(data['sch:source']) > 1:
+                    data['originalAuthorString'] = [source_text]
             else:
-                tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
+                tei_logger.log('DEBUG', f'{url}: AUTHOR TAG NOT FOUND!')
             article_tags = []
             section_line = article_root.find('span', class_='en-article-header-column')
             if section_line is not None:
@@ Expand Down @@

configs/magyarnarancs/magyarnarancs_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -22,6 +22,15 @@ @@
               'narancs.hu/Republikon', 'MTI/narancs', 'narancs hu.', 'MTI/Világgazdaság/narancs.hu', 'narancs.hu - B. T.',
               'transindex.ro', 'MTI-OS', 'nrancs.hu']
+    SOURCE_SOLO = {'narancs.hu', 'szegeder.hu', 'MTA', 'narancs', 'HVG', 'narancs hu', 'Markó Anita', 'narancs hu.', 'Guardian', 'Telex', 'Narancs.hu', 'Fizetett tartalom', 'MTI', 'narancsfül', 'narancs.', 'transindex.ro', 'Amnesty', 'media1.hu', 'Narancsfül', 'narancsblog', 'Reuters', 'narancs. hu', 'Narancs', 'narancsszem', 'M', 'Republikon', 'nrancs.hu', 'narancs.hu-összeállítás', 'TASZ', 'narancs.hu - B. T.', 'Magyar Narancs', 'narancs.hu-MTI', 'Világgazdaság', 'Police.hu', 'OS', 'Szabad ország', 'MTI-OS', 'narancs.h', 'Narancs-összeállítás', 'MT'}
+    def author_source_norm(extracted_meta):
+        # Normalise author/source metadata given either as one string or as a list of strings.
+        # Each string is split on ',', '-', '/', '–', ';' or ' és ' (Hungarian "and"); the pieces are
+        # stripped and the empty ones dropped. Returns one flat list of strings in both cases.
+        # NOTE(review): the bare '-' alternative also cuts hyphenated names, so SOURCE_SOLO entries such as
+        # 'MTI-OS' or 'narancs.hu-MTI' can never come out of this function in one piece — confirm this is intended.
+        ret_list = []
+        if isinstance(extracted_meta, list):
+            # List input: split every element separately and concatenate the results in order.
+            for meta in extracted_meta:
+                ret_list.extend([m.strip() for m in re.split(',|-|/|–|;| és ', meta) if len(m.strip()) > 0])
+            return ret_list
+        # Any other input is passed to re.split directly (expected to be a single string).
+        return [m.strip() for m in re.split(',|-|/|–|;| és ', extracted_meta) if len(m.strip())>0]
     def get_meta_from_articles_spec(tei_logger, url, bs):
         data = tei_defaultdict()
@@ Expand Down Expand Up / @@ -70,20 +79,31 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
             subtitle = bs.find('h3', class_='card-subtitle')
             if subtitle is not None:
                 data['sch:alternateName'] = subtitle.text.strip()
-            author_or_source = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
+            author_or_source_o = [t.text.strip() for t in meta_root.find_all('span', class_='author-name')]
+            author_or_source = author_source_norm(author_or_source_o)
+            if len(author_or_source) > len(author_or_source_o):
+                data['originalAuthorString'] = author_or_source_o
             author_list, source_list = [], []
-            [source_list.append(creator) if creator in SOURCE else author_list.append(creator) for creator in
+            [source_list.append(creator) if creator in SOURCE_SOLO else author_list.append(creator) for creator in
              author_or_source]
             if len(author_list) > 0 or len(source_list) > 0:
                 author_list_corr = []
                 if len(author_list) > 0:
                     for auth in author_list:
                         if ',' in auth:
-                            author_list_corr.extend(one_author.strip() for one_author in auth.split('\''))
+                            if '(' not in auth:
+                                author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
+                                data['originalAuthorString'] = [auth]
+                            else:
+                                # Szlankó Bálint (Argandab-folyóvölgy, Kandahár)
+                                data['originalAuthorString'] = [auth]
+                                auth = auth[0:auth.index('(')]
+                                author_list_corr.extend(one_author.strip() for one_author in auth.split(','))
                         else:
                             author_list_corr.append(auth)
                     data['sch:author'] = author_list_corr
                 if len(source_list) > 0:
                     data['sch:source'] = source_list
             else:
                 tei_logger.log('DEBUG', f'{url}: AUTHOR / SOURCE TAG NOT FOUND!')
@@ Expand All / @@ -102,10 +122,6 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
     def excluded_tags_spec(tag):
         if tag.name == 'li' and 'data-leiras' in tag.attrs and tag['data-leiras'] == ' \r\n':
             tag['data-leiras'] = tag['data-leiras'].strip()
-        # if tag.name not in HTML_BASICS:
-        #     tag.name = 'else'
-        # tag.attrs = {}
         return tag
@@ Expand Down @@

configs/merce/merce_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -54,7 +54,9 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                 data['sch:author'] = authors
                 if len(authors) > 1:
                     print('Több szerző', url)
-        # TODO: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
+        else:
+            # TODO 1 <a href="https://avm.merce.hu/author/evatessza/" title="Udvarhelyi Tessza cikkei" class="author url fn track-act-up" rel="author">Udvarhelyi Tessza</a>
+        # TODO 2: mérce vendégszerző, név a cikk alján: https://merce.hu/2017/09/01/megmentheti-e_emmanuel_macron_a_kelet-europaiakat_a_kizsakmanyolastol/
         # else: tei_logger.log('WARNING', f'{url}: AUTHOR TAG NOT FOUND!')
         # <div class="featured-tag">
         """is_section = bs.find('div', {'class': 'featured-tag'})
@@ Expand Down @@

configs/mno/mno_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -48,7 +48,7 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
             if author is not None:
                 data['sch:author'] = [author.text.strip()]
             elif source is not None:
-                data['sch:source'] = source.text.strip()
+                data['sch:source'] = [source.text.strip()]
             else:
                 tei_logger.log('DEBUG', f'{url}  AUTHOR AND SOURCE TAG NOT FOUND!')
@@ Expand Down @@

configs/p888/p888_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -78,24 +78,34 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                     if note_block_tag is not None:
                         author_or_source = note_block_tag.find('div', class_='text-wrap').get_text(strip=True)
                         if author_or_source is not None:
+                            #print(url, note_block_tag)
                             if author_or_source in SOURCE or author_or_source in SOURCE_SECONDARY:
                                 data["sch:source"] = [author_or_source]
+                                data['originalAuthorString'] = [author_or_source]
                             else:
                                 # split by: ANY OF THESE ',-–' CHARACTERS FOLLOWED BY WHITESPACE '\s' AND NOT 'a ', 'az ',
                                 # 'A ' or 'Az '
                                 # regex solution may be over complicated
                                 split_list = re.split("[,\-\–]\s(?!a\s|az\s|A\s|Az\s)", author_or_source)
                                 if len(split_list) > 0 and split_list[0] != '':
                                     source_list, author_list = [], []
-                                    for author in split_list:
-                                        if author in SOURCE or author in SOURCE_SECONDARY:
-                                            source_list.append(author.strip())
-                                        else:
-                                            author_list.append(author.strip())
+                                    for author_ in split_list:   # TODO: ez egy rovat tkp.: Olvasói vélemény
+                                        # Olvasói vélemény | Szerző: Ádám Attila
+                                        authors = re.split('/|\||-|;|–', author_)
+                                        for author in authors:
+                                            if author in SOURCE or author in SOURCE_SECONDARY:
+                                                source_list.append(author.strip())
+                                            else:
+                                                author = author.replace('Szerző:', '').replace('szerző:', '')
+                                                author_list.append(author.strip())
                                     if len(author_list) > 0:
                                         data['sch:author'] = author_list
                                     if len(source_list) > 0:
                                         data['sch:source'] = source_list
+                                    if len(author_list+source_list) > 1:    # TODO: van más feltétel?
+                                        data['originalAuthorString'] = [author_or_source]
+                            # data['AuthorString_extracted_from_content'] = author_or_source
                         else:
                             tei_logger.log('DEBUG', f'{url}: AUTHOR TAG TEXT EMPTY!')
@@ Expand Down @@

configs/vadhajtasok/vadhajtasok_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -83,7 +83,14 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
                 if len(source_raw) > 0 and len(source_raw.split()) < 6 and \
                         (source_raw.startswith('Forrás:') or source_raw.startswith('Írta:') or
                          source_raw in SOURCE_2):
-                    data['originalAuthorString'] = [source_raw]
+                    if source_raw.startswith('Írta:'):
+                        data['sch:author'] = source_raw.replace('Írta:', '').strip().split(',')
+                    elif source_raw.startswith('Forrás:') or source_raw in SOURCE_2:
+                        data['sch:source'] = [m.strip() for m in re.split(',|-|/|–| és |;', source_raw.replace('Forrás:', '').strip()) if 'Fotó:' not in m and len(m.strip())>0]
+                        if len(data['sch:source'])> 1:
+                            data['originalAuthorString'] = source_raw
+                    data['AuthorString_extracted_from_content'] = source_raw
             else:
                 tei_logger.log('DEBUG', f'{url}: SOURCE NOT FOUND!')
             return data
@@ Expand Down @@

configs/valasz/valasz_specific.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -50,18 +50,21 @@ def get_meta_from_articles_spec(tei_logger, url, bs): @@
             #  <span class="forras">Hírforrás: Válasz.hu</span>
             author_tags = article_root.find_all('a', rel='author')
             if len(author_tags) > 0:
-                if any(len(elem.text.strip()) == 0 for elem in author_tags):
-                    source = article_root.find('span', class_="forras")
-                    if source is not None:
-                        data['sch:source'] = source.text.strip()
-                else:
-                    data['sch:author'] = [a.text for a in author_tags]
-            else:
+                data['sch:author'] = [a.text.strip() for a in author_tags]
+                #if any(len(elem.text.strip()) == 0 for elem in author_tags):
+            source = article_root.find('span', class_="forras")
+            if source is not None:
+                # Hírforrás
+                sources = [m.strip() for m in re.split(',|/| - |;| és ', source.text.strip().replace('Hírforrás: ', '')) if len(m.strip()) > 0]
+                if len(sources) > 1:
+                    data['originalAuthorString'] = [source.text.strip()]
+                    data['sch:source'] = sources
+            if author_tags is None and source is None:  # The following code fragment is probably redundant.
                 # The source and author fields can co-exist
                 article_source = article_root.find('span', class_='forras')
                 article_author2 = article_root.find('span', class_='szerzo')
                 if article_source is not None:
-                    data['sch:source'] = article_source.text.strip()
+                    data['sch:source'] = [article_source.text.strip()]
                 if article_author2 is not None:
                     data['sch:author'] = [article_author2.text.strip()]
             keyword_root = bs.find('aside', class_='breadcrumb')
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Original author source #89

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Original author source #89

Are you sure you want to change the base?

Uh oh!

Original author source #89

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!