From 2df4929bef5b1efec964c7ff550b36204651566b Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:17:13 -0500 Subject: [PATCH 01/16] Add WIP hf repo exporter and rename files --- .../{main.yml => gh-repo-exporter.yml} | 4 +- .github/workflows/hf-repo-exporter.yml | 37 +++ export_repos.py => gh_repo_exporter.py | 11 +- hf_repo_exporter.py | 266 ++++++++++++++++++ requirements.txt | 28 +- tests/test_has_doi_for_repo.py | 2 +- 6 files changed, 314 insertions(+), 34 deletions(-) rename .github/workflows/{main.yml => gh-repo-exporter.yml} (93%) create mode 100644 .github/workflows/hf-repo-exporter.yml rename export_repos.py => gh_repo_exporter.py (99%) create mode 100644 hf_repo_exporter.py diff --git a/.github/workflows/main.yml b/.github/workflows/gh-repo-exporter.yml similarity index 93% rename from .github/workflows/main.yml rename to .github/workflows/gh-repo-exporter.yml index 2577068..aa09f54 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/gh-repo-exporter.yml @@ -8,7 +8,7 @@ on: # │ │ │ ┌────── restricted to month (1-12) # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) # │ │ │ │ │ * means doesn't restrict anything - - cron: "0 9 * * 1" # Runs once every Monday at 9 AM + - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC workflow_dispatch: inputs: repo_type: @@ -44,4 +44,4 @@ jobs: GH_TOKEN: ${{ secrets.GH_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json REPO_TYPE: ${{ github.event.inputs.repo_type || 'all' }} - run: python export_repos.py + run: python export_repos.py \ No newline at end of file diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml new file mode 100644 index 0000000..a878c5c --- /dev/null +++ b/.github/workflows/hf-repo-exporter.yml @@ -0,0 +1,37 @@ +name: Update Metadata for GitHub Repository Sheet + +on: + schedule: + # ┌──────────── restricted to minute (0-59) + # │ ┌────────── restricted to hour (0-23) + # │ │ ┌──────── restricted to day of month (1-31) + # │ │ │ ┌────── restricted to month (1-12) + # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) + # │ │ │ │ │ * means doesn't restrict anything + - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC + workflow_dispatch: + +jobs: + update-sheet: + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Write Google credentials + run: printf "%s" '${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}' > service_account.json + + - name: Run script + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + GOOGLE_CREDENTIALS_PATH: service_account.json + run: python export_repos.py \ No newline at end of file diff --git a/export_repos.py b/gh_repo_exporter.py similarity index 99% rename from export_repos.py rename to gh_repo_exporter.py index 87333ff..b369b45 100644 --- a/export_repos.py +++ b/gh_repo_exporter.py @@ -1,13 +1,14 @@ -import os -import pandas as pd from github import Github, GithubException, Auth +import pandas as pd from tqdm import tqdm -from datetime import datetime, timedelta, timezone +from google.oauth2.service_account import Credentials +import gspread import yaml + +from datetime import datetime, timedelta, timezone import time +import os import re -import gspread -from google.oauth2.service_account import Credentials # Config ORG_NAME = "Imageomics" diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py new 
file mode 100644 index 0000000..6436d1b --- /dev/null +++ b/hf_repo_exporter.py @@ -0,0 +1,266 @@ +from huggingface_hub import HfApi +import pandas as pd +import tqdm +from google.oauth2.service_account import Credentials +import gspread + +from datetime import datetime, timedelta, timezone +import time +import os +import re + +# Config +ORG_NAME = "imageomics" +SPREADSHEET_ID = "1NOVB9IfBvkAh4YDbozhi5q0iwBfyp3enD6UxmO6wHIA" +SHEET_NAME = "Sheet1" + +# Helper Functions +def get_repo_url(repo) -> str: + if repo._hf_repo_type == "dataset": + return f"https://huggingface.co/datasets/{repo.id}" + elif repo._hf_repo_type == "space": + return f"https://huggingface.co/spaces/{repo.id}" + else: # model + return f"https://huggingface.co/{repo.id}" + +def is_inactive(repo): + try: + last_modified = getattr(repo, "lastModified", None) + if not last_modified: + return "N/A" + + # Parse ISO 8601 string + updated = datetime.fromisoformat(last_modified.replace("Z", "")) + if updated.tzinfo is None: + updated = updated.replace(tzinfo=timezone.utc) + + one_year_ago = datetime.now(timezone.utc) - timedelta(days=365) + return "Yes" if updated < one_year_ago else "No" + except Exception: + return "N/A" + +def get_model_card_field(repo, key: str) -> str: + try: + return repo.cardData.get(key, "") + except Exception: + return "N/A" + +def get_associated_assets(repo) -> str: + try: + related = [tag for tag in repo.tags if tag.startswith(("dataset:", "model:", "space:"))] + return ", ".join(related) + except Exception: + return "N/A" + +def get_repo_info(repo) -> dict[str, str | int]: + return { + "Repository Name": f'=HYPERLINK("{get_repo_url(repo)}", "{repo.id}")', + "Repository Type": repo._hf_repo_type, + "Description": getattr(repo, "description", "N/A"), + "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", + "Last Updated": datetime.fromisoformat(repo.lastModified.replace("Z", "")).strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", + "Created By": repo.author, + "Top 4 Contributors/Curators": ..., + "Likes": getattr(repo, "likes", "N/A"), + "# of Open PRs": ..., + "README": "Yes" if getattr(repo, "cardData", False) else "No", + "License": "Yes" if getattr(repo, "license", False) else "No", + "Visibility": "Private" if getattr(repo, "private", False) else "Public", + "Inactive": is_inactive(repo), + "Homepage": get_model_card_field(repo, "homepage"), + "Repo": f'=HYPERLINK("{get_model_card_field(repo, "github_repo")}", "{repo.id}")', + "Paper": f'=HYPERLINK("{get_model_card_field(repo, "paper")}", "Paper")', + "Associated data, models, or spaces": get_associated_assets(repo), + "DOI": get_model_card_field(repo, "doi"), + } + +def extract_display_name(val: str) -> str: + match = re.search(r'"([^"]+)"\)$', val) # regex to extract the repo-name from "=HYPERLINK(..., "repo-name")" + return match.group(1) if match else val + +def update_google_sheet(df: pd.DataFrame) -> None: + # Authenticate Google API + creds_path = os.getenv("GOOGLE_CREDENTIALS_PATH", "service_account.json") + + creds = Credentials.from_service_account_file( + creds_path, + scopes=[ + "https://www.googleapis.com/auth/spreadsheets", + "https://www.googleapis.com/auth/drive" + ] + ) + + client = gspread.authorize(creds) + sheet = client.open_by_key(SPREADSHEET_ID).worksheet(SHEET_NAME) + + # Pull current header + HEADER_ROW_INDEX = 2 + header = sheet.row_values(HEADER_ROW_INDEX) + + # Find + try: + repo_col_index = header.index("Repository Name") + except ValueError: + 
raise ValueError('Sheet is missing "Repository Name" column') + + # Build a dict of repo name -> index + existing = sheet.get_all_values() + data_rows = existing[HEADER_ROW_INDEX:] + name_to_row = {} + for offset, row in enumerate(data_rows, start=HEADER_ROW_INDEX + 1): + if len(row) <= repo_col_index: # if row of data fetched is missing repo name column, ignore the row + continue + + sheet_repo_name = extract_display_name(row[repo_col_index]) # hardcoded to check for "Repository Name" column in row 0 + name_to_row[sheet_repo_name] = offset + + batch_body = [] + for _, row in df.iterrows(): + repo_name = extract_display_name(row["Repository Name"]) + + # Determine row index + if repo_name in name_to_row: + row_idx = name_to_row[repo_name] + else: + row_idx = len(existing) + 1 + existing.append([""] * len(header)) + + # Create (range, value) for each column individually + for col_idx, col_name in enumerate(header, start=1): + if col_name not in df.columns: + continue # skip untouched columns + + value = row.get(col_name, "") + cell = gspread.utils.rowcol_to_a1(row_idx, col_idx) + + batch_body.append({ + "range": cell, + "majorDimension": "ROWS", + "values": [[value]] # single cell update + }) + + sheet.spreadsheet.values_batch_update( + body={ + "value_input_option": "USER_ENTERED", + "data": batch_body + } + ) + + def get_column_index(col_name: str): + try: + return header.index(col_name) + except ValueError: + return None # column not found + + red_columns = { + "README", + "License", + ".gitignore", + "Package Requirements", + "CITATION" + } + + orange_columns = { + ".zenodo.json", + "CONTRIBUTING", + "AGENTS", + "Website Reference", + "Dataset", + "Model", + "Paper Association", + "DOI for GitHub Repo" + } + + rules = [] + + # Only loop over columns that need formatting + for col_set, color in [(red_columns, {"red": 1, "green": 0.5, "blue": 0.5}), + (orange_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: + + for col_name in col_set: + col_index = get_column_index(col_name) + if col_index is None: + continue # skip missing columns + + rules.append({ + "addConditionalFormatRule": { + "rule": { + "ranges": [{ + "sheetId": sheet.id, + "startRowIndex": HEADER_ROW_INDEX, # start after header + "endRowIndex": HEADER_ROW_INDEX + len(df), # only data rows + "startColumnIndex": col_index, + "endColumnIndex": col_index + 1 + }], + "booleanRule": { + "condition": { + "type": "TEXT_EQ", + "values": [{"userEnteredValue": "No"}] + }, + "format": { + "backgroundColor": color + } + } + }, + "index": 0 + } + }) + + sheet.spreadsheet.batch_update({"requests": rules}) + +# ------- + +def main(): + + TOKEN = os.getenv("HF_TOKEN") or input("Enter your Hugging Face token: ").strip() + + start_time = time.time() + + api = HfApi(token=TOKEN) + + try: + models = list(api.list_models(author=ORG_NAME, full=True)) + except Exception as e: + print(f'ERROR: Could not fetch models for "{ORG_NAME}"') + print(e) + return + + print("") + print(f"Fetching Hugging Face repositories for: {ORG_NAME}") + print("") + print("----------------") + + data = [] + + tqdm_kwargs = {} + if os.environ.get("CI") == "true": + tqdm_kwargs = {"mininterval": 1, "dynamic_ncols": False, "leave": False} + + for model in tqdm(models, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): + try: + info = get_repo_info(model) + data.append(info) + tqdm.write(f"Fetched info for /{model.id}") + except Exception as e: + tqdm.write(f"ERROR: Cannot fetch /{model.id} info, due to {type(e).__name__}: {e}. 
Skipping...") + + if not data: + print("ERROR: No data collected") + return + + print("----------------") + print("") + + df = pd.DataFrame(data) + df.sort_values(by="Repository Name", inplace=True) + + update_google_sheet(df) + print(f"Finished fetching info for {len(df)} repositories from {ORG_NAME} organization") + + elapsed = time.time() - start_time + minutes, seconds = divmod(int(elapsed), 60) + + print(f"Total time taken: {minutes}m {seconds}s") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 01b8649..2160fd1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,8 @@ -cachetools==6.2.1 -certifi==2025.10.5 -cffi==2.0.0 -charset-normalizer==3.4.4 -colorama==0.4.6 -cryptography==46.0.3 -et_xmlfile==2.0.0 google-auth==2.41.1 -google-auth-oauthlib==1.2.3 gspread==6.2.1 -idna==3.11 -numpy==2.3.4 -oauthlib==3.3.1 pandas==2.3.3 -pyasn1==0.6.1 -pyasn1_modules==0.4.2 -pycparser==2.23 PyGithub==2.8.1 -PyJWT==2.10.1 -PyNaCl==1.6.0 -python-dateutil==2.9.0.post0 -pytz==2025.2 -requests==2.32.5 -requests-oauthlib==2.0.0 -rsa==4.9.1 -six==1.17.0 tqdm==4.67.1 -typing_extensions==4.15.0 -tzdata==2025.2 -urllib3==2.6.0 PyYAML==6.0.3 -pytest==9.0.1 \ No newline at end of file +pytest==9.0.1 +huggingface_hub==1.2.3 \ No newline at end of file diff --git a/tests/test_has_doi_for_repo.py b/tests/test_has_doi_for_repo.py index 5218c27..2904afd 100644 --- a/tests/test_has_doi_for_repo.py +++ b/tests/test_has_doi_for_repo.py @@ -1,5 +1,5 @@ import pytest -from export_repos import has_doi +from gh_repo_exporter import has_doi # --- Fake GitHub objects --- From 5bd8cf3e32e4b9a1b36eedeb987a5441c7d9802a Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:23:06 -0500 Subject: [PATCH 02/16] move workflow to top-level to fix manual --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index a878c5c..b81aa72 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -1,6 +1,7 @@ name: Update Metadata for GitHub Repository Sheet on: + workflow_dispatch: schedule: # ┌──────────── restricted to minute (0-59) # │ ┌────────── restricted to hour (0-23) @@ -9,7 +10,6 @@ on: # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) # │ │ │ │ │ * means doesn't restrict anything - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC - workflow_dispatch: jobs: update-sheet: From ec8adb5ab0029715032c486876680add62fed814 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:28:21 -0500 Subject: [PATCH 03/16] Update name for hugging face workflow --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index b81aa72..155216f 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -1,4 +1,4 @@ -name: Update Metadata for GitHub Repository Sheet +name: Update Metadata for Hugging Face Repository Sheet on: workflow_dispatch: From 6ccdb4250cf3b97c04fca02bfaeb2b537f6b5a57 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Wed, 17 Dec 2025 11:44:01 -0500 Subject: [PATCH 04/16] Update coloring --- hf_repo_exporter.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 
6436d1b..ac261a7 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -155,20 +155,16 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - ".gitignore", - "Package Requirements", - "CITATION" + "Visibility", + "Inactive", + "Homepage", + "Repo", + "Paper", + "Associated data, models, or spaces", } orange_columns = { - ".zenodo.json", - "CONTRIBUTING", - "AGENTS", - "Website Reference", - "Dataset", - "Model", - "Paper Association", - "DOI for GitHub Repo" + "DOI" } rules = [] From 63a26ef95204c1706e0892c79cd3673de9213de5 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Wed, 17 Dec 2025 11:58:12 -0500 Subject: [PATCH 05/16] Add instructions to create HF token for gh secrets --- README.md | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 23124cb..8e1630c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ A Python script that gathers metadata for all repositories in a GitHub organizat - [Features](#features) - [Usage](#usage) - [Set up your own GitHub Actions workflow](#set-up-your-own-github-actions-workflow) - - [Create a GitHub Personal Access Token](#create-a-github-personal-access-token) + - [Create a GitHub Personal Access Token](#create-a-github-personal-access-token) + - [Create a Hugging Face Token](#create-a-hugging-face-token) - [Set up Google Cloud Service Account Access](#set-up-google-cloud-service-account-access) - [Run repo exporter locally](#run-repo-exporter-locally) - [Important Notes](#important-notes) @@ -42,17 +43,28 @@ To use this script within your own GitHub organization, first fork this repo, th To create one with permissions for both private and public repositories (public repository read-access only is enabled by default without administrator approval): - 1. Go to [github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) - 2. Click **Generate new token → Fine-grained token** - 3. Under **Resource owner**, select the **organization** you want to access. - 4. Under **Repository access**, choose **All repositories**. - 5. Under **Permissions** select **Repositories** and set: + 1. Go to [github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) + 2. Click **Generate new token -> Fine-grained token** + 3. Under **Resource owner**, select the **organization** you want to access. + 4. Under **Repository access**, choose **All repositories**. + 5. Under **Permissions** select **Repositories** and set: - **Metadata** -> Read-only - **Contents** -> Read-only - **Administration** -> Read-only - 6. Click **Generate token** and **copy it** (make sure to store it somewhere safe for future use). - 7. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **GH_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** - **Note:** The token must be approved by the organization administrator before accessing private repositories. + 6. Click **Generate token** and **copy it** (make sure to store it somewhere safe for future use). + 7. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **GH_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** + **Note:** The token must be approved by the organization administrator before accessing private repositories. 
+ +### Create a Hugging Face Token + + To create one with permissions for both private and public repositories: + + 1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) + 2. Click on **New Token** and name it **repo-exporter** + 3. For permissions select: + - **Read** + 4. Click **Generate** and **copy it** (make sure to store it somewhere safe for future use) + 5. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **HF_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** ### Set up Google Cloud Service Account Access From 6f3128d1bd04fb9565f8c46525094b6255acbb98 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:59:04 -0500 Subject: [PATCH 06/16] Update .github/workflows/gh-repo-exporter.yml Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- .github/workflows/gh-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-repo-exporter.yml b/.github/workflows/gh-repo-exporter.yml index aa09f54..38ebf9c 100644 --- a/.github/workflows/gh-repo-exporter.yml +++ b/.github/workflows/gh-repo-exporter.yml @@ -44,4 +44,4 @@ jobs: GH_TOKEN: ${{ secrets.GH_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json REPO_TYPE: ${{ github.event.inputs.repo_type || 'all' }} - run: python export_repos.py \ No newline at end of file + run: python gh_repo_exporter.py From 927fe7bf23ec27c538f578705f4985f5f234c9e0 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:59:13 -0500 Subject: [PATCH 07/16] Update .github/workflows/hf-repo-exporter.yml Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index 155216f..0ae0999 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -34,4 +34,4 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json - run: python export_repos.py \ No newline at end of file + run: python hf_repo_exporter.py From 031b57aeee6b74c9b03ac8d2810eda9d6b0da905 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:02:12 -0500 Subject: [PATCH 08/16] Update README.md Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e1630c..af9731b 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,9 @@ To use this script within your own GitHub organization, first fork this repo, th 1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) 2. Click on **New Token** and name it **repo-exporter** - 3. For permissions select: - - **Read** + 3. For permissions select **Fine-grained**: + - Specify the desired organization (under **Org permissions**) + - Under **Repositories**, select "Read access to contents of all repos in selected organizations" 4. Click **Generate** and **copy it** (make sure to store it somewhere safe for future use) 5. 
Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **HF_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** From 8541a3bf6a4cc80966b823ec040495d82eb213a1 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 13:05:40 -0500 Subject: [PATCH 09/16] WIP get_repo() and get_paper() funcs --- hf_repo_exporter.py | 234 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 203 insertions(+), 31 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index ac261a7..3d82b61 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -1,6 +1,6 @@ -from huggingface_hub import HfApi +from huggingface_hub import HfApi, hf_hub_download import pandas as pd -import tqdm +from tqdm import tqdm from google.oauth2.service_account import Credentials import gspread @@ -8,6 +8,7 @@ import time import os import re +import yaml # Config ORG_NAME = "imageomics" @@ -15,33 +16,154 @@ SHEET_NAME = "Sheet1" # Helper Functions -def get_repo_url(repo) -> str: - if repo._hf_repo_type == "dataset": +def get_repo_url(repo, repo_type: str) -> str: + if repo_type == "dataset": return f"https://huggingface.co/datasets/{repo.id}" - elif repo._hf_repo_type == "space": + elif repo_type == "space": return f"https://huggingface.co/spaces/{repo.id}" else: # model return f"https://huggingface.co/{repo.id}" - -def is_inactive(repo): + +def get_license(repo) -> str: + # 1. cardData + try: + license_from_card = getattr(repo, "cardData", {}).get("license") + if license_from_card: + return license_from_card + except Exception: + pass + + # 2. repo.license attribute + try: + license_attr = getattr(repo, "license", None) + if license_attr: + return license_attr + except Exception: + pass + + # 3. 
YAML content in README + try: + readme_text = getattr(repo, "readme", None) + if readme_text: + import re, yaml + match = re.search(r'^---\s*(.*?)\s*---', readme_text, re.DOTALL | re.MULTILINE) + if match: + yaml_content = match.group(1) + data = yaml.safe_load(yaml_content) + if isinstance(data, dict): + return data.get("license", "No") + except Exception: + pass + + return "No" + +def is_inactive(repo) -> str: try: last_modified = getattr(repo, "lastModified", None) if not last_modified: return "N/A" - # Parse ISO 8601 string - updated = datetime.fromisoformat(last_modified.replace("Z", "")) - if updated.tzinfo is None: - updated = updated.replace(tzinfo=timezone.utc) + # Ensure last_modified is aware (has tzinfo) + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=timezone.utc) one_year_ago = datetime.now(timezone.utc) - timedelta(days=365) - return "Yes" if updated < one_year_ago else "No" + return "Yes" if last_modified < one_year_ago else "No" except Exception: return "N/A" + +def get_homepage_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict) and card_data.get("homepage"): + return f'=HYPERLINK("{card_data.get("homepage")}", "Homepage")' + except Exception: + pass + + # Check README for homepage URLs + try: + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'Homepage:\s*(https?://[^\s\n)]+)', readme_text, re.IGNORECASE) + if match: + return f'=HYPERLINK("{match.group(1)}", "Homepage")' + except Exception: + pass + + return "No" + +def get_repo_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict): + for key in ("repository", "repo", "github_repo"): + url = card_data.get(key) + if url and url.startswith("http"): + return f'=HYPERLINK("{url}", "Repository")' + except Exception: + pass + + # Check README for github/repo URLs + try: + from huggingface_hub import hf_hub_download + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'(https?://(?:github\.com|gitlab\.com)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) + if match: + url = match.group(1).rstrip('*`[]()]}') + return f'=HYPERLINK("{url}", "Repository")' + except Exception: + pass + + return "No" + +def get_paper_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict) and card_data.get("paper"): + return f'=HYPERLINK("{card_data.get("paper")}", "Paper")' + except Exception: + pass + + # Check README for arxiv/paper URLs + try: + from huggingface_hub import hf_hub_download + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'(https?://(?:arxiv\.org|doi\.org)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) + if match: + url = match.group(1).rstrip('*`[]()]}') + return f'=HYPERLINK("{url}", "Paper")' + except Exception: + pass + + return "No" def get_model_card_field(repo, key: str) -> str: try: - return repo.cardData.get(key, "") + value = repo.cardData.get(key, "") + # Convert 
to string if it's a list or other type + if isinstance(value, list): + return ", ".join(str(v) for v in value) + return str(value) if value else "" except Exception: return "N/A" @@ -51,29 +173,68 @@ def get_associated_assets(repo) -> str: return ", ".join(related) except Exception: return "N/A" + +def clean_description(desc: str) -> str: + if not desc: + return "N/A" + + # remove HTML tags + desc = re.sub(r"<[^>]+>", "", desc) + + # collapse multiple newlines/tabs/spaces into a single space + desc = re.sub(r"\s+", " ", desc) + + return desc.strip() + +def get_doi(repo) -> str: + try: + for tag in repo.tags: + if tag.startswith("doi:"): + return tag.replace("doi:", "") + except Exception: + pass + + return "No" + +def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: + if repo_type == "dataset": + display_id = f"datasets/{repo.id}" + elif repo_type == "space": + display_id = f"spaces/{repo.id}" + else: + display_id = repo.id -def get_repo_info(repo) -> dict[str, str | int]: return { - "Repository Name": f'=HYPERLINK("{get_repo_url(repo)}", "{repo.id}")', - "Repository Type": repo._hf_repo_type, - "Description": getattr(repo, "description", "N/A"), + "Repository Name": f'=HYPERLINK("{get_repo_url(repo, repo_type)}", "{display_id}")', + "Repository Type": repo_type, + "Description": clean_description(getattr(repo, "description", "")), "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", - "Last Updated": datetime.fromisoformat(repo.lastModified.replace("Z", "")).strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", + "Last Updated": repo.lastModified.strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", "Created By": repo.author, - "Top 4 Contributors/Curators": ..., + "Top 4 Contributors/Curators": "test", "Likes": getattr(repo, "likes", "N/A"), - "# of Open PRs": ..., + "# of Open PRs": "test", "README": "Yes" if getattr(repo, "cardData", False) else "No", - "License": "Yes" if getattr(repo, "license", False) else "No", + "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", "Inactive": is_inactive(repo), - "Homepage": get_model_card_field(repo, "homepage"), - "Repo": f'=HYPERLINK("{get_model_card_field(repo, "github_repo")}", "{repo.id}")', - "Paper": f'=HYPERLINK("{get_model_card_field(repo, "paper")}", "Paper")', + "Homepage": get_homepage_link(repo, repo_type), + "Repo": get_repo_link(repo, repo_type), + "Paper": get_paper_link(repo, repo_type), "Associated data, models, or spaces": get_associated_assets(repo), - "DOI": get_model_card_field(repo, "doi"), + "DOI": get_doi(repo), } +# Convert all data types to string representation +def ensure_string_value(value) -> str: + if value is None: + return "" + if isinstance(value, list): + return ", ".join(str(v) for v in value) + if isinstance(value, dict): + return str(value) + return str(value) + def extract_display_name(val: str) -> str: match = re.search(r'"([^"]+)"\)$', val) # regex to extract the repo-name from "=HYPERLINK(..., "repo-name")" return match.group(1) if match else val @@ -131,6 +292,7 @@ def update_google_sheet(df: pd.DataFrame) -> None: continue # skip untouched columns value = row.get(col_name, "") + value = ensure_string_value(value) cell = gspread.utils.rowcol_to_a1(row_idx, col_idx) batch_body.append({ @@ -215,12 +377,22 @@ def main(): api = HfApi(token=TOKEN) try: - models = list(api.list_models(author=ORG_NAME, full=True)) + repos = [] + + for m in 
api.list_models(author=ORG_NAME, full=True): + repos.append((api.model_info(m.id), "model")) + + for d in api.list_datasets(author=ORG_NAME, full=True): + repos.append((api.dataset_info(d.id), "dataset")) + + for s in api.list_spaces(author=ORG_NAME, full=True): + repos.append((api.space_info(s.id), "space")) + except Exception as e: print(f'ERROR: Could not fetch models for "{ORG_NAME}"') print(e) return - + print("") print(f"Fetching Hugging Face repositories for: {ORG_NAME}") print("") @@ -232,13 +404,13 @@ def main(): if os.environ.get("CI") == "true": tqdm_kwargs = {"mininterval": 1, "dynamic_ncols": False, "leave": False} - for model in tqdm(models, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): + for repo, repo_type in tqdm(repos, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): try: - info = get_repo_info(model) + info = get_repo_info(repo, repo_type) data.append(info) - tqdm.write(f"Fetched info for /{model.id}") + tqdm.write(f"Fetched info for /{repo.id}") except Exception as e: - tqdm.write(f"ERROR: Cannot fetch /{model.id} info, due to {type(e).__name__}: {e}. Skipping...") + tqdm.write(f"ERROR: Cannot fetch /{repo.id} info, due to {type(e).__name__}: {e}. Skipping...") if not data: print("ERROR: No data collected") From 1158c8515f1bbeb548633df3d93205ff99ed9ee5 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 16:38:38 -0500 Subject: [PATCH 10/16] Fix author, top contributors. WIP: repos, home, paper --- hf_repo_exporter.py | 261 +++++++++++++++++++++++++------------------- 1 file changed, 148 insertions(+), 113 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 3d82b61..77b1c5a 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -8,7 +8,7 @@ import time import os import re -import yaml +from collections import Counter # Config ORG_NAME = "imageomics" @@ -24,6 +24,62 @@ def get_repo_url(repo, repo_type: str) -> str: else: # model return f"https://huggingface.co/{repo.id}" +def get_author(api, repo_id, repo_type) -> str: + try: + # Fetch all commits + commits = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type) + if not commits: + return ORG_NAME + + # The last item in the list is the earliest commit (the creation) + first_commit = commits[-1] + + if hasattr(first_commit, 'authors') and first_commit.authors: + first_author = first_commit.authors[0] + + # Handle both plain-string and object author entries + if isinstance(first_author, str): + return first_author + + # If it's an object, check for user handle then display name + return getattr(first_author, 'user', getattr(first_author, 'name', ORG_NAME)) + + return ORG_NAME + except Exception: + return ORG_NAME + +def get_top_contributors(api, repo_id, repo_type) -> str: + try: + commits = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type) + + all_handles = [] + for c in commits: + authors = getattr(c, 'authors', []) + for author in authors: + # If the author is a plain string, use it directly. 
+ # If it's an object, try to get .user or .name + if isinstance(author, str): + all_handles.append(author) + else: + handle = getattr(author, 'user', getattr(author, 'name', None)) + if handle: + all_handles.append(str(handle)) + + # Filter out the Org name and the web-flow bot + bots_and_orgs = {ORG_NAME.lower(), "web-flow"} + filtered = [n for n in all_handles if str(n).lower() not in bots_and_orgs] + + if not filtered: + return ORG_NAME + + counts = Counter(filtered) + # Get top 4 most common contributors + top_4 = [name for name, count in counts.most_common(4)] + return ", ".join(top_4) + except Exception as e: + # Optional: tqdm.write(f"Error for {repo_id}: {e}") + return ORG_NAME + def get_license(repo) -> str: # 1. cardData try: @@ -71,95 +127,10 @@ def is_inactive(repo) -> str: return "Yes" if last_modified < one_year_ago else "No" except Exception: return "N/A" - -def get_homepage_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict) and card_data.get("homepage"): - return f'=HYPERLINK("{card_data.get("homepage")}", "Homepage")' - except Exception: - pass - - # Check README for homepage URLs - try: - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'Homepage:\s*(https?://[^\s\n)]+)', readme_text, re.IGNORECASE) - if match: - return f'=HYPERLINK("{match.group(1)}", "Homepage")' - except Exception: - pass - - return "No" - -def get_repo_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict): - for key in ("repository", "repo", "github_repo"): - url = card_data.get(key) - if url and url.startswith("http"): - return f'=HYPERLINK("{url}", "Repository")' - except Exception: - pass - - # Check README for github/repo URLs - try: - from huggingface_hub import hf_hub_download - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'(https?://(?:github\.com|gitlab\.com)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) - if match: - url = match.group(1).rstrip('*`[]()]}') - return f'=HYPERLINK("{url}", "Repository")' - except Exception: - pass - - return "No" - -def get_paper_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict) and card_data.get("paper"): - return f'=HYPERLINK("{card_data.get("paper")}", "Paper")' - except Exception: - pass - - # Check README for arxiv/paper URLs - try: - from huggingface_hub import hf_hub_download - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'(https?://(?:arxiv\.org|doi\.org)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) - if match: - url = match.group(1).rstrip('*`[]()]}') - return f'=HYPERLINK("{url}", "Paper")' - except Exception: - pass - - return "No" def get_model_card_field(repo, key: str) -> str: try: - value = repo.cardData.get(key, "") + value = repo.card_data.get(key, "") # Convert to string if it's a list or other type if isinstance(value, list): return ", ".join(str(v) for v in value) @@ -167,24 +138,44 @@ def 
get_model_card_field(repo, key: str) -> str: except Exception: return "N/A" -def get_associated_assets(repo) -> str: +def get_associated_datasets(repo) -> str: try: - related = [tag for tag in repo.tags if tag.startswith(("dataset:", "model:", "space:"))] - return ", ".join(related) + datasets = [tag.replace("dataset:", "") for tag in repo.tags if tag.startswith("dataset:")] + return ", ".join(datasets) if datasets else "No" except Exception: - return "N/A" + return "No" -def clean_description(desc: str) -> str: - if not desc: - return "N/A" +def get_associated_models(api, repo, repo_type) -> str: + found = [tag.replace("model:", "") for tag in getattr(repo, "tags", []) if tag.startswith("model:")] - # remove HTML tags - desc = re.sub(r"<[^>]+>", "", desc) - - # collapse multiple newlines/tabs/spaces into a single space - desc = re.sub(r"\s+", " ", desc) - - return desc.strip() + # If it's a dataset, search for models that use this dataset + if repo_type == "dataset": + try: + related_models = api.list_models(filter=f"datasets:{repo.id}") + for m in related_models: + if m.id not in found: + found.append(m.id) + except Exception: + pass + + return ", ".join(found) if found else "No" + +def get_associated_spaces(api, repo, repo_type) -> str: + # 1. Check direct tags in the model card (Upstream) + found = [tag.replace("space:", "") for tag in getattr(repo, "tags", []) if tag.startswith("space:")] + + # 2. If it's a model, search for Spaces that use this model (Downstream) + if repo_type == "model": + try: + # Search for spaces that list this model ID + related_spaces = api.list_spaces(filter=f"models:{repo.id}") + for s in related_spaces: + if s.id not in found: + found.append(s.id) + except Exception: + pass + + return ", ".join(found) if found else "No" def get_doi(repo) -> str: try: @@ -196,7 +187,52 @@ def get_doi(repo) -> str: return "No" -def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: +def extract_link_from_text(text, label): + if not text: + return "No" + + # 1. THE AGGRESSIVE SEARCH + # We look for the Label, a colon, and then we specifically look for an http(s) link. + # This ignores leading spaces/bullets and stops at the end of the URL. + url_pattern = rf"{label}:\s*(https?://[^\s\)\"\'\>]+)" + match = re.search(url_pattern, text, re.IGNORECASE) + + if match: + url = match.group(1).strip().rstrip('.,)]') + return f'=HYPERLINK("{url}", "{label}")' + + # 2. FALLBACK: Look for the label followed by ANY text (for non-URL repos) + text_pattern = rf"{label}:\s*([^\r\n]+)" + match = re.search(text_pattern, text, re.IGNORECASE) + if match: + content = match.group(1).strip() + # Clean up markdown junk + content = re.sub(r'[*_`\[\]]', '', content) + if content.lower() not in ["no", "n/a", "none", ""]: + return content + + return "No" + +def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: + + # 1. Download README once + readme_text = "" + try: + # Debug print + tqdm.write(f"--- Debugging README for: {repo.id} ---") + + path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(path, 'r', encoding='utf-8') as f: + readme_text = f.read() + + except Exception as e: + tqdm.write(f"!!! 
Failed to download README for {repo.id}: {e}") + if repo_type == "dataset": display_id = f"datasets/{repo.id}" elif repo_type == "space": @@ -207,21 +243,23 @@ def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: return { "Repository Name": f'=HYPERLINK("{get_repo_url(repo, repo_type)}", "{display_id}")', "Repository Type": repo_type, - "Description": clean_description(getattr(repo, "description", "")), + "Description": get_model_card_field(repo, "model_description") or "N/A", "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", "Last Updated": repo.lastModified.strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", - "Created By": repo.author, - "Top 4 Contributors/Curators": "test", + "Created By": get_author(api, repo.id, repo_type), + "Top 4 Contributors/Curators": get_top_contributors(api, repo.id, repo_type), "Likes": getattr(repo, "likes", "N/A"), "# of Open PRs": "test", "README": "Yes" if getattr(repo, "cardData", False) else "No", "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", "Inactive": is_inactive(repo), - "Homepage": get_homepage_link(repo, repo_type), - "Repo": get_repo_link(repo, repo_type), - "Paper": get_paper_link(repo, repo_type), - "Associated data, models, or spaces": get_associated_assets(repo), + "Homepage": extract_link_from_text(readme_text, "Homepage"), + "Repo": extract_link_from_text(readme_text, "Repository"), + "Paper": extract_link_from_text(readme_text, "Paper"), + "Associated Datasets": get_associated_datasets(repo), + "Associated Models": get_associated_models(api, repo, repo_type), + "Associated Spaces": get_associated_spaces(api, repo, repo_type), "DOI": get_doi(repo), } @@ -317,12 +355,9 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - "Visibility", - "Inactive", "Homepage", "Repo", "Paper", - "Associated data, models, or spaces", } orange_columns = { @@ -406,7 +441,7 @@ def main(): for repo, repo_type in tqdm(repos, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): try: - info = get_repo_info(repo, repo_type) + info = get_repo_info(api, repo, repo_type) data.append(info) tqdm.write(f"Fetched info for /{repo.id}") except Exception as e: From 5a112f72c0d7f29b5819825b83610e5a5b090b4c Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 16:44:20 -0500 Subject: [PATCH 11/16] Fix homepage, repo, and paper hyperlinking --- hf_repo_exporter.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 77b1c5a..54b5c97 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -191,26 +191,32 @@ def extract_link_from_text(text, label): if not text: return "No" - # 1. THE AGGRESSIVE SEARCH - # We look for the Label, a colon, and then we specifically look for an http(s) link. - # This ignores leading spaces/bullets and stops at the end of the URL. - url_pattern = rf"{label}:\s*(https?://[^\s\)\"\'\>]+)" - match = re.search(url_pattern, text, re.IGNORECASE) + # Pattern to find 'Label: ...' anywhere in the text + # Added \b to ensure we match the exact word + pattern = rf"\b{label}\b:\s*([^\r\n]+)" + match = re.search(pattern, text, re.IGNORECASE) - if match: - url = match.group(1).strip().rstrip('.,)]') - return f'=HYPERLINK("{url}", "{label}")' - - # 2. 
FALLBACK: Look for the label followed by ANY text (for non-URL repos) - text_pattern = rf"{label}:\s*([^\r\n]+)" - match = re.search(text_pattern, text, re.IGNORECASE) if match: content = match.group(1).strip() - # Clean up markdown junk - content = re.sub(r'[*_`\[\]]', '', content) - if content.lower() not in ["no", "n/a", "none", ""]: - return content - + # Remove common markdown junk: *, _, `, [, ] + content = re.sub(r'[*_`\[\]]', '', content).strip() + + # Filter out placeholders + if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "-->"]: + return "No" + + # If it contains an http link, create a clean HYPERLINK formula + if "http" in content.lower(): + url_match = re.search(r'(https?://[^\s)]+)', content) + if url_match: + url = url_match.group(1).rstrip('.,)]') + # Use the text before the '(' as the label, or the default label + display_text = content.split('(')[0].strip() or label + # Double up quotes for Google Sheets formula safety + display_text = display_text.replace('"', '""') + return f'=HYPERLINK("{url}", "{display_text}")' + + return content return "No" def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: From 016ebd11846bf04d18efb4a04381820ed914da18 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:09:48 -0500 Subject: [PATCH 12/16] Update hf_repo_exporter.py Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- hf_repo_exporter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 54b5c97..c6ba0ba 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -361,7 +361,6 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - "Homepage", "Repo", "Paper", } From 1075f3d29aae426cf7f2a399a36104031fd41a3c Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:15:37 -0500 Subject: [PATCH 13/16] Add OpenPRs, WIP fix assoc. datasets,models,spaces --- hf_repo_exporter.py | 103 +++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 54b5c97..838e197 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -80,6 +80,19 @@ def get_top_contributors(api, repo_id, repo_type) -> str: # Optional: tqdm.write(f"Error for {repo_id}: {e}") return ORG_NAME +def get_open_pr_count(api, repo_id, repo_type) -> int: + try: + # Fetch discussions filtered by pull_requests only + discussions = api.get_repo_discussions( + repo_id=repo_id, + repo_type=repo_type + ) + # Count how many are pull requests AND are currently open + open_prs = [d for d in discussions if d.is_pull_request and d.status == "open"] + return len(open_prs) + except Exception: + return 0 + def get_license(repo) -> str: # 1. 
cardData try: @@ -140,42 +153,64 @@ def get_model_card_field(repo, key: str) -> str: def get_associated_datasets(repo) -> str: try: - datasets = [tag.replace("dataset:", "") for tag in repo.tags if tag.startswith("dataset:")] + # Looking for tags like 'dataset:user/repo' + datasets = [tag.replace("dataset:", "") for tag in getattr(repo, "tags", []) if tag.startswith("dataset:")] return ", ".join(datasets) if datasets else "No" except Exception: return "No" - + def get_associated_models(api, repo, repo_type) -> str: - found = [tag.replace("model:", "") for tag in getattr(repo, "tags", []) if tag.startswith("model:")] - - # If it's a dataset, search for models that use this dataset + found = [] + repo_id = getattr(repo, 'id', str(repo)) + if repo_type == "dataset": + tqdm.write(f"--- Searching Models for Dataset: {repo_id} ---") try: - related_models = api.list_models(filter=f"datasets:{repo.id}") - for m in related_models: - if m.id not in found: - found.append(m.id) - except Exception: - pass - + # Again, using 'search' to find any model mentioning this dataset + related_models = list(api.list_models(search=repo_id)) + if related_models: + found = [m.id for m in related_models if m.id != repo_id] + tqdm.write(f" [Search] Found {len(found)} models") + except Exception as e: + tqdm.write(f" [Search] Error: {e}") + return ", ".join(found) if found else "No" + +def get_associated_spaces(api, repo_id) -> str: + found = set() -def get_associated_spaces(api, repo, repo_type) -> str: - # 1. Check direct tags in the model card (Upstream) - found = [tag.replace("space:", "") for tag in getattr(repo, "tags", []) if tag.startswith("space:")] + # Ensure we are using the clean string ID + clean_id = repo_id.id if hasattr(repo_id, 'id') else str(repo_id) - # 2. If it's a model, search for Spaces that use this model (Downstream) - if repo_type == "model": - try: - # Search for spaces that list this model ID - related_spaces = api.list_spaces(filter=f"models:{repo.id}") - for s in related_spaces: - if s.id not in found: - found.append(s.id) - except Exception: - pass + try: + # 1. Broad Metadata Search + # We search specifically for the ID in the 'models' metadata field + # Note: We use list() to ensure the generator is fully exhausted + spaces_by_model = list(api.list_spaces(filter=f"models:{clean_id}")) + for s in spaces_by_model: + found.add(s.id) - return ", ".join(found) if found else "No" + # 2. String-based Search (The "Catch-all") + # This finds spaces that mention it but didn't use the standard YAML format + spaces_by_search = list(api.list_spaces(search=clean_id)) + for s in spaces_by_search: + found.add(s.id) + + # 3. Handle specific Org-level associations + # Sometimes spaces are linked but not indexed under the ID + # Let's filter out the self-reference + if clean_id in found: + found.remove(clean_id) + + except Exception as e: + tqdm.write(f"Error for {clean_id}: {e}") + + if not found: + return "No" + + # Sort and format + sorted_found = sorted(list(found)) + return ", ".join(sorted_found) def get_doi(repo) -> str: try: @@ -224,9 +259,6 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: # 1. 
Download README once readme_text = "" try: - # Debug print - tqdm.write(f"--- Debugging README for: {repo.id} ---") - path = hf_hub_download( repo_id=repo.id, filename="README.md", @@ -255,7 +287,7 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: "Created By": get_author(api, repo.id, repo_type), "Top 4 Contributors/Curators": get_top_contributors(api, repo.id, repo_type), "Likes": getattr(repo, "likes", "N/A"), - "# of Open PRs": "test", + "# of Open PRs": get_open_pr_count(api, repo.id, repo_type), "README": "Yes" if getattr(repo, "cardData", False) else "No", "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", @@ -265,7 +297,7 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: "Paper": extract_link_from_text(readme_text, "Paper"), "Associated Datasets": get_associated_datasets(repo), "Associated Models": get_associated_models(api, repo, repo_type), - "Associated Spaces": get_associated_spaces(api, repo, repo_type), + "Associated Spaces": get_associated_spaces(api, repo), "DOI": get_doi(repo), } @@ -363,10 +395,13 @@ def get_column_index(col_name: str): "License", "Homepage", "Repo", - "Paper", + "Paper" } - orange_columns = { + yellow_columns = { + "Associated Datasets", + "Associated Models", + "Associated Spaces", "DOI" } @@ -374,7 +409,7 @@ def get_column_index(col_name: str): # Only loop over columns that need formatting for col_set, color in [(red_columns, {"red": 1, "green": 0.5, "blue": 0.5}), - (orange_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: + (yellow_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: for col_name in col_set: col_index = get_column_index(col_name) From 4c1fb067497cf3f9de30495acc93bcdc786662e4 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:22:48 -0500 Subject: [PATCH 14/16] Update README with examples to run scripts locally --- README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index af9731b..14a3edf 100644 --- a/README.md +++ b/README.md @@ -110,10 +110,25 @@ Now update the script with [your GitHub Organization name](https://github.com/Im pip install -r requirements.txt ``` -5. Run the program - ``` - python export_repos.py - ``` +5. 
Run the exporters + + You can run **either exporter individually** or **both**, depending on your needs: + + - **Run only the GitHub repository exporter** + ``` + python gh_repo_exporter.py + ``` + + - **Run only the Hugging Face repository exporter** + ``` + python hf_repo_exporter.py + ``` + + - **Run both exporters (wait for one to finish before running the other)** + ``` + python hf_repo_exporter.py + python gh_repo_exporter.py + ``` ## Important Notes From 5e883c854e74104f21fd01ec72ae34386d740895 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:32:25 -0500 Subject: [PATCH 15/16] Add TBD to filter word check --- hf_repo_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index fdcf5af..b0d43c6 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -237,7 +237,7 @@ def extract_link_from_text(text, label): content = re.sub(r'[*_`\[\]]', '', content).strip() # Filter out placeholders - if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "-->"]: + if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "TBD", "-->"]: return "No" # If it contains an http link, create a clean HYPERLINK formula From fd48c4bcdf6070974124ab0755b35eb04bf49284 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Fri, 19 Dec 2025 16:59:30 -0500 Subject: [PATCH 16/16] Update get_doi() to better detect DOIs --- hf_repo_exporter.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index b0d43c6..1efce37 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -214,10 +214,23 @@ def get_associated_spaces(api, repo_id) -> str: def get_doi(repo) -> str: try: - for tag in repo.tags: - if tag.startswith("doi:"): - return tag.replace("doi:", "") - except Exception: + # 1. Check if the DOI is a direct attribute + if hasattr(repo, 'doi') and repo.doi: + return str(repo.doi).replace("doi:", "") + + # 2. Check the metadata dictionary if it exists + if hasattr(repo, 'card_data') and repo.card_data: + doi = repo.card_data.get('doi') + if doi: + return str(doi).replace("doi:", "") + + # 3. Fallback to the tags loop (for manually tagged DOIs) + if hasattr(repo, 'tags') and repo.tags: + for tag in repo.tags: + if isinstance(tag, str) and tag.lower().startswith("doi:"): + return tag.replace("doi:", "").replace("DOI:", "") + + except Exception as e: pass return "No"
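For a quick local sanity check of the README link-extraction behavior (the `extract_link_from_text` rewrite in patch 11 plus the placeholder filter from patch 15), the following sketch can be run from the repo root. It assumes the requirements are installed and `hf_repo_exporter.py` is importable; the sample README text and URLs are made up:

```
# Minimal check of extract_link_from_text(); sample text is hypothetical.
from hf_repo_exporter import extract_link_from_text

sample = """Homepage: https://example.org/project
Repository: https://github.com/Imageomics/some-repo
Paper: TBD
"""

# A bare URL becomes a HYPERLINK formula; the display text ends up being
# the URL itself, since everything after "Label:" is used as the label.
print(extract_link_from_text(sample, "Homepage"))
# =HYPERLINK("https://example.org/project", "https://example.org/project")

# "TBD" is caught by the placeholder filter added in patch 15.
print(extract_link_from_text(sample, "Paper"))
# No
```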
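The DOI resolution order introduced in patch 16 (direct `doi` attribute, then `card_data`, then `doi:` tags) can be exercised the same way with a stub object; the attribute names mirror what `get_doi()` reads, and the DOI values below are invented for illustration:

```
# get_doi() falls through: .doi attribute -> card_data -> tags.
from types import SimpleNamespace
from hf_repo_exporter import get_doi

stub = SimpleNamespace(
    doi=None,                               # step 1: empty, skipped
    card_data={"doi": "10.57967/hf/0000"},  # step 2: found here, wins
    tags=["doi:10.57967/hf/9999"],          # step 3: never reached
)
print(get_doi(stub))  # 10.57967/hf/0000
```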
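Likewise for the one-year inactivity cutoff: after patch 09, `is_inactive()` expects `lastModified` to be a `datetime` (naive values are treated as UTC) rather than an ISO string, which a stub makes easy to verify:

```
# is_inactive() flags repos whose lastModified is more than 365 days old.
from datetime import datetime, timezone
from types import SimpleNamespace
from hf_repo_exporter import is_inactive

fresh = SimpleNamespace(lastModified=datetime.now(timezone.utc))
stale = SimpleNamespace(lastModified=datetime(2020, 1, 1, tzinfo=timezone.utc))

print(is_inactive(fresh))  # No
print(is_inactive(stale))  # Yes
```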