From 2df4929bef5b1efec964c7ff550b36204651566b Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:17:13 -0500 Subject: [PATCH 01/16] Add WIP hf repo exporter and rename files --- .../{main.yml => gh-repo-exporter.yml} | 4 +- .github/workflows/hf-repo-exporter.yml | 37 +++ export_repos.py => gh_repo_exporter.py | 11 +- hf_repo_exporter.py | 266 ++++++++++++++++++ requirements.txt | 28 +- tests/test_has_doi_for_repo.py | 2 +- 6 files changed, 314 insertions(+), 34 deletions(-) rename .github/workflows/{main.yml => gh-repo-exporter.yml} (93%) create mode 100644 .github/workflows/hf-repo-exporter.yml rename export_repos.py => gh_repo_exporter.py (99%) create mode 100644 hf_repo_exporter.py diff --git a/.github/workflows/main.yml b/.github/workflows/gh-repo-exporter.yml similarity index 93% rename from .github/workflows/main.yml rename to .github/workflows/gh-repo-exporter.yml index 2577068..aa09f54 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/gh-repo-exporter.yml @@ -8,7 +8,7 @@ on: # │ │ │ ┌────── restricted to month (1-12) # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) # │ │ │ │ │ * means doesn't restrict anything - - cron: "0 9 * * 1" # Runs once every Monday at 9 AM + - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC workflow_dispatch: inputs: repo_type: @@ -44,4 +44,4 @@ jobs: GH_TOKEN: ${{ secrets.GH_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json REPO_TYPE: ${{ github.event.inputs.repo_type || 'all' }} - run: python export_repos.py + run: python export_repos.py \ No newline at end of file diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml new file mode 100644 index 0000000..a878c5c --- /dev/null +++ b/.github/workflows/hf-repo-exporter.yml @@ -0,0 +1,37 @@ +name: Update Metadata for GitHub Repository Sheet + +on: + schedule: + # ┌──────────── restricted to minute (0-59) + # │ ┌────────── restricted to hour (0-23) + # │ │ ┌──────── restricted to day of month (1-31) + # │ │ │ ┌────── restricted to month (1-12) + # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) + # │ │ │ │ │ * means doesn't restrict anything + - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC + workflow_dispatch: + +jobs: + update-sheet: + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Write Google credentials + run: printf "%s" '${{ secrets.GOOGLE_SERVICE_ACCOUNT_JSON }}' > service_account.json + + - name: Run script + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + GOOGLE_CREDENTIALS_PATH: service_account.json + run: python export_repos.py \ No newline at end of file diff --git a/export_repos.py b/gh_repo_exporter.py similarity index 99% rename from export_repos.py rename to gh_repo_exporter.py index 87333ff..b369b45 100644 --- a/export_repos.py +++ b/gh_repo_exporter.py @@ -1,13 +1,14 @@ -import os -import pandas as pd from github import Github, GithubException, Auth +import pandas as pd from tqdm import tqdm -from datetime import datetime, timedelta, timezone +from google.oauth2.service_account import Credentials +import gspread import yaml + +from datetime import datetime, timedelta, timezone import time +import os import re -import gspread -from google.oauth2.service_account import Credentials # Config ORG_NAME = "Imageomics" diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py new 
file mode 100644 index 0000000..6436d1b --- /dev/null +++ b/hf_repo_exporter.py @@ -0,0 +1,266 @@ +from huggingface_hub import HfApi +import pandas as pd +import tqdm +from google.oauth2.service_account import Credentials +import gspread + +from datetime import datetime, timedelta, timezone +import time +import os +import re + +# Config +ORG_NAME = "imageomics" +SPREADSHEET_ID = "1NOVB9IfBvkAh4YDbozhi5q0iwBfyp3enD6UxmO6wHIA" +SHEET_NAME = "Sheet1" + +# Helper Functions +def get_repo_url(repo) -> str: + if repo._hf_repo_type == "dataset": + return f"https://huggingface.co/datasets/{repo.id}" + elif repo._hf_repo_type == "space": + return f"https://huggingface.co/spaces/{repo.id}" + else: # model + return f"https://huggingface.co/{repo.id}" + +def is_inactive(repo): + try: + last_modified = getattr(repo, "lastModified", None) + if not last_modified: + return "N/A" + + # Parse ISO 8601 string + updated = datetime.fromisoformat(last_modified.replace("Z", "")) + if updated.tzinfo is None: + updated = updated.replace(tzinfo=timezone.utc) + + one_year_ago = datetime.now(timezone.utc) - timedelta(days=365) + return "Yes" if updated < one_year_ago else "No" + except Exception: + return "N/A" + +def get_model_card_field(repo, key: str) -> str: + try: + return repo.cardData.get(key, "") + except Exception: + return "N/A" + +def get_associated_assets(repo) -> str: + try: + related = [tag for tag in repo.tags if tag.startswith(("dataset:", "model:", "space:"))] + return ", ".join(related) + except Exception: + return "N/A" + +def get_repo_info(repo) -> dict[str, str | int]: + return { + "Repository Name": f'=HYPERLINK("{get_repo_url(repo)}", "{repo.id}")', + "Repository Type": repo._hf_repo_type, + "Description": getattr(repo, "description", "N/A"), + "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", + "Last Updated": datetime.fromisoformat(repo.lastModified.replace("Z", "")).strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", + "Created By": repo.author, + "Top 4 Contributors/Curators": ..., + "Likes": getattr(repo, "likes", "N/A"), + "# of Open PRs": ..., + "README": "Yes" if getattr(repo, "cardData", False) else "No", + "License": "Yes" if getattr(repo, "license", False) else "No", + "Visibility": "Private" if getattr(repo, "private", False) else "Public", + "Inactive": is_inactive(repo), + "Homepage": get_model_card_field(repo, "homepage"), + "Repo": f'=HYPERLINK("{get_model_card_field(repo, "github_repo")}", "{repo.id}")', + "Paper": f'=HYPERLINK("{get_model_card_field(repo, "paper")}", "Paper")', + "Associated data, models, or spaces": get_associated_assets(repo), + "DOI": get_model_card_field(repo, "doi"), + } + +def extract_display_name(val: str) -> str: + match = re.search(r'"([^"]+)"\)$', val) # regex to extract the repo-name from "=HYPERLINK(..., "repo-name")" + return match.group(1) if match else val + +def update_google_sheet(df: pd.DataFrame) -> None: + # Authenticate Google API + creds_path = os.getenv("GOOGLE_CREDENTIALS_PATH", "service_account.json") + + creds = Credentials.from_service_account_file( + creds_path, + scopes=[ + "https://www.googleapis.com/auth/spreadsheets", + "https://www.googleapis.com/auth/drive" + ] + ) + + client = gspread.authorize(creds) + sheet = client.open_by_key(SPREADSHEET_ID).worksheet(SHEET_NAME) + + # Pull current header + HEADER_ROW_INDEX = 2 + header = sheet.row_values(HEADER_ROW_INDEX) + + # Find + try: + repo_col_index = header.index("Repository Name") + except ValueError: + 
raise ValueError('Sheet is missing "Repository Name" column') + + # Build a dict of repo name -> index + existing = sheet.get_all_values() + data_rows = existing[HEADER_ROW_INDEX:] + name_to_row = {} + for offset, row in enumerate(data_rows, start=HEADER_ROW_INDEX + 1): + if len(row) <= repo_col_index: # if row of data fetched is missing repo name column, ignore the row + continue + + sheet_repo_name = extract_display_name(row[repo_col_index]) # hardcoded to check for "Repository Name" column in row 0 + name_to_row[sheet_repo_name] = offset + + batch_body = [] + for _, row in df.iterrows(): + repo_name = extract_display_name(row["Repository Name"]) + + # Determine row index + if repo_name in name_to_row: + row_idx = name_to_row[repo_name] + else: + row_idx = len(existing) + 1 + existing.append([""] * len(header)) + + # Create (range, value) for each column individually + for col_idx, col_name in enumerate(header, start=1): + if col_name not in df.columns: + continue # skip untouched columns + + value = row.get(col_name, "") + cell = gspread.utils.rowcol_to_a1(row_idx, col_idx) + + batch_body.append({ + "range": cell, + "majorDimension": "ROWS", + "values": [[value]] # single cell update + }) + + sheet.spreadsheet.values_batch_update( + body={ + "value_input_option": "USER_ENTERED", + "data": batch_body + } + ) + + def get_column_index(col_name: str): + try: + return header.index(col_name) + except ValueError: + return None # column not found + + red_columns = { + "README", + "License", + ".gitignore", + "Package Requirements", + "CITATION" + } + + orange_columns = { + ".zenodo.json", + "CONTRIBUTING", + "AGENTS", + "Website Reference", + "Dataset", + "Model", + "Paper Association", + "DOI for GitHub Repo" + } + + rules = [] + + # Only loop over columns that need formatting + for col_set, color in [(red_columns, {"red": 1, "green": 0.5, "blue": 0.5}), + (orange_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: + + for col_name in col_set: + col_index = get_column_index(col_name) + if col_index is None: + continue # skip missing columns + + rules.append({ + "addConditionalFormatRule": { + "rule": { + "ranges": [{ + "sheetId": sheet.id, + "startRowIndex": HEADER_ROW_INDEX, # start after header + "endRowIndex": HEADER_ROW_INDEX + len(df), # only data rows + "startColumnIndex": col_index, + "endColumnIndex": col_index + 1 + }], + "booleanRule": { + "condition": { + "type": "TEXT_EQ", + "values": [{"userEnteredValue": "No"}] + }, + "format": { + "backgroundColor": color + } + } + }, + "index": 0 + } + }) + + sheet.spreadsheet.batch_update({"requests": rules}) + +# ------- + +def main(): + + TOKEN = os.getenv("HF_TOKEN") or input("Enter your Hugging Face token: ").strip() + + start_time = time.time() + + api = HfApi(token=TOKEN) + + try: + models = list(api.list_models(author=ORG_NAME, full=True)) + except Exception as e: + print(f'ERROR: Could not fetch models for "{ORG_NAME}"') + print(e) + return + + print("") + print(f"Fetching Hugging Face repositories for: {ORG_NAME}") + print("") + print("----------------") + + data = [] + + tqdm_kwargs = {} + if os.environ.get("CI") == "true": + tqdm_kwargs = {"mininterval": 1, "dynamic_ncols": False, "leave": False} + + for model in tqdm(models, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): + try: + info = get_repo_info(model) + data.append(info) + tqdm.write(f"Fetched info for /{model.id}") + except Exception as e: + tqdm.write(f"ERROR: Cannot fetch /{model.id} info, due to {type(e).__name__}: {e}. 
Skipping...") + + if not data: + print("ERROR: No data collected") + return + + print("----------------") + print("") + + df = pd.DataFrame(data) + df.sort_values(by="Repository Name", inplace=True) + + update_google_sheet(df) + print(f"Finished fetching info for {len(df)} repositories from {ORG_NAME} organization") + + elapsed = time.time() - start_time + minutes, seconds = divmod(int(elapsed), 60) + + print(f"Total time taken: {minutes}m {seconds}s") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 01b8649..2160fd1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,8 @@ -cachetools==6.2.1 -certifi==2025.10.5 -cffi==2.0.0 -charset-normalizer==3.4.4 -colorama==0.4.6 -cryptography==46.0.3 -et_xmlfile==2.0.0 google-auth==2.41.1 -google-auth-oauthlib==1.2.3 gspread==6.2.1 -idna==3.11 -numpy==2.3.4 -oauthlib==3.3.1 pandas==2.3.3 -pyasn1==0.6.1 -pyasn1_modules==0.4.2 -pycparser==2.23 PyGithub==2.8.1 -PyJWT==2.10.1 -PyNaCl==1.6.0 -python-dateutil==2.9.0.post0 -pytz==2025.2 -requests==2.32.5 -requests-oauthlib==2.0.0 -rsa==4.9.1 -six==1.17.0 tqdm==4.67.1 -typing_extensions==4.15.0 -tzdata==2025.2 -urllib3==2.6.0 PyYAML==6.0.3 -pytest==9.0.1 \ No newline at end of file +pytest==9.0.1 +huggingface_hub==1.2.3 \ No newline at end of file diff --git a/tests/test_has_doi_for_repo.py b/tests/test_has_doi_for_repo.py index 5218c27..2904afd 100644 --- a/tests/test_has_doi_for_repo.py +++ b/tests/test_has_doi_for_repo.py @@ -1,5 +1,5 @@ import pytest -from export_repos import has_doi +from gh_repo_exporter import has_doi # --- Fake GitHub objects --- From 5bd8cf3e32e4b9a1b36eedeb987a5441c7d9802a Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:23:06 -0500 Subject: [PATCH 02/16] move workflow to top-level to fix manual --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index a878c5c..b81aa72 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -1,6 +1,7 @@ name: Update Metadata for GitHub Repository Sheet on: + workflow_dispatch: schedule: # ┌──────────── restricted to minute (0-59) # │ ┌────────── restricted to hour (0-23) @@ -9,7 +10,6 @@ on: # │ │ │ │ ┌──── restricted to day of week (0-6, 0=Sunday) # │ │ │ │ │ * means doesn't restrict anything - cron: "0 9 * * 1" # Runs once every Monday at 9 AM UTC - workflow_dispatch: jobs: update-sheet: From ec8adb5ab0029715032c486876680add62fed814 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Tue, 16 Dec 2025 15:28:21 -0500 Subject: [PATCH 03/16] Update name for hugging face workflow --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index b81aa72..155216f 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -1,4 +1,4 @@ -name: Update Metadata for GitHub Repository Sheet +name: Update Metadata for Hugging Face Repository Sheet on: workflow_dispatch: From 6ccdb4250cf3b97c04fca02bfaeb2b537f6b5a57 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Wed, 17 Dec 2025 11:44:01 -0500 Subject: [PATCH 04/16] Update coloring --- hf_repo_exporter.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 
6436d1b..ac261a7 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -155,20 +155,16 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - ".gitignore", - "Package Requirements", - "CITATION" + "Visibility", + "Inactive", + "Homepage", + "Repo", + "Paper", + "Associated data, models, or spaces", } orange_columns = { - ".zenodo.json", - "CONTRIBUTING", - "AGENTS", - "Website Reference", - "Dataset", - "Model", - "Paper Association", - "DOI for GitHub Repo" + "DOI" } rules = [] From 63a26ef95204c1706e0892c79cd3673de9213de5 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Wed, 17 Dec 2025 11:58:12 -0500 Subject: [PATCH 05/16] Add instructions to create HF token for gh secrets --- README.md | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 23124cb..8e1630c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ A Python script that gathers metadata for all repositories in a GitHub organizat - [Features](#features) - [Usage](#usage) - [Set up your own GitHub Actions workflow](#set-up-your-own-github-actions-workflow) - - [Create a GitHub Personal Access Token](#create-a-github-personal-access-token) + - [Create a GitHub Personal Access Token](#create-a-github-personal-access-token) + - [Create a Hugging Face Token](#create-a-hugging-face-token) - [Set up Google Cloud Service Account Access](#set-up-google-cloud-service-account-access) - [Run repo exporter locally](#run-repo-exporter-locally) - [Important Notes](#important-notes) @@ -42,17 +43,28 @@ To use this script within your own GitHub organization, first fork this repo, th To create one with permissions for both private and public repositories (public repository read-access only is enabled by default without administrator approval): - 1. Go to [github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) - 2. Click **Generate new token → Fine-grained token** - 3. Under **Resource owner**, select the **organization** you want to access. - 4. Under **Repository access**, choose **All repositories**. - 5. Under **Permissions** select **Repositories** and set: + 1. Go to [github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) + 2. Click **Generate new token -> Fine-grained token** + 3. Under **Resource owner**, select the **organization** you want to access. + 4. Under **Repository access**, choose **All repositories**. + 5. Under **Permissions** select **Repositories** and set: - **Metadata** -> Read-only - **Contents** -> Read-only - **Administration** -> Read-only - 6. Click **Generate token** and **copy it** (make sure to store it somewhere safe for future use). - 7. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **GH_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** - **Note:** The token must be approved by the organization administrator before accessing private repositories. + 6. Click **Generate token** and **copy it** (make sure to store it somewhere safe for future use). + 7. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **GH_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** + **Note:** The token must be approved by the organization administrator before accessing private repositories. 
+ +### Create a Hugging Face Token + + To create one with permissions for both private and public repositories: + + 1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) + 2. Click on **New Token** and name it **repo-exporter** + 3. For permissions select: + - **Read** + 4. Click **Generate** and **copy it** (make sure to store it somewhere safe for future use) + 5. Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **HF_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** ### Set up Google Cloud Service Account Access From 6f3128d1bd04fb9565f8c46525094b6255acbb98 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:59:04 -0500 Subject: [PATCH 06/16] Update .github/workflows/gh-repo-exporter.yml Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- .github/workflows/gh-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-repo-exporter.yml b/.github/workflows/gh-repo-exporter.yml index aa09f54..38ebf9c 100644 --- a/.github/workflows/gh-repo-exporter.yml +++ b/.github/workflows/gh-repo-exporter.yml @@ -44,4 +44,4 @@ jobs: GH_TOKEN: ${{ secrets.GH_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json REPO_TYPE: ${{ github.event.inputs.repo_type || 'all' }} - run: python export_repos.py \ No newline at end of file + run: python gh_repo_exporter.py From 927fe7bf23ec27c538f578705f4985f5f234c9e0 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:59:13 -0500 Subject: [PATCH 07/16] Update .github/workflows/hf-repo-exporter.yml Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- .github/workflows/hf-repo-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hf-repo-exporter.yml b/.github/workflows/hf-repo-exporter.yml index 155216f..0ae0999 100644 --- a/.github/workflows/hf-repo-exporter.yml +++ b/.github/workflows/hf-repo-exporter.yml @@ -34,4 +34,4 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} GOOGLE_CREDENTIALS_PATH: service_account.json - run: python export_repos.py \ No newline at end of file + run: python hf_repo_exporter.py From 031b57aeee6b74c9b03ac8d2810eda9d6b0da905 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:02:12 -0500 Subject: [PATCH 08/16] Update README.md Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e1630c..af9731b 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,9 @@ To use this script within your own GitHub organization, first fork this repo, th 1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) 2. Click on **New Token** and name it **repo-exporter** - 3. For permissions select: - - **Read** + 3. For permissions select **Fine-grained**: + - Specify the desired organization (under **Org permissions**) + - Under **Repositories**, select "Read access to contents of all repos in selected organizations" 4. Click **Generate** and **copy it** (make sure to store it somewhere safe for future use) 5. 
Navigate to `https://github.com//repo-exporter/settings/secrets/actions` and click **New repository secret** and name it **HF_TOKEN** and copy paste the token into the **Secret** section and click **Add secret** From 8541a3bf6a4cc80966b823ec040495d82eb213a1 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 13:05:40 -0500 Subject: [PATCH 09/16] WIP get_repo() and get_paper() funcs --- hf_repo_exporter.py | 234 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 203 insertions(+), 31 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index ac261a7..3d82b61 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -1,6 +1,6 @@ -from huggingface_hub import HfApi +from huggingface_hub import HfApi, hf_hub_download import pandas as pd -import tqdm +from tqdm import tqdm from google.oauth2.service_account import Credentials import gspread @@ -8,6 +8,7 @@ import time import os import re +import yaml # Config ORG_NAME = "imageomics" @@ -15,33 +16,154 @@ SHEET_NAME = "Sheet1" # Helper Functions -def get_repo_url(repo) -> str: - if repo._hf_repo_type == "dataset": +def get_repo_url(repo, repo_type: str) -> str: + if repo_type == "dataset": return f"https://huggingface.co/datasets/{repo.id}" - elif repo._hf_repo_type == "space": + elif repo_type == "space": return f"https://huggingface.co/spaces/{repo.id}" else: # model return f"https://huggingface.co/{repo.id}" - -def is_inactive(repo): + +def get_license(repo) -> str: + # 1. cardData + try: + license_from_card = getattr(repo, "cardData", {}).get("license") + if license_from_card: + return license_from_card + except Exception: + pass + + # 2. repo.license attribute + try: + license_attr = getattr(repo, "license", None) + if license_attr: + return license_attr + except Exception: + pass + + # 3. 
YAML content in README + try: + readme_text = getattr(repo, "readme", None) + if readme_text: + import re, yaml + match = re.search(r'^---\s*(.*?)\s*---', readme_text, re.DOTALL | re.MULTILINE) + if match: + yaml_content = match.group(1) + data = yaml.safe_load(yaml_content) + if isinstance(data, dict): + return data.get("license", "No") + except Exception: + pass + + return "No" + +def is_inactive(repo) -> str: try: last_modified = getattr(repo, "lastModified", None) if not last_modified: return "N/A" - # Parse ISO 8601 string - updated = datetime.fromisoformat(last_modified.replace("Z", "")) - if updated.tzinfo is None: - updated = updated.replace(tzinfo=timezone.utc) + # Ensure last_modified is aware (has tzinfo) + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=timezone.utc) one_year_ago = datetime.now(timezone.utc) - timedelta(days=365) - return "Yes" if updated < one_year_ago else "No" + return "Yes" if last_modified < one_year_ago else "No" except Exception: return "N/A" + +def get_homepage_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict) and card_data.get("homepage"): + return f'=HYPERLINK("{card_data.get("homepage")}", "Homepage")' + except Exception: + pass + + # Check README for homepage URLs + try: + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'Homepage:\s*(https?://[^\s\n)]+)', readme_text, re.IGNORECASE) + if match: + return f'=HYPERLINK("{match.group(1)}", "Homepage")' + except Exception: + pass + + return "No" + +def get_repo_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict): + for key in ("repository", "repo", "github_repo"): + url = card_data.get(key) + if url and url.startswith("http"): + return f'=HYPERLINK("{url}", "Repository")' + except Exception: + pass + + # Check README for github/repo URLs + try: + from huggingface_hub import hf_hub_download + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'(https?://(?:github\.com|gitlab\.com)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) + if match: + url = match.group(1).rstrip('*`[]()]}') + return f'=HYPERLINK("{url}", "Repository")' + except Exception: + pass + + return "No" + +def get_paper_link(repo, repo_type: str) -> str: + try: + card_data = getattr(repo, "cardData", None) + if isinstance(card_data, dict) and card_data.get("paper"): + return f'=HYPERLINK("{card_data.get("paper")}", "Paper")' + except Exception: + pass + + # Check README for arxiv/paper URLs + try: + from huggingface_hub import hf_hub_download + readme_path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(readme_path, 'r', encoding='utf-8') as f: + readme_text = f.read() + match = re.search(r'(https?://(?:arxiv\.org|doi\.org)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) + if match: + url = match.group(1).rstrip('*`[]()]}') + return f'=HYPERLINK("{url}", "Paper")' + except Exception: + pass + + return "No" def get_model_card_field(repo, key: str) -> str: try: - return repo.cardData.get(key, "") + value = repo.cardData.get(key, "") + # Convert 
to string if it's a list or other type + if isinstance(value, list): + return ", ".join(str(v) for v in value) + return str(value) if value else "" except Exception: return "N/A" @@ -51,29 +173,68 @@ def get_associated_assets(repo) -> str: return ", ".join(related) except Exception: return "N/A" + +def clean_description(desc: str) -> str: + if not desc: + return "N/A" + + # remove HTML tags + desc = re.sub(r"<[^>]+>", "", desc) + + # collapse multiple newlines/tabs/spaces into a single space + desc = re.sub(r"\s+", " ", desc) + + return desc.strip() + +def get_doi(repo) -> str: + try: + for tag in repo.tags: + if tag.startswith("doi:"): + return tag.replace("doi:", "") + except Exception: + pass + + return "No" + +def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: + if repo_type == "dataset": + display_id = f"datasets/{repo.id}" + elif repo_type == "space": + display_id = f"spaces/{repo.id}" + else: + display_id = repo.id -def get_repo_info(repo) -> dict[str, str | int]: return { - "Repository Name": f'=HYPERLINK("{get_repo_url(repo)}", "{repo.id}")', - "Repository Type": repo._hf_repo_type, - "Description": getattr(repo, "description", "N/A"), + "Repository Name": f'=HYPERLINK("{get_repo_url(repo, repo_type)}", "{display_id}")', + "Repository Type": repo_type, + "Description": clean_description(getattr(repo, "description", "")), "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", - "Last Updated": datetime.fromisoformat(repo.lastModified.replace("Z", "")).strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", + "Last Updated": repo.lastModified.strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", "Created By": repo.author, - "Top 4 Contributors/Curators": ..., + "Top 4 Contributors/Curators": "test", "Likes": getattr(repo, "likes", "N/A"), - "# of Open PRs": ..., + "# of Open PRs": "test", "README": "Yes" if getattr(repo, "cardData", False) else "No", - "License": "Yes" if getattr(repo, "license", False) else "No", + "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", "Inactive": is_inactive(repo), - "Homepage": get_model_card_field(repo, "homepage"), - "Repo": f'=HYPERLINK("{get_model_card_field(repo, "github_repo")}", "{repo.id}")', - "Paper": f'=HYPERLINK("{get_model_card_field(repo, "paper")}", "Paper")', + "Homepage": get_homepage_link(repo, repo_type), + "Repo": get_repo_link(repo, repo_type), + "Paper": get_paper_link(repo, repo_type), "Associated data, models, or spaces": get_associated_assets(repo), - "DOI": get_model_card_field(repo, "doi"), + "DOI": get_doi(repo), } +# Convert all data types to string representation +def ensure_string_value(value) -> str: + if value is None: + return "" + if isinstance(value, list): + return ", ".join(str(v) for v in value) + if isinstance(value, dict): + return str(value) + return str(value) + def extract_display_name(val: str) -> str: match = re.search(r'"([^"]+)"\)$', val) # regex to extract the repo-name from "=HYPERLINK(..., "repo-name")" return match.group(1) if match else val @@ -131,6 +292,7 @@ def update_google_sheet(df: pd.DataFrame) -> None: continue # skip untouched columns value = row.get(col_name, "") + value = ensure_string_value(value) cell = gspread.utils.rowcol_to_a1(row_idx, col_idx) batch_body.append({ @@ -215,12 +377,22 @@ def main(): api = HfApi(token=TOKEN) try: - models = list(api.list_models(author=ORG_NAME, full=True)) + repos = [] + + for m in 
api.list_models(author=ORG_NAME, full=True): + repos.append((api.model_info(m.id), "model")) + + for d in api.list_datasets(author=ORG_NAME, full=True): + repos.append((api.dataset_info(d.id), "dataset")) + + for s in api.list_spaces(author=ORG_NAME, full=True): + repos.append((api.space_info(s.id), "space")) + except Exception as e: print(f'ERROR: Could not fetch models for "{ORG_NAME}"') print(e) return - + print("") print(f"Fetching Hugging Face repositories for: {ORG_NAME}") print("") @@ -232,13 +404,13 @@ def main(): if os.environ.get("CI") == "true": tqdm_kwargs = {"mininterval": 1, "dynamic_ncols": False, "leave": False} - for model in tqdm(models, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): + for repo, repo_type in tqdm(repos, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): try: - info = get_repo_info(model) + info = get_repo_info(repo, repo_type) data.append(info) - tqdm.write(f"Fetched info for /{model.id}") + tqdm.write(f"Fetched info for /{repo.id}") except Exception as e: - tqdm.write(f"ERROR: Cannot fetch /{model.id} info, due to {type(e).__name__}: {e}. Skipping...") + tqdm.write(f"ERROR: Cannot fetch /{repo.id} info, due to {type(e).__name__}: {e}. Skipping...") if not data: print("ERROR: No data collected") From 1158c8515f1bbeb548633df3d93205ff99ed9ee5 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 16:38:38 -0500 Subject: [PATCH 10/16] Fix author, top contributors. WIP: repos, home, paper --- hf_repo_exporter.py | 261 +++++++++++++++++++++++++------------------- 1 file changed, 148 insertions(+), 113 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 3d82b61..77b1c5a 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -8,7 +8,7 @@ import time import os import re -import yaml +from collections import Counter # Config ORG_NAME = "imageomics" @@ -24,6 +24,62 @@ def get_repo_url(repo, repo_type: str) -> str: else: # model return f"https://huggingface.co/{repo.id}" +def get_author(api, repo_id, repo_type) -> str: + try: + # Fetch all commits + commits = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type) + if not commits: + return ORG_NAME + + # The last item in the list is the earliest commit (the creation) + first_commit = commits[-1] + + if hasattr(first_commit, 'authors') and first_commit.authors: + first_author = first_commit.authors[0] + + # Handle both plain-string and object author entries + if isinstance(first_author, str): + return first_author + + # If it's an object, check for user handle then display name + return getattr(first_author, 'user', getattr(first_author, 'name', ORG_NAME)) + + return ORG_NAME + except Exception: + return ORG_NAME + +def get_top_contributors(api, repo_id, repo_type) -> str: + try: + commits = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type) + + all_handles = [] + for c in commits: + authors = getattr(c, 'authors', []) + for author in authors: + # If the author is a plain string, use it directly. 
+ # If it's an object, try to get .user or .name + if isinstance(author, str): + all_handles.append(author) + else: + handle = getattr(author, 'user', getattr(author, 'name', None)) + if handle: + all_handles.append(str(handle)) + + # Filter out the Org name and the web-flow bot + bots_and_orgs = {ORG_NAME.lower(), "web-flow"} + filtered = [n for n in all_handles if str(n).lower() not in bots_and_orgs] + + if not filtered: + return ORG_NAME + + counts = Counter(filtered) + # Get top 4 most common contributors + top_4 = [name for name, count in counts.most_common(4)] + return ", ".join(top_4) + except Exception as e: + # Optional: tqdm.write(f"Error for {repo_id}: {e}") + return ORG_NAME + def get_license(repo) -> str: # 1. cardData try: @@ -71,95 +127,10 @@ def is_inactive(repo) -> str: return "Yes" if last_modified < one_year_ago else "No" except Exception: return "N/A" - -def get_homepage_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict) and card_data.get("homepage"): - return f'=HYPERLINK("{card_data.get("homepage")}", "Homepage")' - except Exception: - pass - - # Check README for homepage URLs - try: - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'Homepage:\s*(https?://[^\s\n)]+)', readme_text, re.IGNORECASE) - if match: - return f'=HYPERLINK("{match.group(1)}", "Homepage")' - except Exception: - pass - - return "No" - -def get_repo_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict): - for key in ("repository", "repo", "github_repo"): - url = card_data.get(key) - if url and url.startswith("http"): - return f'=HYPERLINK("{url}", "Repository")' - except Exception: - pass - - # Check README for github/repo URLs - try: - from huggingface_hub import hf_hub_download - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'(https?://(?:github\.com|gitlab\.com)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) - if match: - url = match.group(1).rstrip('*`[]()]}') - return f'=HYPERLINK("{url}", "Repository")' - except Exception: - pass - - return "No" - -def get_paper_link(repo, repo_type: str) -> str: - try: - card_data = getattr(repo, "cardData", None) - if isinstance(card_data, dict) and card_data.get("paper"): - return f'=HYPERLINK("{card_data.get("paper")}", "Paper")' - except Exception: - pass - - # Check README for arxiv/paper URLs - try: - from huggingface_hub import hf_hub_download - readme_path = hf_hub_download( - repo_id=repo.id, - filename="README.md", - repo_type=repo_type, - token=os.getenv("HF_TOKEN") - ) - with open(readme_path, 'r', encoding='utf-8') as f: - readme_text = f.read() - match = re.search(r'(https?://(?:arxiv\.org|doi\.org)[^\s\n)}\]]+)', readme_text, re.IGNORECASE) - if match: - url = match.group(1).rstrip('*`[]()]}') - return f'=HYPERLINK("{url}", "Paper")' - except Exception: - pass - - return "No" def get_model_card_field(repo, key: str) -> str: try: - value = repo.cardData.get(key, "") + value = repo.card_data.get(key, "") # Convert to string if it's a list or other type if isinstance(value, list): return ", ".join(str(v) for v in value) @@ -167,24 +138,44 @@ def 
get_model_card_field(repo, key: str) -> str: except Exception: return "N/A" -def get_associated_assets(repo) -> str: +def get_associated_datasets(repo) -> str: try: - related = [tag for tag in repo.tags if tag.startswith(("dataset:", "model:", "space:"))] - return ", ".join(related) + datasets = [tag.replace("dataset:", "") for tag in repo.tags if tag.startswith("dataset:")] + return ", ".join(datasets) if datasets else "No" except Exception: - return "N/A" + return "No" -def clean_description(desc: str) -> str: - if not desc: - return "N/A" +def get_associated_models(api, repo, repo_type) -> str: + found = [tag.replace("model:", "") for tag in getattr(repo, "tags", []) if tag.startswith("model:")] - # remove HTML tags - desc = re.sub(r"<[^>]+>", "", desc) - - # collapse multiple newlines/tabs/spaces into a single space - desc = re.sub(r"\s+", " ", desc) - - return desc.strip() + # If it's a dataset, search for models that use this dataset + if repo_type == "dataset": + try: + related_models = api.list_models(filter=f"datasets:{repo.id}") + for m in related_models: + if m.id not in found: + found.append(m.id) + except Exception: + pass + + return ", ".join(found) if found else "No" + +def get_associated_spaces(api, repo, repo_type) -> str: + # 1. Check direct tags in the model card (Upstream) + found = [tag.replace("space:", "") for tag in getattr(repo, "tags", []) if tag.startswith("space:")] + + # 2. If it's a model, search for Spaces that use this model (Downstream) + if repo_type == "model": + try: + # Search for spaces that list this model ID + related_spaces = api.list_spaces(filter=f"models:{repo.id}") + for s in related_spaces: + if s.id not in found: + found.append(s.id) + except Exception: + pass + + return ", ".join(found) if found else "No" def get_doi(repo) -> str: try: @@ -196,7 +187,52 @@ def get_doi(repo) -> str: return "No" -def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: +def extract_link_from_text(text, label): + if not text: + return "No" + + # 1. THE AGGRESSIVE SEARCH + # We look for the Label, a colon, and then we specifically look for an http(s) link. + # This ignores leading spaces/bullets and stops at the end of the URL. + url_pattern = rf"{label}:\s*(https?://[^\s\)\"\'\>]+)" + match = re.search(url_pattern, text, re.IGNORECASE) + + if match: + url = match.group(1).strip().rstrip('.,)]') + return f'=HYPERLINK("{url}", "{label}")' + + # 2. FALLBACK: Look for the label followed by ANY text (for non-URL repos) + text_pattern = rf"{label}:\s*([^\r\n]+)" + match = re.search(text_pattern, text, re.IGNORECASE) + if match: + content = match.group(1).strip() + # Clean up markdown junk + content = re.sub(r'[*_`\[\]]', '', content) + if content.lower() not in ["no", "n/a", "none", ""]: + return content + + return "No" + +def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: + + # 1. Download README once + readme_text = "" + try: + # Debug print + tqdm.write(f"--- Debugging README for: {repo.id} ---") + + path = hf_hub_download( + repo_id=repo.id, + filename="README.md", + repo_type=repo_type, + token=os.getenv("HF_TOKEN") + ) + with open(path, 'r', encoding='utf-8') as f: + readme_text = f.read() + + except Exception as e: + tqdm.write(f"!!! 
Failed to download README for {repo.id}: {e}") + if repo_type == "dataset": display_id = f"datasets/{repo.id}" elif repo_type == "space": @@ -207,21 +243,23 @@ def get_repo_info(repo, repo_type: str) -> dict[str, str | int]: return { "Repository Name": f'=HYPERLINK("{get_repo_url(repo, repo_type)}", "{display_id}")', "Repository Type": repo_type, - "Description": clean_description(getattr(repo, "description", "")), + "Description": get_model_card_field(repo, "model_description") or "N/A", "Date Created": repo.created_at.strftime("%Y-%m-%d") if getattr(repo, "created_at", False) else "N/A", "Last Updated": repo.lastModified.strftime("%Y-%m-%d") if getattr(repo, "lastModified", False) else "N/A", - "Created By": repo.author, - "Top 4 Contributors/Curators": "test", + "Created By": get_author(api, repo.id, repo_type), + "Top 4 Contributors/Curators": get_top_contributors(api, repo.id, repo_type), "Likes": getattr(repo, "likes", "N/A"), "# of Open PRs": "test", "README": "Yes" if getattr(repo, "cardData", False) else "No", "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", "Inactive": is_inactive(repo), - "Homepage": get_homepage_link(repo, repo_type), - "Repo": get_repo_link(repo, repo_type), - "Paper": get_paper_link(repo, repo_type), - "Associated data, models, or spaces": get_associated_assets(repo), + "Homepage": extract_link_from_text(readme_text, "Homepage"), + "Repo": extract_link_from_text(readme_text, "Repository"), + "Paper": extract_link_from_text(readme_text, "Paper"), + "Associated Datasets": get_associated_datasets(repo), + "Associated Models": get_associated_models(api, repo, repo_type), + "Associated Spaces": get_associated_spaces(api, repo, repo_type), "DOI": get_doi(repo), } @@ -317,12 +355,9 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - "Visibility", - "Inactive", "Homepage", "Repo", "Paper", - "Associated data, models, or spaces", } orange_columns = { @@ -406,7 +441,7 @@ def main(): for repo, repo_type in tqdm(repos, desc=f"Fetching HF repos from {ORG_NAME}...", unit="repo", colour="green", ncols=100, **tqdm_kwargs): try: - info = get_repo_info(repo, repo_type) + info = get_repo_info(api, repo, repo_type) data.append(info) tqdm.write(f"Fetched info for /{repo.id}") except Exception as e: From 5a112f72c0d7f29b5819825b83610e5a5b090b4c Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 16:44:20 -0500 Subject: [PATCH 11/16] Fix homepage, repo, and paper hyperlinking --- hf_repo_exporter.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 77b1c5a..54b5c97 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -191,26 +191,32 @@ def extract_link_from_text(text, label): if not text: return "No" - # 1. THE AGGRESSIVE SEARCH - # We look for the Label, a colon, and then we specifically look for an http(s) link. - # This ignores leading spaces/bullets and stops at the end of the URL. - url_pattern = rf"{label}:\s*(https?://[^\s\)\"\'\>]+)" - match = re.search(url_pattern, text, re.IGNORECASE) + # Pattern to find 'Label: ...' anywhere in the text + # Added \b to ensure we match the exact word + pattern = rf"\b{label}\b:\s*([^\r\n]+)" + match = re.search(pattern, text, re.IGNORECASE) - if match: - url = match.group(1).strip().rstrip('.,)]') - return f'=HYPERLINK("{url}", "{label}")' - - # 2. 
FALLBACK: Look for the label followed by ANY text (for non-URL repos) - text_pattern = rf"{label}:\s*([^\r\n]+)" - match = re.search(text_pattern, text, re.IGNORECASE) if match: content = match.group(1).strip() - # Clean up markdown junk - content = re.sub(r'[*_`\[\]]', '', content) - if content.lower() not in ["no", "n/a", "none", ""]: - return content - + # Remove common markdown junk: *, _, `, [, ] + content = re.sub(r'[*_`\[\]]', '', content).strip() + + # Filter out placeholders + if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "-->"]: + return "No" + + # If it contains an http link, create a clean HYPERLINK formula + if "http" in content.lower(): + url_match = re.search(r'(https?://[^\s)]+)', content) + if url_match: + url = url_match.group(1).rstrip('.,)]') + # Use the text before the '(' as the label, or the default label + display_text = content.split('(')[0].strip() or label + # Double up quotes for Google Sheets formula safety + display_text = display_text.replace('"', '""') + return f'=HYPERLINK("{url}", "{display_text}")' + + return content return "No" def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: From 016ebd11846bf04d18efb4a04381820ed914da18 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan <62574332+balajiRRK@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:09:48 -0500 Subject: [PATCH 12/16] Update hf_repo_exporter.py Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com> --- hf_repo_exporter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 54b5c97..c6ba0ba 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -361,7 +361,6 @@ def get_column_index(col_name: str): red_columns = { "README", "License", - "Homepage", "Repo", "Paper", } From 1075f3d29aae426cf7f2a399a36104031fd41a3c Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:15:37 -0500 Subject: [PATCH 13/16] Add OpenPRs, WIP fix assoc. datasets,models,spaces --- hf_repo_exporter.py | 103 +++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index 54b5c97..838e197 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -80,6 +80,19 @@ def get_top_contributors(api, repo_id, repo_type) -> str: # Optional: tqdm.write(f"Error for {repo_id}: {e}") return ORG_NAME +def get_open_pr_count(api, repo_id, repo_type) -> int: + try: + # Fetch discussions filtered by pull_requests only + discussions = api.get_repo_discussions( + repo_id=repo_id, + repo_type=repo_type + ) + # Count how many are pull requests AND are currently open + open_prs = [d for d in discussions if d.is_pull_request and d.status == "open"] + return len(open_prs) + except Exception: + return 0 + def get_license(repo) -> str: # 1. 
cardData try: @@ -140,42 +153,64 @@ def get_model_card_field(repo, key: str) -> str: def get_associated_datasets(repo) -> str: try: - datasets = [tag.replace("dataset:", "") for tag in repo.tags if tag.startswith("dataset:")] + # Looking for tags like 'dataset:user/repo' + datasets = [tag.replace("dataset:", "") for tag in getattr(repo, "tags", []) if tag.startswith("dataset:")] return ", ".join(datasets) if datasets else "No" except Exception: return "No" - + def get_associated_models(api, repo, repo_type) -> str: - found = [tag.replace("model:", "") for tag in getattr(repo, "tags", []) if tag.startswith("model:")] - - # If it's a dataset, search for models that use this dataset + found = [] + repo_id = getattr(repo, 'id', str(repo)) + if repo_type == "dataset": + tqdm.write(f"--- Searching Models for Dataset: {repo_id} ---") try: - related_models = api.list_models(filter=f"datasets:{repo.id}") - for m in related_models: - if m.id not in found: - found.append(m.id) - except Exception: - pass - + # Again, using 'search' to find any model mentioning this dataset + related_models = list(api.list_models(search=repo_id)) + if related_models: + found = [m.id for m in related_models if m.id != repo_id] + tqdm.write(f" [Search] Found {len(found)} models") + except Exception as e: + tqdm.write(f" [Search] Error: {e}") + return ", ".join(found) if found else "No" + +def get_associated_spaces(api, repo_id) -> str: + found = set() -def get_associated_spaces(api, repo, repo_type) -> str: - # 1. Check direct tags in the model card (Upstream) - found = [tag.replace("space:", "") for tag in getattr(repo, "tags", []) if tag.startswith("space:")] + # Ensure we are using the clean string ID + clean_id = repo_id.id if hasattr(repo_id, 'id') else str(repo_id) - # 2. If it's a model, search for Spaces that use this model (Downstream) - if repo_type == "model": - try: - # Search for spaces that list this model ID - related_spaces = api.list_spaces(filter=f"models:{repo.id}") - for s in related_spaces: - if s.id not in found: - found.append(s.id) - except Exception: - pass + try: + # 1. Broad Metadata Search + # We search specifically for the ID in the 'models' metadata field + # Note: We use list() to ensure the generator is fully exhausted + spaces_by_model = list(api.list_spaces(filter=f"models:{clean_id}")) + for s in spaces_by_model: + found.add(s.id) - return ", ".join(found) if found else "No" + # 2. String-based Search (The "Catch-all") + # This finds spaces that mention it but didn't use the standard YAML format + spaces_by_search = list(api.list_spaces(search=clean_id)) + for s in spaces_by_search: + found.add(s.id) + + # 3. Handle specific Org-level associations + # Sometimes spaces are linked but not indexed under the ID + # Let's filter out the self-reference + if clean_id in found: + found.remove(clean_id) + + except Exception as e: + tqdm.write(f"Error for {clean_id}: {e}") + + if not found: + return "No" + + # Sort and format + sorted_found = sorted(list(found)) + return ", ".join(sorted_found) def get_doi(repo) -> str: try: @@ -224,9 +259,6 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: # 1. 
Download README once readme_text = "" try: - # Debug print - tqdm.write(f"--- Debugging README for: {repo.id} ---") - path = hf_hub_download( repo_id=repo.id, filename="README.md", @@ -255,7 +287,7 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: "Created By": get_author(api, repo.id, repo_type), "Top 4 Contributors/Curators": get_top_contributors(api, repo.id, repo_type), "Likes": getattr(repo, "likes", "N/A"), - "# of Open PRs": "test", + "# of Open PRs": get_open_pr_count(api, repo.id, repo_type), "README": "Yes" if getattr(repo, "cardData", False) else "No", "License": get_license(repo), "Visibility": "Private" if getattr(repo, "private", False) else "Public", @@ -265,7 +297,7 @@ def get_repo_info(api, repo, repo_type: str) -> dict[str, str | int]: "Paper": extract_link_from_text(readme_text, "Paper"), "Associated Datasets": get_associated_datasets(repo), "Associated Models": get_associated_models(api, repo, repo_type), - "Associated Spaces": get_associated_spaces(api, repo, repo_type), + "Associated Spaces": get_associated_spaces(api, repo), "DOI": get_doi(repo), } @@ -363,10 +395,13 @@ def get_column_index(col_name: str): "License", "Homepage", "Repo", - "Paper", + "Paper" } - orange_columns = { + yellow_columns = { + "Associated Datasets", + "Associated Models", + "Associated Spaces", "DOI" } @@ -374,7 +409,7 @@ def get_column_index(col_name: str): # Only loop over columns that need formatting for col_set, color in [(red_columns, {"red": 1, "green": 0.5, "blue": 0.5}), - (orange_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: + (yellow_columns, {"red": 1, "green": 0.8, "blue": 0.4})]: for col_name in col_set: col_index = get_column_index(col_name) From 4c1fb067497cf3f9de30495acc93bcdc786662e4 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:22:48 -0500 Subject: [PATCH 14/16] Update README with examples to run scripts locally --- README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index af9731b..14a3edf 100644 --- a/README.md +++ b/README.md @@ -110,10 +110,25 @@ Now update the script with [your GitHub Organization name](https://github.com/Im pip install -r requirements.txt ``` -5. Run the program - ``` - python export_repos.py - ``` +5. 
Run the exporters + + You can run **either exporter individually** or **both**, depending on your needs: + + - **Run only the GitHub repository exporter** + ``` + python gh_repo_exporter.py + ``` + + - **Run only the Hugging Face repository exporter** + ``` + python hf_repo_exporter.py + ``` + + - **Run both exporters (wait for one to finish before running the other)** + ``` + python hf_repo_exporter.py + python gh_repo_exporter.py + ``` ## Important Notes From 5e883c854e74104f21fd01ec72ae34386d740895 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Thu, 18 Dec 2025 18:32:25 -0500 Subject: [PATCH 15/16] Add TBD to filter word check --- hf_repo_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index fdcf5af..b0d43c6 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -237,7 +237,7 @@ def extract_link_from_text(text, label): content = re.sub(r'[*_`\[\]]', '', content).strip() # Filter out placeholders - if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "-->"]: + if content.upper() in ["N/A", "NONE", "", "NULL", "TBA", "COMING SOON", "IN PROGRESS", "TBD", "-->"]: return "No" # If it contains an http link, create a clean HYPERLINK formula From fd48c4bcdf6070974124ab0755b35eb04bf49284 Mon Sep 17 00:00:00 2001 From: Balaji Radhakrishnan Date: Fri, 19 Dec 2025 16:59:30 -0500 Subject: [PATCH 16/16] Update get_doi() to better detect DOIs --- hf_repo_exporter.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/hf_repo_exporter.py b/hf_repo_exporter.py index b0d43c6..1efce37 100644 --- a/hf_repo_exporter.py +++ b/hf_repo_exporter.py @@ -214,10 +214,23 @@ def get_associated_spaces(api, repo_id) -> str: def get_doi(repo) -> str: try: - for tag in repo.tags: - if tag.startswith("doi:"): - return tag.replace("doi:", "") - except Exception: + # 1. Check if the DOI is a direct attribute + if hasattr(repo, 'doi') and repo.doi: + return str(repo.doi).replace("doi:", "") + + # 2. Check the metadata dictionary if it exists + if hasattr(repo, 'card_data') and repo.card_data: + doi = repo.card_data.get('doi') + if doi: + return str(doi).replace("doi:", "") + + # 3. Fallback to the tags loop (for manually tagged DOIs) + if hasattr(repo, 'tags') and repo.tags: + for tag in repo.tags: + if isinstance(tag, str) and tag.lower().startswith("doi:"): + return tag.replace("doi:", "").replace("DOI:", "") + + except Exception as e: pass return "No"
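For a quick local sanity check of the README link-extraction behavior (the `extract_link_from_text` rewrite in patch 11 plus the placeholder filter from patch 15), the following sketch can be run from the repo root. It assumes the requirements are installed and `hf_repo_exporter.py` is importable; the sample README text and URLs are made up:

```
# Minimal check of extract_link_from_text(); sample text is hypothetical.
from hf_repo_exporter import extract_link_from_text

sample = """Homepage: https://example.org/project
Repository: https://github.com/Imageomics/some-repo
Paper: TBD
"""

# A bare URL becomes a HYPERLINK formula; the display text ends up being
# the URL itself, since everything after "Label:" is used as the label.
print(extract_link_from_text(sample, "Homepage"))
# =HYPERLINK("https://example.org/project", "https://example.org/project")

# "TBD" is caught by the placeholder filter added in patch 15.
print(extract_link_from_text(sample, "Paper"))
# No
```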
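The DOI resolution order introduced in patch 16 (direct `doi` attribute, then `card_data`, then `doi:` tags) can be exercised the same way with a stub object; the attribute names mirror what `get_doi()` reads, and the DOI values below are invented for illustration:

```
# get_doi() falls through: .doi attribute -> card_data -> tags.
from types import SimpleNamespace
from hf_repo_exporter import get_doi

stub = SimpleNamespace(
    doi=None,                               # step 1: empty, skipped
    card_data={"doi": "10.57967/hf/0000"},  # step 2: found here, wins
    tags=["doi:10.57967/hf/9999"],          # step 3: never reached
)
print(get_doi(stub))  # 10.57967/hf/0000
```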
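Likewise for the one-year inactivity cutoff: after patch 09, `is_inactive()` expects `lastModified` to be a `datetime` (naive values are treated as UTC) rather than an ISO string, which a stub makes easy to verify:

```
# is_inactive() flags repos whose lastModified is more than 365 days old.
from datetime import datetime, timezone
from types import SimpleNamespace
from hf_repo_exporter import is_inactive

fresh = SimpleNamespace(lastModified=datetime.now(timezone.utc))
stale = SimpleNamespace(lastModified=datetime(2020, 1, 1, tzinfo=timezone.utc))

print(is_inactive(fresh))  # No
print(is_inactive(stale))  # Yes
```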