From 5e94b9652df4b6dad711beaffd4e061d3c098c5d Mon Sep 17 00:00:00 2001 From: sean1572 Date: Tue, 13 Jan 2026 15:48:53 -0800 Subject: [PATCH 1/2] Inital commit of label studio exporter --- .gitignore | 2 +- .../label_studio_exporter/README.md | 24 ++ data_exporters/label_studio_exporter/demo.py | 47 ++++ .../label_studio_exporter/label_studio.py | 231 ++++++++++++++++++ .../label_studio_exporter/template.xml | 12 + 5 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 data_exporters/label_studio_exporter/README.md create mode 100644 data_exporters/label_studio_exporter/demo.py create mode 100644 data_exporters/label_studio_exporter/label_studio.py create mode 100644 data_exporters/label_studio_exporter/template.xml diff --git a/.gitignore b/.gitignore index 30db9cd..a31c918 100644 --- a/.gitignore +++ b/.gitignore @@ -237,4 +237,4 @@ demos/ # Block predictions predictions/* *.pkl -*.arrow \ No newline at end of file +*.arrow diff --git a/data_exporters/label_studio_exporter/README.md b/data_exporters/label_studio_exporter/README.md new file mode 100644 index 0000000..088436a --- /dev/null +++ b/data_exporters/label_studio_exporter/README.md @@ -0,0 +1,24 @@ +# Labeling Audio Data in Label Studio + +This pipeline is intended to take audio datasets from the inference or the data_extractors of `whoot_model_training` and format projects in Label Studio to easily verify annotations. 
The following readme outlines best practices for creating and integrating with Label Studio + + +## Creating new project + +1) Create a .env file with the following properties in the same folder that this readme is in + +``` +# Define the URL where Label Studio is accessible +LABEL_STUDIO_URL = 'HOSTNAME OF LABEL STUDIO INSTANCE' +# API key is available at the Account & Settings page in Label Studio UI +LABEL_STUDIO_API_KEY = 'INSERT YOUR API KEY' +LABEL_STUDIO_PROJECT_ID = 'PROJECT ID FROM URL OF PROJECT' +``` + +2) Create a new project in that Label Studio instance and upload the needed data to it + +NOTE: Save the project_id from the URL of the project + +If keeping the data local on the instance, try to keep the file structure the same as that of the audio files on your ML machine. For example, if some dataset is located at `mnt/datasets/audio_dataset_cool/AB/1/audio.wav` then you may want to make the path on Label Studio something like `label_studio_path/audio_dataset_cool/AB/1/audio.wav` for the easiest integration. Otherwise some minor file changes will be needed. 
+ +3) Run the script to apply annotations, see demo.py in this folder \ No newline at end of file diff --git a/data_exporters/label_studio_exporter/demo.py b/data_exporters/label_studio_exporter/demo.py new file mode 100644 index 0000000..b524fc9 --- /dev/null +++ b/data_exporters/label_studio_exporter/demo.py @@ -0,0 +1,47 @@ +"""Demo for using Label Studio Exporter with a sample dataset.""" + +from label_studio import LabelStudioSetup +from dotenv import load_dotenv +import os + +load_dotenv() + + +## SELECT A PROJECT FROM LABEL STUDIO +## FIND ID IN URL OF PROJECT +PROJECT_ID = int(os.getenv("LABEL_STUDIO_PROJECT_ID")) +ls_setup = LabelStudioSetup( + current_project=PROJECT_ID +) +## ADD DEFAULT TEMPLATE TO LABEL STUDIO +ls_setup.apply_custom_template("template.xml") + +## HOW TO GET AUDIO FILES TO REVIEW +# Note this is not a perfect process as diffrences between label studio and your dataset may exist +ls_setup.get_files(ls_file_parent='data/local-files/?d=data1/') + +## TODO MAKE SURE YOUR DATASET AND LABEL STUDIO FILE PATHS ALIGN + +#=============================================================== +## below is a fake dataset creation for demo purposes only +## In practice you would load your dataset from the saves in whoot_model_training +import datasets +import random + +class_list = ['cluck','coocoo', 'twitter', 'alarm', 'chick begging', 'no_buow'] + +ds = datasets.Dataset.from_dict({ + "audio": ls_setup.get_files(ls_file_parent='data/local-files/?d=data1/')["files"], + "labels": random.choices(class_list, k=len(ls_setup.get_files(ls_file_parent='data/local-files/?d=data1/')["files"])) +}) + +ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000, decode=False)) + +#=============================================================== + + + +## UPLOAD DATASET TO LABEL STUDIO +ls_setup.update_tasks_in_ls(ds, ls_file_parent='data/local-files/?d=data1/', is_model_prediction=True) + + diff --git a/data_exporters/label_studio_exporter/label_studio.py 
b/data_exporters/label_studio_exporter/label_studio.py new file mode 100644 index 0000000..cf53016 --- /dev/null +++ b/data_exporters/label_studio_exporter/label_studio.py @@ -0,0 +1,231 @@ +from http import client +import os +import requests +from dotenv import load_dotenv +from label_studio_sdk import LabelStudio +import tqdm +from label_studio_sdk.label_interface.objects import PredictionValue, AnnotationValue +import datasets + +class LabelStudioSetup(): + """Sets up a Label Studio project for annotation. + + When submoduling, primarly do so for diffrent labeling templates. In particular, + - apply_audio_template + - default_template_annotation_style + + These will be template spefific. Currently, they mirror the template found in + data_exporters/label_studio_exporter/template.xml + """ + + def __init__(self, current_project = None): + """Initialize the Label Studio client and create a project.""" + load_dotenv() + LABEL_STUDIO_URL = os.getenv("LABEL_STUDIO_URL") + LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY") + + + if LABEL_STUDIO_URL is None or LABEL_STUDIO_API_KEY is None: + raise ValueError("LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY must be set in the .env file.") + + self.client = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=LABEL_STUDIO_API_KEY) + + if current_project is not None: + self.current_project = self.client.projects.get(current_project) + print("Project ID:", self.current_project.id, "\t Project Name:", self.current_project.title) + input("Double check this, this script can take destructive actions. Press Enter to continue...") + + self.api_key = LABEL_STUDIO_API_KEY + self.base_url = LABEL_STUDIO_URL + + def create_project(self, title: str = "Whoot Audio Annotation Project"): + """Create a new project in Label Studio. + + Args: + title (str): The title of the project. + + Returns: + project: The created Label Studio project. 
+ """ + project = self.client.projects.create( + title=title, + label_config=self.label_config + ) + + print("Project ID:", project.id, project.title) + input("Double check this, this script can take destructive actions. Press Enter to continue...") + + # Associate this class instance with the created project + self.current_project = project + + return project + + def apply_audio_template(self, class_names: list = None): + """Apply a default audio annotation template. + + Args: + class_names (list): List of class names for labeling. + """ + + if self.current_project is None: + raise ValueError("No current project set. Please create a project first.") + + audio_template = """ + + + """ + + for class_name in class_names: + audio_template += f' +