5 changes: 5 additions & 0 deletions aobench/Makefile
@@ -3,6 +3,11 @@
clean:
rm -rf ./scenario-server/__pycache__
rm -rf ./scenario-server/src/scenario_server/__pycache__
rm -rf ./scenario-server/src/scenario_server/handlers/__pycache__
rm -rf ./scenario-server/src/scenario_server/handlers/aob/__pycache__
rm -rf ./scenario-server/src/scenario_server/handlers/aob_iot/__pycache__
rm -rf ./scenario-server/src/scenario_server/handlers/aob_tsfm/__pycache__
rm -rf ./scenario-server/src/scenario_server/handlers/aob_workorders/__pycache__
rm -rf ./src/scenario-server/__pycache__


1 change: 1 addition & 0 deletions aobench/pyproject.toml
@@ -5,6 +5,7 @@ description = "Asset operations benchmarking"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"huggingface-hub>=0.35.3",
"scenario-client",
"scenario-server",
]
13 changes: 12 additions & 1 deletion aobench/scenario-server/src/scenario_server/app.py
@@ -14,6 +14,10 @@
scenario_types,
set_tracking_uri,
)
from scenario_server.handlers.aob.aob import AOBScenarios
from scenario_server.handlers.aob_iot.aob_iot import AOBIoTScenarios
from scenario_server.handlers.aob_tsfm.aob_tsfm import AOBTSFMScenarios
from scenario_server.handlers.aob_workorders.aob_workorders import AOBWorkOrderScenarios

logger: logging.Logger = logging.getLogger("scenario-server")

@@ -52,7 +56,14 @@ def get_app(
register_scenario_handlers(handlers=handlers)

if include_default_handlers:
register_scenario_handlers(handlers=[])
register_scenario_handlers(
handlers=[
AOBScenarios,
AOBIoTScenarios,
AOBTSFMScenarios,
AOBWorkOrderScenarios,
]
)

app = Litestar(
debug=True,
1 change: 1 addition & 0 deletions aobench/scenario-server/src/scenario_server/entities.py
@@ -12,6 +12,7 @@ class ScenarioType:
class Scenario:
id: str
query: str
metadata: dict


@dataclass
156 changes: 156 additions & 0 deletions aobench/scenario-server/src/scenario_server/handlers/aob/aob.py
@@ -0,0 +1,156 @@
import json
import logging

from huggingface_hub import hf_hub_download
from scenario_server.entities import (
Scenario,
ScenarioType,
SubmissionAnswer,
SubmissionScore,
)
from scenario_server.grading import evaluation_agent
from scenario_server.handlers.scenario_handler import ScenarioHandler

logger: logging.Logger = logging.getLogger("scenario-server")

HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"


class AOBScenarios(ScenarioHandler):
id = "d3bec9b0-59b4-4a2f-9497-28cb1eed1c80"
title = "Asset Operations Bench - General"
description = "Human-authored evaluation prompts for industrial asset agents."

def __init__(self):
self.scenario_data = dict()
try:
cache: str = hf_hub_download(
repo_id=HUGGINGFACE_REPO,
filename=HUGGINGFACE_DATA,
repo_type="dataset",
)

with open(cache, "r") as f:
scenario_data = [json.loads(line) for line in f]

for sd in scenario_data:
if ("type" in sd and sd["type"].lower() == "") or "type" not in sd:
self.scenario_data[str(sd["id"])] = sd

except Exception as e:
logger.error(f"failed to init AOBScenarios: {e=}")

def _grade_answer(self, entry_id, answer) -> SubmissionScore:
try:
unwrap = json.loads(answer)

c = self.scenario_data[entry_id]["characteristic_form"]
q = self.scenario_data[entry_id]["text"]
r = unwrap["result"]
t = unwrap["trace"]

result, details = evaluation_agent(
actual=r,
charactistic=c,
query=q,
trace=t,
)

return SubmissionScore(
scenario_id=entry_id,
correct=result,
details=details,
)
except Exception as e:
logger.error(f"failed to grade {entry_id=} : {e=}")
logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
return SubmissionScore(
scenario_id=entry_id,
correct=False,
details=[{"error": f"failed to grade scenario id: {entry_id}"}],
)

def scenario_type(self) -> ScenarioType:
return ScenarioType(id=self.id, title=self.title, description=self.description)

def fetch_scenarios(self) -> list[Scenario]:
scenarios = []

for k, v in self.scenario_data.items():
try:
metadata = dict()

if "type" in v:
metadata["type"] = v["type"]

if "category" in v:
metadata["category"] = v["category"]

scenarios.append(
Scenario(
id=str(k),
query=v["text"],
metadata=metadata,
)
)
except Exception as e:
logger.error(f"failed to process {k}, {v} : {e=}")

return scenarios

async def grade_responses(
self, submission: list[SubmissionAnswer]
) -> list[SubmissionScore]:

grade = []
for entry in submission:
try:
entry_id: str = entry.scenario_id
except Exception as e:
logger.error(f"missing scenario id: {entry=}")
continue

if entry_id not in self.scenario_data:
grade.append(
SubmissionScore(
scenario_id=entry_id,
correct=False,
details=[{"error": f"unknown scenario id: {entry_id}"}],
)
)
continue

g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
grade.append(g)

return grade


if __name__ == "__main__":
import asyncio

aobs = AOBScenarios()
submission = [
SubmissionAnswer(
scenario_id="Q.S5",
answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
),
SubmissionAnswer(
scenario_id="501",
answer="",
),
SubmissionAnswer(
scenario_id="501",
answer=json.dumps(
{
"trace": "query database for iot data",
"result": [],
}
),
),
]
grade: list[SubmissionScore] = asyncio.run(
aobs.grade_responses(submission=submission)
)
print(f"{grade=}")
153 changes: 153 additions & 0 deletions aobench/scenario-server/src/scenario_server/handlers/aob_iot/aob_iot.py
@@ -0,0 +1,153 @@
import json
import logging

from huggingface_hub import hf_hub_download
from scenario_server.entities import (
Scenario,
ScenarioType,
SubmissionAnswer,
SubmissionScore,
)
from scenario_server.grading import evaluation_agent
from scenario_server.handlers.scenario_handler import ScenarioHandler

logger: logging.Logger = logging.getLogger("scenario-server")

HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"


class AOBIoTScenarios(ScenarioHandler):
id = "b3aa206a-f7dc-43c9-a1f4-dcf984417487"
title = "Asset Operations Bench - IoT"
description = "Human-authored evaluation prompts for industrial asset agents."

def __init__(self):
self.scenario_data = dict()
try:
cache: str = hf_hub_download(
repo_id=HUGGINGFACE_REPO,
filename=HUGGINGFACE_DATA,
repo_type="dataset",
)

with open(cache, "r") as f:
scenario_data = [json.loads(line) for line in f]

for sd in scenario_data:
if "type" in sd and sd["type"].lower() == "iot":
self.scenario_data[str(sd["id"])] = sd

except Exception as e:
logger.error(f"failed to init AOBScenarios: {e=}")

def _grade_answer(self, entry_id, answer) -> SubmissionScore:
try:
unwrap = json.loads(answer)

c = self.scenario_data[entry_id]["characteristic_form"]
q = self.scenario_data[entry_id]["text"]
r = unwrap["result"]
t = unwrap["trace"]

result, details = evaluation_agent(
actual=r,
charactistic=c,
query=q,
trace=t,
)

return SubmissionScore(
scenario_id=entry_id,
correct=result,
details=details,
)
except Exception as e:
logger.error(f"failed to grade {entry_id=} : {e=}")
logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
return SubmissionScore(
scenario_id=entry_id,
correct=False,
details=[{"error": f"failed to grade scenario id: {entry_id}"}],
)

def scenario_type(self) -> ScenarioType:
return ScenarioType(id=self.id, title=self.title, description=self.description)

def fetch_scenarios(self) -> list[Scenario]:
scenarios = []

for k, v in self.scenario_data.items():
try:
metadata = dict()

if "category" in v:
metadata["category"] = v["category"]

scenarios.append(
Scenario(
id=str(k),
query=v["text"],
metadata=metadata,
)
)
except Exception as e:
logger.error(f"failed to process {k}, {v} : {e=}")

return scenarios

async def grade_responses(
self, submission: list[SubmissionAnswer]
) -> list[SubmissionScore]:

grade = []
for entry in submission:
try:
entry_id: str = entry.scenario_id
except Exception as e:
logger.error(f"missing scenario id: {entry=}")
continue

if entry_id not in self.scenario_data:
grade.append(
SubmissionScore(
scenario_id=entry_id,
correct=False,
details=[{"error": f"unknown scenario id: {entry_id}"}],
)
)
continue

g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
grade.append(g)

return grade


if __name__ == "__main__":
import asyncio

aobs = AOBIoTScenarios()
submission: list[SubmissionAnswer] = [
SubmissionAnswer(
scenario_id="Q.S5",
answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
),
SubmissionAnswer(
scenario_id="2",
answer="",
),
SubmissionAnswer(
scenario_id="2",
answer=json.dumps(
{
"trace": "query database for iot sites",
"result": ["Downtown", "Uptown"],
}
),
),
]
grade: list[SubmissionScore] = asyncio.run(
aobs.grade_responses(submission=submission)
)
print(f"{grade=}")