From 5554e92d4f4846550ff9cf5ef3264598f7331199 Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 16 Feb 2024 11:47:11 +0100 Subject: [PATCH 01/13] Update CPU and memory details by switching to lshw method instead of cpuinfo Add CPU information (model, vendor, frequency) and memory details (clock, size, type) to API --- docker/vm_supervisor-dev.dockerfile | 2 +- packaging/aleph-vm/DEBIAN/control | 2 +- src/aleph/vm/orchestrator/machine.py | 77 +++++++++++++++++++++++++ src/aleph/vm/orchestrator/resources.py | 63 ++++++++++++++++++-- src/aleph/vm/orchestrator/supervisor.py | 1 + 5 files changed, 139 insertions(+), 6 deletions(-) create mode 100644 src/aleph/vm/orchestrator/machine.py diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile index e78a02ec1..2d9e74eed 100644 --- a/docker/vm_supervisor-dev.dockerfile +++ b/docker/vm_supervisor-dev.dockerfile @@ -5,7 +5,7 @@ FROM debian:bookworm RUN apt-get update && apt-get -y upgrade && apt-get install -y \ sudo acl curl squashfs-tools git \ python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\ - python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \ + python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \ && rm -rf /var/lib/apt/lists/* RUN useradd jailman diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 6b42eea41..e2e73f8f0 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,python3-jwcrypto +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw,python3-jwcrypto Section: aleph-im Priority: Extra diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py new file mode 100644 index 000000000..9522f634b --- /dev/null +++ b/src/aleph/vm/orchestrator/machine.py @@ -0,0 +1,77 @@ +import json +import re +import subprocess +from functools import lru_cache + +import psutil + + +@lru_cache +def get_hardware_info(): + lshw = subprocess.Popen(["lshw", "-sanitize", "-json"], stdout=subprocess.PIPE, shell=False) + output, _ = lshw.communicate() + data = json.loads(output) + + hw_info = {} + + for hw in data["children"][0]["children"]: + if hw["id"] == "cpu": + hw_info["cpu"] = hw + elif hw["class"] == "memory" and hw["id"] == "memory": + hw_info["memory"] = hw + + return hw_info + + +@lru_cache +def get_cpu_info(): + hw = get_hardware_info() + + cpu_info = hw["cpu"] + architecture = cpu_info["width"] + + if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]: + architecture = "x86_64" + elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]: + architecture = "arm64" + + vendor = cpu_info["vendor"] + # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308 + + if "Intel Corp" in vendor: + vendor = "GenuineIntel" + elif "Advanced Micro Devices [AMD]" in vendor: + vendor = "AuthenticAMD" + + return { + "architecture": architecture, + "vendor": vendor, + "model": cpu_info["product"], + "frequency": cpu_info["capacity"], + "count": psutil.cpu_count(), + } + + +@lru_cache +def get_memory_info(): + hw = get_hardware_info() + mem_info = hw["memory"] + + memory_type = "" + memory_clock = "" + + for bank in mem_info["children"]: + memory_clock = bank["clock"] + try: + memory_type = re.search("(DDR[2-6])", bank["description"]).group(0) + break + except: + pass + + return { + "size": mem_info["size"], + "units": mem_info["units"], + "type": memory_type, + "clock": memory_clock, + "clock_units": "Hz", + } diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 694c61bb2..0cd7c1566 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -2,9 +2,10 @@ from datetime import datetime, timezone from functools import lru_cache -import cpuinfo import psutil from aiohttp import web +from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import get_cpu_info, get_memory_info from aleph_message.models import ItemHash from aleph_message.models.execution.environment import CpuProperties from pydantic import BaseModel, Field @@ -90,6 +91,30 @@ class MachineUsage(BaseModel): active: bool = True +class ExtendedCpuProperties(CpuProperties): + """CPU properties.""" + + model: Optional[str] = Field(default=None, description="CPU model") + frequency: Optional[str] = Field(default=None, description="CPU frequency") + count: Optional[str] = Field(default=None, description="CPU count") + + + +class MemoryProperties(BaseModel): + """MEMORY properties.""" + + size: Optional[str] = Field(default=None, description="Memory size") + units: Optional[str] = Field(default=None, description="Memory size units") + type: Optional[str] = Field(default=None, description="Memory type") + clock: Optional[str] = Field(default=None, description="Memory clock") + clock_units: Optional[str] = Field(default=None, description="Memory clock units") + + +class MachineCapability(BaseModel): + cpu: ExtendedCpuProperties + memory: MemoryProperties + + def get_machine_gpus(request: web.Request) -> GpuProperties: pool: VmPool = request.app["vm_pool"] gpus = pool.gpus @@ -108,12 +133,35 @@ def get_machine_properties() -> MachineProperties: In the future, some properties may have to be fetched from within a VM. """ - cpu_info = cpuinfo.get_cpu_info() # Slow + cpu_info = get_cpu_info() return MachineProperties( cpu=CpuProperties( - architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")), - vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")), + architecture=cpu_info.get("architecture"], + vendor=cpu_info["vendor"], + ), + ) + + +@lru_cache +def get_machine_capability() -> MachineCapability: + cpu_info = get_cpu_info() + mem_info = get_memory_info() + + return MachineCapability( + cpu=ExtendedCpuProperties( + architecture=cpu_info["architecture", cpu_info.get("arch_string_raw")), + vendor=cpu_info.get("vendor"], + model=cpu_info["model"], + frequency=cpu_info["frequency"], + count=cpu_info["count"], + ), + memory=MemoryProperties( + size=mem_info["size"], + units=mem_info["units"], + type=mem_info["type"], + clock=mem_info["clock"], + clock_units=mem_info["clock_units", cpu_info.get("vendor_id_raw")), features=list( filter( None, @@ -173,6 +221,13 @@ async def about_certificates(request: web.Request): return web.FileResponse(await sev_client.get_certificates()) +async def about_capability(_: web.Request): + """Public endpoint to expose information about the CRN capability.""" + + capability: MachineCapability = get_machine_capability() + return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) + + class Allocation(BaseModel): """An allocation is the set of resources that are currently allocated on this orchestrator. It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs. diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 36bd42dad..d7c6f39e1 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -130,6 +130,7 @@ def setup_webapp(pool: VmPool | None): web.get("/about/usage/system", about_system_usage), web.get("/about/certificates", about_certificates), web.get("/about/config", about_config), + web.get("/about/capability", about_capability), # /control APIs are used to control the VMs and access their logs web.post("/control/allocation/notify", notify_allocation), web.post("/control/reserve_resources", operate_reserve_resources), From 98a19a5c3ea1cd35fe07af647ef1cc1c30db5aab Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 5 Apr 2024 11:58:37 +0200 Subject: [PATCH 02/13] Refactor get_hardware_info() to use async subprocess.run for improved concurrency avoid Try-Catch using assertions --- src/aleph/vm/orchestrator/machine.py | 38 ++++++++++++++++---------- src/aleph/vm/orchestrator/resources.py | 14 +++++----- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 9522f634b..cd1cdd805 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,18 +1,26 @@ +import asyncio import json import re -import subprocess from functools import lru_cache import psutil @lru_cache -def get_hardware_info(): - lshw = subprocess.Popen(["lshw", "-sanitize", "-json"], stdout=subprocess.PIPE, shell=False) - output, _ = lshw.communicate() +async def get_hardware_info(): + lshw = await asyncio.create_subprocess_shell( + "lshw -sanitize -json", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + + output, _ = await lshw.communicate() data = json.loads(output) - hw_info = {} + hw_info = { + "cpu": None, + "memory": None + } for hw in data["children"][0]["children"]: if hw["id"] == "cpu": @@ -24,8 +32,8 @@ def get_hardware_info(): @lru_cache -def get_cpu_info(): - hw = get_hardware_info() +async def get_cpu_info(): + hw = await get_hardware_info() cpu_info = hw["cpu"] architecture = cpu_info["width"] @@ -53,8 +61,8 @@ def get_cpu_info(): @lru_cache -def get_memory_info(): - hw = get_hardware_info() +async def get_memory_info(): + hw = await get_hardware_info() mem_info = hw["memory"] memory_type = "" @@ -62,11 +70,13 @@ def get_memory_info(): for bank in mem_info["children"]: memory_clock = bank["clock"] - try: - memory_type = re.search("(DDR[2-6])", bank["description"]).group(0) - break - except: - pass + if "description" in bank: + matched = re.search("(DDR[2-6])", bank["description"]) + if matched: + memory_type = matched.group(0) + break + else: + pass return { "size": mem_info["size"], diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 0cd7c1566..40ce0661b 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -127,14 +127,14 @@ def get_machine_gpus(request: web.Request) -> GpuProperties: @lru_cache -def get_machine_properties() -> MachineProperties: +async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... These should not change while the supervisor is running. In the future, some properties may have to be fetched from within a VM. """ - cpu_info = get_cpu_info() + cpu_info = await get_cpu_info() return MachineProperties( cpu=CpuProperties( architecture=cpu_info.get("architecture"], @@ -144,9 +144,9 @@ def get_machine_properties() -> MachineProperties: @lru_cache -def get_machine_capability() -> MachineCapability: - cpu_info = get_cpu_info() - mem_info = get_memory_info() +async def get_machine_capability() -> MachineCapability: + cpu_info = await get_cpu_info() + mem_info = await get_memory_info() return MachineCapability( cpu=ExtendedCpuProperties( @@ -202,7 +202,7 @@ async def about_system_usage(request: web.Request): start_timestamp=period_start, duration_seconds=60, ), - properties=machine_properties, + properties=await get_machine_properties(), gpu=get_machine_gpus(request), ) @@ -224,7 +224,7 @@ async def about_certificates(request: web.Request): async def about_capability(_: web.Request): """Public endpoint to expose information about the CRN capability.""" - capability: MachineCapability = get_machine_capability() + capability: MachineCapability = await get_machine_capability() return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) From eec6835df8ad8a5c7eeb777522f66a795a861c30 Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 5 Apr 2024 16:42:37 +0200 Subject: [PATCH 03/13] Fix code style --- src/aleph/vm/orchestrator/machine.py | 9 ++------- src/aleph/vm/orchestrator/resources.py | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index cd1cdd805..aeffb6e7b 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -9,18 +9,13 @@ @lru_cache async def get_hardware_info(): lshw = await asyncio.create_subprocess_shell( - "lshw -sanitize -json", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE + "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) output, _ = await lshw.communicate() data = json.loads(output) - hw_info = { - "cpu": None, - "memory": None - } + hw_info = {"cpu": None, "memory": None} for hw in data["children"][0]["children"]: if hw["id"] == "cpu": diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 40ce0661b..da0220266 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -99,7 +99,6 @@ class ExtendedCpuProperties(CpuProperties): count: Optional[str] = Field(default=None, description="CPU count") - class MemoryProperties(BaseModel): """MEMORY properties.""" From eaaa827bfc1918271b69024da7f0d22267994cd0 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 15:29:10 +0200 Subject: [PATCH 04/13] Remove duplicate cors headers --- src/aleph/vm/orchestrator/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index da0220266..0192284be 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -224,7 +224,7 @@ async def about_capability(_: web.Request): """Public endpoint to expose information about the CRN capability.""" capability: MachineCapability = await get_machine_capability() - return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) + return web.json_response(text=capability.json(exclude_none=False)) class Allocation(BaseModel): From e3473eda26055a454e17b58384b8780b5e0a903d Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:01:45 +0200 Subject: [PATCH 05/13] Problem: async funcs cannot be lru_cached Solution: Introduce our own decorator --- src/aleph/vm/orchestrator/machine.py | 11 ++---- src/aleph/vm/orchestrator/resources.py | 49 +++++++++++++++++++------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index aeffb6e7b..092b53cf2 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,12 +1,10 @@ import asyncio import json import re -from functools import lru_cache import psutil -@lru_cache async def get_hardware_info(): lshw = await asyncio.create_subprocess_shell( "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE @@ -26,10 +24,7 @@ async def get_hardware_info(): return hw_info -@lru_cache -async def get_cpu_info(): - hw = await get_hardware_info() - +def get_cpu_info(hw): cpu_info = hw["cpu"] architecture = cpu_info["width"] @@ -55,9 +50,7 @@ async def get_cpu_info(): } -@lru_cache -async def get_memory_info(): - hw = await get_hardware_info() +def get_memory_info(hw): mem_info = hw["memory"] memory_type = "" diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 0192284be..82b88b971 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,16 +1,21 @@ +import functools import math from datetime import datetime, timezone from functools import lru_cache import psutil from aiohttp import web -from aleph.vm.conf import settings -from aleph.vm.orchestrator.machine import get_cpu_info, get_memory_info from aleph_message.models import ItemHash from aleph_message.models.execution.environment import CpuProperties from pydantic import BaseModel, Field from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import ( + get_cpu_info, + get_hardware_info, + get_memory_info, +) +from aleph.vm.utils import cors_allow_all from aleph.vm.pool import VmPool from aleph.vm.resources import GpuDevice from aleph.vm.sevclient import SevClient @@ -125,32 +130,49 @@ def get_machine_gpus(request: web.Request) -> GpuProperties: ) -@lru_cache +machine_properties_cached = None + + +def async_cache(fn): + cache = {} + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + key = (args, frozenset(kwargs.items())) + if key not in cache: + cache[key] = await fn(*args, **kwargs) + return cache[key] + + return wrapper + + +@async_cache async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... These should not change while the supervisor is running. In the future, some properties may have to be fetched from within a VM. """ - - cpu_info = await get_cpu_info() + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) return MachineProperties( cpu=CpuProperties( - architecture=cpu_info.get("architecture"], + architecture=cpu_info.get("architecture"), vendor=cpu_info["vendor"], ), ) -@lru_cache +@async_cache async def get_machine_capability() -> MachineCapability: - cpu_info = await get_cpu_info() - mem_info = await get_memory_info() + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) + mem_info = get_memory_info(hw) return MachineCapability( cpu=ExtendedCpuProperties( - architecture=cpu_info["architecture", cpu_info.get("arch_string_raw")), - vendor=cpu_info.get("vendor"], + architecture=cpu_info["architecture", cpu_info.get("arch_string_raw")], + vendor=cpu_info.get("vendor"), model=cpu_info["model"], frequency=cpu_info["frequency"], count=cpu_info["count"], @@ -160,7 +182,7 @@ async def get_machine_capability() -> MachineCapability: units=mem_info["units"], type=mem_info["type"], clock=mem_info["clock"], - clock_units=mem_info["clock_units", cpu_info.get("vendor_id_raw")), + clock_units=mem_info["clock_units", cpu_info.get("vendor_id_raw")], features=list( filter( None, @@ -182,6 +204,7 @@ async def about_system_usage(request: web.Request): machine_properties = get_machine_properties() pool = request.app["vm_pool"] + machine_properties = await get_machine_properties() usage: MachineUsage = MachineUsage( cpu=CpuUsage( count=psutil.cpu_count(), @@ -201,7 +224,7 @@ async def about_system_usage(request: web.Request): start_timestamp=period_start, duration_seconds=60, ), - properties=await get_machine_properties(), + properties=machine_properties, gpu=get_machine_gpus(request), ) From e40ac8fc8fe7d38b2b0a240dd71069ed3ec9bbb3 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:04:36 +0200 Subject: [PATCH 06/13] fix lshw parsing for memory info --- src/aleph/vm/orchestrator/machine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 092b53cf2..55f1072ee 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -55,9 +55,8 @@ def get_memory_info(hw): memory_type = "" memory_clock = "" - for bank in mem_info["children"]: - memory_clock = bank["clock"] + memory_clock = bank.get("clock") if "description" in bank: matched = re.search("(DDR[2-6])", bank["description"]) if matched: @@ -71,5 +70,5 @@ def get_memory_info(hw): "units": mem_info["units"], "type": memory_type, "clock": memory_clock, - "clock_units": "Hz", + "clock_units": "Hz" if memory_clock is not None else "", } From 58acf9724adaefe0cad993d57e539c1bf9361745 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:05:18 +0200 Subject: [PATCH 07/13] Move async_cache decorator to utils module --- src/aleph/vm/orchestrator/resources.py | 47 ++++++++++++-------------- src/aleph/vm/utils/__init__.py | 15 ++++++++ 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 82b88b971..d6dbe297a 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,7 +1,6 @@ -import functools import math from datetime import datetime, timezone -from functools import lru_cache +from typing import Optional import psutil from aiohttp import web @@ -15,11 +14,11 @@ get_hardware_info, get_memory_info, ) -from aleph.vm.utils import cors_allow_all from aleph.vm.pool import VmPool from aleph.vm.resources import GpuDevice from aleph.vm.sevclient import SevClient from aleph.vm.utils import ( + async_cache, check_amd_sev_es_supported, check_amd_sev_snp_supported, check_amd_sev_supported, @@ -133,19 +132,6 @@ def get_machine_gpus(request: web.Request) -> GpuProperties: machine_properties_cached = None -def async_cache(fn): - cache = {} - - @functools.wraps(fn) - async def wrapper(*args, **kwargs): - key = (args, frozenset(kwargs.items())) - if key not in cache: - cache[key] = await fn(*args, **kwargs) - return cache[key] - - return wrapper - - @async_cache async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... @@ -157,8 +143,18 @@ async def get_machine_properties() -> MachineProperties: cpu_info = get_cpu_info(hw) return MachineProperties( cpu=CpuProperties( - architecture=cpu_info.get("architecture"), + architecture=cpu_info["architecture"], vendor=cpu_info["vendor"], + features=list( + filter( + None, + ( + "sev" if check_amd_sev_supported() else None, + "sev_es" if check_amd_sev_es_supported() else None, + "sev_snp" if check_amd_sev_snp_supported() else None, + ), + ) + ), ), ) @@ -171,18 +167,11 @@ async def get_machine_capability() -> MachineCapability: return MachineCapability( cpu=ExtendedCpuProperties( - architecture=cpu_info["architecture", cpu_info.get("arch_string_raw")], - vendor=cpu_info.get("vendor"), + architecture=cpu_info["architecture"], + vendor=cpu_info["vendor"], model=cpu_info["model"], frequency=cpu_info["frequency"], count=cpu_info["count"], - ), - memory=MemoryProperties( - size=mem_info["size"], - units=mem_info["units"], - type=mem_info["type"], - clock=mem_info["clock"], - clock_units=mem_info["clock_units", cpu_info.get("vendor_id_raw")], features=list( filter( None, @@ -194,6 +183,12 @@ async def get_machine_capability() -> MachineCapability: ) ), ), + memory=MemoryProperties( + size=mem_info["size"], + units=mem_info["units"], + type=mem_info["type"], + clock=mem_info["clock"], + ), ) diff --git a/src/aleph/vm/utils/__init__.py b/src/aleph/vm/utils/__init__.py index 62046184f..beaf288ef 100644 --- a/src/aleph/vm/utils/__init__.py +++ b/src/aleph/vm/utils/__init__.py @@ -1,5 +1,6 @@ import asyncio import dataclasses +import functools import hashlib import json import logging @@ -252,3 +253,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path return True return checksum(source) != checksum(destination) + + +def async_cache(fn): + """Simple async function cache decorator.""" + cache = {} + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + key = (args, frozenset(kwargs.items())) + if key not in cache: + cache[key] = await fn(*args, **kwargs) + return cache[key] + + return wrapper From 156ef90c212750ecc569a4b1e2508e166d91901c Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:20:45 +0200 Subject: [PATCH 08/13] Change router order it's not really important but it was somehow annoying me --- src/aleph/vm/orchestrator/supervisor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index d7c6f39e1..087352301 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -20,7 +20,7 @@ from aleph.vm.sevclient import SevClient from aleph.vm.version import __version__ -from .resources import about_certificates, about_system_usage +from .resources import about_capability, about_certificates, about_system_usage from .tasks import ( start_payment_monitoring_task, start_watch_for_messages_task, @@ -129,8 +129,8 @@ def setup_webapp(pool: VmPool | None): web.get("/about/executions/records", about_execution_records), web.get("/about/usage/system", about_system_usage), web.get("/about/certificates", about_certificates), - web.get("/about/config", about_config), web.get("/about/capability", about_capability), + web.get("/about/config", about_config), # /control APIs are used to control the VMs and access their logs web.post("/control/allocation/notify", notify_allocation), web.post("/control/reserve_resources", operate_reserve_resources), From 252873f10b48f1d5d300bcf3fe08f3ea481322c8 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Wed, 15 May 2024 15:40:25 +0200 Subject: [PATCH 09/13] Adapt the tests --- tests/supervisor/test_views.py | 159 +++++++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 8 deletions(-) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 025917379..8e28d9768 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -1,4 +1,3 @@ -import asyncio import tempfile from copy import deepcopy from pathlib import Path @@ -82,17 +81,161 @@ async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): assert resp["cpu"]["count"] > 0 +FAKE_SYSTEM_INFO = { + "cpu": { + "id": "cpu", + "class": "processor", + "claimed": True, + "handle": "DMI:0400", + "description": "CPU", + "product": "AMD EPYC 7763 64-Core Processor", + "vendor": "Advanced Micro Devices [AMD]", + "physid": "400", + "businfo": "cpu@0", + "version": "25.1.1", + "slot": "CPU 0", + "units": "Hz", + "size": 2000000000, + "capacity": 2000000000, + "width": 64, + "configuration": {"cores": "8", "enabledcores": "8", "microcode": "167776681", "threads": "1"}, + "capabilities": { + "fpu": "mathematical co-processor", + "fpu_exception": "FPU exceptions reporting", + "wp": True, + "vme": "virtual mode extensions", + "de": "debugging extensions", + "pse": "page size extensions", + "tsc": "time stamp counter", + "msr": "model-specific registers", + "pae": "4GB+ memory addressing (Physical Address Extension)", + "mce": "machine check exceptions", + "cx8": "compare and exchange 8-byte", + "apic": "on-chip advanced programmable interrupt controller (APIC)", + "sep": "fast system calls", + "mtrr": "memory type range registers", + "pge": "page global enable", + "mca": "machine check architecture", + "cmov": "conditional move instruction", + "pat": "page attribute table", + "pse36": "36-bit page size extensions", + "clflush": True, + "mmx": "multimedia extensions (MMX)", + "fxsr": "fast floating point save/restore", + "sse": "streaming SIMD extensions (SSE)", + "sse2": "streaming SIMD extensions (SSE2)", + "ht": "HyperThreading", + "syscall": "fast system calls", + "nx": "no-execute bit (NX)", + "mmxext": "multimedia extensions (MMXExt)", + "fxsr_opt": True, + "pdpe1gb": True, + "rdtscp": True, + "rep_good": True, + "nopl": True, + "cpuid": True, + "extd_apicid": True, + "tsc_known_freq": True, + "pni": True, + "pclmulqdq": True, + "ssse3": True, + "fma": True, + "cx16": True, + "pcid": True, + "sse4_1": True, + "sse4_2": True, + "x2apic": True, + "movbe": True, + "popcnt": True, + "tsc_deadline_timer": True, + "aes": True, + "xsave": True, + "avx": True, + "f16c": True, + "rdrand": True, + "hypervisor": True, + "lahf_lm": True, + "cmp_legacy": True, + "svm": True, + "cr8_legacy": True, + "abm": True, + "sse4a": True, + "misalignsse": True, + "3dnowprefetch": True, + "osvw": True, + "perfctr_core": True, + "invpcid_single": True, + "ssbd": True, + "ibrs": True, + "ibpb": True, + "stibp": True, + "vmmcall": True, + "fsgsbase": True, + "tsc_adjust": True, + "bmi1": True, + "avx2": True, + "smep": True, + "bmi2": True, + "erms": True, + "invpcid": True, + "rdseed": True, + "adx": True, + "clflushopt": True, + "clwb": True, + "sha_ni": True, + "xsaveopt": True, + "xsavec": True, + "xgetbv1": True, + "xsaves": True, + "clzero": True, + "xsaveerptr": True, + "wbnoinvd": True, + "arat": True, + "npt": True, + "nrip_save": True, + "umip": True, + "pku": True, + "vaes": True, + "vpclmulqdq": True, + "rdpid": True, + "fsrm": True, + "arch_capabilities": True, + }, + }, + "memory": { + "id": "memory", + "class": "memory", + "claimed": True, + "handle": "DMI:1000", + "description": "System Memory", + "physid": "1000", + "units": "bytes", + "size": 17179869184, + "configuration": {"errordetection": "multi-bit-ecc"}, + "capabilities": {"ecc": "Multi-bit error-correcting code (ECC)"}, + "children": [ + { + "id": "bank", + "class": "memory", + "claimed": True, + "handle": "DMI:1100", + "description": "DIMM RAM", + "vendor": "QEMU", + "physid": "0", + "slot": "DIMM 0", + "units": "bytes", + "size": 17179869184, + } + ], + }, +} + + @pytest.mark.asyncio async def test_system_usage_mock(aiohttp_client, mocker, mock_app_with_pool): """Test that the usage system endpoints response value. No auth needed""" - mocker.patch( - "cpuinfo.cpuinfo.get_cpu_info", - { - "arch_string_raw": "x86_64", - "vendor_id_raw": "AuthenticAMD", - }, - ) + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) mocker.patch( "psutil.getloadavg", lambda: [1, 2, 3], From 7b8bdf4f50c80d1cbdc510ea6dc92ae8d3ea1e44 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Wed, 15 May 2024 15:44:47 +0200 Subject: [PATCH 10/13] add test for /about/capability --- tests/supervisor/test_views.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 8e28d9768..c3fbe6c3f 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -256,6 +256,36 @@ async def test_system_usage_mock(aiohttp_client, mocker, mock_app_with_pool): assert resp["cpu"]["count"] == 200 +@pytest.mark.asyncio +async def test_system_capability_mock(aiohttp_client, mocker): + """Test that the capability system endpoints response value. No auth needed""" + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) + mocker.patch( + "psutil.getloadavg", + lambda: [1, 2, 3], + ) + mocker.patch( + "psutil.cpu_count", + lambda: 200, + ) + app = setup_webapp(pool=None) + client = await aiohttp_client(app) + response: web.Response = await client.get("/about/capability") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert resp == { + "cpu": { + "architecture": "x86_64", + "vendor": "AuthenticAMD", + "model": "AMD EPYC 7763 64-Core Processor", + "frequency": "2000000000", + "count": "200", + }, + "memory": {"size": "17179869184", "units": "bytes", "type": "", "clock": None, "clock_units": ""}, + } + + @pytest.mark.asyncio async def test_allocation_invalid_auth_token(aiohttp_client): """Test that the allocation endpoint fails when an invalid auth token is provided.""" From c681817b5a169e429642e4fbcd341d6ff8686359 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 17 May 2024 10:27:51 +0200 Subject: [PATCH 11/13] Fix dependencies in ci --- .github/workflows/test-using-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-using-pytest.yml b/.github/workflows/test-using-pytest.yml index cbb02e322..7abcbb904 100644 --- a/.github/workflows/test-using-pytest.yml +++ b/.github/workflows/test-using-pytest.yml @@ -25,7 +25,7 @@ jobs: run: | sudo apt-get update sudo apt-get -y upgrade - sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev + sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev lshw python3-jwcrypto pip install --upgrade typing-extensions types-PyYAML - name: Install required Python packages From bfdd26a17023a3be371fcd82a35043a7de8936e2 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 6 May 2025 12:09:55 +0200 Subject: [PATCH 12/13] Fix tests and parsing --- src/aleph/vm/orchestrator/machine.py | 16 ++++++----- src/aleph/vm/orchestrator/resources.py | 12 ++++---- src/aleph/vm/pool.py | 2 +- tests/supervisor/test_views.py | 40 ++++++++++++++++++++++---- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 55f1072ee..11095343d 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,17 +1,18 @@ import asyncio import json import re +import shutil import psutil +from aleph.vm.utils import run_in_subprocess -async def get_hardware_info(): - lshw = await asyncio.create_subprocess_shell( - "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - output, _ = await lshw.communicate() - data = json.loads(output) +async def get_hardware_info(): + lshw_path = shutil.which("lshw") + assert lshw_path, "lshw not found in PATH. apt install lshw." + lshw_output = await run_in_subprocess([lshw_path, "-sanitize", "-json"]) + data = json.loads(lshw_output) hw_info = {"cpu": None, "memory": None} @@ -26,12 +27,13 @@ async def get_hardware_info(): def get_cpu_info(hw): cpu_info = hw["cpu"] - architecture = cpu_info["width"] if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]: architecture = "x86_64" elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]: architecture = "arm64" + else: + architecture = None vendor = cpu_info["vendor"] # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308 diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index d6dbe297a..a8452956b 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -99,17 +99,17 @@ class ExtendedCpuProperties(CpuProperties): """CPU properties.""" model: Optional[str] = Field(default=None, description="CPU model") - frequency: Optional[str] = Field(default=None, description="CPU frequency") - count: Optional[str] = Field(default=None, description="CPU count") + frequency: Optional[int] = Field(default=None, description="CPU frequency") + count: Optional[int] = Field(default=None, description="CPU count") class MemoryProperties(BaseModel): """MEMORY properties.""" - size: Optional[str] = Field(default=None, description="Memory size") + size: Optional[int] = Field(default=None, description="Memory size") units: Optional[str] = Field(default=None, description="Memory size units") type: Optional[str] = Field(default=None, description="Memory type") - clock: Optional[str] = Field(default=None, description="Memory clock") + clock: Optional[int] = Field(default=None, description="Memory clock") clock_units: Optional[str] = Field(default=None, description="Memory clock units") @@ -170,8 +170,8 @@ async def get_machine_capability() -> MachineCapability: architecture=cpu_info["architecture"], vendor=cpu_info["vendor"], model=cpu_info["model"], - frequency=cpu_info["frequency"], - count=cpu_info["count"], + frequency=(cpu_info["frequency"]), + count=(cpu_info["count"]), features=list( filter( None, diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 251cb2a04..17a8d386c 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -405,7 +405,7 @@ async def reserve_resources(self, message: ExecutableContent, user): return expiration_date def find_resources_available_for_user(self, message: ExecutableContent, user) -> set[GpuDevice]: - """Find required resource to run ExecutableContent from reserved resources by user or free resources. + """Find the required resource to run ExecutableContent from reserved resources by user or free resources. Only implement GPU for now""" # Calling function should use the creation_lock to avoid resource being stollem diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index c3fbe6c3f..02a665bca 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -1,3 +1,5 @@ +import asyncio +import os import tempfile from copy import deepcopy from pathlib import Path @@ -71,6 +73,7 @@ async def test_allocation_fails_on_invalid_item_hash(aiohttp_client): @pytest.mark.asyncio async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): """Test that the usage system endpoints responds. No auth needed""" + mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO) client = await aiohttp_client(await mock_app_with_pool) response: web.Response = await client.get("/about/usage/system") @@ -81,7 +84,7 @@ async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): assert resp["cpu"]["count"] > 0 -FAKE_SYSTEM_INFO = { +MOCK_SYSTEM_INFO = { "cpu": { "id": "cpu", "class": "processor", @@ -100,6 +103,7 @@ async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): "width": 64, "configuration": {"cores": "8", "enabledcores": "8", "microcode": "167776681", "threads": "1"}, "capabilities": { + "x86-64": "64bits extensions (x86-64)", "fpu": "mathematical co-processor", "fpu_exception": "FPU exceptions reporting", "wp": True, @@ -235,7 +239,7 @@ async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): async def test_system_usage_mock(aiohttp_client, mocker, mock_app_with_pool): """Test that the usage system endpoints response value. No auth needed""" - mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) + mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO) mocker.patch( "psutil.getloadavg", lambda: [1, 2, 3], @@ -259,7 +263,10 @@ async def test_system_usage_mock(aiohttp_client, mocker, mock_app_with_pool): @pytest.mark.asyncio async def test_system_capability_mock(aiohttp_client, mocker): """Test that the capability system endpoints response value. No auth needed""" - mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) + mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO) + mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_supported", return_value=True) + mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_es_supported", return_value=True) + mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_snp_supported", return_value=False) mocker.patch( "psutil.getloadavg", lambda: [1, 2, 3], @@ -278,14 +285,34 @@ async def test_system_capability_mock(aiohttp_client, mocker): "cpu": { "architecture": "x86_64", "vendor": "AuthenticAMD", + "features": ["sev", "sev_es"], "model": "AMD EPYC 7763 64-Core Processor", - "frequency": "2000000000", - "count": "200", + "frequency": 2000000000, + "count": 200, }, - "memory": {"size": "17179869184", "units": "bytes", "type": "", "clock": None, "clock_units": ""}, + "memory": {"size": 17179869184, "units": "bytes", "type": "", "clock": None, "clock_units": None}, } +@pytest.mark.asyncio +async def test_system_capability_real(aiohttp_client, mocker): + """Test that the capability system endpoints response value + with real system value, no mock so we don't know the definive value but want ot see that it works""" + if os.environ.get("GITHUB_JOB"): + pytest.xfail("Test fail inside GITHUB CI because of invalid lshw return inside worker") + + app = setup_webapp(pool=None) + client = await aiohttp_client(app) + response: web.Response = await client.get("/about/capability") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert resp.get("cpu"), resp + assert resp["cpu"].get("architecture") + assert resp.get("memory") + assert resp["memory"].get("size") + + @pytest.mark.asyncio async def test_allocation_invalid_auth_token(aiohttp_client): """Test that the allocation endpoint fails when an invalid auth token is provided.""" @@ -597,6 +624,7 @@ def mock_is_kernel_enabled_gpu(pci_host: str) -> bool: async def test_system_usage_gpu_ressources(aiohttp_client, mocker, mock_app_with_pool): """Test gpu are properly listed""" client = await aiohttp_client(await mock_app_with_pool) + mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO) response: web.Response = await client.get("/about/usage/system") assert response.status == 200 From 03a1943fef7cfde78244c1f715b0e5f609dfb0af Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 19 May 2025 17:02:44 +0200 Subject: [PATCH 13/13] mod: uniformize coding style --- src/aleph/vm/orchestrator/resources.py | 16 ++++++++-------- src/aleph/vm/resources.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index a8452956b..4ce098371 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -98,19 +98,19 @@ class MachineUsage(BaseModel): class ExtendedCpuProperties(CpuProperties): """CPU properties.""" - model: Optional[str] = Field(default=None, description="CPU model") - frequency: Optional[int] = Field(default=None, description="CPU frequency") - count: Optional[int] = Field(default=None, description="CPU count") + model: str | None = Field(default=None, description="CPU model") + frequency: int | None = Field(default=None, description="CPU frequency") + count: int | None = Field(default=None, description="CPU count") class MemoryProperties(BaseModel): """MEMORY properties.""" - size: Optional[int] = Field(default=None, description="Memory size") - units: Optional[str] = Field(default=None, description="Memory size units") - type: Optional[str] = Field(default=None, description="Memory type") - clock: Optional[int] = Field(default=None, description="Memory clock") - clock_units: Optional[str] = Field(default=None, description="Memory clock units") + size: int | None = Field(default=None, description="Memory size") + units: str | None = Field(default=None, description="Memory size units") + type: str | None = Field(default=None, description="Memory type") + clock: int | None = Field(default=None, description="Memory clock") + clock_units: str | None = Field(default=None, description="Memory clock units") class MachineCapability(BaseModel): diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py index 1c3be8096..2f9a697ff 100644 --- a/src/aleph/vm/resources.py +++ b/src/aleph/vm/resources.py @@ -28,7 +28,7 @@ class GpuDevice(HashableModel): """GPU properties.""" vendor: str = Field(description="GPU vendor name") - model: Optional[str] = Field(description="GPU model name on Aleph Network", default=None) + model: str | None = Field(description="GPU model name on Aleph Network", default=None) device_name: str = Field(description="GPU vendor card name") device_class: GpuDeviceClass = Field( description="GPU device class. Look at https://admin.pci-ids.ucw.cz/read/PD/03"