From d82570d5c86a750ca3f779ebf1d3b6d60eac8fb2 Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 16 Feb 2024 11:47:11 +0100 Subject: [PATCH 01/14] Update CPU and memory details by switching to lshw method instead of cpuinfo Add CPU information (model, vendor, frequency) and memory details (clock, size, type) to API --- docker/vm_supervisor-dev.dockerfile | 2 +- packaging/aleph-vm/DEBIAN/control | 2 +- packaging/requirements-debian-11.txt | 1 - packaging/requirements-ubuntu-20.04.txt | 1 - packaging/requirements-ubuntu-22.04.txt | 1 - pyproject.toml | 1 - src/aleph/vm/orchestrator/machine.py | 77 +++++++++++++++++++++++++ src/aleph/vm/orchestrator/resources.py | 66 +++++++++++++++++++-- src/aleph/vm/orchestrator/supervisor.py | 4 +- 9 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 src/aleph/vm/orchestrator/machine.py diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile index da730aca8..3214e5494 100644 --- a/docker/vm_supervisor-dev.dockerfile +++ b/docker/vm_supervisor-dev.dockerfile @@ -5,7 +5,7 @@ FROM debian:bullseye RUN apt-get update && apt-get -y upgrade && apt-get install -y \ sudo acl curl squashfs-tools git \ python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\ - python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \ + python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \ && rm -rf /var/lib/apt/lists/* RUN useradd jailman diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 45aa6bd65..35906c86d 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: 
python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw Section: aleph-im Priority: Extra diff --git a/packaging/requirements-debian-11.txt b/packaging/requirements-debian-11.txt index d708640db..3922e4c59 100644 --- a/packaging/requirements-debian-11.txt +++ b/packaging/requirements-debian-11.txt @@ -17,7 +17,6 @@ multidict==5.1.0 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py packaging==20.9 psutil==5.8.0 -py-cpuinfo==5.0.0 pycares==3.1.1 pyparsing==2.4.7 pyrsistent==0.15.5 diff --git a/packaging/requirements-ubuntu-20.04.txt b/packaging/requirements-ubuntu-20.04.txt index 1175ab784..ccf416943 100644 --- a/packaging/requirements-ubuntu-20.04.txt +++ b/packaging/requirements-ubuntu-20.04.txt @@ -18,7 +18,6 @@ multidict==4.7.3 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py packaging==20.3 psutil==5.5.1 -py-cpuinfo==5.0.0 pycares==3.1.1 PyGObject==3.36.0 pyparsing==2.4.6 diff --git a/packaging/requirements-ubuntu-22.04.txt b/packaging/requirements-ubuntu-22.04.txt index 580dc68ef..204bea18f 100644 --- a/packaging/requirements-ubuntu-22.04.txt +++ 
b/packaging/requirements-ubuntu-22.04.txt @@ -21,7 +21,6 @@ multidict==5.1.0 git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py packaging==21.3 psutil==5.9.0 -py-cpuinfo==5.0.0 pycares==4.1.2 PyGObject==3.42.1 pyparsing==2.4.7 diff --git a/pyproject.toml b/pyproject.toml index 4619dbd82..16680e102 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,6 @@ dependencies = [ "sentry-sdk==1.31.0", "aioredis==1.3.1", "psutil==5.9.5", - "py-cpuinfo==9.0.0", "schedule==1.2.1", "nftables @ git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py", "msgpack==1.0.7", diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py new file mode 100644 index 000000000..9522f634b --- /dev/null +++ b/src/aleph/vm/orchestrator/machine.py @@ -0,0 +1,77 @@ +import json +import re +import subprocess +from functools import lru_cache + +import psutil + + +@lru_cache +def get_hardware_info(): + lshw = subprocess.Popen(["lshw", "-sanitize", "-json"], stdout=subprocess.PIPE, shell=False) + output, _ = lshw.communicate() + data = json.loads(output) + + hw_info = {} + + for hw in data["children"][0]["children"]: + if hw["id"] == "cpu": + hw_info["cpu"] = hw + elif hw["class"] == "memory" and hw["id"] == "memory": + hw_info["memory"] = hw + + return hw_info + + +@lru_cache +def get_cpu_info(): + hw = get_hardware_info() + + cpu_info = hw["cpu"] + architecture = cpu_info["width"] + + if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]: + architecture = "x86_64" + elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]: + architecture = "arm64" + + vendor = cpu_info["vendor"] + # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308 + + if "Intel Corp" in vendor: + vendor = "GenuineIntel" + elif "Advanced Micro Devices [AMD]" in vendor: + vendor = 
"AuthenticAMD" + + return { + "architecture": architecture, + "vendor": vendor, + "model": cpu_info["product"], + "frequency": cpu_info["capacity"], + "count": psutil.cpu_count(), + } + + +@lru_cache +def get_memory_info(): + hw = get_hardware_info() + mem_info = hw["memory"] + + memory_type = "" + memory_clock = "" + + for bank in mem_info["children"]: + memory_clock = bank["clock"] + try: + memory_type = re.search("(DDR[2-6])", bank["description"]).group(0) + break + except: + pass + + return { + "size": mem_info["size"], + "units": mem_info["units"], + "type": memory_type, + "clock": memory_clock, + "clock_units": "Hz", + } diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 5be767dac..2c1ceca9b 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -3,15 +3,14 @@ from functools import lru_cache from typing import Optional -import cpuinfo import psutil from aiohttp import web +from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import get_cpu_info, get_memory_info from aleph_message.models import ItemHash from aleph_message.models.execution.environment import CpuProperties from pydantic import BaseModel, Field -from aleph.vm.conf import settings - class Period(BaseModel): datetime: datetime @@ -76,6 +75,30 @@ class MachineUsage(BaseModel): active: bool = True +class ExtendedCpuProperties(CpuProperties): + """CPU properties.""" + + model: Optional[str] = Field(default=None, description="CPU model") + frequency: Optional[str] = Field(default=None, description="CPU frequency") + count: Optional[str] = Field(default=None, description="CPU count") + + + +class MemoryProperties(BaseModel): + """MEMORY properties.""" + + size: Optional[str] = Field(default=None, description="Memory size") + units: Optional[str] = Field(default=None, description="Memory size units") + type: Optional[str] = Field(default=None, description="Memory type") + clock: Optional[str] = 
Field(default=None, description="Memory clock") + clock_units: Optional[str] = Field(default=None, description="Memory clock units") + + +class MachineCapability(BaseModel): + cpu: ExtendedCpuProperties + memory: MemoryProperties + + @lru_cache def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... @@ -83,11 +106,35 @@ def get_machine_properties() -> MachineProperties: In the future, some properties may have to be fetched from within a VM. """ - cpu_info = cpuinfo.get_cpu_info() # Slow + + cpu_info = get_cpu_info() return MachineProperties( cpu=CpuProperties( - architecture=cpu_info["raw_arch_string"], - vendor=cpu_info["vendor_id"], + architecture=cpu_info["architecture"], + vendor=cpu_info["vendor"], + ), + ) + + +@lru_cache +def get_machine_capability() -> MachineCapability: + cpu_info = get_cpu_info() + mem_info = get_memory_info() + + return MachineCapability( + cpu=ExtendedCpuProperties( + architecture=cpu_info["architecture"], + vendor=cpu_info["vendor"], + model=cpu_info["model"], + frequency=cpu_info["frequency"], + count=cpu_info["count"], + ), + memory=MemoryProperties( + size=mem_info["size"], + units=mem_info["units"], + type=mem_info["type"], + clock=mem_info["clock"], + clock_units=mem_info["clock_units"], ), ) @@ -119,6 +166,13 @@ async def about_system_usage(_: web.Request): return web.json_response(text=usage.json(exclude_none=True), headers={"Access-Control-Allow-Origin:": "*"}) +async def about_capability(_: web.Request): + """Public endpoint to expose information about the CRN capability.""" + + capability: MachineCapability = get_machine_capability() + return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) + + class Allocation(BaseModel): """An allocation is the set of resources that are currently allocated on this orchestrator. It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs. 
diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 20452df90..acbc763f3 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -15,13 +15,12 @@ import aiohttp_cors from aiohttp import web - from aleph.vm.conf import settings from aleph.vm.pool import VmPool from aleph.vm.version import __version__ from .metrics import create_tables, setup_engine -from .resources import about_system_usage +from .resources import about_capability, about_system_usage from .tasks import ( start_payment_monitoring_task, start_watch_for_messages_task, @@ -94,6 +93,7 @@ async def allow_cors_on_endpoint(request: web.Request): web.get("/about/executions/records", about_execution_records), web.get("/about/usage/system", about_system_usage), web.get("/about/config", about_config), + web.get("/about/capability", about_capability), # /control APIs are used to control the VMs and access their logs web.post("/control/allocations", update_allocations), web.post("/control/allocation/notify", notify_allocation), From a4d2a5c0f0c4010bb9f19ecfbeb743a24bfb7302 Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 5 Apr 2024 11:58:37 +0200 Subject: [PATCH 02/14] Refactor get_hardware_info() to use async subprocess.run for improved concurrency avoid Try-Catch using assertions --- src/aleph/vm/orchestrator/machine.py | 38 ++++++++++++++++---------- src/aleph/vm/orchestrator/resources.py | 14 +++++----- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 9522f634b..cd1cdd805 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,18 +1,26 @@ +import asyncio import json import re -import subprocess from functools import lru_cache import psutil @lru_cache -def get_hardware_info(): - lshw = subprocess.Popen(["lshw", "-sanitize", "-json"], stdout=subprocess.PIPE, shell=False) - output, 
_ = lshw.communicate() +async def get_hardware_info(): + lshw = await asyncio.create_subprocess_shell( + "lshw -sanitize -json", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + + output, _ = await lshw.communicate() data = json.loads(output) - hw_info = {} + hw_info = { + "cpu": None, + "memory": None + } for hw in data["children"][0]["children"]: if hw["id"] == "cpu": @@ -24,8 +32,8 @@ def get_hardware_info(): @lru_cache -def get_cpu_info(): - hw = get_hardware_info() +async def get_cpu_info(): + hw = await get_hardware_info() cpu_info = hw["cpu"] architecture = cpu_info["width"] @@ -53,8 +61,8 @@ def get_cpu_info(): @lru_cache -def get_memory_info(): - hw = get_hardware_info() +async def get_memory_info(): + hw = await get_hardware_info() mem_info = hw["memory"] memory_type = "" @@ -62,11 +70,13 @@ def get_memory_info(): for bank in mem_info["children"]: memory_clock = bank["clock"] - try: - memory_type = re.search("(DDR[2-6])", bank["description"]).group(0) - break - except: - pass + if "description" in bank: + matched = re.search("(DDR[2-6])", bank["description"]) + if matched: + memory_type = matched.group(0) + break + else: + pass return { "size": mem_info["size"], diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 2c1ceca9b..e72d02327 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -100,14 +100,14 @@ class MachineCapability(BaseModel): @lru_cache -def get_machine_properties() -> MachineProperties: +async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... These should not change while the supervisor is running. In the future, some properties may have to be fetched from within a VM. 
""" - cpu_info = get_cpu_info() + cpu_info = await get_cpu_info() return MachineProperties( cpu=CpuProperties( architecture=cpu_info["architecture"], @@ -117,9 +117,9 @@ def get_machine_properties() -> MachineProperties: @lru_cache -def get_machine_capability() -> MachineCapability: - cpu_info = get_cpu_info() - mem_info = get_memory_info() +async def get_machine_capability() -> MachineCapability: + cpu_info = await get_cpu_info() + mem_info = await get_memory_info() return MachineCapability( cpu=ExtendedCpuProperties( @@ -161,7 +161,7 @@ async def about_system_usage(_: web.Request): start_timestamp=period_start, duration_seconds=60, ), - properties=get_machine_properties(), + properties=await get_machine_properties(), ) return web.json_response(text=usage.json(exclude_none=True), headers={"Access-Control-Allow-Origin:": "*"}) @@ -169,7 +169,7 @@ async def about_system_usage(_: web.Request): async def about_capability(_: web.Request): """Public endpoint to expose information about the CRN capability.""" - capability: MachineCapability = get_machine_capability() + capability: MachineCapability = await get_machine_capability() return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) From 1a492ba48bed7e00cb17fa7682cb69ce1cb853ed Mon Sep 17 00:00:00 2001 From: aliel Date: Fri, 5 Apr 2024 16:42:37 +0200 Subject: [PATCH 03/14] Fix code style --- src/aleph/vm/orchestrator/machine.py | 9 ++------- src/aleph/vm/orchestrator/resources.py | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index cd1cdd805..aeffb6e7b 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -9,18 +9,13 @@ @lru_cache async def get_hardware_info(): lshw = await asyncio.create_subprocess_shell( - "lshw -sanitize -json", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE + "lshw -sanitize -json", 
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) output, _ = await lshw.communicate() data = json.loads(output) - hw_info = { - "cpu": None, - "memory": None - } + hw_info = {"cpu": None, "memory": None} for hw in data["children"][0]["children"]: if hw["id"] == "cpu": diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index e72d02327..38f2335c7 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -83,7 +83,6 @@ class ExtendedCpuProperties(CpuProperties): count: Optional[str] = Field(default=None, description="CPU count") - class MemoryProperties(BaseModel): """MEMORY properties.""" From 7e540787ae2cc28f305552ba04e4e088138cef9e Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 15:29:10 +0200 Subject: [PATCH 04/14] Remove duplicate cors headers --- src/aleph/vm/orchestrator/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index a5b9c8387..d702b19c2 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -173,7 +173,7 @@ async def about_capability(_: web.Request): """Public endpoint to expose information about the CRN capability.""" capability: MachineCapability = await get_machine_capability() - return web.json_response(text=capability.json(exclude_none=False), headers={"Access-Control-Allow-Origin:": "*"}) + return web.json_response(text=capability.json(exclude_none=False)) class Allocation(BaseModel): From b7aaf47468c0db28e6edba264fb7c3991f2aed35 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:01:45 +0200 Subject: [PATCH 05/14] Problem: async funcs cannot be lru_cached Solution: Introduce our own decorator --- src/aleph/vm/orchestrator/machine.py | 11 ++----- src/aleph/vm/orchestrator/resources.py | 41 +++++++++++++++++++------- 2 files changed, 33
insertions(+), 19 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index aeffb6e7b..092b53cf2 100644 --- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -1,12 +1,10 @@ import asyncio import json import re -from functools import lru_cache import psutil -@lru_cache async def get_hardware_info(): lshw = await asyncio.create_subprocess_shell( "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE @@ -26,10 +24,7 @@ async def get_hardware_info(): return hw_info -@lru_cache -async def get_cpu_info(): - hw = await get_hardware_info() - +def get_cpu_info(hw): cpu_info = hw["cpu"] architecture = cpu_info["width"] @@ -55,9 +50,7 @@ async def get_cpu_info(): } -@lru_cache -async def get_memory_info(): - hw = await get_hardware_info() +def get_memory_info(hw): mem_info = hw["memory"] memory_type = "" diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index d702b19c2..5c522f8b9 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,17 +1,20 @@ +import functools import math from datetime import datetime, timezone -from functools import lru_cache from typing import Optional import psutil from aiohttp import web -from aleph.vm.conf import settings -from aleph.vm.orchestrator.machine import get_cpu_info, get_memory_info from aleph_message.models import ItemHash from aleph_message.models.execution.environment import CpuProperties from pydantic import BaseModel, Field from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import ( + get_cpu_info, + get_hardware_info, + get_memory_info, +) from aleph.vm.utils import cors_allow_all @@ -101,15 +104,31 @@ class MachineCapability(BaseModel): memory: MemoryProperties -@lru_cache +machine_properties_cached = None + + +def async_cache(fn): + cache = {} + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + 
key = (args, frozenset(kwargs.items())) + if key not in cache: + cache[key] = await fn(*args, **kwargs) + return cache[key] + + return wrapper + + +@async_cache async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... These should not change while the supervisor is running. In the future, some properties may have to be fetched from within a VM. """ - - cpu_info = await get_cpu_info() + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) return MachineProperties( cpu=CpuProperties( architecture=cpu_info["architecture"], @@ -118,10 +137,11 @@ async def get_machine_properties() -> MachineProperties: ) -@lru_cache +@async_cache async def get_machine_capability() -> MachineCapability: - cpu_info = await get_cpu_info() - mem_info = await get_memory_info() + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) + mem_info = get_memory_info(hw) return MachineCapability( cpu=ExtendedCpuProperties( @@ -146,6 +166,7 @@ async def about_system_usage(_: web.Request): """Public endpoint to expose information about the system usage.""" period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0) + machine_properties = await get_machine_properties() usage: MachineUsage = MachineUsage( cpu=CpuUsage( count=psutil.cpu_count(), @@ -164,7 +185,7 @@ async def about_system_usage(_: web.Request): start_timestamp=period_start, duration_seconds=60, ), - properties=await get_machine_properties(), + properties=machine_properties, ) return web.json_response(text=usage.json(exclude_none=True)) From 6a3aff910d89f4bc6a37e950ba7c72e1fac6c5ec Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:04:36 +0200 Subject: [PATCH 06/14] fix lshw parsing for memory info --- src/aleph/vm/orchestrator/machine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py index 092b53cf2..55f1072ee 100644 
--- a/src/aleph/vm/orchestrator/machine.py +++ b/src/aleph/vm/orchestrator/machine.py @@ -55,9 +55,8 @@ def get_memory_info(hw): memory_type = "" memory_clock = "" - for bank in mem_info["children"]: - memory_clock = bank["clock"] + memory_clock = bank.get("clock") if "description" in bank: matched = re.search("(DDR[2-6])", bank["description"]) if matched: @@ -71,5 +70,5 @@ def get_memory_info(hw): "units": mem_info["units"], "type": memory_type, "clock": memory_clock, - "clock_units": "Hz", + "clock_units": "Hz" if memory_clock is not None else "", } From c4c2d0df6cb3f5e4eac8f7c6efabaf04ad422482 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:05:18 +0200 Subject: [PATCH 07/14] Move async_cache decorator to utils module --- src/aleph/vm/orchestrator/resources.py | 16 +--------------- src/aleph/vm/utils.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 5c522f8b9..736c10e07 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,4 +1,3 @@ -import functools import math from datetime import datetime, timezone from typing import Optional @@ -15,7 +14,7 @@ get_hardware_info, get_memory_info, ) -from aleph.vm.utils import cors_allow_all +from aleph.vm.utils import cors_allow_all, async_cache class Period(BaseModel): @@ -107,19 +106,6 @@ class MachineCapability(BaseModel): machine_properties_cached = None -def async_cache(fn): - cache = {} - - @functools.wraps(fn) - async def wrapper(*args, **kwargs): - key = (args, frozenset(kwargs.items())) - if key not in cache: - cache[key] = await fn(*args, **kwargs) - return cache[key] - - return wrapper - - @async_cache async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... 
diff --git a/src/aleph/vm/utils.py b/src/aleph/vm/utils.py index 63ce18253..e11a6af81 100644 --- a/src/aleph/vm/utils.py +++ b/src/aleph/vm/utils.py @@ -1,5 +1,6 @@ import asyncio import dataclasses +import functools import hashlib import json import logging @@ -211,3 +212,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path return True return checksum(source) != checksum(destination) + + +def async_cache(fn): + """Simple async function cache decorator.""" + cache = {} + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + key = (args, frozenset(kwargs.items())) + if key not in cache: + cache[key] = await fn(*args, **kwargs) + return cache[key] + + return wrapper From 0e079a06723b7d2daf9fe9867c4aeb37828bcc20 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:20:45 +0200 Subject: [PATCH 08/14] Change router order it's not really important but it was somehow annoying me --- src/aleph/vm/orchestrator/supervisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 514033fe6..1656283c3 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -94,8 +94,8 @@ async def http_not_found(request: web.Request): web.get("/about/executions/details", about_executions), web.get("/about/executions/records", about_execution_records), web.get("/about/usage/system", about_system_usage), - web.get("/about/config", about_config), web.get("/about/capability", about_capability), + web.get("/about/config", about_config), # /control APIs are used to control the VMs and access their logs web.post("/control/allocation/notify", notify_allocation), web.get("/control/machine/{ref}/logs", stream_logs), From 4795a85b2efc903df519a8fa0237c2d22c960fd9 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 16:53:28 +0200 Subject: [PATCH 09/14] fix imports ---
src/aleph/vm/orchestrator/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 736c10e07..e58391118 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -14,7 +14,7 @@ get_hardware_info, get_memory_info, ) -from aleph.vm.utils import cors_allow_all, async_cache +from aleph.vm.utils import async_cache, cors_allow_all class Period(BaseModel): From eac96f29eb5f5120d0298d819e9569f78a097b51 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Wed, 15 May 2024 15:40:25 +0200 Subject: [PATCH 10/14] fix test --- tests/supervisor/test_views.py | 159 +++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 7 deletions(-) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 254e326df..dd7f16357 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -2,6 +2,7 @@ from aiohttp import web from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import get_hardware_info from aleph.vm.orchestrator.supervisor import setup_webapp @@ -40,16 +41,160 @@ async def test_system_usage(aiohttp_client): assert resp["cpu"]["count"] > 0 +FAKE_SYSTEM_INFO = { + "cpu": { + "id": "cpu", + "class": "processor", + "claimed": True, + "handle": "DMI:0400", + "description": "CPU", + "product": "AMD EPYC 7763 64-Core Processor", + "vendor": "Advanced Micro Devices [AMD]", + "physid": "400", + "businfo": "cpu@0", + "version": "25.1.1", + "slot": "CPU 0", + "units": "Hz", + "size": 2000000000, + "capacity": 2000000000, + "width": 64, + "configuration": {"cores": "8", "enabledcores": "8", "microcode": "167776681", "threads": "1"}, + "capabilities": { + "fpu": "mathematical co-processor", + "fpu_exception": "FPU exceptions reporting", + "wp": True, + "vme": "virtual mode extensions", + "de": "debugging extensions", + "pse": "page size extensions", + "tsc": 
"time stamp counter", + "msr": "model-specific registers", + "pae": "4GB+ memory addressing (Physical Address Extension)", + "mce": "machine check exceptions", + "cx8": "compare and exchange 8-byte", + "apic": "on-chip advanced programmable interrupt controller (APIC)", + "sep": "fast system calls", + "mtrr": "memory type range registers", + "pge": "page global enable", + "mca": "machine check architecture", + "cmov": "conditional move instruction", + "pat": "page attribute table", + "pse36": "36-bit page size extensions", + "clflush": True, + "mmx": "multimedia extensions (MMX)", + "fxsr": "fast floating point save/restore", + "sse": "streaming SIMD extensions (SSE)", + "sse2": "streaming SIMD extensions (SSE2)", + "ht": "HyperThreading", + "syscall": "fast system calls", + "nx": "no-execute bit (NX)", + "mmxext": "multimedia extensions (MMXExt)", + "fxsr_opt": True, + "pdpe1gb": True, + "rdtscp": True, + "rep_good": True, + "nopl": True, + "cpuid": True, + "extd_apicid": True, + "tsc_known_freq": True, + "pni": True, + "pclmulqdq": True, + "ssse3": True, + "fma": True, + "cx16": True, + "pcid": True, + "sse4_1": True, + "sse4_2": True, + "x2apic": True, + "movbe": True, + "popcnt": True, + "tsc_deadline_timer": True, + "aes": True, + "xsave": True, + "avx": True, + "f16c": True, + "rdrand": True, + "hypervisor": True, + "lahf_lm": True, + "cmp_legacy": True, + "svm": True, + "cr8_legacy": True, + "abm": True, + "sse4a": True, + "misalignsse": True, + "3dnowprefetch": True, + "osvw": True, + "perfctr_core": True, + "invpcid_single": True, + "ssbd": True, + "ibrs": True, + "ibpb": True, + "stibp": True, + "vmmcall": True, + "fsgsbase": True, + "tsc_adjust": True, + "bmi1": True, + "avx2": True, + "smep": True, + "bmi2": True, + "erms": True, + "invpcid": True, + "rdseed": True, + "adx": True, + "clflushopt": True, + "clwb": True, + "sha_ni": True, + "xsaveopt": True, + "xsavec": True, + "xgetbv1": True, + "xsaves": True, + "clzero": True, + "xsaveerptr": True, + 
"wbnoinvd": True, + "arat": True, + "npt": True, + "nrip_save": True, + "umip": True, + "pku": True, + "vaes": True, + "vpclmulqdq": True, + "rdpid": True, + "fsrm": True, + "arch_capabilities": True, + }, + }, + "memory": { + "id": "memory", + "class": "memory", + "claimed": True, + "handle": "DMI:1000", + "description": "System Memory", + "physid": "1000", + "units": "bytes", + "size": 17179869184, + "configuration": {"errordetection": "multi-bit-ecc"}, + "capabilities": {"ecc": "Multi-bit error-correcting code (ECC)"}, + "children": [ + { + "id": "bank", + "class": "memory", + "claimed": True, + "handle": "DMI:1100", + "description": "DIMM RAM", + "vendor": "QEMU", + "physid": "0", + "slot": "DIMM 0", + "units": "bytes", + "size": 17179869184, + } + ], + }, +} + + @pytest.mark.asyncio async def test_system_usage_mock(aiohttp_client, mocker): """Test that the usage system endpoints response value. No auth needed""" - mocker.patch( - "cpuinfo.cpuinfo.get_cpu_info", - { - "arch_string_raw": "x86_64", - "vendor_id_raw": "AuthenticAMD", - }, - ) + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) mocker.patch( "psutil.getloadavg", lambda: [1, 2, 3], From 783e5eac49ff59300e8f67de76f38677c2aff0ec Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Wed, 15 May 2024 15:44:47 +0200 Subject: [PATCH 11/14] add test for /about/capability --- tests/supervisor/test_views.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index dd7f16357..a30cbf4ac 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -215,6 +215,36 @@ async def test_system_usage_mock(aiohttp_client, mocker): assert resp["cpu"]["count"] == 200 +@pytest.mark.asyncio +async def test_system_capability_mock(aiohttp_client, mocker): + """Test that the capability system endpoints response value. 
No auth needed""" + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) + mocker.patch( + "psutil.getloadavg", + lambda: [1, 2, 3], + ) + mocker.patch( + "psutil.cpu_count", + lambda: 200, + ) + app = setup_webapp() + client = await aiohttp_client(app) + response: web.Response = await client.get("/about/capability") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert resp == { + "cpu": { + "architecture": "x86_64", + "vendor": "AuthenticAMD", + "model": "AMD EPYC 7763 64-Core Processor", + "frequency": "2000000000", + "count": "200", + }, + "memory": {"size": "17179869184", "units": "bytes", "type": "", "clock": None, "clock_units": ""}, + } + + @pytest.mark.asyncio async def test_allocation_invalid_auth_token(aiohttp_client): """Test that the allocation endpoint fails when an invalid auth token is provided.""" From da65e94859f4dea931b1d38455379b2234c9b4f6 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 16 May 2024 16:15:53 +0200 Subject: [PATCH 12/14] Remove unused deps --- src/aleph/vm/orchestrator/resources.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 262f73886..009bb6579 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,9 +1,7 @@ import math from datetime import datetime, timezone -from functools import lru_cache from typing import Optional -import cpuinfo import psutil from aiohttp import web from aleph_message.models import ItemHash From c42e480f40b36bf6158354e0f1e686d20e2f28ca Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 16 May 2024 20:03:54 +0200 Subject: [PATCH 13/14] black --- src/aleph/vm/orchestrator/supervisor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 86eac12e1..40269c9cb 100644 --- 
a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -87,7 +87,6 @@ def setup_webapp(): }, ) - # Routes that need CORS enabled cors_routes = [ # /about APIs return information about the VM Orchestrator From befc521bf1d363bebcfc90e717077030cb81e83e Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 17 May 2024 10:27:51 +0200 Subject: [PATCH 14/14] Fix dependencies in ci --- .github/workflows/test-using-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-using-pytest.yml b/.github/workflows/test-using-pytest.yml index 732a646ea..99d799036 100644 --- a/.github/workflows/test-using-pytest.yml +++ b/.github/workflows/test-using-pytest.yml @@ -18,7 +18,7 @@ jobs: run: | sudo apt-get update sudo apt-get -y upgrade - sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables + sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-nftables python3-jsonschema nftables lshw python3-jwcrypto pip install --upgrade typing-extensions types-PyYAML - name: Install required Python packages