From a6c5f1724eab3f8d1893789e28f64a7720171f81 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Wed, 3 Sep 2025 09:49:45 +0000 Subject: [PATCH 01/31] Track actions, errors, versions, and create disable_telemetry flag --- olive/__init__.py | 2 - olive/cli/auto_opt.py | 4 + olive/cli/base.py | 6 + olive/cli/capture_onnx.py | 4 + olive/cli/configure_qualcomm_sdk.py | 6 +- olive/cli/convert_adapters.py | 5 +- olive/cli/extract_adapters.py | 5 +- olive/cli/finetune.py | 4 + olive/cli/generate_adapter.py | 4 + olive/cli/generate_cost_model.py | 5 +- olive/cli/launcher.py | 6 + olive/cli/optimize.py | 4 + olive/cli/quantize.py | 4 + olive/cli/run.py | 11 +- olive/cli/run_pass.py | 4 + olive/cli/session_params_tuning.py | 4 + olive/cli/shared_cache.py | 5 +- olive/telemetry/__init__.py | 4 + olive/telemetry/endpoint.b64 | 1 + olive/telemetry/headers.b64 | 1 + olive/telemetry/msft_log_exporter.py | 200 +++++++++++++++++++++++++++ olive/telemetry/telemetry_events.py | 83 +++++++++++ olive/telemetry/telemetry_logger.py | 60 ++++++++ olive/version.py | 1 + requirements.txt | 3 + setup.py | 12 +- 26 files changed, 429 insertions(+), 19 deletions(-) create mode 100644 olive/telemetry/__init__.py create mode 100644 olive/telemetry/endpoint.b64 create mode 100644 olive/telemetry/headers.b64 create mode 100644 olive/telemetry/msft_log_exporter.py create mode 100644 olive/telemetry/telemetry_events.py create mode 100644 olive/telemetry/telemetry_logger.py create mode 100644 olive/version.py diff --git a/olive/__init__.py b/olive/__init__.py index 17d0707c0d..8faede1b86 100644 --- a/olive/__init__.py +++ b/olive/__init__.py @@ -14,8 +14,6 @@ _logger.addHandler(_sc) _logger.propagate = False -__version__ = "0.10.0.dev0" - # pylint: disable=C0413 # Import Python API functions diff --git a/olive/cli/auto_opt.py b/olive/cli/auto_opt.py index a2e7948c12..ac1b71da1c 100644 --- a/olive/cli/auto_opt.py +++ b/olive/cli/auto_opt.py @@ -14,6 +14,7 @@ add_logging_options, 
add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_accelerator_options, update_shared_cache_options, @@ -22,6 +23,7 @@ from olive.constants import Precision from olive.hardware.constants import ExecutionProvider from olive.package_config import OlivePackageConfig +from olive.telemetry.telemetry_events import action class AutoOptCommand(BaseOliveCLICommand): @@ -167,8 +169,10 @@ def register_subcommand(parser: ArgumentParser): add_shared_cache_options(sub_parser) add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=AutoOptCommand) + @action def run(self): return self._run_workflow() diff --git a/olive/cli/base.py b/olive/cli/base.py index 51f5803efc..3e9c34d86a 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -585,6 +585,12 @@ def add_search_options(sub_parser: ArgumentParser): search_strategy_group.add_argument("--seed", type=int, default=0, help="Random seed for search sampler") +def add_telemetry_options(sub_parser: ArgumentParser): + """Add telemetry options to the sub_parser.""" + sub_parser.add_argument("--disable-telemetry", action="store_true", help="Disable telemetry for this command.") + return sub_parser + + def update_search_options(args, config): to_replace = [] to_replace.extend( diff --git a/olive/cli/capture_onnx.py b/olive/cli/capture_onnx.py index 9f12dd62f2..26dc414413 100644 --- a/olive/cli/capture_onnx.py +++ b/olive/cli/capture_onnx.py @@ -12,10 +12,12 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_shared_cache_options, ) from olive.common.utils import IntEnumBase, set_nested_dict_value +from olive.telemetry.telemetry_events import action class ModelBuilderAccuracyLevel(IntEnumBase): @@ -162,8 +164,10 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) 
add_save_config_file_options(sub_parser) add_shared_cache_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=CaptureOnnxGraphCommand) + @action def run(self): return self._run_workflow() diff --git a/olive/cli/configure_qualcomm_sdk.py b/olive/cli/configure_qualcomm_sdk.py index 883fef12e3..0bb6f60a75 100644 --- a/olive/cli/configure_qualcomm_sdk.py +++ b/olive/cli/configure_qualcomm_sdk.py @@ -4,7 +4,8 @@ # -------------------------------------------------------------------------- from argparse import ArgumentParser -from olive.cli.base import BaseOliveCLICommand +from olive.cli.base import BaseOliveCLICommand, add_telemetry_options +from olive.telemetry.telemetry_events import action class ConfigureQualcommSDKCommand(BaseOliveCLICommand): @@ -21,9 +22,10 @@ def register_subcommand(parser: ArgumentParser): required=True, choices=["3.6", "3.8"], ) - + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=ConfigureQualcommSDKCommand) + @action def run(self): from olive.platform_sdk.qualcomm.configure.configure import configure diff --git a/olive/cli/convert_adapters.py b/olive/cli/convert_adapters.py index 54ad5a0657..785d97c552 100644 --- a/olive/cli/convert_adapters.py +++ b/olive/cli/convert_adapters.py @@ -6,11 +6,12 @@ from argparse import ArgumentParser from typing import TYPE_CHECKING -from olive.cli.base import BaseOliveCLICommand, add_logging_options +from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights if TYPE_CHECKING: from numpy.typing import NDArray +from olive.telemetry.telemetry_events import action class ConvertAdaptersCommand(BaseOliveCLICommand): @@ -75,8 +76,10 @@ def register_subcommand(parser: ArgumentParser): help="Quantization mode for int4 quantization of adapter weights. 
Default is symmetric.", ) add_logging_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=ConvertAdaptersCommand) + @action def run(self): import torch from peft import LoraConfig, load_peft_weights diff --git a/olive/cli/extract_adapters.py b/olive/cli/extract_adapters.py index db4a91c6a6..460ada32a7 100644 --- a/olive/cli/extract_adapters.py +++ b/olive/cli/extract_adapters.py @@ -6,8 +6,9 @@ from transformers.utils import TRANSFORMERS_CACHE -from olive.cli.base import BaseOliveCLICommand, add_logging_options +from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights +from olive.telemetry.telemetry_events import action class ExtractAdaptersCommand(BaseOliveCLICommand): @@ -54,8 +55,10 @@ def register_subcommand(parser: ArgumentParser): help="Cache dir to store temporary files in. Default is Hugging Face's default cache dir.", ) add_logging_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=ExtractAdaptersCommand) + @action def run(self): # Reference: https://huggingface.co/microsoft/Phi-4-multimodal-instruct-onnx/blob/05f620b467891affcb00b464e5a73e7cf2de61f9/onnx/builder.py#L318 import os diff --git a/olive/cli/finetune.py b/olive/cli/finetune.py index 0e79b7e02b..c2cb8ae207 100644 --- a/olive/cli/finetune.py +++ b/olive/cli/finetune.py @@ -13,11 +13,13 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_dataset_options, update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value +from olive.telemetry.telemetry_events import action class FineTuneCommand(BaseOliveCLICommand): @@ -74,8 +76,10 @@ def register_subcommand(parser: ArgumentParser): add_shared_cache_options(sub_parser) add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) 
sub_parser.set_defaults(func=FineTuneCommand) + @action def run(self): return self._run_workflow() diff --git a/olive/cli/generate_adapter.py b/olive/cli/generate_adapter.py index 64b01d0119..7549db266f 100644 --- a/olive/cli/generate_adapter.py +++ b/olive/cli/generate_adapter.py @@ -11,11 +11,13 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_shared_cache_options, ) from olive.common.utils import WeightsFileFormat, set_nested_dict_value from olive.passes.onnx.common import AdapterType +from olive.telemetry.telemetry_events import action class GenerateAdapterCommand(BaseOliveCLICommand): @@ -45,8 +47,10 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) add_save_config_file_options(sub_parser) add_shared_cache_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=GenerateAdapterCommand) + @action def run(self): return self._run_workflow() diff --git a/olive/cli/generate_cost_model.py b/olive/cli/generate_cost_model.py index e228b9335b..01f8bcaa24 100644 --- a/olive/cli/generate_cost_model.py +++ b/olive/cli/generate_cost_model.py @@ -5,8 +5,9 @@ import logging from pathlib import Path -from olive.cli.base import BaseOliveCLICommand, add_input_model_options, get_input_model_config +from olive.cli.base import BaseOliveCLICommand, add_input_model_options, add_telemetry_options, get_input_model_config from olive.model import ModelConfig +from olive.telemetry.telemetry_events import action logger = logging.getLogger(__name__) @@ -34,8 +35,10 @@ def register_subcommand(parser): choices=PRECISON_TO_BYTES.keys(), help="Weight precision", ) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=GenerateCostModelCommand) + @action def run(self): import torch diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index a8b27bfe9f..189f7d545a 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -20,6 
+20,7 @@ from olive.cli.run_pass import RunPassCommand from olive.cli.session_params_tuning import SessionParamsTuningCommand from olive.cli.shared_cache import SharedCacheCommand +from olive.telemetry.telemetry_logger import TelemetryLogger def get_cli_parser(called_as_console_script: bool = True) -> ArgumentParser: @@ -57,6 +58,10 @@ def main(raw_args=None, called_as_console_script: bool = True): args, unknown_args = parser.parse_known_args(raw_args) + t = TelemetryLogger() + if args.disable_telemetry: + t.disable_telemetry() + if not hasattr(args, "func"): parser.print_help() sys.exit(1) @@ -64,6 +69,7 @@ def main(raw_args=None, called_as_console_script: bool = True): # Run the command service = args.func(parser, args, unknown_args) service.run() + t.shutdown() def legacy_call(deprecated_module: str, command_name: str, *args): diff --git a/olive/cli/optimize.py b/olive/cli/optimize.py index a7869a1e4e..105d7fe5d3 100644 --- a/olive/cli/optimize.py +++ b/olive/cli/optimize.py @@ -15,11 +15,13 @@ add_input_model_options, add_logging_options, add_save_config_file_options, + add_telemetry_options, get_input_model_config, ) from olive.common.utils import set_nested_dict_value from olive.constants import Precision, precision_bits_from_precision from olive.hardware.constants import ExecutionProvider +from olive.telemetry.telemetry_events import action class OptimizeCommand(BaseOliveCLICommand): @@ -184,6 +186,7 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=OptimizeCommand) def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Optional[list] = None): @@ -216,6 +219,7 @@ def __init__(self, parser: ArgumentParser, args: Namespace, unknown_args: Option self.enable_compose_onnx_models = False self.enable_openvino_encapsulation = False + @action def run(self): return self._run_workflow() diff --git 
a/olive/cli/quantize.py b/olive/cli/quantize.py index 78dcb3d6a9..2cc1ed496a 100644 --- a/olive/cli/quantize.py +++ b/olive/cli/quantize.py @@ -17,6 +17,7 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, update_dataset_options, update_input_model_options, update_shared_cache_options, @@ -24,6 +25,7 @@ from olive.common.utils import StrEnumBase, set_nested_dict_value from olive.constants import Precision, QuantAlgorithm, precision_bits_from_precision from olive.package_config import OlivePackageConfig +from olive.telemetry.telemetry_events import action class ImplName(StrEnumBase): @@ -93,6 +95,7 @@ def register_subcommand(parser: ArgumentParser): add_shared_cache_options(sub_parser) add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=QuantizeCommand) def _check_data_name_arg(self, pinfo): @@ -203,6 +206,7 @@ def _get_run_config(self, tempdir: str) -> dict[str, Any]: self._customize_config(config) return config + @action def run(self): return self._run_workflow() diff --git a/olive/cli/run.py b/olive/cli/run.py index 57d643309c..8a99770068 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -4,7 +4,14 @@ # -------------------------------------------------------------------------- from argparse import ArgumentParser -from olive.cli.base import BaseOliveCLICommand, add_input_model_options, add_logging_options, get_input_model_config +from olive.cli.base import ( + BaseOliveCLICommand, + add_input_model_options, + add_logging_options, + add_telemetry_options, + get_input_model_config, +) +from olive.telemetry.telemetry_events import action class WorkflowRunCommand(BaseOliveCLICommand): @@ -36,8 +43,10 @@ def register_subcommand(parser: ArgumentParser): enable_onnx=True, required=False, ) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=WorkflowRunCommand) + @action def run(self): from olive.common.config_utils 
import load_config_file from olive.workflows import run as olive_run diff --git a/olive/cli/run_pass.py b/olive/cli/run_pass.py index 2f8afeddd3..60460f782c 100644 --- a/olive/cli/run_pass.py +++ b/olive/cli/run_pass.py @@ -12,11 +12,14 @@ add_input_model_options, add_logging_options, add_save_config_file_options, + add_telemetry_options, get_input_model_config, update_accelerator_options, ) +from olive.telemetry.telemetry_events import action +@action class RunPassCommand(BaseOliveCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): @@ -62,6 +65,7 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=RunPassCommand) def _get_run_config(self, tempdir: str) -> dict[str, Any]: diff --git a/olive/cli/session_params_tuning.py b/olive/cli/session_params_tuning.py index da8895ebf7..65d8d9ca05 100644 --- a/olive/cli/session_params_tuning.py +++ b/olive/cli/session_params_tuning.py @@ -14,11 +14,13 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_accelerator_options, update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value +from olive.telemetry.telemetry_events import action class SessionParamsTuningCommand(BaseOliveCLICommand): @@ -96,6 +98,7 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) add_save_config_file_options(sub_parser) add_shared_cache_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=SessionParamsTuningCommand) def _update_pass_config(self, default_pass_config) -> dict: @@ -135,6 +138,7 @@ def _get_run_config(self, tempdir) -> dict: update_shared_cache_options(config, self.args) return config + @action def run(self): workflow_output = self._run_workflow() diff --git a/olive/cli/shared_cache.py b/olive/cli/shared_cache.py 
index 40c72e460e..19d56ab917 100644 --- a/olive/cli/shared_cache.py +++ b/olive/cli/shared_cache.py @@ -4,8 +4,9 @@ # -------------------------------------------------------------------------- import logging -from olive.cli.base import BaseOliveCLICommand +from olive.cli.base import BaseOliveCLICommand, add_telemetry_options from olive.common.container_client_factory import AzureContainerClientFactory +from olive.telemetry.telemetry_events import action logger = logging.getLogger(__name__) @@ -47,8 +48,10 @@ def register_subcommand(parser): type=str, help="The model hash to remove from the shared cache.", ) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=SharedCacheCommand) + @action def run(self): container_client_factory = AzureContainerClientFactory(self.args.account, self.args.container) if self.args.delete: diff --git a/olive/telemetry/__init__.py b/olive/telemetry/__init__.py new file mode 100644 index 0000000000..862c45ce31 --- /dev/null +++ b/olive/telemetry/__init__.py @@ -0,0 +1,4 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- diff --git a/olive/telemetry/endpoint.b64 b/olive/telemetry/endpoint.b64 new file mode 100644 index 0000000000..657269e098 --- /dev/null +++ b/olive/telemetry/endpoint.b64 @@ -0,0 +1 @@ +aHR0cHM6Ly9tb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbS9PbmVDb2xsZWN0b3IvMS4w \ No newline at end of file diff --git a/olive/telemetry/headers.b64 b/olive/telemetry/headers.b64 new file mode 100644 index 0000000000..4a9b2dd686 --- /dev/null +++ b/olive/telemetry/headers.b64 @@ -0,0 +1 @@ +eyJ4LWFwaWtleSI6ICI5ZDVkZGFlYzYxZTI0NTY3Yjc4OGEyMGFlYTMyNDYzMS03MjM3ZDdjNi1lZTYxLTRjZmQtYmI3Yi01OTAzYTk3MmMyZTQtNzA0NyIsICJVc2VyLUFnZW50IjogIlB5dGhvbi8zIEh0dHBDbGllbnQiLCAiSG9zdCI6ICJtb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbSIsICJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24veC1qc29uLXN0cmVhbTsgY2hhcnNldD11dGYtOCIsICJzZGstdmVyc2lvbiI6ICJPVGVsLXB5dGhvbi0wLjEuMC4wIiwgIk5vUmVzcG9uc2VCb2R5IjogInRydWUifQ== \ No newline at end of file diff --git a/olive/telemetry/msft_log_exporter.py b/olive/telemetry/msft_log_exporter.py new file mode 100644 index 0000000000..f84ef67365 --- /dev/null +++ b/olive/telemetry/msft_log_exporter.py @@ -0,0 +1,200 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import base64 +import gzip +import hashlib +import hmac +import json +import platform +import random +import threading +import zlib +from collections.abc import Sequence +from datetime import datetime +from io import BytesIO +from pathlib import Path +from time import time +from typing import Optional + +import requests +from deviceid import get_device_id +from opentelemetry.exporter.otlp.proto.http import Compression +from opentelemetry.exporter.otlp.proto.http._common import ( + _is_retryable, +) +from opentelemetry.sdk._logs import LogData +from opentelemetry.sdk._logs.export import ( + LogExporter, + LogExportResult, +) + +from olive.version import __version__ as VERSION + +_MAX_RETRYS = 6 + + +class MSFTLogExporter(LogExporter): + def __init__( + self, + endpoint_file: str = Path(__file__).parent / "endpoint.b64", + headers_file: str = Path(__file__).parent / "headers.b64", + headers: Optional[dict[str, str]] = None, + timeout: Optional[float] = 10, + compression: Optional[Compression] = Compression.Deflate, + ): + self._shutdown_is_occuring = threading.Event() + + with open(Path(__file__).parent / "endpoint.b64", "rb") as f: + encoded_data = f.read() + + self._endpoint = base64.b64decode(encoded_data).decode() + self._timeout = timeout + self._compression = compression + self._session = requests.Session() + + with open(headers_file, "rb") as f: + encoded_data = f.read() + self._headers = json.loads(base64.b64decode(encoded_data).decode()) + self._iKey = f"o:{self._headers['x-apikey'].split('-')[0]}" + if headers: + self._headers.update(headers) + if self._compression is not Compression.NoCompression: + self._headers.update({"Content-Encoding": self._compression.value}) + self._session.headers.update(self._headers) + self._device_id = self._generate_encrypted_device_id() + self._system = platform.system().lower() + self._release = platform.release() + self._version = 
platform.version() + self._arch = platform.machine() + + self._shutdown = False + + def _generate_encrypted_device_id(self) -> str: + """Generate a FIPS-compliant encrypted device ID using HMAC-SHA256. + + This method uses HMAC-SHA256 which is FIPS 140-2 approved for cryptographic operations. + The device ID is encrypted using a key derived from the existing endpoint configuration + to ensure deterministic but secure device identification. + + Returns: + str: FIPS-compliant encrypted device ID (hex-encoded) + + """ + try: + # Get the raw device ID + raw_device_id = get_device_id() + + # Create a deterministic key from existing configuration + # Using the API key and endpoint as key material for HMAC + key_material = f"{self._headers.get('x-apikey', '')}{self._endpoint}".encode() + + # Use SHA256 to create a consistent 32-byte key + encryption_key = hashlib.sha256(key_material).digest() + + # Use HMAC-SHA256 to encrypt the device ID (FIPS 140-2 approved) + return hmac.new(encryption_key, raw_device_id.encode("utf-8"), hashlib.sha256).hexdigest() + + except Exception: + # Fallback to a consistent hash if anything fails + fallback_data = f"olive-telemetry-{self._iKey}".encode() + return hashlib.sha256(fallback_data).hexdigest() + + def _export(self, data: bytes, timeout_sec: Optional[float] = None): + if self._compression == Compression.Deflate: + compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate + compressed_data = compressor.compress(data) + compressed_data += compressor.flush() + data = compressed_data + elif self._compression == Compression.Gzip: + gzip_data = BytesIO() + with gzip.GzipFile(fileobj=gzip_data, mode="w") as gzip_stream: + gzip_stream.write(data) + data = gzip_data.getvalue() + elif self._compression == Compression.NoCompression: + pass + + if timeout_sec is None: + timeout_sec = self._timeout + + # By default, keep-alive is enabled in Session's request + # headers. 
Backends may choose to close the connection + # while a post happens which causes an unhandled + # exception. This try/except will retry the post on such exceptions + updated_headers = {**self._headers, "Content-Length": str(len(data))} + try: + resp = self._session.post( + url=self._endpoint, + data=data, + headers=updated_headers, + timeout=timeout_sec, + ) + except requests.exceptions.ConnectionError: + resp = self._session.post( + url=self._endpoint, + data=data, + headers=updated_headers, + timeout=timeout_sec, + ) + return resp + + def export(self, batch: Sequence[LogData]) -> LogExportResult: + if self._shutdown: + return LogExportResult.FAILURE + json_logs = [] + for log_data in batch: + log_record = log_data.log_record + data = { + k: v + for k, v in (log_record.attributes or {}).items() + if k not in {"code.file.path", "code.function.name", "code.line.number"} + } + data["deviceID"] = self._device_id + data["os"] = {"name": self._system, "version": self._version, "release": self._release, "arch": self._arch} + data["version"] = VERSION + log_entry = { + "ver": "4.0", + "name": log_record.body, + "time": datetime.fromtimestamp(log_record.timestamp / 1e9).isoformat() + "Z" + if log_record.timestamp + else None, + "iKey": self._iKey, + "data": data, + } + json_logs.append(log_entry) + + deadline_sec = time() + self._timeout + shutdown = False + for log_entry in json_logs: + for retry_num in range(_MAX_RETRYS): + data = json.dumps(log_entry, ensure_ascii=False).encode("utf-8") + resp = self._export(data, deadline_sec - time()) + if resp.ok: + break + # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. 
+ backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + if ( + not _is_retryable(resp) + or retry_num + 1 == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + or self._shutdown + ): + return LogExportResult.FAILURE + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + break + if shutdown: + break + return LogExportResult.SUCCESS + + def force_flush(self, timeout_millis: float = 10_000) -> bool: + """Nothing is buffered in this exporter, so this method does nothing.""" + return True + + def shutdown(self): + if self._shutdown: + return + self._shutdown = True + self._shutdown_is_occuring.set() + self._session.close() diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py new file mode 100644 index 0000000000..4a0805bc8b --- /dev/null +++ b/olive/telemetry/telemetry_events.py @@ -0,0 +1,83 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import functools +import inspect +import traceback +from datetime import datetime + +from olive.telemetry.telemetry_logger import TelemetryLogger + + +def action(func): + t = TelemetryLogger() + + @functools.wraps(func) + def wrapper(*args, **kwargs): + stack = inspect.stack() + caller_frame = stack[1] + caller_module = inspect.getmodule(caller_frame[0]) + called_from = caller_module.__name__ + + if caller_module is None: + called_from = "Interactive" + elif caller_module.__name__ == "__main__": + called_from = "Script" + + success = False + error = None + start_time = datetime.now() + try: + result = func(*args, **kwargs) + success = True + except Exception as ex: + result = None + error = ex + duration_ms = int((datetime.now() - start_time).total_seconds() * 1000) + action_name = args[0].__class__.__name__ if args else "Invalid" + if action_name.endswith("Command"): + action_name = action_name[: -len("Command")] + t.log( + "OliveAction", + { + "action": action_name, + "caller": called_from, + "actionTime": start_time, + "timeMs": duration_ms, + "success": success, + }, + ) + + if error: + t.log( + "OliveError", + { + "action": action_name, + "caller": called_from, + "errorType": type(error).__name__, + "error": _format_exception_msg(error), + }, + ) + raise error + return result + + return wrapper + + +def _format_exception_msg(exc: Exception) -> str: + folder = "Olive" + file_line = 'File "' + exc = traceback.format_exception(exc, limit=5) + lines = [] + for line in exc: + line_trunc = line.strip() + if line_trunc.startswith(file_line) and folder in line_trunc: + idx = line_trunc.find(folder) + if idx != -1: + line_trunc = line_trunc[idx + len(folder) :] + elif line_trunc.startswith(file_line): + idx = line_trunc[len(file_line) :].find('"') + line_trunc = line_trunc[idx + len(file_line) :] + lines.append(line_trunc) + return "\n".join(lines) diff --git 
a/olive/telemetry/telemetry_logger.py b/olive/telemetry/telemetry_logger.py new file mode 100644 index 0000000000..d8044a9ba1 --- /dev/null +++ b/olive/telemetry/telemetry_logger.py @@ -0,0 +1,60 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging +from typing import Any + +from opentelemetry._logs import set_logger_provider +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.resources import Resource + +from olive.telemetry.msft_log_exporter import MSFTLogExporter + + +class TelemetryLogger: + _instance = None # Class-level attribute to store the single instance + _logger: logging.Logger = None + _logger_provider: LoggerProvider = None + + def __new__(cls, *args, **kwargs): + # Check if an instance already exists + if cls._instance is None: + # If not, create a new instance and store it in _instance + cls._instance = super().__new__(cls) + + try: + exporter = MSFTLogExporter() + cls._logger_provider = LoggerProvider( + resource=Resource.create( + { + "service.name": "olive-telemetry", + "service.instance.id": "olive-telemetry-instance", + } + ), + ) + set_logger_provider(cls._logger_provider) + cls._logger_provider.add_log_record_processor(BatchLogRecordProcessor(exporter)) + handler = LoggingHandler(level=logging.INFO, logger_provider=cls._logger_provider) + + logger = logging.getLogger("olive.telemetry") + logger.propagate = False + logger.setLevel(logging.INFO) + logger.addHandler(handler) + cls._logger = logger + except Exception: + pass + return cls._instance + + def __init__(self): + pass + + def log(self, event_name: str, information: dict[str, Any]): + self._logger.info(event_name, extra=information) + + def disable_telemetry(self): + 
self._logger.disabled = True + + def shutdown(self): + self._logger_provider.shutdown() diff --git a/olive/version.py b/olive/version.py new file mode 100644 index 0000000000..bdda88d0fc --- /dev/null +++ b/olive/version.py @@ -0,0 +1 @@ +__version__ = "0.10.0.dev0" diff --git a/requirements.txt b/requirements.txt index d013000357..15c56a6a71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,11 @@ numpy onnx onnx_ir>=0.1.2 onnxscript>=0.3.0 +opentelemetry-exporter-otlp-proto-http>=1.36.0 +opentelemetry-sdk>=1.36.0 optuna pandas +py-deviceid>=0.1.1 pydantic pyyaml torch diff --git a/setup.py b/setup.py index 1c5ca72475..12708a28a4 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,8 @@ from setuptools import find_packages, setup +from olive.version import __version__ as VERSION + def read(rel_path): here = os.path.abspath(os.path.dirname(__file__)) @@ -14,14 +16,6 @@ def read(rel_path): return fp.read() -def get_version(rel_path): - for line in read(rel_path).splitlines(): - if line.startswith("__version__"): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - raise RuntimeError("Unable to find version string.") - - def get_extra_deps(rel_path): here = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(here, rel_path)) as fp: @@ -30,7 +24,6 @@ def get_extra_deps(rel_path): # use techniques described at https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ # Don't use technique 6 since it needs extra dependencies. 
-VERSION = get_version("olive/__init__.py") EXTRAS = get_extra_deps("olive/olive_config.json") with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")) as req_file: @@ -50,7 +43,6 @@ def get_extra_deps(rel_path): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", From ecb2cecedcb80071ad08315a1e21aa09b2f2b223 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 4 Sep 2025 18:26:47 +0000 Subject: [PATCH 02/31] Fmt --- olive/cli/convert_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olive/cli/convert_adapters.py b/olive/cli/convert_adapters.py index 785d97c552..9ea8fe6162 100644 --- a/olive/cli/convert_adapters.py +++ b/olive/cli/convert_adapters.py @@ -8,10 +8,10 @@ from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights +from olive.telemetry.telemetry_events import action if TYPE_CHECKING: from numpy.typing import NDArray -from olive.telemetry.telemetry_events import action class ConvertAdaptersCommand(BaseOliveCLICommand): From 19fd0567b1c943767c902482f6bdc8fc82b9286b Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 05:01:27 -0500 Subject: [PATCH 03/31] Refactor and format --- olive/cli/auto_opt.py | 2 +- olive/cli/capture_onnx.py | 2 +- olive/cli/configure_qualcomm_sdk.py | 2 +- olive/cli/convert_adapters.py | 2 +- olive/cli/extract_adapters.py | 2 +- olive/cli/finetune.py | 2 +- olive/cli/generate_adapter.py | 2 +- olive/cli/generate_cost_model.py | 2 +- olive/cli/launcher.py | 6 +- olive/cli/optimize.py | 2 +- olive/cli/quantize.py | 2 +- olive/cli/run.py | 2 +- olive/cli/run_pass.py | 2 +- 
olive/cli/session_params_tuning.py | 2 +- olive/cli/shared_cache.py | 2 +- olive/telemetry/constants.py | 4 ++ olive/telemetry/endpoint.b64 | 1 - olive/telemetry/headers.b64 | 1 - olive/telemetry/msft_log_exporter.py | 13 +--- olive/telemetry/telemetry.py | 44 ++++++++++++ olive/telemetry/telemetry_events.py | 104 ++++++++------------------- olive/telemetry/telemetry_logger.py | 13 ++-- olive/telemetry/utils.py | 23 ++++++ 23 files changed, 128 insertions(+), 109 deletions(-) create mode 100644 olive/telemetry/constants.py delete mode 100644 olive/telemetry/endpoint.b64 delete mode 100644 olive/telemetry/headers.b64 create mode 100644 olive/telemetry/telemetry.py create mode 100644 olive/telemetry/utils.py diff --git a/olive/cli/auto_opt.py b/olive/cli/auto_opt.py index ac1b71da1c..6db4bb79b7 100644 --- a/olive/cli/auto_opt.py +++ b/olive/cli/auto_opt.py @@ -23,7 +23,7 @@ from olive.constants import Precision from olive.hardware.constants import ExecutionProvider from olive.package_config import OlivePackageConfig -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class AutoOptCommand(BaseOliveCLICommand): diff --git a/olive/cli/capture_onnx.py b/olive/cli/capture_onnx.py index 26dc414413..049151d5bf 100644 --- a/olive/cli/capture_onnx.py +++ b/olive/cli/capture_onnx.py @@ -17,7 +17,7 @@ update_shared_cache_options, ) from olive.common.utils import IntEnumBase, set_nested_dict_value -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class ModelBuilderAccuracyLevel(IntEnumBase): diff --git a/olive/cli/configure_qualcomm_sdk.py b/olive/cli/configure_qualcomm_sdk.py index 0bb6f60a75..4b7d0c6448 100644 --- a/olive/cli/configure_qualcomm_sdk.py +++ b/olive/cli/configure_qualcomm_sdk.py @@ -5,7 +5,7 @@ from argparse import ArgumentParser from olive.cli.base import BaseOliveCLICommand, add_telemetry_options -from olive.telemetry.telemetry_events import action +from 
olive.telemetry.telemetry import action class ConfigureQualcommSDKCommand(BaseOliveCLICommand): diff --git a/olive/cli/convert_adapters.py b/olive/cli/convert_adapters.py index 9ea8fe6162..5f6fd323ab 100644 --- a/olive/cli/convert_adapters.py +++ b/olive/cli/convert_adapters.py @@ -8,7 +8,7 @@ from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action if TYPE_CHECKING: from numpy.typing import NDArray diff --git a/olive/cli/extract_adapters.py b/olive/cli/extract_adapters.py index 460ada32a7..5f3399fc39 100644 --- a/olive/cli/extract_adapters.py +++ b/olive/cli/extract_adapters.py @@ -8,7 +8,7 @@ from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class ExtractAdaptersCommand(BaseOliveCLICommand): diff --git a/olive/cli/finetune.py b/olive/cli/finetune.py index c2cb8ae207..efc7227a89 100644 --- a/olive/cli/finetune.py +++ b/olive/cli/finetune.py @@ -19,7 +19,7 @@ update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class FineTuneCommand(BaseOliveCLICommand): diff --git a/olive/cli/generate_adapter.py b/olive/cli/generate_adapter.py index 7549db266f..6e10315064 100644 --- a/olive/cli/generate_adapter.py +++ b/olive/cli/generate_adapter.py @@ -17,7 +17,7 @@ ) from olive.common.utils import WeightsFileFormat, set_nested_dict_value from olive.passes.onnx.common import AdapterType -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class GenerateAdapterCommand(BaseOliveCLICommand): diff --git 
a/olive/cli/generate_cost_model.py b/olive/cli/generate_cost_model.py index 01f8bcaa24..e3395aa04b 100644 --- a/olive/cli/generate_cost_model.py +++ b/olive/cli/generate_cost_model.py @@ -7,7 +7,7 @@ from olive.cli.base import BaseOliveCLICommand, add_input_model_options, add_telemetry_options, get_input_model_config from olive.model import ModelConfig -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action logger = logging.getLogger(__name__) diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index 189f7d545a..6a2f737049 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -58,9 +58,9 @@ def main(raw_args=None, called_as_console_script: bool = True): args, unknown_args = parser.parse_known_args(raw_args) - t = TelemetryLogger() + logger = TelemetryLogger() if args.disable_telemetry: - t.disable_telemetry() + logger.disable_telemetry() if not hasattr(args, "func"): parser.print_help() @@ -69,7 +69,7 @@ def main(raw_args=None, called_as_console_script: bool = True): # Run the command service = args.func(parser, args, unknown_args) service.run() - t.shutdown() + logger.shutdown() def legacy_call(deprecated_module: str, command_name: str, *args): diff --git a/olive/cli/optimize.py b/olive/cli/optimize.py index 105d7fe5d3..2f4d7c04cb 100644 --- a/olive/cli/optimize.py +++ b/olive/cli/optimize.py @@ -21,7 +21,7 @@ from olive.common.utils import set_nested_dict_value from olive.constants import Precision, precision_bits_from_precision from olive.hardware.constants import ExecutionProvider -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class OptimizeCommand(BaseOliveCLICommand): diff --git a/olive/cli/quantize.py b/olive/cli/quantize.py index 2cc1ed496a..6ebe00f2f4 100644 --- a/olive/cli/quantize.py +++ b/olive/cli/quantize.py @@ -25,7 +25,7 @@ from olive.common.utils import StrEnumBase, set_nested_dict_value from olive.constants import Precision, 
QuantAlgorithm, precision_bits_from_precision from olive.package_config import OlivePackageConfig -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class ImplName(StrEnumBase): diff --git a/olive/cli/run.py b/olive/cli/run.py index 8a99770068..0b3e40e456 100644 --- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -11,7 +11,7 @@ add_telemetry_options, get_input_model_config, ) -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class WorkflowRunCommand(BaseOliveCLICommand): diff --git a/olive/cli/run_pass.py b/olive/cli/run_pass.py index 60460f782c..ed1637adc6 100644 --- a/olive/cli/run_pass.py +++ b/olive/cli/run_pass.py @@ -16,7 +16,7 @@ get_input_model_config, update_accelerator_options, ) -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action @action diff --git a/olive/cli/session_params_tuning.py b/olive/cli/session_params_tuning.py index 65d8d9ca05..da9bfc2408 100644 --- a/olive/cli/session_params_tuning.py +++ b/olive/cli/session_params_tuning.py @@ -20,7 +20,7 @@ update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action class SessionParamsTuningCommand(BaseOliveCLICommand): diff --git a/olive/cli/shared_cache.py b/olive/cli/shared_cache.py index 19d56ab917..8fe874a708 100644 --- a/olive/cli/shared_cache.py +++ b/olive/cli/shared_cache.py @@ -6,7 +6,7 @@ from olive.cli.base import BaseOliveCLICommand, add_telemetry_options from olive.common.container_client_factory import AzureContainerClientFactory -from olive.telemetry.telemetry_events import action +from olive.telemetry.telemetry import action logger = logging.getLogger(__name__) diff --git a/olive/telemetry/constants.py b/olive/telemetry/constants.py new file mode 100644 index 0000000000..81ae3e71e4 --- /dev/null +++ b/olive/telemetry/constants.py 
@@ -0,0 +1,4 @@ +############# Foundry Local OneCollector Instrumentation ############# + +_ENDPOINT = "aHR0cHM6Ly9tb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbS9PbmVDb2xsZWN0b3IvMS4w" +_HEADERS = "eyJ4LWFwaWtleSI6ICI5ZDVkZGFlYzYxZTI0NTY3Yjc4OGEyMGFlYTMyNDYzMS03MjM3ZDdjNi1lZTYxLTRjZmQtYmI3Yi01OTAzYTk3MmMyZTQtNzA0NyIsICJVc2VyLUFnZW50IjogIlB5dGhvbi8zIEh0dHBDbGllbnQiLCAiSG9zdCI6ICJtb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbSIsICJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24veC1qc29uLXN0cmVhbTsgY2hhcnNldD11dGYtOCIsICJzZGstdmVyc2lvbiI6ICJPVGVsLXB5dGhvbi0wLjEuMC4wIiwgIk5vUmVzcG9uc2VCb2R5IjogInRydWUifQ==" diff --git a/olive/telemetry/endpoint.b64 b/olive/telemetry/endpoint.b64 deleted file mode 100644 index 657269e098..0000000000 --- a/olive/telemetry/endpoint.b64 +++ /dev/null @@ -1 +0,0 @@ -aHR0cHM6Ly9tb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbS9PbmVDb2xsZWN0b3IvMS4w \ No newline at end of file diff --git a/olive/telemetry/headers.b64 b/olive/telemetry/headers.b64 deleted file mode 100644 index 4a9b2dd686..0000000000 --- a/olive/telemetry/headers.b64 +++ /dev/null @@ -1 +0,0 @@ -eyJ4LWFwaWtleSI6ICI5ZDVkZGFlYzYxZTI0NTY3Yjc4OGEyMGFlYTMyNDYzMS03MjM3ZDdjNi1lZTYxLTRjZmQtYmI3Yi01OTAzYTk3MmMyZTQtNzA0NyIsICJVc2VyLUFnZW50IjogIlB5dGhvbi8zIEh0dHBDbGllbnQiLCAiSG9zdCI6ICJtb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbSIsICJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24veC1qc29uLXN0cmVhbTsgY2hhcnNldD11dGYtOCIsICJzZGstdmVyc2lvbiI6ICJPVGVsLXB5dGhvbi0wLjEuMC4wIiwgIk5vUmVzcG9uc2VCb2R5IjogInRydWUifQ== \ No newline at end of file diff --git a/olive/telemetry/msft_log_exporter.py b/olive/telemetry/msft_log_exporter.py index f84ef67365..7939d7aadf 100644 --- a/olive/telemetry/msft_log_exporter.py +++ b/olive/telemetry/msft_log_exporter.py @@ -14,7 +14,6 @@ from collections.abc import Sequence from datetime import datetime from io import BytesIO -from pathlib import Path from time import time from typing import Optional @@ -30,6 +29,7 @@ LogExportResult, ) +from olive.telemetry.constants import _ENDPOINT, _HEADERS from 
olive.version import __version__ as VERSION _MAX_RETRYS = 6 @@ -38,25 +38,18 @@ class MSFTLogExporter(LogExporter): def __init__( self, - endpoint_file: str = Path(__file__).parent / "endpoint.b64", - headers_file: str = Path(__file__).parent / "headers.b64", headers: Optional[dict[str, str]] = None, timeout: Optional[float] = 10, compression: Optional[Compression] = Compression.Deflate, ): self._shutdown_is_occuring = threading.Event() - with open(Path(__file__).parent / "endpoint.b64", "rb") as f: - encoded_data = f.read() - - self._endpoint = base64.b64decode(encoded_data).decode() + self._endpoint = base64.b64decode(_ENDPOINT).decode() self._timeout = timeout self._compression = compression self._session = requests.Session() - with open(headers_file, "rb") as f: - encoded_data = f.read() - self._headers = json.loads(base64.b64decode(encoded_data).decode()) + self._headers = json.loads(base64.b64decode(_HEADERS).decode()) self._iKey = f"o:{self._headers['x-apikey'].split('-')[0]}" if headers: self._headers.update(headers) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py new file mode 100644 index 0000000000..6de3ac2cd6 --- /dev/null +++ b/olive/telemetry/telemetry.py @@ -0,0 +1,44 @@ +import functools +import inspect +from datetime import datetime + +from olive.telemetry.telemetry_events import log_action, log_error + + +def action(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + stack = inspect.stack() + caller_frame = stack[1] + caller_module = inspect.getmodule(caller_frame[0]) + called_from = caller_module.__name__ + + if caller_module is None: + called_from = "Interactive" + elif caller_module.__name__ == "__main__": + called_from = "Script" + + success = False + error = None + start_time = datetime.now() + try: + result = func(*args, **kwargs) + success = True + except Exception as ex: + result = None + error = ex + duration_ms = int((datetime.now() - start_time).total_seconds() * 1000) + action_name = 
args[0].__class__.__name__ if args else "Invalid" + + if action_name.endswith("Command"): + action_name = action_name[: -len("Command")] + + log_action(action_name, called_from, start_time, duration_ms, success) + + if error: + log_error(action_name, called_from, error) + raise error + + return result + + return wrapper diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 4a0805bc8b..979be16f17 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -2,82 +2,34 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -import functools -import inspect -import traceback from datetime import datetime from olive.telemetry.telemetry_logger import TelemetryLogger - - -def action(func): - t = TelemetryLogger() - - @functools.wraps(func) - def wrapper(*args, **kwargs): - stack = inspect.stack() - caller_frame = stack[1] - caller_module = inspect.getmodule(caller_frame[0]) - called_from = caller_module.__name__ - - if caller_module is None: - called_from = "Interactive" - elif caller_module.__name__ == "__main__": - called_from = "Script" - - success = False - error = None - start_time = datetime.now() - try: - result = func(*args, **kwargs) - success = True - except Exception as ex: - result = None - error = ex - duration_ms = int((datetime.now() - start_time).total_seconds() * 1000) - action_name = args[0].__class__.__name__ if args else "Invalid" - if action_name.endswith("Command"): - action_name = action_name[: -len("Command")] - t.log( - "OliveAction", - { - "action": action_name, - "caller": called_from, - "actionTime": start_time, - "timeMs": duration_ms, - "success": success, - }, - ) - - if error: - t.log( - "OliveError", - { - "action": action_name, - "caller": called_from, - "errorType": type(error).__name__, - "error": _format_exception_msg(error), - }, - ) - raise 
error - return result - - return wrapper - - -def _format_exception_msg(exc: Exception) -> str: - folder = "Olive" - file_line = 'File "' - exc = traceback.format_exception(exc, limit=5) - lines = [] - for line in exc: - line_trunc = line.strip() - if line_trunc.startswith(file_line) and folder in line_trunc: - idx = line_trunc.find(folder) - if idx != -1: - line_trunc = line_trunc[idx + len(folder) :] - elif line_trunc.startswith(file_line): - idx = line_trunc[len(file_line) :].find('"') - line_trunc = line_trunc[idx + len(file_line) :] - lines.append(line_trunc) - return "\n".join(lines) +from olive.telemetry.utils import _format_exception_msg + +logger = TelemetryLogger() + + +def log_action(action_name: str, called_from: str, start_time: datetime, duration_ms: float, success: bool): + logger.log( + "OliveAction", + { + "action_name": action_name, + "called_from": called_from, + "start_time": start_time, + "duration_ms": duration_ms, + "success": success, + }, + ) + + +def log_error(action_name: str, called_from: str, error: Exception): + logger.log( + "OliveError", + { + "action_name": action_name, + "called_from": called_from, + "errorType": type(error).__name__, + "error": _format_exception_msg(error), + }, + ) diff --git a/olive/telemetry/telemetry_logger.py b/olive/telemetry/telemetry_logger.py index d8044a9ba1..006a21fab3 100644 --- a/olive/telemetry/telemetry_logger.py +++ b/olive/telemetry/telemetry_logger.py @@ -44,17 +44,22 @@ def __new__(cls, *args, **kwargs): logger.addHandler(handler) cls._logger = logger except Exception: - pass + # If any error occurs during initialization, we will not set up the logger and will silently fail. 
+ cls._logger = None + cls._logger_provider = None return cls._instance def __init__(self): pass def log(self, event_name: str, information: dict[str, Any]): - self._logger.info(event_name, extra=information) + if self._logger: # in case the logger was not initialized properly + self._logger.info(event_name, extra=information) def disable_telemetry(self): - self._logger.disabled = True + if self._logger: # in case the logger was not initialized properly + self._logger.disabled = True def shutdown(self): - self._logger_provider.shutdown() + if self._logger_provider: # in case the logger provider was not initialized properly + self._logger_provider.shutdown() diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py new file mode 100644 index 0000000000..990508282a --- /dev/null +++ b/olive/telemetry/utils.py @@ -0,0 +1,23 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import traceback + + +def _format_exception_msg(exc: Exception) -> str: + folder = "Olive" + file_line = 'File "' + exc = traceback.format_exception(exc, limit=5) + lines = [] + for line in exc: + line_trunc = line.strip() + if line_trunc.startswith(file_line) and folder in line_trunc: + idx = line_trunc.find(folder) + if idx != -1: + line_trunc = line_trunc[idx + len(folder) :] + elif line_trunc.startswith(file_line): + idx = line_trunc[len(file_line) :].find('"') + line_trunc = line_trunc[idx + len(file_line) :] + lines.append(line_trunc) + return "\n".join(lines) From 66dfa49117b580b8ec9677a051ef5e30a14c0004 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 14:06:49 -0500 Subject: [PATCH 04/31] Refactor --- olive/telemetry/telemetry.py | 13 ++++++++----- olive/telemetry/telemetry_events.py | 7 +++---- olive/telemetry/utils.py | 6 +++--- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 6de3ac2cd6..8e7715bdab 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -3,6 +3,7 @@ from datetime import datetime from olive.telemetry.telemetry_events import log_action, log_error +from olive.telemetry.utils import _format_exception_msg def action(func): @@ -19,14 +20,14 @@ def wrapper(*args, **kwargs): called_from = "Script" success = False - error = None + exception = None start_time = datetime.now() try: result = func(*args, **kwargs) success = True except Exception as ex: result = None - error = ex + exception = ex duration_ms = int((datetime.now() - start_time).total_seconds() * 1000) action_name = args[0].__class__.__name__ if args else "Invalid" @@ -35,9 +36,11 @@ def wrapper(*args, **kwargs): log_action(action_name, called_from, start_time, duration_ms, success) - if error: - log_error(action_name, called_from, error) - raise error + if exception: + exception_type = 
type(exception).__name__ + exception_message = _format_exception_msg(exception) + log_error(action_name, called_from, exception_type, exception_message) + raise exception return result diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 979be16f17..11471a70ee 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -5,7 +5,6 @@ from datetime import datetime from olive.telemetry.telemetry_logger import TelemetryLogger -from olive.telemetry.utils import _format_exception_msg logger = TelemetryLogger() @@ -23,13 +22,13 @@ def log_action(action_name: str, called_from: str, start_time: datetime, duratio ) -def log_error(action_name: str, called_from: str, error: Exception): +def log_error(action_name: str, called_from: str, exception_type: str, exception_message: str): logger.log( "OliveError", { "action_name": action_name, "called_from": called_from, - "errorType": type(error).__name__, - "error": _format_exception_msg(error), + "exception_type": exception_type, + "exception_message": exception_message, }, ) diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 990508282a..2560675cdd 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -5,12 +5,12 @@ import traceback -def _format_exception_msg(exc: Exception) -> str: +def _format_exception_msg(ex: Exception) -> str: folder = "Olive" file_line = 'File "' - exc = traceback.format_exception(exc, limit=5) + ex = traceback.format_exception(ex, limit=5) lines = [] - for line in exc: + for line in ex: line_trunc = line.strip() if line_trunc.startswith(file_line) and folder in line_trunc: idx = line_trunc.find(folder) From 5e5bf4723e834310ce95d8106864dcb8d4d9b731 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 15:14:27 -0500 Subject: [PATCH 05/31] Separate out library --- olive/cli/launcher.py | 2 +- olive/telemetry/{ => library}/msft_log_exporter.py | 0 olive/telemetry/{ => 
library}/telemetry_logger.py | 2 +- olive/telemetry/telemetry_events.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename olive/telemetry/{ => library}/msft_log_exporter.py (100%) rename olive/telemetry/{ => library}/telemetry_logger.py (97%) diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index 6a2f737049..6276417060 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -20,7 +20,7 @@ from olive.cli.run_pass import RunPassCommand from olive.cli.session_params_tuning import SessionParamsTuningCommand from olive.cli.shared_cache import SharedCacheCommand -from olive.telemetry.telemetry_logger import TelemetryLogger +from olive.telemetry.library.telemetry_logger import TelemetryLogger def get_cli_parser(called_as_console_script: bool = True) -> ArgumentParser: diff --git a/olive/telemetry/msft_log_exporter.py b/olive/telemetry/library/msft_log_exporter.py similarity index 100% rename from olive/telemetry/msft_log_exporter.py rename to olive/telemetry/library/msft_log_exporter.py diff --git a/olive/telemetry/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py similarity index 97% rename from olive/telemetry/telemetry_logger.py rename to olive/telemetry/library/telemetry_logger.py index 006a21fab3..bd64e55f4d 100644 --- a/olive/telemetry/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -10,7 +10,7 @@ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource -from olive.telemetry.msft_log_exporter import MSFTLogExporter +from olive.telemetry.library.msft_log_exporter import MSFTLogExporter class TelemetryLogger: diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 11471a70ee..3af72753a2 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- from datetime import datetime 
-from olive.telemetry.telemetry_logger import TelemetryLogger +from olive.telemetry.lib.telemetry_logger import TelemetryLogger logger = TelemetryLogger() From 77e02d6f9de701201fd262e752176967cee6e8ba Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 19:13:02 -0500 Subject: [PATCH 06/31] Refactor --- olive/telemetry/telemetry.py | 30 ++++++++++++++++++++++++++++- olive/telemetry/telemetry_events.py | 3 +-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 8e7715bdab..6fc05d8c57 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -1,11 +1,39 @@ import functools import inspect from datetime import datetime +from typing import Any from olive.telemetry.telemetry_events import log_action, log_error from olive.telemetry.utils import _format_exception_msg +# For more complex tracking scenarios +class TelemetryContext: + def __init__(self, event_name: str): + self.event_name = event_name + self.start_time = datetime.now() + self.metadata = {} + + def add_metadata(self, key: str, value: Any): + self.metadata[key] = value + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Send telemetry with all collected metadata + log_action( + self.event_name, + "ContextManager", + self.start_time, + int((datetime.now() - self.start_time).total_seconds() * 1000), + exc_type is None, + ) + + if exc_type is not None: + log_error("ContextManager", self.start_time, exc_val, _format_exception_msg(exc_tb)) + + def action(func): @functools.wraps(func) def wrapper(*args, **kwargs): @@ -39,7 +67,7 @@ def wrapper(*args, **kwargs): if exception: exception_type = type(exception).__name__ exception_message = _format_exception_msg(exception) - log_error(action_name, called_from, exception_type, exception_message) + log_error(called_from, exception_type, exception_message) raise exception return result diff --git 
a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 3af72753a2..17be54f0cc 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -22,11 +22,10 @@ def log_action(action_name: str, called_from: str, start_time: datetime, duratio ) -def log_error(action_name: str, called_from: str, exception_type: str, exception_message: str): +def log_error(called_from: str, exception_type: str, exception_message: str): logger.log( "OliveError", { - "action_name": action_name, "called_from": called_from, "exception_type": exception_type, "exception_message": exception_message, From b12fd3ccb7b36887b50abf6c6f233f2cadd8d721 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 20:49:11 -0500 Subject: [PATCH 07/31] Add "called from" --- olive/telemetry/telemetry.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 6fc05d8c57..40f59325a4 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -8,11 +8,12 @@ # For more complex tracking scenarios -class TelemetryContext: - def __init__(self, event_name: str): +class ActionContext: + def __init__(self, event_name: str, called_from: str = "ContextManager"): self.event_name = event_name self.start_time = datetime.now() self.metadata = {} + self.called_from = called_from def add_metadata(self, key: str, value: Any): self.metadata[key] = value @@ -24,14 +25,14 @@ def __exit__(self, exc_type, exc_val, exc_tb): # Send telemetry with all collected metadata log_action( self.event_name, - "ContextManager", + self.called_from, self.start_time, int((datetime.now() - self.start_time).total_seconds() * 1000), exc_type is None, ) if exc_type is not None: - log_error("ContextManager", self.start_time, exc_val, _format_exception_msg(exc_tb)) + log_error(self.called_from, self.start_time, exc_val, _format_exception_msg(exc_tb)) def action(func): From 
043221e9e5efc9868df45195b8c1de453ca1534b Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 7 Sep 2025 20:54:19 -0500 Subject: [PATCH 08/31] Add init.py --- olive/telemetry/library/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 olive/telemetry/library/__init__.py diff --git a/olive/telemetry/library/__init__.py b/olive/telemetry/library/__init__.py new file mode 100644 index 0000000000..862c45ce31 --- /dev/null +++ b/olive/telemetry/library/__init__.py @@ -0,0 +1,4 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- From 064dae433e38f2985b88cba5259eaee1c47be07a Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 8 Sep 2025 11:34:59 -0500 Subject: [PATCH 09/31] Fix --- olive/telemetry/library/msft_log_exporter.py | 47 ++++++++------------ olive/telemetry/library/telemetry_logger.py | 10 +++-- olive/telemetry/telemetry_events.py | 2 +- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/olive/telemetry/library/msft_log_exporter.py b/olive/telemetry/library/msft_log_exporter.py index 7939d7aadf..252c0bf9ad 100644 --- a/olive/telemetry/library/msft_log_exporter.py +++ b/olive/telemetry/library/msft_log_exporter.py @@ -5,7 +5,6 @@ import base64 import gzip import hashlib -import hmac import json import platform import random @@ -15,7 +14,7 @@ from datetime import datetime from io import BytesIO from time import time -from typing import Optional +from typing import Any, Optional import requests from deviceid import get_device_id @@ -56,11 +55,16 @@ def __init__( if self._compression is not Compression.NoCompression: self._headers.update({"Content-Encoding": self._compression.value}) self._session.headers.update(self._headers) - self._device_id = self._generate_encrypted_device_id() - self._system = platform.system().lower() 
- self._release = platform.release() - self._version = platform.version() - self._arch = platform.machine() + self._metadata = { + "device_id": self._generate_encrypted_device_id(), + "os": { + "name": platform.system().lower(), + "version": platform.version(), + "release": platform.release(), + "arch": platform.machine(), + }, + "version": VERSION, + } self._shutdown = False @@ -75,24 +79,10 @@ def _generate_encrypted_device_id(self) -> str: str: FIPS-compliant encrypted device ID (hex-encoded) """ - try: - # Get the raw device ID - raw_device_id = get_device_id() - - # Create a deterministic key from existing configuration - # Using the API key and endpoint as key material for HMAC - key_material = f"{self._headers.get('x-apikey', '')}{self._endpoint}".encode() - - # Use SHA256 to create a consistent 32-byte key - encryption_key = hashlib.sha256(key_material).digest() - - # Use HMAC-SHA256 to encrypt the device ID (FIPS 140-2 approved) - return hmac.new(encryption_key, raw_device_id.encode("utf-8"), hashlib.sha256).hexdigest() - - except Exception: - # Fallback to a consistent hash if anything fails - fallback_data = f"olive-telemetry-{self._iKey}".encode() - return hashlib.sha256(fallback_data).hexdigest() + # Get the raw device ID + raw_device_id = get_device_id().encode("utf-8") + # Use SHA256 to encrypt and use Base64 encoding + return base64.b64encode(hashlib.sha256(raw_device_id).digest()).decode("utf-8") def _export(self, data: bytes, timeout_sec: Optional[float] = None): if self._compression == Compression.Deflate: @@ -132,6 +122,9 @@ def _export(self, data: bytes, timeout_sec: Optional[float] = None): ) return resp + def add_metadata(self, metadata: dict[str, Any]): + self._metadata.update(metadata) + def export(self, batch: Sequence[LogData]) -> LogExportResult: if self._shutdown: return LogExportResult.FAILURE @@ -143,9 +136,7 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: for k, v in (log_record.attributes or {}).items() if k not in 
{"code.file.path", "code.function.name", "code.line.number"} } - data["deviceID"] = self._device_id - data["os"] = {"name": self._system, "version": self._version, "release": self._release, "arch": self._arch} - data["version"] = VERSION + data.update(self._metadata) log_entry = { "ver": "4.0", "name": log_record.body, diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index bd64e55f4d..ea296403c3 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -7,7 +7,7 @@ from opentelemetry._logs import set_logger_provider from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler -from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor, LogExporter from opentelemetry.sdk.resources import Resource from olive.telemetry.library.msft_log_exporter import MSFTLogExporter @@ -16,6 +16,7 @@ class TelemetryLogger: _instance = None # Class-level attribute to store the single instance _logger: logging.Logger = None + _logger_exporter: LogExporter = None _logger_provider: LoggerProvider = None def __new__(cls, *args, **kwargs): @@ -25,7 +26,7 @@ def __new__(cls, *args, **kwargs): cls._instance = super().__new__(cls) try: - exporter = MSFTLogExporter() + cls._logger_exporter = MSFTLogExporter() cls._logger_provider = LoggerProvider( resource=Resource.create( { @@ -35,7 +36,7 @@ def __new__(cls, *args, **kwargs): ), ) set_logger_provider(cls._logger_provider) - cls._logger_provider.add_log_record_processor(BatchLogRecordProcessor(exporter)) + cls._logger_provider.add_log_record_processor(BatchLogRecordProcessor(cls._logger_exporter)) handler = LoggingHandler(level=logging.INFO, logger_provider=cls._logger_provider) logger = logging.getLogger("olive.telemetry") @@ -52,6 +53,9 @@ def __new__(cls, *args, **kwargs): def __init__(self): pass + def add_metadata(self, metadata: dict[str, Any]): + 
self._logger_exporter.add_metadata(metadata) + def log(self, event_name: str, information: dict[str, Any]): if self._logger: # in case the logger was not initialized properly self._logger.info(event_name, extra=information) diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 17be54f0cc..eb6562e03d 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- from datetime import datetime -from olive.telemetry.lib.telemetry_logger import TelemetryLogger +from olive.telemetry.library.telemetry_logger import TelemetryLogger logger = TelemetryLogger() From 66f8949f8fa98eb5876397b0b66df0fc3a87f6d8 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 8 Sep 2025 12:06:47 -0500 Subject: [PATCH 10/31] Update event and imports --- olive/telemetry/__init__.py | 3 +++ olive/telemetry/library/__init__.py | 3 +++ olive/telemetry/telemetry.py | 2 +- olive/telemetry/telemetry_events.py | 5 +---- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/olive/telemetry/__init__.py b/olive/telemetry/__init__.py index 862c45ce31..c12c7047d4 100644 --- a/olive/telemetry/__init__.py +++ b/olive/telemetry/__init__.py @@ -2,3 +2,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +from olive.telemetry.telemetry import ActionContext, action + +__all__ = ["ActionContext", "action"] diff --git a/olive/telemetry/library/__init__.py b/olive/telemetry/library/__init__.py index 862c45ce31..255ba83317 100644 --- a/olive/telemetry/library/__init__.py +++ b/olive/telemetry/library/__init__.py @@ -2,3 +2,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +from olive.telemetry.library.telemetry_logger import TelemetryLogger + +__all__ = ["TelemetryLogger"] diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 40f59325a4..1d6f625840 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -63,7 +63,7 @@ def wrapper(*args, **kwargs): if action_name.endswith("Command"): action_name = action_name[: -len("Command")] - log_action(action_name, called_from, start_time, duration_ms, success) + log_action(action_name, called_from, duration_ms, success) if exception: exception_type = type(exception).__name__ diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index eb6562e03d..2c657f7f0f 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -2,20 +2,17 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from datetime import datetime - from olive.telemetry.library.telemetry_logger import TelemetryLogger logger = TelemetryLogger() -def log_action(action_name: str, called_from: str, start_time: datetime, duration_ms: float, success: bool): +def log_action(action_name: str, called_from: str, duration_ms: float, success: bool): logger.log( "OliveAction", { "action_name": action_name, "called_from": called_from, - "start_time": start_time, "duration_ms": duration_ms, "success": success, }, From 84e387faed56e9cec78f9bd9c1bf068ce66bcce4 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 9 Sep 2025 10:55:31 -0500 Subject: [PATCH 11/31] Fix version import --- olive/__init__.py | 2 ++ setup.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/olive/__init__.py b/olive/__init__.py index 8faede1b86..4d95a8d777 100644 --- a/olive/__init__.py +++ b/olive/__init__.py @@ -28,6 +28,7 @@ 
tune_session_params, ) from olive.engine.output import DeviceOutput, ModelOutput, WorkflowOutput # noqa: E402 +from olive.version import __version__ # noqa: E402 from olive.workflows import run # noqa: E402 __all__ = [ @@ -44,4 +45,5 @@ "quantize", "run", "tune_session_params", + "__version__", ] diff --git a/setup.py b/setup.py index 12708a28a4..86624bd99d 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,6 @@ from setuptools import find_packages, setup -from olive.version import __version__ as VERSION - def read(rel_path): here = os.path.abspath(os.path.dirname(__file__)) @@ -16,6 +14,14 @@ def read(rel_path): return fp.read() +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + raise RuntimeError("Unable to find version string.") + + def get_extra_deps(rel_path): here = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(here, rel_path)) as fp: @@ -24,6 +30,7 @@ def get_extra_deps(rel_path): # use techniques described at https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ # Don't use technique 6 since it needs extra dependencies. 
+VERSION = get_version("olive/version.py") EXTRAS = get_extra_deps("olive/olive_config.json") with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")) as req_file: From f0a5e79ec4eb5fb107b27f1eabfa2c1df985899b Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 9 Sep 2025 10:57:59 -0500 Subject: [PATCH 12/31] Revert --- olive/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/olive/__init__.py b/olive/__init__.py index 4d95a8d777..8faede1b86 100644 --- a/olive/__init__.py +++ b/olive/__init__.py @@ -28,7 +28,6 @@ tune_session_params, ) from olive.engine.output import DeviceOutput, ModelOutput, WorkflowOutput # noqa: E402 -from olive.version import __version__ # noqa: E402 from olive.workflows import run # noqa: E402 __all__ = [ @@ -45,5 +44,4 @@ "quantize", "run", "tune_session_params", - "__version__", ] From 646e3125a85aff012c658efd05f2e6d10937ac0b Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 9 Sep 2025 17:22:24 -0500 Subject: [PATCH 13/31] Fix examples; skip trace --- docs/source/examples.md | 2 -- olive/cli/launcher.py | 1 + olive/telemetry/{ => library}/constants.py | 0 olive/telemetry/library/msft_log_exporter.py | 2 +- olive/telemetry/library/telemetry_logger.py | 6 +++--- olive/telemetry/telemetry.py | 15 ++------------- olive/telemetry/telemetry_events.py | 8 ++++---- 7 files changed, 11 insertions(+), 23 deletions(-) rename olive/telemetry/{ => library}/constants.py (100%) diff --git a/docs/source/examples.md b/docs/source/examples.md index ff54bb7e5d..24f4b03618 100644 --- a/docs/source/examples.md +++ b/docs/source/examples.md @@ -4,8 +4,6 @@ |---|-----------|-----------|-----------| |NLP|deepseek|[Link](https://github.com/microsoft/Olive/tree/main/examples/deepseek)|`QDQ`: QDQ Model with 4-bit Weights & 16-bit Activations
`QNN EP`: PTQ + AOT Compilation for Qualcomm NPUs using QNN EP
`Vitis AI EP`: PTQ + AOT Compilation for AMD NPUs using Vitis AI EP ||llama2|[Link](https://github.com/microsoft/Olive/tree/main/examples/llama2)|`CPU`: with ONNX Runtime optimizations for optimized FP32 ONNX model
`CPU`: with ONNX Runtime optimizations for optimized INT8 ONNX model
`CPU`: with ONNX Runtime optimizations for optimized INT4 ONNX model
`GPU`: with ONNX Runtime optimizations for optimized FP16 ONNX model
`GPU`: with ONNX Runtime optimizations for optimized INT4 ONNX model
`GPU`: with QLoRA for model fine tune and ONNX Runtime optimizations for optimized ONNX model -||llama3|[Link](https://github.com/microsoft/Olive/tree/main/examples/llama3)|`QDQ`: QDQ Model with 4-bit Weights & 16-bit Activations
`QNN EP`: PTQ + AOT Compilation for Qualcomm NPUs using QNN EP
`Vitis AI EP`: PTQ + AOT Compilation for AMD NPUs using Vitis AI EP -||mistral|[Link](https://github.com/microsoft/Olive/tree/main/examples/mistral)|`CPU`: with Optimum conversion and ONNX Runtime optimizations and Intel® Neural Compressor static quantization for optimized INT8 ONNX model
`GPU`: with ONNX Runtime optimizations for optimized FP16 ONNX model ||phi2|[Link](https://github.com/microsoft/Olive/tree/main/examples/phi2)|`CPU`: with ONNX Runtime optimizations fp32/int4
`GPU` with ONNX Runtime optimizations fp16/int4, with PyTorch QLoRA for model fine tune
`GPU` with SliceGPT for an optimized PyTorch model with sparsity ||phi3.5|[Link](https://github.com/microsoft/Olive/tree/main/examples/phi3_5)|`QDQ`: QDQ Model with 4-bit Weights & 16-bit Activations
`QNN EP`: PTQ + AOT Compilation for Qualcomm NPUs using QNN EP
`Vitis AI EP`: PTQ + AOT Compilation for AMD NPUs using Vitis AI EP ||qwen2.5|[Link](https://github.com/microsoft/Olive/tree/main/examples/qwen2_5)|`QDQ`: QDQ Model with 4-bit Weights & 16-bit Activations
`QNN EP`: PTQ + AOT Compilation for Qualcomm NPUs using QNN EP
`Vitis AI EP`: PTQ + AOT Compilation for AMD NPUs using Vitis AI EP diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index 6276417060..6604de17a5 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -61,6 +61,7 @@ def main(raw_args=None, called_as_console_script: bool = True): logger = TelemetryLogger() if args.disable_telemetry: logger.disable_telemetry() + logger.set_metadata({"called_from": "console" if called_as_console_script else "script"}) if not hasattr(args, "func"): parser.print_help() diff --git a/olive/telemetry/constants.py b/olive/telemetry/library/constants.py similarity index 100% rename from olive/telemetry/constants.py rename to olive/telemetry/library/constants.py diff --git a/olive/telemetry/library/msft_log_exporter.py b/olive/telemetry/library/msft_log_exporter.py index 252c0bf9ad..5178849509 100644 --- a/olive/telemetry/library/msft_log_exporter.py +++ b/olive/telemetry/library/msft_log_exporter.py @@ -28,7 +28,7 @@ LogExportResult, ) -from olive.telemetry.constants import _ENDPOINT, _HEADERS +from olive.telemetry.library.constants import _ENDPOINT, _HEADERS from olive.version import __version__ as VERSION _MAX_RETRYS = 6 diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index ea296403c3..1deb110884 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -30,8 +30,8 @@ def __new__(cls, *args, **kwargs): cls._logger_provider = LoggerProvider( resource=Resource.create( { - "service.name": "olive-telemetry", - "service.instance.id": "olive-telemetry-instance", + "service.name": __name__, + "service.instance.id": f"{__name__}-instance", } ), ) @@ -39,7 +39,7 @@ def __new__(cls, *args, **kwargs): cls._logger_provider.add_log_record_processor(BatchLogRecordProcessor(cls._logger_exporter)) handler = LoggingHandler(level=logging.INFO, logger_provider=cls._logger_provider) - logger = logging.getLogger("olive.telemetry") + 
logger = logging.getLogger(__name__) logger.propagate = False logger.setLevel(logging.INFO) logger.addHandler(handler) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 1d6f625840..4abbf649e1 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -1,5 +1,4 @@ import functools -import inspect from datetime import datetime from typing import Any @@ -38,16 +37,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): def action(func): @functools.wraps(func) def wrapper(*args, **kwargs): - stack = inspect.stack() - caller_frame = stack[1] - caller_module = inspect.getmodule(caller_frame[0]) - called_from = caller_module.__name__ - - if caller_module is None: - called_from = "Interactive" - elif caller_module.__name__ == "__main__": - called_from = "Script" - success = False exception = None start_time = datetime.now() @@ -63,12 +52,12 @@ def wrapper(*args, **kwargs): if action_name.endswith("Command"): action_name = action_name[: -len("Command")] - log_action(action_name, called_from, duration_ms, success) + log_action(action_name, duration_ms, success) if exception: exception_type = type(exception).__name__ exception_message = _format_exception_msg(exception) - log_error(called_from, exception_type, exception_message) + log_error(exception_type, exception_message) raise exception return result diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 2c657f7f0f..56c8801089 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -7,24 +7,24 @@ logger = TelemetryLogger() -def log_action(action_name: str, called_from: str, duration_ms: float, success: bool): +def log_action(action_name: str, duration_ms: float, success: bool, called_from: str = "module"): logger.log( "OliveAction", { "action_name": action_name, - "called_from": called_from, "duration_ms": duration_ms, "success": success, + "called_from": called_from, }, ) -def log_error(called_from: 
str, exception_type: str, exception_message: str): +def log_error(exception_type: str, exception_message: str, called_from: str = "module"): logger.log( "OliveError", { - "called_from": called_from, "exception_type": exception_type, "exception_message": exception_message, + "called_from": called_from, }, ) From 65c7ead7c64e79f97163c612c30a6ce43a5610f2 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 9 Sep 2025 17:33:55 -0500 Subject: [PATCH 14/31] Revert finding caller_from --- olive/cli/launcher.py | 1 - olive/telemetry/telemetry.py | 15 +++++++++++++-- olive/telemetry/telemetry_events.py | 8 ++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index 6604de17a5..6276417060 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -61,7 +61,6 @@ def main(raw_args=None, called_as_console_script: bool = True): logger = TelemetryLogger() if args.disable_telemetry: logger.disable_telemetry() - logger.set_metadata({"called_from": "console" if called_as_console_script else "script"}) if not hasattr(args, "func"): parser.print_help() diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 4abbf649e1..1d6f625840 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -1,4 +1,5 @@ import functools +import inspect from datetime import datetime from typing import Any @@ -37,6 +38,16 @@ def __exit__(self, exc_type, exc_val, exc_tb): def action(func): @functools.wraps(func) def wrapper(*args, **kwargs): + stack = inspect.stack() + caller_frame = stack[1] + caller_module = inspect.getmodule(caller_frame[0]) + called_from = caller_module.__name__ + + if caller_module is None: + called_from = "Interactive" + elif caller_module.__name__ == "__main__": + called_from = "Script" + success = False exception = None start_time = datetime.now() @@ -52,12 +63,12 @@ def wrapper(*args, **kwargs): if action_name.endswith("Command"): action_name = action_name[: 
-len("Command")] - log_action(action_name, duration_ms, success) + log_action(action_name, called_from, duration_ms, success) if exception: exception_type = type(exception).__name__ exception_message = _format_exception_msg(exception) - log_error(exception_type, exception_message) + log_error(called_from, exception_type, exception_message) raise exception return result diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py index 56c8801089..41cdb6a9dd 100644 --- a/olive/telemetry/telemetry_events.py +++ b/olive/telemetry/telemetry_events.py @@ -7,24 +7,24 @@ logger = TelemetryLogger() -def log_action(action_name: str, duration_ms: float, success: bool, called_from: str = "module"): +def log_action(called_from: str, action_name: str, duration_ms: float, success: bool): logger.log( "OliveAction", { + "called_from": called_from, "action_name": action_name, "duration_ms": duration_ms, "success": success, - "called_from": called_from, }, ) -def log_error(exception_type: str, exception_message: str, called_from: str = "module"): +def log_error(called_from: str, exception_type: str, exception_message: str): logger.log( "OliveError", { + "called_from": called_from, "exception_type": exception_type, "exception_message": exception_message, - "called_from": called_from, }, ) From 1720f21183a3515de3b49a80de9edc4f0d022e64 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Tue, 9 Sep 2025 18:22:37 -0500 Subject: [PATCH 15/31] Mv called_from --- olive/telemetry/telemetry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 1d6f625840..0bcdec68b1 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -63,7 +63,7 @@ def wrapper(*args, **kwargs): if action_name.endswith("Command"): action_name = action_name[: -len("Command")] - log_action(action_name, called_from, duration_ms, success) + log_action(called_from, action_name, duration_ms, 
success) if exception: exception_type = type(exception).__name__ From e591b82a48b6183636bdb292e3a3ca4a05b089aa Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 1 Feb 2026 02:07:24 -0600 Subject: [PATCH 16/31] Update telemetry implementation --- olive/cli/auto_opt.py | 2 +- olive/cli/benchmark.py | 2 + olive/cli/capture_onnx.py | 2 +- olive/cli/configure_qualcomm_sdk.py | 2 +- olive/cli/convert_adapters.py | 2 +- olive/cli/diffusion_lora.py | 2 + olive/cli/extract_adapters.py | 2 +- olive/cli/finetune.py | 2 +- olive/cli/generate_adapter.py | 2 +- olive/cli/generate_cost_model.py | 2 +- olive/cli/launcher.py | 8 +- olive/cli/optimize.py | 2 +- olive/cli/quantize.py | 2 +- olive/cli/run.py | 2 +- olive/cli/run_pass.py | 2 +- olive/cli/session_params_tuning.py | 2 +- olive/cli/shared_cache.py | 2 +- olive/telemetry/__init__.py | 5 +- olive/telemetry/constants.py | 8 + olive/telemetry/deviceid/__init__.py | 3 + olive/telemetry/deviceid/_store.py | 84 +++++ olive/telemetry/deviceid/deviceid.py | 51 +++ olive/telemetry/library/__init__.py | 81 +++- olive/telemetry/library/callback_manager.py | 100 +++++ .../library/connection_string_parser.py | 44 +++ olive/telemetry/library/constants.py | 4 - olive/telemetry/library/event_source.py | 242 ++++++++++++ olive/telemetry/library/exporter.py | 350 ++++++++++++++++++ olive/telemetry/library/msft_log_exporter.py | 184 --------- olive/telemetry/library/options.py | 104 ++++++ olive/telemetry/library/payload_builder.py | 91 +++++ olive/telemetry/library/retry.py | 98 +++++ olive/telemetry/library/serialization.py | 141 +++++++ olive/telemetry/library/telemetry_logger.py | 188 +++++++--- olive/telemetry/library/transport.py | 248 +++++++++++++ olive/telemetry/telemetry.py | 224 +++++++---- olive/telemetry/telemetry_decorators.py | 127 +++++++ olive/telemetry/telemetry_events.py | 30 -- olive/telemetry/utils.py | 26 +- requirements.txt | 2 - 40 files changed, 2111 insertions(+), 364 deletions(-) create mode 100644 
olive/telemetry/constants.py create mode 100644 olive/telemetry/deviceid/__init__.py create mode 100644 olive/telemetry/deviceid/_store.py create mode 100644 olive/telemetry/deviceid/deviceid.py create mode 100644 olive/telemetry/library/callback_manager.py create mode 100644 olive/telemetry/library/connection_string_parser.py delete mode 100644 olive/telemetry/library/constants.py create mode 100644 olive/telemetry/library/event_source.py create mode 100644 olive/telemetry/library/exporter.py delete mode 100644 olive/telemetry/library/msft_log_exporter.py create mode 100644 olive/telemetry/library/options.py create mode 100644 olive/telemetry/library/payload_builder.py create mode 100644 olive/telemetry/library/retry.py create mode 100644 olive/telemetry/library/serialization.py create mode 100644 olive/telemetry/library/transport.py create mode 100644 olive/telemetry/telemetry_decorators.py delete mode 100644 olive/telemetry/telemetry_events.py diff --git a/olive/cli/auto_opt.py b/olive/cli/auto_opt.py index 6db4bb79b7..2e0f73444f 100644 --- a/olive/cli/auto_opt.py +++ b/olive/cli/auto_opt.py @@ -23,7 +23,7 @@ from olive.constants import Precision from olive.hardware.constants import ExecutionProvider from olive.package_config import OlivePackageConfig -from olive.telemetry.telemetry import action +from olive.telemetry import action class AutoOptCommand(BaseOliveCLICommand): diff --git a/olive/cli/benchmark.py b/olive/cli/benchmark.py index 5231419552..575c591271 100644 --- a/olive/cli/benchmark.py +++ b/olive/cli/benchmark.py @@ -15,6 +15,7 @@ update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value +from olive.telemetry import action class BenchmarkCommand(BaseOliveCLICommand): @@ -71,6 +72,7 @@ def register_subcommand(parser: ArgumentParser): add_shared_cache_options(sub_parser) sub_parser.set_defaults(func=BenchmarkCommand) + @action def run(self): return self._run_workflow() diff --git a/olive/cli/capture_onnx.py 
b/olive/cli/capture_onnx.py index c3ffa52fda..ca193da02f 100644 --- a/olive/cli/capture_onnx.py +++ b/olive/cli/capture_onnx.py @@ -20,7 +20,7 @@ ) from olive.common.utils import set_nested_dict_value from olive.model.utils.diffusers_utils import is_valid_diffusers_model -from olive.telemetry.telemetry import action +from olive.telemetry import action class ModelBuilderAccuracyLevel(IntEnum): diff --git a/olive/cli/configure_qualcomm_sdk.py b/olive/cli/configure_qualcomm_sdk.py index 4b7d0c6448..7f42fd58d8 100644 --- a/olive/cli/configure_qualcomm_sdk.py +++ b/olive/cli/configure_qualcomm_sdk.py @@ -5,7 +5,7 @@ from argparse import ArgumentParser from olive.cli.base import BaseOliveCLICommand, add_telemetry_options -from olive.telemetry.telemetry import action +from olive.telemetry import action class ConfigureQualcommSDKCommand(BaseOliveCLICommand): diff --git a/olive/cli/convert_adapters.py b/olive/cli/convert_adapters.py index 5f6fd323ab..558cde8846 100644 --- a/olive/cli/convert_adapters.py +++ b/olive/cli/convert_adapters.py @@ -8,7 +8,7 @@ from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights -from olive.telemetry.telemetry import action +from olive.telemetry import action if TYPE_CHECKING: from numpy.typing import NDArray diff --git a/olive/cli/diffusion_lora.py b/olive/cli/diffusion_lora.py index d0061738a6..dae57d42da 100644 --- a/olive/cli/diffusion_lora.py +++ b/olive/cli/diffusion_lora.py @@ -16,6 +16,7 @@ from olive.common.utils import set_nested_dict_value from olive.constants import DiffusersModelVariant from olive.passes.diffusers.lora import LRSchedulerType, MixedPrecision +from olive.telemetry import action class DiffusionLoraCommand(BaseOliveCLICommand): @@ -239,6 +240,7 @@ def register_subcommand(parser: ArgumentParser): add_save_config_file_options(sub_parser) sub_parser.set_defaults(func=DiffusionLoraCommand) + @action def run(self): return 
self._run_workflow() diff --git a/olive/cli/extract_adapters.py b/olive/cli/extract_adapters.py index 710f0254f2..2a9f7b0f0d 100644 --- a/olive/cli/extract_adapters.py +++ b/olive/cli/extract_adapters.py @@ -8,7 +8,7 @@ from olive.cli.base import BaseOliveCLICommand, add_logging_options, add_telemetry_options from olive.common.utils import WeightsFileFormat, save_weights -from olive.telemetry.telemetry import action +from olive.telemetry import action class ExtractAdaptersCommand(BaseOliveCLICommand): diff --git a/olive/cli/finetune.py b/olive/cli/finetune.py index efc7227a89..56afb2e851 100644 --- a/olive/cli/finetune.py +++ b/olive/cli/finetune.py @@ -19,7 +19,7 @@ update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value -from olive.telemetry.telemetry import action +from olive.telemetry import action class FineTuneCommand(BaseOliveCLICommand): diff --git a/olive/cli/generate_adapter.py b/olive/cli/generate_adapter.py index 6e10315064..d1bd03659a 100644 --- a/olive/cli/generate_adapter.py +++ b/olive/cli/generate_adapter.py @@ -17,7 +17,7 @@ ) from olive.common.utils import WeightsFileFormat, set_nested_dict_value from olive.passes.onnx.common import AdapterType -from olive.telemetry.telemetry import action +from olive.telemetry import action class GenerateAdapterCommand(BaseOliveCLICommand): diff --git a/olive/cli/generate_cost_model.py b/olive/cli/generate_cost_model.py index e3395aa04b..7861932c1c 100644 --- a/olive/cli/generate_cost_model.py +++ b/olive/cli/generate_cost_model.py @@ -7,7 +7,7 @@ from olive.cli.base import BaseOliveCLICommand, add_input_model_options, add_telemetry_options, get_input_model_config from olive.model import ModelConfig -from olive.telemetry.telemetry import action +from olive.telemetry import action logger = logging.getLogger(__name__) diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index 1ce1a0c5ed..d9088bc89b 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -22,7 +22,7 @@ 
from olive.cli.run_pass import RunPassCommand from olive.cli.session_params_tuning import SessionParamsTuningCommand from olive.cli.shared_cache import SharedCacheCommand -from olive.telemetry.library.telemetry_logger import TelemetryLogger +from olive.telemetry import Telemetry def get_cli_parser(called_as_console_script: bool = True) -> ArgumentParser: @@ -62,9 +62,9 @@ def main(raw_args=None, called_as_console_script: bool = True): args, unknown_args = parser.parse_known_args(raw_args) - logger = TelemetryLogger() + telemetry = Telemetry() if args.disable_telemetry: - logger.disable_telemetry() + telemetry.disable_telemetry() if not hasattr(args, "func"): parser.print_help() @@ -73,7 +73,7 @@ def main(raw_args=None, called_as_console_script: bool = True): # Run the command service = args.func(parser, args, unknown_args) service.run() - logger.shutdown() + telemetry.shutdown() def legacy_call(deprecated_module: str, command_name: str, *args): diff --git a/olive/cli/optimize.py b/olive/cli/optimize.py index 0521248703..a66919f947 100644 --- a/olive/cli/optimize.py +++ b/olive/cli/optimize.py @@ -21,7 +21,7 @@ from olive.common.utils import set_nested_dict_value from olive.constants import Precision, precision_bits_from_precision from olive.hardware.constants import ExecutionProvider -from olive.telemetry.telemetry import action +from olive.telemetry import action class OptimizeCommand(BaseOliveCLICommand): diff --git a/olive/cli/quantize.py b/olive/cli/quantize.py index 06411b305c..b24f0ab3c0 100644 --- a/olive/cli/quantize.py +++ b/olive/cli/quantize.py @@ -25,7 +25,7 @@ from olive.common.utils import StrEnumBase, set_nested_dict_value from olive.constants import Precision, QuantAlgorithm, precision_bits_from_precision from olive.package_config import OlivePackageConfig -from olive.telemetry.telemetry import action +from olive.telemetry import action class ImplName(StrEnumBase): diff --git a/olive/cli/run.py b/olive/cli/run.py index 84fb87cbce..6d2a831aef 100644 
--- a/olive/cli/run.py +++ b/olive/cli/run.py @@ -11,7 +11,7 @@ add_telemetry_options, get_input_model_config, ) -from olive.telemetry.telemetry import action +from olive.telemetry import action class WorkflowRunCommand(BaseOliveCLICommand): diff --git a/olive/cli/run_pass.py b/olive/cli/run_pass.py index ed1637adc6..3ed269185f 100644 --- a/olive/cli/run_pass.py +++ b/olive/cli/run_pass.py @@ -16,7 +16,7 @@ get_input_model_config, update_accelerator_options, ) -from olive.telemetry.telemetry import action +from olive.telemetry import action @action diff --git a/olive/cli/session_params_tuning.py b/olive/cli/session_params_tuning.py index 572683526d..69976122ac 100644 --- a/olive/cli/session_params_tuning.py +++ b/olive/cli/session_params_tuning.py @@ -20,7 +20,7 @@ update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value -from olive.telemetry.telemetry import action +from olive.telemetry import action class SessionParamsTuningCommand(BaseOliveCLICommand): diff --git a/olive/cli/shared_cache.py b/olive/cli/shared_cache.py index 8fe874a708..78c89b5a55 100644 --- a/olive/cli/shared_cache.py +++ b/olive/cli/shared_cache.py @@ -6,7 +6,7 @@ from olive.cli.base import BaseOliveCLICommand, add_telemetry_options from olive.common.container_client_factory import AzureContainerClientFactory -from olive.telemetry.telemetry import action +from olive.telemetry import action logger = logging.getLogger(__name__) diff --git a/olive/telemetry/__init__.py b/olive/telemetry/__init__.py index c12c7047d4..358fbeb70b 100644 --- a/olive/telemetry/__init__.py +++ b/olive/telemetry/__init__.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from olive.telemetry.telemetry import ActionContext, action +from olive.telemetry.telemetry_decorators import ActionContext, action +from olive.telemetry.telemetry import Telemetry -__all__ = ["ActionContext", "action"] +__all__ = ["ActionContext", "Telemetry", "action"] diff --git a/olive/telemetry/constants.py b/olive/telemetry/constants.py new file mode 100644 index 0000000000..ca9e150b1b --- /dev/null +++ b/olive/telemetry/constants.py @@ -0,0 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""OneCollector connection string.""" + +CONNECTION_STRING = "SW5zdHJ1bWVudGF0aW9uS2V5PTlkNWRkYWVjNjFlMjQ1NjdiNzg4YTIwYWVhMzI0NjMxLTcyMzdkN2M2LWVlNjEtNGNmZC1iYjdiLTU5MDNhOTcyYzJlNC03MDQ3" diff --git a/olive/telemetry/deviceid/__init__.py b/olive/telemetry/deviceid/__init__.py new file mode 100644 index 0000000000..24129b0eb0 --- /dev/null +++ b/olive/telemetry/deviceid/__init__.py @@ -0,0 +1,3 @@ +from olive.telemetry.deviceid.deviceid import get_device_id + +__all__ = ["get_device_id"] diff --git a/olive/telemetry/deviceid/_store.py b/olive/telemetry/deviceid/_store.py new file mode 100644 index 0000000000..3fd554d701 --- /dev/null +++ b/olive/telemetry/deviceid/_store.py @@ -0,0 +1,84 @@ +import os +import platform +from pathlib import Path + +REGISTRY_PATH = r"SOFTWARE\Microsoft\DeveloperTools\.onnxruntime" +REGISTRY_KEY = "deviceid" +DEVICEID_LOCATION = r"Microsoft/DeveloperTools/deviceid/.onnxruntime/" + + +class Store: + def __init__(self) -> None: + self._file_path: Path = self._build_path() + + def _build_path(self) -> Path: + os_name = platform.system() + if os_name in ("Darwin"): + home = os.getenv("HOME") + if home is None: + raise ValueError("HOME environment variable not 
set") + + return Path(f"{home}/Library/Application Support/{DEVICEID_LOCATION}") + + home = os.getenv("XDG_CACHE_HOME", f"{os.getenv('HOME')}/.cache") + + if not home: + raise ValueError("HOME environment variable not set") + + return Path(home).joinpath(DEVICEID_LOCATION) + + def retrieve_id(self) -> str: + """Retrieve the device id from the store location. + + :return: The device id. + :rtype: str + """ + # check if file doesnt exist and raise an Exception + if not self._file_path.is_file(): + raise FileExistsError(f"File {self._file_path.stem} does not exist") + + return self._file_path.read_text(encoding="utf-8") + + def store_id(self, device_id: str) -> None: + """Store the device id in the store location. + + :param str device_id: The device id to store. + :type device_id: str + """ + # create the folder location if it does not exist + try: + self._file_path.parent.mkdir(parents=True) + except FileExistsError: + pass + + self._file_path.touch() + self._file_path.write_text(device_id, encoding="utf-8") + + +class WindowsStore: + def retrieve_id(self) -> str: + """Retrieve the device id from the Windows registry.""" + import winreg + + device_id: str + + with winreg.OpenKeyEx( + winreg.HKEY_CURRENT_USER, REGISTRY_PATH, reserved=0, access=winreg.KEY_READ | winreg.KEY_WOW64_64KEY + ) as key_handle: + device_id = winreg.QueryValueEx(key_handle, REGISTRY_KEY) + return device_id[0] + + def store_id(self, device_id: str) -> None: + """Store the device id in the windows registry. + + :param str device_id: The device id to sstore. 
+ """ + import winreg + + with winreg.CreateKeyEx( + winreg.HKEY_CURRENT_USER, + REGISTRY_PATH, + reserved=0, + access=winreg.KEY_ALL_ACCESS | winreg.KEY_WOW64_64KEY, + ) as key_handle: + winreg.SetValueEx(key_handle, REGISTRY_KEY, 0, winreg.REG_SZ, device_id) diff --git a/olive/telemetry/deviceid/deviceid.py b/olive/telemetry/deviceid/deviceid.py new file mode 100644 index 0000000000..53ed791f4d --- /dev/null +++ b/olive/telemetry/deviceid/deviceid.py @@ -0,0 +1,51 @@ +import logging +import platform +import uuid +from typing import Union + +from olive.telemetry.deviceid._store import Store, WindowsStore + + +def get_device_id(*, full_trace: bool = False) -> str: + r"""Get the device id from the store or create one if it does not exist. + + An empty string is returned if an error occurs during saving or retrieval of the device id. + + Linux id location: $XDG_CACHE_HOME/deviceid if defined else $HOME/.cache/deviceid + MacOS id location: $HOME/Library/Application Support/Microsoft/DeveloperTools/deviceid + Windows id location: HKEY_CURRENT_USER\SOFTWARE\Microsoft\DeveloperTools\deviceid + + :keyword full_trace: If True, the full stack trace is logged. Default is False. + :return: The device id. 
+ :rtype: str + """ + logger = logging.getLogger(__name__) + device_id: str = "" + store: Union[Store, WindowsStore] + + try: + if platform.system() == "Windows": + store = WindowsStore() + elif platform.system() in ("Linux", "Darwin"): + store = Store() + else: + return device_id + return store.retrieve_id() + except (PermissionError, ValueError, NotImplementedError): + if full_trace: + logger.exception("Failed to retrieve stored device id.") + return device_id + except Exception: + if full_trace: + logger.exception("Failed to retrieve stored device id.") + + device_id = str(uuid.uuid4()).lower() + + try: + store.store_id(device_id) + except Exception: + if full_trace: + logger.exception("Failed to store device id.") + device_id = "" + + return device_id diff --git a/olive/telemetry/library/__init__.py b/olive/telemetry/library/__init__.py index 255ba83317..39831da66e 100644 --- a/olive/telemetry/library/__init__.py +++ b/olive/telemetry/library/__init__.py @@ -2,6 +2,83 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from olive.telemetry.library.telemetry_logger import TelemetryLogger -__all__ = ["TelemetryLogger"] +"""OneCollector Exporter for OpenTelemetry Python. + +This package provides an OpenTelemetry exporter that sends telemetry data +to Microsoft OneCollector using the Common Schema JSON format. 
+ +Example usage: + + from onecollector_exporter import ( + OneCollectorLogExporter, + OneCollectorExporterOptions, + get_telemetry_logger, + ) + + # Option 1: Use with OpenTelemetry SDK directly + options = OneCollectorExporterOptions( + connection_string="InstrumentationKey=your-key-here" + ) + exporter = OneCollectorLogExporter(options=options) + + # Add to logger provider + from opentelemetry.sdk._logs import LoggerProvider + from opentelemetry.sdk._logs.export import BatchLogRecordProcessor + + provider = LoggerProvider() + provider.add_log_record_processor(BatchLogRecordProcessor(exporter)) + + # Option 2: Use the simplified telemetry logger + logger = get_telemetry_logger( + connection_string="InstrumentationKey=your-key-here" + ) + logger.log("MyEvent", {"key": "value"}) + logger.shutdown() +""" + +from olive.telemetry.library.callback_manager import CallbackManager, PayloadTransmittedCallbackArgs +from olive.telemetry.library.connection_string_parser import ConnectionStringParser +from olive.telemetry.library.event_source import OneCollectorEventId, OneCollectorEventSource, event_source +from olive.telemetry.library.exporter import OneCollectorLogExporter +from olive.telemetry.library.options import ( + CompressionType, + OneCollectorExporterOptions, + OneCollectorExporterValidationError, + OneCollectorTransportOptions, +) +from olive.telemetry.library.payload_builder import PayloadBuilder +from olive.telemetry.library.retry import RetryHandler +from olive.telemetry.library.serialization import CommonSchemaJsonSerializationHelper +from olive.telemetry.library.telemetry_logger import ( + TelemetryLogger, + get_telemetry_logger, + log_event, + shutdown_telemetry, +) +from olive.telemetry.library.transport import HttpJsonPostTransport, ITransport + +__version__ = "0.0.1" + +__all__ = [ + "CallbackManager", + "CommonSchemaJsonSerializationHelper", + "CompressionType", + "ConnectionStringParser", + "HttpJsonPostTransport", + "ITransport", + 
"OneCollectorEventId", + "OneCollectorEventSource", + "OneCollectorExporterOptions", + "OneCollectorExporterValidationError", + "OneCollectorLogExporter", + "OneCollectorTransportOptions", + "PayloadBuilder", + "PayloadTransmittedCallbackArgs", + "RetryHandler", + "TelemetryLogger", + "event_source", + "get_telemetry_logger", + "log_event", + "shutdown_telemetry", +] diff --git a/olive/telemetry/library/callback_manager.py b/olive/telemetry/library/callback_manager.py new file mode 100644 index 0000000000..5936fd0196 --- /dev/null +++ b/olive/telemetry/library/callback_manager.py @@ -0,0 +1,100 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Callback manager for payload transmission events.""" + +import threading +from dataclasses import dataclass +from typing import Callable, Optional + +from olive.telemetry.library.event_source import event_source + + +@dataclass +class PayloadTransmittedCallbackArgs: + """Arguments passed to payload transmitted callbacks. + + Matches the .NET OneCollectorExporterPayloadTransmittedCallbackArguments. + """ + + succeeded: bool + """Whether the transmission succeeded.""" + + status_code: Optional[int] + """HTTP status code, if available.""" + + payload_size_bytes: int + """Size of the transmitted payload in bytes.""" + + item_count: int + """Number of items in the payload.""" + + +class CallbackManager: + """Manages callbacks for payload transmission events. + + Allows registration of callbacks that are invoked when payloads + are successfully transmitted or fail. 
+ """ + + def __init__(self): + """Initialize the callback manager.""" + self._callbacks: list[tuple[Callable[[PayloadTransmittedCallbackArgs], None], bool]] = [] + self._lock = threading.Lock() + + def register( + self, callback: Callable[[PayloadTransmittedCallbackArgs], None], include_failures: bool = False + ) -> Callable[[], None]: + """Register a callback to be invoked on payload transmission. + + Args: + callback: Function to call when payload is transmitted + include_failures: Whether to invoke callback on transmission failures + + Returns: + Function to call to unregister the callback + + """ + with self._lock: + entry = (callback, include_failures) + self._callbacks.append(entry) + + def unregister(): + """Unregister this callback.""" + with self._lock: + try: + self._callbacks.remove(entry) + except ValueError: + pass # Already removed + + return unregister + + def notify(self, args: PayloadTransmittedCallbackArgs) -> None: + """Notify all registered callbacks. + + Args: + args: Callback arguments + + """ + # Get snapshot of callbacks to avoid holding lock during invocation + with self._lock: + callbacks_snapshot = self._callbacks.copy() + + # Invoke callbacks + for callback, include_failures in callbacks_snapshot: + # Check if we should invoke this callback + if not args.succeeded and not include_failures: + continue + + try: + callback(args) + except Exception as ex: + # Log but don't propagate exceptions from user code + event_source.exception_thrown_from_user_code("PayloadTransmittedCallback", ex) + + def clear(self) -> None: + """Clear all registered callbacks.""" + with self._lock: + self._callbacks.clear() diff --git a/olive/telemetry/library/connection_string_parser.py b/olive/telemetry/library/connection_string_parser.py new file mode 100644 index 0000000000..dc6e05ed6d --- /dev/null +++ b/olive/telemetry/library/connection_string_parser.py @@ -0,0 +1,44 @@ +# ------------------------------------------------------------------------- +# Copyright 
(c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Connection string parser for OneCollector exporter.""" + + +class ConnectionStringParser: + """Parses OneCollector connection strings to extract configuration.""" + + def __init__(self, connection_string: str): + """Initialize the parser with a connection string. + + Args: + connection_string: Connection string in the format "Key1=Value1;Key2=Value2" + + Raises: + ValueError: If the connection string is invalid or missing required fields + + """ + if not connection_string: + raise ValueError("Connection string cannot be empty") + + self.instrumentation_key: str | None = None + self._parse(connection_string) + + if not self.instrumentation_key: + raise ValueError("InstrumentationKey not found in connection string") + + def _parse(self, connection_string: str) -> None: + """Parse the connection string into key-value pairs.""" + parts = connection_string.split(";") + for raw_part in parts: + part = raw_part.strip() + if not part or "=" not in part: + continue + + key, value = part.split("=", 1) + key = key.strip().lower() + value = value.strip() + + if key == "instrumentationkey": + self.instrumentation_key = value diff --git a/olive/telemetry/library/constants.py b/olive/telemetry/library/constants.py deleted file mode 100644 index 81ae3e71e4..0000000000 --- a/olive/telemetry/library/constants.py +++ /dev/null @@ -1,4 +0,0 @@ -############# Foundry Local OneCollector Instrumentation ############# - -_ENDPOINT = "aHR0cHM6Ly9tb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbS9PbmVDb2xsZWN0b3IvMS4w" -_HEADERS = 
"eyJ4LWFwaWtleSI6ICI5ZDVkZGFlYzYxZTI0NTY3Yjc4OGEyMGFlYTMyNDYzMS03MjM3ZDdjNi1lZTYxLTRjZmQtYmI3Yi01OTAzYTk3MmMyZTQtNzA0NyIsICJVc2VyLUFnZW50IjogIlB5dGhvbi8zIEh0dHBDbGllbnQiLCAiSG9zdCI6ICJtb2JpbGUuZXZlbnRzLmRhdGEubWljcm9zb2Z0LmNvbSIsICJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24veC1qc29uLXN0cmVhbTsgY2hhcnNldD11dGYtOCIsICJzZGstdmVyc2lvbiI6ICJPVGVsLXB5dGhvbi0wLjEuMC4wIiwgIk5vUmVzcG9uc2VCb2R5IjogInRydWUifQ==" diff --git a/olive/telemetry/library/event_source.py b/olive/telemetry/library/event_source.py new file mode 100644 index 0000000000..a9829edb93 --- /dev/null +++ b/olive/telemetry/library/event_source.py @@ -0,0 +1,242 @@ +"""EventSource-style logging for OneCollector exporter. + +Provides structured logging similar to .NET EventSource for diagnostics and monitoring. +""" + +import logging +from enum import IntEnum + + +class OneCollectorEventId(IntEnum): + """Event IDs matching .NET EventSource implementation.""" + + EXPORT_EXCEPTION = 1 + TRANSPORT_DATA_SENT = 2 + SINK_DATA_WRITTEN = 3 + DATA_DROPPED = 4 + TRANSPORT_EXCEPTION = 5 + HTTP_ERROR_RESPONSE = 6 + EVENT_FULL_NAME_DISCARDED = 7 + EVENT_NAMESPACE_INVALID = 8 + EVENT_NAME_INVALID = 9 + USER_CODE_EXCEPTION = 10 + ATTRIBUTE_DROPPED = 11 + + +class OneCollectorEventSource: + """EventSource for OneCollector exporter diagnostics. + + Provides structured logging matching the .NET EventSource implementation. 
+ """ + + def __init__(self): + self.logger = logging.getLogger("OpenTelemetry.Exporter.OneCollector") + # Set default level to INFO to match .NET behavior + if not self.logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.logger.setLevel(logging.INFO) + + def is_informational_logging_enabled(self) -> bool: + """Check if informational level logging is enabled.""" + return self.logger.isEnabledFor(logging.INFO) + + def export_exception_thrown(self, item_type: str, exception: Exception) -> None: + """Log an exception thrown during export. + + Args: + item_type: Type of item being exported (e.g., 'LogData') + exception: The exception that was thrown + + """ + if self.logger.isEnabledFor(logging.ERROR): + self.logger.error( + "Exception thrown exporting '%s' batch: %s", + item_type, + exception, + exc_info=exception, + extra={"event_id": OneCollectorEventId.EXPORT_EXCEPTION}, + ) + + def transport_data_sent(self, item_type: str, num_records: int, transport_description: str) -> None: + """Log successful data transmission. + + Args: + item_type: Type of items sent + num_records: Number of records sent + transport_description: Description of transport used + + """ + if self.is_informational_logging_enabled(): + self.logger.info( + "Sent '%s' batch of %s item(s) to '%s' transport", + item_type, + num_records, + transport_description, + extra={"event_id": OneCollectorEventId.TRANSPORT_DATA_SENT}, + ) + + def sink_data_written(self, item_type: str, num_records: int, sink_description: str) -> None: + """Log data written to sink. 
+ + Args: + item_type: Type of items written + num_records: Number of records written + sink_description: Description of sink used + + """ + if self.is_informational_logging_enabled(): + self.logger.info( + "Wrote '%s' batch of %s item(s) to '%s' sink", + item_type, + num_records, + sink_description, + extra={"event_id": OneCollectorEventId.SINK_DATA_WRITTEN}, + ) + + def data_dropped( + self, item_type: str, num_records: int, during_serialization: int, during_transmission: int + ) -> None: + """Log dropped data. + + Args: + item_type: Type of items dropped + num_records: Total number of records dropped + during_serialization: Number dropped during serialization + during_transmission: Number dropped during transmission + + """ + if self.logger.isEnabledFor(logging.WARNING): + self.logger.warning( + "Dropped %s '%s' item(s). %s item(s) dropped during serialization. %s item(s) dropped due to " + "transmission failure", + num_records, + item_type, + during_serialization, + during_transmission, + extra={"event_id": OneCollectorEventId.DATA_DROPPED}, + ) + + def transport_exception_thrown(self, transport_type: str, exception: Exception) -> None: + """Log transport exception. + + Args: + transport_type: Type of transport + exception: The exception that was thrown + + """ + if self.logger.isEnabledFor(logging.ERROR): + self.logger.error( + "Exception thrown by '%s' transport: %s", + transport_type, + exception, + exc_info=exception, + extra={"event_id": OneCollectorEventId.TRANSPORT_EXCEPTION}, + ) + + def http_transport_error_response( + self, transport_type: str, status_code: int, error_message: str, error_details: str + ) -> None: + """Log HTTP error response. + + Args: + transport_type: Type of transport + status_code: HTTP status code + error_message: Error message from response + error_details: Additional error details + + """ + if self.logger.isEnabledFor(logging.ERROR): + self.logger.error( + "Error response received by '%s' transport. 
StatusCode: %s, ErrorMessage: '%s', ErrorDetails: '%s'", + transport_type, + status_code, + error_message, + error_details, + extra={"event_id": OneCollectorEventId.HTTP_ERROR_RESPONSE}, + ) + + def event_full_name_discarded(self, event_namespace: str, event_name: str) -> None: + """Log event full name discarded. + + Args: + event_namespace: Event namespace + event_name: Event name + + """ + if self.logger.isEnabledFor(logging.WARNING): + self.logger.warning( + "Event full name discarded. EventNamespace: '%s', EventName: '%s'", + event_namespace, + event_name, + extra={"event_id": OneCollectorEventId.EVENT_FULL_NAME_DISCARDED}, + ) + + def event_namespace_invalid(self, event_namespace: str) -> None: + """Log invalid event namespace. + + Args: + event_namespace: The invalid namespace + + """ + if self.logger.isEnabledFor(logging.WARNING): + self.logger.warning( + "Event namespace invalid. EventNamespace: '%s'", + event_namespace, + extra={"event_id": OneCollectorEventId.EVENT_NAMESPACE_INVALID}, + ) + + def event_name_invalid(self, event_name: str) -> None: + """Log invalid event name. + + Args: + event_name: The invalid event name + + """ + if self.logger.isEnabledFor(logging.WARNING): + self.logger.warning( + "Event name invalid. EventName: '%s'", + event_name, + extra={"event_id": OneCollectorEventId.EVENT_NAME_INVALID}, + ) + + def exception_thrown_from_user_code(self, user_code_type: str, exception: Exception) -> None: + """Log exception from user code (e.g., callbacks). + + Args: + user_code_type: Type of user code that threw exception + exception: The exception that was thrown + + """ + if self.logger.isEnabledFor(logging.ERROR): + self.logger.error( + "Exception thrown by '%s' user code: %s", + user_code_type, + exception, + exc_info=exception, + extra={"event_id": OneCollectorEventId.USER_CODE_EXCEPTION}, + ) + + def attribute_dropped(self, item_type: str, attribute_name: str, reason: str) -> None: + """Log dropped attribute. 
+ + Args: + item_type: Type of item + attribute_name: Name of dropped attribute + reason: Reason for dropping + + """ + if self.logger.isEnabledFor(logging.WARNING): + self.logger.warning( + "Dropped %s attribute '%s': %s", + item_type, + attribute_name, + reason, + extra={"event_id": OneCollectorEventId.ATTRIBUTE_DROPPED}, + ) + + +# Global event source instance +event_source = OneCollectorEventSource() diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py new file mode 100644 index 0000000000..d7ecbe18b6 --- /dev/null +++ b/olive/telemetry/library/exporter.py @@ -0,0 +1,350 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Main OneCollector log exporter implementation.""" + +import threading +from collections.abc import Sequence +from datetime import datetime, timezone +from time import time +from typing import TYPE_CHECKING, Any, Callable, Optional + +import requests +from opentelemetry.sdk._logs import LogData +from opentelemetry.sdk._logs.export import LogExporter, LogExportResult +from opentelemetry.sdk.resources import Resource + +from olive.telemetry.library.callback_manager import CallbackManager +from olive.telemetry.library.event_source import event_source +from olive.telemetry.library.options import OneCollectorExporterOptions +from olive.telemetry.library.payload_builder import PayloadBuilder +from olive.telemetry.library.retry import RetryHandler +from olive.telemetry.library.serialization import CommonSchemaJsonSerializationHelper +from olive.telemetry.library.transport import HttpJsonPostTransport + +if TYPE_CHECKING: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs + + +class OneCollectorLogExporter(LogExporter): + """OpenTelemetry log exporter for Microsoft OneCollector. 
+ + Implements the OpenTelemetry LogExporter interface and sends logs + to OneCollector using the Common Schema JSON format. + """ + + def __init__( + self, + options: Optional[OneCollectorExporterOptions] = None, + excluded_attributes: Optional[set[str]] = None, + **kwargs, + ): + """Initialize the OneCollector log exporter. + + Args: + options: Exporter configuration options + excluded_attributes: Attribute keys to exclude from log attributes + **kwargs: Legacy keyword arguments for backward compatibility + - connection_string: OneCollector connection string + - headers: Additional HTTP headers + - timeout: Request timeout in seconds + - compression: Compression type + + """ + # Handle legacy initialization + if options is None: + options = self._create_options_from_kwargs(kwargs) + + # Validate options + options.validate() + + self._options = options + self._shutdown_lock = threading.Lock() + self._shutdown = False + self._shutdown_event = threading.Event() + if excluded_attributes is None: + self._excluded_attributes = { + "code.filepath", + "code.function", + "code.lineno", + "code.file.path", + "code.function.name", + "code.line.number", + } + else: + self._excluded_attributes = set(excluded_attributes) + + # Initialize transport + transport_opts = options.transport_options + + # Create or get HTTP session + if transport_opts.http_client_factory: + self._session = transport_opts.http_client_factory() + else: + self._session = requests.Session() + + # Build iKey with tenant prefix + self._ikey = f"{CommonSchemaJsonSerializationHelper.ONE_COLLECTOR_TENANCY_SYMBOL}:{options.tenant_token}" + + # Initialize callback manager + self._callback_manager = CallbackManager() + + # Initialize transport with callback manager + self._transport = HttpJsonPostTransport( + endpoint=transport_opts.endpoint, + ikey=options.instrumentation_key, + compression=transport_opts.compression, + session=self._session, + callback_manager=self._callback_manager, + ) + + # Initialize 
payload builder + self._payload_builder = PayloadBuilder( + max_size_bytes=transport_opts.max_payload_size_bytes, max_items=transport_opts.max_items_per_payload + ) + + # Initialize retry handler + self._retry_handler = RetryHandler(max_retries=6) + + # Initialize metadata + self._metadata: dict[str, Any] = {} + self._add_default_metadata() + + # Cache for resource (populated on first export) + self._resource: Optional[Resource] = None + + def _create_options_from_kwargs(self, kwargs: dict) -> OneCollectorExporterOptions: + """Create options from legacy keyword arguments.""" + from olive.telemetry.library.options import ( + CompressionType, + OneCollectorExporterOptions, + OneCollectorTransportOptions, + ) + + connection_string = kwargs.get("connection_string") + timeout = kwargs.get("timeout", 10.0) + compression = kwargs.get("compression", CompressionType.DEFLATE) + + transport_options = OneCollectorTransportOptions(timeout_seconds=timeout, compression=compression) + + return OneCollectorExporterOptions(connection_string=connection_string, transport_options=transport_options) + + def add_metadata(self, metadata: dict[str, Any]) -> None: + """Add custom metadata fields to all exported logs. + + Args: + metadata: Dictionary of metadata fields to add + + """ + self._metadata.update(metadata) + + def register_payload_transmitted_callback( + self, callback: Callable[["PayloadTransmittedCallbackArgs"], None], include_failures: bool = False + ) -> Callable[[], None]: + """Register a callback that will be invoked on payload transmission. + + Callbacks are invoked after each HTTP request completes. If retries are + enabled, callbacks will be invoked for each retry attempt. + + Args: + callback: Function to call when payload is transmitted. + Receives PayloadTransmittedCallbackArgs with transmission details. + include_failures: If True, callback is invoked on both success and failure. + If False, callback is only invoked on success. 
+ + Returns: + Function to call to unregister the callback. + + Example: + >>> def on_transmitted(args): + ... if args.succeeded: + ... print(f"✅ Sent {args.item_count} items ({args.payload_size_bytes} bytes)") + ... else: + ... print(f"❌ Failed: status={args.status_code}") + >>> + >>> unregister = exporter.register_payload_transmitted_callback( + ... on_transmitted, + ... include_failures=True + ... ) + >>> # Later: unregister() + + """ + return self._transport.register_payload_transmitted_callback(callback, include_failures) + + def export(self, batch: Sequence[LogData]) -> LogExportResult: + """Export a batch of log records. + + Args: + batch: Sequence of log data records to export + + Returns: + LogExportResult indicating success or failure + + """ + if self._shutdown: + return LogExportResult.FAILURE + + try: + # Get resource (cache for subsequent calls) + if self._resource is None: + first_item = batch[0] if batch else None + resource = getattr(first_item, "resource", None) + if resource is None and first_item is not None: + resource = getattr(first_item.log_record, "resource", None) + self._resource = resource or Resource.create() + + # Serialize log records to JSON + serialized_items = [] + for log_data in batch: + try: + item_bytes = self._serialize_log_data(log_data) + serialized_items.append(item_bytes) + except Exception as ex: + event_source.export_exception_thrown("LogData", ex) + # Continue with other items + + if not serialized_items: + return LogExportResult.FAILURE + + # Build payloads respecting size/count limits + payloads = self._build_payloads(serialized_items) + + # Send each payload with retry logic + deadline_sec = time() + self._options.transport_options.timeout_seconds + + for payload in payloads: + # Count items in this payload (approximation based on newlines) + item_count = payload.count(b"\n") + 1 if payload else 0 + + success = self._retry_handler.execute_with_retry( + operation=lambda payload=payload, item_count=item_count: 
self._transport.send( + payload, deadline_sec - time(), item_count=item_count + ), + deadline_sec=deadline_sec, + shutdown_event=self._shutdown_event, + ) + + if not success: + return LogExportResult.FAILURE + + # Check if shutdown occurred + if self._shutdown: + return LogExportResult.FAILURE + + # Log success + event_source.sink_data_written("LogData", len(batch), "OneCollector") + + return LogExportResult.SUCCESS + + except Exception as ex: + event_source.export_exception_thrown("LogData", ex) + return LogExportResult.FAILURE + + def _serialize_log_data(self, log_data: LogData) -> bytes: + """Serialize a single log record to JSON bytes. + + Args: + log_data: Log data to serialize + + Returns: + UTF-8 encoded JSON bytes + + """ + log_record = log_data.log_record + + # Build data dictionary + data = {} + + # Add resource attributes (if available) + if self._resource and self._resource.attributes: + for key, value in self._resource.attributes.items(): + # Map common resource attributes + if key == "service.name": + data["app_name"] = value + elif key == "service.version": + data["app_version"] = value + elif key == "service.instance.id": + data["app_instance_id"] = value + else: + data[key] = value + + # Add log record attributes (override resource attributes) + if log_record.attributes: + data.update( + {key: value for key, value in log_record.attributes.items() if key not in self._excluded_attributes} + ) + + # Add custom metadata + data.update(self._metadata) + + # Format timestamp + if log_record.timestamp: + timestamp = datetime.fromtimestamp(log_record.timestamp / 1e9, tz=timezone.utc) + else: + timestamp = datetime.now(timezone.utc) + + # Create event envelope + event_name = str(log_record.body) if log_record.body else "UnnamedEvent" + + envelope = CommonSchemaJsonSerializationHelper.create_event_envelope( + event_name=event_name, timestamp=timestamp, ikey=self._ikey, data=data + ) + + # Serialize to JSON bytes + return 
CommonSchemaJsonSerializationHelper.serialize_to_json_bytes(envelope) + + def _build_payloads(self, serialized_items: list[bytes]) -> list[bytes]: + """Build payloads from serialized items respecting size and count limits. + + Args: + serialized_items: List of serialized item bytes + + Returns: + List of payload bytes + + """ + payloads = [] + self._payload_builder.reset() + + for item_bytes in serialized_items: + if not self._payload_builder.can_add(item_bytes) and not self._payload_builder.is_empty(): + # Current payload is full, build it and start a new one + payloads.append(self._payload_builder.build()) + self._payload_builder.reset() + + self._payload_builder.add(item_bytes) + + # Build final payload + if not self._payload_builder.is_empty(): + payloads.append(self._payload_builder.build()) + + return payloads + + def force_flush(self, timeout_millis: float = 10_000) -> bool: + """Force flush any buffered data. + + Note: This exporter doesn't buffer data internally, so this is a no-op. + + Args: + timeout_millis: Timeout in milliseconds + + Returns: + True (always succeeds) + + """ + return True + + def shutdown(self) -> None: + """Shutdown the exporter and release resources.""" + with self._shutdown_lock: + if self._shutdown: + return + + self._shutdown = True + self._shutdown_event.set() + + # Close HTTP session + if hasattr(self, "_session"): + self._session.close() diff --git a/olive/telemetry/library/msft_log_exporter.py b/olive/telemetry/library/msft_log_exporter.py deleted file mode 100644 index 5178849509..0000000000 --- a/olive/telemetry/library/msft_log_exporter.py +++ /dev/null @@ -1,184 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -import base64 -import gzip -import hashlib -import json -import platform -import random -import threading -import zlib -from collections.abc import Sequence -from datetime import datetime -from io import BytesIO -from time import time -from typing import Any, Optional - -import requests -from deviceid import get_device_id -from opentelemetry.exporter.otlp.proto.http import Compression -from opentelemetry.exporter.otlp.proto.http._common import ( - _is_retryable, -) -from opentelemetry.sdk._logs import LogData -from opentelemetry.sdk._logs.export import ( - LogExporter, - LogExportResult, -) - -from olive.telemetry.library.constants import _ENDPOINT, _HEADERS -from olive.version import __version__ as VERSION - -_MAX_RETRYS = 6 - - -class MSFTLogExporter(LogExporter): - def __init__( - self, - headers: Optional[dict[str, str]] = None, - timeout: Optional[float] = 10, - compression: Optional[Compression] = Compression.Deflate, - ): - self._shutdown_is_occuring = threading.Event() - - self._endpoint = base64.b64decode(_ENDPOINT).decode() - self._timeout = timeout - self._compression = compression - self._session = requests.Session() - - self._headers = json.loads(base64.b64decode(_HEADERS).decode()) - self._iKey = f"o:{self._headers['x-apikey'].split('-')[0]}" - if headers: - self._headers.update(headers) - if self._compression is not Compression.NoCompression: - self._headers.update({"Content-Encoding": self._compression.value}) - self._session.headers.update(self._headers) - self._metadata = { - "device_id": self._generate_encrypted_device_id(), - "os": { - "name": platform.system().lower(), - "version": platform.version(), - "release": platform.release(), - "arch": platform.machine(), - }, - "version": VERSION, - } - - self._shutdown = False - - def _generate_encrypted_device_id(self) -> str: - """Generate a FIPS-compliant encrypted device ID using HMAC-SHA256. 
- - This method uses HMAC-SHA256 which is FIPS 140-2 approved for cryptographic operations. - The device ID is encrypted using a key derived from the existing endpoint configuration - to ensure deterministic but secure device identification. - - Returns: - str: FIPS-compliant encrypted device ID (hex-encoded) - - """ - # Get the raw device ID - raw_device_id = get_device_id().encode("utf-8") - # Use SHA256 to encrypt and use Base64 encoding - return base64.b64encode(hashlib.sha256(raw_device_id).digest()).decode("utf-8") - - def _export(self, data: bytes, timeout_sec: Optional[float] = None): - if self._compression == Compression.Deflate: - compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate - compressed_data = compressor.compress(data) - compressed_data += compressor.flush() - data = compressed_data - elif self._compression == Compression.Gzip: - gzip_data = BytesIO() - with gzip.GzipFile(fileobj=gzip_data, mode="w") as gzip_stream: - gzip_stream.write(data) - data = gzip_data.getvalue() - elif self._compression == Compression.NoCompression: - pass - - if timeout_sec is None: - timeout_sec = self._timeout - - # By default, keep-alive is enabled in Session's request - # headers. Backends may choose to close the connection - # while a post happens which causes an unhandled - # exception. 
This try/except will retry the post on such exceptions - updated_headers = {**self._headers, "Content-Length": str(len(data))} - try: - resp = self._session.post( - url=self._endpoint, - data=data, - headers=updated_headers, - timeout=timeout_sec, - ) - except requests.exceptions.ConnectionError: - resp = self._session.post( - url=self._endpoint, - data=data, - headers=updated_headers, - timeout=timeout_sec, - ) - return resp - - def add_metadata(self, metadata: dict[str, Any]): - self._metadata.update(metadata) - - def export(self, batch: Sequence[LogData]) -> LogExportResult: - if self._shutdown: - return LogExportResult.FAILURE - json_logs = [] - for log_data in batch: - log_record = log_data.log_record - data = { - k: v - for k, v in (log_record.attributes or {}).items() - if k not in {"code.file.path", "code.function.name", "code.line.number"} - } - data.update(self._metadata) - log_entry = { - "ver": "4.0", - "name": log_record.body, - "time": datetime.fromtimestamp(log_record.timestamp / 1e9).isoformat() + "Z" - if log_record.timestamp - else None, - "iKey": self._iKey, - "data": data, - } - json_logs.append(log_entry) - - deadline_sec = time() + self._timeout - shutdown = False - for log_entry in json_logs: - for retry_num in range(_MAX_RETRYS): - data = json.dumps(log_entry, ensure_ascii=False).encode("utf-8") - resp = self._export(data, deadline_sec - time()) - if resp.ok: - break - # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. 
- backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) - if ( - not _is_retryable(resp) - or retry_num + 1 == _MAX_RETRYS - or backoff_seconds > (deadline_sec - time()) - or self._shutdown - ): - return LogExportResult.FAILURE - shutdown = self._shutdown_is_occuring.wait(backoff_seconds) - if shutdown: - break - if shutdown: - break - return LogExportResult.SUCCESS - - def force_flush(self, timeout_millis: float = 10_000) -> bool: - """Nothing is buffered in this exporter, so this method does nothing.""" - return True - - def shutdown(self): - if self._shutdown: - return - self._shutdown = True - self._shutdown_is_occuring.set() - self._session.close() diff --git a/olive/telemetry/library/options.py b/olive/telemetry/library/options.py new file mode 100644 index 0000000000..dd934cad2d --- /dev/null +++ b/olive/telemetry/library/options.py @@ -0,0 +1,104 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +"""Configuration options for OneCollector exporter.""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import Callable, Optional + +import requests + +from olive.telemetry.library.connection_string_parser import ConnectionStringParser + + +class CompressionType(Enum): + """HTTP compression types supported by OneCollector.""" + + NO_COMPRESSION = "none" + DEFLATE = "deflate" + GZIP = "gzip" + + +@dataclass +class OneCollectorTransportOptions: + """Transport configuration options for OneCollector exporter.""" + + DEFAULT_ENDPOINT = "https://mobile.events.data.microsoft.com/OneCollector/1.0/" + DEFAULT_MAX_PAYLOAD_SIZE_BYTES = 4 * 1024 * 1024 # 4MB + DEFAULT_MAX_ITEMS_PER_PAYLOAD = 1500 + + endpoint: str = DEFAULT_ENDPOINT + max_payload_size_bytes: int = DEFAULT_MAX_PAYLOAD_SIZE_BYTES + max_items_per_payload: int = DEFAULT_MAX_ITEMS_PER_PAYLOAD + compression: CompressionType = CompressionType.DEFLATE + timeout_seconds: float = 10.0 + http_client_factory: Optional[Callable[[], requests.Session]] = None + + def validate(self) -> None: + """Validate the transport options. 
+ + Raises: + OneCollectorExporterValidationError: If any option is invalid + + """ + if not self.endpoint: + raise OneCollectorExporterValidationError("Endpoint is required") + + if self.max_payload_size_bytes <= 0 and self.max_payload_size_bytes != -1: + raise OneCollectorExporterValidationError("max_payload_size_bytes must be positive or -1 for unlimited") + + if self.max_items_per_payload <= 0 and self.max_items_per_payload != -1: + raise OneCollectorExporterValidationError("max_items_per_payload must be positive or -1 for unlimited") + + if self.timeout_seconds <= 0: + raise OneCollectorExporterValidationError("timeout_seconds must be positive") + + +@dataclass +class OneCollectorExporterOptions: + """Configuration options for OneCollector exporter.""" + + connection_string: Optional[str] = None + transport_options: OneCollectorTransportOptions = field(default_factory=OneCollectorTransportOptions) + + # Internal fields populated during validation + instrumentation_key: Optional[str] = field(default=None, init=False) + tenant_token: Optional[str] = field(default=None, init=False) + + def validate(self) -> None: + """Validate the exporter options and populate derived fields. 
+ + Raises: + OneCollectorExporterValidationError: If any option is invalid + + """ + if not self.connection_string: + raise OneCollectorExporterValidationError("ConnectionString is required") + + # Parse connection string + try: + parser = ConnectionStringParser(self.connection_string) + except ValueError as ex: + raise OneCollectorExporterValidationError(str(ex)) from ex + + self.instrumentation_key = parser.instrumentation_key + + if not self.instrumentation_key: + raise OneCollectorExporterValidationError("Instrumentation key not found in connection string") + + # Extract tenant token (part before first dash) + dash_pos = self.instrumentation_key.find("-") + if dash_pos < 0: + raise OneCollectorExporterValidationError(f"Invalid instrumentation key format: {self.instrumentation_key}") + + self.tenant_token = self.instrumentation_key[:dash_pos] + + # Validate transport options + self.transport_options.validate() + + +class OneCollectorExporterValidationError(Exception): + """Exception raised when OneCollector exporter options validation fails.""" diff --git a/olive/telemetry/library/payload_builder.py b/olive/telemetry/library/payload_builder.py new file mode 100644 index 0000000000..c4678a6a97 --- /dev/null +++ b/olive/telemetry/library/payload_builder.py @@ -0,0 +1,91 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Payload builder for batching telemetry items.""" + + +class PayloadBuilder: + """Builds payloads respecting size and item count limits. + + Matches the batching logic from the .NET implementation. + """ + + NEWLINE_SEPARATOR = b"\n" + + def __init__(self, max_size_bytes: int, max_items: int): + """Initialize payload builder. 
+ + Args: + max_size_bytes: Maximum payload size in bytes (-1 for unlimited) + max_items: Maximum number of items per payload (-1 for unlimited) + + """ + self.max_size_bytes = max_size_bytes + self.max_items = max_items + self.reset() + + def reset(self) -> None: + """Reset the builder to start a new payload.""" + self.items: list[bytes] = [] + self.current_size = 0 + + def can_add(self, item_bytes: bytes) -> bool: + """Check if an item can be added to the current payload. + + Args: + item_bytes: Serialized item bytes + + Returns: + True if item can be added without exceeding limits + + """ + # Check item count limit + if self.max_items != -1 and len(self.items) >= self.max_items: + return False + + # Check size limit + if self.max_size_bytes != -1: + # Calculate new size including newline separator + separator_size = len(self.NEWLINE_SEPARATOR) if self.items else 0 + new_size = self.current_size + len(item_bytes) + separator_size + + if new_size > self.max_size_bytes: + return False + + return True + + def add(self, item_bytes: bytes) -> None: + """Add an item to the current payload. + + Args: + item_bytes: Serialized item bytes + + """ + self.items.append(item_bytes) + self.current_size += len(item_bytes) + + # Account for newline separator (except for first item) + if len(self.items) > 1: + self.current_size += len(self.NEWLINE_SEPARATOR) + + def build(self) -> bytes: + """Build the final payload. 
+ + Returns: + Newline-delimited payload bytes (x-json-stream format) + + """ + if not self.items: + return b"" + + return self.NEWLINE_SEPARATOR.join(self.items) + + def item_count(self) -> int: + """Get the number of items in the current payload.""" + return len(self.items) + + def is_empty(self) -> bool: + """Check if the payload is empty.""" + return len(self.items) == 0 diff --git a/olive/telemetry/library/retry.py b/olive/telemetry/library/retry.py new file mode 100644 index 0000000000..9f0cc7cfd8 --- /dev/null +++ b/olive/telemetry/library/retry.py @@ -0,0 +1,98 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Retry logic with exponential backoff for OneCollector exporter.""" + +import random +import threading +from time import time +from typing import Callable, Optional + +from olive.telemetry.library.event_source import event_source +from olive.telemetry.library.transport import HttpJsonPostTransport + + +class RetryHandler: + """Handles retry logic with exponential backoff and jitter. + + Implements retry strategy matching the .NET implementation. + """ + + def __init__(self, max_retries: int = 6, base_delay: float = 1.0, max_delay: float = 60.0): + """Initialize retry handler. + + Args: + max_retries: Maximum number of retry attempts + base_delay: Base delay for exponential backoff (seconds) + max_delay: Maximum delay between retries (seconds) + + """ + self.max_retries = max_retries + self.base_delay = base_delay + self.max_delay = max_delay + + def execute_with_retry( + self, + operation: Callable[[], tuple[bool, Optional[int]]], + deadline_sec: float, + shutdown_event: threading.Event, + ) -> bool: + """Execute an operation with retry logic. 
+ + Args: + operation: Function that returns (success, status_code) + deadline_sec: Absolute deadline timestamp + shutdown_event: Event to signal shutdown + + Returns: + True if operation succeeded, False otherwise + + """ + for retry_num in range(self.max_retries): + # Check if we've exceeded the deadline + remaining_time = deadline_sec - time() + if remaining_time <= 0: + return False + + try: + # Execute the operation + success, status_code = operation() + + if success: + return True + + # Check if response is retryable + if not HttpJsonPostTransport.is_retryable(status_code): + return False + + except Exception as ex: + event_source.export_exception_thrown("RetryHandler", ex) + + # Last retry - don't wait + if retry_num + 1 == self.max_retries: + return False + + # Last retry - failed + if retry_num + 1 == self.max_retries: + return False + + # Calculate backoff with exponential increase and jitter + backoff = min(self.base_delay * (2**retry_num), self.max_delay) + # Add +/-20% jitter + backoff *= random.uniform(0.8, 1.2) + + # Don't wait longer than remaining time + remaining_time = deadline_sec - time() + wait_time = min(backoff, remaining_time) + + if wait_time <= 0: + return False + + # Wait with ability to interrupt on shutdown + if shutdown_event.wait(wait_time): + # Shutdown occurred + return False + + return False diff --git a/olive/telemetry/library/serialization.py b/olive/telemetry/library/serialization.py new file mode 100644 index 0000000000..8c9af6304a --- /dev/null +++ b/olive/telemetry/library/serialization.py @@ -0,0 +1,141 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +"""JSON serialization helper for Common Schema format.""" + +import base64 +import json +from datetime import date, datetime, time, timedelta +from typing import Any +from uuid import UUID + + +class CommonSchemaJsonSerializationHelper: + """Helper class for serializing values to Common Schema JSON format. + + Matches the .NET implementation in CommonSchemaJsonSerializationHelper.cs + """ + + # Common Schema constants + ONE_COLLECTOR_TENANCY_SYMBOL = "o" + SCHEMA_VERSION = "4.0" + + @staticmethod + def serialize_value(value: Any) -> Any: + """Serialize a Python value to JSON-compatible format. + + Args: + value: The value to serialize + + Returns: + JSON-serializable representation of the value + + """ + if value is None: + return None + + # Boolean + if isinstance(value, bool): + return value + + # Numeric types + if isinstance(value, (int, float)): + return value + + # String + if isinstance(value, str): + return value + + # DateTime types + if isinstance(value, datetime): + # Convert to UTC ISO 8601 format with 'Z' suffix + if value.tzinfo is None: + # Assume naive datetime is UTC + return value.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" + else: + return value.astimezone().isoformat() + + if isinstance(value, date): + return value.isoformat() + + if isinstance(value, time): + return value.isoformat() + + if isinstance(value, timedelta): + # Format as ISO 8601 duration + total_seconds = int(value.total_seconds()) + hours, remainder = divmod(abs(total_seconds), 3600) + minutes, seconds = divmod(remainder, 60) + sign = "-" if total_seconds < 0 else "" + return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}" + + # UUID/GUID + if isinstance(value, UUID): + return str(value) + + # Bytes - encode as base64 + if isinstance(value, (bytes, bytearray)): + return base64.b64encode(bytes(value)).decode("ascii") + + # Arrays/Lists + if isinstance(value, (list, tuple)): + return 
[CommonSchemaJsonSerializationHelper.serialize_value(item) for item in value] + + # Dictionary/Map + if isinstance(value, dict): + result = {} + for k, v in value.items(): + if k: # Skip empty keys + result[str(k)] = CommonSchemaJsonSerializationHelper.serialize_value(v) + return result + + # Default: convert to string + try: + return str(value) + except Exception: + return f"ERROR: type {type(value).__name__} is not supported" + + @staticmethod + def create_event_envelope( + event_name: str, timestamp: datetime, ikey: str, data: dict[str, Any], extensions: dict[str, Any] | None = None + ) -> dict[str, Any]: + """Create a Common Schema event envelope. + + Args: + event_name: Full event name (namespace.name) + timestamp: Event timestamp + ikey: Instrumentation key with tenant prefix + data: Event data/attributes + extensions: Optional extension fields + + Returns: + Common Schema event envelope as dictionary + + """ + envelope = { + "ver": CommonSchemaJsonSerializationHelper.SCHEMA_VERSION, + "name": event_name, + "time": CommonSchemaJsonSerializationHelper.serialize_value(timestamp), + "iKey": ikey, + "data": CommonSchemaJsonSerializationHelper.serialize_value(data), + } + + if extensions: + envelope["ext"] = CommonSchemaJsonSerializationHelper.serialize_value(extensions) + + return envelope + + @staticmethod + def serialize_to_json_bytes(envelope: dict[str, Any]) -> bytes: + """Serialize an envelope to JSON bytes. + + Args: + envelope: Event envelope dictionary + + Returns: + UTF-8 encoded JSON bytes + + """ + return json.dumps(envelope, ensure_ascii=False, separators=(",", ":")).encode("utf-8") diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 1deb110884..41411b916d 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -2,68 +2,168 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- + +"""High-level telemetry logger facade for easy usage.""" + import logging -from typing import Any +from typing import Any, Optional from opentelemetry._logs import set_logger_provider from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler -from opentelemetry.sdk._logs.export import BatchLogRecordProcessor, LogExporter +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource -from olive.telemetry.library.msft_log_exporter import MSFTLogExporter +from olive.telemetry.library.exporter import OneCollectorLogExporter +from olive.telemetry.library.options import OneCollectorExporterOptions class TelemetryLogger: - _instance = None # Class-level attribute to store the single instance - _logger: logging.Logger = None - _logger_exporter: LogExporter = None - _logger_provider: LoggerProvider = None + """Singleton telemetry logger for simplified OneCollector integration. + + Provides a simple interface for logging telemetry events without + needing to configure OpenTelemetry directly. + """ + + _instance: Optional["TelemetryLogger"] = None + _logger: Optional[logging.Logger] = None + _logger_exporter: Optional[OneCollectorLogExporter] = None + _logger_provider: Optional[LoggerProvider] = None + + def __new__(cls, options: Optional[OneCollectorExporterOptions] = None): + """Create or return the singleton instance. 
- def __new__(cls, *args, **kwargs): - # Check if an instance already exists + Args: + options: Exporter options (only used on first instantiation) + + """ if cls._instance is None: - # If not, create a new instance and store it in _instance cls._instance = super().__new__(cls) + cls._instance._initialize(options) - try: - cls._logger_exporter = MSFTLogExporter() - cls._logger_provider = LoggerProvider( - resource=Resource.create( - { - "service.name": __name__, - "service.instance.id": f"{__name__}-instance", - } - ), - ) - set_logger_provider(cls._logger_provider) - cls._logger_provider.add_log_record_processor(BatchLogRecordProcessor(cls._logger_exporter)) - handler = LoggingHandler(level=logging.INFO, logger_provider=cls._logger_provider) - - logger = logging.getLogger(__name__) - logger.propagate = False - logger.setLevel(logging.INFO) - logger.addHandler(handler) - cls._logger = logger - except Exception: - # If any error occurs during initialization, we will not set up the logger and will silently fail. - cls._logger = None - cls._logger_provider = None return cls._instance - def __init__(self): - pass + def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: + """Initialize the logger (called only once). 
+ + Args: + options: Exporter configuration options + + """ + try: + # Create exporter + self._logger_exporter = OneCollectorLogExporter(options=options) + + # Create logger provider + self._logger_provider = LoggerProvider( + resource=Resource.create( + { + "service.name": "telemetry_logger", + "service.instance.id": "telemetry_logger-instance", + } + ) + ) + + # Set as global logger provider + set_logger_provider(self._logger_provider) + + # Add batch processor + self._logger_provider.add_log_record_processor(BatchLogRecordProcessor(self._logger_exporter)) + + # Create logging handler + handler = LoggingHandler(level=logging.INFO, logger_provider=self._logger_provider) + + # Set up Python logger + logger = logging.getLogger(__name__) + logger.propagate = False + logger.setLevel(logging.INFO) + logger.addHandler(handler) + + self._logger = logger + + except Exception: + # Silently fail initialization - logger will be None + self._logger = None + self._logger_provider = None + self._logger_exporter = None + + def add_metadata(self, metadata: dict[str, Any]) -> None: + """Add metadata fields to all telemetry events. + + Args: + metadata: Dictionary of metadata to add - def add_metadata(self, metadata: dict[str, Any]): - self._logger_exporter.add_metadata(metadata) + """ + if self._logger_exporter: + self._logger_exporter.add_metadata(metadata) - def log(self, event_name: str, information: dict[str, Any]): - if self._logger: # in case the logger was not initialized properly - self._logger.info(event_name, extra=information) + def log(self, event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: + """Log a telemetry event. 
- def disable_telemetry(self): - if self._logger: # in case the logger was not initialized properly + Args: + event_name: Name of the event + attributes: Optional event attributes + + """ + if self._logger: + extra = attributes if attributes else {} + self._logger.info(event_name, extra=extra) + + def disable_telemetry(self) -> None: + """Disable telemetry logging.""" + if self._logger: self._logger.disabled = True - def shutdown(self): - if self._logger_provider: # in case the logger provider was not initialized properly + def enable_telemetry(self) -> None: + """Enable telemetry logging.""" + if self._logger: + self._logger.disabled = False + + def shutdown(self) -> None: + """Shutdown the telemetry logger and flush pending data.""" + if self._logger_provider: self._logger_provider.shutdown() + + +# Convenience functions for common use cases +_default_logger: Optional[TelemetryLogger] = None + + +def get_telemetry_logger(connection_string: Optional[str] = None) -> TelemetryLogger: + """Get or create the default telemetry logger. + + Args: + connection_string: OneCollector connection string (only used on first call) + + Returns: + TelemetryLogger instance + + """ + global _default_logger + + if _default_logger is None: + options = None + if connection_string: + options = OneCollectorExporterOptions(connection_string=connection_string) + _default_logger = TelemetryLogger(options=options) + + return _default_logger + + +def log_event(event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: + """Log a telemetry event using the default logger. 
+ + Args: + event_name: Name of the event + attributes: Optional event attributes + + """ + logger = get_telemetry_logger() + logger.log(event_name, attributes) + + +def shutdown_telemetry() -> None: + """Shutdown the default telemetry logger.""" + global _default_logger + if _default_logger: + _default_logger.shutdown() + _default_logger = None diff --git a/olive/telemetry/library/transport.py b/olive/telemetry/library/transport.py new file mode 100644 index 0000000000..5053b8a744 --- /dev/null +++ b/olive/telemetry/library/transport.py @@ -0,0 +1,248 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""HTTP transport implementation for OneCollector exporter.""" + +import gzip +import zlib +from abc import ABC, abstractmethod +from io import BytesIO +from typing import TYPE_CHECKING, Callable, Optional + +import requests + +from olive.telemetry.library.event_source import event_source +from olive.telemetry.library.options import CompressionType + +if TYPE_CHECKING: + from olive.telemetry.library.callback_manager import CallbackManager, PayloadTransmittedCallbackArgs + + +class ITransport(ABC): + """Abstract base class for transports.""" + + @abstractmethod + def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple[bool, Optional[int]]: + """Send a payload. + + Args: + payload: The data to send + timeout_sec: Timeout in seconds + item_count: Number of items in the payload (for callbacks) + + Returns: + Tuple of (success, status_code) + + """ + + @abstractmethod + def register_payload_transmitted_callback( + self, callback: Callable[["PayloadTransmittedCallbackArgs"], None], include_failures: bool = False + ) -> Callable[[], None]: + """Register a callback for payload transmission events. 
+ + Args: + callback: Function to call when payload is transmitted + include_failures: Whether to invoke callback on failures + + Returns: + Function to call to unregister the callback + + """ + + +class HttpJsonPostTransport(ITransport): + """HTTP JSON POST transport implementation. + + Sends telemetry data to OneCollector via HTTP POST with JSON payload. + """ + + def __init__( + self, + endpoint: str, + ikey: str, + compression: CompressionType, + session: requests.Session, + callback_manager: Optional["CallbackManager"] = None, + sdk_version: str = "OTel-python-1.0.0", + ): + """Initialize the HTTP transport. + + Args: + endpoint: OneCollector endpoint URL + ikey: Instrumentation key + compression: Compression type to use + session: Requests session for connection pooling + callback_manager: Optional callback manager for payload events + sdk_version: SDK version string + + """ + self.endpoint = endpoint + self.ikey = ikey + self.compression = compression + self.session = session + self.sdk_version = sdk_version + self.callback_manager = callback_manager + + # Build base headers + self.headers = { + "x-apikey": ikey, + "User-Agent": "Python/3 HttpClient", + "Host": "mobile.events.data.microsoft.com", + "Content-Type": "application/x-json-stream; charset=utf-8", + "sdk-version": sdk_version, + "NoResponseBody": "true", + } + + if compression != CompressionType.NO_COMPRESSION: + self.headers["Content-Encoding"] = compression.value + + def register_payload_transmitted_callback( + self, callback: Callable[["PayloadTransmittedCallbackArgs"], None], include_failures: bool = False + ) -> Callable[[], None]: + """Register a callback for payload transmission events. 
+ + Args: + callback: Function to call when payload is transmitted + include_failures: Whether to invoke callback on failures + + Returns: + Function to call to unregister the callback + + """ + if self.callback_manager is None: + # Import here to avoid circular dependency + from olive.telemetry.library.callback_manager import CallbackManager + + self.callback_manager = CallbackManager() + + return self.callback_manager.register(callback, include_failures) + + def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple[bool, Optional[int]]: + """Send payload via HTTP POST. + + Args: + payload: Uncompressed payload bytes + timeout_sec: Request timeout in seconds + item_count: Number of items in the payload (for callbacks) + + Returns: + Tuple of (success, status_code) + + """ + payload_size_bytes = len(payload) + + try: + # Compress payload + compressed_payload = self._compress(payload) + + # Update headers with content length + headers = {**self.headers, "Content-Length": str(len(compressed_payload))} + + # Send request + try: + response = self.session.post( + url=self.endpoint, data=compressed_payload, headers=headers, timeout=timeout_sec + ) + except requests.exceptions.ConnectionError: + # Retry once on connection error + response = self.session.post( + url=self.endpoint, data=compressed_payload, headers=headers, timeout=timeout_sec + ) + + # Check response + success = response.ok + status_code = response.status_code + + # Invoke callbacks + if self.callback_manager: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs + + self.callback_manager.notify( + PayloadTransmittedCallbackArgs( + succeeded=success, + status_code=status_code, + payload_size_bytes=payload_size_bytes, + item_count=item_count, + ) + ) + + if success: + return True, status_code + else: + # Log error response + error_message = response.text[:100] if response.text else "" + event_source.http_transport_error_response("HttpJsonPost", 
status_code, error_message, "") + return False, status_code + + except requests.exceptions.Timeout: + # Invoke failure callbacks + if self.callback_manager: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs + + self.callback_manager.notify( + PayloadTransmittedCallbackArgs( + succeeded=False, status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count + ) + ) + + event_source.transport_exception_thrown("HttpJsonPost", Exception("Request timeout")) + return False, None + except Exception as ex: + # Invoke failure callbacks + if self.callback_manager: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs + + self.callback_manager.notify( + PayloadTransmittedCallbackArgs( + succeeded=False, status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count + ) + ) + + event_source.transport_exception_thrown("HttpJsonPost", ex) + return False, None + + def _compress(self, data: bytes) -> bytes: + """Compress data according to configured compression type. + + Args: + data: Uncompressed data + + Returns: + Compressed data + + """ + if self.compression == CompressionType.DEFLATE: + # Raw deflate (no zlib header) + compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) + compressed = compressor.compress(data) + compressed += compressor.flush() + return compressed + + elif self.compression == CompressionType.GZIP: + gzip_buffer = BytesIO() + with gzip.GzipFile(fileobj=gzip_buffer, mode="w") as gzip_file: + gzip_file.write(data) + return gzip_buffer.getvalue() + + else: # NO_COMPRESSION + return data + + @staticmethod + def is_retryable(status_code: Optional[int]) -> bool: + """Check if a response status code indicates the request should be retried. 
+ + Args: + status_code: HTTP status code, or None if request failed + + Returns: + True if request should be retried + + """ + if status_code is None: + return True # Network errors are retryable + + # Retryable status codes + return status_code in {408, 429, 500, 502, 503, 504} diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 0bcdec68b1..526114935f 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -1,76 +1,150 @@ -import functools -import inspect -from datetime import datetime -from typing import Any - -from olive.telemetry.telemetry_events import log_action, log_error -from olive.telemetry.utils import _format_exception_msg - - -# For more complex tracking scenarios -class ActionContext: - def __init__(self, event_name: str, called_from: str = "ContextManager"): - self.event_name = event_name - self.start_time = datetime.now() - self.metadata = {} - self.called_from = called_from - - def add_metadata(self, key: str, value: Any): - self.metadata[key] = value - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # Send telemetry with all collected metadata - log_action( - self.event_name, - self.called_from, - self.start_time, - int((datetime.now() - self.start_time).total_seconds() * 1000), - exc_type is None, - ) - - if exc_type is not None: - log_error(self.called_from, self.start_time, exc_val, _format_exception_msg(exc_tb)) - - -def action(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - stack = inspect.stack() - caller_frame = stack[1] - caller_module = inspect.getmodule(caller_frame[0]) - called_from = caller_module.__name__ - - if caller_module is None: - called_from = "Interactive" - elif caller_module.__name__ == "__main__": - called_from = "Script" - - success = False - exception = None - start_time = datetime.now() +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Thin wrapper around the OneCollector telemetry logger with event helpers.""" + +import base64 +import platform +import uuid +from typing import Any, Optional + +from olive.telemetry.constants import CONNECTION_STRING +from olive.telemetry.library.telemetry_logger import TelemetryLogger as _LibraryTelemetryLogger +from olive.telemetry.library.telemetry_logger import get_telemetry_logger +from olive.telemetry.utils import _generate_encrypted_device_id +from olive.version import __version__ as VERSION + +# Default event names used by the high-level telemetry helpers. +HEARTBEAT_EVENT_NAME = "OliveHeartbeat" +ACTION_EVENT_NAME = "OliveAction" +ERROR_EVENT_NAME = "OliveError" + + +class Telemetry: + """Wrapper that wires environment configuration into the library logger.""" + + _instance: Optional["Telemetry"] = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if self._initialized: + return + self._logger = self._create_logger() + self._session_id = str(uuid.uuid4()) + self._initialized = True + + def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: try: - result = func(*args, **kwargs) - success = True - except Exception as ex: - result = None - exception = ex - duration_ms = int((datetime.now() - start_time).total_seconds() * 1000) - action_name = args[0].__class__.__name__ if args else "Invalid" - - if action_name.endswith("Command"): - action_name = action_name[: -len("Command")] - - log_action(called_from, action_name, duration_ms, success) - - if exception: - exception_type = type(exception).__name__ - exception_message = _format_exception_msg(exception) - log_error(called_from, exception_type, exception_message) - raise exception - - return result - - return wrapper + return 
get_telemetry_logger(base64.b64decode(CONNECTION_STRING).decode()) + except Exception: + return None + + def add_metadata(self, metadata: dict[str, Any]) -> None: + if self._logger: + self._logger.add_metadata(metadata) + + def log(self, event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: + if self._logger: + # Always include session_id in every event + attrs = dict(attributes or {}) + attrs["session_id"] = self._session_id + self._logger.log(event_name, attrs) + + def disable_telemetry(self) -> None: + if self._logger: + self._logger.disable_telemetry() + + def shutdown(self) -> None: + if self._logger: + self._logger.shutdown() + + +def _get_logger() -> Telemetry: + """Get or create the singleton Telemetry instance.""" + return Telemetry() + + +def _merge_metadata(attributes: dict[str, Any], metadata: Optional[dict[str, Any]]) -> dict[str, Any]: + if metadata: + return {**attributes, **metadata} + return attributes + + +def log_heartbeat( + metadata: Optional[dict[str, Any]] = None, +) -> None: + """Log a heartbeat event with system information. + + Args: + metadata: Optional additional metadata to include. + + """ + logger = _get_logger() + attributes = { + "device_id": _generate_encrypted_device_id(), + "os": { + "name": platform.system().lower(), + "version": platform.version(), + "release": platform.release(), + "arch": platform.machine(), + }, + "version": VERSION, + } + logger.log(HEARTBEAT_EVENT_NAME, _merge_metadata(attributes, metadata)) + + +def log_action( + invoked_from: str, + action_name: str, + duration_ms: float, + success: bool, + metadata: Optional[dict[str, Any]] = None, +) -> None: + """Log an action event. + + Args: + invoked_from: Where the action was invoked from. + action_name: Name of the action. + duration_ms: Duration in milliseconds. + success: Whether the action succeeded. + metadata: Optional additional metadata to include. 
+ + """ + logger = _get_logger() + attributes = { + "invoked_from": invoked_from, + "action_name": action_name, + "duration_ms": duration_ms, + "success": success, + } + logger.log(ACTION_EVENT_NAME, _merge_metadata(attributes, metadata)) + + +def log_error( + invoked_from: str, + exception_type: str, + exception_message: str, + metadata: Optional[dict[str, Any]] = None, +) -> None: + """Log an error event. + + Args: + invoked_from: Where the error occurred. + exception_type: Type of the exception. + exception_message: Exception message. + metadata: Optional additional metadata to include. + + """ + logger = _get_logger() + attributes = { + "invoked_from": invoked_from, + "exception_type": exception_type, + "exception_message": exception_message, + } + logger.log(ERROR_EVENT_NAME, _merge_metadata(attributes, metadata)) diff --git a/olive/telemetry/telemetry_decorators.py b/olive/telemetry/telemetry_decorators.py new file mode 100644 index 0000000000..2f0381ca8a --- /dev/null +++ b/olive/telemetry/telemetry_decorators.py @@ -0,0 +1,127 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import functools +import inspect +import time +from types import TracebackType +from typing import Any, Callable, Optional, TypeVar + +from olive.telemetry.telemetry import log_action, log_error +from olive.telemetry.utils import _format_exception_msg + +_TFunc = TypeVar("_TFunc", bound=Callable[..., Any]) + + +def _resolve_invoked_from(skip_frames: int = 0) -> str: + """Resolve how Olive was invoked by examining the call stack. + + Walks up the stack to find the first frame outside the olive package, + which indicates how the user invoked Olive (CLI, script, interactive, etc.). + + :param skip_frames: Number of additional frames to skip (for internal use). 
+ :return: A string indicating how Olive was invoked. + """ + for frame_info in inspect.stack()[2 + skip_frames :]: # skip this function and caller + module = inspect.getmodule(frame_info.frame) + if module is None: + # Could be interactive or dynamically generated code + continue + module_name = module.__name__ + # Skip olive internals to find user code + if module_name.startswith("olive."): + continue + if module_name == "__main__": + return "Script" + return module_name + return "Interactive" + + +class ActionContext: + """Context manager for recording telemetry around a block of work.""" + + def __init__( + self, + action_name: str, + invoked_from: Optional[str] = None, + metadata: Optional[dict[str, Any]] = None, + ): + self.action_name = action_name + self.invoked_from = invoked_from if invoked_from is not None else _resolve_invoked_from() + self.metadata = metadata or {} + self._start_time: Optional[float] = None + + def add_metadata(self, key: str, value: Any) -> None: + self.metadata[key] = value + + def __enter__(self) -> "ActionContext": + self._start_time = time.perf_counter() + return self + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> bool: + duration_ms = int((time.perf_counter() - (self._start_time or time.perf_counter())) * 1000) + success = exc_type is None + + log_action( + invoked_from=self.invoked_from, + action_name=self.action_name, + duration_ms=duration_ms, + success=success, + metadata=self.metadata, + ) + + if exc_type is not None and exc_val is not None: + log_error( + invoked_from=self.invoked_from, + exception_type=exc_type.__name__, + exception_message=_format_exception_msg(exc_val, exc_tb), + metadata=self.metadata, + ) + + # Do not suppress exceptions + return False + + +def action(func: _TFunc) -> _TFunc: + """Record telemetry around a function call.""" + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any): + 
invoked_from = _resolve_invoked_from() + action_name = func.__name__ + if args and hasattr(args[0], "__class__"): + cls_name = args[0].__class__.__name__ + if cls_name: + action_name = f"{cls_name}.{action_name}" + if action_name.endswith(f"Command.{func.__name__}"): + action_name = f"{cls_name[: -len('Command')]}.{func.__name__}" + + start_time = time.perf_counter() + success = True + try: + return func(*args, **kwargs) + except Exception as exc: + success = False + log_error( + invoked_from=invoked_from, + exception_type=type(exc).__name__, + exception_message=_format_exception_msg(exc, exc.__traceback__), + ) + raise + finally: + duration_ms = int((time.perf_counter() - start_time) * 1000) + log_action( + invoked_from=invoked_from, + action_name=action_name, + duration_ms=duration_ms, + success=success, + ) + + return wrapper # type: ignore[return-value] diff --git a/olive/telemetry/telemetry_events.py b/olive/telemetry/telemetry_events.py deleted file mode 100644 index 41cdb6a9dd..0000000000 --- a/olive/telemetry/telemetry_events.py +++ /dev/null @@ -1,30 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -from olive.telemetry.library.telemetry_logger import TelemetryLogger - -logger = TelemetryLogger() - - -def log_action(called_from: str, action_name: str, duration_ms: float, success: bool): - logger.log( - "OliveAction", - { - "called_from": called_from, - "action_name": action_name, - "duration_ms": duration_ms, - "success": success, - }, - ) - - -def log_error(called_from: str, exception_type: str, exception_message: str): - logger.log( - "OliveError", - { - "called_from": called_from, - "exception_type": exception_type, - "exception_message": exception_message, - }, - ) diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 2560675cdd..4eba04e0d4 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -2,15 +2,35 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import hashlib import traceback +from types import TracebackType +from typing import Optional +from olive.telemetry.deviceid import get_device_id -def _format_exception_msg(ex: Exception) -> str: + +def _generate_encrypted_device_id() -> str: + """Generate a FIPS-compliant encrypted device ID using SHA256. + + This method uses SHA256 which is FIPS 140-2 approved for cryptographic operations. + The device ID is hashed to ensure deterministic but secure device identification. 
+ +    Returns: +        str: FIPS-compliant encrypted device ID (hex-encoded, uppercase) + +    """ +    hash_bytes = hashlib.sha256(get_device_id().encode("utf-8")).digest() +    return hash_bytes.hex().upper() + + +def _format_exception_msg(ex: BaseException, tb: Optional[TracebackType] = None) -> str: +    """Format an exception and trim local paths for readability.""" folder = "Olive" file_line = 'File "' -    ex = traceback.format_exception(ex, limit=5) +    formatted = traceback.format_exception(type(ex), ex, tb, limit=5) lines = [] -    for line in ex: +    for line in formatted: line_trunc = line.strip() if line_trunc.startswith(file_line) and folder in line_trunc: idx = line_trunc.find(folder) diff --git a/requirements.txt b/requirements.txt index 108bfd510a..834e4afed4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,11 +3,9 @@ numpy onnx onnx_ir>=0.1.2 onnxscript>=0.3.0 -opentelemetry-exporter-otlp-proto-http>=1.36.0 opentelemetry-sdk>=1.36.0 optuna pandas -py-deviceid>=0.1.1 pydantic pyyaml torch From 65b99adaa8b77b248ca215013b73b2acb759b6da Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 1 Feb 2026 03:15:06 -0600 Subject: [PATCH 17/31] Fixes --- olive/__init__.py | 2 ++ olive/cli/benchmark.py | 2 ++ olive/cli/diffusion_lora.py | 2 ++ olive/telemetry/__init__.py | 2 +- olive/telemetry/library/exporter.py | 12 ++++++------ requirements.txt | 4 ++-- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/olive/__init__.py b/olive/__init__.py index cfbeb721db..7e48936701 100644 --- a/olive/__init__.py +++ b/olive/__init__.py @@ -31,8 +31,10 @@ tune_session_params, ) from olive.engine.output import ModelOutput, WorkflowOutput # noqa: E402 +from olive.version import __version__ # noqa: E402 __all__ = [ + "__version__", "ModelOutput", "WorkflowOutput", # Python API functions diff --git a/olive/cli/benchmark.py b/olive/cli/benchmark.py index 575c591271..85e0001ac1 100644 --- a/olive/cli/benchmark.py +++ b/olive/cli/benchmark.py @@ -11,6 +11,7 @@ add_logging_options,
add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, get_input_model_config, update_shared_cache_options, ) @@ -70,6 +71,7 @@ def register_subcommand(parser: ArgumentParser): add_logging_options(sub_parser) add_save_config_file_options(sub_parser) add_shared_cache_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=BenchmarkCommand) @action diff --git a/olive/cli/diffusion_lora.py b/olive/cli/diffusion_lora.py index dae57d42da..e51d3ab170 100644 --- a/olive/cli/diffusion_lora.py +++ b/olive/cli/diffusion_lora.py @@ -11,6 +11,7 @@ add_logging_options, add_save_config_file_options, add_shared_cache_options, + add_telemetry_options, update_shared_cache_options, ) from olive.common.utils import set_nested_dict_value @@ -238,6 +239,7 @@ def register_subcommand(parser: ArgumentParser): add_shared_cache_options(sub_parser) add_logging_options(sub_parser) add_save_config_file_options(sub_parser) + add_telemetry_options(sub_parser) sub_parser.set_defaults(func=DiffusionLoraCommand) @action diff --git a/olive/telemetry/__init__.py b/olive/telemetry/__init__.py index 358fbeb70b..3ee813dc24 100644 --- a/olive/telemetry/__init__.py +++ b/olive/telemetry/__init__.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from olive.telemetry.telemetry_decorators import ActionContext, action from olive.telemetry.telemetry import Telemetry +from olive.telemetry.telemetry_decorators import ActionContext, action __all__ = ["ActionContext", "Telemetry", "action"] diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index d7ecbe18b6..7ef7091a63 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Any, Callable, Optional import requests -from opentelemetry.sdk._logs import LogData +from opentelemetry.sdk._logs import ReadableLogRecord from opentelemetry.sdk._logs.export import LogExporter, LogExportResult from opentelemetry.sdk.resources import Resource @@ -173,7 +173,7 @@ def register_payload_transmitted_callback( """ return self._transport.register_payload_transmitted_callback(callback, include_failures) - def export(self, batch: Sequence[LogData]) -> LogExportResult: + def export(self, batch: Sequence[ReadableLogRecord]) -> LogExportResult: """Export a batch of log records. 
Args: @@ -202,7 +202,7 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: item_bytes = self._serialize_log_data(log_data) serialized_items.append(item_bytes) except Exception as ex: - event_source.export_exception_thrown("LogData", ex) + event_source.export_exception_thrown("ReadableLogRecord", ex) # Continue with other items if not serialized_items: @@ -234,15 +234,15 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE # Log success - event_source.sink_data_written("LogData", len(batch), "OneCollector") + event_source.sink_data_written("ReadableLogRecord", len(batch), "OneCollector") return LogExportResult.SUCCESS except Exception as ex: - event_source.export_exception_thrown("LogData", ex) + event_source.export_exception_thrown("ReadableLogRecord", ex) return LogExportResult.FAILURE - def _serialize_log_data(self, log_data: LogData) -> bytes: + def _serialize_log_data(self, log_data: ReadableLogRecord) -> bytes: """Serialize a single log record to JSON bytes. 
Args: diff --git a/requirements.txt b/requirements.txt index 834e4afed4..efd1fe7971 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ hf-xet numpy onnx onnx_ir>=0.1.2 -onnxscript>=0.3.0 -opentelemetry-sdk>=1.36.0 +onnxscript>=0.5.3 +opentelemetry-sdk>=1.39.1 optuna pandas pydantic From 996329cddd5fc01ba0721c709e9506a6379c7240 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 04:25:56 -0600 Subject: [PATCH 18/31] Add telemetry cache; cleanup --- olive/__init__.py | 2 +- olive/telemetry/__init__.py | 4 +- olive/telemetry/deviceid/_store.py | 6 +- olive/telemetry/library/event_source.py | 4 + olive/telemetry/library/exporter.py | 14 +- olive/telemetry/library/serialization.py | 6 +- olive/telemetry/library/telemetry_logger.py | 35 +- olive/telemetry/library/transport.py | 11 +- olive/telemetry/telemetry.py | 479 ++++++++++++++---- ..._decorators.py => telemetry_extensions.py} | 39 +- 10 files changed, 487 insertions(+), 113 deletions(-) rename olive/telemetry/{telemetry_decorators.py => telemetry_extensions.py} (79%) diff --git a/olive/__init__.py b/olive/__init__.py index 7e48936701..9a59f61ab0 100644 --- a/olive/__init__.py +++ b/olive/__init__.py @@ -34,9 +34,9 @@ from olive.version import __version__ # noqa: E402 __all__ = [ - "__version__", "ModelOutput", "WorkflowOutput", + "__version__", # Python API functions "benchmark", "capture_onnx_graph", diff --git a/olive/telemetry/__init__.py b/olive/telemetry/__init__.py index 3ee813dc24..0ecbbc7056 100644 --- a/olive/telemetry/__init__.py +++ b/olive/telemetry/__init__.py @@ -3,6 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from olive.telemetry.telemetry import Telemetry -from olive.telemetry.telemetry_decorators import ActionContext, action +from olive.telemetry.telemetry_extensions import action -__all__ = ["ActionContext", "Telemetry", "action"] +__all__ = ["Telemetry", "action"] diff --git a/olive/telemetry/deviceid/_store.py b/olive/telemetry/deviceid/_store.py index 3fd554d701..aeddd918f4 100644 --- a/olive/telemetry/deviceid/_store.py +++ b/olive/telemetry/deviceid/_store.py @@ -13,7 +13,7 @@ def __init__(self) -> None: def _build_path(self) -> Path: os_name = platform.system() - if os_name in ("Darwin"): + if os_name == "Darwin": home = os.getenv("HOME") if home is None: raise ValueError("HOME environment variable not set") @@ -82,3 +82,7 @@ def store_id(self, device_id: str) -> None: access=winreg.KEY_ALL_ACCESS | winreg.KEY_WOW64_64KEY, ) as key_handle: winreg.SetValueEx(key_handle, REGISTRY_KEY, 0, winreg.REG_SZ, device_id) + + +def get_device_id_store_path() -> Path: + return Store()._build_path() diff --git a/olive/telemetry/library/event_source.py b/olive/telemetry/library/event_source.py index a9829edb93..198c17398d 100644 --- a/olive/telemetry/library/event_source.py +++ b/olive/telemetry/library/event_source.py @@ -237,6 +237,10 @@ def attribute_dropped(self, item_type: str, attribute_name: str, reason: str) -> extra={"event_id": OneCollectorEventId.ATTRIBUTE_DROPPED}, ) + def disable(self) -> None: + """Disable telemetry logging.""" + self.logger.disabled = True + # Global event source instance event_source = OneCollectorEventSource() diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index 7ef7091a63..05a6775466 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -13,7 +13,7 @@ import requests from opentelemetry.sdk._logs import ReadableLogRecord -from opentelemetry.sdk._logs.export import LogExporter, 
LogExportResult +from opentelemetry.sdk._logs.export import LogExportResult, LogRecordExporter from opentelemetry.sdk.resources import Resource from olive.telemetry.library.callback_manager import CallbackManager @@ -28,10 +28,10 @@ from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs -class OneCollectorLogExporter(LogExporter): +class OneCollectorLogExporter(LogRecordExporter): """OpenTelemetry log exporter for Microsoft OneCollector. - Implements the OpenTelemetry LogExporter interface and sends logs + Implements the OpenTelemetry LogRecordExporter interface and sends logs to OneCollector using the Common Schema JSON format. """ @@ -110,7 +110,6 @@ def __init__( # Initialize metadata self._metadata: dict[str, Any] = {} - self._add_default_metadata() # Cache for resource (populated on first export) self._resource: Optional[Resource] = None @@ -217,7 +216,6 @@ def export(self, batch: Sequence[ReadableLogRecord]) -> LogExportResult: for payload in payloads: # Count items in this payload (approximation based on newlines) item_count = payload.count(b"\n") + 1 if payload else 0 - success = self._retry_handler.execute_with_retry( operation=lambda payload=payload, item_count=item_count: self._transport.send( payload, deadline_sec - time(), item_count=item_count @@ -261,11 +259,11 @@ def _serialize_log_data(self, log_data: ReadableLogRecord) -> bytes: if self._resource and self._resource.attributes: for key, value in self._resource.attributes.items(): # Map common resource attributes - if key == "service.name": + if key == "service.name" and "app_name" not in data: data["app_name"] = value - elif key == "service.version": + elif key == "service.version" and "app_version" not in data: data["app_version"] = value - elif key == "service.instance.id": + elif key == "service.instance.id" and "app_instance_id" not in data: data["app_instance_id"] = value else: data[key] = value diff --git a/olive/telemetry/library/serialization.py 
b/olive/telemetry/library/serialization.py index 8c9af6304a..069f85d7e1 100644 --- a/olive/telemetry/library/serialization.py +++ b/olive/telemetry/library/serialization.py @@ -7,7 +7,7 @@ import base64 import json -from datetime import date, datetime, time, timedelta +from datetime import date, datetime, time, timedelta, timezone from typing import Any from uuid import UUID @@ -54,8 +54,8 @@ def serialize_value(value: Any) -> Any: if value.tzinfo is None: # Assume naive datetime is UTC return value.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" - else: - return value.astimezone().isoformat() + utc_value = value.astimezone(timezone.utc) + return utc_value.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" if isinstance(value, date): return value.isoformat() diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 41411b916d..fa235f4930 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -6,6 +6,7 @@ """High-level telemetry logger facade for easy usage.""" import logging +import uuid from typing import Any, Optional from opentelemetry._logs import set_logger_provider @@ -15,6 +16,20 @@ from olive.telemetry.library.exporter import OneCollectorLogExporter from olive.telemetry.library.options import OneCollectorExporterOptions +from olive.version import __version__ as VERSION + + +def _get_service_name() -> str: + """Derive service name from the root package name. 
+ + Returns: + The capitalized name of the root package + + """ + # Get the root package name from this module's path + # e.g., olive.telemetry.library.telemetry_logger -> olive + package_name = __name__.split(".")[0] + return package_name.capitalize() class TelemetryLogger: @@ -57,8 +72,9 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: self._logger_provider = LoggerProvider( resource=Resource.create( { - "service.name": "telemetry_logger", - "service.instance.id": "telemetry_logger-instance", + "service.name": _get_service_name(), + "service.version": VERSION, + "service.instance.id": str(uuid.uuid4()), # Unique instance ID; can double as session ID } ) ) @@ -67,7 +83,12 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: set_logger_provider(self._logger_provider) # Add batch processor - self._logger_provider.add_log_record_processor(BatchLogRecordProcessor(self._logger_exporter)) + self._logger_provider.add_log_record_processor( + BatchLogRecordProcessor( + self._logger_exporter, + schedule_delay_millis=1000, + ) + ) # Create logging handler handler = LoggingHandler(level=logging.INFO, logger_provider=self._logger_provider) @@ -86,7 +107,7 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: self._logger_provider = None self._logger_exporter = None - def add_metadata(self, metadata: dict[str, Any]) -> None: + def add_global_metadata(self, metadata: dict[str, Any]) -> None: """Add metadata fields to all telemetry events. 
Args: @@ -123,6 +144,12 @@ def shutdown(self) -> None: if self._logger_provider: self._logger_provider.shutdown() + def force_flush(self, timeout_millis: float = 10_000) -> bool: + """Force flush buffered log records.""" + if self._logger_provider: + return self._logger_provider.force_flush(timeout_millis=timeout_millis) + return False + # Convenience functions for common use cases _default_logger: Optional[TelemetryLogger] = None diff --git a/olive/telemetry/library/transport.py b/olive/telemetry/library/transport.py index 5053b8a744..17f0e92d39 100644 --- a/olive/telemetry/library/transport.py +++ b/olive/telemetry/library/transport.py @@ -184,7 +184,10 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple self.callback_manager.notify( PayloadTransmittedCallbackArgs( - succeeded=False, status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count + succeeded=False, + status_code=None, + payload_size_bytes=payload_size_bytes, + item_count=item_count, ) ) @@ -197,7 +200,11 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple self.callback_manager.notify( PayloadTransmittedCallbackArgs( - succeeded=False, status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count + succeeded=False, + status_code=None, + payload_size_bytes=payload_size_bytes, + item_count=item_count, + error_message=str(ex), ) ) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 526114935f..4dc3222588 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -5,21 +5,281 @@ """Thin wrapper around the OneCollector telemetry logger with event helpers.""" import base64 +import json +import os +import pickle import platform -import uuid +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path from typing import Any, Optional from olive.telemetry.constants import CONNECTION_STRING +from 
olive.telemetry.deviceid._store import get_device_id_store_path +from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger as _LibraryTelemetryLogger from olive.telemetry.library.telemetry_logger import get_telemetry_logger from olive.telemetry.utils import _generate_encrypted_device_id -from olive.version import __version__ as VERSION # Default event names used by the high-level telemetry helpers. HEARTBEAT_EVENT_NAME = "OliveHeartbeat" ACTION_EVENT_NAME = "OliveAction" ERROR_EVENT_NAME = "OliveError" +ALLOWED_KEYS = { + HEARTBEAT_EVENT_NAME: [ + "device_id", + "os.name", + "os.version", + "os.release", + "os.arch", + "app_version", + "app_instance_id", + ], + ACTION_EVENT_NAME: [ + "invoked_from", + "action_name", + "duration_ms", + "success", + "app_version", + "app_instance_id", + ], + ERROR_EVENT_NAME: [ + "exception_type", + "exception_message", + "app_version", + "app_instance_id", + ], +} + +CRITICAL_EVENTS = {HEARTBEAT_EVENT_NAME} +MAX_CACHE_SIZE_BYTES = 5 * 1024 * 1024 +HARD_MAX_CACHE_SIZE_BYTES = 10 * 1024 * 1024 +CACHE_FILE_NAME = "olive.pkl" + + +class TelemetryCacheHandler: + def __init__(self, telemetry: "Telemetry") -> None: + self._telemetry = telemetry + self._cache_lock = threading.Lock() + self._cache_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="olive_telemetry_cache") + self._flush_in_progress = False + self._transport = None + + # Track replayed events that haven't been sent yet + self._pending_replay_events = [] # List of events being replayed + self._pending_replay_lock = threading.Lock() + self._replay_in_progress = False + self._replay_writeback_scheduled = False + self._replay_new_writes = False + + def setup_payload_callbacks(self) -> None: + logger = self._telemetry._logger + if not logger: + return + + exporter = getattr(logger, "_logger_exporter", None) + if not exporter: + return + + transport = getattr(exporter, "_transport", None) + if not 
transport: + return + if getattr(transport, "_cache_wrap_installed", False): + return + + original_send = transport.send + + def wrapped_send(payload: bytes, timeout_sec: float, item_count: int = 1): + transport._last_payload = payload + return original_send(payload, timeout_sec, item_count=item_count) + + transport.send = wrapped_send + transport._cache_wrap_installed = True + self._transport = transport + + exporter.register_payload_transmitted_callback(self._on_payload_transmitted, include_failures=True) + + def shutdown(self) -> None: + if self._cache_executor: + # Wait for pending cache tasks to complete before shutting down + # Note: We intentionally do NOT flush the cache here. The cache persists across + # sessions for offline resilience. If network is working, the success callback + # already flushed. If network is down, flushing would fail anyway. + self._cache_executor.shutdown(wait=True) + self._cache_executor = None + + def _on_payload_transmitted(self, args) -> None: + try: + if args.succeeded: + # Telemetry succeeded - mark any pending replayed events as sent + with self._pending_replay_lock: + if self._pending_replay_events: + # Remove events from pending list (they were successfully sent) + sent_count = min(len(self._pending_replay_events), args.item_count) + self._pending_replay_events = self._pending_replay_events[sent_count:] + + # If no more pending replayed events, signal completion + if not self._pending_replay_events and self._replay_in_progress: + self._replay_in_progress = False + if not self._replay_new_writes and not self._replay_writeback_scheduled: + cache_path = self._get_cache_path() + if cache_path: + cache_path.unlink(missing_ok=True) + + # Also flush any previously cached failures + self._schedule_cache_task(self._flush_cache) + else: + # Telemetry failed - cache this payload for later replay + with self._pending_replay_lock: + has_pending_replay = bool(self._pending_replay_events) + should_writeback = self._replay_in_progress and 
has_pending_replay + if should_writeback and not self._replay_writeback_scheduled: + self._replay_writeback_scheduled = True + self._schedule_cache_task(self._write_entries_to_cache, self._pending_replay_events.copy()) + payload = getattr(self._transport, "_last_payload", None) + if payload: + self._schedule_cache_task(self._write_payload_to_cache, payload) + except Exception: + return + + def _schedule_cache_task(self, func, *args) -> None: + try: + if self._cache_executor: + self._cache_executor.submit(func, *args) + else: + # If executor is not available (e.g., during shutdown), execute synchronously + func(*args) + except Exception: + return + + def _get_telemetry_support_dir(self) -> Optional[Path]: + os_name = platform.system() + if os_name == "Windows": + base_dir = os.environ.get("LOCALAPPDATA") or os.environ.get("APPDATA") + if not base_dir: + base_dir = str(Path.home() / "AppData" / "Local") + return Path(base_dir) / "Microsoft" / ".onnxruntime" + + return get_device_id_store_path() + + def _get_cache_path(self) -> Optional[Path]: + support_dir = self._get_telemetry_support_dir() + if not support_dir: + return None + return support_dir / "cache" / CACHE_FILE_NAME + + def _write_payload_to_cache(self, payload: bytes) -> None: + try: + cache_path = self._get_cache_path() + if cache_path is None: + return + + with self._pending_replay_lock: + if self._replay_in_progress: + self._replay_new_writes = True + + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_size = cache_path.stat().st_size if cache_path.exists() else 0 + + if cache_size >= HARD_MAX_CACHE_SIZE_BYTES: + return + + entries = _parse_payload(payload) + if not entries: + return + + if cache_size >= MAX_CACHE_SIZE_BYTES: + entries = [entry for entry in entries if entry.get("event_name") in CRITICAL_EVENTS] + if not entries: + return + + with self._cache_lock, cache_path.open("ab") as cache_file: + for entry in entries: + pickle.dump(entry, cache_file, protocol=pickle.HIGHEST_PROTOCOL) + 
except Exception: + return + + def _write_entries_to_cache(self, entries: list[dict[str, Any]]) -> None: + try: + cache_path = self._get_cache_path() + if cache_path is None: + return + + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_size = cache_path.stat().st_size if cache_path.exists() else 0 + + if cache_size >= HARD_MAX_CACHE_SIZE_BYTES: + return + + if cache_size >= MAX_CACHE_SIZE_BYTES: + entries = [entry for entry in entries if entry.get("event_name") in CRITICAL_EVENTS] + if not entries: + return + + with self._cache_lock, cache_path.open("ab") as cache_file: + for entry in entries: + pickle.dump(entry, cache_file, protocol=pickle.HIGHEST_PROTOCOL) + except Exception: + return + + def _flush_cache(self) -> None: + with self._cache_lock: + if self._flush_in_progress: + return + self._flush_in_progress = True + + try: + cache_path = self._get_cache_path() + if cache_path is None: + return + if not cache_path.exists(): + return + + entries = _read_cache_entries(cache_path) + + if not entries: + cache_path.unlink(missing_ok=True) + return + + cache_path.unlink(missing_ok=True) + + # Mark these entries as pending replay + with self._pending_replay_lock: + self._pending_replay_events = entries.copy() + self._replay_in_progress = True + self._replay_writeback_scheduled = False + self._replay_new_writes = False + + for entry in entries: + try: + event_name = entry.get("event_name") + event_data = entry.get("event_data") + if not event_name or not event_data: + continue + attributes = json.loads(event_data) + if not isinstance(attributes, dict): + continue + attributes["initTs"] = entry.get("ts") + metadata = {} + for key in ("app_name", "app_version", "app_instance_id"): + if key in attributes: + metadata[key] = attributes.pop(key) + self._telemetry._log(event_name, attributes, metadata or None) + except Exception: + # Remove failed entry from pending list + with self._pending_replay_lock: + if entry in self._pending_replay_events: + 
self._pending_replay_events.remove(entry) + continue + + self._telemetry.force_flush(timeout_millis=5_000) + except Exception: + return + finally: + self._flush_in_progress = False + class Telemetry: """Wrapper that wires environment configuration into the library logger.""" @@ -36,8 +296,11 @@ def __init__(self): if self._initialized: return self._logger = self._create_logger() - self._session_id = str(uuid.uuid4()) + self._cache_handler = TelemetryCacheHandler(self) self._initialized = True + self._setup_payload_callbacks() + self._log_heartbeat() + event_source.disable() def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: try: @@ -45,22 +308,58 @@ def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: except Exception: return None - def add_metadata(self, metadata: dict[str, Any]) -> None: + def _setup_payload_callbacks(self) -> None: + if not self._logger: + return + self._cache_handler.setup_payload_callbacks() + + def add_global_metadata(self, metadata: dict[str, Any]) -> None: if self._logger: - self._logger.add_metadata(metadata) + self._logger.add_global_metadata(metadata) - def log(self, event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: + def _log( + self, event_name: str, attributes: Optional[dict[str, Any]] = None, metadata: Optional[dict[str, Any]] = None + ) -> None: if self._logger: - # Always include session_id in every event - attrs = dict(attributes or {}) - attrs["session_id"] = self._session_id + attrs = _merge_metadata(attributes, metadata) self._logger.log(event_name, attrs) + def _log_heartbeat( + self, + metadata: Optional[dict[str, Any]] = None, + ) -> None: + """Log a heartbeat event with system information. + + Args: + metadata: Optional additional metadata to include. 
+ + """ + attributes = { + "device_id": _generate_encrypted_device_id(), + "os": { + "name": platform.system().lower(), + "version": platform.version(), + "release": platform.release(), + "arch": platform.machine(), + }, + } + self._log(HEARTBEAT_EVENT_NAME, attributes, metadata) + def disable_telemetry(self) -> None: if self._logger: self._logger.disable_telemetry() + def force_flush(self, timeout_millis: float = 10_000) -> bool: + if self._logger and hasattr(self._logger, "force_flush"): + return self._logger.force_flush(timeout_millis=timeout_millis) + return False + def shutdown(self) -> None: + # Shutdown cache handler FIRST to ensure pending cache tasks complete + # The cache handler will wait for replayed events to be sent before returning + if self._cache_handler: + self._cache_handler.shutdown() + if self._logger: self._logger.shutdown() @@ -70,81 +369,89 @@ def _get_logger() -> Telemetry: return Telemetry() -def _merge_metadata(attributes: dict[str, Any], metadata: Optional[dict[str, Any]]) -> dict[str, Any]: +def _merge_metadata(attributes: Optional[dict[str, Any]], metadata: Optional[dict[str, Any]]) -> dict[str, Any]: + merged = dict(attributes or {}) if metadata: - return {**attributes, **metadata} - return attributes - - -def log_heartbeat( - metadata: Optional[dict[str, Any]] = None, -) -> None: - """Log a heartbeat event with system information. - - Args: - metadata: Optional additional metadata to include. - - """ - logger = _get_logger() - attributes = { - "device_id": _generate_encrypted_device_id(), - "os": { - "name": platform.system().lower(), - "version": platform.version(), - "release": platform.release(), - "arch": platform.machine(), - }, - "version": VERSION, - } - logger.log(HEARTBEAT_EVENT_NAME, _merge_metadata(attributes, metadata)) - - -def log_action( - invoked_from: str, - action_name: str, - duration_ms: float, - success: bool, - metadata: Optional[dict[str, Any]] = None, -) -> None: - """Log an action event. 
- - Args: - invoked_from: Where the action was invoked from. - action_name: Name of the action. - duration_ms: Duration in milliseconds. - success: Whether the action succeeded. - metadata: Optional additional metadata to include. - - """ - logger = _get_logger() - attributes = { - "invoked_from": invoked_from, - "action_name": action_name, - "duration_ms": duration_ms, - "success": success, - } - logger.log(ACTION_EVENT_NAME, _merge_metadata(attributes, metadata)) - - -def log_error( - invoked_from: str, - exception_type: str, - exception_message: str, - metadata: Optional[dict[str, Any]] = None, -) -> None: - """Log an error event. - - Args: - invoked_from: Where the error occurred. - exception_type: Type of the exception. - exception_message: Exception message. - metadata: Optional additional metadata to include. - - """ - logger = _get_logger() - attributes = { - "invoked_from": invoked_from, - "exception_type": exception_type, - "exception_message": exception_message, - } - logger.log(ERROR_EVENT_NAME, _merge_metadata(attributes, metadata)) + merged.update(metadata) + return merged + + +def _parse_payload(payload: bytes) -> list[dict[str, Any]]: + entries = [] + try: + payload_text = payload.decode("utf-8") + lines = payload_text.splitlines() + + for raw_line in lines: + line = raw_line.strip() + if not line: + continue + try: + event = json.loads(line) + event_name = event.get("name") + if not event_name: + continue + filtered_data = _filter_event_data(event_name, event.get("data") or {}) + if not filtered_data: + continue + entries.append( + { + "ts": event.get("time") or time.time(), + "event_name": event_name, + "event_data": json.dumps(filtered_data, ensure_ascii=False, separators=(",", ":")), + } + ) + except Exception: + continue + except Exception: + return [] + + return entries + + +def _filter_event_data(event_name: str, data: dict[str, Any]) -> Optional[dict[str, Any]]: + allowed_keys = ALLOWED_KEYS.get(event_name) + if not allowed_keys: + return 
None + + filtered: dict[str, Any] = {} + for key in allowed_keys: + value = _get_nested_value(data, key) + if value is None: + continue + _set_nested_value(filtered, key, value) + return filtered or None + + +def _get_nested_value(data: dict[str, Any], key: str) -> Any: + current = data + for part in key.split("."): + if not isinstance(current, dict) or part not in current: + return None + current = current[part] + return current + + +def _set_nested_value(data: dict[str, Any], key: str, value: Any) -> None: + current = data + parts = key.split(".") + for part in parts[:-1]: + current = current.setdefault(part, {}) + current[parts[-1]] = value + + +def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: + entries = [] + try: + with cache_path.open("rb") as cache_file: + while True: + try: + entry = pickle.load(cache_file) + entries.append(entry) + except EOFError: + break + except Exception: + continue + except Exception: + return [] + return entries diff --git a/olive/telemetry/telemetry_decorators.py b/olive/telemetry/telemetry_extensions.py similarity index 79% rename from olive/telemetry/telemetry_decorators.py rename to olive/telemetry/telemetry_extensions.py index 2f0381ca8a..7eaaaf34a8 100644 --- a/olive/telemetry/telemetry_decorators.py +++ b/olive/telemetry/telemetry_extensions.py @@ -9,12 +9,42 @@ from types import TracebackType from typing import Any, Callable, Optional, TypeVar -from olive.telemetry.telemetry import log_action, log_error +from olive.telemetry.telemetry import ACTION_EVENT_NAME, ERROR_EVENT_NAME, _get_logger from olive.telemetry.utils import _format_exception_msg _TFunc = TypeVar("_TFunc", bound=Callable[..., Any]) +def log_action( + invoked_from: str, + action_name: str, + duration_ms: float, + success: bool, + metadata: Optional[dict[str, Any]] = None, +) -> None: + telemetry = _get_logger() + attributes = { + "invoked_from": invoked_from, + "action_name": action_name, + "duration_ms": duration_ms, + "success": success, + } 
+ telemetry._log(ACTION_EVENT_NAME, attributes, metadata) + + +def log_error( + exception_type: str, + exception_message: str, + metadata: Optional[dict[str, Any]] = None, +) -> None: + telemetry = _get_logger() + attributes = { + "exception_type": exception_type, + "exception_message": exception_message, + } + telemetry._log(ERROR_EVENT_NAME, attributes, metadata) + + def _resolve_invoked_from(skip_frames: int = 0) -> str: """Resolve how Olive was invoked by examining the call stack. @@ -79,7 +109,6 @@ def __exit__( if exc_type is not None and exc_val is not None: log_error( - invoked_from=self.invoked_from, exception_type=exc_type.__name__, exception_message=_format_exception_msg(exc_val, exc_tb), metadata=self.metadata, @@ -98,10 +127,9 @@ def wrapper(*args: Any, **kwargs: Any): action_name = func.__name__ if args and hasattr(args[0], "__class__"): cls_name = args[0].__class__.__name__ + cls_name = cls_name[: -len("Command")] if cls_name.endswith("Command") else cls_name if cls_name: - action_name = f"{cls_name}.{action_name}" - if action_name.endswith(f"Command.{func.__name__}"): - action_name = f"{cls_name[: -len('Command')]}.{func.__name__}" + action_name = cls_name if action_name == "run" else f"{cls_name}.{action_name}" start_time = time.perf_counter() success = True @@ -110,7 +138,6 @@ def wrapper(*args: Any, **kwargs: Any): except Exception as exc: success = False log_error( - invoked_from=invoked_from, exception_type=type(exc).__name__, exception_message=_format_exception_msg(exc, exc.__traceback__), ) From 30038f5b959d24da6ccb152a77492074d6d47431 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 09:31:39 -0600 Subject: [PATCH 19/31] Lint --- olive/telemetry/deviceid/_store.py | 2 ++ olive/telemetry/library/telemetry_logger.py | 6 +++--- olive/telemetry/telemetry.py | 14 +++++--------- olive/telemetry/telemetry_extensions.py | 4 ++-- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/olive/telemetry/deviceid/_store.py 
b/olive/telemetry/deviceid/_store.py index aeddd918f4..7c8cb7dc52 100644 --- a/olive/telemetry/deviceid/_store.py +++ b/olive/telemetry/deviceid/_store.py @@ -49,6 +49,8 @@ def store_id(self, device_id: str) -> None: try: self._file_path.parent.mkdir(parents=True) except FileExistsError: + # this is unexpected, but not an issue, since we want this file + # path to exist pass self._file_path.touch() diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index fa235f4930..1ca6a5d953 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -23,13 +23,13 @@ def _get_service_name() -> str: """Derive service name from the root package name. Returns: - The capitalized name of the root package + The name of the root package """ # Get the root package name from this module's path # e.g., olive.telemetry.library.telemetry_logger -> olive - package_name = __name__.split(".")[0] - return package_name.capitalize() + package_name = __name__.split(".", maxsplit=1)[0] + return package_name class TelemetryLogger: diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 4dc3222588..bd5744829a 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -142,7 +142,7 @@ def _on_payload_transmitted(self, args) -> None: if payload: self._schedule_cache_task(self._write_payload_to_cache, payload) except Exception: - return + pass def _schedule_cache_task(self, func, *args) -> None: try: @@ -152,7 +152,7 @@ def _schedule_cache_task(self, func, *args) -> None: # If executor is not available (e.g., during shutdown), execute synchronously func(*args) except Exception: - return + pass def _get_telemetry_support_dir(self) -> Optional[Path]: os_name = platform.system() @@ -262,11 +262,7 @@ def _flush_cache(self) -> None: if not isinstance(attributes, dict): continue attributes["initTs"] = entry.get("ts") - metadata = {} - for key in ("app_name", 
"app_version", "app_instance_id"): - if key in attributes: - metadata[key] = attributes.pop(key) - self._telemetry._log(event_name, attributes, metadata or None) + self._telemetry.log(event_name, attributes, None) except Exception: # Remove failed entry from pending list with self._pending_replay_lock: @@ -317,7 +313,7 @@ def add_global_metadata(self, metadata: dict[str, Any]) -> None: if self._logger: self._logger.add_global_metadata(metadata) - def _log( + def log( self, event_name: str, attributes: Optional[dict[str, Any]] = None, metadata: Optional[dict[str, Any]] = None ) -> None: if self._logger: @@ -343,7 +339,7 @@ def _log_heartbeat( "arch": platform.machine(), }, } - self._log(HEARTBEAT_EVENT_NAME, attributes, metadata) + self.log(HEARTBEAT_EVENT_NAME, attributes, metadata) def disable_telemetry(self) -> None: if self._logger: diff --git a/olive/telemetry/telemetry_extensions.py b/olive/telemetry/telemetry_extensions.py index 7eaaaf34a8..c0998077f1 100644 --- a/olive/telemetry/telemetry_extensions.py +++ b/olive/telemetry/telemetry_extensions.py @@ -29,7 +29,7 @@ def log_action( "duration_ms": duration_ms, "success": success, } - telemetry._log(ACTION_EVENT_NAME, attributes, metadata) + telemetry.log(ACTION_EVENT_NAME, attributes, metadata) def log_error( @@ -42,7 +42,7 @@ def log_error( "exception_type": exception_type, "exception_message": exception_message, } - telemetry._log(ERROR_EVENT_NAME, attributes, metadata) + telemetry.log(ERROR_EVENT_NAME, attributes, metadata) def _resolve_invoked_from(skip_frames: int = 0) -> str: From 38ffd31ea2b94da3bc6c42021543b2c7d6b59f08 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 10:39:38 -0600 Subject: [PATCH 20/31] Lint --- olive/telemetry/deviceid/_store.py | 28 ++----- olive/telemetry/deviceid/deviceid.py | 82 +++++++++++++++++---- olive/telemetry/library/callback_manager.py | 3 +- olive/telemetry/library/exporter.py | 26 ------- olive/telemetry/library/telemetry_logger.py | 57 
+++++++------- olive/telemetry/library/transport.py | 1 - olive/telemetry/telemetry.py | 22 +++--- olive/telemetry/utils.py | 33 ++++++--- 8 files changed, 134 insertions(+), 118 deletions(-) diff --git a/olive/telemetry/deviceid/_store.py b/olive/telemetry/deviceid/_store.py index 7c8cb7dc52..c66087b699 100644 --- a/olive/telemetry/deviceid/_store.py +++ b/olive/telemetry/deviceid/_store.py @@ -1,10 +1,9 @@ -import os -import platform from pathlib import Path +from olive.telemetry.utils import get_telemetry_base_dir + REGISTRY_PATH = r"SOFTWARE\Microsoft\DeveloperTools\.onnxruntime" REGISTRY_KEY = "deviceid" -DEVICEID_LOCATION = r"Microsoft/DeveloperTools/deviceid/.onnxruntime/" class Store: @@ -12,20 +11,7 @@ def __init__(self) -> None: self._file_path: Path = self._build_path() def _build_path(self) -> Path: - os_name = platform.system() - if os_name == "Darwin": - home = os.getenv("HOME") - if home is None: - raise ValueError("HOME environment variable not set") - - return Path(f"{home}/Library/Application Support/{DEVICEID_LOCATION}") - - home = os.getenv("XDG_CACHE_HOME", f"{os.getenv('HOME')}/.cache") - - if not home: - raise ValueError("HOME environment variable not set") - - return Path(home).joinpath(DEVICEID_LOCATION) + return get_telemetry_base_dir() def retrieve_id(self) -> str: """Retrieve the device id from the store location. @@ -49,8 +35,8 @@ def store_id(self, device_id: str) -> None: try: self._file_path.parent.mkdir(parents=True) except FileExistsError: - # this is unexpected, but not an issue, since we want this file - # path to exist + # This is unexpected, but is not an issue, + # since we want this file path to exist. 
pass self._file_path.touch() @@ -84,7 +70,3 @@ def store_id(self, device_id: str) -> None: access=winreg.KEY_ALL_ACCESS | winreg.KEY_WOW64_64KEY, ) as key_handle: winreg.SetValueEx(key_handle, REGISTRY_KEY, 0, winreg.REG_SZ, device_id) - - -def get_device_id_store_path() -> Path: - return Store()._build_path() diff --git a/olive/telemetry/deviceid/deviceid.py b/olive/telemetry/deviceid/deviceid.py index 53ed791f4d..9745baaec8 100644 --- a/olive/telemetry/deviceid/deviceid.py +++ b/olive/telemetry/deviceid/deviceid.py @@ -1,12 +1,24 @@ +import hashlib import logging import platform import uuid +from enum import Enum from typing import Union from olive.telemetry.deviceid._store import Store, WindowsStore -def get_device_id(*, full_trace: bool = False) -> str: +class DeviceIdStatus(Enum): + NEW = "new" + EXISTING = "existing" + CORRUPTED = "corrupted" + FAILED = "failed" + + +_device_id_state = {"device_id": None, "status": DeviceIdStatus.NEW} + + +def get_device_id() -> str: r"""Get the device id from the store or create one if it does not exist. An empty string is returned if an error occurs during saving or retrieval of the device id. @@ -15,13 +27,13 @@ def get_device_id(*, full_trace: bool = False) -> str: MacOS id location: $HOME/Library/Application Support/Microsoft/DeveloperTools/deviceid Windows id location: HKEY_CURRENT_USER\SOFTWARE\Microsoft\DeveloperTools\deviceid - :keyword full_trace: If True, the full stack trace is logged. Default is False. :return: The device id. 
:rtype: str """ logger = logging.getLogger(__name__) device_id: str = "" store: Union[Store, WindowsStore] + create_new_id = False try: if platform.system() == "Windows": @@ -29,23 +41,65 @@ def get_device_id(*, full_trace: bool = False) -> str: elif platform.system() in ("Linux", "Darwin"): store = Store() else: + _device_id_state["status"] = DeviceIdStatus.FAILED + _device_id_state["device_id"] = device_id return device_id - return store.retrieve_id() + + device_id = store.retrieve_id().strip() + if len(device_id) > 256: + _device_id_state["status"] = DeviceIdStatus.CORRUPTED + _device_id_state["device_id"] = "" + create_new_id = True + else: + try: + uuid.UUID(device_id) + except ValueError: + _device_id_state["status"] = DeviceIdStatus.CORRUPTED + _device_id_state["device_id"] = "" + create_new_id = True + else: + _device_id_state["status"] = DeviceIdStatus.EXISTING + _device_id_state["device_id"] = device_id + return device_id + except (FileExistsError, FileNotFoundError): + _device_id_state["status"] = DeviceIdStatus.NEW + _device_id_state["device_id"] = "" + create_new_id = True except (PermissionError, ValueError, NotImplementedError): - if full_trace: - logger.exception("Failed to retrieve stored device id.") + _device_id_state["status"] = DeviceIdStatus.FAILED + _device_id_state["device_id"] = device_id return device_id except Exception: - if full_trace: - logger.exception("Failed to retrieve stored device id.") + _device_id_state["status"] = DeviceIdStatus.FAILED + _device_id_state["device_id"] = device_id + return device_id - device_id = str(uuid.uuid4()).lower() + if create_new_id: + device_id = str(uuid.uuid4()).lower() - try: - store.store_id(device_id) - except Exception: - if full_trace: - logger.exception("Failed to store device id.") - device_id = "" + try: + store.store_id(device_id) + except Exception: + _device_id_state["status"] = DeviceIdStatus.FAILED + device_id = "" + _device_id_state["device_id"] = device_id return device_id + + +def 
get_encrypted_device_id_and_status() -> tuple[str, DeviceIdStatus]: + """Generate a FIPS-compliant encrypted device ID using SHA256 and returns the deviceIdStatus. + + This method uses SHA256 which is FIPS 140-2 approved for cryptographic operations. + The device ID is hashed to ensure deterministic but secure device identification. + + Returns: + str: FIPS-compliant encrypted device ID (base64-encoded) + + """ + device_id = ( + get_device_id() if _device_id_state["device_id"] is not None else _device_id_state["device_id"] + ) + encrypted_device_id = hashlib.sha256(device_id.encode("utf-8")).digest().hex().upper() if device_id else "" + return encrypted_device_id, _device_id_state["status"] + diff --git a/olive/telemetry/library/callback_manager.py b/olive/telemetry/library/callback_manager.py index 5936fd0196..2c202ec7e4 100644 --- a/olive/telemetry/library/callback_manager.py +++ b/olive/telemetry/library/callback_manager.py @@ -67,7 +67,8 @@ def unregister(): try: self._callbacks.remove(entry) except ValueError: - pass # Already removed + # The callback was already removed. + pass return unregister diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index 05a6775466..0e6fc1d579 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -39,23 +39,13 @@ def __init__( self, options: Optional[OneCollectorExporterOptions] = None, excluded_attributes: Optional[set[str]] = None, - **kwargs, ): """Initialize the OneCollector log exporter. 
Args: options: Exporter configuration options excluded_attributes: Attribute keys to exclude from log attributes - **kwargs: Legacy keyword arguments for backward compatibility - - connection_string: OneCollector connection string - - headers: Additional HTTP headers - - timeout: Request timeout in seconds - - compression: Compression type - """ - # Handle legacy initialization - if options is None: - options = self._create_options_from_kwargs(kwargs) # Validate options options.validate() @@ -114,22 +104,6 @@ def __init__( # Cache for resource (populated on first export) self._resource: Optional[Resource] = None - def _create_options_from_kwargs(self, kwargs: dict) -> OneCollectorExporterOptions: - """Create options from legacy keyword arguments.""" - from olive.telemetry.library.options import ( - CompressionType, - OneCollectorExporterOptions, - OneCollectorTransportOptions, - ) - - connection_string = kwargs.get("connection_string") - timeout = kwargs.get("timeout", 10.0) - compression = kwargs.get("compression", CompressionType.DEFLATE) - - transport_options = OneCollectorTransportOptions(timeout_seconds=timeout, compression=compression) - - return OneCollectorExporterOptions(connection_string=connection_string, transport_options=transport_options) - def add_metadata(self, metadata: dict[str, Any]) -> None: """Add custom metadata fields to all exported logs. diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 1ca6a5d953..31f2618a88 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -19,19 +19,6 @@ from olive.version import __version__ as VERSION -def _get_service_name() -> str: - """Derive service name from the root package name. 
- - Returns: - The name of the root package - - """ - # Get the root package name from this module's path - # e.g., olive.telemetry.library.telemetry_logger -> olive - package_name = __name__.split(".", maxsplit=1)[0] - return package_name - - class TelemetryLogger: """Singleton telemetry logger for simplified OneCollector integration. @@ -40,6 +27,7 @@ class TelemetryLogger: """ _instance: Optional["TelemetryLogger"] = None + _default_logger: Optional["TelemetryLogger"] = None _logger: Optional[logging.Logger] = None _logger_exporter: Optional[OneCollectorLogExporter] = None _logger_provider: Optional[LoggerProvider] = None @@ -72,7 +60,7 @@ def _initialize(self, options: Optional[OneCollectorExporterOptions]) -> None: self._logger_provider = LoggerProvider( resource=Resource.create( { - "service.name": _get_service_name(), + "service.name": __name__.split(".", maxsplit=1)[0], "service.version": VERSION, "service.instance.id": str(uuid.uuid4()), # Unique instance ID; can double as session ID } @@ -150,9 +138,31 @@ def force_flush(self, timeout_millis: float = 10_000) -> bool: return self._logger_provider.force_flush(timeout_millis=timeout_millis) return False + @classmethod + def get_default_logger(cls, connection_string: Optional[str] = None) -> "TelemetryLogger": + """Get or create the default telemetry logger. 
+ + Args: + connection_string: OneCollector connection string (only used on first call) + + Returns: + TelemetryLogger instance + + """ + if cls._default_logger is None: + options = None + if connection_string: + options = OneCollectorExporterOptions(connection_string=connection_string) + cls._default_logger = cls(options=options) -# Convenience functions for common use cases -_default_logger: Optional[TelemetryLogger] = None + return cls._default_logger + + @classmethod + def shutdown_default_logger(cls) -> None: + """Shutdown the default telemetry logger.""" + if cls._default_logger: + cls._default_logger.shutdown() + cls._default_logger = None def get_telemetry_logger(connection_string: Optional[str] = None) -> TelemetryLogger: @@ -165,15 +175,7 @@ def get_telemetry_logger(connection_string: Optional[str] = None) -> TelemetryLo TelemetryLogger instance """ - global _default_logger - - if _default_logger is None: - options = None - if connection_string: - options = OneCollectorExporterOptions(connection_string=connection_string) - _default_logger = TelemetryLogger(options=options) - - return _default_logger + return TelemetryLogger.get_default_logger(connection_string=connection_string) def log_event(event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: @@ -190,7 +192,4 @@ def log_event(event_name: str, attributes: Optional[dict[str, Any]] = None) -> N def shutdown_telemetry() -> None: """Shutdown the default telemetry logger.""" - global _default_logger - if _default_logger: - _default_logger.shutdown() - _default_logger = None + TelemetryLogger.shutdown_default_logger() diff --git a/olive/telemetry/library/transport.py b/olive/telemetry/library/transport.py index 17f0e92d39..f218e61eb1 100644 --- a/olive/telemetry/library/transport.py +++ b/olive/telemetry/library/transport.py @@ -204,7 +204,6 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple status_code=None, payload_size_bytes=payload_size_bytes, 
item_count=item_count, - error_message=str(ex), ) ) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index bd5744829a..1d7e4a5089 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -6,7 +6,6 @@ import base64 import json -import os import pickle import platform import threading @@ -16,11 +15,11 @@ from typing import Any, Optional from olive.telemetry.constants import CONNECTION_STRING -from olive.telemetry.deviceid._store import get_device_id_store_path from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger as _LibraryTelemetryLogger from olive.telemetry.library.telemetry_logger import get_telemetry_logger -from olive.telemetry.utils import _generate_encrypted_device_id +from olive.telemetry.deviceid import get_encrypted_device_id_and_status +from olive.telemetry.utils import get_telemetry_base_dir # Default event names used by the high-level telemetry helpers. HEARTBEAT_EVENT_NAME = "OliveHeartbeat" @@ -30,6 +29,7 @@ ALLOWED_KEYS = { HEARTBEAT_EVENT_NAME: [ "device_id", + "id_status", "os.name", "os.version", "os.release", @@ -142,6 +142,7 @@ def _on_payload_transmitted(self, args) -> None: if payload: self._schedule_cache_task(self._write_payload_to_cache, payload) except Exception: + # Fail silently. pass def _schedule_cache_task(self, func, *args) -> None: @@ -152,17 +153,11 @@ def _schedule_cache_task(self, func, *args) -> None: # If executor is not available (e.g., during shutdown), execute synchronously func(*args) except Exception: + # Fail silently. 
pass def _get_telemetry_support_dir(self) -> Optional[Path]: - os_name = platform.system() - if os_name == "Windows": - base_dir = os.environ.get("LOCALAPPDATA") or os.environ.get("APPDATA") - if not base_dir: - base_dir = str(Path.home() / "AppData" / "Local") - return Path(base_dir) / "Microsoft" / ".onnxruntime" - - return get_device_id_store_path() + return get_telemetry_base_dir() def _get_cache_path(self) -> Optional[Path]: support_dir = self._get_telemetry_support_dir() @@ -281,6 +276,7 @@ class Telemetry: """Wrapper that wires environment configuration into the library logger.""" _instance: Optional["Telemetry"] = None + _initialized: bool = False def __new__(cls): if cls._instance is None: @@ -330,8 +326,10 @@ def _log_heartbeat( metadata: Optional additional metadata to include. """ + encrypted_device_id, device_id_status = get_encrypted_device_id_and_status() attributes = { - "device_id": _generate_encrypted_device_id(), + "device_id": encrypted_device_id, + "id_status": device_id_status.value, "os": { "name": platform.system().lower(), "version": platform.version(), diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 4eba04e0d4..86c0bbc296 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -2,29 +2,38 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -import hashlib +import os +import platform import traceback +from pathlib import Path from types import TracebackType from typing import Optional -from olive.telemetry.deviceid import get_device_id +DEVICEID_LOCATION = r"Microsoft/DeveloperTools/deviceid/.onnxruntime/" -def _generate_encrypted_device_id() -> str: - """Generate a FIPS-compliant encrypted device ID using SHA256. 
+def get_telemetry_base_dir() -> Path: + os_name = platform.system() + if os_name == "Windows": + base_dir = os.environ.get("LOCALAPPDATA") or os.environ.get("APPDATA") + if not base_dir: + base_dir = str(Path.home() / "AppData" / "Local") + return Path(base_dir) / "Microsoft" / ".onnxruntime" - This method uses SHA256 which is FIPS 140-2 approved for cryptographic operations. - The device ID is hashed to ensure deterministic but secure device identification. + if os_name == "Darwin": + home = os.getenv("HOME") + if home is None: + raise ValueError("HOME environment variable not set") + return Path(home) / "Library" / "Application Support" / DEVICEID_LOCATION - Returns: - str: FIPS-compliant encrypted device ID (base64-encoded) + home = os.getenv("XDG_CACHE_HOME", f"{os.getenv('HOME')}/.cache") + if not home: + raise ValueError("HOME environment variable not set") - """ - hash_bytes = hashlib.sha256(get_device_id().encode("utf-8")).digest() - return hash_bytes.hex().upper() + return Path(home) / DEVICEID_LOCATION -def _format_exception_msg(ex: BaseException, tb: Optional[TracebackType] = None) -> str: +def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = None) -> str: """Format an exception and trim local paths for readability.""" folder = "Olive" file_line = 'File "' From 0578e7d9bb003e8582f7c350e973e3979b58d917 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 11:47:00 -0600 Subject: [PATCH 21/31] Lint + fixes --- olive/telemetry/deviceid/__init__.py | 4 +- olive/telemetry/deviceid/deviceid.py | 7 +-- olive/telemetry/library/exporter.py | 2 +- olive/telemetry/telemetry.py | 68 +------------------------ olive/telemetry/telemetry_extensions.py | 6 +-- 5 files changed, 8 insertions(+), 79 deletions(-) diff --git a/olive/telemetry/deviceid/__init__.py b/olive/telemetry/deviceid/__init__.py index 24129b0eb0..50698c12d6 100644 --- a/olive/telemetry/deviceid/__init__.py +++ b/olive/telemetry/deviceid/__init__.py @@ -1,3 +1,3 
@@ -from olive.telemetry.deviceid.deviceid import get_device_id +from olive.telemetry.deviceid.deviceid import get_encrypted_device_id_and_status -__all__ = ["get_device_id"] +__all__ = ["get_encrypted_device_id_and_status"] diff --git a/olive/telemetry/deviceid/deviceid.py b/olive/telemetry/deviceid/deviceid.py index 9745baaec8..cf45e9fc47 100644 --- a/olive/telemetry/deviceid/deviceid.py +++ b/olive/telemetry/deviceid/deviceid.py @@ -1,5 +1,4 @@ import hashlib -import logging import platform import uuid from enum import Enum @@ -30,7 +29,6 @@ def get_device_id() -> str: :return: The device id. :rtype: str """ - logger = logging.getLogger(__name__) device_id: str = "" store: Union[Store, WindowsStore] create_new_id = False @@ -97,9 +95,6 @@ def get_encrypted_device_id_and_status() -> tuple[str, DeviceIdStatus]: str: FIPS-compliant encrypted device ID (base64-encoded) """ - device_id = ( - get_device_id() if _device_id_state["device_id"] is not None else _device_id_state["device_id"] - ) + device_id = get_device_id() if _device_id_state["device_id"] is not None else _device_id_state["device_id"] encrypted_device_id = hashlib.sha256(device_id.encode("utf-8")).digest().hex().upper() if device_id else "" return encrypted_device_id, _device_id_state["status"] - diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index 0e6fc1d579..0d11e82321 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -45,8 +45,8 @@ def __init__( Args: options: Exporter configuration options excluded_attributes: Attribute keys to exclude from log attributes - """ + """ # Validate options options.validate() diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 1d7e4a5089..777fc024b0 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -15,10 +15,10 @@ from typing import Any, Optional from olive.telemetry.constants import CONNECTION_STRING +from olive.telemetry.deviceid 
import get_encrypted_device_id_and_status from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger as _LibraryTelemetryLogger from olive.telemetry.library.telemetry_logger import get_telemetry_logger -from olive.telemetry.deviceid import get_encrypted_device_id_and_status from olive.telemetry.utils import get_telemetry_base_dir # Default event names used by the high-level telemetry helpers. @@ -67,13 +67,6 @@ def __init__(self, telemetry: "Telemetry") -> None: self._flush_in_progress = False self._transport = None - # Track replayed events that haven't been sent yet - self._pending_replay_events = [] # List of events being replayed - self._pending_replay_lock = threading.Lock() - self._replay_in_progress = False - self._replay_writeback_scheduled = False - self._replay_new_writes = False - def setup_payload_callbacks(self) -> None: logger = self._telemetry._logger if not logger: @@ -113,31 +106,10 @@ def shutdown(self) -> None: def _on_payload_transmitted(self, args) -> None: try: if args.succeeded: - # Telemetry succeeded - mark any pending replayed events as sent - with self._pending_replay_lock: - if self._pending_replay_events: - # Remove events from pending list (they were successfully sent) - sent_count = min(len(self._pending_replay_events), args.item_count) - self._pending_replay_events = self._pending_replay_events[sent_count:] - - # If no more pending replayed events, signal completion - if not self._pending_replay_events and self._replay_in_progress: - self._replay_in_progress = False - if not self._replay_new_writes and not self._replay_writeback_scheduled: - cache_path = self._get_cache_path() - if cache_path: - cache_path.unlink(missing_ok=True) - # Also flush any previously cached failures self._schedule_cache_task(self._flush_cache) else: # Telemetry failed - cache this payload for later replay - with self._pending_replay_lock: - has_pending_replay = 
bool(self._pending_replay_events) - should_writeback = self._replay_in_progress and has_pending_replay - if should_writeback and not self._replay_writeback_scheduled: - self._replay_writeback_scheduled = True - self._schedule_cache_task(self._write_entries_to_cache, self._pending_replay_events.copy()) payload = getattr(self._transport, "_last_payload", None) if payload: self._schedule_cache_task(self._write_payload_to_cache, payload) @@ -171,10 +143,6 @@ def _write_payload_to_cache(self, payload: bytes) -> None: if cache_path is None: return - with self._pending_replay_lock: - if self._replay_in_progress: - self._replay_new_writes = True - cache_path.parent.mkdir(parents=True, exist_ok=True) cache_size = cache_path.stat().st_size if cache_path.exists() else 0 @@ -196,29 +164,6 @@ def _write_payload_to_cache(self, payload: bytes) -> None: except Exception: return - def _write_entries_to_cache(self, entries: list[dict[str, Any]]) -> None: - try: - cache_path = self._get_cache_path() - if cache_path is None: - return - - cache_path.parent.mkdir(parents=True, exist_ok=True) - cache_size = cache_path.stat().st_size if cache_path.exists() else 0 - - if cache_size >= HARD_MAX_CACHE_SIZE_BYTES: - return - - if cache_size >= MAX_CACHE_SIZE_BYTES: - entries = [entry for entry in entries if entry.get("event_name") in CRITICAL_EVENTS] - if not entries: - return - - with self._cache_lock, cache_path.open("ab") as cache_file: - for entry in entries: - pickle.dump(entry, cache_file, protocol=pickle.HIGHEST_PROTOCOL) - except Exception: - return - def _flush_cache(self) -> None: with self._cache_lock: if self._flush_in_progress: @@ -240,13 +185,6 @@ def _flush_cache(self) -> None: cache_path.unlink(missing_ok=True) - # Mark these entries as pending replay - with self._pending_replay_lock: - self._pending_replay_events = entries.copy() - self._replay_in_progress = True - self._replay_writeback_scheduled = False - self._replay_new_writes = False - for entry in entries: try: 
event_name = entry.get("event_name") @@ -259,10 +197,6 @@ def _flush_cache(self) -> None: attributes["initTs"] = entry.get("ts") self._telemetry.log(event_name, attributes, None) except Exception: - # Remove failed entry from pending list - with self._pending_replay_lock: - if entry in self._pending_replay_events: - self._pending_replay_events.remove(entry) continue self._telemetry.force_flush(timeout_millis=5_000) diff --git a/olive/telemetry/telemetry_extensions.py b/olive/telemetry/telemetry_extensions.py index c0998077f1..e5b13395d0 100644 --- a/olive/telemetry/telemetry_extensions.py +++ b/olive/telemetry/telemetry_extensions.py @@ -10,7 +10,7 @@ from typing import Any, Callable, Optional, TypeVar from olive.telemetry.telemetry import ACTION_EVENT_NAME, ERROR_EVENT_NAME, _get_logger -from olive.telemetry.utils import _format_exception_msg +from olive.telemetry.utils import _format_exception_message _TFunc = TypeVar("_TFunc", bound=Callable[..., Any]) @@ -110,7 +110,7 @@ def __exit__( if exc_type is not None and exc_val is not None: log_error( exception_type=exc_type.__name__, - exception_message=_format_exception_msg(exc_val, exc_tb), + exception_message=_format_exception_message(exc_val, exc_tb), metadata=self.metadata, ) @@ -139,7 +139,7 @@ def wrapper(*args: Any, **kwargs: Any): success = False log_error( exception_type=type(exc).__name__, - exception_message=_format_exception_msg(exc, exc.__traceback__), + exception_message=_format_exception_message(exc, exc.__traceback__), ) raise finally: From 627c5ea93685698b8b2c5fe43a456c55159ee388 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 21:56:31 -0600 Subject: [PATCH 22/31] Lint --- olive/telemetry/library/callback_manager.py | 18 +++++ olive/telemetry/library/event_source.py | 4 + olive/telemetry/library/exporter.py | 3 + olive/telemetry/library/telemetry_logger.py | 10 ++- olive/telemetry/library/transport.py | 11 ++- olive/telemetry/telemetry.py | 81 ++++++++------------- 6 files 
changed, 72 insertions(+), 55 deletions(-) diff --git a/olive/telemetry/library/callback_manager.py b/olive/telemetry/library/callback_manager.py index 2c202ec7e4..7ec11f2b2e 100644 --- a/olive/telemetry/library/callback_manager.py +++ b/olive/telemetry/library/callback_manager.py @@ -31,6 +31,9 @@ class PayloadTransmittedCallbackArgs: item_count: int """Number of items in the payload.""" + payload_bytes: Optional[bytes] = None + """Raw payload bytes (uncompressed), if available.""" + class CallbackManager: """Manages callbacks for payload transmission events. @@ -43,6 +46,7 @@ def __init__(self): """Initialize the callback manager.""" self._callbacks: list[tuple[Callable[[PayloadTransmittedCallbackArgs], None], bool]] = [] self._lock = threading.Lock() + self._disposed = False def register( self, callback: Callable[[PayloadTransmittedCallbackArgs], None], include_failures: bool = False @@ -58,6 +62,12 @@ def register( """ with self._lock: + if self._disposed: + + def unregister(): + return None + + return unregister entry = (callback, include_failures) self._callbacks.append(entry) @@ -81,6 +91,8 @@ def notify(self, args: PayloadTransmittedCallbackArgs) -> None: """ # Get snapshot of callbacks to avoid holding lock during invocation with self._lock: + if self._disposed: + return callbacks_snapshot = self._callbacks.copy() # Invoke callbacks @@ -99,3 +111,9 @@ def clear(self) -> None: """Clear all registered callbacks.""" with self._lock: self._callbacks.clear() + + def dispose(self) -> None: + """Dispose the manager and prevent further registrations.""" + with self._lock: + self._callbacks.clear() + self._disposed = True diff --git a/olive/telemetry/library/event_source.py b/olive/telemetry/library/event_source.py index 198c17398d..361387f2b5 100644 --- a/olive/telemetry/library/event_source.py +++ b/olive/telemetry/library/event_source.py @@ -43,6 +43,10 @@ def is_informational_logging_enabled(self) -> bool: """Check if informational level logging is enabled.""" 
return self.logger.isEnabledFor(logging.INFO) + def is_error_logging_enabled(self) -> bool: + """Check if error level logging is enabled.""" + return self.logger.isEnabledFor(logging.ERROR) + def export_exception_thrown(self, item_type: str, exception: Exception) -> None: """Log an exception thrown during export. diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index 0d11e82321..a1de2efdb3 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -320,3 +320,6 @@ def shutdown(self) -> None: # Close HTTP session if hasattr(self, "_session"): self._session.close() + + if hasattr(self, "_callback_manager"): + self._callback_manager.dispose() diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 31f2618a88..23d3d34476 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -7,7 +7,7 @@ import logging import uuid -from typing import Any, Optional +from typing import Any, Callable, Optional from opentelemetry._logs import set_logger_provider from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler @@ -105,6 +105,14 @@ def add_global_metadata(self, metadata: dict[str, Any]) -> None: if self._logger_exporter: self._logger_exporter.add_metadata(metadata) + def register_payload_transmitted_callback( + self, callback, include_failures: bool = False + ) -> Optional[Callable[[], None]]: + """Register a callback for payload transmission events.""" + if self._logger_exporter: + return self._logger_exporter.register_payload_transmitted_callback(callback, include_failures) + return None + def log(self, event_name: str, attributes: Optional[dict[str, Any]] = None) -> None: """Log a telemetry event. 
diff --git a/olive/telemetry/library/transport.py b/olive/telemetry/library/transport.py index f218e61eb1..0e5d09566d 100644 --- a/olive/telemetry/library/transport.py +++ b/olive/telemetry/library/transport.py @@ -166,6 +166,7 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple status_code=status_code, payload_size_bytes=payload_size_bytes, item_count=item_count, + payload_bytes=payload, ) ) @@ -173,8 +174,12 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple return True, status_code else: # Log error response - error_message = response.text[:100] if response.text else "" - event_source.http_transport_error_response("HttpJsonPost", status_code, error_message, "") + if event_source.is_error_logging_enabled(): + collector_error = response.headers.get("Collector-Error", "") + error_details = response.text[:100] if response.text else "" + event_source.http_transport_error_response( + "HttpJsonPost", status_code, collector_error, error_details + ) return False, status_code except requests.exceptions.Timeout: @@ -188,6 +193,7 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count, + payload_bytes=payload, ) ) @@ -204,6 +210,7 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple status_code=None, payload_size_bytes=payload_size_bytes, item_count=item_count, + payload_bytes=payload, ) ) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 777fc024b0..956540838f 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -65,34 +65,6 @@ def __init__(self, telemetry: "Telemetry") -> None: self._cache_lock = threading.Lock() self._cache_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="olive_telemetry_cache") self._flush_in_progress = False - self._transport = None - - def setup_payload_callbacks(self) -> None: - 
logger = self._telemetry._logger - if not logger: - return - - exporter = getattr(logger, "_logger_exporter", None) - if not exporter: - return - - transport = getattr(exporter, "_transport", None) - if not transport: - return - if getattr(transport, "_cache_wrap_installed", False): - return - - original_send = transport.send - - def wrapped_send(payload: bytes, timeout_sec: float, item_count: int = 1): - transport._last_payload = payload - return original_send(payload, timeout_sec, item_count=item_count) - - transport.send = wrapped_send - transport._cache_wrap_installed = True - self._transport = transport - - exporter.register_payload_transmitted_callback(self._on_payload_transmitted, include_failures=True) def shutdown(self) -> None: if self._cache_executor: @@ -103,14 +75,14 @@ def shutdown(self) -> None: self._cache_executor.shutdown(wait=True) self._cache_executor = None - def _on_payload_transmitted(self, args) -> None: + def on_payload_transmitted(self, args) -> None: try: if args.succeeded: # Also flush any previously cached failures self._schedule_cache_task(self._flush_cache) else: # Telemetry failed - cache this payload for later replay - payload = getattr(self._transport, "_last_payload", None) + payload = getattr(args, "payload_bytes", None) if payload: self._schedule_cache_task(self._write_payload_to_cache, payload) except Exception: @@ -165,12 +137,13 @@ def _write_payload_to_cache(self, payload: bytes) -> None: return def _flush_cache(self) -> None: - with self._cache_lock: - if self._flush_in_progress: - return - self._flush_in_progress = True + entries: list[dict[str, Any]] = [] + try: + with self._cache_lock: + if self._flush_in_progress: + return + self._flush_in_progress = True - try: cache_path = self._get_cache_path() if cache_path is None: return @@ -185,24 +158,25 @@ def _flush_cache(self) -> None: cache_path.unlink(missing_ok=True) - for entry in entries: - try: - event_name = entry.get("event_name") - event_data = 
entry.get("event_data") - if not event_name or not event_data: - continue - attributes = json.loads(event_data) - if not isinstance(attributes, dict): - continue - attributes["initTs"] = entry.get("ts") - self._telemetry.log(event_name, attributes, None) - except Exception: + for entry in entries: + try: + event_name = entry.get("event_name") + event_data = entry.get("event_data") + if not event_name or not event_data: + continue + attributes = json.loads(event_data) + if not isinstance(attributes, dict): continue + attributes["initTs"] = entry.get("ts") + self._telemetry.log(event_name, attributes, None) + except Exception: + continue - self._telemetry.force_flush(timeout_millis=5_000) - except Exception: - return - finally: + self._telemetry.force_flush(timeout_millis=5_000) + except Exception: + return + finally: + with self._cache_lock: self._flush_in_progress = False @@ -237,7 +211,10 @@ def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: def _setup_payload_callbacks(self) -> None: if not self._logger: return - self._cache_handler.setup_payload_callbacks() + self._logger.register_payload_transmitted_callback( + self._cache_handler.on_payload_transmitted, + include_failures=True, + ) def add_global_metadata(self, metadata: dict[str, Any]) -> None: if self._logger: From 1bfd8224d77eeb39886a0931565d3264fc3bcbf0 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 23:39:01 -0600 Subject: [PATCH 23/31] Add Privacy.md --- README.md | 3 +++ docs/Privacy.md | 16 ++++++++++++++++ olive/cli/base.py | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 docs/Privacy.md diff --git a/README.md b/README.md index 0cac2cf78d..70bee80855 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,9 @@ The sample chat app to run is found as [model-chat.py](https://github.com/micros - [Documentation](https://microsoft.github.io/Olive) - [Recipes](https://github.com/microsoft/olive-recipes) +## Data/Telemetry +Distributions of this project 
may collect usage data and send it to Microsoft to help improve our products and services. See the [privacy statement](docs/Privacy.md) for more details. + ## 🤝 Contributions and Feedback - We welcome contributions! Please read the [contribution guidelines](./CONTRIBUTING.md) for more details on how to contribute to the Olive project. - For feature requests or bug reports, file a [GitHub Issue](https://github.com/microsoft/Olive/issues). diff --git a/docs/Privacy.md b/docs/Privacy.md new file mode 100644 index 0000000000..5b8e48a36d --- /dev/null +++ b/docs/Privacy.md @@ -0,0 +1,16 @@ +# Privacy + +## Data Collection +The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off telemetry as described below. Our privacy statement is located at https://go.microsoft.com/fwlink/?LinkID=824704. You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices. + +*** + +#### Technical Details +Olive uses the [OpenTelemetry](https://opentelemetry.io/) API for its implementation. Based on user consent, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls. Application, device, and version information is collected automatically. + +In addition, Olive may collect optional telemetry data such as: +- User interactions +- Performance data +- Exception information + +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to your Olive CLI commands. 
diff --git a/olive/cli/base.py b/olive/cli/base.py index 969d1205e6..ffbea3a095 100644 --- a/olive/cli/base.py +++ b/olive/cli/base.py @@ -633,7 +633,7 @@ def add_search_options(sub_parser: ArgumentParser): def add_telemetry_options(sub_parser: ArgumentParser): """Add telemetry options to the sub_parser.""" - sub_parser.add_argument("--disable-telemetry", action="store_true", help="Disable telemetry for this command.") + sub_parser.add_argument("--disable_telemetry", action="store_true", help="Disable telemetry for this command.") return sub_parser From 9ddd042d66dc8df6be42db786ee8932c131b0886 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 2 Feb 2026 23:53:12 -0600 Subject: [PATCH 24/31] Lint --- olive/telemetry/library/callback_manager.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/olive/telemetry/library/callback_manager.py b/olive/telemetry/library/callback_manager.py index 7ec11f2b2e..a3a8eb1f08 100644 --- a/olive/telemetry/library/callback_manager.py +++ b/olive/telemetry/library/callback_manager.py @@ -63,11 +63,7 @@ def register( """ with self._lock: if self._disposed: - - def unregister(): - return None - - return unregister + return lambda: None # No-op unregister if disposed entry = (callback, include_failures) self._callbacks.append(entry) From 1662297ab72a0df000eea71e5ffbe81d7c6c9594 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 5 Feb 2026 04:49:06 -0600 Subject: [PATCH 25/31] Fixes, improvements, tracking, don't collect stats during tests --- olive/engine/engine.py | 2 + olive/telemetry/deviceid/_store.py | 2 +- olive/telemetry/deviceid/deviceid.py | 9 +- olive/telemetry/library/callback_manager.py | 23 +- olive/telemetry/library/exporter.py | 3 +- olive/telemetry/library/telemetry_logger.py | 6 - olive/telemetry/telemetry.py | 543 ++++++++++++++++---- olive/telemetry/utils.py | 6 +- test/conftest.py | 6 + 9 files changed, 476 insertions(+), 124 deletions(-) diff --git a/olive/engine/engine.py 
b/olive/engine/engine.py index e78264892e..c48b099537 100644 --- a/olive/engine/engine.py +++ b/olive/engine/engine.py @@ -29,6 +29,7 @@ from olive.search.search_strategy import SearchStrategy, SearchStrategyConfig from olive.systems.common import SystemType from olive.systems.system_config import SystemConfig +from olive.telemetry import action if TYPE_CHECKING: from olive.engine.packaging.packaging_config import PackagingConfig @@ -148,6 +149,7 @@ def register( def set_input_passes_configs(self, pass_configs: dict[str, list[RunPassConfig]]): self.input_passes_configs = pass_configs + @action def run( self, input_model_config: ModelConfig, diff --git a/olive/telemetry/deviceid/_store.py b/olive/telemetry/deviceid/_store.py index c66087b699..c7491d4331 100644 --- a/olive/telemetry/deviceid/_store.py +++ b/olive/telemetry/deviceid/_store.py @@ -11,7 +11,7 @@ def __init__(self) -> None: self._file_path: Path = self._build_path() def _build_path(self) -> Path: - return get_telemetry_base_dir() + return get_telemetry_base_dir() / "deviceid" def retrieve_id(self) -> str: """Retrieve the device id from the store location. diff --git a/olive/telemetry/deviceid/deviceid.py b/olive/telemetry/deviceid/deviceid.py index cf45e9fc47..70847fa5e3 100644 --- a/olive/telemetry/deviceid/deviceid.py +++ b/olive/telemetry/deviceid/deviceid.py @@ -22,9 +22,10 @@ def get_device_id() -> str: An empty string is returned if an error occurs during saving or retrieval of the device id. 
- Linux id location: $XDG_CACHE_HOME/deviceid if defined else $HOME/.cache/deviceid - MacOS id location: $HOME/Library/Application Support/Microsoft/DeveloperTools/deviceid - Windows id location: HKEY_CURRENT_USER\SOFTWARE\Microsoft\DeveloperTools\deviceid + Linux id location: $XDG_CACHE_HOME/Microsoft/DeveloperTools/.onnxruntime/deviceid if defined + else $HOME/.cache/Microsoft/DeveloperTools/.onnxruntime/deviceid + MacOS id location: $HOME/Library/Application Support/Microsoft/DeveloperTools/.onnxruntime/deviceid + Windows id location: HKEY_CURRENT_USER\SOFTWARE\Microsoft\.onnxruntime\deviceid :return: The device id. :rtype: str @@ -95,6 +96,6 @@ def get_encrypted_device_id_and_status() -> tuple[str, DeviceIdStatus]: str: FIPS-compliant encrypted device ID (base64-encoded) """ - device_id = get_device_id() if _device_id_state["device_id"] is not None else _device_id_state["device_id"] + device_id = _device_id_state["device_id"] if _device_id_state["device_id"] is not None else get_device_id() encrypted_device_id = hashlib.sha256(device_id.encode("utf-8")).digest().hex().upper() if device_id else "" return encrypted_device_id, _device_id_state["status"] diff --git a/olive/telemetry/library/callback_manager.py b/olive/telemetry/library/callback_manager.py index a3a8eb1f08..ee62553163 100644 --- a/olive/telemetry/library/callback_manager.py +++ b/olive/telemetry/library/callback_manager.py @@ -14,10 +14,7 @@ @dataclass class PayloadTransmittedCallbackArgs: - """Arguments passed to payload transmitted callbacks. - - Matches the .NET OneCollectorExporterPayloadTransmittedCallbackArguments. 
- """ + """Arguments passed to payload transmitted callbacks.""" succeeded: bool """Whether the transmission succeeded.""" @@ -46,7 +43,7 @@ def __init__(self): """Initialize the callback manager.""" self._callbacks: list[tuple[Callable[[PayloadTransmittedCallbackArgs], None], bool]] = [] self._lock = threading.Lock() - self._disposed = False + self._closed = False def register( self, callback: Callable[[PayloadTransmittedCallbackArgs], None], include_failures: bool = False @@ -62,7 +59,7 @@ def register( """ with self._lock: - if self._disposed: + if self._closed: return lambda: None # No-op unregister if disposed entry = (callback, include_failures) self._callbacks.append(entry) @@ -87,7 +84,7 @@ def notify(self, args: PayloadTransmittedCallbackArgs) -> None: """ # Get snapshot of callbacks to avoid holding lock during invocation with self._lock: - if self._disposed: + if self._closed: return callbacks_snapshot = self._callbacks.copy() @@ -103,13 +100,11 @@ def notify(self, args: PayloadTransmittedCallbackArgs) -> None: # Log but don't propagate exceptions from user code event_source.exception_thrown_from_user_code("PayloadTransmittedCallback", ex) - def clear(self) -> None: - """Clear all registered callbacks.""" - with self._lock: - self._callbacks.clear() + def close(self) -> None: + """Close the callback manager and prevent further registrations. - def dispose(self) -> None: - """Dispose the manager and prevent further registrations.""" + This method is idempotent and can be called multiple times. 
+ """ with self._lock: self._callbacks.clear() - self._disposed = True + self._closed = True diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index a1de2efdb3..69f6cf87dc 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -321,5 +321,6 @@ def shutdown(self) -> None: if hasattr(self, "_session"): self._session.close() + # Close callback manager if hasattr(self, "_callback_manager"): - self._callback_manager.dispose() + self._callback_manager.close() diff --git a/olive/telemetry/library/telemetry_logger.py b/olive/telemetry/library/telemetry_logger.py index 23d3d34476..7eb236e759 100644 --- a/olive/telemetry/library/telemetry_logger.py +++ b/olive/telemetry/library/telemetry_logger.py @@ -140,12 +140,6 @@ def shutdown(self) -> None: if self._logger_provider: self._logger_provider.shutdown() - def force_flush(self, timeout_millis: float = 10_000) -> bool: - """Force flush buffered log records.""" - if self._logger_provider: - return self._logger_provider.force_flush(timeout_millis=timeout_millis) - return False - @classmethod def get_default_logger(cls, connection_string: Optional[str] = None) -> "TelemetryLogger": """Get or create the default telemetry logger. 
diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 956540838f..08167045de 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -5,14 +5,16 @@ """Thin wrapper around the OneCollector telemetry logger with event helpers.""" import base64 +import errno import json -import pickle import platform import threading import time -from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional + +if TYPE_CHECKING: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs from olive.telemetry.constants import CONNECTION_STRING from olive.telemetry.deviceid import get_encrypted_device_id_and_status @@ -36,6 +38,7 @@ "os.arch", "app_version", "app_instance_id", + "initTs", ], ACTION_EVENT_NAME: [ "invoked_from", @@ -44,157 +47,406 @@ "success", "app_version", "app_instance_id", + "initTs", ], ERROR_EVENT_NAME: [ "exception_type", "exception_message", "app_version", "app_instance_id", + "initTs", ], } CRITICAL_EVENTS = {HEARTBEAT_EVENT_NAME} MAX_CACHE_SIZE_BYTES = 5 * 1024 * 1024 HARD_MAX_CACHE_SIZE_BYTES = 10 * 1024 * 1024 -CACHE_FILE_NAME = "olive.pkl" +CACHE_FILE_NAME = "olive.json" class TelemetryCacheHandler: + """Handles caching of failed telemetry events for offline resilience. 
+ + Design decisions: + - Single shared cache file (olive.json) for simplicity + - Cache writes are synchronous (fast JSON operations don't need async) + - Cache flush runs in a separate thread (slow network I/O) + - Flush triggered on success when cached events exist + - All critical sections protected by lock to prevent race conditions + - Newline-delimited JSON format for human readability and partial corruption recovery + + Assumptions: + - File I/O (JSON lines) is fast enough for synchronous execution (~microseconds) + - Network I/O is slow and should not block the callback thread + - Successful send indicates network is available to retry cached events + - Cache persists across sessions for offline resilience + """ + def __init__(self, telemetry: "Telemetry") -> None: self._telemetry = telemetry - self._cache_lock = threading.Lock() - self._cache_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="olive_telemetry_cache") - self._flush_in_progress = False + # Single shared cache file for all processes + self._cache_file_name = CACHE_FILE_NAME + self._shutdown = False + # Protects all shared state to prevent race conditions + self._lock = threading.Lock() + self._callback_condition = threading.Condition() + self._callbacks_item_count = 0 + self._events_logged = 0 + # Prevents concurrent flush operations + self._is_flushing = False def shutdown(self) -> None: - if self._cache_executor: - # Wait for pending cache tasks to complete before shutting down - # Note: We intentionally do NOT flush the cache here. The cache persists across - # sessions for offline resilience. If network is working, the success callback - # already flushed. If network is down, flushing would fail anyway. - self._cache_executor.shutdown(wait=True) - self._cache_executor = None - - def on_payload_transmitted(self, args) -> None: + """Signal shutdown to prevent new operations. + + Note: Does NOT flush the cache. Cache persists across sessions for + offline resilience. 
If network is working, success callbacks already + flushed. If network is down, flushing would fail anyway. + """ + with self._lock: + self._shutdown = True + + def __del__(self): + """Cleanup cache handler resources on garbage collection. + + Safety net to ensure shutdown is called even if not done explicitly. + """ try: - if args.succeeded: - # Also flush any previously cached failures - self._schedule_cache_task(self._flush_cache) - else: - # Telemetry failed - cache this payload for later replay - payload = getattr(args, "payload_bytes", None) - if payload: - self._schedule_cache_task(self._write_payload_to_cache, payload) + self.shutdown() except Exception: - # Fail silently. pass - def _schedule_cache_task(self, func, *args) -> None: + def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None: + """Telemetry payload transmission callback. + + Design decisions: + - Ignore callbacks during flush (unlikely to fail during successful flush) + - On success: flush cache if any cached events exist + - On failure: write to cache immediately (synchronous for simplicity) + + Assumptions: + - Successful transmission indicates network is available to retry cached events + - If flush is in progress, we already successfully sent an event, so unlikely an event would suddenly fail + - Multiple concurrent successes don't need multiple flush operations + - Failed payloads should be cached immediately to avoid loss + """ try: - if self._cache_executor: - self._cache_executor.submit(func, *args) - else: - # If executor is not available (e.g., during shutdown), execute synchronously - func(*args) + payload = None + should_flush = False + + with self._lock: + if self._shutdown: + return + + # Skip callbacks from replayed events during flush + # If a flush is in progress it means we successfully sent an event, + # so it's unlikely that an event would suddenly fail and need to be cached + # and we don't need to flush again. 
+ if self._is_flushing: + with self._callback_condition: + self._callbacks_item_count += args.item_count + self._callback_condition.notify_all() + return + + if args.succeeded: + # Only flush if cache exists and no flush is in progress + cache_path = self.cache_path + if cache_path and cache_path.exists(): + should_flush = True + else: + payload = args.payload_bytes + + if should_flush: + # Release lock before scheduling (flush runs in separate thread) + self._schedule_flush() + elif payload: + # Write synchronously - JSON operations are fast enough + self._write_payload_to_cache(payload) except Exception: - # Fail silently. + # Fail silently - telemetry should never crash the application pass + finally: + with self._callback_condition: + self._callbacks_item_count += args.item_count + self._callback_condition.notify_all() + + def wait_for_callbacks(self, timeout_sec: float) -> bool: + deadline = time.time() + timeout_sec + while True: + with self._callback_condition: + callbacks_item_count = self._callbacks_item_count + expected_items = self._events_logged + if not self.is_flushing() and callbacks_item_count >= expected_items: + return True + remaining = deadline - time.time() + if remaining <= 0: + return False + with self._callback_condition: + self._callback_condition.wait(timeout=remaining) + + def record_event_logged(self, count: int = 1) -> None: + with self._callback_condition: + self._events_logged += count + + def _schedule_flush(self) -> None: + """Schedule cache flush in a separate thread to avoid blocking the callback. 
+ + Design decisions: + - Check _is_flushing before spawning thread to avoid unnecessary threads + - Run flush in daemon thread (don't block process exit) + - Acquire lock at start to set _is_flushing flag atomically + - Always clear _is_flushing flag even if flush fails + + Assumptions: + - Flush operations are slow (network I/O) and should not block callbacks + - Daemon thread is acceptable (flush is best-effort) + """ + # Check before spawning thread to avoid unnecessary thread creation + with self._lock: + if self._shutdown or self._is_flushing: + return + self._is_flushing = True + + def flush_task(): + try: + self._flush_cache() + except Exception: + # Fail silently + pass + finally: + # Always clear flag, even on exception + with self._lock: + self._is_flushing = False - def _get_telemetry_support_dir(self) -> Optional[Path]: - return get_telemetry_base_dir() + thread = threading.Thread(target=flush_task, daemon=True) + thread.start() - def _get_cache_path(self) -> Optional[Path]: - support_dir = self._get_telemetry_support_dir() + @property + def cache_path(self) -> Optional[Path]: + """Get the path to the telemetry cache file. + + Returns: + Optional[Path]: Path to cache file, or None if base directory unavailable. + + """ + support_dir = get_telemetry_base_dir() if not support_dir: return None - return support_dir / "cache" / CACHE_FILE_NAME + return support_dir / "cache" / self._cache_file_name def _write_payload_to_cache(self, payload: bytes) -> None: + """Write failed telemetry payload to cache for later retry. 
+ + Design decisions: + - Parse payload to extract individual events (allows filtering) + - Filter to only critical events near size limit (preserves important data) + - Use exponential backoff for file contention (avoids spinning) + - Fail silently on errors (telemetry should never crash app) + + Assumptions: + - JSON operations are fast enough for synchronous execution + - File contention is rare and transient (retry a few times) + - Cache size limits prevent unbounded growth + - Critical events (heartbeat) are more important than others + """ try: - cache_path = self._get_cache_path() + cache_path = self.cache_path if cache_path is None: return - cache_path.parent.mkdir(parents=True, exist_ok=True) - cache_size = cache_path.stat().st_size if cache_path.exists() else 0 - - if cache_size >= HARD_MAX_CACHE_SIZE_BYTES: - return - + # Parse payload into individual events for filtering entries = _parse_payload(payload) if not entries: return - if cache_size >= MAX_CACHE_SIZE_BYTES: - entries = [entry for entry in entries if entry.get("event_name") in CRITICAL_EVENTS] - if not entries: - return + cache_path.parent.mkdir(parents=True, exist_ok=True) - with self._cache_lock, cache_path.open("ab") as cache_file: - for entry in entries: - pickle.dump(entry, cache_file, protocol=pickle.HIGHEST_PROTOCOL) + max_retries = 3 + for attempt in range(max_retries + 1): + try: + cache_size = cache_path.stat().st_size if cache_path.exists() else 0 + + # Hard limit: stop caching entirely to prevent unbounded growth + if cache_size >= HARD_MAX_CACHE_SIZE_BYTES: + return + + # Soft limit: keep only critical events to preserve space + if cache_size >= MAX_CACHE_SIZE_BYTES: + entries = [entry for entry in entries if entry["event_name"] in CRITICAL_EVENTS] + if not entries: + return + + # Append newline-delimited JSON (human-readable, partial corruption recovery) + with cache_path.open("a", encoding="utf-8") as cache_file: + for entry in entries: + # Write compact JSON on single line + 
json.dump(entry, cache_file, ensure_ascii=False, separators=(",", ":")) + cache_file.write("\n") + return + except OSError as exc: + # Retry only on transient access errors (file locked by another process) + if exc.errno not in {errno.EACCES, errno.EAGAIN, errno.EWOULDBLOCK, errno.EBUSY}: + return + if attempt >= max_retries: + return + # Exponential backoff: 50ms, 100ms, 200ms (aligned with C# implementation) + time.sleep(0.05 * (2**attempt)) except Exception: + # Fail silently - telemetry errors should not crash the application return def _flush_cache(self) -> None: - entries: list[dict[str, Any]] = [] + """Flush this process's cached events back to telemetry service.""" + cache_path = self.cache_path + if cache_path is None or not cache_path.exists(): + return + + self._flush_cache_file(cache_path) + + def _flush_cache_file(self, cache_path: Path) -> None: + """Flush cached events back to telemetry service. + + Approach: + 1. Atomically rename cache → .flush (claims ownership, prevents concurrent flushes) + 2. Read all events from .flush file + 3. Queue all events for sending via telemetry logger + 4. Force flush with 2-second timeout + 5. On success: delete .flush file + 6. 
On failure: restore .flush → cache for retry + + Multi-process coordination: + - `replace()` is atomic; only one process can successfully rename the cache file + - If another process already renamed it, we get FileNotFoundError and abort + - Stale .flush files from crashes are overwritten by the atomic rename + + Shutdown handling: + - If shutdown flag set during flush, restore cache before returning + - This preserves events even if callbacks don't fire during shutdown + + Callback behavior: + - Queued events trigger callbacks with success/failure + - Failed events are automatically re-cached via callbacks (unless shutting down) + - The _is_flushing flag prevents re-caching of replayed events during flush + """ + flush_path = None try: - with self._cache_lock: - if self._flush_in_progress: + # Check shutdown before starting (under lock to prevent race) + with self._lock: + if self._shutdown: return - self._flush_in_progress = True - cache_path = self._get_cache_path() - if cache_path is None: - return - if not cache_path.exists(): - return + if not cache_path.exists(): + return - entries = _read_cache_entries(cache_path) + # Atomically rename to .flush file to claim ownership + # Overwrite any stale .flush file from crashed process (C# pattern) + flush_path = cache_path.with_name(f"{cache_path.name}.flush") + try: + # On Windows/POSIX, replace() overwrites existing files atomically + cache_path.replace(flush_path) + except FileNotFoundError: + # Cache already claimed by another flush or doesn't exist + return - if not entries: - cache_path.unlink(missing_ok=True) - return + # Read all cached entries + entries = _read_cache_entries(flush_path) - cache_path.unlink(missing_ok=True) + if not entries: + # Empty cache, just delete the flush file + flush_path.unlink(missing_ok=True) + return + # Replay all events through telemetry logger + # Note: _is_flushing flag (set by caller) prevents these callbacks from re-caching or triggering nested flushes + # (unlikely since 
we just successfully sent an event, indicating network is available) for entry in entries: try: - event_name = entry.get("event_name") - event_data = entry.get("event_data") + event_name = entry["event_name"] + event_data = entry["event_data"] if not event_name or not event_data: continue attributes = json.loads(event_data) if not isinstance(attributes, dict): continue - attributes["initTs"] = entry.get("ts") + # Preserve original timestamp + attributes["initTs"] = entry.get("initTs", entry["ts"]) self._telemetry.log(event_name, attributes, None) except Exception: + # Skip malformed entries continue - self._telemetry.force_flush(timeout_millis=5_000) + # Check if shutdown happened during flush + with self._lock: + if self._shutdown: + # Restore cache to avoid data loss during shutdown + if flush_path and flush_path.exists(): + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + flush_path.replace(cache_path) + except Exception: + pass + return + + # Cleanup based on flush result + flush_success = False + with self._callback_condition: + callbacks_item_count = self._callbacks_item_count + expected_items = self._events_logged + if callbacks_item_count >= expected_items: + flush_success = True + if flush_success: + # Success: delete the flush file (events were sent) + if flush_path: + flush_path.unlink(missing_ok=True) + elif flush_path and flush_path.exists(): + # Failure: restore cache for retry later + cache_path.parent.mkdir(parents=True, exist_ok=True) + flush_path.replace(cache_path) except Exception: + # Best-effort restore on any exception to prevent data loss + if flush_path and flush_path.exists(): + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + flush_path.replace(cache_path) + except Exception: + # If restore fails, we lose the data (acceptable for telemetry) + pass return - finally: - with self._cache_lock: - self._flush_in_progress = False + + def is_flushing(self) -> bool: + with self._lock: + return self._is_flushing class 
Telemetry: - """Wrapper that wires environment configuration into the library logger.""" + """Wrapper that wires environment configuration into the library logger. + + This is a singleton class - all instances share the same state. + Use Telemetry() to get the singleton instance. + """ _instance: Optional["Telemetry"] = None - _initialized: bool = False + _lock = threading.Lock() def __new__(cls): + """Create or return the singleton instance. + + Thread-safe singleton implementation using double-checked locking. + """ if cls._instance is None: - cls._instance = super().__new__(cls) - cls._instance._initialized = False + with cls._lock: + # Double-check pattern to prevent race conditions + if cls._instance is None: + instance = super().__new__(cls) + instance._initialized = False + cls._instance = instance return cls._instance def __init__(self): + """Initialize the telemetry logger (only runs once for singleton).""" + # Prevent re-initialization if self._initialized: return + self._logger = self._create_logger() self._cache_handler = TelemetryCacheHandler(self) self._initialized = True @@ -211,21 +463,51 @@ def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: def _setup_payload_callbacks(self) -> None: if not self._logger: return + # Register callback for payload transmission events + # No need to store unregister function - logger shutdown will clean up callbacks self._logger.register_payload_transmitted_callback( self._cache_handler.on_payload_transmitted, include_failures=True, ) def add_global_metadata(self, metadata: dict[str, Any]) -> None: + """Add metadata to all telemetry events. + + Args: + metadata: Dictionary of metadata key-value pairs to add to all events. + These will be included in every telemetry event sent. 
+ + Example: + >>> telemetry = Telemetry() + >>> telemetry.add_global_metadata({"user_id": "12345", "environment": "production"}) + + """ if self._logger: self._logger.add_global_metadata(metadata) def log( - self, event_name: str, attributes: Optional[dict[str, Any]] = None, metadata: Optional[dict[str, Any]] = None + self, + event_name: str, + attributes: Optional[dict[str, Any]] = None, + metadata: Optional[dict[str, Any]] = None, ) -> None: + """Log a telemetry event. + + Args: + event_name: Name of the event to log (e.g., "UserLogin", "ModelTrained"). + attributes: Optional dictionary of event-specific attributes. + metadata: Optional dictionary of additional metadata to merge with attributes. + + Example: + >>> telemetry = Telemetry() + >>> telemetry.log("ModelOptimized", {"model_type": "bert", "duration_ms": 1500}) + + """ if self._logger: attrs = _merge_metadata(attributes, metadata) self._logger.log(event_name, attrs) + if self._cache_handler: + self._cache_handler.record_event_logged() def _log_heartbeat( self, @@ -251,23 +533,52 @@ def _log_heartbeat( self.log(HEARTBEAT_EVENT_NAME, attributes, metadata) def disable_telemetry(self) -> None: + """Disable all telemetry logging. + + After calling this method, no telemetry events will be sent until + telemetry is explicitly re-enabled. + """ if self._logger: self._logger.disable_telemetry() - def force_flush(self, timeout_millis: float = 10_000) -> bool: - if self._logger and hasattr(self._logger, "force_flush"): - return self._logger.force_flush(timeout_millis=timeout_millis) - return False + def shutdown(self, timeout_millis: float = 10_000, callback_timeout_millis: float = 2_000) -> None: + """Shutdown telemetry and flush pending events. - def shutdown(self) -> None: - # Shutdown cache handler FIRST to ensure pending cache tasks complete - # The cache handler will wait for replayed events to be sent before returning + Shutdown sequence: + 1. Wait for in-flight flush to complete (up to 1 second) + 2. 
Wait for callbacks + signal shutdown to cache handler + 3. Shutdown logger (cleans up callbacks automatically) + """ + # Step 1: Wait for pending flush to complete (matches C# 1-second timeout) + start_time = time.time() + while time.time() - start_time < 1.0: + if not self._cache_handler or not self._cache_handler.is_flushing(): + break + time.sleep(0.05) + + # Step 2: Wait for callbacks/flush to complete before shutting down cache handler if self._cache_handler: + # Nothing can be done if callbacks don't complete in time, so we ignore the result + _ = self._cache_handler.wait_for_callbacks(callback_timeout_millis / 1000) self._cache_handler.shutdown() + # Step 3: Shutdown logger (callbacks cleaned up automatically) if self._logger: self._logger.shutdown() + def __del__(self): + """Cleanup telemetry resources on garbage collection. + + This is a safety net to ensure resources are cleaned up even if + shutdown() is not explicitly called. However, relying on __del__ + is not recommended - always call shutdown() explicitly when done. + """ + try: + self.shutdown() + except Exception: + # Silently ignore errors during cleanup + pass + def _get_logger() -> Telemetry: """Get or create the singleton Telemetry instance.""" @@ -282,6 +593,18 @@ def _merge_metadata(attributes: Optional[dict[str, Any]], metadata: Optional[dic def _parse_payload(payload: bytes) -> list[dict[str, Any]]: + """Parse telemetry payload into individual event entries. 
+ + Design decisions: + - Filter events to only allowed keys (privacy/security) + - Store as minimal JSON (reduces cache size) + - Fail silently on malformed data (telemetry should be robust) + + Assumptions: + - Payload is newline-delimited JSON (OneCollector format) + - Events have "name", "time", and "data" fields + - Only whitelisted events and fields should be cached + """ entries = [] try: payload_text = payload.decode("utf-8") @@ -293,31 +616,46 @@ def _parse_payload(payload: bytes) -> list[dict[str, Any]]: continue try: event = json.loads(line) - event_name = event.get("name") + event_name = event["name"] if not event_name: continue - filtered_data = _filter_event_data(event_name, event.get("data") or {}) + # Filter to only allowed keys for privacy/security + filtered_data = _filter_event_data(event_name, event["data"]) if not filtered_data: continue entries.append( { - "ts": event.get("time") or time.time(), + "ts": event["time"] or time.time(), "event_name": event_name, + # Compact JSON to reduce cache size "event_data": json.dumps(filtered_data, ensure_ascii=False, separators=(",", ":")), } ) except Exception: + # Skip malformed lines continue except Exception: + # If entire payload is malformed, return empty list return [] return entries def _filter_event_data(event_name: str, data: dict[str, Any]) -> Optional[dict[str, Any]]: - allowed_keys = ALLOWED_KEYS.get(event_name) - if not allowed_keys: + """Filter event data to only allowed keys for privacy/security. 
+ + Design decisions: + - Whitelist approach (only explicitly allowed keys are included) + - Support nested keys with dot notation (e.g., "os.name") + - Return None if no allowed keys found (filters out unknown events) + + Assumptions: + - ALLOWED_KEYS dict defines all cacheable events and their fields + - Unknown events should not be cached (privacy/security) + """ + if event_name not in ALLOWED_KEYS: return None + allowed_keys = ALLOWED_KEYS[event_name] filtered: dict[str, Any] = {} for key in allowed_keys: @@ -346,17 +684,32 @@ def _set_nested_value(data: dict[str, Any], key: str, value: Any) -> None: def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: + """Read all entries from a cache file. + + Design decisions: + - Continue reading past malformed entries (partial data recovery) + - Return empty list on complete read failure (fail gracefully) + + Assumptions: + - Cache file contains newline-delimited JSON (one event per line) + - Each line is independent (one malformed line doesn't affect others) + - Empty or whitespace-only lines are skipped + """ entries = [] try: - with cache_path.open("rb") as cache_file: - while True: + with cache_path.open("r", encoding="utf-8") as cache_file: + for raw_line in cache_file: + line = raw_line.strip() + if not line: + continue try: - entry = pickle.load(cache_file) - entries.append(entry) - except EOFError: - break + entry = json.loads(line) + if isinstance(entry, dict): + entries.append(entry) except Exception: + # Malformed line, skip and continue continue except Exception: + # If file cannot be opened or read, return empty list return [] return entries diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 86c0bbc296..955d4e03cb 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -9,7 +9,7 @@ from types import TracebackType from typing import Optional -DEVICEID_LOCATION = r"Microsoft/DeveloperTools/deviceid/.onnxruntime/" +ORT_SUPPORT_DIR = 
r"Microsoft/DeveloperTools/.onnxruntime" def get_telemetry_base_dir() -> Path: @@ -24,13 +24,13 @@ def get_telemetry_base_dir() -> Path: home = os.getenv("HOME") if home is None: raise ValueError("HOME environment variable not set") - return Path(home) / "Library" / "Application Support" / DEVICEID_LOCATION + return Path(home) / "Library" / "Application Support" / ORT_SUPPORT_DIR home = os.getenv("XDG_CACHE_HOME", f"{os.getenv('HOME')}/.cache") if not home: raise ValueError("HOME environment variable not set") - return Path(home) / DEVICEID_LOCATION + return Path(home) / ORT_SUPPORT_DIR def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = None) -> str: diff --git a/test/conftest.py b/test/conftest.py index c57411ed76..db97c685af 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,6 +8,7 @@ import pytest from packaging import version +from olive.telemetry.telemetry import Telemetry from test.utils import create_onnx_model_file, delete_onnx_model_files @@ -39,3 +40,8 @@ def maybe_patch_inc(): yield else: yield + + +@pytest.fixture(scope="session", autouse=True) +def disable_telemetry(): + Telemetry().disable_telemetry() From 8c31e66c6db12c423bb09921a2e41c5f3571823b Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 5 Feb 2026 17:55:16 -0600 Subject: [PATCH 26/31] Update wording --- docs/Privacy.md | 12 ++++++------ olive/telemetry/telemetry.py | 26 ++++++++++++-------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index 5b8e48a36d..d20aa0de07 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -1,16 +1,16 @@ # Privacy ## Data Collection -The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off telemetry as described below. Our privacy statement is located at https://go.microsoft.com/fwlink/?LinkID=824704. 
You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices. +The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off the telemetry as described in the repository. There are also some features in the software that may enable Microsoft to collect data from users of your applications. If you use these features, you must comply with applicable law, including providing appropriate notices to users of your applications together with a copy of Microsoft's privacy statement. Our privacy statement can be found [here](https://go.microsoft.com/fwlink/?LinkID=824704). You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices. *** -#### Technical Details -Olive uses the [OpenTelemetry](https://opentelemetry.io/) API for its implementation. Based on user consent, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls. Application, device, and version information is collected automatically. +## Technical Details +Olive uses the [OpenTelemetry](https://opentelemetry.io/) API for its implementation. By default, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls. Application, device, and version information is collected automatically. 
-In addition, Olive may collect optional telemetry data such as: -- User interactions +In addition, Olive may collect additional telemetry data such as: +- Invoked commands - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to your Olive CLI commands. +Collection of this additional telemetry can be disabled by adding the `--disable-telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. diff --git a/olive/telemetry/telemetry.py index 08167045de..4d3f73f6d3 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -7,6 +7,7 @@ import base64 import errno import json +import os import platform import threading import time @@ -448,11 +449,14 @@ def __init__(self): return self._logger = self._create_logger() + event_source.disable() + self._cache_handler = TelemetryCacheHandler(self) self._initialized = True self._setup_payload_callbacks() self._log_heartbeat() - event_source.disable() + if os.environ.get("OLIVE_DISABLE_TELEMETRY") == "1": + self.disable_telemetry() def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: try: @@ -461,8 +465,6 @@ def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: return None def _setup_payload_callbacks(self) -> None: - if not self._logger: - return # Register callback for payload transmission events # No need to store unregister function - logger shutdown will clean up callbacks self._logger.register_payload_transmitted_callback( self._cache_handler.on_payload_transmitted, include_failures=True, ) @@ -482,8 +484,7 @@ def add_global_metadata(self, metadata: dict[str, Any]) -> None: >>> telemetry.add_global_metadata({"user_id": "12345", "environment": "production"}) """ - if self._logger: - self._logger.add_global_metadata(metadata) + self._logger.add_global_metadata(metadata) def log( self, @@ -503,11 +504,10 @@ def log( >>> telemetry.log("ModelOptimized", {"model_type": 
"bert", "duration_ms": 1500}) """ - if self._logger: - attrs = _merge_metadata(attributes, metadata) - self._logger.log(event_name, attrs) - if self._cache_handler: - self._cache_handler.record_event_logged() + attrs = _merge_metadata(attributes, metadata) + self._logger.log(event_name, attrs) + if self._cache_handler: + self._cache_handler.record_event_logged() def _log_heartbeat( self, @@ -538,8 +538,7 @@ def disable_telemetry(self) -> None: After calling this method, no telemetry events will be sent until telemetry is explicitly re-enabled. """ - if self._logger: - self._logger.disable_telemetry() + self._logger.disable_telemetry() def shutdown(self, timeout_millis: float = 10_000, callback_timeout_millis: float = 2_000) -> None: """Shutdown telemetry and flush pending events. @@ -563,8 +562,7 @@ def shutdown(self, timeout_millis: float = 10_000, callback_timeout_millis: floa self._cache_handler.shutdown() # Step 3: Shutdown logger (callbacks cleaned up automatically) - if self._logger: - self._logger.shutdown() + self._logger.shutdown() def __del__(self): """Cleanup telemetry resources on garbage collection. From 3c766e65d657427870cd0f228d33d2a2ba665d46 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 5 Feb 2026 18:01:51 -0600 Subject: [PATCH 27/31] Lint --- docs/Privacy.md | 2 +- olive/telemetry/telemetry.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index d20aa0de07..8335cf1b88 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -6,7 +6,7 @@ The software may collect information about you and your use of the software and *** ## Technical Details -Olive uses the [OpenTelemetry](https://opentelemetry.io/) API for its implementation. By default, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls. Application, device, and version information is collected automatically. 
+Olive uses the [OpenTelemetry](https://opentelemetry.io/) API for its implementation. Telemetry is turned ON by default. Based on user consent, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls. Application, device, and version information is collected automatically. In addition, Olive may collect additional telemetry data such as: - Invoked commands diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 4d3f73f6d3..3bf2e7048a 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -14,16 +14,15 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Optional -if TYPE_CHECKING: - from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs - from olive.telemetry.constants import CONNECTION_STRING from olive.telemetry.deviceid import get_encrypted_device_id_and_status from olive.telemetry.library.event_source import event_source -from olive.telemetry.library.telemetry_logger import TelemetryLogger as _LibraryTelemetryLogger -from olive.telemetry.library.telemetry_logger import get_telemetry_logger +from olive.telemetry.library.telemetry_logger import TelemetryLogger, get_telemetry_logger from olive.telemetry.utils import get_telemetry_base_dir +if TYPE_CHECKING: + from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs + # Default event names used by the high-level telemetry helpers. 
HEARTBEAT_EVENT_NAME = "OliveHeartbeat" ACTION_EVENT_NAME = "OliveAction" @@ -114,6 +113,7 @@ def __del__(self): try: self.shutdown() except Exception: + # Silently ignore errors during cleanup pass def on_payload_transmitted(self, args: "PayloadTransmittedCallbackArgs") -> None: @@ -384,6 +384,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: cache_path.parent.mkdir(parents=True, exist_ok=True) flush_path.replace(cache_path) except Exception: + # Silently ignore errors during cleanup pass return @@ -458,7 +459,7 @@ def __init__(self): if os.environ.get("OLIVE_DISABLE_TELEMETRY") == "1": self.disable_telemetry() - def _create_logger(self) -> Optional[_LibraryTelemetryLogger]: + def _create_logger(self) -> Optional[TelemetryLogger]: try: return get_telemetry_logger(base64.b64decode(CONNECTION_STRING).decode()) except Exception: From 492e5072c923853b1c56e8d069f0da90c5e0cade Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 8 Feb 2026 00:58:43 -0600 Subject: [PATCH 28/31] Improve file locking --- olive/telemetry/telemetry.py | 21 ++++++++------ olive/telemetry/utils.py | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 3bf2e7048a..153d2aaa59 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -18,7 +18,7 @@ from olive.telemetry.deviceid import get_encrypted_device_id_and_status from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger, get_telemetry_logger -from olive.telemetry.utils import get_telemetry_base_dir +from olive.telemetry.utils import get_telemetry_base_dir, _exclusive_file_lock if TYPE_CHECKING: from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs @@ -29,7 +29,7 @@ ERROR_EVENT_NAME = "OliveError" ALLOWED_KEYS = { - HEARTBEAT_EVENT_NAME: [ + HEARTBEAT_EVENT_NAME: { "device_id", "id_status", 
"os.name", @@ -39,8 +39,8 @@ "app_version", "app_instance_id", "initTs", - ], - ACTION_EVENT_NAME: [ + }, + ACTION_EVENT_NAME: { "invoked_from", "action_name", "duration_ms", @@ -48,14 +48,14 @@ "app_version", "app_instance_id", "initTs", - ], - ERROR_EVENT_NAME: [ + }, + ERROR_EVENT_NAME: { "exception_type", "exception_message", "app_version", "app_instance_id", "initTs", - ], + }, } CRITICAL_EVENTS = {HEARTBEAT_EVENT_NAME} @@ -240,6 +240,7 @@ def _write_payload_to_cache(self, payload: bytes) -> None: Design decisions: - Parse payload to extract individual events (allows filtering) - Filter to only critical events near size limit (preserves important data) + - Use file locking for multi-process safety (prevents corruption) - Use exponential backoff for file contention (avoids spinning) - Fail silently on errors (telemetry should never crash app) @@ -277,7 +278,8 @@ def _write_payload_to_cache(self, payload: bytes) -> None: return # Append newline-delimited JSON (human-readable, partial corruption recovery) - with cache_path.open("a", encoding="utf-8") as cache_file: + # Use exclusive file lock for multi-process safety + with _exclusive_file_lock(cache_path, mode="a") as cache_file: for entry in entries: # Write compact JSON on single line json.dump(entry, cache_file, ensure_ascii=False, separators=(",", ":")) @@ -686,6 +688,7 @@ def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: """Read all entries from a cache file. 
Design decisions: + - Use file locking for multi-process safety - Continue reading past malformed entries (partial data recovery) - Return empty list on complete read failure (fail gracefully) @@ -696,7 +699,7 @@ def _read_cache_entries(cache_path: Path) -> list[dict[str, Any]]: """ entries = [] try: - with cache_path.open("r", encoding="utf-8") as cache_file: + with _exclusive_file_lock(cache_path, mode="r") as cache_file: for raw_line in cache_file: line = raw_line.strip() if not line: diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 955d4e03cb..59ff463707 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -50,3 +50,56 @@ def _format_exception_message(ex: BaseException, tb: Optional[TracebackType] = N line_trunc = line_trunc[idx + len(file_line) :] lines.append(line_trunc) return "\n".join(lines) + + +class _ExclusiveFileLock: + """Cross-platform exclusive file lock context manager. + + Uses fcntl on Unix/Linux/macOS, msvcrt on Windows. + Prevents cache corruption when multiple processes access the same file. 
+ + Design decisions: + - Lock is held for the entire duration of file access (prevents partial reads/writes) + - Lock is released automatically on close (even on exceptions) + - Platform-specific implementation (fcntl for POSIX, msvcrt for Windows) + + Assumptions: + - File locking is supported on the platform + - Lock is advisory on some systems (cooperative locking) + """ + + def __init__(self, file_path: Path, mode: str): + self.file_path = file_path + self.mode = mode + self.file = None + + def __enter__(self): + self.file = open(self.file_path, self.mode, encoding="utf-8") + + # Platform-specific locking + if os.name == "posix": + import fcntl + + fcntl.flock(self.file.fileno(), fcntl.LOCK_EX) + elif os.name == "nt": + import msvcrt + + # Lock 1 byte at position 0 + msvcrt.locking(self.file.fileno(), msvcrt.LK_LOCK, 1) + + return self.file + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.file: + # Unlock happens automatically on close + self.file.close() + + +def _exclusive_file_lock(file_path: Path, mode: str): + """Create an exclusive file lock context manager. + + :param file_path: Path to the file to lock. + :param mode: File open mode ('r', 'a', 'w', etc.). + :return: Context manager that returns an open file handle. 
+ """ + return _ExclusiveFileLock(file_path, mode) From 6a9066373722b846a78b258abbfed73e4c40a830 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Sun, 8 Feb 2026 12:33:17 -0600 Subject: [PATCH 29/31] Sort --- olive/telemetry/telemetry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 153d2aaa59..41b8165571 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -18,7 +18,7 @@ from olive.telemetry.deviceid import get_encrypted_device_id_and_status from olive.telemetry.library.event_source import event_source from olive.telemetry.library.telemetry_logger import TelemetryLogger, get_telemetry_logger -from olive.telemetry.utils import get_telemetry_base_dir, _exclusive_file_lock +from olive.telemetry.utils import _exclusive_file_lock, get_telemetry_base_dir if TYPE_CHECKING: from olive.telemetry.library.callback_manager import PayloadTransmittedCallbackArgs From 21d48b173182201834b957dc24b8e15fc536007a Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Thu, 19 Feb 2026 13:25:38 -0600 Subject: [PATCH 30/31] Address comments --- olive/telemetry/deviceid/_store.py | 11 +++++--- olive/telemetry/deviceid/deviceid.py | 2 +- olive/telemetry/library/event_source.py | 29 ++++++++++++++-------- olive/telemetry/library/exporter.py | 4 +-- olive/telemetry/library/payload_builder.py | 2 ++ olive/telemetry/library/transport.py | 2 +- olive/telemetry/telemetry.py | 5 ++-- olive/telemetry/utils.py | 3 +++ 8 files changed, 37 insertions(+), 21 deletions(-) diff --git a/olive/telemetry/deviceid/_store.py b/olive/telemetry/deviceid/_store.py index c7491d4331..7a22736b3c 100644 --- a/olive/telemetry/deviceid/_store.py +++ b/olive/telemetry/deviceid/_store.py @@ -8,11 +8,13 @@ class Store: def __init__(self) -> None: - self._file_path: Path = self._build_path() + self._file_path: Path = self._build_path + @property def _build_path(self) -> Path: return get_telemetry_base_dir() 
/ "deviceid" + @property def retrieve_id(self) -> str: """Retrieve the device id from the store location. @@ -23,7 +25,7 @@ def retrieve_id(self) -> str: if not self._file_path.is_file(): raise FileExistsError(f"File {self._file_path.stem} does not exist") - return self._file_path.read_text(encoding="utf-8") + return self._file_path.read_text(encoding="utf-8").strip() def store_id(self, device_id: str) -> None: """Store the device id in the store location. @@ -44,6 +46,7 @@ def store_id(self, device_id: str) -> None: class WindowsStore: + @property def retrieve_id(self) -> str: """Retrieve the device id from the Windows registry.""" import winreg @@ -54,12 +57,12 @@ def retrieve_id(self) -> str: winreg.HKEY_CURRENT_USER, REGISTRY_PATH, reserved=0, access=winreg.KEY_READ | winreg.KEY_WOW64_64KEY ) as key_handle: device_id = winreg.QueryValueEx(key_handle, REGISTRY_KEY) - return device_id[0] + return device_id[0].strip() def store_id(self, device_id: str) -> None: """Store the device id in the windows registry. - :param str device_id: The device id to sstore. + :param str device_id: The device id to store. 
""" import winreg diff --git a/olive/telemetry/deviceid/deviceid.py b/olive/telemetry/deviceid/deviceid.py index 70847fa5e3..09087f33c3 100644 --- a/olive/telemetry/deviceid/deviceid.py +++ b/olive/telemetry/deviceid/deviceid.py @@ -44,7 +44,7 @@ def get_device_id() -> str: _device_id_state["device_id"] = device_id return device_id - device_id = store.retrieve_id().strip() + device_id = store.retrieve_id if len(device_id) > 256: _device_id_state["status"] = DeviceIdStatus.CORRUPTED _device_id_state["device_id"] = "" diff --git a/olive/telemetry/library/event_source.py b/olive/telemetry/library/event_source.py index 361387f2b5..e65d9d546a 100644 --- a/olive/telemetry/library/event_source.py +++ b/olive/telemetry/library/event_source.py @@ -39,10 +39,17 @@ def __init__(self): self.logger.addHandler(handler) self.logger.setLevel(logging.INFO) + @property def is_informational_logging_enabled(self) -> bool: """Check if informational level logging is enabled.""" return self.logger.isEnabledFor(logging.INFO) + @property + def is_warning_logging_enabled(self) -> bool: + """Check if warning level logging is enabled.""" + return self.logger.isEnabledFor(logging.WARNING) + + @property def is_error_logging_enabled(self) -> bool: """Check if error level logging is enabled.""" return self.logger.isEnabledFor(logging.ERROR) @@ -55,7 +62,7 @@ def export_exception_thrown(self, item_type: str, exception: Exception) -> None: exception: The exception that was thrown """ - if self.logger.isEnabledFor(logging.ERROR): + if self.is_error_logging_enabled: self.logger.error( "Exception thrown exporting '%s' batch: %s", item_type, @@ -73,7 +80,7 @@ def transport_data_sent(self, item_type: str, num_records: int, transport_descri transport_description: Description of transport used """ - if self.is_informational_logging_enabled(): + if self.is_informational_logging_enabled: self.logger.info( "Sent '%s' batch of %s item(s) to '%s' transport", item_type, @@ -91,7 +98,7 @@ def 
sink_data_written(self, item_type: str, num_records: int, sink_description: sink_description: Description of sink used """ - if self.is_informational_logging_enabled(): + if self.is_informational_logging_enabled: self.logger.info( "Wrote '%s' batch of %s item(s) to '%s' sink", item_type, @@ -112,7 +119,7 @@ def data_dropped( during_transmission: Number dropped during transmission """ - if self.logger.isEnabledFor(logging.WARNING): + if self.is_warning_logging_enabled: self.logger.warning( "Dropped %s '%s' item(s). %s item(s) dropped during serialization. %s item(s) dropped due to " "transmission failure", @@ -131,7 +138,7 @@ def transport_exception_thrown(self, transport_type: str, exception: Exception) exception: The exception that was thrown """ - if self.logger.isEnabledFor(logging.ERROR): + if self.is_error_logging_enabled: self.logger.error( "Exception thrown by '%s' transport: %s", transport_type, @@ -152,7 +159,7 @@ def http_transport_error_response( error_details: Additional error details """ - if self.logger.isEnabledFor(logging.ERROR): + if self.is_error_logging_enabled: self.logger.error( "Error response received by '%s' transport. StatusCode: %s, ErrorMessage: '%s', ErrorDetails: '%s'", transport_type, @@ -170,7 +177,7 @@ def event_full_name_discarded(self, event_namespace: str, event_name: str) -> No event_name: Event name """ - if self.logger.isEnabledFor(logging.WARNING): + if self.is_warning_logging_enabled: self.logger.warning( "Event full name discarded. EventNamespace: '%s', EventName: '%s'", event_namespace, @@ -185,7 +192,7 @@ def event_namespace_invalid(self, event_namespace: str) -> None: event_namespace: The invalid namespace """ - if self.logger.isEnabledFor(logging.WARNING): + if self.is_warning_logging_enabled: self.logger.warning( "Event namespace invalid. 
EventNamespace: '%s'", event_namespace, @@ -199,7 +206,7 @@ def event_name_invalid(self, event_name: str) -> None: event_name: The invalid event name """ - if self.logger.isEnabledFor(logging.WARNING): + if self.is_warning_logging_enabled: self.logger.warning( "Event name invalid. EventName: '%s'", event_name, @@ -214,7 +221,7 @@ def exception_thrown_from_user_code(self, user_code_type: str, exception: Except exception: The exception that was thrown """ - if self.logger.isEnabledFor(logging.ERROR): + if self.is_error_logging_enabled: self.logger.error( "Exception thrown by '%s' user code: %s", user_code_type, @@ -232,7 +239,7 @@ def attribute_dropped(self, item_type: str, attribute_name: str, reason: str) -> reason: Reason for dropping """ - if self.logger.isEnabledFor(logging.WARNING): + if self.is_warning_logging_enabled: self.logger.warning( "Dropped %s attribute '%s': %s", item_type, diff --git a/olive/telemetry/library/exporter.py b/olive/telemetry/library/exporter.py index 69f6cf87dc..68c57dccfc 100644 --- a/olive/telemetry/library/exporter.py +++ b/olive/telemetry/library/exporter.py @@ -281,7 +281,7 @@ def _build_payloads(self, serialized_items: list[bytes]) -> list[bytes]: self._payload_builder.reset() for item_bytes in serialized_items: - if not self._payload_builder.can_add(item_bytes) and not self._payload_builder.is_empty(): + if not self._payload_builder.can_add(item_bytes) and not self._payload_builder.is_empty: # Current payload is full, build it and start a new one payloads.append(self._payload_builder.build()) self._payload_builder.reset() @@ -289,7 +289,7 @@ def _build_payloads(self, serialized_items: list[bytes]) -> list[bytes]: self._payload_builder.add(item_bytes) # Build final payload - if not self._payload_builder.is_empty(): + if not self._payload_builder.is_empty: payloads.append(self._payload_builder.build()) return payloads diff --git a/olive/telemetry/library/payload_builder.py b/olive/telemetry/library/payload_builder.py index 
c4678a6a97..aea6015389 100644 --- a/olive/telemetry/library/payload_builder.py +++ b/olive/telemetry/library/payload_builder.py @@ -82,10 +82,12 @@ def build(self) -> bytes: return self.NEWLINE_SEPARATOR.join(self.items) + @property def item_count(self) -> int: """Get the number of items in the current payload.""" return len(self.items) + @property def is_empty(self) -> bool: """Check if the payload is empty.""" return len(self.items) == 0 diff --git a/olive/telemetry/library/transport.py b/olive/telemetry/library/transport.py index 0e5d09566d..6d7e371a77 100644 --- a/olive/telemetry/library/transport.py +++ b/olive/telemetry/library/transport.py @@ -174,7 +174,7 @@ def send(self, payload: bytes, timeout_sec: float, item_count: int = 1) -> tuple return True, status_code else: # Log error response - if event_source.is_error_logging_enabled(): + if event_source.is_error_logging_enabled: collector_error = response.headers.get("Collector-Error", "") error_details = response.text[:100] if response.text else "" event_source.http_transport_error_response( diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py index 41b8165571..3af6ebe03c 100644 --- a/olive/telemetry/telemetry.py +++ b/olive/telemetry/telemetry.py @@ -176,7 +176,7 @@ def wait_for_callbacks(self, timeout_sec: float) -> bool: with self._callback_condition: callbacks_item_count = self._callbacks_item_count expected_items = self._events_logged - if not self.is_flushing() and callbacks_item_count >= expected_items: + if not self.is_flushing and callbacks_item_count >= expected_items: return True remaining = deadline - time.time() if remaining <= 0: @@ -416,6 +416,7 @@ def _flush_cache_file(self, cache_path: Path) -> None: pass return + @property def is_flushing(self) -> bool: with self._lock: return self._is_flushing @@ -554,7 +555,7 @@ def shutdown(self, timeout_millis: float = 10_000, callback_timeout_millis: floa # Step 1: Wait for pending flush to complete (matches C# 1-second timeout) 
start_time = time.time() while time.time() - start_time < 1.0: - if not self._cache_handler or not self._cache_handler.is_flushing(): + if not self._cache_handler or not self._cache_handler.is_flushing: break time.sleep(0.05) diff --git a/olive/telemetry/utils.py b/olive/telemetry/utils.py index 59ff463707..b3ef57e666 100644 --- a/olive/telemetry/utils.py +++ b/olive/telemetry/utils.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import functools import os import platform import traceback @@ -12,6 +13,8 @@ ORT_SUPPORT_DIR = r"Microsoft/DeveloperTools/.onnxruntime" +@property +@functools.lru_cache(maxsize=1) def get_telemetry_base_dir() -> Path: os_name = platform.system() if os_name == "Windows": From 6418c8d35ae58de7dcb926f5f499a23d6c74982c Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Fri, 20 Feb 2026 02:56:36 -0600 Subject: [PATCH 31/31] Use cache directory from env var, if provided --- docs/Privacy.md | 2 +- olive/telemetry/telemetry.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/Privacy.md b/docs/Privacy.md index 8335cf1b88..a74cf5b1c9 100644 --- a/docs/Privacy.md +++ b/docs/Privacy.md @@ -13,4 +13,4 @@ In addition, Olive may collect additional telemetry data such as: - Performance data - Exception information -Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. +Collection of this additional telemetry can be disabled by adding the `--disable_telemetry` flag to any Olive CLI command, or by setting the `OLIVE_DISABLE_TELEMETRY` environment variable to `1` before running. If telemetry is enabled, but cannot be sent to Microsoft, it will be stored locally and sent when a connection is available. 
You can override the default cache directory by setting the `OLIVE_TELEMETRY_CACHE_DIR` environment variable to a valid directory path.
diff --git a/olive/telemetry/telemetry.py b/olive/telemetry/telemetry.py
index 3af6ebe03c..372ee6df89 100644
--- a/olive/telemetry/telemetry.py
+++ b/olive/telemetry/telemetry.py
@@ -229,10 +229,12 @@ def cache_path(self) -> Optional[Path]:
             Optional[Path]: Path to cache file, or None if base directory unavailable.
 
         """
-        support_dir = get_telemetry_base_dir()
-        if not support_dir:
-            return None
-        return support_dir / "cache" / self._cache_file_name
+        telemetry_cache_dir = None
+        if "OLIVE_TELEMETRY_CACHE_DIR" in os.environ:
+            telemetry_cache_dir = Path(os.environ["OLIVE_TELEMETRY_CACHE_DIR"])
+        if not telemetry_cache_dir:
+            telemetry_cache_dir = get_telemetry_base_dir() / "cache"
+        return telemetry_cache_dir / self._cache_file_name
 
     def _write_payload_to_cache(self, payload: bytes) -> None:
         """Write failed telemetry payload to cache for later retry.