From 4be7f9b97f97c0cff3272db81222aa7d754c4f88 Mon Sep 17 00:00:00 2001
From: Victor Dibia
Date: Mon, 16 Oct 2023 16:54:40 -0700
Subject: [PATCH 1/3] v1 for support for embeddings in llmx

---
 llmx/embeddings/__init__.py       |   1 +
 llmx/embeddings/oai_embedding.py  |  51 +++++++++++++
 llmx/embeddings/text_embedding.py |  42 ++++++++++
 llmx/utils.py                     | 122 ++++++++++++++++++++++++++----
 4 files changed, 200 insertions(+), 16 deletions(-)
 create mode 100644 llmx/embeddings/__init__.py
 create mode 100644 llmx/embeddings/oai_embedding.py
 create mode 100644 llmx/embeddings/text_embedding.py

diff --git a/llmx/embeddings/__init__.py b/llmx/embeddings/__init__.py
new file mode 100644
index 0000000..8bf3b16
--- /dev/null
+++ b/llmx/embeddings/__init__.py
@@ -0,0 +1 @@
+from .oai_embedding import OpenAITextEmbedding

diff --git a/llmx/embeddings/oai_embedding.py b/llmx/embeddings/oai_embedding.py
new file mode 100644
index 0000000..bbcd768
--- /dev/null
+++ b/llmx/embeddings/oai_embedding.py
@@ -0,0 +1,51 @@
+import os
+import numpy as np
+import openai
+from typing import Union, List
+from .text_embedding import TextEmbedding
+from ..utils import cache_request
+
+
+class OpenAITextEmbedding(TextEmbedding):
+    """Text embedding using OpenAI models."""
+
+    def __init__(
+        self,
+        model: str = "text-embedding-ada-002",
+        api_base: str = None,
+        api_version: str = None,
+        api_type: str = None,
+        api_key: str = os.environ.get("OPENAI_API_KEY", None),
+    ):
+        super().__init__(model)
+
+        if api_key is None:
+            raise ValueError(
+                "OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable."
+            )
+        self.api_key = api_key
+        openai.api_key = self.api_key
+        openai.api_base = api_base or openai.api_base
+        openai.api_version = api_version or openai.api_version
+        openai.api_type = api_type or openai.api_type
+        self.model = model
+        self._size = 1536
+
+    def embed(
+        self, text: Union[str, List[str]], use_cache: bool = True, **kwargs
+    ) -> np.ndarray:
+        """Compute embedding for text."""
+        # if text is a numpy.ndarray, convert it to a list
+        if isinstance(text, np.ndarray):
+            text = text.tolist()
+
+        cache_params = dict(text=text, model=self.model, **kwargs)
+        if use_cache:
+            response = cache_request(cache=self.cache, params=cache_params)
+            if response:
+                return np.array(response).astype(np.float32)
+        response = openai.Embedding.create(input=text, model=self.model)
+        embeddings = [x["embedding"] for x in response["data"]]
+
+        cache_request(cache=self.cache, params=cache_params, values=embeddings)
+        return np.array(embeddings).astype(np.float32)
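A minimal usage sketch of the new class (illustrative only, not part of the patch; it assumes OPENAI_API_KEY is set and uses the default text-embedding-ada-002 model):

    from llmx.embeddings import OpenAITextEmbedding

    embedder = OpenAITextEmbedding()
    vectors = embedder.embed(["hello world", "goodbye world"])
    print(vectors.shape)  # (2, 1536) -- ada-002 returns 1536-d vectors
    print(embedder.size)  # 1536

Repeat calls with the same text and model are served from the disk cache set up by the TextEmbedding base class (below) rather than from the OpenAI API.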
diff --git a/llmx/embeddings/text_embedding.py b/llmx/embeddings/text_embedding.py
new file mode 100644
index 0000000..dcbb7e4
--- /dev/null
+++ b/llmx/embeddings/text_embedding.py
@@ -0,0 +1,42 @@
+import os
+import numpy as np
+from typing import Union, List
+from diskcache import Cache
+from abc import ABC, abstractmethod
+from ..utils import get_user_cache_dir
+from ..version import APP_NAME
+
+
+class TextEmbedding(ABC):
+    """Interface for computing text embeddings."""
+
+    def __init__(self, model_name: str, **kwargs):
+        """
+        Initialize the text embedding model.
+
+        :param model_name: str, the name of the model
+        :param cache_dir: str, optional, the path to the directory where cache files are stored
+        """
+        self.model_name = model_name
+        self._size = None
+
+        app_name = APP_NAME
+        cache_dir_default = get_user_cache_dir(app_name)
+        cache_dir_based_on_model = os.path.join(cache_dir_default, model_name)
+        self.cache_dir = kwargs.get("cache_dir", cache_dir_based_on_model)
+        self.cache = Cache(self.cache_dir)
+
+    @abstractmethod
+    def embed(self, text: Union[str, List[str]]) -> np.ndarray:
+        """
+        Compute embedding for text.
+
+        :param text: Union[str, List[str]], the input text or a list of texts
+        :return: np.ndarray, the computed embedding(s)
+        """
+        pass
+
+    @property
+    def size(self):
+        """Return the embedding size."""
+        return self._size
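To make the base-class contract concrete, here is a hypothetical subclass (a sketch, not part of the patch; the model name and fixed size are invented for illustration):

    import numpy as np
    from typing import List, Union

    from llmx.embeddings.text_embedding import TextEmbedding


    class ZeroTextEmbedding(TextEmbedding):
        """Toy backend that returns zero vectors; handy in tests."""

        def __init__(self, model_name: str = "zero-embed-v0", **kwargs):
            super().__init__(model_name, **kwargs)  # wires up the per-model disk cache
            self._size = 8  # arbitrary embedding size for the toy model

        def embed(self, text: Union[str, List[str]], **kwargs) -> np.ndarray:
            texts = [text] if isinstance(text, str) else list(text)
            return np.zeros((len(texts), self._size), dtype=np.float32)

A concrete backend only needs to call super().__init__() (which resolves the cache directory) and implement embed(); the size property is served by the base class from _size.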
diff --git a/llmx/utils.py b/llmx/utils.py
index c935ea7..365eed1 100644
--- a/llmx/utils.py
+++ b/llmx/utils.py
@@ -1,7 +1,7 @@
 from dataclasses import asdict
 import logging
 import json
-from typing import Any, Union, Dict
+from typing import Any, List, Tuple, Union, Dict
 import tiktoken
 from diskcache import Cache
 import hashlib
@@ -12,6 +12,10 @@ from google.oauth2 import service_account
 import requests
 import yaml
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
 
 logger = logging.getLogger("llmx")
 
@@ -72,8 +76,10 @@ def get_user_cache_dir(app_name: str) -> str:
     return cache_path
 
 
-def get_gcp_credentials(service_account_key_file: str = None, scopes: list[str] = [
-        'https://www.googleapis.com/auth/cloud-platform']):
+def get_gcp_credentials(
+    service_account_key_file: str = None,
+    scopes: list[str] = ["https://www.googleapis.com/auth/cloud-platform"],
+):
     try:
         # Attempt to use Application Default Credentials
         credentials, project_id = google.auth.default(scopes=scopes)
@@ -87,7 +93,8 @@ def get_gcp_credentials(service_account_key_file: str = None, scopes: list[str]
             "Service account key file is not set. Please set the PALM_SERVICE_ACCOUNT_KEY_FILE environment variable."
         )
         credentials = service_account.Credentials.from_service_account_file(
-            service_account_key_file, scopes=scopes)
+            service_account_key_file, scopes=scopes
+        )
         auth_req = google.auth.transport.requests.Request()
         credentials.refresh(auth_req)
         return credentials
@@ -102,7 +109,6 @@ def gcp_request(
     request_timeout: int = 60,
     **kwargs,
 ):
-
     headers = headers or {}
 
     if "key" not in url:
@@ -115,7 +121,12 @@ def gcp_request(
     headers["Content-Type"] = "application/json"
 
     response = requests.request(
-        method=method, url=url, json=body, headers=headers, timeout=request_timeout, **kwargs
+        method=method,
+        url=url,
+        json=body,
+        headers=headers,
+        timeout=request_timeout,
+        **kwargs,
     )
 
     if response.status_code not in range(200, 300):
@@ -135,35 +146,40 @@ def load_config():
         config_path = os.environ.get("LLMX_CONFIG_PATH", None)
         if config_path is None or os.path.exists(config_path) is False:
             config_path = os.path.join(
-                os.path.dirname(__file__),
-                "configs/config.default.yml")
+                os.path.dirname(__file__), "configs/config.default.yml"
+            )
             logger.info(
                 "Info: LLMX_CONFIG_PATH environment variable is not set to a valid config file. Using default config file at '%s'.",
-                config_path)
+                config_path,
+            )
         if config_path is not None:
             try:
                 with open(config_path, "r", encoding="utf-8") as f:
                     config = yaml.safe_load(f)
-                    logger.info(
-                        "Loaded config from '%s'.",
-                        config_path)
+                    logger.info("Loaded config from '%s'.", config_path)
                     return config
             except FileNotFoundError as file_not_found:
                 logger.info(
                     "Error: Config file not found at '%s'. Please check the LLMX_CONFIG_PATH environment variable. %s",
                     config_path,
-                    str(file_not_found))
+                    str(file_not_found),
+                )
            except IOError as io_error:
                 logger.info(
                     "Error: Could not read the config file at '%s'. %s",
-                    config_path, str(io_error))
+                    config_path,
+                    str(io_error),
+                )
            except yaml.YAMLError as yaml_error:
                 logger.info(
                     "Error: Malformed YAML in config file at '%s'. %s",
-                    config_path, str(yaml_error))
+                    config_path,
+                    str(yaml_error),
+                )
         else:
             logger.info(
-                "Info:LLMX_CONFIG_PATH environment variable is not set. Please set it to the path of your config file to setup your default model.")
+                "Info: LLMX_CONFIG_PATH environment variable is not set. Please set it to the path of your config file to set up your default model."
+            )
     except Exception as error:
         logger.info("Error: An unexpected error occurred: %s", str(error))
@@ -177,3 +193,77 @@ def get_models_maxtoken_dict(models_list):
         details = model["model"]["parameters"]
         models_dict[details["model"]] = model["max_tokens"]
     return models_dict
+
+
+def reduce_dimensions(
+    embeddings: np.ndarray, method: str = "pca", n_components: int = 2, **kwargs
+) -> np.ndarray:
+    """
+    Reduce the dimensionality of the embeddings using the specified method.
+
+    :param embeddings: The input embeddings as a NumPy array of shape (n_samples, n_features).
+    :param method: The dimensionality reduction method to use ('pca' or 'tsne').
+    :param n_components: The number of dimensions to reduce the embeddings to.
+    :param kwargs: Additional keyword arguments specific to the chosen method.
+    :return: The reduced embeddings as a NumPy array of shape (n_samples, n_components).
+    """
+    if method.lower() == "pca":
+        reducer = PCA(n_components=n_components, **kwargs)
+    elif method.lower() == "tsne":
+        reducer = TSNE(n_components=n_components, **kwargs)
+    else:
+        raise ValueError(f"Unsupported dimensionality reduction method: {method}")
+
+    reduced_embeddings = reducer.fit_transform(embeddings)
+    return reduced_embeddings
+
+
+def cluster_embeddings(
+    reduced_embeddings: np.ndarray,
+    method: str = "kmeans",
+    n_clusters: int = 3,
+    **kwargs,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Cluster the reduced embeddings using the specified method and return the centroids.
+
+    :param reduced_embeddings: The reduced embeddings as a NumPy array of shape (n_samples, n_components).
+    :param method: The clustering method to use ('kmeans').
+    :param n_clusters: The number of clusters for KMeans clustering.
+    :param kwargs: Additional keyword arguments specific to the chosen method.
+    :return: A tuple containing the cluster labels (shape (n_samples,)) and the centroids (shape (n_clusters, n_components)).
+    """
+    if method.lower() == "kmeans":
+        clusterer = KMeans(n_clusters=n_clusters, **kwargs)
+        clusters = clusterer.fit_predict(reduced_embeddings)
+        centroids = clusterer.cluster_centers_
+    else:
+        raise ValueError(f"Unsupported clustering method: {method}")
+
+    return clusters, centroids
+
+
+def closest_samples_to_centroids(
+    embeddings: np.ndarray, clusters: np.ndarray, centroids: np.ndarray, n: int = 1
+) -> List[List[int]]:
+    """
+    Find the indices of the n samples closest to the centroid of each cluster.
+
+    :param embeddings: The embeddings (reduced or not) as a NumPy array of shape (n_samples, n_components).
+    :param clusters: The cluster labels as a NumPy array of shape (n_samples,).
+    :param centroids: The centroids of the clusters as a NumPy array of shape (n_clusters, n_components).
+    :param n: The number of samples to find for each cluster.
+    :return: A list of lists containing the indices of the n closest samples for each cluster.
+    """
+    cluster_samples = [[] for _ in range(centroids.shape[0])]
+
+    for i, cluster in enumerate(clusters):
+        cluster_samples[cluster].append(i)
+
+    closest_samples = []
+    for i, centroid in enumerate(centroids):
+        distances = np.linalg.norm(embeddings[cluster_samples[i]] - centroid, axis=1)
+        closest_indices = np.argsort(distances)[:n]
+        closest_samples.append([cluster_samples[i][index] for index in closest_indices])
+
+    return closest_samples
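Taken together, the three helpers form a small embed -> reduce -> cluster -> summarize pipeline. A self-contained sketch on synthetic data (illustrative only, not part of the patch):

    import numpy as np

    from llmx.utils import (
        closest_samples_to_centroids,
        cluster_embeddings,
        reduce_dimensions,
    )

    # three synthetic 10-d "embedding" blobs of 10 samples each
    rng = np.random.default_rng(0)
    embeddings = np.vstack(
        [rng.normal(loc=c, scale=0.1, size=(10, 10)) for c in (0.0, 1.0, 2.0)]
    ).astype(np.float32)

    points = reduce_dimensions(embeddings, method="pca", n_components=2)
    # extra kwargs (here n_init) are forwarded to the underlying KMeans
    labels, centroids = cluster_embeddings(points, method="kmeans", n_clusters=3, n_init=10)
    reps = closest_samples_to_centroids(points, labels, centroids, n=2)
    print(reps)  # two sample indices per cluster, e.g. [[3, 7], [14, 11], [22, 28]]

Since closest_samples_to_centroids measures distances against the centroids, the embeddings passed to it must live in the same space the centroids were computed in (here, the reduced 2-d points).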
From 18ae627080af31b9cfdb99880f96817a0de88a60 Mon Sep 17 00:00:00 2001
From: Victor Dibia
Date: Thu, 2 Nov 2023 22:50:45 -0700
Subject: [PATCH 2/3] add initial cohere v1 support for embeddings

---
 llmx/embeddings/cohere_embedding.py | 85 +++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 llmx/embeddings/cohere_embedding.py

diff --git a/llmx/embeddings/cohere_embedding.py b/llmx/embeddings/cohere_embedding.py
new file mode 100644
index 0000000..2658f25
--- /dev/null
+++ b/llmx/embeddings/cohere_embedding.py
@@ -0,0 +1,85 @@
+import cohere
+from typing import Union, List
+import numpy as np
+import os
+from .text_embedding import TextEmbedding
+from ..utils import cache_request
+
+
+class CohereTextEmbedding(TextEmbedding):
+    """
+    Text embedding using Cohere models.
+
+    Attributes:
+        model (str): Name of the Cohere embedding model to use.
+        api_key (str): API key for accessing the Cohere service. Will use the
+            COHERE_API_KEY environment variable if not provided.
+    """
+
+    def __init__(
+        self,
+        model: str = "embed-english-v2.0",
+        api_key: str = os.environ.get("COHERE_API_KEY", None),
+    ):
+        super().__init__(model)
+
+        if api_key is None:
+            raise ValueError(
+                "Cohere API key is not set. Please set the COHERE_API_KEY environment variable."
+            )
+
+        self.client = cohere.Client(api_key)
+        self.model = model
+        self._size = self.get_embedding_size_from_model()
+
+    def get_embedding_size_from_model(self) -> int:
+        """
+        Get the embedding size for the current model.
+
+        Returns:
+            int: Size of the embeddings returned by the model.
+        """
+        size_map = {
+            "embed-english-v3.0": 1024,
+            "embed-multilingual-v3.0": 1024,
+            "embed-english-light-v3.0": 384,
+            "embed-multilingual-light-v3.0": 384,
+            "embed-english-v2.0": 4096,
+            "embed-english-light-v2.0": 1024,
+            "embed-multilingual-v2.0": 768,
+        }
+        size = size_map.get(self.model)
+        if not size:
+            raise ValueError("Invalid model name. Please provide a valid Cohere embed model.")
+        return size
+
+    def embed(
+        self, text: Union[str, List[str]], use_cache: bool = True, **kwargs
+    ) -> np.ndarray:
+        """
+        Compute embedding for the given text using the Cohere model.
+
+        Args:
+            text (Union[str, List[str]]): The input text(s) to compute embeddings for.
+            use_cache (bool, optional): Whether to use cached embeddings if available.
+                Defaults to True.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            np.ndarray: The computed embeddings for the input text.
+        """
+        # normalize the input to a list of strings, as the Cohere client expects
+        if isinstance(text, (str, np.ndarray)):
+            text = [text] if isinstance(text, str) else text.tolist()
+
+        cache_params = dict(text=text, model=self.model, **kwargs)
+        if use_cache:
+            response = cache_request(cache=self.cache, params=cache_params)
+            if response:
+                return np.array(response).astype(np.float32)
+
+        co_response = self.client.embed(model=self.model, texts=text)
+        embeddings = co_response.embeddings
+
+        cache_request(cache=self.cache, params=cache_params, values=embeddings)
+        return np.array(embeddings).astype(np.float32)
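A matching usage sketch for the Cohere backend (illustrative only, not part of the patch; it assumes COHERE_API_KEY is set):

    from llmx.embeddings.cohere_embedding import CohereTextEmbedding

    embedder = CohereTextEmbedding()  # defaults to embed-english-v2.0
    vectors = embedder.embed(["hello world", "goodbye world"])
    print(vectors.shape)  # (2, 4096) -- per the size_map above
    print(embedder.size)  # 4096

Note that llmx/embeddings/__init__.py still re-exports only OpenAITextEmbedding, so the import here goes through the full module path.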
From 220267f2485e106f4320cebdbdfd6526bb801899 Mon Sep 17 00:00:00 2001
From: Victor Dibia
Date: Sat, 4 Nov 2023 21:35:00 -0700
Subject: [PATCH 3/3] update dependencies, add sklearn

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index ab8adf7..a68c6c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "google.auth",
     "typer",
     "pyyaml",
+    "scikit-learn",
 ]
 
 optional-dependencies = {web = ["fastapi", "uvicorn"], transformers = ["transformers[torch]>=4.26"]}
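A quick sanity check that the new dependency resolves after reinstalling the package (illustrative; assumes a local editable install via pip install -e . from the repo root):

    import sklearn
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    print(sklearn.__version__)

The dependency is left unpinned here; a version floor such as "scikit-learn>=1.0" would guard against older releases with incompatible APIs.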