from enum import Enum


class EvaluationCateogry(Enum):
    """Categories an AI chat response can be evaluated on.

    NOTE(review): the class name is misspelled ("Cateogry"), but it is part of
    the public interface (imported by the analysis notebook and by
    inference.py), so it is preserved; use the correctly spelled
    ``EvaluationCategory`` alias below in new code.
    """
    ACCURACY = "Accuracy"
    RELEVANCE = "Relevance"
    COHERENCE = "Coherence"
    FLUENCY = "Fluency"
    DEPTH = "Depth"
    INSIGHTFULNESS = "Insightfulness"
    OBJECTIVITY = "Objectivity"
    CONTEXTUAL_APPROPRIATENESS = "ContextualAppropriateness"
    SENTIMENT = "Sentiment"


# Backward-compatible, correctly spelled alias for new callers.
EvaluationCategory = EvaluationCateogry

# Per-category instruction text injected into the system prompt.
evaluations = {
    EvaluationCateogry.ACCURACY: "Verify that the information provided about the topic is correct.",
    EvaluationCateogry.RELEVANCE: "Check that the response focuses on the topic and its implications in the given context.",
    EvaluationCateogry.COHERENCE: "Assess if the response is logically structured and easy to follow.",
    EvaluationCateogry.FLUENCY: "Evaluate the grammatical and syntactical quality of the text.",
    EvaluationCateogry.DEPTH: "Ensure the response covers the key aspects of the topic, providing a balanced depth of information.",
    EvaluationCateogry.INSIGHTFULNESS: "Look for unique insights or perspectives in the response.",
    EvaluationCateogry.OBJECTIVITY: "Check for a neutral and unbiased tone in the response.",
    EvaluationCateogry.CONTEXTUAL_APPROPRIATENESS: "Ensure the response is appropriate for the given context, including awareness of any recent developments or specific nuances.",
    EvaluationCateogry.SENTIMENT: "Assess the overall sentiment of the user's question. Possible values include POSITIVE, NEGATIVE, or NEUTRAL."
}

# System-prompt skeleton; {categories} is filled by
# generate_evaluation_system_prompt. Literal braces in the JSON example are
# doubled so str.format leaves them intact.
# NOTE(review): the example placeholders below were reconstructed — the
# original angle-bracketed placeholder text was stripped during extraction,
# leaving empty strings; confirm wording against the upstream file.
system_template = """
Evaluate the quality of the AI response to the user question based on the following categories:

{categories}

Please provide an overall summary of the quality of the response in 2 to 3 sentences. Use the provided context to inform your analysis. Also provide a score between 0 and 3 for each category, where 0 is the lowest score and 3 is the highest score. Category definitions may provide a different rating scheme which if provided should be honored. If you are unsure about a category, you can leave it blank.

Format your response in JSON format that can be parsed using Python's `json` library. Respond only with the JSON object, without any additional text or comments or Markdown code block delimiters.

Example response format:

{{
    "evaluation": "<2 to 3 sentence overall evaluation>",
    "scores": {{
        "<category name>": "<category score>",
        "<category name>": "<category score>",
        ...etc.
    }}
}}
"""


def generate_evaluation_system_prompt(categories: list[EvaluationCateogry]) -> str:
    """Build the system prompt used to evaluate an AI-generated response.

    Parameters:
    - categories (list[EvaluationCateogry]): The categories to evaluate the
      response on.

    Returns:
    - str: The fully formatted system prompt.

    Raises:
    - ValueError: If ``categories`` is None or empty.
    """
    if not categories:
        raise ValueError("At least one EvaluationCategory must be specified.")

    formatted = [f"**{category.value}**:\n{evaluations[category]}" for category in categories]
    return system_template.format(categories="\n\n".join(formatted))
import os

import pandas as pd


def get_container_client():
    """Create the Cosmos DB container client from environment configuration.

    Reads COSMOSDB_ENDPOINT, COSMOSDB_KEY, COSMOSDB_DATABASE_NAME and
    COSMOSDB_CONTAINER_NAME from the environment; load the .env file (e.g.
    via dotenv in the notebook) before calling.
    """
    # Imported lazily so the pure-pandas helpers in this module remain usable
    # without the Azure SDK installed.
    from azure.cosmos import CosmosClient

    endpoint = os.environ.get("COSMOSDB_ENDPOINT")
    key = os.environ.get("COSMOSDB_KEY")
    database_name = os.environ.get("COSMOSDB_DATABASE_NAME")
    container_name = os.environ.get("COSMOSDB_CONTAINER_NAME")

    client = CosmosClient(endpoint, key)
    database = client.get_database_client(database_name)
    return database.get_container_client(container_name)


def get_conversations(start_date=None, end_date=None):
    """Get conversation turns from Cosmos DB, newest first.

    Parameters:
    - start_date, end_date: optional datetime-like bounds (inclusive). Either
      or both may be None; both None returns everything.

    Returns:
    - Iterable of conversation-turn dicts.
    """
    container = get_container_client()

    query_template = """
    SELECT c.id, c.timestamp, c.response_timestamp, c.user_input as user_query, c.conversation_id, c.tool as context, c.answer as chat_response
    FROM c
    {where_clause}
    ORDER BY c.timestamp DESC
    """

    # Use a parameterized WHERE clause rather than interpolating the dates
    # into the SQL text (avoids injection and quoting problems).
    if start_date and end_date:
        where_clause = "WHERE c.timestamp BETWEEN @start_date AND @end_date"
    elif start_date:
        where_clause = "WHERE c.timestamp >= @start_date"
    elif end_date:
        where_clause = "WHERE c.timestamp <= @end_date"
    else:
        where_clause = ""

    parameters = []
    if start_date:
        parameters.append({"name": "@start_date", "value": start_date.strftime("%Y-%m-%d %H:%M:%S")})
    if end_date:
        parameters.append({"name": "@end_date", "value": end_date.strftime("%Y-%m-%d %H:%M:%S")})

    query = query_template.format(where_clause=where_clause)
    return container.query_items(query=query, parameters=parameters, enable_cross_partition_query=True)


def _extract_answer(chat_response):
    """Pull the assistant's text out of one raw chat_response payload, or None."""
    if not pd.notnull(chat_response):
        return None
    try:
        return chat_response['choices'][0]['messages'][0]['content']
    except (KeyError, IndexError, TypeError):
        return None


def extend_dataframe(df):
    """Add derived columns to the raw conversation dataframe, in place.

    Adds: user_input, answer (promoted from the nested payloads), duration
    (seconds between request and response), and turn_count (turns per
    conversation_id). Mutates ``df``; returns None.
    """
    # "Promote" content from the nested user_query / chat_response payloads.
    df['user_input'] = df['user_query'].apply(
        lambda q: q['content'] if pd.notnull(q) and 'content' in q else None)
    df['answer'] = df['chat_response'].apply(_extract_answer)

    # Response time in seconds; unparseable timestamps become NaT -> NaN.
    df['response_timestamp'] = pd.to_datetime(df['response_timestamp'], errors='coerce')
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df['duration'] = (df['response_timestamp'] - df['timestamp']).dt.total_seconds()

    # Turns per conversation via a single groupby (the previous per-row
    # filtering was O(n^2)).
    df['turn_count'] = df.groupby('conversation_id')['conversation_id'].transform('count')
Also provide the required Azure OpenAI values if you plan to analyze conversation data in the AI Analysis section."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Get conversations for a date range (also supports only one date being provided)\n", + "start_date = pd.to_datetime(\"2024-01-30\")\n", + "end_date = pd.to_datetime(\"2024-01-31\")\n", + "\n", + "# Uncomment to use None for start_date and end_date to get all conversations\n", + "# start_date = None\n", + "# end_date = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query Cosmos DB" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "159 conversation turns from date range 2024-01-30 00:00:00 to 2024-01-31 00:00:00 found\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_response
08fdd6606-2cfb-4feb-999f-9b7907488ac42024-01-30T23:33:21.0206662024-01-30T23:33:40.133918{'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...5e815061-db1b-6c40-e461-0edc8b8ea8ac{'citations': [{'content': 'Title: Accurate Vi...{'id': '', 'model': 'gpt-4', 'created': 170665...
190fe45a4-2c3e-4556-af91-ee0e2fa95b1c2024-01-30T23:32:29.742239NaN{'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...7232ea15-8bc1-0299-dd24-b63772f1f678NaNNaN
29921095d-f409-4db5-a2fd-a1e52edfc59d2024-01-30T23:11:26.7115462024-01-30T23:11:46.184943{'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30T23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30T23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30T23:11:26.711546 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-30T23:33:40.133918 \n", + "1 NaN \n", + "2 2024-01-30T23:11:46.184943 \n", + "\n", + " user_query \\\n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", + "\n", + " conversation_id \\\n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "\n", + " chat_response \n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = get_conversations(start_date=start_date, end_date=end_date)\n", + "df = pd.DataFrame(dataset)\n", + "\n", + "row_count = df.shape[0]\n", + "print(f\"{row_count} conversation turns from date range {start_date} to {end_date} found\")\n", + "\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extend with calculated colums" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_responseuser_inputanswerdurationturn_count
08fdd6606-2cfb-4feb-999f-9b7907488ac42024-01-30 23:33:21.0206662024-01-30 23:33:40.133918{'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...5e815061-db1b-6c40-e461-0edc8b8ea8ac{'citations': [{'content': 'Title: Accurate Vi...{'id': '', 'model': 'gpt-4', 'created': 170665...Summarize the main three points of Peter's key...Peter Lee's keynote address at the Microsoft R...19.1132521
190fe45a4-2c3e-4556-af91-ee0e2fa95b1c2024-01-30 23:32:29.742239NaT{'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...7232ea15-8bc1-0299-dd24-b63772f1f678NaNNaNCan you summarize the key challenges tackled b...NoneNaN1
29921095d-f409-4db5-a2fd-a1e52edfc59d2024-01-30 23:11:26.7115462024-01-30 23:11:46.184943{'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...tell me about msr chatMicrosoft Research (MSR) has been exploring th...19.4733972
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30 23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30 23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30 23:11:26.711546 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-30 23:33:40.133918 \n", + "1 NaT \n", + "2 2024-01-30 23:11:46.184943 \n", + "\n", + " user_query \\\n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", + "\n", + " conversation_id \\\n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "\n", + " chat_response \\\n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "\n", + " user_input \\\n", + "0 Summarize the main three points of Peter's key... \n", + "1 Can you summarize the key challenges tackled b... \n", + "2 tell me about msr chat \n", + "\n", + " answer duration turn_count \n", + "0 Peter Lee's keynote address at the Microsoft R... 19.113252 1 \n", + "1 None NaN 1 \n", + "2 Microsoft Research (MSR) has been exploring th... 
19.473397 2 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Adds some calculated columns to the dataframe like 'duration', 'turn_count' etc.\n", + "extend_dataframe(df)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyzing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save to Excel\n", + "\n", + "Due the Excel's **limit of 32,767 characters per cell**, saving the DataFrame as an .xlsx file may yield unexpected and innacurate results when analyzed. Depending on the nature of your data it may be better to execute the cells in the `AI Analysis` section to analyze conversations using GPT-4." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'chat_history.xlsx'\n", + "\n", + "# Create a new DataFrame from the results\n", + "out_dataset = pd.DataFrame(df, columns=[\n", + " 'id', \n", + " 'conversation_id', \n", + " 'turn_count', \n", + " 'timestamp', \n", + " 'response_timestamp', \n", + " 'duration', \n", + " 'user_input', \n", + " 'answer', \n", + " 'context'\n", + "])\n", + "\n", + "# Write the new DataFrame to a new Excel file\n", + "output_file_path = filename\n", + "out_dataset.to_excel(output_file_path, index=False)\n", + "\n", + "out_dataset.head(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AI Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating chat response for 8fdd6606-2cfb-4feb-999f-9b7907488ac4 ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c ...\n", + "\u001b[91mERROR processing id 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c: The user_input, context, and answer fields must be provided.\u001b[0m\n", + 
"Evaluating chat response for 9921095d-f409-4db5-a2fd-a1e52edfc59d ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for f1242f7c-41f1-4608-8141-b46de131879a ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for 71375972-e0e5-470a-a301-100a4f401481 ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for eefd1c18-f176-46ca-bea6-0218f04519b6 ...\n", + "\u001b[91mERROR processing id eefd1c18-f176-46ca-bea6-0218f04519b6: The user_input, context, and answer fields must be provided.\u001b[0m\n", + "Evaluating chat response for aade9ab3-38e2-401e-ba7e-66bbf7756017 ...\n", + "Processing evaulation response ...\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_responseuser_inputanswerdurationturn_countevaluationscores
08fdd6606-2cfb-4feb-999f-9b7907488ac42024-01-30 23:33:21.0206662024-01-30 23:33:40.133918{'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...5e815061-db1b-6c40-e461-0edc8b8ea8ac{'citations': [{'content': 'Title: Accurate Vi...{'id': '', 'model': 'gpt-4', 'created': 170665...Summarize the main three points of Peter's key...Peter Lee's keynote address at the Microsoft R...19.1132521The response provided does not contain any ver...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"...
190fe45a4-2c3e-4556-af91-ee0e2fa95b1c2024-01-30 23:32:29.742239NaT{'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...7232ea15-8bc1-0299-dd24-b63772f1f678NaNNaNCan you summarize the key challenges tackled b...NoneNaN1nannan
29921095d-f409-4db5-a2fd-a1e52edfc59d2024-01-30 23:11:26.7115462024-01-30 23:11:46.184943{'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...tell me about msr chatMicrosoft Research (MSR) has been exploring th...19.4733972The response lacks accuracy and relevance as i...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": 3}
3f1242f7c-41f1-4608-8141-b46de131879a2024-01-30 23:11:09.0442422024-01-30 23:11:17.530884{'id': '61a0fe48-9478-e112-ebd4-c5f3c35cb5e5',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...msrIt seems like your question is incomplete. Cou...8.4866422The AI response is not applicable as it was pr...{\"accuracy\": 3, \"relevance\": 3, \"sentiment\": \"...
471375972-e0e5-470a-a301-100a4f4014812024-01-30 23:06:03.0881732024-01-30 23:06:33.381400{'id': '0804017d-1858-2bbd-82d7-00821fa74f1a',...0804017d-1858-2bbd-82d7-00821fa74f1a{'citations': [{'content': 'Transcript - Light...{'id': '', 'model': 'gpt-4', 'created': 170665...Can you summarize the key challenges tackled b...Microsoft Research (MSR) addressed several key...30.2932271The response contains fabricated references (e...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"...
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30 23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30 23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30 23:11:26.711546 \n", + "3 f1242f7c-41f1-4608-8141-b46de131879a 2024-01-30 23:11:09.044242 \n", + "4 71375972-e0e5-470a-a301-100a4f401481 2024-01-30 23:06:03.088173 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-30 23:33:40.133918 \n", + "1 NaT \n", + "2 2024-01-30 23:11:46.184943 \n", + "3 2024-01-30 23:11:17.530884 \n", + "4 2024-01-30 23:06:33.381400 \n", + "\n", + " user_query \\\n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", + "3 {'id': '61a0fe48-9478-e112-ebd4-c5f3c35cb5e5',... \n", + "4 {'id': '0804017d-1858-2bbd-82d7-00821fa74f1a',... \n", + "\n", + " conversation_id \\\n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "3 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "4 0804017d-1858-2bbd-82d7-00821fa74f1a \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "3 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "4 {'citations': [{'content': 'Transcript - Light... \n", + "\n", + " chat_response \\\n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "3 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "4 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "\n", + " user_input \\\n", + "0 Summarize the main three points of Peter's key... \n", + "1 Can you summarize the key challenges tackled b... 
\n", + "2 tell me about msr chat \n", + "3 msr \n", + "4 Can you summarize the key challenges tackled b... \n", + "\n", + " answer duration turn_count \\\n", + "0 Peter Lee's keynote address at the Microsoft R... 19.113252 1 \n", + "1 None NaN 1 \n", + "2 Microsoft Research (MSR) has been exploring th... 19.473397 2 \n", + "3 It seems like your question is incomplete. Cou... 8.486642 2 \n", + "4 Microsoft Research (MSR) addressed several key... 30.293227 1 \n", + "\n", + " evaluation \\\n", + "0 The response provided does not contain any ver... \n", + "1 nan \n", + "2 The response lacks accuracy and relevance as i... \n", + "3 The AI response is not applicable as it was pr... \n", + "4 The response contains fabricated references (e... \n", + "\n", + " scores \n", + "0 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"... \n", + "1 nan \n", + "2 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": 3} \n", + "3 {\"accuracy\": 3, \"relevance\": 3, \"sentiment\": \"... \n", + "4 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Categories to prompt the AI to evaluate on. 
import os
import json
from typing import Any

import openai
from pandas import Series, isna

from eval_prompt import EvaluationCateogry, generate_evaluation_system_prompt

# Azure OpenAI API version used for chat completions.
API_VERSION = "2023-08-01-preview"


def _configure_openai():
    """Point the openai client at the Azure resource described by the environment.

    Configuration is read at call time rather than import time: the analysis
    notebook imports this module *before* it calls load_dotenv(), so reading
    os.environ during import would see unpopulated values.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    resource = os.getenv("AZURE_OPENAI_RESOURCE")

    openai.api_type = "azure"
    openai.api_base = endpoint if endpoint else f"https://{resource}.openai.azure.com/"
    openai.api_version = API_VERSION
    openai.api_key = os.getenv("AZURE_OPENAI_KEY")


def evaluate_chat_response(row: Series, categories: list[EvaluationCateogry]) -> tuple[str, dict[str, Any]]:
    """Ask the model to evaluate one conversation turn.

    Parameters:
    - row (Series): Dataframe row with 'id', 'user_input', 'context' and
      'answer' fields (as produced by history.extend_dataframe).
    - categories (list[EvaluationCateogry]): Categories to evaluate on.

    Returns:
    - tuple[str, dict[str, Any]]: (evaluation summary, per-category scores).

    Raises:
    - ValueError: If user_input, context or answer is missing/NaN.
    """
    print(f"Evaluating chat response for {row['id']} ...")

    user_input = row['user_input']
    context = row['context']
    answer = row['answer']

    if not user_input or not context or not answer or isna(user_input) or isna(context) or isna(answer):
        raise ValueError("The user_input, context, and answer fields must be provided.")

    _configure_openai()

    system_message = generate_evaluation_system_prompt(categories)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": answer},
        # f-string fix: the original used a plain string literal, so the
        # literal text "{context}" was sent and the model never saw the
        # actual context it was asked to evaluate against.
        {"role": "user", "content": f"# Context\n\nThe previous AI response was based on this context:\n\n{context}\n\nEvaluation:\n"}
    ]

    # Optional tuning values fall back to sane defaults so a missing env var
    # no longer crashes the float()/int() conversion; the stop sequence is now
    # actually honored (it was read but stop=None was hardcoded before).
    response = openai.ChatCompletion.create(
        engine=os.getenv("AZURE_OPENAI_MODEL"),
        messages=messages,
        temperature=float(os.getenv("AZURE_OPENAI_TEMPERATURE") or 0),
        max_tokens=int(os.getenv("AZURE_OPENAI_MAX_TOKENS") or 1000),
        top_p=float(os.getenv("AZURE_OPENAI_TOP_P") or 1.0),
        stop=os.getenv("AZURE_OPENAI_STOP_SEQUENCE") or None,
        stream=False
    )

    return process_response(response)


def process_response(response):
    """Parse the model's JSON reply into (evaluation, scores).

    Raises json.JSONDecodeError / KeyError if the model did not follow the
    requested response format.
    """
    print("Processing evaluation response ...")

    result = json.loads(response["choices"][0]["message"]["content"])
    return result['evaluation'], result['scores']