From 552aa246e5eabde5ac24b54aa03283b2c2bb29ba Mon Sep 17 00:00:00 2001 From: Bryan Scheurman Date: Tue, 30 Jan 2024 16:14:20 -0800 Subject: [PATCH 1/4] Added notebook and helper functions to get conversations data and create an Excel file for analysis --- analysis/chat_history/.env.example | 4 + analysis/chat_history/history.py | 63 +++ analysis/chat_history/history_analysis.ipynb | 397 +++++++++++++++++++ 3 files changed, 464 insertions(+) create mode 100644 analysis/chat_history/.env.example create mode 100644 analysis/chat_history/history.py create mode 100644 analysis/chat_history/history_analysis.ipynb diff --git a/analysis/chat_history/.env.example b/analysis/chat_history/.env.example new file mode 100644 index 0000000000..39382da0d0 --- /dev/null +++ b/analysis/chat_history/.env.example @@ -0,0 +1,4 @@ +COSMOSDB_ENDPOINT= +COSMOSDB_KEY= +COSMOSDB_DATABASE_NAME= +COSMOSDB_CONTAINER_NAME= \ No newline at end of file diff --git a/analysis/chat_history/history.py b/analysis/chat_history/history.py new file mode 100644 index 0000000000..d5a8131413 --- /dev/null +++ b/analysis/chat_history/history.py @@ -0,0 +1,63 @@ +import os +from dotenv import load_dotenv +from azure.cosmos import CosmosClient +import pandas as pd + +def get_container_client(): + """Get the Cosmos DB container client.""" + + # Read the Cosmos DB settings from environment variables + endpoint = os.environ.get("COSMOSDB_ENDPOINT") + key = os.environ.get("COSMOSDB_KEY") + database_name = os.environ.get("COSMOSDB_DATABASE_NAME") + container_name = os.environ.get("COSMOSDB_CONTAINER_NAME") + + # Initialize the Cosmos DB client + client = CosmosClient(endpoint, key) + database = client.get_database_client(database_name) + container = database.get_container_client(container_name) + + return container + +def got_conversations(start_date = None, end_date = None): + """Get the chat history from Cosmos DB.""" + + container = get_container_client() + + query_template = """ + SELECT c.id, c.timestamp, 
c.response_timestamp, c.user_input as user_query, c.conversation_id, c.tool as context, c.answer as chat_response + FROM c + {where_clause} + ORDER BY c.timestamp DESC + """ + + if start_date: + start_date = start_date.strftime("%Y-%m-%d %H:%M:%S") + if end_date: + end_date = end_date.strftime("%Y-%m-%d %H:%M:%S") + + where_clause = "" + if start_date and end_date: + where_clause = f"WHERE c.timestamp BETWEEN '{start_date}' AND '{end_date}'" + elif start_date: + where_clause = f"WHERE c.timestamp >= '{start_date}'" + elif end_date: + where_clause = f"WHERE c.timestamp <= '{end_date}'" + + query = query_template.format(where_clause=where_clause) + + items = container.query_items(query, enable_cross_partition_query=True) + return items + +def extend_dataframe(df): + # "Promote" the content form the user_query and chat_response fields to the top level of the dataframe + df['user_input'] = df['user_query'].apply(lambda x: x['content'] if pd.notnull(x) and 'content' in x else None) + df['answer'] = df['chat_response'].apply(lambda x: x['choices'][0]['messages'][0]['content'] if pd.notnull(x) and 'choices' in x and len(x['choices']) > 0 and 'messages' in x['choices'][0] and len(x['choices'][0]['messages']) > 0 and 'content' in x['choices'][0]['messages'][0] else None) + + # Calculate the response time + df['response_timestamp'] = pd.to_datetime(df['response_timestamp'], errors='coerce') + df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce') + df['duration'] = (df['response_timestamp'] - df['timestamp']).dt.total_seconds() + + # Calculate number of turns for each 'conversation_id' + df['turn_count'] = df['conversation_id'].apply(lambda x: len(df[df['conversation_id'] == x])) diff --git a/analysis/chat_history/history_analysis.ipynb b/analysis/chat_history/history_analysis.ipynb new file mode 100644 index 0000000000..4953da72c3 --- /dev/null +++ b/analysis/chat_history/history_analysis.ipynb @@ -0,0 +1,397 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Setup and init" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Environment and dependencies\n", + "\n", + "1. Create and activate a vitual env, e.g., `python -m venv .venv`\n", + "1. Install dependencies with `pip install -r requirements-dev.txt`\n", + "1. Copy `.env.example` and rename to `.env`.\n", + " 1. Provide the required values using the target Cosmos DB that has the conversations data you're interested in." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get conversation data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from history import *\n", + "from dotenv import load_dotenv\n", + "import pandas as pd\n", + "import json\n", + "\n", + "load_dotenv() " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Date filtering\n", + "\n", + "Results can be filtered by start and/or end dates. Providing a value for both will return conversations between the two dates, inclusive. Passing `None` for one and a value for the other will produce a before or after filter accordingly. Passing `None` for both will return all results. \n", + "\n", + "Note that dates can be simple, like `2024-01-30`, but can also be more targeted by providing time details, like `2024-01-30T10:15:00Z`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Get conversations for a date range (also supports only one date being provided)\n", + "start_date = pd.to_datetime(\"2024-01-29\")\n", + "end_date = pd.to_datetime(\"2024-01-30\")\n", + "\n", + "# Uncomment to use None for start_date and end_date to get all conversations\n", + "# start_date = None\n", + "# end_date = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_response
0c515b325-31d8-4902-8eb0-bfdf8aa915512024-01-29T23:59:31.2060752024-01-29T23:59:54.315737{'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',...ecccf511-35f3-a6e8-2881-02af2addf17a{'citations': [{'content': 'Microsoft Research...{'id': '', 'model': 'gpt-4', 'created': 170657...
1030777f3-f05a-4177-bd8a-57ff4f4182622024-01-29T23:57:41.4057272024-01-29T23:58:13.572655{'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',...fcdd1c9d-dff4-73c4-a137-9dba99990983{'citations': [{'content': '. \n", + "GPT -4 and the ...{'id': '', 'model': 'gpt-4', 'created': 170657...
26e8e6bd7-56e8-4abd-845e-a5e6649d8a8a2024-01-29T23:56:22.6749962024-01-29T23:57:08.228601{'id': '508917c2-bc62-ecda-bb98-d6f940379334',...c3cb145e-63be-cc87-7cba-4ca2f15d0f78{'citations': [{'content': 'Title: Research Fo...{'id': '', 'model': 'gpt-4', 'created': 170657...
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 c515b325-31d8-4902-8eb0-bfdf8aa91551 2024-01-29T23:59:31.206075 \n", + "1 030777f3-f05a-4177-bd8a-57ff4f418262 2024-01-29T23:57:41.405727 \n", + "2 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a 2024-01-29T23:56:22.674996 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-29T23:59:54.315737 \n", + "1 2024-01-29T23:58:13.572655 \n", + "2 2024-01-29T23:57:08.228601 \n", + "\n", + " user_query \\\n", + "0 {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',... \n", + "1 {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',... \n", + "2 {'id': '508917c2-bc62-ecda-bb98-d6f940379334',... \n", + "\n", + " conversation_id \\\n", + "0 ecccf511-35f3-a6e8-2881-02af2addf17a \n", + "1 fcdd1c9d-dff4-73c4-a137-9dba99990983 \n", + "2 c3cb145e-63be-cc87-7cba-4ca2f15d0f78 \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Microsoft Research... \n", + "1 {'citations': [{'content': '. \n", + "GPT -4 and the ... \n", + "2 {'citations': [{'content': 'Title: Research Fo... \n", + "\n", + " chat_response \n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "1 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170657... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = got_conversations(start_date=start_date, end_date=end_date)\n", + "df = pd.DataFrame(dataset)\n", + "\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extend with calculated colums" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_responseuser_inputanswerdurationturn_count
0c515b325-31d8-4902-8eb0-bfdf8aa915512024-01-29 23:59:31.2060752024-01-29 23:59:54.315737{'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',...ecccf511-35f3-a6e8-2881-02af2addf17a{'citations': [{'content': 'Microsoft Research...{'id': '', 'model': 'gpt-4', 'created': 170657...What kind of problems is MSR's AI research try...Microsoft Research's AI research is focused on...23.1096621
1030777f3-f05a-4177-bd8a-57ff4f4182622024-01-29 23:57:41.4057272024-01-29 23:58:13.572655{'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',...fcdd1c9d-dff4-73c4-a137-9dba99990983{'citations': [{'content': '. \n", + "GPT -4 and the ...{'id': '', 'model': 'gpt-4', 'created': 170657...Can you summarize the key challenges tackled b...The Microsoft Research Forum for this year add...32.1669281
26e8e6bd7-56e8-4abd-845e-a5e6649d8a8a2024-01-29 23:56:22.6749962024-01-29 23:57:08.228601{'id': '508917c2-bc62-ecda-bb98-d6f940379334',...c3cb145e-63be-cc87-7cba-4ca2f15d0f78{'citations': [{'content': 'Title: Research Fo...{'id': '', 'model': 'gpt-4', 'created': 170657...what datasets were used in novel ways by Micro...In the NeurIPS 2023 submissions, Microsoft res...45.5536054
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 c515b325-31d8-4902-8eb0-bfdf8aa91551 2024-01-29 23:59:31.206075 \n", + "1 030777f3-f05a-4177-bd8a-57ff4f418262 2024-01-29 23:57:41.405727 \n", + "2 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a 2024-01-29 23:56:22.674996 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-29 23:59:54.315737 \n", + "1 2024-01-29 23:58:13.572655 \n", + "2 2024-01-29 23:57:08.228601 \n", + "\n", + " user_query \\\n", + "0 {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',... \n", + "1 {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',... \n", + "2 {'id': '508917c2-bc62-ecda-bb98-d6f940379334',... \n", + "\n", + " conversation_id \\\n", + "0 ecccf511-35f3-a6e8-2881-02af2addf17a \n", + "1 fcdd1c9d-dff4-73c4-a137-9dba99990983 \n", + "2 c3cb145e-63be-cc87-7cba-4ca2f15d0f78 \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Microsoft Research... \n", + "1 {'citations': [{'content': '. \n", + "GPT -4 and the ... \n", + "2 {'citations': [{'content': 'Title: Research Fo... \n", + "\n", + " chat_response \\\n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "1 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "\n", + " user_input \\\n", + "0 What kind of problems is MSR's AI research try... \n", + "1 Can you summarize the key challenges tackled b... \n", + "2 what datasets were used in novel ways by Micro... \n", + "\n", + " answer duration turn_count \n", + "0 Microsoft Research's AI research is focused on... 23.109662 1 \n", + "1 The Microsoft Research Forum for this year add... 32.166928 1 \n", + "2 In the NeurIPS 2023 submissions, Microsoft res... 
45.553605 4 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Adds some calculated columns to the dataframe like 'duration', 'turn_count' etc.\n", + "extend_dataframe(df)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save to Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'chat_history.xlsx'\n", + "\n", + "# Create a new DataFrame from the results\n", + "out_dataset = pd.DataFrame(df, columns=[\n", + " 'id', \n", + " 'conversation_id', \n", + " 'turn_count', \n", + " 'timestamp', \n", + " 'response_timestamp', \n", + " 'duration', \n", + " 'user_input', \n", + " 'answer', \n", + " 'context'\n", + "])\n", + "\n", + "# Write the new DataFrame to a new Excel file\n", + "output_file_path = filename\n", + "out_dataset.to_excel(output_file_path, index=False)\n", + "\n", + "out_dataset.head(4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d0628b521fcd2840a712d775891815ccee45ad74 Mon Sep 17 00:00:00 2001 From: Bryan Scheurman Date: Tue, 30 Jan 2024 16:17:00 -0800 Subject: [PATCH 2/4] Clarify --- analysis/chat_history/history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/chat_history/history.py b/analysis/chat_history/history.py index d5a8131413..d9ed35a6ca 100644 --- a/analysis/chat_history/history.py +++ b/analysis/chat_history/history.py @@ -50,7 +50,7 @@ def got_conversations(start_date = None, end_date = None): return items def extend_dataframe(df): - # "Promote" 
the content form the user_query and chat_response fields to the top level of the dataframe + # "Promote" the content form the user_query and chat_response columns to the top level of the dataframe df['user_input'] = df['user_query'].apply(lambda x: x['content'] if pd.notnull(x) and 'content' in x else None) df['answer'] = df['chat_response'].apply(lambda x: x['choices'][0]['messages'][0]['content'] if pd.notnull(x) and 'choices' in x and len(x['choices']) > 0 and 'messages' in x['choices'][0] and len(x['choices'][0]['messages']) > 0 and 'content' in x['choices'][0]['messages'][0] else None) From 14383b3c73da47cf515ef2d3b740c7d6d0dfbe4a Mon Sep 17 00:00:00 2001 From: Bryan Scheurman Date: Tue, 30 Jan 2024 18:22:27 -0800 Subject: [PATCH 3/4] Fixed typo and improved lamba formatting for readability --- analysis/chat_history/history.py | 12 ++++++++++-- analysis/chat_history/history_analysis.ipynb | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/analysis/chat_history/history.py b/analysis/chat_history/history.py index d9ed35a6ca..ef693f3912 100644 --- a/analysis/chat_history/history.py +++ b/analysis/chat_history/history.py @@ -19,7 +19,7 @@ def get_container_client(): return container -def got_conversations(start_date = None, end_date = None): +def get_conversations(start_date = None, end_date = None): """Get the chat history from Cosmos DB.""" container = get_container_client() @@ -52,7 +52,15 @@ def got_conversations(start_date = None, end_date = None): def extend_dataframe(df): # "Promote" the content form the user_query and chat_response columns to the top level of the dataframe df['user_input'] = df['user_query'].apply(lambda x: x['content'] if pd.notnull(x) and 'content' in x else None) - df['answer'] = df['chat_response'].apply(lambda x: x['choices'][0]['messages'][0]['content'] if pd.notnull(x) and 'choices' in x and len(x['choices']) > 0 and 'messages' in x['choices'][0] and len(x['choices'][0]['messages']) > 0 and 'content' in 
x['choices'][0]['messages'][0] else None) + df['answer'] = df['chat_response'].apply( + lambda x: x['choices'][0]['messages'][0]['content'] + if pd.notnull(x) + and 'choices' in x + and len(x['choices']) > 0 + and 'messages' in x['choices'][0] + and len(x['choices'][0]['messages']) > 0 + and 'content' in x['choices'][0]['messages'][0] + else None) # Calculate the response time df['response_timestamp'] = pd.to_datetime(df['response_timestamp'], errors='coerce') diff --git a/analysis/chat_history/history_analysis.ipynb b/analysis/chat_history/history_analysis.ipynb index 4953da72c3..02903fe8ff 100644 --- a/analysis/chat_history/history_analysis.ipynb +++ b/analysis/chat_history/history_analysis.ipynb @@ -183,7 +183,7 @@ } ], "source": [ - "dataset = got_conversations(start_date=start_date, end_date=end_date)\n", + "dataset = get_conversations(start_date=start_date, end_date=end_date)\n", "df = pd.DataFrame(dataset)\n", "\n", "df.head(3)" From 5069ca9b84d253b81efa7f6319dc89577b784078 Mon Sep 17 00:00:00 2001 From: Bryan Scheurman Date: Wed, 31 Jan 2024 17:25:34 -0800 Subject: [PATCH 4/4] Enhanced the history analysis with scripts to prompt an AOAI endpoint with conversation details and context --- analysis/chat_history/.env.example | 12 +- analysis/chat_history/eval_prompt.py | 61 +++ analysis/chat_history/history_analysis.ipynb | 516 +++++++++++++++---- analysis/chat_history/inference.py | 61 +++ 4 files changed, 536 insertions(+), 114 deletions(-) create mode 100644 analysis/chat_history/eval_prompt.py create mode 100644 analysis/chat_history/inference.py diff --git a/analysis/chat_history/.env.example b/analysis/chat_history/.env.example index 39382da0d0..c0a5d99123 100644 --- a/analysis/chat_history/.env.example +++ b/analysis/chat_history/.env.example @@ -1,4 +1,14 @@ COSMOSDB_ENDPOINT= COSMOSDB_KEY= COSMOSDB_DATABASE_NAME= -COSMOSDB_CONTAINER_NAME= \ No newline at end of file +COSMOSDB_CONTAINER_NAME= + +AZURE_OPENAI_TEMPERATURE= +AZURE_OPENAI_TOP_P= 
+AZURE_OPENAI_MAX_TOKENS= +AZURE_OPENAI_STOP_SEQUENCE= +AZURE_OPENAI_RESOURCE= +AZURE_OPENAI_MODEL= +AZURE_OPENAI_MODEL_NAME= +AZURE_OPENAI_ENDPOINT= +AZURE_OPENAI_KEY= \ No newline at end of file diff --git a/analysis/chat_history/eval_prompt.py b/analysis/chat_history/eval_prompt.py new file mode 100644 index 0000000000..e44d2b2c21 --- /dev/null +++ b/analysis/chat_history/eval_prompt.py @@ -0,0 +1,61 @@ +from enum import Enum + +class EvaluationCateogry(Enum): + ACCURACY = "Accuracy" + RELEVANCE = "Relevance" + COHERENCE = "Coherence" + FLUENCY = "Fluency" + DEPTH = "Depth" + INSIGHTFULNESS = "Insightfulness" + OBJECTIVITY = "Objectivity" + CONTEXTUAL_APPROPRIATENESS = "ContextualAppropriateness" + SENTIMENT = "Sentiment" + +evaluations = { + EvaluationCateogry.ACCURACY: "Verify that the information provided about the topic is correct.", + EvaluationCateogry.RELEVANCE: "Check that the response focuses on the topic and its implications in the given context.", + EvaluationCateogry.COHERENCE: "Assess if the response is logically structured and easy to follow.", + EvaluationCateogry.FLUENCY: "Evaluate the grammatical and syntactical quality of the text.", + EvaluationCateogry.DEPTH: "Ensure the response covers the key aspects of the topic, providing a balanced depth of information.", + EvaluationCateogry.INSIGHTFULNESS: "Look for unique insights or perspectives in the response.", + EvaluationCateogry.OBJECTIVITY: "Check for a neutral and unbiased tone in the response.", + EvaluationCateogry.CONTEXTUAL_APPROPRIATENESS: "Ensure the response is appropriate for the given context, including awareness of any recent developments or specific nuances.", + EvaluationCateogry.SENTIMENT: "Assess the overall sentiment of the user's question. Possible values include POSITIVE, NEGATIVE, or NEUTRAL." 
+} + +system_template = """ +Evaluate the quality of the AI response to the user question based on the following categories: + +{categories} + +Please provide an overall summary of the quality of the response in 2 to 3 sentences. Use the provided context to inform your analysis. Also provide a score between 0 and 3 for each category, where 0 is the lowest score and 3 is the highest score. Category definitions may provide a different rating scheme which if provided should be honored. If you are unsure about a category, you can leave it blank. + +Format your response in JSON format that can be parsed using Python's `json` library. Respond only with the JSON object, without any additional text or comments or Markdown code block delimiters. + +Example response format: + +{{ + "evaluation": "", + "scores": {{ + "": "", + "": "", + ...etc. + }} +}} +""" + +def generate_evaluation_system_prompt(categories: list[EvaluationCateogry]) -> str: + """ + Generate a prompt to evaluate the quality of an AI-generated response based on the specified categories. + + Parameters: + - categories (list[EvaluationCateogry]): The categories to evaluate the response on. 
+ """ + + if not categories or len(categories) == 0: + raise ValueError("At least one EvaluationCategory must be specified.") + + formatted = [f"**{category.value}**:\n{evaluations[category]}" for category in categories] + prompt = system_template.format(categories="\n\n".join(formatted)) + + return prompt \ No newline at end of file diff --git a/analysis/chat_history/history_analysis.ipynb b/analysis/chat_history/history_analysis.ipynb index 02903fe8ff..66185735b5 100644 --- a/analysis/chat_history/history_analysis.ipynb +++ b/analysis/chat_history/history_analysis.ipynb @@ -4,38 +4,52 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup and init" + "# Setup and init" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Environment and dependencies\n", + "## Environment and dependencies\n", "\n", "1. Create and activate a vitual env, e.g., `python -m venv .venv`\n", "1. Install dependencies with `pip install -r requirements-dev.txt`\n", "1. Copy `.env.example` and rename to `.env`.\n", - " 1. Provide the required values using the target Cosmos DB that has the conversations data you're interested in." + " 1. Provide the required values using the target Cosmos DB that has the conversations data you're interested in.\n", + " 1. Also provide the required Azure OpenAI values if you plan to analyze conversations data in the AI Analysis section." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Get conversation data" + "# Get conversation data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from history import *\n", + "import json\n", "from dotenv import load_dotenv\n", "import pandas as pd\n", - "import json\n", + "from history import get_conversations, extend_dataframe\n", + "from inference import evaluate_chat_response\n", + "from eval_prompt import EvaluationCateogry\n", "\n", "load_dotenv() " ] @@ -44,7 +58,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Date filtering\n", + "## Date filtering\n", "\n", "Results can be filtered by start and/or end dates. Providing a value for both will return conversations between the two dates, inclusive. Passing `None` for one and a value for the other will produce a before or after filter accordingly. Passing `None` for both will return all results. 
\n", "\n", @@ -58,8 +72,8 @@ "outputs": [], "source": [ "# Get conversations for a date range (also supports only one date being provided)\n", - "start_date = pd.to_datetime(\"2024-01-29\")\n", - "end_date = pd.to_datetime(\"2024-01-30\")\n", + "start_date = pd.to_datetime(\"2024-01-30\")\n", + "end_date = pd.to_datetime(\"2024-01-31\")\n", "\n", "# Uncomment to use None for start_date and end_date to get all conversations\n", "# start_date = None\n", @@ -70,14 +84,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query" + "## Query Cosmos DB" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "159 conversation turns from date range 2024-01-30 00:00:00 to 2024-01-31 00:00:00 found\n" + ] + }, { "data": { "text/html": [ @@ -111,34 +132,33 @@ " \n", " \n", " 0\n", - " c515b325-31d8-4902-8eb0-bfdf8aa91551\n", - " 2024-01-29T23:59:31.206075\n", - " 2024-01-29T23:59:54.315737\n", - " {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',...\n", - " ecccf511-35f3-a6e8-2881-02af2addf17a\n", - " {'citations': [{'content': 'Microsoft Research...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", + " 8fdd6606-2cfb-4feb-999f-9b7907488ac4\n", + " 2024-01-30T23:33:21.020666\n", + " 2024-01-30T23:33:40.133918\n", + " {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...\n", + " 5e815061-db1b-6c40-e461-0edc8b8ea8ac\n", + " {'citations': [{'content': 'Title: Accurate Vi...\n", + " {'id': '', 'model': 'gpt-4', 'created': 170665...\n", " \n", " \n", " 1\n", - " 030777f3-f05a-4177-bd8a-57ff4f418262\n", - " 2024-01-29T23:57:41.405727\n", - " 2024-01-29T23:58:13.572655\n", - " {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',...\n", - " fcdd1c9d-dff4-73c4-a137-9dba99990983\n", - " {'citations': [{'content': '. 
\n", - "GPT -4 and the ...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", + " 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c\n", + " 2024-01-30T23:32:29.742239\n", + " NaN\n", + " {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...\n", + " 7232ea15-8bc1-0299-dd24-b63772f1f678\n", + " NaN\n", + " NaN\n", " \n", " \n", " 2\n", - " 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a\n", - " 2024-01-29T23:56:22.674996\n", - " 2024-01-29T23:57:08.228601\n", - " {'id': '508917c2-bc62-ecda-bb98-d6f940379334',...\n", - " c3cb145e-63be-cc87-7cba-4ca2f15d0f78\n", - " {'citations': [{'content': 'Title: Research Fo...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", + " 9921095d-f409-4db5-a2fd-a1e52edfc59d\n", + " 2024-01-30T23:11:26.711546\n", + " 2024-01-30T23:11:46.184943\n", + " {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...\n", + " 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5\n", + " {'citations': [{'content': 'Title: MSR CORE’s ...\n", + " {'id': '', 'model': 'gpt-4', 'created': 170665...\n", " \n", " \n", "\n", @@ -146,38 +166,37 @@ ], "text/plain": [ " id timestamp \\\n", - "0 c515b325-31d8-4902-8eb0-bfdf8aa91551 2024-01-29T23:59:31.206075 \n", - "1 030777f3-f05a-4177-bd8a-57ff4f418262 2024-01-29T23:57:41.405727 \n", - "2 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a 2024-01-29T23:56:22.674996 \n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30T23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30T23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30T23:11:26.711546 \n", "\n", " response_timestamp \\\n", - "0 2024-01-29T23:59:54.315737 \n", - "1 2024-01-29T23:58:13.572655 \n", - "2 2024-01-29T23:57:08.228601 \n", + "0 2024-01-30T23:33:40.133918 \n", + "1 NaN \n", + "2 2024-01-30T23:11:46.184943 \n", "\n", " user_query \\\n", - "0 {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',... \n", - "1 {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',... \n", - "2 {'id': '508917c2-bc62-ecda-bb98-d6f940379334',... 
\n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", "\n", " conversation_id \\\n", - "0 ecccf511-35f3-a6e8-2881-02af2addf17a \n", - "1 fcdd1c9d-dff4-73c4-a137-9dba99990983 \n", - "2 c3cb145e-63be-cc87-7cba-4ca2f15d0f78 \n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", "\n", " context \\\n", - "0 {'citations': [{'content': 'Microsoft Research... \n", - "1 {'citations': [{'content': '. \n", - "GPT -4 and the ... \n", - "2 {'citations': [{'content': 'Title: Research Fo... \n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", "\n", " chat_response \n", - "0 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", - "1 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", - "2 {'id': '', 'model': 'gpt-4', 'created': 170657... " + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... 
" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -186,6 +205,9 @@ "dataset = get_conversations(start_date=start_date, end_date=end_date)\n", "df = pd.DataFrame(dataset)\n", "\n", + "row_count = df.shape[0]\n", + "print(f\"{row_count} conversation turns from date range {start_date} to {end_date} found\")\n", + "\n", "df.head(3)" ] }, @@ -198,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -238,46 +260,45 @@ " \n", " \n", " 0\n", - " c515b325-31d8-4902-8eb0-bfdf8aa91551\n", - " 2024-01-29 23:59:31.206075\n", - " 2024-01-29 23:59:54.315737\n", - " {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',...\n", - " ecccf511-35f3-a6e8-2881-02af2addf17a\n", - " {'citations': [{'content': 'Microsoft Research...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", - " What kind of problems is MSR's AI research try...\n", - " Microsoft Research's AI research is focused on...\n", - " 23.109662\n", + " 8fdd6606-2cfb-4feb-999f-9b7907488ac4\n", + " 2024-01-30 23:33:21.020666\n", + " 2024-01-30 23:33:40.133918\n", + " {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...\n", + " 5e815061-db1b-6c40-e461-0edc8b8ea8ac\n", + " {'citations': [{'content': 'Title: Accurate Vi...\n", + " {'id': '', 'model': 'gpt-4', 'created': 170665...\n", + " Summarize the main three points of Peter's key...\n", + " Peter Lee's keynote address at the Microsoft R...\n", + " 19.113252\n", " 1\n", " \n", " \n", " 1\n", - " 030777f3-f05a-4177-bd8a-57ff4f418262\n", - " 2024-01-29 23:57:41.405727\n", - " 2024-01-29 23:58:13.572655\n", - " {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',...\n", - " fcdd1c9d-dff4-73c4-a137-9dba99990983\n", - " {'citations': [{'content': '. 
\n", - "GPT -4 and the ...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", + " 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c\n", + " 2024-01-30 23:32:29.742239\n", + " NaT\n", + " {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...\n", + " 7232ea15-8bc1-0299-dd24-b63772f1f678\n", + " NaN\n", + " NaN\n", " Can you summarize the key challenges tackled b...\n", - " The Microsoft Research Forum for this year add...\n", - " 32.166928\n", + " None\n", + " NaN\n", " 1\n", " \n", " \n", " 2\n", - " 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a\n", - " 2024-01-29 23:56:22.674996\n", - " 2024-01-29 23:57:08.228601\n", - " {'id': '508917c2-bc62-ecda-bb98-d6f940379334',...\n", - " c3cb145e-63be-cc87-7cba-4ca2f15d0f78\n", - " {'citations': [{'content': 'Title: Research Fo...\n", - " {'id': '', 'model': 'gpt-4', 'created': 170657...\n", - " what datasets were used in novel ways by Micro...\n", - " In the NeurIPS 2023 submissions, Microsoft res...\n", - " 45.553605\n", - " 4\n", + " 9921095d-f409-4db5-a2fd-a1e52edfc59d\n", + " 2024-01-30 23:11:26.711546\n", + " 2024-01-30 23:11:46.184943\n", + " {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...\n", + " 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5\n", + " {'citations': [{'content': 'Title: MSR CORE’s ...\n", + " {'id': '', 'model': 'gpt-4', 'created': 170665...\n", + " tell me about msr chat\n", + " Microsoft Research (MSR) has been exploring th...\n", + " 19.473397\n", + " 2\n", " \n", " \n", "\n", @@ -285,48 +306,47 @@ ], "text/plain": [ " id timestamp \\\n", - "0 c515b325-31d8-4902-8eb0-bfdf8aa91551 2024-01-29 23:59:31.206075 \n", - "1 030777f3-f05a-4177-bd8a-57ff4f418262 2024-01-29 23:57:41.405727 \n", - "2 6e8e6bd7-56e8-4abd-845e-a5e6649d8a8a 2024-01-29 23:56:22.674996 \n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30 23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30 23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30 23:11:26.711546 \n", "\n", " response_timestamp \\\n", - "0 
2024-01-29 23:59:54.315737 \n", - "1 2024-01-29 23:58:13.572655 \n", - "2 2024-01-29 23:57:08.228601 \n", + "0 2024-01-30 23:33:40.133918 \n", + "1 NaT \n", + "2 2024-01-30 23:11:46.184943 \n", "\n", " user_query \\\n", - "0 {'id': 'ecccf511-35f3-a6e8-2881-02af2addf17a',... \n", - "1 {'id': 'fcdd1c9d-dff4-73c4-a137-9dba99990983',... \n", - "2 {'id': '508917c2-bc62-ecda-bb98-d6f940379334',... \n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", "\n", " conversation_id \\\n", - "0 ecccf511-35f3-a6e8-2881-02af2addf17a \n", - "1 fcdd1c9d-dff4-73c4-a137-9dba99990983 \n", - "2 c3cb145e-63be-cc87-7cba-4ca2f15d0f78 \n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", "\n", " context \\\n", - "0 {'citations': [{'content': 'Microsoft Research... \n", - "1 {'citations': [{'content': '. \n", - "GPT -4 and the ... \n", - "2 {'citations': [{'content': 'Title: Research Fo... \n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", "\n", " chat_response \\\n", - "0 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", - "1 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", - "2 {'id': '', 'model': 'gpt-4', 'created': 170657... \n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", "\n", " user_input \\\n", - "0 What kind of problems is MSR's AI research try... \n", + "0 Summarize the main three points of Peter's key... \n", "1 Can you summarize the key challenges tackled b... \n", - "2 what datasets were used in novel ways by Micro... \n", + "2 tell me about msr chat \n", "\n", " answer duration turn_count \n", - "0 Microsoft Research's AI research is focused on... 
23.109662 1 \n", - "1 The Microsoft Research Forum for this year add... 32.166928 1 \n", - "2 In the NeurIPS 2023 submissions, Microsoft res... 45.553605 4 " + "0 Peter Lee's keynote address at the Microsoft R... 19.113252 1 \n", + "1 None NaN 1 \n", + "2 Microsoft Research (MSR) has been exploring th... 19.473397 2 " ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -341,7 +361,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save to Excel" + "# Analyzing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save to Excel\n", + "\n", + "Due the Excel's **limit of 32,767 characters per cell**, saving the DataFrame as an .xlsx file may yield unexpected and innacurate results when analyzed. Depending on the nature of your data it may be better to execute the cells in the `AI Analysis` section to analyze conversations using GPT-4." ] }, { @@ -371,6 +400,267 @@ "\n", "out_dataset.head(4)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AI Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating chat response for 8fdd6606-2cfb-4feb-999f-9b7907488ac4 ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c ...\n", + "\u001b[91mERROR processing id 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c: The user_input, context, and answer fields must be provided.\u001b[0m\n", + "Evaluating chat response for 9921095d-f409-4db5-a2fd-a1e52edfc59d ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for f1242f7c-41f1-4608-8141-b46de131879a ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for 71375972-e0e5-470a-a301-100a4f401481 ...\n", + "Processing evaulation response ...\n", + "Evaluating chat response for 
eefd1c18-f176-46ca-bea6-0218f04519b6 ...\n", + "\u001b[91mERROR processing id eefd1c18-f176-46ca-bea6-0218f04519b6: The user_input, context, and answer fields must be provided.\u001b[0m\n", + "Evaluating chat response for aade9ab3-38e2-401e-ba7e-66bbf7756017 ...\n", + "Processing evaulation response ...\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestampresponse_timestampuser_queryconversation_idcontextchat_responseuser_inputanswerdurationturn_countevaluationscores
08fdd6606-2cfb-4feb-999f-9b7907488ac42024-01-30 23:33:21.0206662024-01-30 23:33:40.133918{'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',...5e815061-db1b-6c40-e461-0edc8b8ea8ac{'citations': [{'content': 'Title: Accurate Vi...{'id': '', 'model': 'gpt-4', 'created': 170665...Summarize the main three points of Peter's key...Peter Lee's keynote address at the Microsoft R...19.1132521The response provided does not contain any ver...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"...
190fe45a4-2c3e-4556-af91-ee0e2fa95b1c2024-01-30 23:32:29.742239NaT{'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',...7232ea15-8bc1-0299-dd24-b63772f1f678NaNNaNCan you summarize the key challenges tackled b...NoneNaN1nannan
29921095d-f409-4db5-a2fd-a1e52edfc59d2024-01-30 23:11:26.7115462024-01-30 23:11:46.184943{'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...tell me about msr chatMicrosoft Research (MSR) has been exploring th...19.4733972The response lacks accuracy and relevance as i...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": 3}
3f1242f7c-41f1-4608-8141-b46de131879a2024-01-30 23:11:09.0442422024-01-30 23:11:17.530884{'id': '61a0fe48-9478-e112-ebd4-c5f3c35cb5e5',...61a0fe48-9478-e112-ebd4-c5f3c35cb5e5{'citations': [{'content': 'Title: MSR CORE’s ...{'id': '', 'model': 'gpt-4', 'created': 170665...msrIt seems like your question is incomplete. Cou...8.4866422The AI response is not applicable as it was pr...{\"accuracy\": 3, \"relevance\": 3, \"sentiment\": \"...
471375972-e0e5-470a-a301-100a4f4014812024-01-30 23:06:03.0881732024-01-30 23:06:33.381400{'id': '0804017d-1858-2bbd-82d7-00821fa74f1a',...0804017d-1858-2bbd-82d7-00821fa74f1a{'citations': [{'content': 'Transcript - Light...{'id': '', 'model': 'gpt-4', 'created': 170665...Can you summarize the key challenges tackled b...Microsoft Research (MSR) addressed several key...30.2932271The response contains fabricated references (e...{\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"...
\n", + "
" + ], + "text/plain": [ + " id timestamp \\\n", + "0 8fdd6606-2cfb-4feb-999f-9b7907488ac4 2024-01-30 23:33:21.020666 \n", + "1 90fe45a4-2c3e-4556-af91-ee0e2fa95b1c 2024-01-30 23:32:29.742239 \n", + "2 9921095d-f409-4db5-a2fd-a1e52edfc59d 2024-01-30 23:11:26.711546 \n", + "3 f1242f7c-41f1-4608-8141-b46de131879a 2024-01-30 23:11:09.044242 \n", + "4 71375972-e0e5-470a-a301-100a4f401481 2024-01-30 23:06:03.088173 \n", + "\n", + " response_timestamp \\\n", + "0 2024-01-30 23:33:40.133918 \n", + "1 NaT \n", + "2 2024-01-30 23:11:46.184943 \n", + "3 2024-01-30 23:11:17.530884 \n", + "4 2024-01-30 23:06:33.381400 \n", + "\n", + " user_query \\\n", + "0 {'id': '5e815061-db1b-6c40-e461-0edc8b8ea8ac',... \n", + "1 {'id': '7232ea15-8bc1-0299-dd24-b63772f1f678',... \n", + "2 {'id': '8a66c894-b425-2d81-0251-15dcb7d6bd08',... \n", + "3 {'id': '61a0fe48-9478-e112-ebd4-c5f3c35cb5e5',... \n", + "4 {'id': '0804017d-1858-2bbd-82d7-00821fa74f1a',... \n", + "\n", + " conversation_id \\\n", + "0 5e815061-db1b-6c40-e461-0edc8b8ea8ac \n", + "1 7232ea15-8bc1-0299-dd24-b63772f1f678 \n", + "2 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "3 61a0fe48-9478-e112-ebd4-c5f3c35cb5e5 \n", + "4 0804017d-1858-2bbd-82d7-00821fa74f1a \n", + "\n", + " context \\\n", + "0 {'citations': [{'content': 'Title: Accurate Vi... \n", + "1 NaN \n", + "2 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "3 {'citations': [{'content': 'Title: MSR CORE’s ... \n", + "4 {'citations': [{'content': 'Transcript - Light... \n", + "\n", + " chat_response \\\n", + "0 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "1 NaN \n", + "2 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "3 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "4 {'id': '', 'model': 'gpt-4', 'created': 170665... \n", + "\n", + " user_input \\\n", + "0 Summarize the main three points of Peter's key... \n", + "1 Can you summarize the key challenges tackled b... 
\n", + "2 tell me about msr chat \n", + "3 msr \n", + "4 Can you summarize the key challenges tackled b... \n", + "\n", + " answer duration turn_count \\\n", + "0 Peter Lee's keynote address at the Microsoft R... 19.113252 1 \n", + "1 None NaN 1 \n", + "2 Microsoft Research (MSR) has been exploring th... 19.473397 2 \n", + "3 It seems like your question is incomplete. Cou... 8.486642 2 \n", + "4 Microsoft Research (MSR) addressed several key... 30.293227 1 \n", + "\n", + " evaluation \\\n", + "0 The response provided does not contain any ver... \n", + "1 nan \n", + "2 The response lacks accuracy and relevance as i... \n", + "3 The AI response is not applicable as it was pr... \n", + "4 The response contains fabricated references (e... \n", + "\n", + " scores \n", + "0 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"... \n", + "1 nan \n", + "2 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": 3} \n", + "3 {\"accuracy\": 3, \"relevance\": 3, \"sentiment\": \"... \n", + "4 {\"accuracy\": 0, \"relevance\": 0, \"sentiment\": \"... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Categories to prompt the AI to evaluate on. 
\n", + "# Options: ACCURACY, RELEVANCE, COHERENCE, FLUENCY, DEPTH, INSIGHTFULNESS, OBJECTIVITY, CONTEXTUAL_APPROPRIATENESS, SENTIMENT\n", + "categories = [\n", + " EvaluationCateogry.ACCURACY,\n", + " EvaluationCateogry.RELEVANCE,\n", + " EvaluationCateogry.SENTIMENT\n", + "]\n", + "\n", + "num_to_process = 5\n", + "count = 0\n", + "\n", + "for index, row in df.iterrows():\n", + " try:\n", + " evaluation, scores = evaluate_chat_response(row, categories)\n", + "\n", + " # Extend the current row with the evaluation and scores\n", + " df.loc[index, \"evaluation\"] = evaluation\n", + " df.loc[index, \"scores\"] = json.dumps(scores)\n", + "\n", + " # Comment these lines to process all rows\n", + " count += 1\n", + " if count >= num_to_process:\n", + " break\n", + " except Exception as e:\n", + " print(f\"\\033[91mERROR processing id {row['id']}: {e}\\033[0m\")\n", + " continue\n", + "\n", + "df.head(5)" + ] } ], "metadata": { diff --git a/analysis/chat_history/inference.py b/analysis/chat_history/inference.py new file mode 100644 index 0000000000..a4d1259a25 --- /dev/null +++ b/analysis/chat_history/inference.py @@ -0,0 +1,61 @@ +import os +import json +from typing import Any +import openai +from pandas import Series, isna +from eval_prompt import EvaluationCateogry, generate_evaluation_system_prompt + + +AZURE_OPENAI_TEMPERATURE = os.getenv("AZURE_OPENAI_TEMPERATURE") +AZURE_OPENAI_TOP_P = os.getenv("AZURE_OPENAI_TOP_P") +AZURE_OPENAI_MAX_TOKENS = os.getenv("AZURE_OPENAI_MAX_TOKENS") +AZURE_OPENAI_STOP_SEQUENCE = os.getenv("AZURE_OPENAI_STOP_SEQUENCE") +AZURE_OPENAI_RESOURCE = os.getenv("AZURE_OPENAI_RESOURCE") +AZURE_OPENAI_MODEL = os.getenv("AZURE_OPENAI_MODEL") +AZURE_OPENAI_MODEL_NAME = os.getenv("AZURE_OPENAI_MODEL_NAME") +AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") +AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY") + +openai.api_type = "azure" +openai.api_base = AZURE_OPENAI_ENDPOINT if AZURE_OPENAI_ENDPOINT else 
f"https://{AZURE_OPENAI_RESOURCE}.openai.azure.com/"
+openai.api_version = "2023-08-01-preview"
+openai.api_key = AZURE_OPENAI_KEY
+
+def evaluate_chat_response(row: Series, categories: list[EvaluationCateogry]) -> tuple[str, dict[str, Any]]:
+    """Ask the Azure OpenAI model to evaluate one chat turn; returns (evaluation_text, scores_dict)."""
+    print(f"Evaluating chat response for {row['id']} ...")
+
+    user_input = row['user_input']
+    context = row['context']
+    answer = row['answer']
+
+    if not user_input or not context or not answer or isna(user_input) or isna(context) or isna(answer):
+        raise ValueError("The user_input, context, and answer fields must be provided.")
+
+    system_message = generate_evaluation_system_prompt(categories)
+    messages = [
+        { "role": "system", "content": system_message },
+        { "role": "user", "content": user_input },
+        { "role": "assistant", "content": answer },
+        { "role": "user", "content": f"# Context\n\nThe previous AI response was based on this context:\n\n{context}\n\nEvaluation:\n" }  # f-string: without it the literal text "{context}" was sent and the context was never interpolated
+    ]
+
+    response = openai.ChatCompletion.create(
+        engine=AZURE_OPENAI_MODEL,
+        messages=messages,
+        temperature=float(AZURE_OPENAI_TEMPERATURE),
+        max_tokens=int(AZURE_OPENAI_MAX_TOKENS),
+        top_p=float(AZURE_OPENAI_TOP_P),
+        stop=None,
+        stream=False
+    )
+
+    return process_response(response)
+
+def process_response(response):
+    """Parse the model's JSON reply into (evaluation_text, scores_dict); raises on malformed JSON."""
+    print("Processing evaluation response ...")  # typo fix ("evaulation"); plain string, no placeholders
+
+    result = json.loads(response["choices"][0]["message"]["content"])
+    evaluation = result['evaluation']
+    scores = result['scores']
+
+    return evaluation, scores
\ No newline at end of file