diff --git a/openshift/kustomize/services/auto-clipper/base/deploy.yaml b/openshift/kustomize/services/auto-clipper/base/deploy.yaml
index 10f68f2c7..d21848891 100644
--- a/openshift/kustomize/services/auto-clipper/base/deploy.yaml
+++ b/openshift/kustomize/services/auto-clipper/base/deploy.yaml
@@ -160,6 +160,26 @@ spec:
name: azure-openai
key: AZURE_OPENAI_KEY
+ # Azure Video Indexer Configuration (optional - for speaker identification)
+ - name: Service__AzureVideoIndexerAccountId
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_ACCOUNT_ID
+ optional: true
+ - name: Service__AzureVideoIndexerApiKey
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_API_KEY
+ optional: true
+ - name: Service__AzureVideoIndexerLocation
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_LOCATION
+ optional: true
+
# Service Configuration
- name: Service__MaxFailLimit
valueFrom:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
index 4ca71785f..59ff13d76 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
index d783baf42..682fa26d8 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
index 036910c0c..d8b5bcce9 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/services/net/auto-clipper/AutoClipperManager.cs b/services/net/auto-clipper/AutoClipperManager.cs
index b05a11d29..07de0f0e5 100644
--- a/services/net/auto-clipper/AutoClipperManager.cs
+++ b/services/net/auto-clipper/AutoClipperManager.cs
@@ -635,23 +635,32 @@ private static void CleanupTemporaryFiles(bool isSyncedToS3, params string[] fil
/// <summary>
/// Format the transcript to include newlines.
+ /// When speaker information is available (from Video Indexer), includes speaker prefix.
+ /// Azure Speech transcripts have no speaker info and will output plain text.
/// </summary>
- /// <param name="segments"></param>
- /// <returns></returns>
+ /// <param name="segments">Transcript segments with optional speaker information.</param>
+ /// <returns>Formatted transcript string.</returns>
private static string BuildTranscriptDocument(IReadOnlyList<TimestampedTranscript> segments)
{
if (segments == null || segments.Count == 0) return string.Empty;
var sb = new StringBuilder();
- var index = 1;
foreach (var segment in segments)
{
if (string.IsNullOrWhiteSpace(segment.Text)) continue;
- // sb.AppendLine(index.ToString(CultureInfo.InvariantCulture));
- // sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}");
+
+ // Add speaker prefix if available (Video Indexer provides this)
+ if (!string.IsNullOrWhiteSpace(segment.SpeakerName))
+ {
+ sb.Append($"{segment.SpeakerName}: ");
+ }
+ else if (segment.SpeakerId.HasValue)
+ {
+ sb.Append($"speaker{segment.SpeakerId}: ");
+ }
+
sb.AppendLine(segment.Text.Trim());
sb.AppendLine();
- index++;
}
return sb.ToString().Trim();
diff --git a/services/net/auto-clipper/AutoClipperService.cs b/services/net/auto-clipper/AutoClipperService.cs
index 843a40853..ed18540e5 100644
--- a/services/net/auto-clipper/AutoClipperService.cs
+++ b/services/net/auto-clipper/AutoClipperService.cs
@@ -57,6 +57,14 @@ protected override IServiceCollection ConfigureServices(IServiceCollection servi
services.AddSingleton();
services.AddHttpClient();
+ // Register Video Indexer client if configured
+ var videoIndexerAccountId = this.Configuration.GetSection("Service")["AzureVideoIndexerAccountId"];
+ var videoIndexerApiKey = this.Configuration.GetSection("Service")["AzureVideoIndexerApiKey"];
+ if (!string.IsNullOrWhiteSpace(videoIndexerAccountId) && !string.IsNullOrWhiteSpace(videoIndexerApiKey))
+ {
+ services.AddHttpClient<IAzureVideoIndexerClient, AzureVideoIndexerClient>();
+ }
+
// TODO: Figure out how to validate without resulting in aggregating the config values.
// services.AddOptions()
// .Bind(this.Configuration.GetSection("Service"))
diff --git a/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
new file mode 100644
index 000000000..354d3921d
--- /dev/null
+++ b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
@@ -0,0 +1,363 @@
+using System.Net.Http.Headers;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using TNO.Services.AutoClipper.Config;
+
+namespace TNO.Services.AutoClipper.Azure;
+
+/// <summary>
+/// Client for Azure Video Indexer API.
+/// Handles video upload, processing, and transcript extraction with speaker identification.
+/// </summary>
+public class AzureVideoIndexerClient : IAzureVideoIndexerClient
+{
+ private const string ApiBaseUrl = "https://api.videoindexer.ai";
+
+ private readonly HttpClient _httpClient;
+ private readonly AutoClipperOptions _options;
+ private readonly ILogger<AzureVideoIndexerClient> _logger;
+
+ private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
+ {
+ PropertyNameCaseInsensitive = true
+ };
+
+ public AzureVideoIndexerClient(
+ HttpClient httpClient,
+ IOptions<AutoClipperOptions> options,
+ ILogger<AzureVideoIndexerClient> logger)
+ {
+ _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
+ _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
+ _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+
+ if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerAccountId))
+ throw new ArgumentException("AzureVideoIndexerAccountId is required");
+ if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerApiKey))
+ throw new ArgumentException("AzureVideoIndexerApiKey is required");
+ }
+
+ /// <inheritdoc/>
+ public async Task<IReadOnlyList<TimestampedTranscript>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default)
+ {
+ if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
+ throw new FileNotFoundException("Media file not found", filePath);
+
+ _logger.LogInformation("Starting Video Indexer transcription for {File}", filePath);
+
+ // Step 1: Get access token
+ var accessToken = await GetAccessTokenAsync(cancellationToken);
+ _logger.LogDebug("Obtained access token");
+
+ // Step 2: Upload video
+ var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken);
+ _logger.LogInformation("Video uploaded with ID: {VideoId}", videoId);
+
+ try
+ {
+ // Step 3: Wait for processing to complete
+ var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken);
+ _logger.LogInformation("Video processing completed");
+
+ // Step 4: Parse transcript
+ var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels);
+ _logger.LogInformation("Parsed {Count} transcript segments", segments.Count);
+
+ return segments;
+ }
+ finally
+ {
+ // Clean up: delete the video from Video Indexer
+ await TryDeleteVideoAsync(videoId, accessToken);
+ }
+ }
+
+ private async Task<string> GetAccessTokenAsync(CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/Auth/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/AccessToken?allowEdit=true";
+
+ using var request = new HttpRequestMessage(HttpMethod.Get, url);
+ request.Headers.Add("Ocp-Apim-Subscription-Key", _options.AzureVideoIndexerApiKey);
+
+ using var response = await _httpClient.SendAsync(request, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}");
+
+ // Token is returned as a quoted string
+ return body.Trim('"');
+ }
+
+ private async Task<string> UploadVideoAsync(
+ string filePath,
+ string accessToken,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken)
+ {
+ var fileName = Path.GetFileName(filePath);
+ var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}";
+
+ // Build upload URL with query parameters
+ var queryParams = new List<string>
+ {
+ $"accessToken={Uri.EscapeDataString(accessToken)}",
+ $"name={Uri.EscapeDataString(videoName)}",
+ $"language={Uri.EscapeDataString(request.Language)}",
+ "privacy=Private",
+ "indexingPreset=AudioOnly" // We only need audio analysis for transcription
+ };
+
+ if (!string.IsNullOrWhiteSpace(request.PersonModelId))
+ {
+ queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}");
+ }
+
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos?{string.Join("&", queryParams)}";
+
+ _logger.LogDebug("Uploading video to: {Url}", url.Split('?')[0]);
+
+ using var content = new MultipartFormDataContent();
+ await using var fileStream = File.OpenRead(filePath);
+ var fileContent = new StreamContent(fileStream);
+ fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
+ content.Add(fileContent, "file", fileName);
+
+ using var response = await _httpClient.PostAsync(url, content, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var videoId = doc.RootElement.GetProperty("id").GetString();
+
+ if (string.IsNullOrWhiteSpace(videoId))
+ throw new InvalidOperationException("Video Indexer did not return a video ID");
+
+ return videoId;
+ }
+
+ private async Task<string> WaitForProcessingAsync(
+ string videoId,
+ string accessToken,
+ CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}";
+ var timeout = TimeSpan.FromMinutes(_options.AzureVideoIndexerTimeoutMinutes);
+ var pollInterval = TimeSpan.FromSeconds(_options.AzureVideoIndexerPollingIntervalSeconds);
+ var startTime = DateTime.UtcNow;
+
+ while (true)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ if (DateTime.UtcNow - startTime > timeout)
+ throw new TimeoutException($"Video Indexer processing did not complete within {_options.AzureVideoIndexerTimeoutMinutes} minutes");
+
+ using var response = await _httpClient.GetAsync(url, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var state = doc.RootElement.GetProperty("state").GetString();
+
+ var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds;
+ _logger.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed);
+
+ if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase))
+ {
+ return body;
+ }
+
+ if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase))
+ {
+ var errorMessage = "Unknown error";
+ if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg))
+ errorMessage = failureMsg.GetString() ?? errorMessage;
+ throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}");
+ }
+
+ await Task.Delay(pollInterval, cancellationToken);
+ }
+ }
+
+ private IReadOnlyList<TimestampedTranscript> ParseTranscript(string indexJson, bool includeSpeakerLabels)
+ {
+ var segments = new List<TimestampedTranscript>();
+
+ using var doc = JsonDocument.Parse(indexJson);
+
+ // Navigate to: videos[0].insights.transcript
+ if (!doc.RootElement.TryGetProperty("videos", out var videos) ||
+ videos.GetArrayLength() == 0)
+ {
+ _logger.LogWarning("No videos found in index response");
+ return segments;
+ }
+
+ var video = videos[0];
+ if (!video.TryGetProperty("insights", out var insights))
+ {
+ _logger.LogWarning("No insights found in video");
+ return segments;
+ }
+
+ // Build speaker name map from faces/speakers if available
+ var speakerNames = BuildSpeakerNameMap(insights);
+
+ // Parse transcript
+ if (!insights.TryGetProperty("transcript", out var transcript) ||
+ transcript.ValueKind != JsonValueKind.Array)
+ {
+ _logger.LogWarning("No transcript found in insights");
+ return segments;
+ }
+
+ foreach (var item in transcript.EnumerateArray())
+ {
+ var text = item.TryGetProperty("text", out var textProp) ? textProp.GetString() : null;
+ if (string.IsNullOrWhiteSpace(text))
+ continue;
+
+ // Parse timestamps from instances array
+ // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] }
+ var start = TimeSpan.Zero;
+ var end = TimeSpan.Zero;
+
+ if (item.TryGetProperty("instances", out var instances) &&
+ instances.ValueKind == JsonValueKind.Array &&
+ instances.GetArrayLength() > 0)
+ {
+ var firstInstance = instances[0];
+ start = ParseTimestamp(firstInstance, "start");
+ end = ParseTimestamp(firstInstance, "end");
+ }
+
+ if (end <= start)
+ end = start + TimeSpan.FromMilliseconds(100);
+
+ // Parse speaker information
+ int? speakerId = null;
+ string? speakerName = null;
+
+ if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp))
+ {
+ speakerId = speakerIdProp.ValueKind == JsonValueKind.Number
+ ? speakerIdProp.GetInt32()
+ : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null;
+
+ if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name))
+ {
+ speakerName = name;
+ }
+ }
+
+ segments.Add(new TimestampedTranscript(start, end, text.Trim(), speakerId, speakerName));
+ }
+
+ return segments.OrderBy(s => s.Start).ToList();
+ }
+
+ private Dictionary<int, string> BuildSpeakerNameMap(JsonElement insights)
+ {
+ var map = new Dictionary<int, string>();
+
+ // Try to get speaker names from faces (Person Model identification)
+ if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var face in faces.EnumerateArray())
+ {
+ if (!face.TryGetProperty("id", out var idProp))
+ continue;
+
+ var faceId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!faceId.HasValue)
+ continue;
+
+ // Get name - could be from Person Model or Unknown
+ var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+
+ // Skip unknown faces - we'll use speaker ID instead
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase))
+ {
+ map[faceId.Value] = name;
+ }
+ }
+ }
+
+ // Also check speakers section
+ if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var speaker in speakers.EnumerateArray())
+ {
+ if (!speaker.TryGetProperty("id", out var idProp))
+ continue;
+
+ var speakerId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!speakerId.HasValue || map.ContainsKey(speakerId.Value))
+ continue;
+
+ var name = speaker.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) &&
+ !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase))
+ {
+ map[speakerId.Value] = name;
+ }
+ }
+ }
+
+ return map;
+ }
+
+ private static TimeSpan ParseTimestamp(JsonElement element, string property)
+ {
+ if (!element.TryGetProperty(property, out var prop))
+ return TimeSpan.Zero;
+
+ var value = prop.GetString();
+ if (string.IsNullOrWhiteSpace(value))
+ return TimeSpan.Zero;
+
+ // Video Indexer uses format like "0:00:05.12" or "00:00:05.12"
+ if (TimeSpan.TryParse(value, out var ts))
+ return ts;
+
+ // Try parsing as seconds
+ if (double.TryParse(value, out var seconds))
+ return TimeSpan.FromSeconds(seconds);
+
+ return TimeSpan.Zero;
+ }
+
+ private async Task TryDeleteVideoAsync(string videoId, string accessToken)
+ {
+ try
+ {
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}";
+ using var response = await _httpClient.DeleteAsync(url);
+ if (response.IsSuccessStatusCode)
+ {
+ _logger.LogDebug("Deleted video {VideoId} from Video Indexer", videoId);
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId);
+ }
+ }
+}
diff --git a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
index b8742b3d2..99db7736c 100644
--- a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
+++ b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
@@ -1,11 +1,19 @@
-using System.Collections.Generic;
-using System.IO;
-using System.Threading;
-using System.Threading.Tasks;
-
namespace TNO.Services.AutoClipper.Azure;
+/// <summary>
+/// Client interface for Azure Video Indexer transcription service.
+/// </summary>
public interface IAzureVideoIndexerClient
{
- Task<IReadOnlyList<TimestampedTranscript>> GenerateTranscriptAsync(Stream stream, string fileName, string language, CancellationToken cancellationToken = default);
+ /// <summary>
+ /// Transcribes a media file using Azure Video Indexer.
+ /// </summary>
+ /// <param name="filePath">Path to the media file (video or audio).</param>
+ /// <param name="request">Transcription request options.</param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>List of transcript segments with optional speaker information.</returns>
+ Task<IReadOnlyList<TimestampedTranscript>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default);
}
diff --git a/services/net/auto-clipper/Azure/TimestampedTranscript.cs b/services/net/auto-clipper/Azure/TimestampedTranscript.cs
index 02795aef9..fb11c4958 100644
--- a/services/net/auto-clipper/Azure/TimestampedTranscript.cs
+++ b/services/net/auto-clipper/Azure/TimestampedTranscript.cs
@@ -2,4 +2,13 @@
namespace TNO.Services.AutoClipper.Azure;
-public record TimestampedTranscript(TimeSpan Start, TimeSpan End, string Text);
+/// <summary>
+/// Represents a transcript segment with optional speaker identification.
+/// </summary>
+public record TimestampedTranscript(
+ TimeSpan Start,
+ TimeSpan End,
+ string Text,
+ int? SpeakerId = null,
+ string? SpeakerName = null
+);
diff --git a/services/net/auto-clipper/Azure/VideoIndexerRequest.cs b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs
new file mode 100644
index 000000000..98311499a
--- /dev/null
+++ b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs
@@ -0,0 +1,23 @@
+namespace TNO.Services.AutoClipper.Azure;
+
+/// <summary>
+/// Request options for Azure Video Indexer transcription.
+/// </summary>
+public class VideoIndexerRequest
+{
+ /// <summary>
+ /// Language code for transcription (e.g., "en-US", "zh-CN").
+ /// </summary>
+ public string Language { get; init; } = "en-US";
+
+ /// <summary>
+ /// Optional Person Model ID for speaker identification.
+ /// When provided, Video Indexer will attempt to identify known faces.
+ /// </summary>
+ public string? PersonModelId { get; init; }
+
+ /// <summary>
+ /// Whether to include speaker labels in the output.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; init; } = true;
+}
diff --git a/services/net/auto-clipper/Config/AutoClipperOptions.cs b/services/net/auto-clipper/Config/AutoClipperOptions.cs
index cee1d3a4e..4e5fa587a 100644
--- a/services/net/auto-clipper/Config/AutoClipperOptions.cs
+++ b/services/net/auto-clipper/Config/AutoClipperOptions.cs
@@ -94,6 +94,33 @@ public class AutoClipperOptions : ServiceOptions
public int AzureSpeechStorageSasExpiryMinutes { get; set; } = 180;
#endregion
+ #region Azure Video Indexer configuration
+ /// <summary>
+ /// get/set - Azure Video Indexer account ID.
+ /// </summary>
+ public string AzureVideoIndexerAccountId { get; set; } = string.Empty;
+
+ /// <summary>
+ /// get/set - Azure Video Indexer location (e.g., "trial", "eastus").
+ /// </summary>
+ public string AzureVideoIndexerLocation { get; set; } = "trial";
+
+ /// <summary>
+ /// get/set - Azure Video Indexer API key (Ocp-Apim-Subscription-Key).
+ /// </summary>
+ public string AzureVideoIndexerApiKey { get; set; } = string.Empty;
+
+ /// <summary>
+ /// get/set - Timeout in minutes for Video Indexer processing.
+ /// </summary>
+ public int AzureVideoIndexerTimeoutMinutes { get; set; } = 60;
+
+ /// <summary>
+ /// get/set - Polling interval in seconds for Video Indexer status checks.
+ /// </summary>
+ public int AzureVideoIndexerPollingIntervalSeconds { get; set; } = 30;
+ #endregion
+
#region Azure AI configuration
/// <summary>
/// get/set - The URL to the LLM
diff --git a/services/net/auto-clipper/Config/StationProfile.cs b/services/net/auto-clipper/Config/StationProfile.cs
index 408475cba..a6e9de94e 100644
--- a/services/net/auto-clipper/Config/StationProfile.cs
+++ b/services/net/auto-clipper/Config/StationProfile.cs
@@ -11,12 +11,35 @@ public class StationProfile
public class StationTranscriptionProfile
{
+ /// <summary>
+ /// Transcription provider: "azure_speech" or "azure_video_indexer"
+ /// </summary>
public string Provider { get; set; } = "azure_speech";
+
+ public string Language { get; set; } = "en-US";
+ public int SampleRate { get; set; } = 16000;
+
+ // Azure Speech specific settings
public bool Diarization { get; set; }
public int? MaxSpeakers { get; set; }
public string? DiarizationMode { get; set; } = "online";
- public string Language { get; set; } = "en-US";
- public int SampleRate { get; set; } = 16000;
+
+ // Azure Video Indexer specific settings
+ /// <summary>
+ /// Dictionary of Person Model names to IDs for speaker identification.
+ /// Example: { "news": "model-id-1", "sports": "model-id-2" }
+ /// </summary>
+ public Dictionary<string, string> PersonModels { get; set; } = new();
+
+ /// <summary>
+ /// Key to select which Person Model to use from PersonModels dictionary.
+ /// </summary>
+ public string? PersonModelKey { get; set; }
+
+ /// <summary>
+ /// Whether to include speaker labels (speaker1:, speaker2:, or named) in transcript.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; set; }
}
public class StationTextProfile
diff --git a/services/net/auto-clipper/Config/Stations/CHAN.yml b/services/net/auto-clipper/Config/Stations/CHAN.yml
new file mode 100644
index 000000000..761ebe6cc
--- /dev/null
+++ b/services/net/auto-clipper/Config/Stations/CHAN.yml
@@ -0,0 +1,95 @@
+# CHAN station configuration using Azure Video Indexer
+name: CHAN
+sample_rate: 16000
+
+transcription:
+ provider: azure_video_indexer
+ language: en-CA
+ include_speaker_labels: true
+ # Person Model (optional) - add later when you have one:
+ # person_models:
+ # news: "your-model-id-here"
+ # person_model_key: news
+
+text:
+ chunk_size_s: 3.0
+ chunk_overlap_ratio: 0.5
+ heuristic_boundary_weight: 0.35
+ keyword_categories:
+ "(?i)traffic": Traffic
+ "(?i)weather": Weather
+ "(?i)sponsor": Ad
+ "(?i)commercial": Ad
+ "(?i)up next": Promo
+ "(?i)coming up": Promo
+ llm_segmentation: true
+ llm_model: gpt-5-chat
+ llm_temperature: 0.0
+ system_prompt: |
+ You are a Broadcast Structure Parser. Your ONLY job is to detect segment transitions.
+ Output MUST be a single, raw JSON object.
+ CRITICAL: Start your response with '{' and end with '}'.
+ DO NOT use markdown, backticks, or "```json" blocks. No introductory or closing text.
+ max_stories: 15
+ llm_prompt: |
+ Identify every point in the transcript where the topic or segment type changes.
+ Note: Speaker labels (e.g., "speaker1:", "Tom:") indicate who is speaking - use these to help identify segment transitions.
+
+ # SUMMARY RULES
+ 1. **Prioritize Anchor Leads**: For News, derive the summary from the anchor's introduction or the first three sentences of the report.
+ 2. **Active Voice**: Use active journalistic voice (e.g., "Surrey Council rejects housing proposal" NOT "A report about a meeting").
+ 3. **Category Formulas**:
+ - News: [Subject] [Action] (e.g., "Abbotsford shooting victim's son calls for urgent investigation").
+ - Traffic: [Location] [Incident/Status] (e.g., "Highway 99 northbound blocked at Hwy 17A due to crash").
+ - Weather: [Condition] + [High Temp] (e.g., "Mix of sun and cloud with a high of 9 degrees").
+ - Ad: [Business Name] + [Offer/Service] (e.g., "McDonald's features Egg McMuffin with Hollandaise sauce").
+ 4. **One Sentence Only**: Summaries MUST be a single, concise sentence.
+
+ # STRUCTURAL RULES (To Prevent Bundling)
+ 1. **The Sign-off Rule**: Phrases like "Global News," "CBC News," or "Reporting live" followed by a name mark the END of a segment. The very next sentence MUST be a new boundary.
+ 2. **The Handoff Rule**: When an anchor introduces a reporter (e.g., "As Joshua reports..."), the boundary starts at the ANCHOR'S introduction line.
+ 3. **Mandatory Category Split**: News, Traffic, Weather, and Ads MUST be isolated. Never bundle a Traffic report with a News story.
+ 4. **Zero Bloating**: Treat every unique headline as a separate clip. If the topic shifts from a shooting to a stabbing, create two distinct boundaries.
+ 5. **Speaker Change Awareness**: When the speaker label changes (e.g., from "speaker1:" to "speaker2:"), consider if this indicates a segment transition.
+
+ # OUTPUT FORMAT (Raw JSON ONLY)
+ {
+ "boundaries": [
+ {
+ "index": [Sentence Number],
+ "category": "News | Traffic | Weather | Ad | Promo",
+ "title": "[Short Slug]",
+ "summary": "[Journalistic Summary Sentence]",
+ "score": 0.95
+ }
+ ]
+ }
+
+ Transcript:
+ {{transcript}}
+
+heuristics:
+ pattern_entries:
+ # --- Common Transition Patterns ---
+ - pattern: "(?i)coming up"
+ weight: 0.65
+ category: Promo
+ note: Host tease for the next story
+ - pattern: "(?i)after the break"
+ weight: 0.65
+ category: Promo
+ note: Signals a hard break/transition
+ # --- Service Cues ---
+ - pattern: "(?i)traffic update"
+ weight: 0.6
+ category: Traffic
+ note: Recurring traffic block
+ - pattern: "(?i)weather update"
+ weight: 0.55
+ category: Weather
+ note: Weather hits are their own segments
+ # --- Add CHAN-specific anchor patterns here when known ---
+ # - pattern: "(?i)Anchor Name"
+ # weight: 0.85
+ # category: News/Traffic/Weather
+ # note: Description
diff --git a/services/net/auto-clipper/LLM/ClipSegmentationService.cs b/services/net/auto-clipper/LLM/ClipSegmentationService.cs
index 806a6d0c5..7e913d8b0 100644
--- a/services/net/auto-clipper/LLM/ClipSegmentationService.cs
+++ b/services/net/auto-clipper/LLM/ClipSegmentationService.cs
@@ -295,7 +295,10 @@ private IReadOnlyList ParseResponse(string? body, IReadOnlyList<
{
if (content == null) continue;
- var boundaries = JsonSerializer.Deserialize(content!);
+ // LLM sometimes returns JSON wrapped in markdown code fences (```json ... ```),
+ // even when instructed to return raw JSON. Strip the fences before parsing.
+ var strippedContent = StripCodeFence(content!);
+ var boundaries = JsonSerializer.Deserialize(strippedContent);
if (boundaries == null || boundaries.Boundaries == null) continue;
foreach (var boundary in boundaries.Boundaries)
{
diff --git a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
index 0d098b51c..74c67d6e7 100644
--- a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
+++ b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
@@ -11,6 +11,7 @@ public class ClipProcessingPipeline
{
private readonly IAudioNormalizer _audioNormalizer;
private readonly IAzureSpeechTranscriptionService _speechTranscriber;
+ private readonly IAzureVideoIndexerClient? _videoIndexerClient;
private readonly IClipSegmentationService _clipSegmentation;
private readonly AutoClipperOptions _options;
private readonly ILogger<ClipProcessingPipeline> _logger;
@@ -20,10 +21,12 @@ public ClipProcessingPipeline(
IAzureSpeechTranscriptionService speechTranscriber,
IClipSegmentationService clipSegmentation,
IOptions<AutoClipperOptions> options,
- ILogger<ClipProcessingPipeline> logger)
+ ILogger<ClipProcessingPipeline> logger,
+ IAzureVideoIndexerClient? videoIndexerClient = null)
{
_audioNormalizer = audioNormalizer;
_speechTranscriber = speechTranscriber;
+ _videoIndexerClient = videoIndexerClient;
_clipSegmentation = clipSegmentation;
_options = options.Value;
_logger = logger;
@@ -31,25 +34,71 @@ public ClipProcessingPipeline(
public async Task<ClipProcessingResult> ExecuteAsync(ClipProcessingContext context, CancellationToken cancellationToken)
{
- var normalizedPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken);
var language = !string.IsNullOrWhiteSpace(context.Request.Language)
? context.Request.Language!
: !string.IsNullOrWhiteSpace(context.StationProfile.Transcription.Language)
? context.StationProfile.Transcription.Language
: _options.DefaultTranscriptLanguage;
- var transcriptionRequest = new SpeechTranscriptionRequest
+
+ var provider = context.StationProfile.Transcription.Provider ?? "azure_speech";
+ IReadOnlyList<TimestampedTranscript> segments;
+ string workingPath;
+
+ if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase))
{
- Language = language,
- EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization,
- SpeakerCount = context.StationProfile.Transcription.MaxSpeakers,
- DiarizationMode = context.StationProfile.Transcription.DiarizationMode
- };
+ // Use Azure Video Indexer - upload original file directly (no normalization needed)
+ if (_videoIndexerClient == null)
+ throw new InvalidOperationException("Video Indexer client is not configured but provider is set to azure_video_indexer");
+
+ workingPath = context.SourcePath;
+ var personModelId = ResolvePersonModelId(context.StationProfile.Transcription);
+
+ _logger.LogInformation("Using Video Indexer provider (PersonModel: {PersonModel})", personModelId ?? "none");
+
+ var viRequest = new VideoIndexerRequest
+ {
+ Language = language,
+ PersonModelId = personModelId,
+ IncludeSpeakerLabels = context.StationProfile.Transcription.IncludeSpeakerLabels
+ };
+
+ segments = await _videoIndexerClient.TranscribeAsync(context.SourcePath, viRequest, cancellationToken);
+ }
+ else
+ {
+ // Use Azure Speech (default) - requires audio normalization
+ _logger.LogInformation("Using Azure Speech provider");
+
+ workingPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken);
+ var transcriptionRequest = new SpeechTranscriptionRequest
+ {
+ Language = language,
+ EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization,
+ SpeakerCount = context.StationProfile.Transcription.MaxSpeakers,
+ DiarizationMode = context.StationProfile.Transcription.DiarizationMode
+ };
+
+ segments = await _speechTranscriber.TranscribeAsync(workingPath, transcriptionRequest, cancellationToken);
+ }
- var segments = await _speechTranscriber.TranscribeAsync(normalizedPath, transcriptionRequest, cancellationToken);
var segmentationSettings = BuildSegmentationSettings(context.StationProfile);
var clipDefinitions = await _clipSegmentation.GenerateClipsAsync(segments, segmentationSettings, cancellationToken);
- return new ClipProcessingResult(normalizedPath, language, segments, clipDefinitions, segmentationSettings);
+ return new ClipProcessingResult(workingPath, language, segments, clipDefinitions, segmentationSettings);
+ }
+
+ /// <summary>
+ /// Resolves the Person Model ID from station profile configuration.
+ /// </summary>
+ private static string? ResolvePersonModelId(StationTranscriptionProfile transcription)
+ {
+ if (string.IsNullOrWhiteSpace(transcription.PersonModelKey))
+ return null;
+
+ if (transcription.PersonModels.TryGetValue(transcription.PersonModelKey, out var modelId))
+ return modelId;
+
+ return null;
}
private static ClipSegmentationSettings BuildSegmentationSettings(StationProfile profile)
diff --git a/services/net/auto-clipper/README.md b/services/net/auto-clipper/README.md
index 171d2ad5f..8b8654c67 100644
--- a/services/net/auto-clipper/README.md
+++ b/services/net/auto-clipper/README.md
@@ -1,27 +1,41 @@
# AutoClipper Service
-The AutoClipper service consumes clip requests from Kafka, normalizes audio, transcribes it with Azure Speech, and
-segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. Key concepts:
+The AutoClipper service consumes clip requests from Kafka, transcribes media using Azure Speech or Azure Video Indexer,
+and segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. Key concepts:
-- **Station profiles** (Config/Stations/\*.yml) define language, sample rate, heuristic keywords, custom prompts, and
- category mappings for weather/traffic/ads.
-- **Pipeline** (ClipProcessingPipeline) normalizes audio, transcribes via AzureSpeechTranscriptionService, and feeds
- transcripts plus station config into ClipSegmentationService.
+- **Station profiles** (Config/Stations/\*.yml) define transcription provider, language, sample rate, heuristic keywords,
+ custom prompts, and category mappings for weather/traffic/ads.
+- **Transcription providers**:
+ - `azure_speech` (default) - Fast batch transcription, outputs plain text.
+ - `azure_video_indexer` - Supports speaker identification (speaker1:, speaker2:, or named via Person Model).
+- **Pipeline** (ClipProcessingPipeline) selects the transcription provider based on station config, transcribes the media,
+ and feeds transcripts plus station config into ClipSegmentationService.
- **Segmentation** uses Azure OpenAI to score story boundaries, merges in regex-based heuristics, snaps clips to transcript
sentences, and tags each clip with a category before AutoClipperManager creates content and uploads the media.
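+
+A station opts into a provider through its YAML profile. A minimal sketch (field names follow the CHAN.yml added in this PR; the Person Model ID is a placeholder):
+
+```yaml
+transcription:
+  provider: azure_video_indexer # or azure_speech (default)
+  language: en-CA
+  include_speaker_labels: true
+  # Optional named-speaker identification via a Person Model:
+  # person_models:
+  #   news: "your-model-id-here"
+  # person_model_key: news
+```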
## Development
-1. Update station YAMLs under Config/Stations (copy CKNW.yml as a starting point).
-2. Run dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj to verify changes.
-3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample audio.
+1. Update station YAMLs under Config/Stations (copy CKNW.yml for Azure Speech, CHAN.yml for Video Indexer).
+2. Run `dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj` to verify changes.
+3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample media.
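+
+For a quick local run against the Video Indexer provider, something like the following works (file name, station, and output path are illustrative; keys are read from the harness `.env`):
+
+```bash
+AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer \
+AUTOCLIP_HARNESS_STATION=CHAN \
+dotnet run --project tools/auto-clipper-harness -- sample.mp4 en-CA ./out
+```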
## Configuration
-Important Service\_\_ env vars:
+### Azure Speech (default provider)
-- Service__AzureSpeechKey / Service__AzureSpeechRegion
-- Service__AzureSpeechStorageConnectionString / Service__AzureSpeechStorageContainer (batch upload destination for Azure Speech).
-- Service__AzureSpeechBatchEndpoint, Service__AzureSpeechBatchApiVersion, Service__AzureSpeechBatchPollingIntervalSeconds, Service__AzureSpeechBatchTimeoutMinutes, Service__AzureSpeechStorageSasExpiryMinutes (optional batch tuning).
-- Service__LlmApiUrl, Service__LlmApiKey, Service__LlmDeployment, Service__LlmApiVersion
-- Service\_\_StationConfigPath (optional override for station YAML directory)
+- Service\_\_AzureSpeechKey / Service\_\_AzureSpeechRegion
+- Service\_\_AzureSpeechStorageConnectionString / Service\_\_AzureSpeechStorageContainer (batch upload destination)
+- Service\_\_AzureSpeechBatchEndpoint, Service\_\_AzureSpeechBatchApiVersion, Service\_\_AzureSpeechBatchPollingIntervalSeconds, Service\_\_AzureSpeechBatchTimeoutMinutes, Service\_\_AzureSpeechStorageSasExpiryMinutes (optional batch tuning)
+
+### Azure Video Indexer (optional, for speaker identification)
+
+- Service\_\_AzureVideoIndexerAccountId - Your Video Indexer account ID
+- Service\_\_AzureVideoIndexerLocation - Account location (e.g., `trial`, `eastus`)
+- Service\_\_AzureVideoIndexerApiKey - API subscription key
+- Service\_\_AzureVideoIndexerTimeoutMinutes (default: 60) - Max wait time for processing
+- Service\_\_AzureVideoIndexerPollingIntervalSeconds (default: 30) - Status check interval
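+
+These map to the `azure-video-indexer` secret, which each overlay's secretGenerator builds from a `video-indexer.env` file. A minimal sketch (placeholder values; the key names match the `secretKeyRef` entries in deploy.yaml):
+
+```
+AZURE_VIDEO_INDEXER_ACCOUNT_ID=00000000-0000-0000-0000-000000000000
+AZURE_VIDEO_INDEXER_API_KEY=your-subscription-key
+AZURE_VIDEO_INDEXER_LOCATION=trial
+```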
+
+### LLM & General
+
+- Service\_\_LlmApiUrl, Service\_\_LlmApiKey, Service\_\_LlmDeployment, Service\_\_LlmApiVersion
+- Service\_\_StationConfigPath (optional override for station YAML directory)
\ No newline at end of file
diff --git a/services/net/auto-clipper/appsettings.json b/services/net/auto-clipper/appsettings.json
index 4bf3fbc1e..1f0a93ff7 100644
--- a/services/net/auto-clipper/appsettings.json
+++ b/services/net/auto-clipper/appsettings.json
@@ -53,6 +53,12 @@
"AzureSpeechStorageSasExpiryMinutes": 180,
"DefaultTranscriptLanguage": "en-US",
+ "AzureVideoIndexerAccountId": "",
+ "AzureVideoIndexerLocation": "trial",
+ "AzureVideoIndexerApiKey": "",
+ "AzureVideoIndexerTimeoutMinutes": 60,
+ "AzureVideoIndexerPollingIntervalSeconds": 30,
+
"LlmApiUrl": "https://mmiopenai.cognitiveservices.azure.com/",
"LlmApiKey": "",
"LlmDefaultModel": "",
diff --git a/tools/auto-clipper-harness/.env.sample b/tools/auto-clipper-harness/.env.sample
index d90a25757..7bc0c53c0 100644
--- a/tools/auto-clipper-harness/.env.sample
+++ b/tools/auto-clipper-harness/.env.sample
@@ -1,4 +1,14 @@
# TEMP HARNESS env file. Delete along with this harness when done.
+
+# === Transcription Provider Selection ===
+# Options: azure_speech | azure_video_indexer
+AUTOCLIP_HARNESS_PROVIDER=azure_speech
+
+# === Output Options ===
+# Include speaker labels in transcript output (true/false)
+AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS=false
+
+# === Azure Speech Configuration ===
AUTOCLIP_HARNESS_SPEECH_KEY=
AUTOCLIP_HARNESS_SPEECH_REGION=canadacentral
AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING=
@@ -8,12 +18,33 @@ AUTOCLIP_HARNESS_BATCH_ENDPOINT=
AUTOCLIP_HARNESS_BATCH_VERSION=v3.2
AUTOCLIP_HARNESS_BATCH_POLL_SECONDS=10
AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES=45
+
+# === Azure Video Indexer Configuration ===
+# Required when AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer
+AUTOCLIP_HARNESS_VI_ACCOUNT_ID=
+AUTOCLIP_HARNESS_VI_LOCATION=trial
+AUTOCLIP_HARNESS_VI_API_KEY=
+AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES=60
+AUTOCLIP_HARNESS_VI_POLL_SECONDS=30
+
+# Multiple Person Models (JSON dictionary format)
+# Keys are usage identifiers, values are Azure Person Model IDs
+# Example: {"news":"abc123-news-model","sports":"def456-sports-model","default":"ghi789-general"}
+AUTOCLIP_HARNESS_VI_PERSON_MODELS={}
+
+# Current Person Model key to use (corresponds to a key in the dictionary above)
+# Leave empty to skip Person Model identification (will output speaker1, speaker2, etc.)
+AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY=
+
+# === LLM Configuration ===
# Provide either the full chat-completions endpoint or the base resource URL.
AUTOCLIP_HARNESS_LLM_URL=https://your-resource.openai.azure.com
AUTOCLIP_HARNESS_LLM_KEY=
AUTOCLIP_HARNESS_LLM_DEPLOYMENT=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_MODEL=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_VERSION=2024-07-18
+
+# === General Settings ===
AUTOCLIP_HARNESS_LANGUAGE=en-US
AUTOCLIP_HARNESS_MAX_STORIES=5
diff --git a/tools/auto-clipper-harness/.gitignore b/tools/auto-clipper-harness/.gitignore
index 91b68b99a..9a8771077 100644
--- a/tools/auto-clipper-harness/.gitignore
+++ b/tools/auto-clipper-harness/.gitignore
@@ -1,2 +1,26 @@
**/output/
-**/input/
\ No newline at end of file
+**/input/
+**/auto-clipper-harness-output/
+**/*.mp4
+**/*.avi
+**/*.mkv
+**/*.mov
+**/*.flv
+**/*.wmv
+**/*.webm
+**/*.m4v
+**/*.m4a
+**/*.m4b
+**/*.m4p
+**/*.mp3
+**/*.ogg
+**/*.wav
diff --git a/tools/auto-clipper-harness/Program.cs b/tools/auto-clipper-harness/Program.cs
index 1892d3ed3..539ecc664 100644
--- a/tools/auto-clipper-harness/Program.cs
+++ b/tools/auto-clipper-harness/Program.cs
@@ -1,8 +1,10 @@
using System.Text;
using System.Linq;
+using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
+using AutoClipperHarness;
using TNO.Services.AutoClipper.Audio;
using TNO.Services.AutoClipper.Azure;
using TNO.Services.AutoClipper.Config;
@@ -30,7 +32,13 @@
}
var outputDir = args.Length > 2 ? args[2] : Path.Combine(Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".", "auto-clipper-harness-output");
+Console.WriteLine($"[HARNESS] Output directory: {outputDir}");
Directory.CreateDirectory(outputDir);
+if (!Directory.Exists(outputDir))
+{
+ Console.WriteLine($"[HARNESS] ERROR: Failed to create output directory: {outputDir}");
+ return;
+}
using var loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole(o => o.TimestampFormat = "HH:mm:ss "));
var stationCode = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STATION") ?? "CKNW";
@@ -46,8 +54,6 @@
var sampleRate = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_SAMPLE_RATE"), out var sr)
? sr
: (stationProfile.Transcription.SampleRate > 0 ? stationProfile.Transcription.SampleRate : 16000);
-var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger<AudioNormalizer>());
-var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
var llmBaseUrl = RequireEnv("AUTOCLIP_HARNESS_LLM_URL").Trim();
var llmDeployment = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_DEPLOYMENT");
var llmVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_VERSION") ?? "2024-07-18";
@@ -57,45 +63,107 @@
? (!string.IsNullOrWhiteSpace(llmDeployment) ? llmDeployment : "gpt-4o-mini")
: llmModel;
-
-
-var options = Options.Create(new AutoClipperOptions
+// Create LLM options (shared by both providers)
+var llmOptions = Options.Create(new AutoClipperOptions
{
- AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"),
- AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"),
- AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty,
- AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ?? "v3.2",
- AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10,
- AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45,
- AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"),
- AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"),
- AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180, LlmApiUrl = llmEndpoint,
-
+ LlmApiUrl = llmEndpoint,
LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"),
-
LlmDefaultModel = defaultModel,
LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT")
?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt),
- MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5,
+ MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStoriesLlm) ? maxStoriesLlm : 5,
VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".",
DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US"
});
-var speechLogger = loggerFactory.CreateLogger<AzureSpeechTranscriptionService>();
var llmLogger = loggerFactory.CreateLogger<ClipSegmentationService>();
-var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger);
-var llmService = new ClipSegmentationService(new HttpClient(), options, llmLogger);
-var transcriptionRequest = new SpeechTranscriptionRequest
+var llmService = new ClipSegmentationService(new HttpClient(), llmOptions, llmLogger);
+
+// Determine provider and output options (early detection to avoid unnecessary initialization)
+var provider = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROVIDER") ?? "azure_speech";
+var includeSpeakerLabels = bool.TryParse(
+ Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS"),
+ out var isl) && isl;
+
+// Transcribe using selected provider
+IReadOnlyList<TranscriptSegment> transcriptSegments;
+IReadOnlyList<TimestampedTranscript> segments;
+
+if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase))
+{
+ // Use Azure Video Indexer - no audio normalization needed, upload original file directly
+ var personModelId = ResolvePersonModelId();
+ Console.WriteLine($"[HARNESS] Using Video Indexer (PersonModel: {personModelId ?? "none"})");
+ Console.WriteLine($"[HARNESS] Uploading original file: {input}");
+
+ var viClient = new VideoIndexerClient(
+ new HttpClient(),
+ RequireEnv("AUTOCLIP_HARNESS_VI_ACCOUNT_ID"),
+ Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_LOCATION") ?? "trial",
+ RequireEnv("AUTOCLIP_HARNESS_VI_API_KEY"),
+ int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES"), out var viTimeout) ? viTimeout : 60,
+ int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_POLL_SECONDS"), out var viPoll) ? viPoll : 30,
+ loggerFactory.CreateLogger<VideoIndexerClient>());
+
+ var viRequest = new VideoIndexerRequest
+ {
+ Language = language,
+ PersonModelId = personModelId,
+ IncludeSpeakerLabels = includeSpeakerLabels
+ };
+
+ var rawJsonPath = Path.Combine(outputDir, "video_indexer_raw_response.json");
+ transcriptSegments = await viClient.TranscribeAsync(input, viRequest, CancellationToken.None, rawJsonPath);
+ Console.WriteLine($"[HARNESS] Raw Video Indexer response -> {rawJsonPath}");
+
+ // Convert to TimestampedTranscript for downstream compatibility
+ segments = transcriptSegments.Select(s => new TimestampedTranscript(s.Start, s.End, s.Text)).ToList();
+}
+else
{
- Language = language,
- EnableSpeakerDiarization = stationProfile.Transcription.Diarization,
- SpeakerCount = stationProfile.Transcription.MaxSpeakers,
- DiarizationMode = stationProfile.Transcription.DiarizationMode
-};
-
-Console.WriteLine($"[HARNESS] Transcribing {workingFile} ...");
-var segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None);
+ // Use Azure Speech (default) - requires audio normalization
+ var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger<AudioNormalizer>());
+ var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
+
+ var options = Options.Create(new AutoClipperOptions
+ {
+ AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"),
+ AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"),
+ AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty,
+ AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ?? "v3.2",
+ AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10,
+ AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45,
+ AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"),
+ AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"),
+ AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180,
+ LlmApiUrl = llmEndpoint,
+ LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"),
+ LlmDefaultModel = defaultModel,
+ LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT")
+ ?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt),
+ MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5,
+ VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".",
+ DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US"
+ });
+
+ var speechLogger = loggerFactory.CreateLogger<AzureSpeechTranscriptionService>();
+ var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger);
+ var transcriptionRequest = new SpeechTranscriptionRequest
+ {
+ Language = language,
+ EnableSpeakerDiarization = stationProfile.Transcription.Diarization,
+ SpeakerCount = stationProfile.Transcription.MaxSpeakers,
+ DiarizationMode = stationProfile.Transcription.DiarizationMode
+ };
+
+ Console.WriteLine($"[HARNESS] Transcribing with Azure Speech: {workingFile} ...");
+ segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None);
+
+ // Convert to TranscriptSegment for speaker label support
+ transcriptSegments = segments.Select(s => new TranscriptSegment(s.Start, s.End, s.Text)).ToList();
+}
+
Console.WriteLine($"[HARNESS] Received {segments.Count} transcript segments");
-var fullTranscriptBody = BuildTranscriptDocument(segments);
+var fullTranscriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSegments, includeSpeakerLabels);
var fullTranscriptPath = Path.Combine(outputDir, "transcript_full.txt");
await File.WriteAllTextAsync(fullTranscriptPath, fullTranscriptBody ?? string.Empty);
Console.WriteLine($"[HARNESS] Full transcript -> {fullTranscriptPath}");
@@ -118,8 +186,8 @@
continue;
}
- var transcriptSlice = ExtractTranscriptRange(segments, normalized.Start, normalized.End);
- var transcriptBody = BuildTranscriptDocument(transcriptSlice);
+ var transcriptSlice = ExtractTranscriptSegmentRange(transcriptSegments, normalized.Start, normalized.End);
+ var transcriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSlice, includeSpeakerLabels);
if (string.IsNullOrWhiteSpace(transcriptBody))
{
Console.WriteLine($"[HARNESS] Empty transcript for clip {definition.Title}");
@@ -378,6 +446,60 @@ static void LoadEnvFile(string path)
static IReadOnlyList<TimestampedTranscript> ExtractTranscriptRange(IReadOnlyList<TimestampedTranscript> segments, TimeSpan start, TimeSpan end)
=> segments.Where(s => s.End > start && s.Start < end).ToArray();
+static IReadOnlyList<TranscriptSegment> ExtractTranscriptSegmentRange(IReadOnlyList<TranscriptSegment> segments, TimeSpan start, TimeSpan end)
+ => segments.Where(s => s.End > start && s.Start < end).ToArray();
+
+static string? ResolvePersonModelId()
+{
+ var personModelsJson = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODELS");
+ var personModelKey = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY");
+
+ if (string.IsNullOrWhiteSpace(personModelsJson) || string.IsNullOrWhiteSpace(personModelKey))
+ return null;
+
+ try
+ {
+ var personModels = JsonSerializer.Deserialize<Dictionary<string, string>>(personModelsJson);
+ if (personModels != null && personModels.TryGetValue(personModelKey, out var modelId))
+ {
+ return modelId;
+ }
+ }
+ catch (JsonException)
+ {
+ Console.WriteLine($"[HARNESS] Warning: Failed to parse AUTOCLIP_HARNESS_VI_PERSON_MODELS as JSON");
+ }
+
+ return null;
+}
+
+static string BuildTranscriptDocumentWithSpeakers(IReadOnlyList<TranscriptSegment> segments, bool includeSpeakerLabels)
+{
+ if (segments == null || segments.Count == 0) return string.Empty;
+ var sb = new StringBuilder();
+ var idx = 1;
+ foreach (var segment in segments)
+ {
+ if (string.IsNullOrWhiteSpace(segment.Text)) continue;
+
+ sb.AppendLine(idx.ToString());
+ sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}");
+
+ if (includeSpeakerLabels && (segment.SpeakerName != null || segment.SpeakerId != null))
+ {
+ var label = segment.SpeakerName ?? $"speaker{segment.SpeakerId}";
+ sb.AppendLine($"{label}: {segment.Text.Trim()}");
+ }
+ else
+ {
+ sb.AppendLine(segment.Text.Trim());
+ }
+ sb.AppendLine();
+ idx++;
+ }
+ return sb.ToString().Trim();
+}
+
diff --git a/tools/auto-clipper-harness/README.md b/tools/auto-clipper-harness/README.md
index 933223301..45667630a 100644
--- a/tools/auto-clipper-harness/README.md
+++ b/tools/auto-clipper-harness/README.md
@@ -1,23 +1,48 @@
# AutoClipper Harness
-The harness is a standalone console app that mirrors the AutoClipper pipeline for manual validation. It
-normalizes a local media file, runs Azure Speech transcription, feeds the transcript and station heuristics to the
-segmenter, and writes clips/transcripts/prompt debug files for inspection.
+Standalone console app for local testing of the AutoClipper pipeline.
## Usage
-`dotnet run --project tools/auto-clipper-harness -- <input> [language] [outputDir]`
+```bash
+dotnet run --project tools/auto-clipper-harness -- <input> [language] [outputDir]
+```
-- Configure Azure keys and LLM settings via .env (see .env.sample).
-- Provide AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING / AUTOCLIP_HARNESS_STORAGE_CONTAINER so the harness can upload audio for Azure batch transcription.
-- Optional overrides: AUTOCLIP_HARNESS_BATCH_ENDPOINT, \_BATCH_VERSION, \_BATCH_POLL_SECONDS, \_BATCH_TIMEOUT_MINUTES, and \_STORAGE_SAS_MINUTES.
-- Station profiles are loaded from services/net/auto-clipper/Config/Stations by default; override with
- AUTOCLIP_HARNESS_STATION_PATH / AUTOCLIP_HARNESS_STATION.
- Outputs: clip_XX.\* media slices, clip_XX.txt transcripts, transcript_full.txt, and
- llm_prompt_debug.txt (shows numbered transcript, heuristics, and the final prompt).
+## Configuration
+
+Copy `.env.sample` to `.env` and fill in your keys.
+
+### Provider Selection
+
+```bash
+AUTOCLIP_HARNESS_PROVIDER=azure_speech # default
+AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer # with speaker identification
+```
+
+### Azure Speech (default)
+
+- `AUTOCLIP_HARNESS_SPEECH_KEY` / `AUTOCLIP_HARNESS_SPEECH_REGION`
+- `AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING` / `AUTOCLIP_HARNESS_STORAGE_CONTAINER`
+
+### Azure Video Indexer
+
+- `AUTOCLIP_HARNESS_VI_ACCOUNT_ID` / `AUTOCLIP_HARNESS_VI_API_KEY`
+- `AUTOCLIP_HARNESS_VI_LOCATION` (default: trial)
+- `AUTOCLIP_HARNESS_VI_PERSON_MODELS` - JSON map for multiple Person Models
+- `AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY` - which model to use
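+
+For example, a `.env` that selects a hypothetical "news" Person Model (IDs are placeholders, mirroring `.env.sample`):
+
+```bash
+AUTOCLIP_HARNESS_VI_PERSON_MODELS={"news":"abc123-news-model","default":"ghi789-general"}
+AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY=news
+```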
+
+### LLM
+
+- `AUTOCLIP_HARNESS_LLM_URL` / `AUTOCLIP_HARNESS_LLM_KEY`
+
+## Output
+
+- `clip_XX.mp4` / `clip_XX.txt` - segmented clips with transcripts
+- `transcript_full.txt` - full transcript (with speaker labels if enabled)
+- `video_indexer_raw_response.json` - raw API response (Video Indexer only)
+- `llm_prompt_debug.txt` - LLM prompt for debugging
## Notes
-- The harness shares the segmentation logic with the service, so any changes in ClipSegmentationService
- should be validated here first.
-- Ensure ffmpeg is available on PATH; the harness shells out to ffmpeg to produce media clips.
+- Video Indexer uploads original media directly; Azure Speech requires WAV conversion
+- Requires `ffmpeg` on PATH for clip extraction
diff --git a/tools/auto-clipper-harness/TranscriptSegment.cs b/tools/auto-clipper-harness/TranscriptSegment.cs
new file mode 100644
index 000000000..66d18adc1
--- /dev/null
+++ b/tools/auto-clipper-harness/TranscriptSegment.cs
@@ -0,0 +1,13 @@
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Represents a transcript segment with optional speaker identification.
+/// This is the harness-local version; will be merged into TimestampedTranscript in phase 2.
+/// </summary>
+public record TranscriptSegment(
+ TimeSpan Start,
+ TimeSpan End,
+ string Text,
+ int? SpeakerId = null,
+ string? SpeakerName = null
+);
diff --git a/tools/auto-clipper-harness/VideoIndexerClient.cs b/tools/auto-clipper-harness/VideoIndexerClient.cs
new file mode 100644
index 000000000..ddf09d61e
--- /dev/null
+++ b/tools/auto-clipper-harness/VideoIndexerClient.cs
@@ -0,0 +1,398 @@
+using System.Net.Http.Headers;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Client for Azure Video Indexer API.
+/// Handles video upload, processing, and transcript extraction with speaker identification.
+/// </summary>
+public class VideoIndexerClient
+{
+ private const string ApiBaseUrl = "https://api.videoindexer.ai";
+
+ private readonly HttpClient _httpClient;
+ private readonly string _accountId;
+ private readonly string _location;
+ private readonly string _apiKey;
+ private readonly int _timeoutMinutes;
+ private readonly int _pollIntervalSeconds;
+ private readonly ILogger? _logger;
+
+ private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
+ {
+ PropertyNameCaseInsensitive = true
+ };
+
+ public VideoIndexerClient(
+ HttpClient httpClient,
+ string accountId,
+ string location,
+ string apiKey,
+ int timeoutMinutes = 60,
+ int pollIntervalSeconds = 30,
+ ILogger? logger = null)
+ {
+ _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
+ _accountId = accountId ?? throw new ArgumentNullException(nameof(accountId));
+ _location = location ?? throw new ArgumentNullException(nameof(location));
+ _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey));
+ _timeoutMinutes = timeoutMinutes > 0 ? timeoutMinutes : 60;
+ _pollIntervalSeconds = pollIntervalSeconds > 0 ? pollIntervalSeconds : 30;
+ _logger = logger;
+ }
+
+ /// <summary>
+ /// Transcribes a media file using Azure Video Indexer.
+ /// </summary>
+ /// <param name="filePath">Path to the media file (video or audio).</param>
+ /// <param name="request">Transcription request options.</param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <param name="rawJsonOutputPath">Optional path to save the raw Video Indexer JSON response for debugging.</param>
+ /// <returns>List of transcript segments with optional speaker information.</returns>
+ public async Task<IReadOnlyList<TranscriptSegment>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default,
+ string? rawJsonOutputPath = null)
+ {
+ if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
+ throw new FileNotFoundException("Media file not found", filePath);
+
+ _logger?.LogInformation("Starting Video Indexer transcription for {File}", filePath);
+
+ // Step 1: Get access token
+ var accessToken = await GetAccessTokenAsync(cancellationToken);
+ _logger?.LogDebug("Obtained access token");
+
+ // Step 2: Upload video
+ var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken);
+ _logger?.LogInformation("Video uploaded with ID: {VideoId}", videoId);
+
+ try
+ {
+ // Step 3: Wait for processing to complete
+ var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken);
+ _logger?.LogInformation("Video processing completed");
+
+ // Save raw JSON for debugging if path is provided
+ if (!string.IsNullOrWhiteSpace(rawJsonOutputPath))
+ {
+ try
+ {
+ // Ensure directory exists
+ var dir = Path.GetDirectoryName(rawJsonOutputPath);
+ if (!string.IsNullOrWhiteSpace(dir))
+ Directory.CreateDirectory(dir);
+
+ // Pretty print the JSON
+ using var doc = JsonDocument.Parse(indexJson);
+ var prettyJson = JsonSerializer.Serialize(doc, new JsonSerializerOptions { WriteIndented = true });
+ await File.WriteAllTextAsync(rawJsonOutputPath, prettyJson, cancellationToken);
+ _logger?.LogInformation("Saved raw Video Indexer response to {Path}", rawJsonOutputPath);
+ }
+ catch (Exception ex)
+ {
+ _logger?.LogWarning(ex, "Failed to save raw JSON to {Path}", rawJsonOutputPath);
+ }
+ }
+
+ // Step 4: Parse transcript
+ var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels);
+ _logger?.LogInformation("Parsed {Count} transcript segments", segments.Count);
+
+ return segments;
+ }
+ finally
+ {
+ // Clean up: delete the video from Video Indexer
+ await TryDeleteVideoAsync(videoId, accessToken);
+ }
+ }
+
+ private async Task<string> GetAccessTokenAsync(CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/Auth/{_location}/Accounts/{_accountId}/AccessToken?allowEdit=true";
+
+ using var request = new HttpRequestMessage(HttpMethod.Get, url);
+ request.Headers.Add("Ocp-Apim-Subscription-Key", _apiKey);
+
+ using var response = await _httpClient.SendAsync(request, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}");
+
+ // Token is returned as a quoted string
+ return body.Trim('"');
+ }
+
+ private async Task<string> UploadVideoAsync(
+ string filePath,
+ string accessToken,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken)
+ {
+ var fileName = Path.GetFileName(filePath);
+ var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}";
+
+ // Build upload URL with query parameters
+ var queryParams = new List<string>
+ {
+ $"accessToken={Uri.EscapeDataString(accessToken)}",
+ $"name={Uri.EscapeDataString(videoName)}",
+ $"language={Uri.EscapeDataString(request.Language)}",
+ "privacy=Private",
+ "indexingPreset=AudioOnly" // We only need audio analysis for transcription
+ };
+
+ if (!string.IsNullOrWhiteSpace(request.PersonModelId))
+ {
+ queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}");
+ }
+
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos?{string.Join("&", queryParams)}";
+
+ _logger?.LogDebug("Uploading video to: {Url}", url.Split('?')[0]);
+
+ using var content = new MultipartFormDataContent();
+ await using var fileStream = File.OpenRead(filePath);
+ var fileContent = new StreamContent(fileStream);
+ fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
+ content.Add(fileContent, "file", fileName);
+
+ using var response = await _httpClient.PostAsync(url, content, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var videoId = doc.RootElement.GetProperty("id").GetString();
+
+ if (string.IsNullOrWhiteSpace(videoId))
+ throw new InvalidOperationException("Video Indexer did not return a video ID");
+
+ return videoId;
+ }
+
+ private async Task<string> WaitForProcessingAsync(
+ string videoId,
+ string accessToken,
+ CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}";
+ var timeout = TimeSpan.FromMinutes(_timeoutMinutes);
+ var pollInterval = TimeSpan.FromSeconds(_pollIntervalSeconds);
+ var startTime = DateTime.UtcNow;
+
+ while (true)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ if (DateTime.UtcNow - startTime > timeout)
+ throw new TimeoutException($"Video Indexer processing did not complete within {_timeoutMinutes} minutes");
+
+ using var response = await _httpClient.GetAsync(url, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var state = doc.RootElement.GetProperty("state").GetString();
+
+ var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds;
+ _logger?.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed);
+
+ if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase))
+ {
+ return body;
+ }
+
+ if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase))
+ {
+ var errorMessage = "Unknown error";
+ if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg))
+ errorMessage = failureMsg.GetString() ?? errorMessage;
+ throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}");
+ }
+
+ await Task.Delay(pollInterval, cancellationToken);
+ }
+ }
+
+ private IReadOnlyList<TranscriptSegment> ParseTranscript(string indexJson, bool includeSpeakerLabels)
+ {
+ var segments = new List<TranscriptSegment>();
+
+ using var doc = JsonDocument.Parse(indexJson);
+
+ // Navigate to: videos[0].insights.transcript
+ if (!doc.RootElement.TryGetProperty("videos", out var videos) ||
+ videos.GetArrayLength() == 0)
+ {
+ _logger?.LogWarning("No videos found in index response");
+ return segments;
+ }
+
+ var video = videos[0];
+ if (!video.TryGetProperty("insights", out var insights))
+ {
+ _logger?.LogWarning("No insights found in video");
+ return segments;
+ }
+
+ // Build speaker name map from faces/speakers if available
+ var speakerNames = BuildSpeakerNameMap(insights);
+
+ // Parse transcript
+ if (!insights.TryGetProperty("transcript", out var transcript) ||
+ transcript.ValueKind != JsonValueKind.Array)
+ {
+ _logger?.LogWarning("No transcript found in insights");
+ return segments;
+ }
+
+ foreach (var item in transcript.EnumerateArray())
+ {
+ var text = item.TryGetProperty("text", out var textProp) ? textProp.GetString() : null;
+ if (string.IsNullOrWhiteSpace(text))
+ continue;
+
+ // Parse timestamps from instances array
+ // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] }
+ var start = TimeSpan.Zero;
+ var end = TimeSpan.Zero;
+
+ if (item.TryGetProperty("instances", out var instances) &&
+ instances.ValueKind == JsonValueKind.Array &&
+ instances.GetArrayLength() > 0)
+ {
+ var firstInstance = instances[0];
+ start = ParseTimestamp(firstInstance, "start");
+ end = ParseTimestamp(firstInstance, "end");
+ }
+
+ if (end <= start)
+ end = start + TimeSpan.FromMilliseconds(100);
+
+ // Parse speaker information
+ int? speakerId = null;
+ string? speakerName = null;
+
+ if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp))
+ {
+ speakerId = speakerIdProp.ValueKind == JsonValueKind.Number
+ ? speakerIdProp.GetInt32()
+ : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null;
+
+ if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name))
+ {
+ speakerName = name;
+ }
+ }
+
+ segments.Add(new TranscriptSegment(start, end, text.Trim(), speakerId, speakerName));
+ }
+
+ return segments.OrderBy(s => s.Start).ToList();
+ }
+
+ private Dictionary<int, string> BuildSpeakerNameMap(JsonElement insights)
+ {
+ var map = new Dictionary<int, string>();
+
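+ // Illustrative (assumed) shape of the insights being read below:
+ //   "faces":    [{ "id": 1, "name": "Jane Doe", ... }]
+ //   "speakers": [{ "id": 1, "name": "Speaker #1", ... }]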
+ // Try to get speaker names from faces (Person Model identification)
+ if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var face in faces.EnumerateArray())
+ {
+ if (!face.TryGetProperty("id", out var idProp))
+ continue;
+
+ var faceId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!faceId.HasValue)
+ continue;
+
+ // Get name - could be from Person Model or Unknown
+ var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+
+ // Skip unknown faces - we'll use speaker ID instead
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase))
+ {
+ map[faceId.Value] = name;
+ }
+ }
+ }
+
+ // Also check speakers section
+ if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var speaker in speakers.EnumerateArray())
+ {
+ if (!speaker.TryGetProperty("id", out var idProp))
+ continue;
+
+ var speakerId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!speakerId.HasValue || map.ContainsKey(speakerId.Value))
+ continue;
+
+ var name = speaker.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) &&
+ !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase))
+ {
+ map[speakerId.Value] = name;
+ }
+ }
+ }
+
+ return map;
+ }
+
+ private static TimeSpan ParseTimestamp(JsonElement element, string property)
+ {
+ if (!element.TryGetProperty(property, out var prop))
+ return TimeSpan.Zero;
+
+ var value = prop.GetString();
+ if (string.IsNullOrWhiteSpace(value))
+ return TimeSpan.Zero;
+
+ // Video Indexer uses format like "0:00:05.12" or "00:00:05.12"
+ if (TimeSpan.TryParse(value, out var ts))
+ return ts;
+
+ // Try parsing as seconds
+ if (double.TryParse(value, out var seconds))
+ return TimeSpan.FromSeconds(seconds);
+
+ return TimeSpan.Zero;
+ }
+
+ private async Task TryDeleteVideoAsync(string videoId, string accessToken)
+ {
+ try
+ {
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}";
+ using var response = await _httpClient.DeleteAsync(url);
+ if (response.IsSuccessStatusCode)
+ {
+ _logger?.LogDebug("Deleted video {VideoId} from Video Indexer", videoId);
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger?.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId);
+ }
+ }
+}
diff --git a/tools/auto-clipper-harness/VideoIndexerRequest.cs b/tools/auto-clipper-harness/VideoIndexerRequest.cs
new file mode 100644
index 000000000..a24d54518
--- /dev/null
+++ b/tools/auto-clipper-harness/VideoIndexerRequest.cs
@@ -0,0 +1,23 @@
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Request options for Azure Video Indexer transcription.
+/// </summary>
+public class VideoIndexerRequest
+{
+ /// <summary>
+ /// Language code for transcription (e.g., "en-US", "zh-CN").
+ /// </summary>
+ public string Language { get; init; } = "en-US";
+
+ /// <summary>
+ /// Optional Person Model ID for speaker identification.
+ /// When provided, Video Indexer will attempt to identify known faces.
+ /// </summary>
+ public string? PersonModelId { get; init; }
+
+ /// <summary>
+ /// Whether to include speaker labels in the output.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; init; } = true;
+}