diff --git a/openshift/kustomize/services/auto-clipper/base/deploy.yaml b/openshift/kustomize/services/auto-clipper/base/deploy.yaml
index 10f68f2c7..d21848891 100644
--- a/openshift/kustomize/services/auto-clipper/base/deploy.yaml
+++ b/openshift/kustomize/services/auto-clipper/base/deploy.yaml
@@ -160,6 +160,26 @@ spec:
name: azure-openai
key: AZURE_OPENAI_KEY
+ # Azure Video Indexer Configuration (optional - for speaker identification)
+ - name: Service__AzureVideoIndexerAccountId
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_ACCOUNT_ID
+ optional: true
+ - name: Service__AzureVideoIndexerApiKey
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_API_KEY
+ optional: true
+ - name: Service__AzureVideoIndexerLocation
+ valueFrom:
+ secretKeyRef:
+ name: azure-video-indexer
+ key: AZURE_VIDEO_INDEXER_LOCATION
+ optional: true
+
# Service Configuration
- name: Service__MaxFailLimit
valueFrom:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
index 4ca71785f..59ff13d76 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
index d783baf42..682fa26d8 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
index 036910c0c..d8b5bcce9 100644
--- a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
+++ b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml
@@ -16,6 +16,9 @@ secretGenerator:
- name: azure-openai
type: stringData
env: openai.env
+ - name: azure-video-indexer
+ type: stringData
+ env: video-indexer.env
patches:
- target:
diff --git a/services/net/auto-clipper/AutoClipperManager.cs b/services/net/auto-clipper/AutoClipperManager.cs
index b05a11d29..07de0f0e5 100644
--- a/services/net/auto-clipper/AutoClipperManager.cs
+++ b/services/net/auto-clipper/AutoClipperManager.cs
@@ -635,23 +635,32 @@ private static void CleanupTemporaryFiles(bool isSyncedToS3, params string[] fil
/// <summary>
/// Format the transcript to include newlines.
+ /// When speaker information is available (from Video Indexer), includes speaker prefix.
+ /// Azure Speech transcripts have no speaker info and will output plain text.
/// </summary>
- /// <param name="segments"></param>
- /// <returns></returns>
+ /// <param name="segments">Transcript segments with optional speaker information.</param>
+ /// <returns>Formatted transcript string.</returns>
private static string BuildTranscriptDocument(IReadOnlyList<TimestampedTranscript> segments)
{
if (segments == null || segments.Count == 0) return string.Empty;
var sb = new StringBuilder();
- var index = 1;
foreach (var segment in segments)
{
if (string.IsNullOrWhiteSpace(segment.Text)) continue;
- // sb.AppendLine(index.ToString(CultureInfo.InvariantCulture));
- // sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}");
+
+ // Add speaker prefix if available (Video Indexer provides this)
+ if (!string.IsNullOrWhiteSpace(segment.SpeakerName))
+ {
+ sb.Append($"{segment.SpeakerName}: ");
+ }
+ else if (segment.SpeakerId.HasValue)
+ {
+ sb.Append($"speaker{segment.SpeakerId}: ");
+ }
+
sb.AppendLine(segment.Text.Trim());
sb.AppendLine();
- index++;
}
return sb.ToString().Trim();
diff --git a/services/net/auto-clipper/AutoClipperService.cs b/services/net/auto-clipper/AutoClipperService.cs
index 843a40853..ed18540e5 100644
--- a/services/net/auto-clipper/AutoClipperService.cs
+++ b/services/net/auto-clipper/AutoClipperService.cs
@@ -57,6 +57,14 @@ protected override IServiceCollection ConfigureServices(IServiceCollection servi
services.AddSingleton();
services.AddHttpClient();
+ // Register Video Indexer client if configured
+ var videoIndexerAccountId = this.Configuration.GetSection("Service")["AzureVideoIndexerAccountId"];
+ var videoIndexerApiKey = this.Configuration.GetSection("Service")["AzureVideoIndexerApiKey"];
+ if (!string.IsNullOrWhiteSpace(videoIndexerAccountId) && !string.IsNullOrWhiteSpace(videoIndexerApiKey))
+ {
+ services.AddHttpClient<IAzureVideoIndexerClient, AzureVideoIndexerClient>();
+ }
+
// TODO: Figure out how to validate without resulting in aggregating the config values.
// services.AddOptions()
// .Bind(this.Configuration.GetSection("Service"))
diff --git a/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
new file mode 100644
index 000000000..354d3921d
--- /dev/null
+++ b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
@@ -0,0 +1,363 @@
+using System.Net.Http.Headers;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using TNO.Services.AutoClipper.Config;
+
+namespace TNO.Services.AutoClipper.Azure;
+
+/// <summary>
+/// Client for Azure Video Indexer API.
+/// Handles video upload, processing, and transcript extraction with speaker identification.
+/// </summary>
+public class AzureVideoIndexerClient : IAzureVideoIndexerClient
+{
+ private const string ApiBaseUrl = "https://api.videoindexer.ai";
+
+ private readonly HttpClient _httpClient;
+ private readonly AutoClipperOptions _options;
+ private readonly ILogger<AzureVideoIndexerClient> _logger;
+
+ private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
+ {
+ PropertyNameCaseInsensitive = true
+ };
+
+ public AzureVideoIndexerClient(
+ HttpClient httpClient,
+ IOptions<AutoClipperOptions> options,
+ ILogger<AzureVideoIndexerClient> logger)
+ {
+ _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
+ _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
+ _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+
+ if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerAccountId))
+ throw new ArgumentException("AzureVideoIndexerAccountId is required");
+ if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerApiKey))
+ throw new ArgumentException("AzureVideoIndexerApiKey is required");
+ }
+
+ /// <inheritdoc/>
+ public async Task<IReadOnlyList<TimestampedTranscript>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default)
+ {
+ if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
+ throw new FileNotFoundException("Media file not found", filePath);
+
+ _logger.LogInformation("Starting Video Indexer transcription for {File}", filePath);
+
+ // Step 1: Get access token
+ var accessToken = await GetAccessTokenAsync(cancellationToken);
+ _logger.LogDebug("Obtained access token");
+
+ // Step 2: Upload video
+ var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken);
+ _logger.LogInformation("Video uploaded with ID: {VideoId}", videoId);
+
+ try
+ {
+ // Step 3: Wait for processing to complete
+ var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken);
+ _logger.LogInformation("Video processing completed");
+
+ // Step 4: Parse transcript
+ var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels);
+ _logger.LogInformation("Parsed {Count} transcript segments", segments.Count);
+
+ return segments;
+ }
+ finally
+ {
+ // Clean up: delete the video from Video Indexer
+ await TryDeleteVideoAsync(videoId, accessToken);
+ }
+ }
+
+ private async Task<string> GetAccessTokenAsync(CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/Auth/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/AccessToken?allowEdit=true";
+
+ using var request = new HttpRequestMessage(HttpMethod.Get, url);
+ request.Headers.Add("Ocp-Apim-Subscription-Key", _options.AzureVideoIndexerApiKey);
+
+ using var response = await _httpClient.SendAsync(request, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}");
+
+ // Token is returned as a quoted string
+ return body.Trim('"');
+ }
+
+ private async Task<string> UploadVideoAsync(
+ string filePath,
+ string accessToken,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken)
+ {
+ var fileName = Path.GetFileName(filePath);
+ var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}";
+
+ // Build upload URL with query parameters
+ var queryParams = new List<string>
+ {
+ $"accessToken={Uri.EscapeDataString(accessToken)}",
+ $"name={Uri.EscapeDataString(videoName)}",
+ $"language={Uri.EscapeDataString(request.Language)}",
+ "privacy=Private",
+ "indexingPreset=AudioOnly" // We only need audio analysis for transcription
+ };
+
+ if (!string.IsNullOrWhiteSpace(request.PersonModelId))
+ {
+ queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}");
+ }
+
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos?{string.Join("&", queryParams)}";
+
+ _logger.LogDebug("Uploading video to: {Url}", url.Split('?')[0]);
+
+ using var content = new MultipartFormDataContent();
+ await using var fileStream = File.OpenRead(filePath);
+ var fileContent = new StreamContent(fileStream);
+ fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
+ content.Add(fileContent, "file", fileName);
+
+ using var response = await _httpClient.PostAsync(url, content, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var videoId = doc.RootElement.GetProperty("id").GetString();
+
+ if (string.IsNullOrWhiteSpace(videoId))
+ throw new InvalidOperationException("Video Indexer did not return a video ID");
+
+ return videoId;
+ }
+
+ private async Task<string> WaitForProcessingAsync(
+ string videoId,
+ string accessToken,
+ CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}";
+ var timeout = TimeSpan.FromMinutes(_options.AzureVideoIndexerTimeoutMinutes);
+ var pollInterval = TimeSpan.FromSeconds(_options.AzureVideoIndexerPollingIntervalSeconds);
+ var startTime = DateTime.UtcNow;
+
+ while (true)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ if (DateTime.UtcNow - startTime > timeout)
+ throw new TimeoutException($"Video Indexer processing did not complete within {_options.AzureVideoIndexerTimeoutMinutes} minutes");
+
+ using var response = await _httpClient.GetAsync(url, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var state = doc.RootElement.GetProperty("state").GetString();
+
+ var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds;
+ _logger.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed);
+
+ if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase))
+ {
+ return body;
+ }
+
+ if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase))
+ {
+ var errorMessage = "Unknown error";
+ if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg))
+ errorMessage = failureMsg.GetString() ?? errorMessage;
+ throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}");
+ }
+
+ await Task.Delay(pollInterval, cancellationToken);
+ }
+ }
+
+ private IReadOnlyList<TimestampedTranscript> ParseTranscript(string indexJson, bool includeSpeakerLabels)
+ {
+ var segments = new List<TimestampedTranscript>();
+
+ using var doc = JsonDocument.Parse(indexJson);
+
+ // Navigate to: videos[0].insights.transcript
+ if (!doc.RootElement.TryGetProperty("videos", out var videos) ||
+ videos.GetArrayLength() == 0)
+ {
+ _logger.LogWarning("No videos found in index response");
+ return segments;
+ }
+
+ var video = videos[0];
+ if (!video.TryGetProperty("insights", out var insights))
+ {
+ _logger.LogWarning("No insights found in video");
+ return segments;
+ }
+
+ // Build speaker name map from faces/speakers if available
+ var speakerNames = BuildSpeakerNameMap(insights);
+
+ // Parse transcript
+ if (!insights.TryGetProperty("transcript", out var transcript) ||
+ transcript.ValueKind != JsonValueKind.Array)
+ {
+ _logger.LogWarning("No transcript found in insights");
+ return segments;
+ }
+
+ foreach (var item in transcript.EnumerateArray())
+ {
+ var text = item.TryGetProperty("text", out var textProp) ? textProp.GetString() : null;
+ if (string.IsNullOrWhiteSpace(text))
+ continue;
+
+ // Parse timestamps from instances array
+ // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] }
+ var start = TimeSpan.Zero;
+ var end = TimeSpan.Zero;
+
+ if (item.TryGetProperty("instances", out var instances) &&
+ instances.ValueKind == JsonValueKind.Array &&
+ instances.GetArrayLength() > 0)
+ {
+ var firstInstance = instances[0];
+ start = ParseTimestamp(firstInstance, "start");
+ end = ParseTimestamp(firstInstance, "end");
+ }
+
+ if (end <= start)
+ end = start + TimeSpan.FromMilliseconds(100);
+
+ // Parse speaker information
+ int? speakerId = null;
+ string? speakerName = null;
+
+ if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp))
+ {
+ speakerId = speakerIdProp.ValueKind == JsonValueKind.Number
+ ? speakerIdProp.GetInt32()
+ : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null;
+
+ if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name))
+ {
+ speakerName = name;
+ }
+ }
+
+ segments.Add(new TimestampedTranscript(start, end, text.Trim(), speakerId, speakerName));
+ }
+
+ return segments.OrderBy(s => s.Start).ToList();
+ }
+
+ private Dictionary<int, string> BuildSpeakerNameMap(JsonElement insights)
+ {
+ var map = new Dictionary<int, string>();
+
+ // Try to get speaker names from faces (Person Model identification)
+ if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var face in faces.EnumerateArray())
+ {
+ if (!face.TryGetProperty("id", out var idProp))
+ continue;
+
+ var faceId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!faceId.HasValue)
+ continue;
+
+ // Get name - could be from Person Model or Unknown
+ var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+
+ // Skip unknown faces - we'll use speaker ID instead
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase))
+ {
+ map[faceId.Value] = name;
+ }
+ }
+ }
+
+ // Also check speakers section
+ if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var speaker in speakers.EnumerateArray())
+ {
+ if (!speaker.TryGetProperty("id", out var idProp))
+ continue;
+
+ var speakerId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!speakerId.HasValue || map.ContainsKey(speakerId.Value))
+ continue;
+
+ var name = speaker.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) &&
+ !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase))
+ {
+ map[speakerId.Value] = name;
+ }
+ }
+ }
+
+ return map;
+ }
+
+ private static TimeSpan ParseTimestamp(JsonElement element, string property)
+ {
+ if (!element.TryGetProperty(property, out var prop))
+ return TimeSpan.Zero;
+
+ var value = prop.GetString();
+ if (string.IsNullOrWhiteSpace(value))
+ return TimeSpan.Zero;
+
+ // Video Indexer uses format like "0:00:05.12" or "00:00:05.12"
+ if (TimeSpan.TryParse(value, out var ts))
+ return ts;
+
+ // Try parsing as seconds
+ if (double.TryParse(value, out var seconds))
+ return TimeSpan.FromSeconds(seconds);
+
+ return TimeSpan.Zero;
+ }
+
+ private async Task TryDeleteVideoAsync(string videoId, string accessToken)
+ {
+ try
+ {
+ var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}";
+ using var response = await _httpClient.DeleteAsync(url);
+ if (response.IsSuccessStatusCode)
+ {
+ _logger.LogDebug("Deleted video {VideoId} from Video Indexer", videoId);
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId);
+ }
+ }
+}
diff --git a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
index b8742b3d2..99db7736c 100644
--- a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
+++ b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs
@@ -1,11 +1,19 @@
-using System.Collections.Generic;
-using System.IO;
-using System.Threading;
-using System.Threading.Tasks;
-
namespace TNO.Services.AutoClipper.Azure;
+/// <summary>
+/// Client interface for Azure Video Indexer transcription service.
+/// </summary>
public interface IAzureVideoIndexerClient
{
- Task<IReadOnlyList<TimestampedTranscript>> GenerateTranscriptAsync(Stream stream, string fileName, string language, CancellationToken cancellationToken = default);
+ /// <summary>
+ /// Transcribes a media file using Azure Video Indexer.
+ /// </summary>
+ /// <param name="filePath">Path to the media file (video or audio).</param>
+ /// <param name="request">Transcription request options.</param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>List of transcript segments with optional speaker information.</returns>
+ Task<IReadOnlyList<TimestampedTranscript>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default);
}
diff --git a/services/net/auto-clipper/Azure/TimestampedTranscript.cs b/services/net/auto-clipper/Azure/TimestampedTranscript.cs
index 02795aef9..fb11c4958 100644
--- a/services/net/auto-clipper/Azure/TimestampedTranscript.cs
+++ b/services/net/auto-clipper/Azure/TimestampedTranscript.cs
@@ -2,4 +2,13 @@
namespace TNO.Services.AutoClipper.Azure;
-public record TimestampedTranscript(TimeSpan Start, TimeSpan End, string Text);
+/// <summary>
+/// Represents a transcript segment with optional speaker identification.
+/// </summary>
+public record TimestampedTranscript(
+ TimeSpan Start,
+ TimeSpan End,
+ string Text,
+ int? SpeakerId = null,
+ string? SpeakerName = null
+);
diff --git a/services/net/auto-clipper/Azure/VideoIndexerRequest.cs b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs
new file mode 100644
index 000000000..98311499a
--- /dev/null
+++ b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs
@@ -0,0 +1,23 @@
+namespace TNO.Services.AutoClipper.Azure;
+
+/// <summary>
+/// Request options for Azure Video Indexer transcription.
+/// </summary>
+public class VideoIndexerRequest
+{
+ /// <summary>
+ /// Language code for transcription (e.g., "en-US", "zh-CN").
+ /// </summary>
+ public string Language { get; init; } = "en-US";
+
+ /// <summary>
+ /// Optional Person Model ID for speaker identification.
+ /// When provided, Video Indexer will attempt to identify known faces.
+ /// </summary>
+ public string? PersonModelId { get; init; }
+
+ /// <summary>
+ /// Whether to include speaker labels in the output.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; init; } = true;
+}
diff --git a/services/net/auto-clipper/Config/AutoClipperOptions.cs b/services/net/auto-clipper/Config/AutoClipperOptions.cs
index cee1d3a4e..4e5fa587a 100644
--- a/services/net/auto-clipper/Config/AutoClipperOptions.cs
+++ b/services/net/auto-clipper/Config/AutoClipperOptions.cs
@@ -94,6 +94,33 @@ public class AutoClipperOptions : ServiceOptions
public int AzureSpeechStorageSasExpiryMinutes { get; set; } = 180;
#endregion
+ #region Azure Video Indexer configuration
+ /// <summary>
+ /// get/set - Azure Video Indexer account ID.
+ /// </summary>
+ public string AzureVideoIndexerAccountId { get; set; } = string.Empty;
+
+ /// <summary>
+ /// get/set - Azure Video Indexer location (e.g., "trial", "eastus").
+ /// </summary>
+ public string AzureVideoIndexerLocation { get; set; } = "trial";
+
+ /// <summary>
+ /// get/set - Azure Video Indexer API key (Ocp-Apim-Subscription-Key).
+ /// </summary>
+ public string AzureVideoIndexerApiKey { get; set; } = string.Empty;
+
+ /// <summary>
+ /// get/set - Timeout in minutes for Video Indexer processing.
+ /// </summary>
+ public int AzureVideoIndexerTimeoutMinutes { get; set; } = 60;
+
+ /// <summary>
+ /// get/set - Polling interval in seconds for Video Indexer status checks.
+ /// </summary>
+ public int AzureVideoIndexerPollingIntervalSeconds { get; set; } = 30;
+ #endregion
+
#region Azure AI configuration
/// <summary>
/// get/set - The URL to the LLM
diff --git a/services/net/auto-clipper/Config/StationProfile.cs b/services/net/auto-clipper/Config/StationProfile.cs
index 408475cba..a6e9de94e 100644
--- a/services/net/auto-clipper/Config/StationProfile.cs
+++ b/services/net/auto-clipper/Config/StationProfile.cs
@@ -11,12 +11,35 @@ public class StationProfile
public class StationTranscriptionProfile
{
+ /// <summary>
+ /// Transcription provider: "azure_speech" or "azure_video_indexer"
+ /// </summary>
public string Provider { get; set; } = "azure_speech";
+
+ public string Language { get; set; } = "en-US";
+ public int SampleRate { get; set; } = 16000;
+
+ // Azure Speech specific settings
public bool Diarization { get; set; }
public int? MaxSpeakers { get; set; }
public string? DiarizationMode { get; set; } = "online";
- public string Language { get; set; } = "en-US";
- public int SampleRate { get; set; } = 16000;
+
+ // Azure Video Indexer specific settings
+ /// <summary>
+ /// Dictionary of Person Model names to IDs for speaker identification.
+ /// Example: { "news": "model-id-1", "sports": "model-id-2" }
+ /// </summary>
+ public Dictionary<string, string> PersonModels { get; set; } = new();
+
+ /// <summary>
+ /// Key to select which Person Model to use from PersonModels dictionary.
+ /// </summary>
+ public string? PersonModelKey { get; set; }
+
+ /// <summary>
+ /// Whether to include speaker labels (speaker1:, speaker2:, or named) in transcript.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; set; }
}
public class StationTextProfile
diff --git a/services/net/auto-clipper/Config/Stations/CHAN.yml b/services/net/auto-clipper/Config/Stations/CHAN.yml
new file mode 100644
index 000000000..761ebe6cc
--- /dev/null
+++ b/services/net/auto-clipper/Config/Stations/CHAN.yml
@@ -0,0 +1,95 @@
+# CHAN station configuration using Azure Video Indexer
+name: CHAN
+sample_rate: 16000
+
+transcription:
+ provider: azure_video_indexer
+ language: en-CA
+ include_speaker_labels: true
+ # Person Model (optional) - add later when you have one:
+ # person_models:
+ # news: "your-model-id-here"
+ # person_model_key: news
+
+text:
+ chunk_size_s: 3.0
+ chunk_overlap_ratio: 0.5
+ heuristic_boundary_weight: 0.35
+ keyword_categories:
+ "(?i)traffic": Traffic
+ "(?i)weather": Weather
+ "(?i)sponsor": Ad
+ "(?i)commercial": Ad
+ "(?i)up next": Promo
+ "(?i)coming up": Promo
+ llm_segmentation: true
+ llm_model: gpt-5-chat
+ llm_temperature: 0.0
+ system_prompt: |
+ You are a Broadcast Structure Parser. Your ONLY job is to detect segment transitions.
+ Output MUST be a single, raw JSON object.
+ CRITICAL: Start your response with '{' and end with '}'.
+ DO NOT use markdown, backticks, or "```json" blocks. No introductory or closing text.
+ max_stories: 15
+ llm_prompt: |
+ Identify every point in the transcript where the topic or segment type changes.
+ Note: Speaker labels (e.g., "speaker1:", "Tom:") indicate who is speaking - use these to help identify segment transitions.
+
+ # SUMMARY RULES
+ 1. **Prioritize Anchor Leads**: For News, derive the summary from the anchor's introduction or the first three sentences of the report.
+ 2. **Active Voice**: Use active journalistic voice (e.g., "Surrey Council rejects housing proposal" NOT "A report about a meeting").
+ 3. **Category Formulas**:
+ - News: [Subject] [Action] (e.g., "Abbotsford shooting victim's son calls for urgent investigation").
+ - Traffic: [Location] [Incident/Status] (e.g., "Highway 99 northbound blocked at Hwy 17A due to crash").
+ - Weather: [Condition] + [High Temp] (e.g., "Mix of sun and cloud with a high of 9 degrees").
+ - Ad: [Business Name] + [Offer/Service] (e.g., "McDonald's features Egg McMuffin with Hollandaise sauce").
+ 4. **One Sentence Only**: Summaries MUST be a single, concise sentence.
+
+ # STRUCTURAL RULES (To Prevent Bundling)
+ 1. **The Sign-off Rule**: Phrases like "Global News," "CBC News," or "Reporting live" followed by a name mark the END of a segment. The very next sentence MUST be a new boundary.
+ 2. **The Handoff Rule**: When an anchor introduces a reporter (e.g., "As Joshua reports..."), the boundary starts at the ANCHOR'S introduction line.
+ 3. **Mandatory Category Split**: News, Traffic, Weather, and Ads MUST be isolated. Never bundle a Traffic report with a News story.
+ 4. **Zero Bloating**: Treat every unique headline as a separate clip. If the topic shifts from a shooting to a stabbing, create two distinct boundaries.
+ 5. **Speaker Change Awareness**: When the speaker label changes (e.g., from "speaker1:" to "speaker2:"), consider if this indicates a segment transition.
+
+ # OUTPUT FORMAT (Raw JSON ONLY)
+ {
+ "boundaries": [
+ {
+ "index": [Sentence Number],
+ "category": "News | Traffic | Weather | Ad | Promo",
+ "title": "[Short Slug]",
+ "summary": "[Journalistic Summary Sentence]",
+ "score": 0.95
+ }
+ ]
+ }
+
+ Transcript:
+ {{transcript}}
+
+heuristics:
+ pattern_entries:
+ # --- Common Transition Patterns ---
+ - pattern: "(?i)coming up"
+ weight: 0.65
+ category: Promo
+ note: Host tease for the next story
+ - pattern: "(?i)after the break"
+ weight: 0.65
+ category: Promo
+ note: Signals a hard break/transition
+ # --- Service Cues ---
+ - pattern: "(?i)traffic update"
+ weight: 0.6
+ category: Traffic
+ note: Recurring traffic block
+ - pattern: "(?i)weather update"
+ weight: 0.55
+ category: Weather
+ note: Weather hits are their own segments
+ # --- Add CHAN-specific anchor patterns here when known ---
+ # - pattern: "(?i)Anchor Name"
+ # weight: 0.85
+ # category: News/Traffic/Weather
+ # note: Description
diff --git a/services/net/auto-clipper/LLM/ClipSegmentationService.cs b/services/net/auto-clipper/LLM/ClipSegmentationService.cs
index 806a6d0c5..7e913d8b0 100644
--- a/services/net/auto-clipper/LLM/ClipSegmentationService.cs
+++ b/services/net/auto-clipper/LLM/ClipSegmentationService.cs
@@ -295,7 +295,10 @@ private IReadOnlyList ParseResponse(string? body, IReadOnlyList<
{
if (content == null) continue;
- var boundaries = JsonSerializer.Deserialize(content!);
+ // LLM sometimes returns JSON wrapped in markdown code fences (```json ... ```),
+ // even when instructed to return raw JSON. Strip the fences before parsing.
+ var strippedContent = StripCodeFence(content!);
+ var boundaries = JsonSerializer.Deserialize(strippedContent);
if (boundaries == null || boundaries.Boundaries == null) continue;
foreach (var boundary in boundaries.Boundaries)
{
diff --git a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
index 0d098b51c..74c67d6e7 100644
--- a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
+++ b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs
@@ -11,6 +11,7 @@ public class ClipProcessingPipeline
{
private readonly IAudioNormalizer _audioNormalizer;
private readonly IAzureSpeechTranscriptionService _speechTranscriber;
+ private readonly IAzureVideoIndexerClient? _videoIndexerClient;
private readonly IClipSegmentationService _clipSegmentation;
private readonly AutoClipperOptions _options;
private readonly ILogger<ClipProcessingPipeline> _logger;
@@ -20,10 +21,12 @@ public ClipProcessingPipeline(
IAzureSpeechTranscriptionService speechTranscriber,
IClipSegmentationService clipSegmentation,
IOptions<AutoClipperOptions> options,
- ILogger<ClipProcessingPipeline> logger)
+ ILogger<ClipProcessingPipeline> logger,
+ IAzureVideoIndexerClient? videoIndexerClient = null)
{
_audioNormalizer = audioNormalizer;
_speechTranscriber = speechTranscriber;
+ _videoIndexerClient = videoIndexerClient;
_clipSegmentation = clipSegmentation;
_options = options.Value;
_logger = logger;
@@ -31,25 +34,71 @@ public ClipProcessingPipeline(
public async Task<ClipProcessingResult> ExecuteAsync(ClipProcessingContext context, CancellationToken cancellationToken)
{
- var normalizedPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken);
var language = !string.IsNullOrWhiteSpace(context.Request.Language)
? context.Request.Language!
: !string.IsNullOrWhiteSpace(context.StationProfile.Transcription.Language)
? context.StationProfile.Transcription.Language
: _options.DefaultTranscriptLanguage;
- var transcriptionRequest = new SpeechTranscriptionRequest
+
+ var provider = context.StationProfile.Transcription.Provider ?? "azure_speech";
+ IReadOnlyList<TimestampedTranscript> segments;
+ string workingPath;
+
+ if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase))
{
- Language = language,
- EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization,
- SpeakerCount = context.StationProfile.Transcription.MaxSpeakers,
- DiarizationMode = context.StationProfile.Transcription.DiarizationMode
- };
+ // Use Azure Video Indexer - upload original file directly (no normalization needed)
+ if (_videoIndexerClient == null)
+ throw new InvalidOperationException("Video Indexer client is not configured but provider is set to azure_video_indexer");
+
+ workingPath = context.SourcePath;
+ var personModelId = ResolvePersonModelId(context.StationProfile.Transcription);
+
+ _logger.LogInformation("Using Video Indexer provider (PersonModel: {PersonModel})", personModelId ?? "none");
+
+ var viRequest = new VideoIndexerRequest
+ {
+ Language = language,
+ PersonModelId = personModelId,
+ IncludeSpeakerLabels = context.StationProfile.Transcription.IncludeSpeakerLabels
+ };
+
+ segments = await _videoIndexerClient.TranscribeAsync(context.SourcePath, viRequest, cancellationToken);
+ }
+ else
+ {
+ // Use Azure Speech (default) - requires audio normalization
+ _logger.LogInformation("Using Azure Speech provider");
+
+ workingPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken);
+ var transcriptionRequest = new SpeechTranscriptionRequest
+ {
+ Language = language,
+ EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization,
+ SpeakerCount = context.StationProfile.Transcription.MaxSpeakers,
+ DiarizationMode = context.StationProfile.Transcription.DiarizationMode
+ };
+
+ segments = await _speechTranscriber.TranscribeAsync(workingPath, transcriptionRequest, cancellationToken);
+ }
- var segments = await _speechTranscriber.TranscribeAsync(normalizedPath, transcriptionRequest, cancellationToken);
var segmentationSettings = BuildSegmentationSettings(context.StationProfile);
var clipDefinitions = await _clipSegmentation.GenerateClipsAsync(segments, segmentationSettings, cancellationToken);
- return new ClipProcessingResult(normalizedPath, language, segments, clipDefinitions, segmentationSettings);
+ return new ClipProcessingResult(workingPath, language, segments, clipDefinitions, segmentationSettings);
+ }
+
+ /// <summary>
+ /// Resolves the Person Model ID from station profile configuration.
+ /// </summary>
+ private static string? ResolvePersonModelId(StationTranscriptionProfile transcription)
+ {
+ if (string.IsNullOrWhiteSpace(transcription.PersonModelKey))
+ return null;
+
+ if (transcription.PersonModels.TryGetValue(transcription.PersonModelKey, out var modelId))
+ return modelId;
+
+ return null;
}
private static ClipSegmentationSettings BuildSegmentationSettings(StationProfile profile)
diff --git a/services/net/auto-clipper/README.md b/services/net/auto-clipper/README.md
index 171d2ad5f..8b8654c67 100644
--- a/services/net/auto-clipper/README.md
+++ b/services/net/auto-clipper/README.md
@@ -1,27 +1,41 @@
# AutoClipper Service
-The AutoClipper service consumes clip requests from Kafka, normalizes audio, transcribes it with Azure Speech, and
-segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. Key concepts:
+The AutoClipper service consumes clip requests from Kafka, transcribes media using Azure Speech or Azure Video Indexer,
+and segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. Key concepts:
-- **Station profiles** (Config/Stations/\*.yml) define language, sample rate, heuristic keywords, custom prompts, and
- category mappings for weather/traffic/ads.
-- **Pipeline** (ClipProcessingPipeline) normalizes audio, transcribes via AzureSpeechTranscriptionService, and feeds
- transcripts plus station config into ClipSegmentationService.
+- **Station profiles** (Config/Stations/\*.yml) define transcription provider, language, sample rate, heuristic keywords,
+ custom prompts, and category mappings for weather/traffic/ads.
+- **Transcription providers**:
+ - `azure_speech` (default) - Fast batch transcription, outputs plain text.
+ - `azure_video_indexer` - Supports speaker identification (speaker1:, speaker2:, or named via Person Model).
+- **Pipeline** (ClipProcessingPipeline) selects the transcription provider based on station config, transcribes the media,
+ and feeds transcripts plus station config into ClipSegmentationService.
- **Segmentation** uses Azure OpenAI to score story boundaries, merges in regex-based heuristics, snaps clips to transcript
sentences, and tags each clip with a category before AutoClipperManager creates content and uploads the media.
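+
+A station opts into a provider through its YAML profile. A minimal sketch (field names follow the CHAN.yml added in this PR; the Person Model ID is a placeholder):
+
+```yaml
+transcription:
+  provider: azure_video_indexer # or azure_speech (default)
+  language: en-CA
+  include_speaker_labels: true
+  # Optional named-speaker identification via a Person Model:
+  # person_models:
+  #   news: "your-model-id-here"
+  # person_model_key: news
+```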
## Development
-1. Update station YAMLs under Config/Stations (copy CKNW.yml as a starting point).
-2. Run dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj to verify changes.
-3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample audio.
+1. Update station YAMLs under Config/Stations (copy CKNW.yml for Azure Speech, CHAN.yml for Video Indexer).
+2. Run `dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj` to verify changes.
+3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample media.
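+
+For a quick local run against the Video Indexer provider, something like the following works (file name, station, and output path are illustrative; keys are read from the harness `.env`):
+
+```bash
+AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer \
+AUTOCLIP_HARNESS_STATION=CHAN \
+dotnet run --project tools/auto-clipper-harness -- sample.mp4 en-CA ./out
+```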
## Configuration
-Important Service\_\_ env vars:
+### Azure Speech (default provider)
-- Service__AzureSpeechKey / Service__AzureSpeechRegion
-- Service__AzureSpeechStorageConnectionString / Service__AzureSpeechStorageContainer (batch upload destination for Azure Speech).
-- Service__AzureSpeechBatchEndpoint, Service__AzureSpeechBatchApiVersion, Service__AzureSpeechBatchPollingIntervalSeconds, Service__AzureSpeechBatchTimeoutMinutes, Service__AzureSpeechStorageSasExpiryMinutes (optional batch tuning).
-- Service__LlmApiUrl, Service__LlmApiKey, Service__LlmDeployment, Service__LlmApiVersion
-- Service\_\_StationConfigPath (optional override for station YAML directory)
+- Service\_\_AzureSpeechKey / Service\_\_AzureSpeechRegion
+- Service\_\_AzureSpeechStorageConnectionString / Service\_\_AzureSpeechStorageContainer (batch upload destination)
+- Service\_\_AzureSpeechBatchEndpoint, Service\_\_AzureSpeechBatchApiVersion, Service\_\_AzureSpeechBatchPollingIntervalSeconds, Service\_\_AzureSpeechBatchTimeoutMinutes, Service\_\_AzureSpeechStorageSasExpiryMinutes (optional batch tuning)
+
+### Azure Video Indexer (optional, for speaker identification)
+
+- Service\_\_AzureVideoIndexerAccountId - Your Video Indexer account ID
+- Service\_\_AzureVideoIndexerLocation - Account location (e.g., `trial`, `eastus`)
+- Service\_\_AzureVideoIndexerApiKey - API subscription key
+- Service\_\_AzureVideoIndexerTimeoutMinutes (default: 60) - Max wait time for processing
+- Service\_\_AzureVideoIndexerPollingIntervalSeconds (default: 30) - Status check interval
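+
+These map to the `azure-video-indexer` secret, which each overlay's secretGenerator builds from a `video-indexer.env` file. A minimal sketch (placeholder values; the key names match the `secretKeyRef` entries in deploy.yaml):
+
+```
+AZURE_VIDEO_INDEXER_ACCOUNT_ID=00000000-0000-0000-0000-000000000000
+AZURE_VIDEO_INDEXER_API_KEY=your-subscription-key
+AZURE_VIDEO_INDEXER_LOCATION=trial
+```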
+
+### LLM & General
+
+- Service\_\_LlmApiUrl, Service\_\_LlmApiKey, Service\_\_LlmDeployment, Service\_\_LlmApiVersion
+- Service\_\_StationConfigPath (optional override for station YAML directory)
\ No newline at end of file
diff --git a/services/net/auto-clipper/appsettings.json b/services/net/auto-clipper/appsettings.json
index 4bf3fbc1e..1f0a93ff7 100644
--- a/services/net/auto-clipper/appsettings.json
+++ b/services/net/auto-clipper/appsettings.json
@@ -53,6 +53,12 @@
"AzureSpeechStorageSasExpiryMinutes": 180,
"DefaultTranscriptLanguage": "en-US",
+ "AzureVideoIndexerAccountId": "",
+ "AzureVideoIndexerLocation": "trial",
+ "AzureVideoIndexerApiKey": "",
+ "AzureVideoIndexerTimeoutMinutes": 60,
+ "AzureVideoIndexerPollingIntervalSeconds": 30,
+
"LlmApiUrl": "https://mmiopenai.cognitiveservices.azure.com/",
"LlmApiKey": "",
"LlmDefaultModel": "",
diff --git a/tools/auto-clipper-harness/.env.sample b/tools/auto-clipper-harness/.env.sample
index d90a25757..7bc0c53c0 100644
--- a/tools/auto-clipper-harness/.env.sample
+++ b/tools/auto-clipper-harness/.env.sample
@@ -1,4 +1,14 @@
# TEMP HARNESS env file. Delete along with this harness when done.
+
+# === Transcription Provider Selection ===
+# Options: azure_speech | azure_video_indexer
+AUTOCLIP_HARNESS_PROVIDER=azure_speech
+
+# === Output Options ===
+# Include speaker labels in transcript output (true/false)
+AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS=false
+
+# === Azure Speech Configuration ===
AUTOCLIP_HARNESS_SPEECH_KEY=
AUTOCLIP_HARNESS_SPEECH_REGION=canadacentral
AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING=
@@ -8,12 +18,33 @@ AUTOCLIP_HARNESS_BATCH_ENDPOINT=
AUTOCLIP_HARNESS_BATCH_VERSION=v3.2
AUTOCLIP_HARNESS_BATCH_POLL_SECONDS=10
AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES=45
+
+# === Azure Video Indexer Configuration ===
+# Required when AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer
+AUTOCLIP_HARNESS_VI_ACCOUNT_ID=
+AUTOCLIP_HARNESS_VI_LOCATION=trial
+AUTOCLIP_HARNESS_VI_API_KEY=
+AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES=60
+AUTOCLIP_HARNESS_VI_POLL_SECONDS=30
+
+# Multiple Person Models (JSON dictionary format)
+# Keys are usage identifiers, values are Azure Person Model IDs
+# Example: {"news":"abc123-news-model","sports":"def456-sports-model","default":"ghi789-general"}
+AUTOCLIP_HARNESS_VI_PERSON_MODELS={}
+
+# Current Person Model key to use (corresponds to a key in the dictionary above)
+# Leave empty to skip Person Model identification (will output speaker1, speaker2, etc.)
+AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY=
+
+# === LLM Configuration ===
# Provide either the full chat-completions endpoint or the base resource URL.
AUTOCLIP_HARNESS_LLM_URL=https://your-resource.openai.azure.com
AUTOCLIP_HARNESS_LLM_KEY=
AUTOCLIP_HARNESS_LLM_DEPLOYMENT=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_MODEL=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_VERSION=2024-07-18
+
+# === General Settings ===
AUTOCLIP_HARNESS_LANGUAGE=en-US
AUTOCLIP_HARNESS_MAX_STORIES=5
diff --git a/tools/auto-clipper-harness/.gitignore b/tools/auto-clipper-harness/.gitignore
index 91b68b99a..9a8771077 100644
--- a/tools/auto-clipper-harness/.gitignore
+++ b/tools/auto-clipper-harness/.gitignore
@@ -1,2 +1,26 @@
**/output/
-**/input/
\ No newline at end of file
+**/input/
+**/auto-clipper-harness-output/
+**/*.mp4
+**/*.avi
+**/*.mkv
+**/*.mov
+**/*.flv
+**/*.wmv
+**/*.webm
+**/*.m4v
+**/*.m4a
+**/*.m4b
+**/*.m4p
+**/*.mp3
+**/*.ogg
+**/*.wav
diff --git a/tools/auto-clipper-harness/Program.cs b/tools/auto-clipper-harness/Program.cs
index 1892d3ed3..539ecc664 100644
--- a/tools/auto-clipper-harness/Program.cs
+++ b/tools/auto-clipper-harness/Program.cs
@@ -1,8 +1,10 @@
using System.Text;
using System.Linq;
+using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
+using AutoClipperHarness;
using TNO.Services.AutoClipper.Audio;
using TNO.Services.AutoClipper.Azure;
using TNO.Services.AutoClipper.Config;
@@ -30,7 +32,13 @@
}
var outputDir = args.Length > 2 ? args[2] : Path.Combine(Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".", "auto-clipper-harness-output");
+Console.WriteLine($"[HARNESS] Output directory: {outputDir}");
Directory.CreateDirectory(outputDir);
+if (!Directory.Exists(outputDir))
+{
+ Console.WriteLine($"[HARNESS] ERROR: Failed to create output directory: {outputDir}");
+ return;
+}
using var loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole(o => o.TimestampFormat = "HH:mm:ss "));
var stationCode = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STATION") ?? "CKNW";
@@ -46,8 +54,6 @@
var sampleRate = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_SAMPLE_RATE"), out var sr)
? sr
: (stationProfile.Transcription.SampleRate > 0 ? stationProfile.Transcription.SampleRate : 16000);
-var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger<AudioNormalizer>());
-var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
var llmBaseUrl = RequireEnv("AUTOCLIP_HARNESS_LLM_URL").Trim();
var llmDeployment = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_DEPLOYMENT");
var llmVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_VERSION") ?? "2024-07-18";
@@ -57,45 +63,107 @@
? (!string.IsNullOrWhiteSpace(llmDeployment) ? llmDeployment : "gpt-4o-mini")
: llmModel;
-
-
-var options = Options.Create(new AutoClipperOptions
+// Create LLM options (shared by both providers)
+var llmOptions = Options.Create(new AutoClipperOptions
{
- AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"),
- AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"),
- AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty,
- AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ?? "v3.2",
- AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10,
- AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45,
- AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"),
- AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"),
- AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180, LlmApiUrl = llmEndpoint,
-
+ LlmApiUrl = llmEndpoint,
LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"),
-
LlmDefaultModel = defaultModel,
LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT")
?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt),
- MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5,
+ MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStoriesLlm) ? maxStoriesLlm : 5,
VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".",
DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US"
});
-var speechLogger = loggerFactory.CreateLogger<AzureSpeechTranscriptionService>();
var llmLogger = loggerFactory.CreateLogger<ClipSegmentationService>();
-var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger);
-var llmService = new ClipSegmentationService(new HttpClient(), options, llmLogger);
-var transcriptionRequest = new SpeechTranscriptionRequest
+var llmService = new ClipSegmentationService(new HttpClient(), llmOptions, llmLogger);
+
+// Determine provider and output options (early detection to avoid unnecessary initialization)
+var provider = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROVIDER") ?? "azure_speech";
+var includeSpeakerLabels = bool.TryParse(
+ Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS"),
+ out var isl) && isl;
+
+// Transcribe using selected provider
+IReadOnlyList<TranscriptSegment> transcriptSegments;
+IReadOnlyList<TimestampedTranscript> segments;
+
+if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase))
+{
+ // Use Azure Video Indexer - no audio normalization needed, upload original file directly
+ var personModelId = ResolvePersonModelId();
+ Console.WriteLine($"[HARNESS] Using Video Indexer (PersonModel: {personModelId ?? "none"})");
+ Console.WriteLine($"[HARNESS] Uploading original file: {input}");
+
+ var viClient = new VideoIndexerClient(
+ new HttpClient(),
+ RequireEnv("AUTOCLIP_HARNESS_VI_ACCOUNT_ID"),
+ Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_LOCATION") ?? "trial",
+ RequireEnv("AUTOCLIP_HARNESS_VI_API_KEY"),
+ int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES"), out var viTimeout) ? viTimeout : 60,
+ int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_POLL_SECONDS"), out var viPoll) ? viPoll : 30,
+ loggerFactory.CreateLogger<VideoIndexerClient>());
+
+ var viRequest = new VideoIndexerRequest
+ {
+ Language = language,
+ PersonModelId = personModelId,
+ IncludeSpeakerLabels = includeSpeakerLabels
+ };
+
+ var rawJsonPath = Path.Combine(outputDir, "video_indexer_raw_response.json");
+ transcriptSegments = await viClient.TranscribeAsync(input, viRequest, CancellationToken.None, rawJsonPath);
+ Console.WriteLine($"[HARNESS] Raw Video Indexer response -> {rawJsonPath}");
+
+ // Convert to TimestampedTranscript for downstream compatibility
+ segments = transcriptSegments.Select(s => new TimestampedTranscript(s.Start, s.End, s.Text)).ToList();
+}
+else
{
- Language = language,
- EnableSpeakerDiarization = stationProfile.Transcription.Diarization,
- SpeakerCount = stationProfile.Transcription.MaxSpeakers,
- DiarizationMode = stationProfile.Transcription.DiarizationMode
-};
-
-Console.WriteLine($"[HARNESS] Transcribing {workingFile} ...");
-var segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None);
+ // Use Azure Speech (default) - requires audio normalization
+ var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger<AudioNormalizer>());
+ var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
+
+ var options = Options.Create(new AutoClipperOptions
+ {
+ AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"),
+ AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"),
+ AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty,
+ AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ?? "v3.2",
+ AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10,
+ AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45,
+ AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"),
+ AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"),
+ AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180,
+ LlmApiUrl = llmEndpoint,
+ LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"),
+ LlmDefaultModel = defaultModel,
+ LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT")
+ ?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt),
+ MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5,
+ VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".",
+ DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US"
+ });
+
+ var speechLogger = loggerFactory.CreateLogger<AzureSpeechTranscriptionService>();
+ var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger);
+ var transcriptionRequest = new SpeechTranscriptionRequest
+ {
+ Language = language,
+ EnableSpeakerDiarization = stationProfile.Transcription.Diarization,
+ SpeakerCount = stationProfile.Transcription.MaxSpeakers,
+ DiarizationMode = stationProfile.Transcription.DiarizationMode
+ };
+
+ Console.WriteLine($"[HARNESS] Transcribing with Azure Speech: {workingFile} ...");
+ segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None);
+
+ // Convert to TranscriptSegment for speaker label support
+ transcriptSegments = segments.Select(s => new TranscriptSegment(s.Start, s.End, s.Text)).ToList();
+}
+
Console.WriteLine($"[HARNESS] Received {segments.Count} transcript segments");
-var fullTranscriptBody = BuildTranscriptDocument(segments);
+var fullTranscriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSegments, includeSpeakerLabels);
var fullTranscriptPath = Path.Combine(outputDir, "transcript_full.txt");
await File.WriteAllTextAsync(fullTranscriptPath, fullTranscriptBody ?? string.Empty);
Console.WriteLine($"[HARNESS] Full transcript -> {fullTranscriptPath}");
@@ -118,8 +186,8 @@
continue;
}
- var transcriptSlice = ExtractTranscriptRange(segments, normalized.Start, normalized.End);
- var transcriptBody = BuildTranscriptDocument(transcriptSlice);
+ var transcriptSlice = ExtractTranscriptSegmentRange(transcriptSegments, normalized.Start, normalized.End);
+ var transcriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSlice, includeSpeakerLabels);
if (string.IsNullOrWhiteSpace(transcriptBody))
{
Console.WriteLine($"[HARNESS] Empty transcript for clip {definition.Title}");
@@ -378,6 +446,60 @@ static void LoadEnvFile(string path)
static IReadOnlyList<TimestampedTranscript> ExtractTranscriptRange(IReadOnlyList<TimestampedTranscript> segments, TimeSpan start, TimeSpan end)
=> segments.Where(s => s.End > start && s.Start < end).ToArray();
+static IReadOnlyList<TranscriptSegment> ExtractTranscriptSegmentRange(IReadOnlyList<TranscriptSegment> segments, TimeSpan start, TimeSpan end)
+ => segments.Where(s => s.End > start && s.Start < end).ToArray();
+
+static string? ResolvePersonModelId()
+{
+ var personModelsJson = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODELS");
+ var personModelKey = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY");
+
+ if (string.IsNullOrWhiteSpace(personModelsJson) || string.IsNullOrWhiteSpace(personModelKey))
+ return null;
+
+ try
+ {
+ var personModels = JsonSerializer.Deserialize<Dictionary<string, string>>(personModelsJson);
+ if (personModels != null && personModels.TryGetValue(personModelKey, out var modelId))
+ {
+ return modelId;
+ }
+ }
+ catch (JsonException)
+ {
+ Console.WriteLine($"[HARNESS] Warning: Failed to parse AUTOCLIP_HARNESS_VI_PERSON_MODELS as JSON");
+ }
+
+ return null;
+}
+
+static string BuildTranscriptDocumentWithSpeakers(IReadOnlyList<TranscriptSegment> segments, bool includeSpeakerLabels)
+{
+ if (segments == null || segments.Count == 0) return string.Empty;
+ var sb = new StringBuilder();
+ var idx = 1;
+ foreach (var segment in segments)
+ {
+ if (string.IsNullOrWhiteSpace(segment.Text)) continue;
+
+ sb.AppendLine(idx.ToString());
+ sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}");
+
+ if (includeSpeakerLabels && (segment.SpeakerName != null || segment.SpeakerId != null))
+ {
+ var label = segment.SpeakerName ?? $"speaker{segment.SpeakerId}";
+ sb.AppendLine($"{label}: {segment.Text.Trim()}");
+ }
+ else
+ {
+ sb.AppendLine(segment.Text.Trim());
+ }
+ sb.AppendLine();
+ idx++;
+ }
+ return sb.ToString().Trim();
+}
+
diff --git a/tools/auto-clipper-harness/README.md b/tools/auto-clipper-harness/README.md
index 933223301..45667630a 100644
--- a/tools/auto-clipper-harness/README.md
+++ b/tools/auto-clipper-harness/README.md
@@ -1,23 +1,48 @@
# AutoClipper Harness
-The harness is a standalone console app that mirrors the AutoClipper pipeline for manual validation. It
-normalizes a local media file, runs Azure Speech transcription, feeds the transcript and station heuristics to the
-segmenter, and writes clips/transcripts/prompt debug files for inspection.
+Standalone console app for local testing of the AutoClipper pipeline.
## Usage
-`dotnet run --project tools/auto-clipper-harness -- <input> [language] [outputDir]`
+```bash
+dotnet run --project tools/auto-clipper-harness -- <input> [language] [outputDir]
+```
-- Configure Azure keys and LLM settings via .env (see .env.sample).
-- Provide AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING / AUTOCLIP_HARNESS_STORAGE_CONTAINER so the harness can upload audio for Azure batch transcription.
-- Optional overrides: AUTOCLIP_HARNESS_BATCH_ENDPOINT, \_BATCH_VERSION, \_BATCH_POLL_SECONDS, \_BATCH_TIMEOUT_MINUTES, and \_STORAGE_SAS_MINUTES.
-- Station profiles are loaded from services/net/auto-clipper/Config/Stations by default; override with
- AUTOCLIP_HARNESS_STATION_PATH / AUTOCLIP_HARNESS_STATION.
- Outputs: clip_XX.\* media slices, clip_XX.txt transcripts, transcript_full.txt, and
- llm_prompt_debug.txt (shows numbered transcript, heuristics, and the final prompt).
+## Configuration
+
+Copy `.env.sample` to `.env` and fill in your keys.
+
+### Provider Selection
+
+```bash
+AUTOCLIP_HARNESS_PROVIDER=azure_speech # default
+AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer # with speaker identification
+```
+
+### Azure Speech (default)
+
+- `AUTOCLIP_HARNESS_SPEECH_KEY` / `AUTOCLIP_HARNESS_SPEECH_REGION`
+- `AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING` / `AUTOCLIP_HARNESS_STORAGE_CONTAINER`
+
+### Azure Video Indexer
+
+- `AUTOCLIP_HARNESS_VI_ACCOUNT_ID` / `AUTOCLIP_HARNESS_VI_API_KEY`
+- `AUTOCLIP_HARNESS_VI_LOCATION` (default: trial)
+- `AUTOCLIP_HARNESS_VI_PERSON_MODELS` - JSON map for multiple Person Models
+- `AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY` - which model to use
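+
+For example, a `.env` that selects a hypothetical "news" Person Model (IDs are placeholders, mirroring `.env.sample`):
+
+```bash
+AUTOCLIP_HARNESS_VI_PERSON_MODELS={"news":"abc123-news-model","default":"ghi789-general"}
+AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY=news
+```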
+
+### LLM
+
+- `AUTOCLIP_HARNESS_LLM_URL` / `AUTOCLIP_HARNESS_LLM_KEY`
+
+## Output
+
+- `clip_XX.mp4` / `clip_XX.txt` - segmented clips with transcripts
+- `transcript_full.txt` - full transcript (with speaker labels if enabled)
+- `video_indexer_raw_response.json` - raw API response (Video Indexer only)
+- `llm_prompt_debug.txt` - LLM prompt for debugging
## Notes
-- The harness shares the segmentation logic with the service, so any changes in ClipSegmentationService
- should be validated here first.
-- Ensure ffmpeg is available on PATH; the harness shells out to ffmpeg to produce media clips.
+- Video Indexer uploads original media directly; Azure Speech requires WAV conversion
+- Requires `ffmpeg` on PATH for clip extraction
diff --git a/tools/auto-clipper-harness/TranscriptSegment.cs b/tools/auto-clipper-harness/TranscriptSegment.cs
new file mode 100644
index 000000000..66d18adc1
--- /dev/null
+++ b/tools/auto-clipper-harness/TranscriptSegment.cs
@@ -0,0 +1,13 @@
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Represents a transcript segment with optional speaker identification.
+/// This is the harness-local version; will be merged into TimestampedTranscript in phase 2.
+/// </summary>
+public record TranscriptSegment(
+ TimeSpan Start,
+ TimeSpan End,
+ string Text,
+ int? SpeakerId = null,
+ string? SpeakerName = null
+);
diff --git a/tools/auto-clipper-harness/VideoIndexerClient.cs b/tools/auto-clipper-harness/VideoIndexerClient.cs
new file mode 100644
index 000000000..ddf09d61e
--- /dev/null
+++ b/tools/auto-clipper-harness/VideoIndexerClient.cs
@@ -0,0 +1,398 @@
+using System.Net.Http.Headers;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Client for Azure Video Indexer API.
+/// Handles video upload, processing, and transcript extraction with speaker identification.
+/// </summary>
+public class VideoIndexerClient
+{
+ private const string ApiBaseUrl = "https://api.videoindexer.ai";
+
+ private readonly HttpClient _httpClient;
+ private readonly string _accountId;
+ private readonly string _location;
+ private readonly string _apiKey;
+ private readonly int _timeoutMinutes;
+ private readonly int _pollIntervalSeconds;
+ private readonly ILogger? _logger;
+
+ private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
+ {
+ PropertyNameCaseInsensitive = true
+ };
+
+ public VideoIndexerClient(
+ HttpClient httpClient,
+ string accountId,
+ string location,
+ string apiKey,
+ int timeoutMinutes = 60,
+ int pollIntervalSeconds = 30,
+ ILogger? logger = null)
+ {
+ _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
+ _accountId = accountId ?? throw new ArgumentNullException(nameof(accountId));
+ _location = location ?? throw new ArgumentNullException(nameof(location));
+ _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey));
+ _timeoutMinutes = timeoutMinutes > 0 ? timeoutMinutes : 60;
+ _pollIntervalSeconds = pollIntervalSeconds > 0 ? pollIntervalSeconds : 30;
+ _logger = logger;
+ }
+
+ /// <summary>
+ /// Transcribes a media file using Azure Video Indexer.
+ /// </summary>
+ /// <param name="filePath">Path to the media file (video or audio).</param>
+ /// <param name="request">Transcription request options.</param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <param name="rawJsonOutputPath">Optional path to save the raw Video Indexer JSON response for debugging.</param>
+ /// <returns>List of transcript segments with optional speaker information.</returns>
+ public async Task<IReadOnlyList<TranscriptSegment>> TranscribeAsync(
+ string filePath,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken = default,
+ string? rawJsonOutputPath = null)
+ {
+ if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
+ throw new FileNotFoundException("Media file not found", filePath);
+
+ _logger?.LogInformation("Starting Video Indexer transcription for {File}", filePath);
+
+ // Step 1: Get access token
+ var accessToken = await GetAccessTokenAsync(cancellationToken);
+ _logger?.LogDebug("Obtained access token");
+
+ // Step 2: Upload video
+ var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken);
+ _logger?.LogInformation("Video uploaded with ID: {VideoId}", videoId);
+
+ try
+ {
+ // Step 3: Wait for processing to complete
+ var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken);
+ _logger?.LogInformation("Video processing completed");
+
+ // Save raw JSON for debugging if path is provided
+ if (!string.IsNullOrWhiteSpace(rawJsonOutputPath))
+ {
+ try
+ {
+ // Ensure directory exists
+ var dir = Path.GetDirectoryName(rawJsonOutputPath);
+ if (!string.IsNullOrWhiteSpace(dir))
+ Directory.CreateDirectory(dir);
+
+ // Pretty print the JSON
+ using var doc = JsonDocument.Parse(indexJson);
+ var prettyJson = JsonSerializer.Serialize(doc, new JsonSerializerOptions { WriteIndented = true });
+ await File.WriteAllTextAsync(rawJsonOutputPath, prettyJson, cancellationToken);
+ _logger?.LogInformation("Saved raw Video Indexer response to {Path}", rawJsonOutputPath);
+ }
+ catch (Exception ex)
+ {
+ _logger?.LogWarning(ex, "Failed to save raw JSON to {Path}", rawJsonOutputPath);
+ }
+ }
+
+ // Step 4: Parse transcript
+ var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels);
+ _logger?.LogInformation("Parsed {Count} transcript segments", segments.Count);
+
+ return segments;
+ }
+ finally
+ {
+ // Clean up: delete the video from Video Indexer
+ await TryDeleteVideoAsync(videoId, accessToken);
+ }
+ }
+
+ private async Task<string> GetAccessTokenAsync(CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/Auth/{_location}/Accounts/{_accountId}/AccessToken?allowEdit=true";
+
+ using var request = new HttpRequestMessage(HttpMethod.Get, url);
+ request.Headers.Add("Ocp-Apim-Subscription-Key", _apiKey);
+
+ using var response = await _httpClient.SendAsync(request, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}");
+
+ // Token is returned as a quoted string
+ return body.Trim('"');
+ }
+
+ private async Task<string> UploadVideoAsync(
+ string filePath,
+ string accessToken,
+ VideoIndexerRequest request,
+ CancellationToken cancellationToken)
+ {
+ var fileName = Path.GetFileName(filePath);
+ var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}";
+
+ // Build upload URL with query parameters
+ var queryParams = new List<string>
+ {
+ $"accessToken={Uri.EscapeDataString(accessToken)}",
+ $"name={Uri.EscapeDataString(videoName)}",
+ $"language={Uri.EscapeDataString(request.Language)}",
+ "privacy=Private",
+ "indexingPreset=AudioOnly" // We only need audio analysis for transcription
+ };
+
+ if (!string.IsNullOrWhiteSpace(request.PersonModelId))
+ {
+ queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}");
+ }
+
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos?{string.Join("&", queryParams)}";
+
+ _logger?.LogDebug("Uploading video to: {Url}", url.Split('?')[0]);
+
+ using var content = new MultipartFormDataContent();
+ await using var fileStream = File.OpenRead(filePath);
+ var fileContent = new StreamContent(fileStream);
+ fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
+ content.Add(fileContent, "file", fileName);
+
+ using var response = await _httpClient.PostAsync(url, content, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var videoId = doc.RootElement.GetProperty("id").GetString();
+
+ if (string.IsNullOrWhiteSpace(videoId))
+ throw new InvalidOperationException("Video Indexer did not return a video ID");
+
+ return videoId;
+ }
+
+ private async Task<string> WaitForProcessingAsync(
+ string videoId,
+ string accessToken,
+ CancellationToken cancellationToken)
+ {
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}";
+ var timeout = TimeSpan.FromMinutes(_timeoutMinutes);
+ var pollInterval = TimeSpan.FromSeconds(_pollIntervalSeconds);
+ var startTime = DateTime.UtcNow;
+
+ while (true)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ if (DateTime.UtcNow - startTime > timeout)
+ throw new TimeoutException($"Video Indexer processing did not complete within {_timeoutMinutes} minutes");
+
+ using var response = await _httpClient.GetAsync(url, cancellationToken);
+ var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+ if (!response.IsSuccessStatusCode)
+ throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}");
+
+ using var doc = JsonDocument.Parse(body);
+ var state = doc.RootElement.GetProperty("state").GetString();
+
+ var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds;
+ _logger?.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed);
+
+ if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase))
+ {
+ return body;
+ }
+
+ if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase))
+ {
+ var errorMessage = "Unknown error";
+ if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg))
+ errorMessage = failureMsg.GetString() ?? errorMessage;
+ throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}");
+ }
+
+ await Task.Delay(pollInterval, cancellationToken);
+ }
+ }
+
+ private IReadOnlyList<TranscriptSegment> ParseTranscript(string indexJson, bool includeSpeakerLabels)
+ {
+ var segments = new List<TranscriptSegment>();
+
+ using var doc = JsonDocument.Parse(indexJson);
+
+ // Navigate to: videos[0].insights.transcript
+ if (!doc.RootElement.TryGetProperty("videos", out var videos) ||
+ videos.GetArrayLength() == 0)
+ {
+ _logger?.LogWarning("No videos found in index response");
+ return segments;
+ }
+
+ var video = videos[0];
+ if (!video.TryGetProperty("insights", out var insights))
+ {
+ _logger?.LogWarning("No insights found in video");
+ return segments;
+ }
+
+ // Build speaker name map from faces/speakers if available
+ var speakerNames = BuildSpeakerNameMap(insights);
+
+ // Parse transcript
+ if (!insights.TryGetProperty("transcript", out var transcript) ||
+ transcript.ValueKind != JsonValueKind.Array)
+ {
+ _logger?.LogWarning("No transcript found in insights");
+ return segments;
+ }
+
+ foreach (var item in transcript.EnumerateArray())
+ {
+ var text = item.TryGetProperty("text", out var textProp) ? textProp.GetString() : null;
+ if (string.IsNullOrWhiteSpace(text))
+ continue;
+
+ // Parse timestamps from instances array
+ // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] }
+ var start = TimeSpan.Zero;
+ var end = TimeSpan.Zero;
+
+ if (item.TryGetProperty("instances", out var instances) &&
+ instances.ValueKind == JsonValueKind.Array &&
+ instances.GetArrayLength() > 0)
+ {
+ var firstInstance = instances[0];
+ start = ParseTimestamp(firstInstance, "start");
+ end = ParseTimestamp(firstInstance, "end");
+ }
+
+ if (end <= start)
+ end = start + TimeSpan.FromMilliseconds(100);
+
+ // Parse speaker information
+ int? speakerId = null;
+ string? speakerName = null;
+
+ if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp))
+ {
+ speakerId = speakerIdProp.ValueKind == JsonValueKind.Number
+ ? speakerIdProp.GetInt32()
+ : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null;
+
+ if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name))
+ {
+ speakerName = name;
+ }
+ }
+
+ segments.Add(new TranscriptSegment(start, end, text.Trim(), speakerId, speakerName));
+ }
+
+ return segments.OrderBy(s => s.Start).ToList();
+ }
+
+ private Dictionary<int, string> BuildSpeakerNameMap(JsonElement insights)
+ {
+ var map = new Dictionary<int, string>();
+
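+ // Illustrative (assumed) shape of the insights being read below:
+ //   "faces":    [{ "id": 1, "name": "Jane Doe", ... }]
+ //   "speakers": [{ "id": 1, "name": "Speaker #1", ... }]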
+ // Try to get speaker names from faces (Person Model identification)
+ if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var face in faces.EnumerateArray())
+ {
+ if (!face.TryGetProperty("id", out var idProp))
+ continue;
+
+ var faceId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!faceId.HasValue)
+ continue;
+
+ // Get name - could be from Person Model or Unknown
+ var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+
+ // Skip unknown faces - we'll use speaker ID instead
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase))
+ {
+ map[faceId.Value] = name;
+ }
+ }
+ }
+
+ // Also check speakers section
+ if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array)
+ {
+ foreach (var speaker in speakers.EnumerateArray())
+ {
+ if (!speaker.TryGetProperty("id", out var idProp))
+ continue;
+
+ var speakerId = idProp.ValueKind == JsonValueKind.Number
+ ? idProp.GetInt32()
+ : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+ if (!speakerId.HasValue || map.ContainsKey(speakerId.Value))
+ continue;
+
+ var name = speaker.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+ if (!string.IsNullOrWhiteSpace(name) &&
+ !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) &&
+ !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase))
+ {
+ map[speakerId.Value] = name;
+ }
+ }
+ }
+
+ return map;
+ }
+
+ private static TimeSpan ParseTimestamp(JsonElement element, string property)
+ {
+ if (!element.TryGetProperty(property, out var prop))
+ return TimeSpan.Zero;
+
+ var value = prop.GetString();
+ if (string.IsNullOrWhiteSpace(value))
+ return TimeSpan.Zero;
+
+ // Video Indexer uses format like "0:00:05.12" or "00:00:05.12"
+ if (TimeSpan.TryParse(value, out var ts))
+ return ts;
+
+ // Try parsing as seconds
+ if (double.TryParse(value, out var seconds))
+ return TimeSpan.FromSeconds(seconds);
+
+ return TimeSpan.Zero;
+ }
+
+ private async Task TryDeleteVideoAsync(string videoId, string accessToken)
+ {
+ try
+ {
+ var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}";
+ using var response = await _httpClient.DeleteAsync(url);
+ if (response.IsSuccessStatusCode)
+ {
+ _logger?.LogDebug("Deleted video {VideoId} from Video Indexer", videoId);
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger?.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId);
+ }
+ }
+}
diff --git a/tools/auto-clipper-harness/VideoIndexerRequest.cs b/tools/auto-clipper-harness/VideoIndexerRequest.cs
new file mode 100644
index 000000000..a24d54518
--- /dev/null
+++ b/tools/auto-clipper-harness/VideoIndexerRequest.cs
@@ -0,0 +1,23 @@
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Request options for Azure Video Indexer transcription.
+/// </summary>
+public class VideoIndexerRequest
+{
+ /// <summary>
+ /// Language code for transcription (e.g., "en-US", "zh-CN").
+ /// </summary>
+ public string Language { get; init; } = "en-US";
+
+ /// <summary>
+ /// Optional Person Model ID for speaker identification.
+ /// When provided, Video Indexer will attempt to identify known faces.
+ /// </summary>
+ public string? PersonModelId { get; init; }
+
+ /// <summary>
+ /// Whether to include speaker labels in the output.
+ /// </summary>
+ public bool IncludeSpeakerLabels { get; init; } = true;
+}