diff --git a/openshift/kustomize/services/auto-clipper/base/deploy.yaml b/openshift/kustomize/services/auto-clipper/base/deploy.yaml index 10f68f2c7..d21848891 100644 --- a/openshift/kustomize/services/auto-clipper/base/deploy.yaml +++ b/openshift/kustomize/services/auto-clipper/base/deploy.yaml @@ -160,6 +160,26 @@ spec: name: azure-openai key: AZURE_OPENAI_KEY + # Azure Video Indexer Configuration (optional - for speaker identification) + - name: Service__AzureVideoIndexerAccountId + valueFrom: + secretKeyRef: + name: azure-video-indexer + key: AZURE_VIDEO_INDEXER_ACCOUNT_ID + optional: true + - name: Service__AzureVideoIndexerApiKey + valueFrom: + secretKeyRef: + name: azure-video-indexer + key: AZURE_VIDEO_INDEXER_API_KEY + optional: true + - name: Service__AzureVideoIndexerLocation + valueFrom: + secretKeyRef: + name: azure-video-indexer + key: AZURE_VIDEO_INDEXER_LOCATION + optional: true + # Service Configuration - name: Service__MaxFailLimit valueFrom: diff --git a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml index 4ca71785f..59ff13d76 100644 --- a/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml +++ b/openshift/kustomize/services/auto-clipper/overlays/dev/kustomization.yaml @@ -16,6 +16,9 @@ secretGenerator: - name: azure-openai type: stringData env: openai.env + - name: azure-video-indexer + type: stringData + env: video-indexer.env patches: - target: diff --git a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml index d783baf42..682fa26d8 100644 --- a/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml +++ b/openshift/kustomize/services/auto-clipper/overlays/prod/kustomization.yaml @@ -16,6 +16,9 @@ secretGenerator: - name: azure-openai type: stringData env: openai.env + - name: azure-video-indexer + type: stringData + env: video-indexer.env patches: - target: diff --git a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml index 036910c0c..d8b5bcce9 100644 --- a/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml +++ b/openshift/kustomize/services/auto-clipper/overlays/test/kustomization.yaml @@ -16,6 +16,9 @@ secretGenerator: - name: azure-openai type: stringData env: openai.env + - name: azure-video-indexer + type: stringData + env: video-indexer.env patches: - target: diff --git a/services/net/auto-clipper/AutoClipperManager.cs b/services/net/auto-clipper/AutoClipperManager.cs index b05a11d29..07de0f0e5 100644 --- a/services/net/auto-clipper/AutoClipperManager.cs +++ b/services/net/auto-clipper/AutoClipperManager.cs @@ -635,23 +635,32 @@ private static void CleanupTemporaryFiles(bool isSyncedToS3, params string[] fil /// /// Format the transcript to include newlines. + /// When speaker information is available (from Video Indexer), includes speaker prefix. + /// Azure Speech transcripts have no speaker info and will output plain text. /// - /// - /// + /// Transcript segments with optional speaker information. + /// Formatted transcript string. 
    private static string BuildTranscriptDocument(IReadOnlyList<TimestampedTranscript> segments)
    {
        if (segments == null || segments.Count == 0) return string.Empty;
        var sb = new StringBuilder();
-        var index = 1;
        foreach (var segment in segments)
        {
            if (string.IsNullOrWhiteSpace(segment.Text)) continue;
-            // sb.AppendLine(index.ToString(CultureInfo.InvariantCulture));
-            // sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}");
+
+            // Add speaker prefix if available (Video Indexer provides this)
+            if (!string.IsNullOrWhiteSpace(segment.SpeakerName))
+            {
+                sb.Append($"{segment.SpeakerName}: ");
+            }
+            else if (segment.SpeakerId.HasValue)
+            {
+                sb.Append($"speaker{segment.SpeakerId}: ");
+            }
+
            sb.AppendLine(segment.Text.Trim());
            sb.AppendLine();
-            index++;
        }
        return sb.ToString().Trim();
diff --git a/services/net/auto-clipper/AutoClipperService.cs b/services/net/auto-clipper/AutoClipperService.cs
index 843a40853..ed18540e5 100644
--- a/services/net/auto-clipper/AutoClipperService.cs
+++ b/services/net/auto-clipper/AutoClipperService.cs
@@ -57,6 +57,14 @@ protected override IServiceCollection ConfigureServices(IServiceCollection servi
        services.AddSingleton();
        services.AddHttpClient();
+        // Register Video Indexer client if configured
+        var videoIndexerAccountId = this.Configuration.GetSection("Service")["AzureVideoIndexerAccountId"];
+        var videoIndexerApiKey = this.Configuration.GetSection("Service")["AzureVideoIndexerApiKey"];
+        if (!string.IsNullOrWhiteSpace(videoIndexerAccountId) && !string.IsNullOrWhiteSpace(videoIndexerApiKey))
+        {
+            services.AddHttpClient();
+        }
+
        // TODO: Figure out how to validate without resulting in aggregating the config values.
        // services.AddOptions()
        //     .Bind(this.Configuration.GetSection("Service"))
diff --git a/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
new file mode 100644
index 000000000..354d3921d
--- /dev/null
+++ b/services/net/auto-clipper/Azure/AzureVideoIndexerClient.cs
@@ -0,0 +1,363 @@
+using System.Net.Http.Headers;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using TNO.Services.AutoClipper.Config;
+
+namespace TNO.Services.AutoClipper.Azure;
+
+/// <summary>
+/// Client for Azure Video Indexer API.
+/// Handles video upload, processing, and transcript extraction with speaker identification.
+/// </summary>
+public class AzureVideoIndexerClient : IAzureVideoIndexerClient
+{
+    private const string ApiBaseUrl = "https://api.videoindexer.ai";
+
+    private readonly HttpClient _httpClient;
+    private readonly AutoClipperOptions _options;
+    private readonly ILogger<AzureVideoIndexerClient> _logger;
+
+    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
+    {
+        PropertyNameCaseInsensitive = true
+    };
+
+    public AzureVideoIndexerClient(
+        HttpClient httpClient,
+        IOptions<AutoClipperOptions> options,
+        ILogger<AzureVideoIndexerClient> logger)
+    {
+        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
+        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
+        _logger = logger ??
throw new ArgumentNullException(nameof(logger)); + + if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerAccountId)) + throw new ArgumentException("AzureVideoIndexerAccountId is required"); + if (string.IsNullOrWhiteSpace(_options.AzureVideoIndexerApiKey)) + throw new ArgumentException("AzureVideoIndexerApiKey is required"); + } + + /// + public async Task> TranscribeAsync( + string filePath, + VideoIndexerRequest request, + CancellationToken cancellationToken = default) + { + if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath)) + throw new FileNotFoundException("Media file not found", filePath); + + _logger.LogInformation("Starting Video Indexer transcription for {File}", filePath); + + // Step 1: Get access token + var accessToken = await GetAccessTokenAsync(cancellationToken); + _logger.LogDebug("Obtained access token"); + + // Step 2: Upload video + var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken); + _logger.LogInformation("Video uploaded with ID: {VideoId}", videoId); + + try + { + // Step 3: Wait for processing to complete + var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken); + _logger.LogInformation("Video processing completed"); + + // Step 4: Parse transcript + var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels); + _logger.LogInformation("Parsed {Count} transcript segments", segments.Count); + + return segments; + } + finally + { + // Clean up: delete the video from Video Indexer + await TryDeleteVideoAsync(videoId, accessToken); + } + } + + private async Task GetAccessTokenAsync(CancellationToken cancellationToken) + { + var url = $"{ApiBaseUrl}/Auth/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/AccessToken?allowEdit=true"; + + using var request = new HttpRequestMessage(HttpMethod.Get, url); + request.Headers.Add("Ocp-Apim-Subscription-Key", _options.AzureVideoIndexerApiKey); + + using var response = await _httpClient.SendAsync(request, cancellationToken); + var body = await response.Content.ReadAsStringAsync(cancellationToken); + + if (!response.IsSuccessStatusCode) + throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}"); + + // Token is returned as a quoted string + return body.Trim('"'); + } + + private async Task UploadVideoAsync( + string filePath, + string accessToken, + VideoIndexerRequest request, + CancellationToken cancellationToken) + { + var fileName = Path.GetFileName(filePath); + var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}"; + + // Build upload URL with query parameters + var queryParams = new List + { + $"accessToken={Uri.EscapeDataString(accessToken)}", + $"name={Uri.EscapeDataString(videoName)}", + $"language={Uri.EscapeDataString(request.Language)}", + "privacy=Private", + "indexingPreset=AudioOnly" // We only need audio analysis for transcription + }; + + if (!string.IsNullOrWhiteSpace(request.PersonModelId)) + { + queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}"); + } + + var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos?{string.Join("&", queryParams)}"; + + _logger.LogDebug("Uploading video to: {Url}", url.Split('?')[0]); + + using var content = new MultipartFormDataContent(); + await using var fileStream = File.OpenRead(filePath); + var fileContent = new StreamContent(fileStream); + fileContent.Headers.ContentType = new 
MediaTypeHeaderValue("application/octet-stream"); + content.Add(fileContent, "file", fileName); + + using var response = await _httpClient.PostAsync(url, content, cancellationToken); + var body = await response.Content.ReadAsStringAsync(cancellationToken); + + if (!response.IsSuccessStatusCode) + throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}"); + + using var doc = JsonDocument.Parse(body); + var videoId = doc.RootElement.GetProperty("id").GetString(); + + if (string.IsNullOrWhiteSpace(videoId)) + throw new InvalidOperationException("Video Indexer did not return a video ID"); + + return videoId; + } + + private async Task WaitForProcessingAsync( + string videoId, + string accessToken, + CancellationToken cancellationToken) + { + var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}"; + var timeout = TimeSpan.FromMinutes(_options.AzureVideoIndexerTimeoutMinutes); + var pollInterval = TimeSpan.FromSeconds(_options.AzureVideoIndexerPollingIntervalSeconds); + var startTime = DateTime.UtcNow; + + while (true) + { + cancellationToken.ThrowIfCancellationRequested(); + + if (DateTime.UtcNow - startTime > timeout) + throw new TimeoutException($"Video Indexer processing did not complete within {_options.AzureVideoIndexerTimeoutMinutes} minutes"); + + using var response = await _httpClient.GetAsync(url, cancellationToken); + var body = await response.Content.ReadAsStringAsync(cancellationToken); + + if (!response.IsSuccessStatusCode) + throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}"); + + using var doc = JsonDocument.Parse(body); + var state = doc.RootElement.GetProperty("state").GetString(); + + var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds; + _logger.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed); + + if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase)) + { + return body; + } + + if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase)) + { + var errorMessage = "Unknown error"; + if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg)) + errorMessage = failureMsg.GetString() ?? errorMessage; + throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}"); + } + + await Task.Delay(pollInterval, cancellationToken); + } + } + + private IReadOnlyList ParseTranscript(string indexJson, bool includeSpeakerLabels) + { + var segments = new List(); + + using var doc = JsonDocument.Parse(indexJson); + + // Navigate to: videos[0].insights.transcript + if (!doc.RootElement.TryGetProperty("videos", out var videos) || + videos.GetArrayLength() == 0) + { + _logger.LogWarning("No videos found in index response"); + return segments; + } + + var video = videos[0]; + if (!video.TryGetProperty("insights", out var insights)) + { + _logger.LogWarning("No insights found in video"); + return segments; + } + + // Build speaker name map from faces/speakers if available + var speakerNames = BuildSpeakerNameMap(insights); + + // Parse transcript + if (!insights.TryGetProperty("transcript", out var transcript) || + transcript.ValueKind != JsonValueKind.Array) + { + _logger.LogWarning("No transcript found in insights"); + return segments; + } + + foreach (var item in transcript.EnumerateArray()) + { + var text = item.TryGetProperty("text", out var textProp) ? 
textProp.GetString() : null; + if (string.IsNullOrWhiteSpace(text)) + continue; + + // Parse timestamps from instances array + // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] } + var start = TimeSpan.Zero; + var end = TimeSpan.Zero; + + if (item.TryGetProperty("instances", out var instances) && + instances.ValueKind == JsonValueKind.Array && + instances.GetArrayLength() > 0) + { + var firstInstance = instances[0]; + start = ParseTimestamp(firstInstance, "start"); + end = ParseTimestamp(firstInstance, "end"); + } + + if (end <= start) + end = start + TimeSpan.FromMilliseconds(100); + + // Parse speaker information + int? speakerId = null; + string? speakerName = null; + + if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp)) + { + speakerId = speakerIdProp.ValueKind == JsonValueKind.Number + ? speakerIdProp.GetInt32() + : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null; + + if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name)) + { + speakerName = name; + } + } + + segments.Add(new TimestampedTranscript(start, end, text.Trim(), speakerId, speakerName)); + } + + return segments.OrderBy(s => s.Start).ToList(); + } + + private Dictionary BuildSpeakerNameMap(JsonElement insights) + { + var map = new Dictionary(); + + // Try to get speaker names from faces (Person Model identification) + if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array) + { + foreach (var face in faces.EnumerateArray()) + { + if (!face.TryGetProperty("id", out var idProp)) + continue; + + var faceId = idProp.ValueKind == JsonValueKind.Number + ? idProp.GetInt32() + : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null; + + if (!faceId.HasValue) + continue; + + // Get name - could be from Person Model or Unknown + var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null; + + // Skip unknown faces - we'll use speaker ID instead + if (!string.IsNullOrWhiteSpace(name) && + !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase)) + { + map[faceId.Value] = name; + } + } + } + + // Also check speakers section + if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array) + { + foreach (var speaker in speakers.EnumerateArray()) + { + if (!speaker.TryGetProperty("id", out var idProp)) + continue; + + var speakerId = idProp.ValueKind == JsonValueKind.Number + ? idProp.GetInt32() + : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null; + + if (!speakerId.HasValue || map.ContainsKey(speakerId.Value)) + continue; + + var name = speaker.TryGetProperty("name", out var nameProp) ? 
nameProp.GetString() : null; + if (!string.IsNullOrWhiteSpace(name) && + !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) && + !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase)) + { + map[speakerId.Value] = name; + } + } + } + + return map; + } + + private static TimeSpan ParseTimestamp(JsonElement element, string property) + { + if (!element.TryGetProperty(property, out var prop)) + return TimeSpan.Zero; + + var value = prop.GetString(); + if (string.IsNullOrWhiteSpace(value)) + return TimeSpan.Zero; + + // Video Indexer uses format like "0:00:05.12" or "00:00:05.12" + if (TimeSpan.TryParse(value, out var ts)) + return ts; + + // Try parsing as seconds + if (double.TryParse(value, out var seconds)) + return TimeSpan.FromSeconds(seconds); + + return TimeSpan.Zero; + } + + private async Task TryDeleteVideoAsync(string videoId, string accessToken) + { + try + { + var url = $"{ApiBaseUrl}/{_options.AzureVideoIndexerLocation}/Accounts/{_options.AzureVideoIndexerAccountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}"; + using var response = await _httpClient.DeleteAsync(url); + if (response.IsSuccessStatusCode) + { + _logger.LogDebug("Deleted video {VideoId} from Video Indexer", videoId); + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId); + } + } +} diff --git a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs index b8742b3d2..99db7736c 100644 --- a/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs +++ b/services/net/auto-clipper/Azure/IAzureVideoIndexerClient.cs @@ -1,11 +1,19 @@ -using System.Collections.Generic; -using System.IO; -using System.Threading; -using System.Threading.Tasks; - namespace TNO.Services.AutoClipper.Azure; +/// +/// Client interface for Azure Video Indexer transcription service. +/// public interface IAzureVideoIndexerClient { - Task> GenerateTranscriptAsync(Stream stream, string fileName, string language, CancellationToken cancellationToken = default); + /// + /// Transcribes a media file using Azure Video Indexer. + /// + /// Path to the media file (video or audio). + /// Transcription request options. + /// Cancellation token. + /// List of transcript segments with optional speaker information. + Task> TranscribeAsync( + string filePath, + VideoIndexerRequest request, + CancellationToken cancellationToken = default); } diff --git a/services/net/auto-clipper/Azure/TimestampedTranscript.cs b/services/net/auto-clipper/Azure/TimestampedTranscript.cs index 02795aef9..fb11c4958 100644 --- a/services/net/auto-clipper/Azure/TimestampedTranscript.cs +++ b/services/net/auto-clipper/Azure/TimestampedTranscript.cs @@ -2,4 +2,13 @@ namespace TNO.Services.AutoClipper.Azure; -public record TimestampedTranscript(TimeSpan Start, TimeSpan End, string Text); +/// +/// Represents a transcript segment with optional speaker identification. +/// +public record TimestampedTranscript( + TimeSpan Start, + TimeSpan End, + string Text, + int? SpeakerId = null, + string? SpeakerName = null +); diff --git a/services/net/auto-clipper/Azure/VideoIndexerRequest.cs b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs new file mode 100644 index 000000000..98311499a --- /dev/null +++ b/services/net/auto-clipper/Azure/VideoIndexerRequest.cs @@ -0,0 +1,23 @@ +namespace TNO.Services.AutoClipper.Azure; + +/// +/// Request options for Azure Video Indexer transcription. 
+/// +public class VideoIndexerRequest +{ + /// + /// Language code for transcription (e.g., "en-US", "zh-CN"). + /// + public string Language { get; init; } = "en-US"; + + /// + /// Optional Person Model ID for speaker identification. + /// When provided, Video Indexer will attempt to identify known faces. + /// + public string? PersonModelId { get; init; } + + /// + /// Whether to include speaker labels in the output. + /// + public bool IncludeSpeakerLabels { get; init; } = true; +} diff --git a/services/net/auto-clipper/Config/AutoClipperOptions.cs b/services/net/auto-clipper/Config/AutoClipperOptions.cs index cee1d3a4e..4e5fa587a 100644 --- a/services/net/auto-clipper/Config/AutoClipperOptions.cs +++ b/services/net/auto-clipper/Config/AutoClipperOptions.cs @@ -94,6 +94,33 @@ public class AutoClipperOptions : ServiceOptions public int AzureSpeechStorageSasExpiryMinutes { get; set; } = 180; #endregion + #region Azure Video Indexer configuration + /// + /// get/set - Azure Video Indexer account ID. + /// + public string AzureVideoIndexerAccountId { get; set; } = string.Empty; + + /// + /// get/set - Azure Video Indexer location (e.g., "trial", "eastus"). + /// + public string AzureVideoIndexerLocation { get; set; } = "trial"; + + /// + /// get/set - Azure Video Indexer API key (Ocp-Apim-Subscription-Key). + /// + public string AzureVideoIndexerApiKey { get; set; } = string.Empty; + + /// + /// get/set - Timeout in minutes for Video Indexer processing. + /// + public int AzureVideoIndexerTimeoutMinutes { get; set; } = 60; + + /// + /// get/set - Polling interval in seconds for Video Indexer status checks. + /// + public int AzureVideoIndexerPollingIntervalSeconds { get; set; } = 30; + #endregion + #region Azure AI configuration /// /// get/set - The URL to the LLM diff --git a/services/net/auto-clipper/Config/StationProfile.cs b/services/net/auto-clipper/Config/StationProfile.cs index 408475cba..a6e9de94e 100644 --- a/services/net/auto-clipper/Config/StationProfile.cs +++ b/services/net/auto-clipper/Config/StationProfile.cs @@ -11,12 +11,35 @@ public class StationProfile public class StationTranscriptionProfile { + /// + /// Transcription provider: "azure_speech" or "azure_video_indexer" + /// public string Provider { get; set; } = "azure_speech"; + + public string Language { get; set; } = "en-US"; + public int SampleRate { get; set; } = 16000; + + // Azure Speech specific settings public bool Diarization { get; set; } public int? MaxSpeakers { get; set; } public string? DiarizationMode { get; set; } = "online"; - public string Language { get; set; } = "en-US"; - public int SampleRate { get; set; } = 16000; + + // Azure Video Indexer specific settings + /// + /// Dictionary of Person Model names to IDs for speaker identification. + /// Example: { "news": "model-id-1", "sports": "model-id-2" } + /// + public Dictionary PersonModels { get; set; } = new(); + + /// + /// Key to select which Person Model to use from PersonModels dictionary. + /// + public string? PersonModelKey { get; set; } + + /// + /// Whether to include speaker labels (speaker1:, speaker2:, or named) in transcript. 
+ /// + public bool IncludeSpeakerLabels { get; set; } } public class StationTextProfile diff --git a/services/net/auto-clipper/Config/Stations/CHAN.yml b/services/net/auto-clipper/Config/Stations/CHAN.yml new file mode 100644 index 000000000..761ebe6cc --- /dev/null +++ b/services/net/auto-clipper/Config/Stations/CHAN.yml @@ -0,0 +1,95 @@ +# CHAN station configuration using Azure Video Indexer +name: CHAN +sample_rate: 16000 + +transcription: + provider: azure_video_indexer + language: en-CA + include_speaker_labels: true + # Person Model (optional) - add later when you have one: + # person_models: + # news: "your-model-id-here" + # person_model_key: news + +text: + chunk_size_s: 3.0 + chunk_overlap_ratio: 0.5 + heuristic_boundary_weight: 0.35 + keyword_categories: + "(?i)traffic": Traffic + "(?i)weather": Weather + "(?i)sponsor": Ad + "(?i)commercial": Ad + "(?i)up next": Promo + "(?i)coming up": Promo + llm_segmentation: true + llm_model: gpt-5-chat + llm_temperature: 0.0 + system_prompt: | + You are a Broadcast Structure Parser. Your ONLY job is to detect segment transitions. + Output MUST be a single, raw JSON object. + CRITICAL: Start your response with '{' and end with '}'. + DO NOT use markdown, backticks, or "```json" blocks. No introductory or closing text. + max_stories: 15 + llm_prompt: | + Identify every point in the transcript where the topic or segment type changes. + Note: Speaker labels (e.g., "speaker1:", "Tom:") indicate who is speaking - use these to help identify segment transitions. + + # SUMMARY RULES + 1. **Prioritize Anchor Leads**: For News, derive the summary from the anchor's introduction or the first three sentences of the report. + 2. **Active Voice**: Use active journalistic voice (e.g., "Surrey Council rejects housing proposal" NOT "A report about a meeting"). + 3. **Category Formulas**: + - News: [Subject] [Action] (e.g., "Abbotsford shooting victim's son calls for urgent investigation"). + - Traffic: [Location] [Incident/Status] (e.g., "Highway 99 northbound blocked at Hwy 17A due to crash"). + - Weather: [Condition] + [High Temp] (e.g., "Mix of sun and cloud with a high of 9 degrees"). + - Ad: [Business Name] + [Offer/Service] (e.g., "McDonald's features Egg McMuffin with Hollandaise sauce"). + 4. **One Sentence Only**: Summaries MUST be a single, concise sentence. + + # STRUCTURAL RULES (To Prevent Bundling) + 1. **The Sign-off Rule**: Phrases like "Global News," "CBC News," or "Reporting live" followed by a name mark the END of a segment. The very next sentence MUST be a new boundary. + 2. **The Handoff Rule**: When an anchor introduces a reporter (e.g., "As Joshua reports..."), the boundary starts at the ANCHOR'S introduction line. + 3. **Mandatory Category Split**: News, Traffic, Weather, and Ads MUST be isolated. Never bundle a Traffic report with a News story. + 4. **Zero Bloating**: Treat every unique headline as a separate clip. If the topic shifts from a shooting to a stabbing, create two distinct boundaries. + 5. **Speaker Change Awareness**: When the speaker label changes (e.g., from "speaker1:" to "speaker2:"), consider if this indicates a segment transition. 
+ + # OUTPUT FORMAT (Raw JSON ONLY) + { + "boundaries": [ + { + "index": [Sentence Number], + "category": "News | Traffic | Weather | Ad | Promo", + "title": "[Short Slug]", + "summary": "[Journalistic Summary Sentence]", + "score": 0.95 + } + ] + } + + Transcript: + {{transcript}} + +heuristics: + pattern_entries: + # --- Common Transition Patterns --- + - pattern: "(?i)coming up" + weight: 0.65 + category: Promo + note: Host tease for the next story + - pattern: "(?i)after the break" + weight: 0.65 + category: Promo + note: Signals a hard break/transition + # --- Service Cues --- + - pattern: "(?i)traffic update" + weight: 0.6 + category: Traffic + note: Recurring traffic block + - pattern: "(?i)weather update" + weight: 0.55 + category: Weather + note: Weather hits are their own segments + # --- Add CHAN-specific anchor patterns here when known --- + # - pattern: "(?i)Anchor Name" + # weight: 0.85 + # category: News/Traffic/Weather + # note: Description diff --git a/services/net/auto-clipper/LLM/ClipSegmentationService.cs b/services/net/auto-clipper/LLM/ClipSegmentationService.cs index 806a6d0c5..7e913d8b0 100644 --- a/services/net/auto-clipper/LLM/ClipSegmentationService.cs +++ b/services/net/auto-clipper/LLM/ClipSegmentationService.cs @@ -295,7 +295,10 @@ private IReadOnlyList ParseResponse(string? body, IReadOnlyList< { if (content == null) continue; - var boundaries = JsonSerializer.Deserialize(content!); + // LLM sometimes returns JSON wrapped in markdown code fences (```json ... ```), + // even when instructed to return raw JSON. Strip the fences before parsing. + var strippedContent = StripCodeFence(content!); + var boundaries = JsonSerializer.Deserialize(strippedContent); if (boundaries == null || boundaries.Boundaries == null) continue; foreach (var boundary in boundaries.Boundaries) { diff --git a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs index 0d098b51c..74c67d6e7 100644 --- a/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs +++ b/services/net/auto-clipper/Pipeline/ClipProcessingPipeline.cs @@ -11,6 +11,7 @@ public class ClipProcessingPipeline { private readonly IAudioNormalizer _audioNormalizer; private readonly IAzureSpeechTranscriptionService _speechTranscriber; + private readonly IAzureVideoIndexerClient? _videoIndexerClient; private readonly IClipSegmentationService _clipSegmentation; private readonly AutoClipperOptions _options; private readonly ILogger _logger; @@ -20,10 +21,12 @@ public ClipProcessingPipeline( IAzureSpeechTranscriptionService speechTranscriber, IClipSegmentationService clipSegmentation, IOptions options, - ILogger logger) + ILogger logger, + IAzureVideoIndexerClient? videoIndexerClient = null) { _audioNormalizer = audioNormalizer; _speechTranscriber = speechTranscriber; + _videoIndexerClient = videoIndexerClient; _clipSegmentation = clipSegmentation; _options = options.Value; _logger = logger; @@ -31,25 +34,71 @@ public ClipProcessingPipeline( public async Task ExecuteAsync(ClipProcessingContext context, CancellationToken cancellationToken) { - var normalizedPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken); var language = !string.IsNullOrWhiteSpace(context.Request.Language) ? context.Request.Language! : !string.IsNullOrWhiteSpace(context.StationProfile.Transcription.Language) ? 
context.StationProfile.Transcription.Language : _options.DefaultTranscriptLanguage; - var transcriptionRequest = new SpeechTranscriptionRequest + + var provider = context.StationProfile.Transcription.Provider ?? "azure_speech"; + IReadOnlyList segments; + string workingPath; + + if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase)) { - Language = language, - EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization, - SpeakerCount = context.StationProfile.Transcription.MaxSpeakers, - DiarizationMode = context.StationProfile.Transcription.DiarizationMode - }; + // Use Azure Video Indexer - upload original file directly (no normalization needed) + if (_videoIndexerClient == null) + throw new InvalidOperationException("Video Indexer client is not configured but provider is set to azure_video_indexer"); + + workingPath = context.SourcePath; + var personModelId = ResolvePersonModelId(context.StationProfile.Transcription); + + _logger.LogInformation("Using Video Indexer provider (PersonModel: {PersonModel})", personModelId ?? "none"); + + var viRequest = new VideoIndexerRequest + { + Language = language, + PersonModelId = personModelId, + IncludeSpeakerLabels = context.StationProfile.Transcription.IncludeSpeakerLabels + }; + + segments = await _videoIndexerClient.TranscribeAsync(context.SourcePath, viRequest, cancellationToken); + } + else + { + // Use Azure Speech (default) - requires audio normalization + _logger.LogInformation("Using Azure Speech provider"); + + workingPath = await _audioNormalizer.NormalizeAsync(context.SourcePath, context.TargetSampleRate, cancellationToken); + var transcriptionRequest = new SpeechTranscriptionRequest + { + Language = language, + EnableSpeakerDiarization = context.StationProfile.Transcription.Diarization, + SpeakerCount = context.StationProfile.Transcription.MaxSpeakers, + DiarizationMode = context.StationProfile.Transcription.DiarizationMode + }; + + segments = await _speechTranscriber.TranscribeAsync(workingPath, transcriptionRequest, cancellationToken); + } - var segments = await _speechTranscriber.TranscribeAsync(normalizedPath, transcriptionRequest, cancellationToken); var segmentationSettings = BuildSegmentationSettings(context.StationProfile); var clipDefinitions = await _clipSegmentation.GenerateClipsAsync(segments, segmentationSettings, cancellationToken); - return new ClipProcessingResult(normalizedPath, language, segments, clipDefinitions, segmentationSettings); + return new ClipProcessingResult(workingPath, language, segments, clipDefinitions, segmentationSettings); + } + + /// + /// Resolves the Person Model ID from station profile configuration. + /// + private static string? ResolvePersonModelId(StationTranscriptionProfile transcription) + { + if (string.IsNullOrWhiteSpace(transcription.PersonModelKey)) + return null; + + if (transcription.PersonModels.TryGetValue(transcription.PersonModelKey, out var modelId)) + return modelId; + + return null; } private static ClipSegmentationSettings BuildSegmentationSettings(StationProfile profile) diff --git a/services/net/auto-clipper/README.md b/services/net/auto-clipper/README.md index 171d2ad5f..8b8654c67 100644 --- a/services/net/auto-clipper/README.md +++ b/services/net/auto-clipper/README.md @@ -1,27 +1,41 @@ # AutoClipper Service -The AutoClipper service consumes clip requests from Kafka, normalizes audio, transcribes it with Azure Speech, and -segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. 
Key concepts: +The AutoClipper service consumes clip requests from Kafka, transcribes media using Azure Speech or Azure Video Indexer, +and segments the transcript into clips using a boundary-aware LLM workflow boosted by station heuristics. Key concepts: -- **Station profiles** (Config/Stations/\*.yml) define language, sample rate, heuristic keywords, custom prompts, and - category mappings for weather/traffic/ads. -- **Pipeline** (ClipProcessingPipeline) normalizes audio, transcribes via AzureSpeechTranscriptionService, and feeds - transcripts plus station config into ClipSegmentationService. +- **Station profiles** (Config/Stations/\*.yml) define transcription provider, language, sample rate, heuristic keywords, + custom prompts, and category mappings for weather/traffic/ads. +- **Transcription providers**: + - `azure_speech` (default) - Fast batch transcription, outputs plain text. + - `azure_video_indexer` - Supports speaker identification (speaker1:, speaker2:, or named via Person Model). +- **Pipeline** (ClipProcessingPipeline) selects the transcription provider based on station config, transcribes the media, + and feeds transcripts plus station config into ClipSegmentationService. - **Segmentation** uses Azure OpenAI to score story boundaries, merges in regex-based heuristics, snaps clips to transcript sentences, and tags each clip with a category before AutoClipperManager creates content and uploads the media. ## Development -1. Update station YAMLs under Config/Stations (copy CKNW.yml as a starting point). -2. Run dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj to verify changes. -3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample audio. +1. Update station YAMLs under Config/Stations (copy CKNW.yml for Azure Speech, CHAN.yml for Video Indexer). +2. Run `dotnet build services/net/auto-clipper/TNO.Services.AutoClipper.csproj` to verify changes. +3. Use the harness (see tools/auto-clipper-harness/README.md) to manually validate segmentation on sample media. ## Configuration -Important Service\_\_ env vars: +### Azure Speech (default provider) -- Service**AzureSpeechKey / Service**AzureSpeechRegion -- Service**AzureSpeechStorageConnectionString / Service**AzureSpeechStorageContainer (batch upload destination for Azure Speech). -- Service**AzureSpeechBatchEndpoint, Service**AzureSpeechBatchApiVersion, Service**AzureSpeechBatchPollingIntervalSeconds, Service**AzureSpeechBatchTimeoutMinutes, Service\_\_AzureSpeechStorageSasExpiryMinutes (optional batch tuning). 
-- Service**LlmApiUrl, Service**LlmApiKey, Service**LlmDeployment, Service**LlmApiVersion -- Service\_\_StationConfigPath (optional override for station YAML directory) +- Service\_\_AzureSpeechKey / Service\_\_AzureSpeechRegion +- Service\_\_AzureSpeechStorageConnectionString / Service\_\_AzureSpeechStorageContainer (batch upload destination) +- Service\_\_AzureSpeechBatchEndpoint, Service\_\_AzureSpeechBatchApiVersion, Service\_\_AzureSpeechBatchPollingIntervalSeconds, Service\_\_AzureSpeechBatchTimeoutMinutes, Service\_\_AzureSpeechStorageSasExpiryMinutes (optional batch tuning) + +### Azure Video Indexer (optional, for speaker identification) + +- Service\_\_AzureVideoIndexerAccountId - Your Video Indexer account ID +- Service\_\_AzureVideoIndexerLocation - Account location (e.g., `trial`, `eastus`) +- Service\_\_AzureVideoIndexerApiKey - API subscription key +- Service\_\_AzureVideoIndexerTimeoutMinutes (default: 60) - Max wait time for processing +- Service\_\_AzureVideoIndexerPollingIntervalSeconds (default: 30) - Status check interval + +### LLM & General + +- Service\_\_LlmApiUrl, Service\_\_LlmApiKey, Service\_\_LlmDeployment, Service\_\_LlmApiVersion +- Service\_\_StationConfigPath (optional override for station YAML directory) \ No newline at end of file diff --git a/services/net/auto-clipper/appsettings.json b/services/net/auto-clipper/appsettings.json index 4bf3fbc1e..1f0a93ff7 100644 --- a/services/net/auto-clipper/appsettings.json +++ b/services/net/auto-clipper/appsettings.json @@ -53,6 +53,12 @@ "AzureSpeechStorageSasExpiryMinutes": 180, "DefaultTranscriptLanguage": "en-US", + "AzureVideoIndexerAccountId": "", + "AzureVideoIndexerLocation": "trial", + "AzureVideoIndexerApiKey": "", + "AzureVideoIndexerTimeoutMinutes": 60, + "AzureVideoIndexerPollingIntervalSeconds": 30, + "LlmApiUrl": "https://mmiopenai.cognitiveservices.azure.com/", "LlmApiKey": "", "LlmDefaultModel": "", diff --git a/tools/auto-clipper-harness/.env.sample b/tools/auto-clipper-harness/.env.sample index d90a25757..7bc0c53c0 100644 --- a/tools/auto-clipper-harness/.env.sample +++ b/tools/auto-clipper-harness/.env.sample @@ -1,4 +1,14 @@ # TEMP HARNESS env file. Delete along with this harness when done. + +# === Transcription Provider Selection === +# Options: azure_speech | azure_video_indexer +AUTOCLIP_HARNESS_PROVIDER=azure_speech + +# === Output Options === +# Include speaker labels in transcript output (true/false) +AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS=false + +# === Azure Speech Configuration === AUTOCLIP_HARNESS_SPEECH_KEY= AUTOCLIP_HARNESS_SPEECH_REGION=canadacentral AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING= @@ -8,12 +18,33 @@ AUTOCLIP_HARNESS_BATCH_ENDPOINT= AUTOCLIP_HARNESS_BATCH_VERSION=v3.2 AUTOCLIP_HARNESS_BATCH_POLL_SECONDS=10 AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES=45 + +# === Azure Video Indexer Configuration === +# Required when AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer +AUTOCLIP_HARNESS_VI_ACCOUNT_ID= +AUTOCLIP_HARNESS_VI_LOCATION=trial +AUTOCLIP_HARNESS_VI_API_KEY= +AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES=60 +AUTOCLIP_HARNESS_VI_POLL_SECONDS=30 + +# Multiple Person Models (JSON dictionary format) +# Keys are usage identifiers, values are Azure Person Model IDs +# Example: {"news":"abc123-news-model","sports":"def456-sports-model","default":"ghi789-general"} +AUTOCLIP_HARNESS_VI_PERSON_MODELS={} + +# Current Person Model key to use (corresponds to a key in the dictionary above) +# Leave empty to skip Person Model identification (will output speaker1, speaker2, etc.) 
+AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY=
+
+# === LLM Configuration ===
# Provide either the full chat-completions endpoint or the base resource URL.
AUTOCLIP_HARNESS_LLM_URL=https://your-resource.openai.azure.com
AUTOCLIP_HARNESS_LLM_KEY=
AUTOCLIP_HARNESS_LLM_DEPLOYMENT=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_MODEL=gpt-4o-mini
AUTOCLIP_HARNESS_LLM_VERSION=2024-07-18
+
+# === General Settings ===
AUTOCLIP_HARNESS_LANGUAGE=en-US
AUTOCLIP_HARNESS_MAX_STORIES=5
diff --git a/tools/auto-clipper-harness/.gitignore b/tools/auto-clipper-harness/.gitignore
index 91b68b99a..9a8771077 100644
--- a/tools/auto-clipper-harness/.gitignore
+++ b/tools/auto-clipper-harness/.gitignore
@@ -1,2 +1,17 @@
**/output/
-**/input/
\ No newline at end of file
+**/input/
+**/auto-clipper-harness-output/
+**/*.mp4
+**/*.avi
+**/*.mkv
+**/*.mov
+**/*.flv
+**/*.wmv
+**/*.webm
+**/*.m4v
+**/*.m4a
+**/*.m4b
+**/*.m4p
+**/*.mp3
+**/*.ogg
+**/*.wav
diff --git a/tools/auto-clipper-harness/Program.cs b/tools/auto-clipper-harness/Program.cs
index 1892d3ed3..539ecc664 100644
--- a/tools/auto-clipper-harness/Program.cs
+++ b/tools/auto-clipper-harness/Program.cs
@@ -1,8 +1,10 @@
using System.Text;
using System.Linq;
+using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
+using AutoClipperHarness;
using TNO.Services.AutoClipper.Audio;
using TNO.Services.AutoClipper.Azure;
using TNO.Services.AutoClipper.Config;
@@ -30,7 +32,13 @@
}
var outputDir = args.Length > 2 ? args[2] : Path.Combine(Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".", "auto-clipper-harness-output");
+Console.WriteLine($"[HARNESS] Output directory: {outputDir}");
Directory.CreateDirectory(outputDir);
+if (!Directory.Exists(outputDir))
+{
+    Console.WriteLine($"[HARNESS] ERROR: Failed to create output directory: {outputDir}");
+    return;
+}
using var loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole(o => o.TimestampFormat = "HH:mm:ss "));
var stationCode = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STATION") ?? "CKNW";
@@ -46,8 +54,6 @@
var sampleRate = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_SAMPLE_RATE"), out var sr) ? sr : (stationProfile.Transcription.SampleRate > 0 ? stationProfile.Transcription.SampleRate : 16000);
-var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger());
-var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate);
var llmBaseUrl = RequireEnv("AUTOCLIP_HARNESS_LLM_URL").Trim();
var llmDeployment = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_DEPLOYMENT");
var llmVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_LLM_VERSION") ?? "2024-07-18";
@@ -57,45 +63,107 @@
    ? (!string.IsNullOrWhiteSpace(llmDeployment) ? llmDeployment : "gpt-4o-mini")
    : llmModel;
-
-
-var options = Options.Create(new AutoClipperOptions
+// Create LLM options (shared by both providers)
+var llmOptions = Options.Create(new AutoClipperOptions
{
-    AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"),
-    AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"),
-    AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty,
-    AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ??
"v3.2", - AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10, - AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45, - AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"), - AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"), - AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180, LlmApiUrl = llmEndpoint, - + LlmApiUrl = llmEndpoint, LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"), - LlmDefaultModel = defaultModel, LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT") ?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt), - MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5, + MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStoriesLlm) ? maxStoriesLlm : 5, VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".", DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? "en-US" }); -var speechLogger = loggerFactory.CreateLogger(); var llmLogger = loggerFactory.CreateLogger(); -var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger); -var llmService = new ClipSegmentationService(new HttpClient(), options, llmLogger); -var transcriptionRequest = new SpeechTranscriptionRequest +var llmService = new ClipSegmentationService(new HttpClient(), llmOptions, llmLogger); + +// Determine provider and output options (early detection to avoid unnecessary initialization) +var provider = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROVIDER") ?? "azure_speech"; +var includeSpeakerLabels = bool.TryParse( + Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_INCLUDE_SPEAKER_LABELS"), + out var isl) && isl; + +// Transcribe using selected provider +IReadOnlyList transcriptSegments; +IReadOnlyList segments; + +if (string.Equals(provider, "azure_video_indexer", StringComparison.OrdinalIgnoreCase)) +{ + // Use Azure Video Indexer - no audio normalization needed, upload original file directly + var personModelId = ResolvePersonModelId(); + Console.WriteLine($"[HARNESS] Using Video Indexer (PersonModel: {personModelId ?? "none"})"); + Console.WriteLine($"[HARNESS] Uploading original file: {input}"); + + var viClient = new VideoIndexerClient( + new HttpClient(), + RequireEnv("AUTOCLIP_HARNESS_VI_ACCOUNT_ID"), + Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_LOCATION") ?? "trial", + RequireEnv("AUTOCLIP_HARNESS_VI_API_KEY"), + int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_TIMEOUT_MINUTES"), out var viTimeout) ? viTimeout : 60, + int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_POLL_SECONDS"), out var viPoll) ? 
viPoll : 30, + loggerFactory.CreateLogger()); + + var viRequest = new VideoIndexerRequest + { + Language = language, + PersonModelId = personModelId, + IncludeSpeakerLabels = includeSpeakerLabels + }; + + var rawJsonPath = Path.Combine(outputDir, "video_indexer_raw_response.json"); + transcriptSegments = await viClient.TranscribeAsync(input, viRequest, CancellationToken.None, rawJsonPath); + Console.WriteLine($"[HARNESS] Raw Video Indexer response -> {rawJsonPath}"); + + // Convert to TimestampedTranscript for downstream compatibility + segments = transcriptSegments.Select(s => new TimestampedTranscript(s.Start, s.End, s.Text)).ToList(); +} +else { - Language = language, - EnableSpeakerDiarization = stationProfile.Transcription.Diarization, - SpeakerCount = stationProfile.Transcription.MaxSpeakers, - DiarizationMode = stationProfile.Transcription.DiarizationMode -}; - -Console.WriteLine($"[HARNESS] Transcribing {workingFile} ..."); -var segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None); + // Use Azure Speech (default) - requires audio normalization + var audioNormalizer = new AudioNormalizer(loggerFactory.CreateLogger()); + var workingFile = await audioNormalizer.NormalizeAsync(input, sampleRate); + + var options = Options.Create(new AutoClipperOptions + { + AzureSpeechKey = RequireEnv("AUTOCLIP_HARNESS_SPEECH_KEY"), + AzureSpeechRegion = RequireEnv("AUTOCLIP_HARNESS_SPEECH_REGION"), + AzureSpeechBatchEndpoint = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_ENDPOINT") ?? string.Empty, + AzureSpeechBatchApiVersion = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_VERSION") ?? "v3.2", + AzureSpeechBatchPollingIntervalSeconds = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_POLL_SECONDS"), out var batchPollSeconds) ? batchPollSeconds : 10, + AzureSpeechBatchTimeoutMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_BATCH_TIMEOUT_MINUTES"), out var batchTimeoutMinutes) ? batchTimeoutMinutes : 45, + AzureSpeechStorageConnectionString = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING"), + AzureSpeechStorageContainer = RequireEnv("AUTOCLIP_HARNESS_STORAGE_CONTAINER"), + AzureSpeechStorageSasExpiryMinutes = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_STORAGE_SAS_MINUTES"), out var sasMinutes) ? sasMinutes : 180, + LlmApiUrl = llmEndpoint, + LlmApiKey = RequireEnv("AUTOCLIP_HARNESS_LLM_KEY"), + LlmDefaultModel = defaultModel, + LlmPrompt = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_PROMPT") + ?? (string.IsNullOrWhiteSpace(stationProfile.Text.LlmPrompt) ? string.Empty : stationProfile.Text.LlmPrompt), + MaxStoriesFromClip = int.TryParse(Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_MAX_STORIES"), out var maxStories) ? maxStories : 5, + VolumePath = Path.GetDirectoryName(Path.GetFullPath(input)) ?? ".", + DefaultTranscriptLanguage = stationProfile.Transcription.Language ?? 
"en-US" + }); + + var speechLogger = loggerFactory.CreateLogger(); + var speechService = new AzureSpeechTranscriptionService(new HttpClient(), options, speechLogger); + var transcriptionRequest = new SpeechTranscriptionRequest + { + Language = language, + EnableSpeakerDiarization = stationProfile.Transcription.Diarization, + SpeakerCount = stationProfile.Transcription.MaxSpeakers, + DiarizationMode = stationProfile.Transcription.DiarizationMode + }; + + Console.WriteLine($"[HARNESS] Transcribing with Azure Speech: {workingFile} ..."); + segments = await speechService.TranscribeAsync(workingFile, transcriptionRequest, CancellationToken.None); + + // Convert to TranscriptSegment for speaker label support + transcriptSegments = segments.Select(s => new TranscriptSegment(s.Start, s.End, s.Text)).ToList(); +} + Console.WriteLine($"[HARNESS] Received {segments.Count} transcript segments"); -var fullTranscriptBody = BuildTranscriptDocument(segments); +var fullTranscriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSegments, includeSpeakerLabels); var fullTranscriptPath = Path.Combine(outputDir, "transcript_full.txt"); await File.WriteAllTextAsync(fullTranscriptPath, fullTranscriptBody ?? string.Empty); Console.WriteLine($"[HARNESS] Full transcript -> {fullTranscriptPath}"); @@ -118,8 +186,8 @@ continue; } - var transcriptSlice = ExtractTranscriptRange(segments, normalized.Start, normalized.End); - var transcriptBody = BuildTranscriptDocument(transcriptSlice); + var transcriptSlice = ExtractTranscriptSegmentRange(transcriptSegments, normalized.Start, normalized.End); + var transcriptBody = BuildTranscriptDocumentWithSpeakers(transcriptSlice, includeSpeakerLabels); if (string.IsNullOrWhiteSpace(transcriptBody)) { Console.WriteLine($"[HARNESS] Empty transcript for clip {definition.Title}"); @@ -378,6 +446,60 @@ static void LoadEnvFile(string path) static IReadOnlyList ExtractTranscriptRange(IReadOnlyList segments, TimeSpan start, TimeSpan end) => segments.Where(s => s.End > start && s.Start < end).ToArray(); +static IReadOnlyList ExtractTranscriptSegmentRange(IReadOnlyList segments, TimeSpan start, TimeSpan end) + => segments.Where(s => s.End > start && s.Start < end).ToArray(); + +static string? ResolvePersonModelId() +{ + var personModelsJson = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODELS"); + var personModelKey = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY"); + + if (string.IsNullOrWhiteSpace(personModelsJson) || string.IsNullOrWhiteSpace(personModelKey)) + return null; + + try + { + var personModels = JsonSerializer.Deserialize>(personModelsJson); + if (personModels != null && personModels.TryGetValue(personModelKey, out var modelId)) + { + return modelId; + } + } + catch (JsonException) + { + Console.WriteLine($"[HARNESS] Warning: Failed to parse AUTOCLIP_HARNESS_VI_PERSON_MODELS as JSON"); + } + + return null; +} + +static string BuildTranscriptDocumentWithSpeakers(IReadOnlyList segments, bool includeSpeakerLabels) +{ + if (segments == null || segments.Count == 0) return string.Empty; + var sb = new StringBuilder(); + var idx = 1; + foreach (var segment in segments) + { + if (string.IsNullOrWhiteSpace(segment.Text)) continue; + + sb.AppendLine(idx.ToString()); + sb.AppendLine($"{FormatTimestamp(segment.Start)} --> {FormatTimestamp(segment.End)}"); + + if (includeSpeakerLabels && (segment.SpeakerName != null || segment.SpeakerId != null)) + { + var label = segment.SpeakerName ?? 
$"speaker{segment.SpeakerId}"; + sb.AppendLine($"{label}: {segment.Text.Trim()}"); + } + else + { + sb.AppendLine(segment.Text.Trim()); + } + sb.AppendLine(); + idx++; + } + return sb.ToString().Trim(); +} + diff --git a/tools/auto-clipper-harness/README.md b/tools/auto-clipper-harness/README.md index 933223301..45667630a 100644 --- a/tools/auto-clipper-harness/README.md +++ b/tools/auto-clipper-harness/README.md @@ -1,23 +1,48 @@ # AutoClipper Harness -The harness is a standalone console app that mirrors the AutoClipper pipeline for manual validation. It -normalizes a local media file, runs Azure Speech transcription, feeds the transcript and station heuristics to the -segmenter, and writes clips/transcripts/prompt debug files for inspection. +Standalone console app for local testing of the AutoClipper pipeline. ## Usage -`dotnet run --project tools/auto-clipper-harness -- [language] [outputDir]` +```bash +dotnet run --project tools/auto-clipper-harness -- [language] [outputDir] +``` -- Configure Azure keys and LLM settings via .env (see .env.sample). -- Provide AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING / AUTOCLIP_HARNESS_STORAGE_CONTAINER so the harness can upload audio for Azure batch transcription. -- Optional overrides: AUTOCLIP_HARNESS_BATCH_ENDPOINT, \_BATCH_VERSION, \_BATCH_POLL_SECONDS, \_BATCH_TIMEOUT_MINUTES, and \_STORAGE_SAS_MINUTES. -- Station profiles are loaded from services/net/auto-clipper/Config/Stations by default; override with - AUTOCLIP_HARNESS_STATION_PATH / AUTOCLIP_HARNESS_STATION. -- Outputs: clip_XX.\* media slices, clip_XX.txt transcripts, ranscript_full.txt, and - llm_prompt_debug.txt (shows numbered transcript, heuristics, and the final prompt). +## Configuration + +Copy `.env.sample` to `.env` and fill in your keys. + +### Provider Selection + +```bash +AUTOCLIP_HARNESS_PROVIDER=azure_speech # default +AUTOCLIP_HARNESS_PROVIDER=azure_video_indexer # with speaker identification +``` + +### Azure Speech (default) + +- `AUTOCLIP_HARNESS_SPEECH_KEY` / `AUTOCLIP_HARNESS_SPEECH_REGION` +- `AUTOCLIP_HARNESS_STORAGE_CONNECTION_STRING` / `AUTOCLIP_HARNESS_STORAGE_CONTAINER` + +### Azure Video Indexer + +- `AUTOCLIP_HARNESS_VI_ACCOUNT_ID` / `AUTOCLIP_HARNESS_VI_API_KEY` +- `AUTOCLIP_HARNESS_VI_LOCATION` (default: trial) +- `AUTOCLIP_HARNESS_VI_PERSON_MODELS` - JSON map for multiple Person Models +- `AUTOCLIP_HARNESS_VI_PERSON_MODEL_KEY` - which model to use + +### LLM + +- `AUTOCLIP_HARNESS_LLM_ENDPOINT` / `AUTOCLIP_HARNESS_LLM_KEY` + +## Output + +- `clip_XX.mp4` / `clip_XX.txt` - segmented clips with transcripts +- `transcript_full.txt` - full transcript (with speaker labels if enabled) +- `video_indexer_raw_response.json` - raw API response (Video Indexer only) +- `llm_prompt_debug.txt` - LLM prompt for debugging ## Notes -- The harness shares the segmentation logic with the service, so any changes in ClipSegmentationService - should be validated here first. -- Ensure ffmpeg is available on PATH; the harness shells out to ffmpeg to produce media clips. +- Video Indexer uploads original media directly; Azure Speech requires WAV conversion +- Requires `ffmpeg` on PATH for clip extraction diff --git a/tools/auto-clipper-harness/TranscriptSegment.cs b/tools/auto-clipper-harness/TranscriptSegment.cs new file mode 100644 index 000000000..66d18adc1 --- /dev/null +++ b/tools/auto-clipper-harness/TranscriptSegment.cs @@ -0,0 +1,13 @@ +namespace AutoClipperHarness; + +/// +/// Represents a transcript segment with optional speaker identification. 
+/// This is the harness-local version; will be merged into TimestampedTranscript in phase 2. +/// +public record TranscriptSegment( + TimeSpan Start, + TimeSpan End, + string Text, + int? SpeakerId = null, + string? SpeakerName = null +); diff --git a/tools/auto-clipper-harness/VideoIndexerClient.cs b/tools/auto-clipper-harness/VideoIndexerClient.cs new file mode 100644 index 000000000..ddf09d61e --- /dev/null +++ b/tools/auto-clipper-harness/VideoIndexerClient.cs @@ -0,0 +1,398 @@ +using System.Net.Http.Headers; +using System.Text.Json; +using Microsoft.Extensions.Logging; + +namespace AutoClipperHarness; + +/// +/// Client for Azure Video Indexer API. +/// Handles video upload, processing, and transcript extraction with speaker identification. +/// +public class VideoIndexerClient +{ + private const string ApiBaseUrl = "https://api.videoindexer.ai"; + + private readonly HttpClient _httpClient; + private readonly string _accountId; + private readonly string _location; + private readonly string _apiKey; + private readonly int _timeoutMinutes; + private readonly int _pollIntervalSeconds; + private readonly ILogger? _logger; + + private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web) + { + PropertyNameCaseInsensitive = true + }; + + public VideoIndexerClient( + HttpClient httpClient, + string accountId, + string location, + string apiKey, + int timeoutMinutes = 60, + int pollIntervalSeconds = 30, + ILogger? logger = null) + { + _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient)); + _accountId = accountId ?? throw new ArgumentNullException(nameof(accountId)); + _location = location ?? throw new ArgumentNullException(nameof(location)); + _apiKey = apiKey ?? throw new ArgumentNullException(nameof(apiKey)); + _timeoutMinutes = timeoutMinutes > 0 ? timeoutMinutes : 60; + _pollIntervalSeconds = pollIntervalSeconds > 0 ? pollIntervalSeconds : 30; + _logger = logger; + } + + /// + /// Transcribes a media file using Azure Video Indexer. + /// + /// Path to the media file (video or audio). + /// Transcription request options. + /// Cancellation token. + /// Optional path to save the raw Video Indexer JSON response for debugging. + /// List of transcript segments with optional speaker information. + public async Task> TranscribeAsync( + string filePath, + VideoIndexerRequest request, + CancellationToken cancellationToken = default, + string? 
+    {
+        if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
+            throw new FileNotFoundException("Media file not found", filePath);
+
+        _logger?.LogInformation("Starting Video Indexer transcription for {File}", filePath);
+
+        // Step 1: Get access token
+        var accessToken = await GetAccessTokenAsync(cancellationToken);
+        _logger?.LogDebug("Obtained access token");
+
+        // Step 2: Upload video
+        var videoId = await UploadVideoAsync(filePath, accessToken, request, cancellationToken);
+        _logger?.LogInformation("Video uploaded with ID: {VideoId}", videoId);
+
+        try
+        {
+            // Step 3: Wait for processing to complete
+            var indexJson = await WaitForProcessingAsync(videoId, accessToken, cancellationToken);
+            _logger?.LogInformation("Video processing completed");
+
+            // Save raw JSON for debugging if path is provided
+            if (!string.IsNullOrWhiteSpace(rawJsonOutputPath))
+            {
+                try
+                {
+                    // Ensure directory exists
+                    var dir = Path.GetDirectoryName(rawJsonOutputPath);
+                    if (!string.IsNullOrWhiteSpace(dir))
+                        Directory.CreateDirectory(dir);
+
+                    // Pretty print the JSON
+                    using var doc = JsonDocument.Parse(indexJson);
+                    var prettyJson = JsonSerializer.Serialize(doc, new JsonSerializerOptions { WriteIndented = true });
+                    await File.WriteAllTextAsync(rawJsonOutputPath, prettyJson, cancellationToken);
+                    _logger?.LogInformation("Saved raw Video Indexer response to {Path}", rawJsonOutputPath);
+                }
+                catch (Exception ex)
+                {
+                    _logger?.LogWarning(ex, "Failed to save raw JSON to {Path}", rawJsonOutputPath);
+                }
+            }
+
+            // Step 4: Parse transcript
+            var segments = ParseTranscript(indexJson, request.IncludeSpeakerLabels);
+            _logger?.LogInformation("Parsed {Count} transcript segments", segments.Count);
+
+            return segments;
+        }
+        finally
+        {
+            // Clean up: delete the video from Video Indexer
+            await TryDeleteVideoAsync(videoId, accessToken);
+        }
+    }
+
+    private async Task<string> GetAccessTokenAsync(CancellationToken cancellationToken)
+    {
+        var url = $"{ApiBaseUrl}/Auth/{_location}/Accounts/{_accountId}/AccessToken?allowEdit=true";
+
+        using var request = new HttpRequestMessage(HttpMethod.Get, url);
+        request.Headers.Add("Ocp-Apim-Subscription-Key", _apiKey);
+
+        using var response = await _httpClient.SendAsync(request, cancellationToken);
+        var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+        if (!response.IsSuccessStatusCode)
+            throw new InvalidOperationException($"Failed to get access token: {response.StatusCode} - {body}");
+
+        // Token is returned as a quoted string
+        return body.Trim('"');
+    }
+
+    private async Task<string> UploadVideoAsync(
+        string filePath,
+        string accessToken,
+        VideoIndexerRequest request,
+        CancellationToken cancellationToken)
+    {
+        var fileName = Path.GetFileName(filePath);
+        var videoName = $"AutoClipper-{Guid.NewGuid():N}-{fileName}";
+
+        // Build upload URL with query parameters
+        var queryParams = new List<string>
+        {
+            $"accessToken={Uri.EscapeDataString(accessToken)}",
+            $"name={Uri.EscapeDataString(videoName)}",
+            $"language={Uri.EscapeDataString(request.Language)}",
+            "privacy=Private",
+            "indexingPreset=AudioOnly" // We only need audio analysis for transcription
+        };
+
+        if (!string.IsNullOrWhiteSpace(request.PersonModelId))
+        {
+            queryParams.Add($"personModelId={Uri.EscapeDataString(request.PersonModelId)}");
+        }
+
+        var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos?{string.Join("&", queryParams)}";
+
+        _logger?.LogDebug("Uploading video to: {Url}", url.Split('?')[0]);
+
+        using var content = new MultipartFormDataContent();
+        await using var fileStream = File.OpenRead(filePath);
+        var fileContent = new StreamContent(fileStream);
+        fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
+        content.Add(fileContent, "file", fileName);
+
+        using var response = await _httpClient.PostAsync(url, content, cancellationToken);
+        var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+        if (!response.IsSuccessStatusCode)
+            throw new InvalidOperationException($"Failed to upload video: {response.StatusCode} - {body}");
+
+        using var doc = JsonDocument.Parse(body);
+        var videoId = doc.RootElement.GetProperty("id").GetString();
+
+        if (string.IsNullOrWhiteSpace(videoId))
+            throw new InvalidOperationException("Video Indexer did not return a video ID");
+
+        return videoId;
+    }
+
+    private async Task<string> WaitForProcessingAsync(
+        string videoId,
+        string accessToken,
+        CancellationToken cancellationToken)
+    {
+        var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}/Index?accessToken={Uri.EscapeDataString(accessToken)}";
+        var timeout = TimeSpan.FromMinutes(_timeoutMinutes);
+        var pollInterval = TimeSpan.FromSeconds(_pollIntervalSeconds);
+        var startTime = DateTime.UtcNow;
+
+        while (true)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            if (DateTime.UtcNow - startTime > timeout)
+                throw new TimeoutException($"Video Indexer processing did not complete within {_timeoutMinutes} minutes");
+
+            using var response = await _httpClient.GetAsync(url, cancellationToken);
+            var body = await response.Content.ReadAsStringAsync(cancellationToken);
+
+            if (!response.IsSuccessStatusCode)
+                throw new InvalidOperationException($"Failed to get video index: {response.StatusCode} - {body}");
+
+            using var doc = JsonDocument.Parse(body);
+            var state = doc.RootElement.GetProperty("state").GetString();
+
+            var elapsed = (int)(DateTime.UtcNow - startTime).TotalSeconds;
+            _logger?.LogInformation("Video Indexer status: {State} ({Elapsed}s elapsed)", state, elapsed);
+
+            if (string.Equals(state, "Processed", StringComparison.OrdinalIgnoreCase))
+            {
+                return body;
+            }
+
+            if (string.Equals(state, "Failed", StringComparison.OrdinalIgnoreCase))
+            {
+                var errorMessage = "Unknown error";
+                if (doc.RootElement.TryGetProperty("failureMessage", out var failureMsg))
+                    errorMessage = failureMsg.GetString() ?? errorMessage;
+                throw new InvalidOperationException($"Video Indexer processing failed: {errorMessage}");
+            }
+
+            await Task.Delay(pollInterval, cancellationToken);
+        }
+    }
+
+    private IReadOnlyList<TranscriptSegment> ParseTranscript(string indexJson, bool includeSpeakerLabels)
+    {
+        var segments = new List<TranscriptSegment>();
+
+        using var doc = JsonDocument.Parse(indexJson);
+
+        // Navigate to: videos[0].insights.transcript
+        if (!doc.RootElement.TryGetProperty("videos", out var videos) ||
+            videos.GetArrayLength() == 0)
+        {
+            _logger?.LogWarning("No videos found in index response");
+            return segments;
+        }
+
+        var video = videos[0];
+        if (!video.TryGetProperty("insights", out var insights))
+        {
+            _logger?.LogWarning("No insights found in video");
+            return segments;
+        }
+
+        // Build speaker name map from faces/speakers if available
+        var speakerNames = BuildSpeakerNameMap(insights);
+
+        // Parse transcript
+        if (!insights.TryGetProperty("transcript", out var transcript) ||
+            transcript.ValueKind != JsonValueKind.Array)
+        {
+            _logger?.LogWarning("No transcript found in insights");
+            return segments;
+        }
+
+        foreach (var item in transcript.EnumerateArray())
+        {
+            var text = item.TryGetProperty("text", out var textProp) ? textProp.GetString() : null;
+            if (string.IsNullOrWhiteSpace(text))
+                continue;
+
+            // Parse timestamps from instances array
+            // Video Indexer format: { "instances": [{ "start": "0:00:00.4", "end": "0:00:08.36" }] }
+            var start = TimeSpan.Zero;
+            var end = TimeSpan.Zero;
+
+            if (item.TryGetProperty("instances", out var instances) &&
+                instances.ValueKind == JsonValueKind.Array &&
+                instances.GetArrayLength() > 0)
+            {
+                var firstInstance = instances[0];
+                start = ParseTimestamp(firstInstance, "start");
+                end = ParseTimestamp(firstInstance, "end");
+            }
+
+            if (end <= start)
+                end = start + TimeSpan.FromMilliseconds(100);
+
+            // Parse speaker information
+            int? speakerId = null;
+            string? speakerName = null;
+
+            if (includeSpeakerLabels && item.TryGetProperty("speakerId", out var speakerIdProp))
+            {
+                speakerId = speakerIdProp.ValueKind == JsonValueKind.Number
+                    ? speakerIdProp.GetInt32()
+                    : int.TryParse(speakerIdProp.GetString(), out var id) ? id : null;
+
+                if (speakerId.HasValue && speakerNames.TryGetValue(speakerId.Value, out var name))
+                {
+                    speakerName = name;
+                }
+            }
+
+            segments.Add(new TranscriptSegment(start, end, text.Trim(), speakerId, speakerName));
+        }
+
+        return segments.OrderBy(s => s.Start).ToList();
+    }
+
+    private Dictionary<int, string> BuildSpeakerNameMap(JsonElement insights)
+    {
+        var map = new Dictionary<int, string>();
+
+        // Try to get speaker names from faces (Person Model identification)
+        if (insights.TryGetProperty("faces", out var faces) && faces.ValueKind == JsonValueKind.Array)
+        {
+            foreach (var face in faces.EnumerateArray())
+            {
+                if (!face.TryGetProperty("id", out var idProp))
+                    continue;
+
+                var faceId = idProp.ValueKind == JsonValueKind.Number
+                    ? idProp.GetInt32()
+                    : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+                if (!faceId.HasValue)
+                    continue;
+
+                // Get name - could be from Person Model or Unknown
+                var name = face.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+
+                // Skip unknown faces - we'll use speaker ID instead
+                if (!string.IsNullOrWhiteSpace(name) &&
+                    !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase))
+                {
+                    map[faceId.Value] = name;
+                }
+            }
+        }
+
+        // Also check speakers section
+        if (insights.TryGetProperty("speakers", out var speakers) && speakers.ValueKind == JsonValueKind.Array)
+        {
+            foreach (var speaker in speakers.EnumerateArray())
+            {
+                if (!speaker.TryGetProperty("id", out var idProp))
+                    continue;
+
+                var speakerId = idProp.ValueKind == JsonValueKind.Number
+                    ? idProp.GetInt32()
+                    : int.TryParse(idProp.GetString(), out var id) ? id : (int?)null;
+
+                if (!speakerId.HasValue || map.ContainsKey(speakerId.Value))
+                    continue;
+
+                var name = speaker.TryGetProperty("name", out var nameProp) ? nameProp.GetString() : null;
+                if (!string.IsNullOrWhiteSpace(name) &&
+                    !name.StartsWith("Unknown", StringComparison.OrdinalIgnoreCase) &&
+                    !name.StartsWith("Speaker", StringComparison.OrdinalIgnoreCase))
+                {
+                    map[speakerId.Value] = name;
+                }
+            }
+        }
+
+        return map;
+    }
+
+    private static TimeSpan ParseTimestamp(JsonElement element, string property)
+    {
+        if (!element.TryGetProperty(property, out var prop))
+            return TimeSpan.Zero;
+
+        var value = prop.GetString();
+        if (string.IsNullOrWhiteSpace(value))
+            return TimeSpan.Zero;
+
+        // Video Indexer uses format like "0:00:05.12" or "00:00:05.12"
+        if (TimeSpan.TryParse(value, out var ts))
+            return ts;
+
+        // Try parsing as seconds
+        if (double.TryParse(value, out var seconds))
+            return TimeSpan.FromSeconds(seconds);
+
+        return TimeSpan.Zero;
+    }
+
+    private async Task TryDeleteVideoAsync(string videoId, string accessToken)
+    {
+        try
+        {
+            var url = $"{ApiBaseUrl}/{_location}/Accounts/{_accountId}/Videos/{videoId}?accessToken={Uri.EscapeDataString(accessToken)}";
+            using var response = await _httpClient.DeleteAsync(url);
+            if (response.IsSuccessStatusCode)
+            {
+                _logger?.LogDebug("Deleted video {VideoId} from Video Indexer", videoId);
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger?.LogWarning(ex, "Failed to delete video {VideoId} from Video Indexer", videoId);
+        }
+    }
+}
diff --git a/tools/auto-clipper-harness/VideoIndexerRequest.cs b/tools/auto-clipper-harness/VideoIndexerRequest.cs
new file mode 100644
index 000000000..a24d54518
--- /dev/null
+++ b/tools/auto-clipper-harness/VideoIndexerRequest.cs
@@ -0,0 +1,23 @@
+namespace AutoClipperHarness;
+
+/// <summary>
+/// Request options for Azure Video Indexer transcription.
+/// </summary>
+public class VideoIndexerRequest
+{
+    /// <summary>
+    /// Language code for transcription (e.g., "en-US", "zh-CN").
+    /// </summary>
+    public string Language { get; init; } = "en-US";
+
+    /// <summary>
+    /// Optional Person Model ID for speaker identification.
+    /// When provided, Video Indexer will attempt to identify known faces.
+    /// </summary>
+    public string? PersonModelId { get; init; }
+
+    /// <summary>
+    /// Whether to include speaker labels in the output.
+    /// </summary>
+    public bool IncludeSpeakerLabels { get; init; } = true;
+}
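
Reviewer note (not part of the diff): the sketch below shows one way the new harness pieces could be exercised end to end. The `VideoIndexerClient`, `VideoIndexerRequest`, and `TranscriptSegment` types and the `AUTOCLIP_HARNESS_VI_*` variable names come from this change; the entry-point wiring, output path, and speaker-labelled formatting loop are illustrative assumptions only, not code in this PR.

```csharp
// Hypothetical harness entry point (top-level statements, .NET 6+ implicit usings assumed).
// Everything below is a sketch; only the referenced types and env var names come from the PR.
using System.Text;
using AutoClipperHarness;

var accountId = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_ACCOUNT_ID")
    ?? throw new InvalidOperationException("AUTOCLIP_HARNESS_VI_ACCOUNT_ID is not set");
var apiKey = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_API_KEY")
    ?? throw new InvalidOperationException("AUTOCLIP_HARNESS_VI_API_KEY is not set");
var location = Environment.GetEnvironmentVariable("AUTOCLIP_HARNESS_VI_LOCATION") ?? "trial";

using var httpClient = new HttpClient();
var client = new VideoIndexerClient(httpClient, accountId, location, apiKey);

var request = new VideoIndexerRequest
{
    Language = "en-US",
    IncludeSpeakerLabels = true,
    // PersonModelId = "<person-model-guid>", // optional: enables known-speaker names
};

// Media path is a placeholder; the real harness takes it from the command line.
var mediaPath = args.Length > 0 ? args[0] : "sample.mp4";

// Upload, poll until processed, and pull back transcript segments with speaker info.
IReadOnlyList<TranscriptSegment> segments = await client.TranscribeAsync(
    mediaPath,
    request,
    rawJsonOutputPath: "out/video_indexer_raw_response.json");

// Speaker-labelled dump, mirroring the speakerN / known-name convention used in the diff.
var sb = new StringBuilder();
foreach (var segment in segments)
{
    var label = segment.SpeakerName
        ?? (segment.SpeakerId.HasValue ? $"speaker{segment.SpeakerId}" : null);
    sb.AppendLine(label is null ? segment.Text : $"{label}: {segment.Text}");
}
Console.WriteLine(sb.ToString());
```

Run as usual with `dotnet run --project tools/auto-clipper-harness -- <inputFile>` once the `AUTOCLIP_HARNESS_VI_*` variables are set; with no Person Model configured, output falls back to anonymous `speakerN` labels.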