From ecb1557adb1c74ec481569c83187830f9bfd9491 Mon Sep 17 00:00:00 2001
From: "martinadams.dev" <martinadams.dev@gmail.com>
Date: Sat, 7 Feb 2026 23:55:27 +0800
Subject: [PATCH] feat: subscribe to multiple audio tracks and add RFC 6464
 audio level extension

When multiple devices join the same LiveKit room, they cannot hear each
other because subscribe_tracks() only subscribes to the first audio track
and breaks. This removes the single-track limitation, allowing subscription
to up to MAX_AUDIO_SUBSCRIPTIONS (4) audio tracks with deduplication.

Additionally, injects RFC 6464 ssrc-audio-level RTP header extension into
publisher SDP and outgoing RTP packets to support LiveKit Active Speaker
detection. Requires esp_peer >= 1.3.0 for the RTP transformer API.
---
 components/livekit/core/engine.c |  33 +++--
 components/livekit/core/peer.c   | 245 ++++++++++++++++++++++++++++++-
 2 files changed, 264 insertions(+), 14 deletions(-)

diff --git a/components/livekit/core/engine.c b/components/livekit/core/engine.c
index 7bb739e..1069f39 100644
--- a/components/livekit/core/engine.c
+++ b/components/livekit/core/engine.c
@@ -88,10 +88,13 @@ typedef struct {
     } detail;
 } engine_event_t;
 
+#define MAX_AUDIO_SUBSCRIPTIONS 4
+
 typedef struct {
     bool is_subscriber_primary;
     livekit_pb_sid_t local_participant_sid;
-    livekit_pb_sid_t sub_audio_track_sid;
+    livekit_pb_sid_t sub_audio_track_sids[MAX_AUDIO_SUBSCRIPTIONS];
+    uint8_t sub_audio_track_count;
 } session_state_t;
 
 typedef struct {
@@ -152,19 +155,32 @@ static engine_err_t subscribe_tracks(engine_t *eng, livekit_pb_track_info_t *tra
     if (tracks == NULL || count <= 0) {
         return ENGINE_ERR_INVALID_ARG;
     }
-    if (eng->session.sub_audio_track_sid[0] != '\0') {
-        return ENGINE_ERR_MAX_SUB;
-    }
     for (int i = 0; i < count; i++) {
         livekit_pb_track_info_t *track = &tracks[i];
         if (track->type != LIVEKIT_PB_TRACK_TYPE_AUDIO) {
             continue;
         }
-        // For now, subscribe to the first audio track.
+        // Check if already subscribed to this track
+        bool already_subscribed = false;
+        for (uint8_t j = 0; j < eng->session.sub_audio_track_count; j++) {
+            if (strncmp(eng->session.sub_audio_track_sids[j], track->sid,
+                        sizeof(livekit_pb_sid_t)) == 0) {
+                already_subscribed = true;
+                break;
+            }
+        }
+        if (already_subscribed) {
+            continue;
+        }
+        if (eng->session.sub_audio_track_count >= MAX_AUDIO_SUBSCRIPTIONS) {
+            ESP_LOGW(TAG, "Max audio subscriptions reached (%d)", MAX_AUDIO_SUBSCRIPTIONS);
+            return ENGINE_ERR_MAX_SUB;
+        }
         ESP_LOGI(TAG, "Subscribing to audio track: sid=%s", track->sid);
         signal_send_update_subscription(eng->signal_handle, track->sid, true);
-        strlcpy(eng->session.sub_audio_track_sid, track->sid, sizeof(eng->session.sub_audio_track_sid));
-        break;
+        strlcpy(eng->session.sub_audio_track_sids[eng->session.sub_audio_track_count],
+                track->sid, sizeof(livekit_pb_sid_t));
+        eng->session.sub_audio_track_count++;
     }
     return ENGINE_ERR_NONE;
 }
@@ -692,12 +708,11 @@ static bool handle_join(engine_t *eng, const livekit_pb_join_response_t *join)
 
     // 6. Subscribe to remote tracks that have already been published.
     for (pb_size_t i = 0; i < join->other_participants_count; i++) {
-        engine_err_t ret = subscribe_tracks(
+        subscribe_tracks(
             eng,
             join->other_participants[i].tracks,
             join->other_participants[i].tracks_count
         );
-        if (ret == ENGINE_ERR_MAX_SUB) break;
     }
     return true;
 }
diff --git a/components/livekit/core/peer.c b/components/livekit/core/peer.c
index d442f92..cc637bc 100644
--- a/components/livekit/core/peer.c
+++ b/components/livekit/core/peer.c
@@ -15,6 +15,7 @@
  */
 
 #include <stdlib.h>
+#include <string.h>
 #include "esp_log.h"
 #include "esp_peer.h"
 #include "esp_peer_default.h"
@@ -36,6 +37,121 @@ static const char *PUB_TAG = "livekit_peer.pub";
 #define PC_RESUME_BIT    (1 << 2)
 #define PC_SEND_QUIT_BIT (1 << 3)
 
+// RFC 6464 audio level RTP header extension constants
+#define AUDIO_LEVEL_URI         "urn:ietf:params:rtp-hdrext:ssrc-audio-level"
+#define AUDIO_LEVEL_DEFAULT     30  // Placeholder: -30 dBov (fixed level, not measured from actual audio)
+#define RTP_EXT_BLOCK_SIZE      8   // 4-byte profile+length header + 1-byte element ID + 1-byte audio level + 2-byte padding
+
+// MARK: - Audio level RTP header extension (RFC 6464)
+
+/// Finds the smallest unused extmap ID (1-14) within an SDP media section.
+static uint8_t sdp_find_unused_extmap_id(const char *section_start, const char *section_end)
+{
+    uint16_t used_ids = 0;
+    const char *p = section_start;
+    while ((p = strstr(p, "\na=extmap:")) != NULL && p < section_end) {
+        p += 10; // skip "\na=extmap:"
+        int id = atoi(p);
+        if (id >= 1 && id <= 14) {
+            used_ids |= (1u << id);
+        }
+    }
+    for (uint8_t id = 1; id <= 14; id++) {
+        if (!(used_ids & (1u << id))) {
+            return id;
+        }
+    }
+    return 0;
+}
+
+/// Parses the Opus payload type from an SDP string.
+static uint8_t sdp_parse_opus_payload_type(const char *sdp)
+{
+    const char *p = sdp;
+    while ((p = strstr(p, "a=rtpmap:")) != NULL) {
+        p += 9;
+        int pt = atoi(p);
+        const char *eol = strpbrk(p, "\r\n");
+        if (eol == NULL) eol = p + strlen(p);
+        const char *opus = strstr(p, " opus/");
+        if (opus != NULL && opus < eol && pt >= 0 && pt <= 127) {
+            return (uint8_t)pt;
+        }
+    }
+    return 0;
+}
+
+/// Injects the ssrc-audio-level extmap attribute into the audio section of an SDP string.
+/// Dynamically selects an unused extmap ID (1-14) to avoid conflicts.
+/// Sets *out_extmap_id to the chosen ID on success.
+/// Returns a newly allocated SDP string with the extmap line inserted, or NULL on failure.
+/// The caller must free the returned string.
+static char *sdp_inject_audio_level_extmap(const char *sdp, uint8_t *out_extmap_id)
+{
+    // Find "m=audio" section
+    const char *m_audio = strstr(sdp, "m=audio");
+    if (m_audio == NULL) {
+        return NULL;
+    }
+
+    // Determine audio section boundary (ends at next "m=" line or end of string)
+    const char *audio_section_end = strstr(m_audio + 1, "\nm=");
+    if (audio_section_end == NULL) {
+        audio_section_end = sdp + strlen(sdp);
+    }
+
+    // Find an unused extmap ID
+    uint8_t extmap_id = sdp_find_unused_extmap_id(m_audio, audio_section_end);
+    if (extmap_id == 0) {
+        return NULL;
+    }
+
+    // Find direction attribute as insertion point
+    const char *insert_before = NULL;
+    const char *p;
+    p = strstr(m_audio, "\na=sendrecv");
+    if (p != NULL && p < audio_section_end) {
+        insert_before = p + 1; // skip the leading \n
+    }
+    if (insert_before == NULL) {
+        p = strstr(m_audio, "\na=sendonly");
+        if (p != NULL && p < audio_section_end) {
+            insert_before = p + 1;
+        }
+    }
+    if (insert_before == NULL) {
+        p = strstr(m_audio, "\na=recvonly");
+        if (p != NULL && p < audio_section_end) {
+            insert_before = p + 1;
+        }
+    }
+    if (insert_before == NULL) {
+        return NULL;
+    }
+
+    // Build the extmap line
+    char extmap_line[80];
+    int extmap_len = snprintf(extmap_line, sizeof(extmap_line),
+        "a=extmap:%u %s\r\n", extmap_id, AUDIO_LEVEL_URI);
+
+    size_t orig_len = strlen(sdp);
+    char *new_sdp = (char *)malloc(orig_len + (size_t)extmap_len + 1);
+    if (new_sdp == NULL) {
+        return NULL;
+    }
+
+    // Copy everything before the insertion point
+    size_t prefix_len = (size_t)(insert_before - sdp);
+    memcpy(new_sdp, sdp, prefix_len);
+    // Insert the extmap line
+    memcpy(new_sdp + prefix_len, extmap_line, (size_t)extmap_len);
+    // Copy the rest (including null terminator)
+    memcpy(new_sdp + prefix_len + (size_t)extmap_len, insert_before, orig_len - prefix_len + 1);
+
+    *out_extmap_id = extmap_id;
+    return new_sdp;
+}
+
 typedef struct {
     peer_options_t options;
     esp_peer_role_t ice_role;
@@ -50,11 +166,94 @@ typedef struct {
     uint16_t reliable_stream_id;
     uint16_t lossy_stream_id;
 
+    uint8_t audio_level_extmap_id;  // Negotiated extmap ID for audio level extension
+    uint8_t opus_payload_type;      // Negotiated Opus payload type from SDP
+
 #if CONFIG_LK_BENCHMARK
     uint64_t start_time;
 #endif
 } peer_t;
 
+/// RTP transformer callback: compute encoded size (original + 8 bytes for extension block).
+static int audio_level_get_encoded_size(esp_peer_rtp_frame_t *frame, bool *in_place, void *ctx)
+{
+    peer_t *peer = (peer_t *)ctx;
+    // Skip if Opus PT not yet parsed or doesn't match this packet
+    if (peer == NULL || peer->opus_payload_type == 0 ||
+        frame->payload_type != peer->opus_payload_type) {
+        return ESP_PEER_ERR_NOT_SUPPORT;
+    }
+    // Validate minimum RTP packet size (12 bytes fixed header)
+    if (frame->orig_data == NULL || frame->orig_size < 12) {
+        return ESP_PEER_ERR_NOT_SUPPORT;
+    }
+    // Skip if packet already has header extensions (X bit set)
+    if (frame->orig_data[0] & 0x10) {
+        return ESP_PEER_ERR_NOT_SUPPORT;
+    }
+    // Validate CSRC count doesn't exceed packet size
+    uint8_t cc = frame->orig_data[0] & 0x0F;
+    if (frame->orig_size < (uint32_t)(12 + cc * 4)) {
+        return ESP_PEER_ERR_NOT_SUPPORT;
+    }
+    frame->encoded_size = frame->orig_size + RTP_EXT_BLOCK_SIZE;
+    *in_place = false;
+    return 0;
+}
+
+/// RTP transformer callback: inject audio level header extension into RTP packet.
+///
+/// Inserts an RFC 5285 one-byte header extension with RFC 6464 audio level data
+/// between the RTP fixed header and the payload.
+///
+/// Packet layout after transform:
+///   [RTP Header (X bit set)] [Extension Block (8 bytes)] [Payload]
+///
+static int audio_level_transform(esp_peer_rtp_frame_t *frame, void *ctx)
+{
+    peer_t *peer = (peer_t *)ctx;
+    uint8_t *orig = frame->orig_data;
+    uint8_t *enc = frame->encoded_data;
+    if (orig == NULL || enc == NULL) {
+        return ESP_PEER_ERR_NOT_SUPPORT;
+    }
+
+    uint8_t extmap_id = (peer != NULL) ? peer->audio_level_extmap_id : 1;
+
+    // Calculate RTP header length: 12 bytes fixed + 4 * CC (CSRC count)
+    uint8_t cc = orig[0] & 0x0F;
+    uint32_t header_len = 12u + (uint32_t)cc * 4u;
+
+    // Copy RTP header
+    memcpy(enc, orig, header_len);
+
+    // Set the X (extension) bit in the first byte
+    enc[0] |= 0x10;
+
+    // Build the extension block (8 bytes total, one 32-bit word of extension data)
+    uint8_t *ext = enc + header_len;
+    ext[0] = 0xBE;                             // RFC 5285 one-byte header profile
+    ext[1] = 0xDE;
+    ext[2] = 0x00;                             // Extension length: 1 word (32 bits)
+    ext[3] = 0x01;
+    ext[4] = (extmap_id << 4) | 0;             // ID=negotiated, L=0 (1 byte of data follows)
+    ext[5] = 0x80 | AUDIO_LEVEL_DEFAULT;       // V=1 (voice active), placeholder level
+    ext[6] = 0x00;                             // Padding
+    ext[7] = 0x00;                             // Padding
+
+    // Copy payload after the extension block
+    memcpy(enc + header_len + RTP_EXT_BLOCK_SIZE,
+           orig + header_len,
+           frame->orig_size - header_len);
+
+    return 0;
+}
+
+static esp_peer_rtp_transform_cb_t audio_level_transform_cb = {
+    .get_encoded_size = audio_level_get_encoded_size,
+    .transform = audio_level_transform
+};
+
 static esp_peer_media_dir_t get_media_direction(esp_peer_media_dir_t direction, peer_role_t role) {
     switch (role) {
         case PEER_ROLE_PUBLISHER:  return direction & ESP_PEER_MEDIA_DIR_SEND_ONLY;
@@ -148,12 +347,31 @@ static int on_msg(esp_peer_msg_t *info, void *ctx)
 {
     peer_t *peer = (peer_t *)ctx;
     switch (info->type) {
-        case ESP_PEER_MSG_TYPE_SDP:
-            ESP_LOGI(TAG(peer), "Generated %s:\n%s",
-                peer->options.role == PEER_ROLE_PUBLISHER ? "offer" : "answer",
-                (char *)info->data);
-            peer->options.on_sdp((char *)info->data, peer->options.role, peer->options.ctx);
+        case ESP_PEER_MSG_TYPE_SDP: {
+            const char *sdp = (const char *)info->data;
+
+            // For publisher SDP, inject audio level extmap for Active Speaker detection
+            if (peer->options.role == PEER_ROLE_PUBLISHER) {
+                uint8_t extmap_id = 0;
+                char *patched_sdp = sdp_inject_audio_level_extmap(sdp, &extmap_id);
+                if (patched_sdp != NULL) {
+                    peer->audio_level_extmap_id = extmap_id;
+                    peer->opus_payload_type = sdp_parse_opus_payload_type(patched_sdp);
+                    ESP_LOGD(TAG(peer), "Generated offer (with audio-level extmap id=%u):\n%s",
+                        extmap_id, patched_sdp);
+                    peer->options.on_sdp(patched_sdp, peer->options.role, peer->options.ctx);
+                    free(patched_sdp);
+                } else {
+                    ESP_LOGW(TAG(peer), "Failed to inject extmap, sending original SDP");
+                    ESP_LOGD(TAG(peer), "Generated offer:\n%s", sdp);
+                    peer->options.on_sdp(sdp, peer->options.role, peer->options.ctx);
+                }
+            } else {
+                ESP_LOGD(TAG(peer), "Generated answer:\n%s", sdp);
+                peer->options.on_sdp(sdp, peer->options.role, peer->options.ctx);
+            }
             break;
+        }
         default:
             ESP_LOGD(TAG(peer), "Unhandled msg type: %d", info->type);
             break;
@@ -329,6 +547,23 @@ peer_err_t peer_create(peer_handle_t *handle, peer_options_t *options)
         free(peer);
         return PEER_ERR_RTC;
     }
+
+    // Set RTP transformer for publisher to inject audio level header extension
+    if (options->role == PEER_ROLE_PUBLISHER &&
+        options->media->audio_info.codec != ESP_PEER_AUDIO_CODEC_NONE) {
+        int ret = esp_peer_set_rtp_transformer(
+            peer->connection,
+            ESP_PEER_RTP_TRANSFORM_ROLE_SENDER,
+            &audio_level_transform_cb,
+            peer
+        );
+        if (ret != ESP_PEER_ERR_NONE) {
+            ESP_LOGW(TAG(peer), "Failed to set audio level RTP transformer: %d", ret);
+        } else {
+            ESP_LOGI(TAG(peer), "Audio level RTP transformer enabled");
+        }
+    }
+
     *handle = (peer_handle_t)peer;
     return PEER_ERR_NONE;
 }