From 0bc400b0b6512c807efd5301a84e95cacd25c2e1 Mon Sep 17 00:00:00 2001 From: gokulmk-12 Date: Thu, 15 Jan 2026 17:04:04 +0530 Subject: [PATCH] Added new question 335 --- build/10.json | 2 +- build/186.json | 46 ++++++++++++ build/187.json | 38 ++++++++++ build/188.json | 62 ++++++++++++++++ build/335.json | 46 ++++++++++++ .../description.md | 3 + .../335_implement-sinkhorn-knopp/example.json | 5 ++ .../335_implement-sinkhorn-knopp/learn.md | 70 +++++++++++++++++++ .../335_implement-sinkhorn-knopp/meta.json | 15 ++++ .../335_implement-sinkhorn-knopp/solution.py | 23 ++++++ .../starter_code.py | 15 ++++ .../335_implement-sinkhorn-knopp/tests.json | 22 ++++++ 12 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 build/186.json create mode 100644 build/187.json create mode 100644 build/188.json create mode 100644 build/335.json create mode 100644 questions/335_implement-sinkhorn-knopp/description.md create mode 100644 questions/335_implement-sinkhorn-knopp/example.json create mode 100644 questions/335_implement-sinkhorn-knopp/learn.md create mode 100644 questions/335_implement-sinkhorn-knopp/meta.json create mode 100644 questions/335_implement-sinkhorn-knopp/solution.py create mode 100644 questions/335_implement-sinkhorn-knopp/starter_code.py create mode 100644 questions/335_implement-sinkhorn-knopp/tests.json diff --git a/build/10.json b/build/10.json index 09d55f7a..3b2c5801 100644 --- a/build/10.json +++ b/build/10.json @@ -21,7 +21,7 @@ "description": "Write a Python function to calculate the covariance matrix for a given set of vectors. The function should take a list of lists, where each inner list represents a feature with its observations, and return a covariance matrix as a list of lists. Additionally, provide test cases to verify the correctness of your implementation.", "learn_section": "## Understanding Covariance Matrix\n\nThe covariance matrix is a fundamental concept in statistics and machine learning, used to understand the relationship between multiple variables (features) in a dataset. It quantifies the degree to which two variables change together.\n\n### Key Concepts\n\n- **Covariance**: Measures the directional relationship between two random variables. A positive covariance indicates that the variables increase together, while a negative covariance indicates that one variable increases as the other decreases.\n- **Covariance Matrix**: For a dataset with $n$ features, the covariance matrix is an $n \\times n$ matrix where each element $(i, j)$ represents the covariance between the $i^{th}$ and $j^{th}$ features.\n\n### Covariance Formula\n\nThe covariance between two variables $X$ and $Y$ is calculated as:\n\n$$\n\\text{cov}(X, Y) = \\frac{\\sum_{k=1}^{m} (X_k - \\bar{X})(Y_k - \\bar{Y})}{m - 1}\n$$\n\nWhere:\n\n- $X_k$ and $Y_k$ are the individual observations of variables $X$ and $Y$.\n- $\\bar{X}$ and $\\bar{Y}$ are the means of $X$ and $Y$.\n- $m$ is the number of observations.\n\n### Constructing the Covariance Matrix\n\nGiven a dataset with $n$ features, the covariance matrix is constructed as follows:\n\n1. **Calculate the Mean**: Compute the mean of each feature.\n2. **Compute Covariance**: For each pair of features, calculate the covariance using the formula above.\n3. **Populate the Matrix**: Place the computed covariance values in the corresponding positions in the matrix. 
The diagonal elements represent the variance of each feature.\n\n$$\n\\text{Covariance Matrix} =\n\\begin{bmatrix}\n\\text{cov}(X_1, X_1) & \\text{cov}(X_1, X_2) & \\cdots & \\text{cov}(X_1, X_n) \\\\\n\\text{cov}(X_2, X_1) & \\text{cov}(X_2, X_2) & \\cdots & \\text{cov}(X_2, X_n) \\\\\n\\vdots & \\vdots & \\ddots & \\vdots \\\\\n\\text{cov}(X_n, X_1) & \\text{cov}(X_n, X_2) & \\cdots & \\text{cov}(X_n, X_n) \\\\\n\\end{bmatrix}\n$$\n\n### Example Calculation\n\nConsider the following dataset with two features:\n\n$$\n\\begin{align*}\n\\text{Feature 1} &: [1, 2, 3] \\\\\n\\text{Feature 2} &: [4, 5, 6]\n\\end{align*}\n$$\n\n1. **Calculate Means**:\n $$\n \\bar{X}_1 = \\frac{1 + 2 + 3}{3} = 2.0 \\\\\n \\bar{X}_2 = \\frac{4 + 5 + 6}{3} = 5.0\n $$\n\n2. **Compute Covariances**:\n $$\n \\text{cov}(X_1, X_1) = \\frac{(1-2)^2 + (2-2)^2 + (3-2)^2}{3-1} = 1.0 \\\\\n \\text{cov}(X_1, X_2) = \\frac{(1-2)(4-5) + (2-2)(5-5) + (3-2)(6-5)}{3-1} = 1.0 \\\\\n \\text{cov}(X_2, X_2) = \\frac{(4-5)^2 + (5-5)^2 + (6-5)^2}{3-1} = 1.0\n $$\n\n3. **Covariance Matrix**:\n $$\n \\begin{bmatrix}\n 1.0 & 1.0 \\\\\n 1.0 & 1.0 \n \\end{bmatrix}\n $$\n\n### Applications\n\nCovariance matrices are widely used in various fields, including:\n\n- **Principal Component Analysis (PCA)**: Reducing the dimensionality of datasets while preserving variance.\n- **Portfolio Optimization**: Understanding the variance and covariance between different financial assets.\n- **Multivariate Statistics**: Analyzing the relationships between multiple variables simultaneously.\n\nUnderstanding the covariance matrix is crucial for interpreting the relationships in multivariate data and for performing advanced statistical analyses.", "starter_code": "def calculate_covariance_matrix(vectors: list[list[float]]) -> list[list[float]]:\n\t# Your code here\n\treturn []", - "solution": "import numpy as np\n\ndef calculate_covariance_matrix(vectors: list[list[float]]) -> list[list[float]]:\n n_features = len(vectors)\n n_observations = len(vectors[0])\n covariance_matrix = [[0 for _ in range(n_features)] for _ in range(n_features)]\n\n means = [sum(feature) / n_observations for feature in vectors]\n\n for i in range(n_features):\n for j in range(i, n_features):\n covariance = sum((vectors[i][k] - means[i]) * (vectors[j][k] - means[j]) for k in range(n_observations)) / (n_observations - 1)\n covariance_matrix[i][j] = covariance_matrix[j][i] = covariance\n\n return covariance_matrix", + "solution": "import numpy as np\n\ndef calculate_covariance_matrix(vectors: list[list[float]]) -> list[list[float]]:\n n_observations = len(vectors)\n n_features = len(vectors[0])\n covariance_matrix = [[0 for _ in range(n_observations)] for _ in range(n_observations)]\n\n means = [sum(feature) / n_features for feature in vectors]\n\n for i in range(n_observations):\n for j in range(i, n_observations):\n covariance = sum((vectors[i][k] - means[i]) * (vectors[j][k] - means[j]) for k in range(n_features)) / (n_features - 1)\n covariance_matrix[i][j] = covariance_matrix[j][i] = covariance\n\n return covariance_matrix", "example": { "input": "[[1, 2, 3], [4, 5, 6]]", "output": "[[1.0, 1.0], [1.0, 1.0]]", diff --git a/build/186.json b/build/186.json new file mode 100644 index 00000000..1054782a --- /dev/null +++ b/build/186.json @@ -0,0 +1,46 @@ +{ + "id": "186", + "title": "Gaussian Process for Regression", + "difficulty": "medium", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": 
"https://github.com/Coder1010ayush", + "name": "Ayush" + } + ], + "description": "## Problem\n\nProblem Statement: Task is to implement GaussianProcessRegression class which is a guassian process model for prediction regression problems.", + "learn_section": "# **Gaussian Processes (GP): From-Scratch Regression Example**\n\n## **1. What’s a Gaussian Process?**\n\nA **Gaussian Process** defines a distribution over functions $f(\\cdot)$.\nFor any finite set of inputs $( X = {x_i}_{i=1}^n )$, the function values $f(X)$ follow a multivariate normal:\n\n$$\nf(X) \\sim \\mathcal{N}\\big(0,; K(X,X)\\big)\n$$\n\nwhere ( K ) is a **kernel** (covariance) function encoding similarity between inputs.\nWith noisy targets $( y = f(X) + \\varepsilon, \\varepsilon \\sim \\mathcal{N}(0,\\sigma_n^2 I) )$,\nGP regression yields a closed-form posterior predictive mean and variance at new points $( X_* )$.\n\n---\n\n## **2. The Implementation at a Glance**\n\nThe provided code builds a minimal yet complete GP regression stack:\n\n* **Kernels implemented**\n\n * Radial Basis Function (RBF / Squared Exponential)\n * Matérn $(( \\nu = 0.5, 1.5, 2.5 ), or general ( \\nu ))$\n * Periodic\n * Linear\n * Rational Quadratic\n\n* **Core GP classes**\n\n * `_GaussianProcessBase`: kernel selection & covariance matrix computation\n * `GaussianProcessRegression`:\n\n * `fit`: $builds ( K )$, does **Cholesky decomposition**, $solves ( \\alpha )$\n * `predict`: returns posterior mean & variance\n * `log_marginal_likelihood`: computes GP evidence\n * `optimize_hyperparameters`: basic optimizer (for RBF hyperparams)\n\n---\n\n## **3. Kernel Cheat-Sheet**\n\nLet $( x, x' \\in \\mathbb{R}^d ), ( r = \\lVert x - x' \\rVert )$.\n\n* **RBF (SE):**\n $$\n k_{\\text{RBF}}(x,x') = \\sigma^2 \\exp!\\left(-\\tfrac{1}{2}\\tfrac{r^2}{\\ell^2}\\right)\n $$\n\n* **Matérn (( \\nu = 1.5 )):**\n $$\n k(x,x') = \\Big(1 + \\tfrac{\\sqrt{3},r}{\\ell}\\Big)\\exp!\\Big(-\\tfrac{\\sqrt{3},r}{\\ell}\\Big)\n $$\n\n* **Periodic:**\n $$\n k(x,x') = \\sigma^2 \\exp!\\left(-\\tfrac{2}{\\ell^2}\\sin^2!\\Big(\\tfrac{\\pi r}{p}\\Big)\\right)\n $$\n\n* **Linear:**\n $$\n k(x,x') = \\sigma_b^2 + \\sigma_v^2,x^\\top x'\n $$\n\n* **Rational Quadratic:**\n $$\n k(x,x') = \\sigma^2\\Big(1 + \\tfrac{r^2}{2\\alpha \\ell^2}\\Big)^{-\\alpha}\n $$\n\n---\n\n## **4. GP Regression Mechanics**\n\n### **Training**\n\n1. Build covariance:\n $$\n K = K(X,X) + \\sigma_n^2 I\n $$\n\n2. Cholesky factorization:\n $$\n K = L L^\\top\n $$\n\n3. Solve ( \\alpha ):\n $$\n L L^\\top \\alpha = y\n $$\n\n### **Prediction**\n\nAt new inputs ( X_* ):\n\n* $( K_* = K(X, X_*) ), ( K_{**} = K(X_*, X_*) )$\n\n* **Mean:**\n $$\n \\mu_* = K_*^\\top \\alpha\n $$\n\n* **Covariance:**\n $$\n \\Sigma_* = K_{**} - V^\\top V, \\quad V = L^{-1} K_*\n $$\n\n### **Model Selection**\n\n* **Log Marginal Likelihood (LML):**\n $$\n \\log p(y \\mid X) = -\\tfrac{1}{2} y^\\top \\alpha - \\sum\\nolimits_i \\log L_{ii} - \\tfrac{n}{2}\\log(2\\pi)\n $$\n\n---\n\n## **5. Worked Example (Linear Kernel)**\n\n```python\nimport numpy as np\ngp = GaussianProcessRegression(kernel='linear',\n kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0},\n noise=1e-8)\n\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9]) # y = 2x + 1\ngp.fit(X_train, y_train)\n\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\") # -> 7.0000\n```\n\n---\n\n## **6. 
When to Use GP Regression**\n\n* **Small-to-medium datasets** where uncertainty estimates are valuable\n* Cases requiring **predictive intervals** (not just point predictions)\n* **Nonparametric modeling** with kernel priors\n* Automatic hyperparameter tuning via **marginal likelihood**\n\n---\n\n## **7. Practical Tips**\n\n* Always add **jitter** $10^{-6}$ to the diagonal for numerical stability\n* **Standardize inputs/outputs** before training\n* Be aware: Exact GP has complexity **$\\mathcal{O}(n^3)$** in time and **$\\mathcal{O}(n^2)$** in memory\n* Choose kernels to match problem structure:\n\n * **RBF:** smooth functions\n * **Matérn:** rougher functions\n * **Periodic:** seasonal/cyclical data\n * **Linear:** global linear trends", + "starter_code": "import math # ---------------------------------------- utf-8 encoding ---------------------------------\n\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\n\n\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n pass\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n pass\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0\n):\n pass\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n pass\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n pass\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n pass\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n pass\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n pass\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n pass\n\n def predict(self, X_test, return_std=False):\n pass\n\n def log_marginal_likelihood(self):\n pass\n\n def optimize_hyperparameters(self):\n pass", + "solution": "# ---------------------------------------- utf-8 encoding ---------------------------------\n# This file contains Gaussian Process implementation.\nimport numpy as np\nimport math\nfrom scipy.spatial.distance import euclidean\nfrom scipy.special import kv as bessel_kv\nfrom scipy.special import gamma\nfrom scipy.linalg import cholesky, solve_triangular\nfrom scipy.optimize import minimize\nfrom scipy.special import expit, softmax\n\n\n# --- KERNEL FUNCTIONS --------------------------------------------------------\ndef matern_kernel(x: np.ndarray, x_prime: np.ndarray, length_scale=1.0, nu=1.5):\n d = euclidean(x, x_prime)\n if d == 0:\n return 1.0 # Covariance with self is 1 before scaling\n if nu == 0.5:\n return np.exp(-d / length_scale)\n elif nu == 1.5:\n return (1 + np.sqrt(3) * d / length_scale) * np.exp(\n -np.sqrt(3) * d / length_scale\n )\n elif nu == 2.5:\n return (\n 1 + np.sqrt(5) * d / length_scale + 5 * d**2 / (3 * length_scale**2)\n ) * np.exp(-np.sqrt(5) * d / length_scale)\n else:\n factor = (2 ** (1 - nu)) / gamma(nu)\n scaled_d = np.sqrt(2 * nu) * d / length_scale\n return factor * (scaled_d**nu) * bessel_kv(nu, scaled_d)\n\n\ndef rbf_kernel(x: np.ndarray, x_prime, sigma=1.0, length_scale=1.0):\n # 
This is a squared exponential kernel\n\n # Calculate the squared euclidean distance\n sq_norm = np.linalg.norm(x - x_prime) ** 2\n\n # Correctly implement the formula\n return sigma**2 * np.exp(-sq_norm / (2 * length_scale**2))\n\n\ndef periodic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, period=1.0\n):\n return sigma**2 * np.exp(\n -2 * np.sin(np.pi * np.linalg.norm(x - x_prime) / period) ** 2 / length_scale**2\n )\n\n\ndef linear_kernel(x: np.ndarray, x_prime: np.ndarray, sigma_b=1.0, sigma_v=1.0):\n return sigma_b**2 + sigma_v**2 * np.dot(x, x_prime)\n\n\ndef rational_quadratic_kernel(\n x: np.ndarray, x_prime: np.ndarray, sigma=1.0, length_scale=1.0, alpha=1.0\n):\n return sigma**2 * (\n 1 + np.linalg.norm(x - x_prime) ** 2 / (2 * alpha * length_scale**2)\n ) ** (-alpha)\n\n\n# --- BASE CLASS -------------------------------------------------------------\n\n\nclass _GaussianProcessBase:\n def __init__(self, kernel=\"rbf\", noise=1e-5, kernel_params=None):\n self.kernel_name = kernel\n self.noise = noise\n self.kernel_params = kernel_params if kernel_params else {}\n self.X_train = None\n self.y_train = None\n self.K = None\n\n def _select_kernel(self, x1, x2):\n \"\"\"Selects and computes the kernel value for two single data points.\"\"\"\n if self.kernel_name == \"rbf\":\n return rbf_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"matern\":\n return matern_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"periodic\":\n return periodic_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"linear\":\n return linear_kernel(x1, x2, **self.kernel_params)\n elif self.kernel_name == \"rational_quadratic\":\n return rational_quadratic_kernel(x1, x2, **self.kernel_params)\n else:\n raise ValueError(\n \"Unsupported kernel. 
Choose from ['rbf', 'matern', 'periodic', 'linear', 'rational_quadratic'].\"\n )\n\n def _compute_covariance(self, X1, X2):\n \"\"\"\n Computes the covariance matrix between two sets of points.\n This method fixes the vectorization bug from the original code.\n \"\"\"\n # Ensuring X1 and X2 are 2D arrays\n X1 = np.atleast_2d(X1)\n X2 = np.atleast_2d(X2)\n\n n1, _ = X1.shape\n n2, _ = X2.shape\n K = np.zeros((n1, n2))\n for i in range(n1):\n for j in range(n2):\n K[i, j] = self._select_kernel(X1[i], X2[j])\n return K\n\n\n# --- REGRESSION MODEL -------------------------------------------------------\nclass GaussianProcessRegression(_GaussianProcessBase):\n def fit(self, X, y):\n self.X_train = np.asarray(X)\n self.y_train = np.asarray(y)\n self.K = self._compute_covariance(\n self.X_train, self.X_train\n ) + self.noise * np.eye(len(self.X_train))\n\n # Compute Cholesky decomposition for stable inversion\n self.L = cholesky(self.K, lower=True)\n # alpha = K_inv * y\n self.alpha = solve_triangular(\n self.L.T, solve_triangular(self.L, self.y_train, lower=True)\n )\n\n def predict(self, X_test, return_std=False):\n X_test = np.atleast_2d(X_test)\n K_s = self._compute_covariance(self.X_train, X_test)\n K_ss = self._compute_covariance(X_test, X_test)\n\n # Compute predictive mean\n mu = K_s.T @ self.alpha\n\n # Compute predictive variance\n v = solve_triangular(self.L, K_s, lower=True)\n cov = K_ss - v.T @ v\n\n if return_std:\n return mu, np.sqrt(np.diag(cov))\n return mu\n\n def log_marginal_likelihood(self):\n return (\n -0.5 * (self.y_train.T @ self.alpha)\n - np.sum(np.log(np.diag(self.L)))\n - len(self.X_train) / 2 * np.log(2 * np.pi)\n )\n\n def optimize_hyperparameters(self):\n # NOTE: This is a simplified optimizer for 'rbf' kernel's params.\n def objective(params):\n self.kernel_params = {\n \"length_scale\": np.exp(params[0]),\n \"sigma\": np.exp(params[1]),\n }\n self.fit(self.X_train, self.y_train)\n return -self.log_marginal_likelihood()\n\n init_params = np.log(\n [\n self.kernel_params.get(\"length_scale\", 1.0),\n self.kernel_params.get(\"sigma\", 1.0),\n ]\n )\n res = minimize(\n objective, init_params, method=\"L-BFGS-B\", bounds=[(-5, 5), (-5, 5)]\n )\n\n self.kernel_params = {\n \"length_scale\": np.exp(res.x[0]),\n \"sigma\": np.exp(res.x[1]),\n }\n # Re-fit with optimal hyperparameters\n self.fit(self.X_train, self.y_train)\n\n\nif __name__ == \"__main__\":\n gp = GaussianProcessRegression(\n kernel=\"linear\", kernel_params={\"sigma_b\": 0.0, \"sigma_v\": 1.0}, noise=1e-8\n )\n X_train = np.array([[1], [2], [4]])\n y_train = np.array([3, 5, 9])\n gp.fit(X_train, y_train)\n X_test = np.array([[3.0]])\n mu = gp.predict(X_test)", + "example": { + "input": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.0, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "output": "7.0000", + "reasoning": "A Gaussian Process with a linear kernel is trained on perfectly linear data that follows the function y = 2x + 1. When asked to predict the value at x=3, the model perfectly interpolates the linear function it has learned, resulting in a prediction of 2*3 + 1 = 7. The near-zero noise ensures the prediction is exact." 
+ }, + "test_cases": [ + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "0.2814" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0], [7.5], [10.0]])\ny_train = np.sin(X_train).ravel()\ngp.fit(X_train, y_train)\nX_test = np.array([[1.25]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=0.2814, std=0.7734" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.0}, noise=1e-8)\nX_train = np.array([[0], [2.5], [5.0]])\ny_train = np.array([1.0, 3.0, 1.5])\ngp.fit(X_train, y_train)\nX_test = np.array([[2.5]])\nmu, std = gp.predict(X_test, return_std=True)\nprint(f\"mu={mu[0]:.4f}, std={std[0]:.4f}\")", + "expected_output": "mu=3.0000, std=0.0001" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='linear', kernel_params={'sigma_b': 0.1, 'sigma_v': 1.0}, noise=1e-8)\nX_train = np.array([[1], [2], [4]])\ny_train = np.array([3, 5, 9])\ngp.fit(X_train, y_train)\nX_test = np.array([[3.0]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "7.0000" + }, + { + "test": "import numpy as np\ngp = GaussianProcessRegression(kernel='rbf', kernel_params={'sigma': 1.0, 'length_scale': 1.5}, noise=1e-8)\nX_train = np.array([[1, 2], [3, 4], [5, 1]])\ny_train = np.sum(X_train, axis=1)\ngp.fit(X_train, y_train)\nX_test = np.array([[2, 3]])\nmu = gp.predict(X_test)\nprint(f\"{mu[0]:.4f}\")", + "expected_output": "5.5553" + } + ] +} \ No newline at end of file diff --git a/build/187.json b/build/187.json new file mode 100644 index 00000000..96740dc5 --- /dev/null +++ b/build/187.json @@ -0,0 +1,38 @@ +{ + "id": "187", + "title": "Build a Simple ETL Pipeline (MLOps)", + "difficulty": "medium", + "category": "MLOps", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/Jeet009", + "name": "Jeet Mukherjee" + } + ], + "description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. 
Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.", + "learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting unique users U costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.", + "starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError", + "solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Reference ETL implementation.\n\n\t- Extract: parse CSV text, skip header, strip whitespace, ignore blanks\n\t- Transform: keep event_type == \"purchase\"; parse value as float; aggregate per user\n\t- Load: return sorted list of (user_id, total_value) by user_id asc\n\t\"\"\"\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\t# header\n\theader = lines[0]\n\trows = lines[1:]\n\n\t# indices from header (allow varying order and case)\n\theaders = [h.strip().lower() for h in header.split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\t# header missing required columns\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in rows:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\tevent_type = parts[idx_event].lower()\n\t\tif event_type != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])", + "example": { + "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")", + "output": "[('u1', 15.0), ('u2', 3.5)]", + "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id." 
+ }, + "test_cases": [ + { + "test": "print(run_etl('user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n'))", + "expected_output": "[('u1', 15.0), ('u2', 3.5)]" + }, + { + "test": "print(run_etl('user_id,event_type,value'))", + "expected_output": "[]" + }, + { + "test": "print(run_etl('value,event_type,user_id\\n 1.0, purchase, u1\\n 2.0, purchase, u1\\n'))", + "expected_output": "[('u1', 3.0)]" + } + ] +} \ No newline at end of file diff --git a/build/188.json b/build/188.json new file mode 100644 index 00000000..a6199500 --- /dev/null +++ b/build/188.json @@ -0,0 +1,62 @@ +{ + "id": "188", + "title": "Gradient Checkpointing", + "difficulty": "easy", + "category": "Machine Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/komaksym", + "name": "komaksym" + } + ], + "description": "## Problem\n\nWrite a Python function `checkpoint_forward` that takes a list of numpy functions (each representing a layer or operation) and an input numpy array, and returns the final output by applying each function in sequence. To simulate gradient checkpointing, the function should not store intermediate activations; instead, it should recompute them as needed (for this problem, just apply the functions in sequence as usual). Only use standard Python and numpy. The returned array should be of type float and have the same shape as the output of the last function.", + "learn_section": "# **Gradient Checkpointing**\n\n## **1. Definition**\nGradient checkpointing is a technique used in deep learning to reduce memory usage during training by selectively storing only a subset of intermediate activations (checkpoints) and recomputing the others as needed during the backward pass. This allows training of larger models or using larger batch sizes without exceeding memory limits.\n\n## **2. Why Use Gradient Checkpointing?**\n* **Reduce Memory Usage:** By storing fewer activations, memory requirements are reduced, enabling training of deeper or larger models.\n* **Enable Larger Batches/Models:** Makes it possible to fit larger models or use larger batch sizes on limited hardware.\n* **Tradeoff:** The main tradeoff is increased computation time, as some activations must be recomputed during the backward pass.\n\n## **3. Gradient Checkpointing Mechanism**\nSuppose a model consists of $N$ layers, each represented by a function $f_i$. Normally, the forward pass stores all intermediate activations:\n\n$$\nA_0 = x \\\\\nA_1 = f_1(A_0) \\\\\nA_2 = f_2(A_1) \\\\\n\\ldots \\\\\nA_N = f_N(A_{N-1})\n$$\n\nWith gradient checkpointing, only a subset of $A_i$ are stored (the checkpoints). The others are recomputed as needed during backpropagation. In the simplest case, you can store only the input and output, and recompute all intermediates when needed.\n\n**Example:**\nIf you have three functions $f_1, f_2, f_3$ and input $x$:\n* Forward: $A_1 = f_1(x)$, $A_2 = f_2(A_1)$, $A_3 = f_3(A_2)$\n* With checkpointing, you might only store $x$ and $A_3$, and recompute $A_1$ and $A_2$ as needed.\n\n## **4. 
Applications of Gradient Checkpointing**\nGradient checkpointing is widely used in training:\n* **Very Deep Neural Networks:** Transformers, ResNets, and other architectures with many layers.\n* **Large-Scale Models:** Language models, vision models, and more.\n* **Memory-Constrained Environments:** When hardware cannot fit all activations in memory.\n* **Any optimization problem** where memory is a bottleneck during training.\n\nGradient checkpointing is a powerful tool to enable training of large models on limited hardware, at the cost of extra computation.",
+    "starter_code": "import numpy as np\n\n# Implement your function below.\ndef checkpoint_forward(funcs, input_arr):\n    \"\"\"\n    Applies a list of functions in sequence to the input array, simulating gradient checkpointing by not storing intermediates.\n\n    Args:\n        funcs (list of callables): List of functions to apply in sequence.\n        input_arr (np.ndarray): Input numpy array.\n\n    Returns:\n        np.ndarray: The output after applying all functions, same shape as output of last function.\n    \"\"\"\n    pass",
+    "solution": "import numpy as np\n\ndef checkpoint_forward(funcs, input_arr):\n    \"\"\"\n    Applies a list of functions in sequence to the input array, simulating gradient checkpointing by not storing intermediates.\n\n    Args:\n        funcs (list of callables): List of functions to apply in sequence.\n        input_arr (np.ndarray): Input numpy array.\n\n    Returns:\n        np.ndarray: The output after applying all functions, same shape as output of last function.\n    \"\"\"\n    x = input_arr\n    for f in funcs:\n        x = f(x)\n    return x.astype(float)",
+    "example": {
+        "input": "import numpy as np\ndef f1(x): return x + 1\ndef f2(x): return x * 2\ndef f3(x): return x - 3\nfuncs = [f1, f2, f3]\ninput_arr = np.array([1.0, 2.0])\noutput = checkpoint_forward(funcs, input_arr)\nprint(output)",
+        "output": "[1. 3.]",
+        "reasoning": "The input [1.0, 2.0] is passed through f1: [2.0, 3.0], then f2: [4.0, 6.0], then f3: [1.0, 3.0], so the final output is [1. 3.]."
+    },
+    "test_cases": [
+        {
+            "test": "import numpy as np\ndef f1(x): return x + 1\ndef f2(x): return x * 2\ndef f3(x): return x - 3\nfuncs = [f1, f2, f3]\ninput_arr = np.array([1.0, 2.0])\nprint(checkpoint_forward(funcs, input_arr))",
+            "expected_output": "[1. 3.]"
+        },
+        {
+            "test": "import numpy as np\ndef f1(x): return x * 0\ndef f2(x): return x + 10\nfuncs = [f1, f2]\ninput_arr = np.array([5.0, 7.0])\nprint(checkpoint_forward(funcs, input_arr))",
+            "expected_output": "[10. 10.]"
+        },
+        {
+            "test": "import numpy as np\ndef f1(x): return x / 2\ndef f2(x): return x ** 2\nfuncs = [f1, f2]\ninput_arr = np.array([4.0, 8.0])\nprint(checkpoint_forward(funcs, input_arr))",
+            "expected_output": "[ 4. 16.]"
+        },
+        {
+            "test": "import numpy as np\ndef f1(x): return x - 1\nfuncs = [f1]\ninput_arr = np.array([10.0, 20.0])\nprint(checkpoint_forward(funcs, input_arr))",
+            "expected_output": "[ 9. 19.]"
+        },
+        {
+            "test": "import numpy as np\nfuncs = []\ninput_arr = np.array([1.0, 2.0])\nprint(checkpoint_forward(funcs, input_arr))",
+            "expected_output": "[1. 2.]"
+        }
+    ],
+    "tinygrad_starter_code": "def your_function(...):\n    pass",
+    "tinygrad_solution": "def your_function(...):\n    ...",
+    "tinygrad_test_cases": [
+        {
+            "test": "print(your_function(...))",
+            "expected_output": "..."
+        }
+    ],
+    "pytorch_starter_code": "def your_function(...):\n    pass",
+    "pytorch_solution": "def your_function(...):\n    ...",
+    "pytorch_test_cases": [
+        {
+            "test": "print(your_function(...))",
+            "expected_output": "..."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/build/335.json b/build/335.json
new file mode 100644
index 00000000..1cdc3977
--- /dev/null
+++ b/build/335.json
@@ -0,0 +1,46 @@
+{
+    "id": "335",
+    "title": "Constraining Hyper-Connection Matrices using Sinkhorn–Knopp Algorithm",
+    "difficulty": "easy",
+    "category": "Deep Learning",
+    "video": "",
+    "likes": "0",
+    "dislikes": "0",
+    "contributor": [
+        {
+            "profile_link": "https://github.com/gokulmk-12",
+            "name": "gokulmk12"
+        }
+    ],
+    "description": "In deep neural networks with hyper-connections, residual mixing matrices must be carefully constrained to avoid signal explosion or vanishing. One effective way to enforce stability is to normalize a matrix into a doubly stochastic matrix, where each row and each column sums to one. Such matrices lie in the **Birkhoff polytope** and preserve signal magnitude across layers.\n\nYour task is to implement the **Sinkhorn–Knopp** algorithm, an iterative procedure that alternates between row-wise and column-wise normalization to transform a given matrix into a doubly stochastic matrix. The input is a NxN unconstrained matrix and an optional variable iterations. Return the doubly stochastic matrix, each element rounded off to **4-decimal** places. ",
+    "learn_section": "\n## Understanding the Sinkhorn–Knopp Algorithm for Manifold-Constrained Hyper-Connections\n\n### Introduction\n\nResidual connections stabilize deep neural networks by preserving identity mappings across layers. Hyper-connections extend this idea by allowing multiple residual streams to be mixed using a learnable matrix:\n\n- Residual update:\n  $$\n  h_{l+1} = h_l + f(h_l)\n  $$\n- Hyper-connections update:\n  $$\n  h_{l+1} = H_{\\text{res}} \\cdot h_l + f(h_l)\n  $$\n\nWhile hyper-connections increase expressivity, an unconstrained mixing matrix $ H_{\\text{res}} $ can cause signal explosion or vanishing when applied repeatedly across many layers.\n\n### Manifold-Constrained Hyper-Connections (mHC)\n\nTo ensure stable signal propagation, manifold-constrained hyper-connections restrict the residual mixing matrix to lie in the Birkhoff polytope, the set of doubly stochastic matrices:\n\n- Row sums equal to 1:\n  $$ \n  \\sum_j H_{ij} = 1 \n  $$\n- Column sums equal to 1:\n  $$ \n  \\sum_i H_{ij} = 1 \n  $$\n- Non-negativity:\n  $$ \n  H_{ij} \\ge 0 \n  $$\n\nThese constraints ensure that residual streams are redistributed rather than amplified, preserving signal norms while enabling rich feature mixing.\n\n### Sinkhorn–Knopp Algorithm\n\nThe Sinkhorn–Knopp algorithm transforms a matrix into a doubly stochastic matrix by alternately normalizing its rows and columns. Since the input matrix may contain negative values, it is first mapped to a non-negative matrix using an element-wise exponential:\n  $$ \n  H^{(0)} = \\exp(H_{\\text{res}}) \n  $$\n\nThe algorithm then performs the following steps iteratively:\n\n1. **Row normalization**\n  $$ \n  H'_{ij} = \\dfrac{H^{(k)}_{ij}}{\\sum_j H^{(k)}_{ij}} \n  $$\n2. 
**Column normalization**\n  $$ \n  H^{(k+1)}_{ij} = \\dfrac{H'_{ij}}{\\sum_i H'_{ij}} \n  $$\n\nAfter sufficient iterations, the matrix converges to a doubly stochastic matrix.\n\n### Why Doubly Stochastic?\n\n - Row sums = 1 → prevents signal amplification\n - Column sums = 1 → prevents information loss\n\nTogether, these properties restore the norm-preserving behavior of residual connections, while retaining the expressive power of hyper-connections.\n\n### Key Takeaways\n - Hyper-connections improve expressivity but can destabilize deep networks.\n - Manifold constraints enforce stable, balanced residual mixing.\n - The Sinkhorn–Knopp algorithm efficiently projects matrices onto the Birkhoff polytope. \n - Exponentiation allows handling unconstrained (including negative) matrices.\n ",
+    "starter_code": "import numpy as np\n\ndef sinkhorn_knopp(H_res: np.ndarray, iterations: int = 20) -> np.ndarray:\n    \"\"\"\n    Apply the Sinkhorn–Knopp algorithm to the mixing matrix\n\n    Parameters: \n        H_res: A 2D (NxN) unconstrained residual mixing matrix.\n        iterations: Number of Sinkhorn normalization iterations to perform.\n\n    Returns\n        Doubly stochastic matrix in the Birkhoff Polytope\n    \"\"\"\n    # your code here\n    pass",
+    "solution": "import numpy as np\n\ndef sinkhorn_knopp(H_res: np.ndarray, iterations: int = 20) -> np.ndarray:\n    \"\"\"\n    Apply the Sinkhorn–Knopp algorithm to the mixing matrix\n\n    Parameters: \n        H_res: A 2D (NxN) unconstrained residual mixing matrix.\n        iterations: Number of Sinkhorn normalization iterations to perform.\n\n    Returns\n        Doubly stochastic matrix in the Birkhoff Polytope\n    \"\"\"\n    # Make all elements positive\n    H = np.exp(H_res)\n\n    # Sinkhorn-Knopp Iteration \n    for _ in range(iterations):\n        # Normalize Columns\n        H = H / H.sum(axis=0, keepdims=True)\n        # Normalize rows\n        H = H / H.sum(axis=1, keepdims=True)\n    return H",
+    "example": {
+        "input": "H_res = np.array([[1.0, 2.0], [3.0, 4.0]])\n result = sinkhorn_knopp(H_res, iterations=20)\n print(np.round(result, 4))",
+        "output": "[[0.5 0.5]\n [0.5 0.5]]",
+        "reasoning": "The function should initially make sure the elements are positive. Then in a loop till iterations, normalize the elements along the row using the row sum and normalize the elements along the column using the column sum."
+ }, + "test_cases": [ + { + "test": "H_res = np.array([[1.0, 2.0], [3.0, 4.0]]); print(np.round(sinkhorn_knopp(H_res, iterations=20), 4))", + "expected_output": "[[0.5, 0.5], [0.5, 0.5]]" + }, + { + "test": "H_res = np.eye(4); print(np.round(sinkhorn_knopp(H_res, iterations=20), 4))", + "expected_output": "[[0.4754, 0.1749, 0.1749, 0.1749], [0.1749, 0.4754, 0.1749, 0.1749], [0.1749, 0.1749, 0.4754, 0.1749], [0.1749, 0.1749, 0.1749, 0.4754]]" + }, + { + "test": "H_res = np.zeros(shape=(6,6)); print(np.round(sinkhorn_knopp(H_res, iterations=50), 4))", + "expected_output": "[[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]]" + }, + { + "test": "np.random.seed(42); H_res = np.random.randn(5, 5); print(np.round(sinkhorn_knopp(H_res, iterations=10), 4))", + "expected_output": "[[0.104, 0.0741, 0.1077, 0.5699, 0.1444], [0.0513, 0.4229, 0.1244, 0.0796, 0.3218], [0.1844, 0.2474, 0.3326, 0.085, 0.1507], [0.1511, 0.1294, 0.3233, 0.21, 0.1863], [0.5093, 0.1262, 0.1121, 0.0556, 0.1969]]" + }, + { + "test": "H_res = np.array([[0.5]]); print(np.round(sinkhorn_knopp(H_res, iterations=15), 4))", + "expected_output": "[[1.]]" + } + ] +} \ No newline at end of file diff --git a/questions/335_implement-sinkhorn-knopp/description.md b/questions/335_implement-sinkhorn-knopp/description.md new file mode 100644 index 00000000..5e226130 --- /dev/null +++ b/questions/335_implement-sinkhorn-knopp/description.md @@ -0,0 +1,3 @@ +In deep neural networks with hyper-connections, residual mixing matrices must be carefully constrained to avoid signal explosion or vanishing. One effective way to enforce stability is to normalize a matrix into a doubly stochastic matrix, where each row and each column sums to one. Such matrices lie in the **Birkhoff polytope** and preserve signal magnitude across layers. + +Your task is to implement the **Sinkhorn–Knopp** algorithm, an iterative procedure that alternates between row-wise and column-wise normalization to transform a given matrix into a doubly stochastic matrix. The input is a NxN unconstrained matrix and an optional variable iterations. Return the doubly stochastic matrix, each element rounded off to **4-decimal** places. diff --git a/questions/335_implement-sinkhorn-knopp/example.json b/questions/335_implement-sinkhorn-knopp/example.json new file mode 100644 index 00000000..fba6d3b5 --- /dev/null +++ b/questions/335_implement-sinkhorn-knopp/example.json @@ -0,0 +1,5 @@ +{ + "input": "H_res = np.array([[1.0, 2.0], [3.0, 4.0]])\n result = sinkhorn_knopp(H_res, iterations=20)\n print(np.round(result, 4))", + "output": "[[0.5 0.5]\n [0.5 0.5]]", + "reasoning": "The function should initially make sure the elements are positive. Then in a loop till iterations, normalize the elements along the row using the row sum and normalize the elements along the column using the column sum." +} diff --git a/questions/335_implement-sinkhorn-knopp/learn.md b/questions/335_implement-sinkhorn-knopp/learn.md new file mode 100644 index 00000000..16bc03d9 --- /dev/null +++ b/questions/335_implement-sinkhorn-knopp/learn.md @@ -0,0 +1,70 @@ + +## Understanding the Sinkhorn–Knopp Algorithm for Manifold-Constrained Hyper-Connections + +### Introduction + +Residual connections stabilize deep neural networks by preserving identity mappings across layers. 
Hyper-connections extend this idea by allowing multiple residual streams to be mixed using a learnable matrix:
+
+- Residual update:
+  $$
+  h_{l+1} = h_l + f(h_l)
+  $$
+- Hyper-connections update:
+  $$
+  h_{l+1} = H_{\text{res}} \cdot h_l + f(h_l)
+  $$
+
+While hyper-connections increase expressivity, an unconstrained mixing matrix $ H_{\text{res}} $ can cause signal explosion or vanishing when applied repeatedly across many layers.
+
+### Manifold-Constrained Hyper-Connections (mHC)
+
+To ensure stable signal propagation, manifold-constrained hyper-connections restrict the residual mixing matrix to lie in the Birkhoff polytope, the set of doubly stochastic matrices:
+
+- Row sums equal to 1:
+  $$ 
+  \sum_j H_{ij} = 1 
+  $$
+- Column sums equal to 1:
+  $$ 
+  \sum_i H_{ij} = 1 
+  $$
+- Non-negativity:
+  $$ 
+  H_{ij} \ge 0 
+  $$
+
+These constraints ensure that residual streams are redistributed rather than amplified, preserving signal norms while enabling rich feature mixing.
+
+### Sinkhorn–Knopp Algorithm
+
+The Sinkhorn–Knopp algorithm transforms a matrix into a doubly stochastic matrix by alternately normalizing its rows and columns. Since the input matrix may contain negative values, it is first mapped to a non-negative matrix using an element-wise exponential:
+  $$ 
+  H^{(0)} = \exp(H_{\text{res}}) 
+  $$
+
+The algorithm then performs the following steps iteratively:
+
+1. **Row normalization**
+  $$ 
+  H'_{ij} = \dfrac{H^{(k)}_{ij}}{\sum_j H^{(k)}_{ij}} 
+  $$
+2. **Column normalization**
+  $$ 
+  H^{(k+1)}_{ij} = \dfrac{H'_{ij}}{\sum_i H'_{ij}} 
+  $$
+
+After sufficient iterations, the matrix converges to a doubly stochastic matrix.
+
+### Why Doubly Stochastic?
+
+ - Row sums = 1 → prevents signal amplification
+ - Column sums = 1 → prevents information loss
+
+Together, these properties restore the norm-preserving behavior of residual connections, while retaining the expressive power of hyper-connections.
+
+### Key Takeaways
+ - Hyper-connections improve expressivity but can destabilize deep networks.
+ - Manifold constraints enforce stable, balanced residual mixing.
+ - The Sinkhorn–Knopp algorithm efficiently projects matrices onto the Birkhoff polytope. 
+ - Exponentiation allows handling unconstrained (including negative) matrices.
+ 
\ No newline at end of file
diff --git a/questions/335_implement-sinkhorn-knopp/meta.json b/questions/335_implement-sinkhorn-knopp/meta.json
new file mode 100644
index 00000000..7c218fa5
--- /dev/null
+++ b/questions/335_implement-sinkhorn-knopp/meta.json
@@ -0,0 +1,15 @@
+{
+    "id": "335",
+    "title": "Constraining Hyper-Connection Matrices using Sinkhorn–Knopp Algorithm",
+    "difficulty": "easy",
+    "category": "Deep Learning",
+    "video": "",
+    "likes": "0",
+    "dislikes": "0",
+    "contributor": [
+        {
+            "profile_link": "https://github.com/gokulmk-12",
+            "name": "gokulmk12"
+        }
+    ]
+}
diff --git a/questions/335_implement-sinkhorn-knopp/solution.py b/questions/335_implement-sinkhorn-knopp/solution.py
new file mode 100644
index 00000000..2807cc8f
--- /dev/null
+++ b/questions/335_implement-sinkhorn-knopp/solution.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+def sinkhorn_knopp(H_res: np.ndarray, iterations: int = 20) -> np.ndarray:
+    """
+    Apply the Sinkhorn–Knopp algorithm to the mixing matrix
+
+    Parameters: 
+        H_res: A 2D (NxN) unconstrained residual mixing matrix.
+        iterations: Number of Sinkhorn normalization iterations to perform. 
+ + Returns + Doubly stochastic matrix in the Birkhoff Polytope + """ + # Make all elements positive + H = np.exp(H_res) + + # Sinkhorn-Knopp Iteration + for _ in range(iterations): + # Normalize Columns + H = H / H.sum(axis=0, keepdims=True) + # Normalize rows + H = H / H.sum(axis=1, keepdims=True) + return H \ No newline at end of file diff --git a/questions/335_implement-sinkhorn-knopp/starter_code.py b/questions/335_implement-sinkhorn-knopp/starter_code.py new file mode 100644 index 00000000..b4c018e6 --- /dev/null +++ b/questions/335_implement-sinkhorn-knopp/starter_code.py @@ -0,0 +1,15 @@ +import numpy as np + +def sinkhorn_knopp(H_res: np.ndarray, iterations: int = 20) -> np.ndarray: + """ + Apply the Sinkhorn–Knopp algorithm to the mixing matrix + + Parameters: + H_res: A 2D (NxN) unconstrained residual mixing matrix. + iterations: Number of Sinkhorn normalization iterations to perform. + + Returns + Doubly stochastic matrix in the Birkhoff Polytope + """ + # your code here + pass \ No newline at end of file diff --git a/questions/335_implement-sinkhorn-knopp/tests.json b/questions/335_implement-sinkhorn-knopp/tests.json new file mode 100644 index 00000000..e25f3063 --- /dev/null +++ b/questions/335_implement-sinkhorn-knopp/tests.json @@ -0,0 +1,22 @@ +[ + { + "test": "H_res = np.array([[1.0, 2.0], [3.0, 4.0]]); print(np.round(sinkhorn_knopp(H_res, iterations=20), 4))", + "expected_output": "[[0.5, 0.5], [0.5, 0.5]]" + }, + { + "test": "H_res = np.eye(4); print(np.round(sinkhorn_knopp(H_res, iterations=20), 4))", + "expected_output": "[[0.4754, 0.1749, 0.1749, 0.1749], [0.1749, 0.4754, 0.1749, 0.1749], [0.1749, 0.1749, 0.4754, 0.1749], [0.1749, 0.1749, 0.1749, 0.4754]]" + }, + { + "test": "H_res = np.zeros(shape=(6,6)); print(np.round(sinkhorn_knopp(H_res, iterations=50), 4))", + "expected_output": "[[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667], [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]]" + }, + { + "test": "np.random.seed(42); H_res = np.random.randn(5, 5); print(np.round(sinkhorn_knopp(H_res, iterations=10), 4))", + "expected_output": "[[0.104, 0.0741, 0.1077, 0.5699, 0.1444], [0.0513, 0.4229, 0.1244, 0.0796, 0.3218], [0.1844, 0.2474, 0.3326, 0.085, 0.1507], [0.1511, 0.1294, 0.3233, 0.21, 0.1863], [0.5093, 0.1262, 0.1121, 0.0556, 0.1969]]" + }, + { + "test": "H_res = np.array([[0.5]]); print(np.round(sinkhorn_knopp(H_res, iterations=15), 4))", + "expected_output": "[[1.]]" + } +]
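
The tests above only compare rounded matrices, so a quick way to confirm that the reference `sinkhorn_knopp` really lands in the Birkhoff polytope is to check the row and column sums directly. A minimal standalone sketch (it assumes only `numpy`; the random test matrix, seed, iteration count, and tolerance are illustrative choices, not part of the submitted files):

```python
import numpy as np


def sinkhorn_knopp(H_res: np.ndarray, iterations: int = 20) -> np.ndarray:
    # Exponentiate so every entry is strictly positive, then alternately
    # normalize columns and rows, mirroring the reference solution above.
    H = np.exp(H_res)
    for _ in range(iterations):
        H = H / H.sum(axis=0, keepdims=True)  # each column sums to 1
        H = H / H.sum(axis=1, keepdims=True)  # each row sums to 1
    return H


if __name__ == "__main__":
    rng = np.random.default_rng(0)                     # illustrative test matrix
    H = sinkhorn_knopp(rng.standard_normal((5, 5)), iterations=200)
    print(np.allclose(H.sum(axis=1), 1.0, atol=1e-6))  # rows    -> True
    print(np.allclose(H.sum(axis=0), 1.0, atol=1e-6))  # columns -> True
```

Because the loop ends with a row normalization, the row sums are exact up to floating-point error, while the column sums converge geometrically with the number of iterations.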