diff --git a/Python/README_CUSTOM.md b/Python/README_CUSTOM.md new file mode 100644 index 0000000..6694ae2 --- /dev/null +++ b/Python/README_CUSTOM.md @@ -0,0 +1,76 @@ +# Custom FSA Implementation + +This directory contains a custom implementation of the FSA (Feature Selection with Annealing) algorithm with enhanced multiclass support. + +## Files + +- **fsa_custom.py**: Main implementation of FSA_Multiclass with PyTorch +- **demo_custom.py**: Demo script showing usage for binary and multiclass classification +- **fsa.py**: Original FSA implementation from the repository +- **demo.ipynb**: Original demo notebook + +## Custom Implementation Features + +### FSA_Multiclass Class + +The `FSA_Multiclass` class provides an improved implementation with the following features: + +- **Multiclass Support**: Handles both binary and multiclass classification problems +- **GPU Acceleration**: Utilizes CUDA when available for faster computation +- **Gradient Clipping**: Prevents gradient explosion during training +- **Annealing Schedule**: Gradually reduces features during optimization +- **Flexible Interface**: Works with both NumPy arrays and PyTorch tensors + +### Parameters + +- `k` (int): Target number of features to select +- `mu` (float, default=100): Annealing parameter controlling feature reduction speed +- `s` (float, default=0.0001): L2 regularization parameter +- `Niter` (int, default=300): Number of optimization iterations +- `lr` (float, default=0.01): Learning rate for SGD optimizer + +## Usage Example + +```python +import torch +from fsa_custom import FSA_Multiclass, select_features_fsa + +# Prepare data +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +X_tensor = torch.tensor(X_train, dtype=torch.float32).to(device) +y_tensor = torch.tensor(y_train, dtype=torch.float32).to(device) + +# Select top k features +fsa = FSA_Multiclass(k=20, mu=100, Niter=300) +fsa.fit(X_tensor, y_tensor, device, num_classes=5) + +# Get selected feature indices 
+selected_features = fsa.idx.cpu().numpy() +print(f"Selected features: {selected_features}") + +# Or use the high-level function +selected_feature_names = select_features_fsa(X_train, y_train, k=20, num_classes=5) +``` + +## Running the Demo + +```bash +cd Python +python demo_custom.py +``` + +The demo will run two examples: +1. Binary classification with 100 features → 20 selected +2. Multiclass (5 classes) with 100 features → 25 selected + +## Reference + +Original FSA algorithm from: +> A. Barbu, Y. She, L. Ding, G. Gramajo. Feature Selection with Annealing for Computer Vision and Big Data Learning. IEEE PAMI, 39, No. 2, 272–286, 2017 + +## Requirements + +- Python 3.7+ +- PyTorch +- NumPy +- scikit-learn (for demo only) diff --git a/Python/demo_custom.py b/Python/demo_custom.py new file mode 100644 index 0000000..904e7f6 --- /dev/null +++ b/Python/demo_custom.py @@ -0,0 +1,145 @@ +""" +Demo script for FSA_Multiclass feature selection +Shows usage examples for both binary and multiclass classification +""" + +import torch +import numpy as np +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score +import sys +sys.path.append('.') +from fsa_custom import FSA_Multiclass, select_features_fsa + + +def demo_binary_classification(): + """Demo for binary classification""" + print("=" * 60) + print("BINARY CLASSIFICATION DEMO") + print("=" * 60) + + # Generate synthetic binary dataset + X, y = make_classification( + n_samples=500, + n_features=100, + n_informative=10, + n_redundant=20, + n_classes=2, + random_state=42 + ) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + print(f"Dataset: {X_train.shape[0]} samples, {X_train.shape[1]} features") + + # Feature selection with FSA + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: 
{device}") + + k = 20 # Select 20 features + print(f"\nSelecting top {k} features with FSA...") + + X_tensor = torch.tensor(X_train, dtype=torch.float32).to(device) + y_tensor = torch.tensor(y_train, dtype=torch.float32).to(device) + + fsa = FSA_Multiclass(k=k, mu=100, Niter=300) + fsa.fit(X_tensor, y_tensor, device, num_classes=2) + + selected_features = fsa.idx.cpu().numpy() + print(f"Selected features: {selected_features}") + + # Train classifier on selected features + X_train_selected = X_train[:, selected_features] + X_test_selected = X_test[:, selected_features] + + clf = RandomForestClassifier(n_estimators=100, random_state=42) + clf.fit(X_train_selected, y_train) + + # Evaluate + y_pred = clf.predict(X_test_selected) + acc = accuracy_score(y_test, y_pred) + + print(f"\nAccuracy with {k} selected features: {acc:.4f}") + + # Compare with all features + clf_all = RandomForestClassifier(n_estimators=100, random_state=42) + clf_all.fit(X_train, y_train) + y_pred_all = clf_all.predict(X_test) + acc_all = accuracy_score(y_test, y_pred_all) + + print(f"Accuracy with all {X_train.shape[1]} features: {acc_all:.4f}") + print() + + +def demo_multiclass_classification(): + """Demo for multiclass classification""" + print("=" * 60) + print("MULTICLASS CLASSIFICATION DEMO") + print("=" * 60) + + # Generate synthetic multiclass dataset + X, y = make_classification( + n_samples=500, + n_features=100, + n_informative=15, + n_redundant=25, + n_classes=5, # 5 classes + n_clusters_per_class=1, + random_state=42 + ) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + print(f"Dataset: {X_train.shape[0]} samples, {X_train.shape[1]} features, 5 classes") + + # Feature selection with FSA + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + k = 25 # Select 25 features + print(f"\nSelecting top {k} features with FSA...") + + X_tensor = torch.tensor(X_train, 
dtype=torch.float32).to(device) + y_tensor = torch.tensor(y_train, dtype=torch.float32).to(device) + + fsa = FSA_Multiclass(k=k, mu=100, Niter=300) + fsa.fit(X_tensor, y_tensor, device, num_classes=5) + + selected_features = fsa.idx.cpu().numpy() + print(f"Selected features: {selected_features}") + + # Train classifier on selected features + X_train_selected = X_train[:, selected_features] + X_test_selected = X_test[:, selected_features] + + clf = RandomForestClassifier(n_estimators=100, random_state=42) + clf.fit(X_train_selected, y_train) + + # Evaluate + y_pred = clf.predict(X_test_selected) + acc = accuracy_score(y_test, y_pred) + + print(f"\nAccuracy with {k} selected features: {acc:.4f}") + + # Compare with all features + clf_all = RandomForestClassifier(n_estimators=100, random_state=42) + clf_all.fit(X_train, y_train) + y_pred_all = clf_all.predict(X_test) + acc_all = accuracy_score(y_test, y_pred_all) + + print(f"Accuracy with all {X_train.shape[1]} features: {acc_all:.4f}") + print() + + +if __name__ == "__main__": + demo_binary_classification() + demo_multiclass_classification() + print("=" * 60) + print("DEMO COMPLETED") + print("=" * 60) diff --git a/Python/fsa_custom.py b/Python/fsa_custom.py new file mode 100644 index 0000000..fe490cc --- /dev/null +++ b/Python/fsa_custom.py @@ -0,0 +1,208 @@ +""" +Custom FSA (Feature Selection with Annealing) Implementation +Supporting multiclass classification with PyTorch + +Original concept from: +A. Barbu, Y. She, L. Ding, G. Gramajo. +Feature Selection with Annealing for Computer Vision and Big Data Learning. +IEEE PAMI, 39, No. 
# (tail of the module docstring citation from the paste:
#  "... IEEE PAMI, 39, No. 2, 272–286, 2017")

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


class FSA_Multiclass:
    """
    Feature Selection with Annealing - Multiclass Support

    Parameters:
    -----------
    k : int
        Target number of features to select
    mu : float, default=100
        Annealing parameter controlling the speed of feature reduction
    s : float, default=0.0001
        L2 regularization parameter
    Niter : int, default=300
        Number of iterations for optimization
    lr : float, default=0.01
        Learning rate for SGD optimizer
    """

    def __init__(self, k, mu=100, s=0.0001, Niter=300, lr=0.01):
        self.mu = mu
        self.k = k
        self.Niter = Niter
        self.lr = lr
        self.s = s
        self.idx = None   # indices of the currently selected features
        self.w = None     # weight vector (binary) or matrix (multiclass)
        self.w0 = None    # bias term(s)

    def fit(self, X, y, device, num_classes=None):
        """
        Fit the FSA model to select features.

        Parameters:
        -----------
        X : torch.Tensor or array-like
            Training data of shape (n_samples, n_features)
        y : torch.Tensor or array-like
            Target values of shape (n_samples,)
        device : torch.device
            Device to run computations on (CPU or CUDA)
        num_classes : int, optional
            Number of classes (auto-detected if not provided)

        Returns:
        --------
        self : FSA_Multiclass
            Fitted estimator with selected feature indices in self.idx
        """
        p = X.shape[1]

        # Auto-detect number of classes if not provided
        if num_classes is None:
            n_classes = len(torch.unique(y))
        else:
            n_classes = num_classes

        # Initialize all features as selected
        self.idx = torch.arange(0, p, dtype=torch.long).to(device)

        # Initialize weights based on number of classes
        if n_classes == 2:
            # Binary classification: single weight vector, labels in {-1, +1}
            self.w = torch.zeros((p, 1), device=device, requires_grad=True)
            self.w0 = torch.zeros(1, device=device, requires_grad=True)
            y_mod = y.clone().float()
            y_mod[y_mod == 0] = -1
        else:
            # Multiclass: weight matrix (one column per class), integer labels
            self.w = torch.zeros((p, n_classes), device=device, requires_grad=True)
            self.w0 = torch.zeros(n_classes, device=device, requires_grad=True)
            y_mod = y.long()

        optimizer = optim.SGD([self.w, self.w0], lr=self.lr)
        criterion = nn.CrossEntropyLoss()

        # Main optimization loop with annealing
        for i in range(self.Niter):
            optimizer.zero_grad()

            if n_classes == 2:
                # Binary: logistic loss log(1 + exp(-y * x.w))
                xw = X[:, self.idx].float() @ self.w.view(-1, 1) + self.w0
                yxw = y_mod * xw.squeeze()
                loss_task = torch.mean(torch.log(1 + torch.exp(-yxw)))
            else:
                # Multiclass: cross-entropy loss
                logits = X[:, self.idx].float() @ self.w + self.w0
                loss_task = criterion(logits, y_mod)

            # Add L2 regularization on weights and bias
            loss_reg = self.s * torch.sum(self.w ** 2) + self.s * torch.sum(self.w0 ** 2)
            loss = loss_task + loss_reg

            # Bail out if the optimization diverged
            if torch.isnan(loss):
                break

            loss.backward()
            torch.nn.utils.clip_grad_norm_([self.w, self.w0], max_norm=1.0)
            optimizer.step()

            # Annealing schedule (Barbu et al.): number of features to keep
            # at iteration i, decaying from p down to k.
            m = int(self.k + (p - self.k) * max(0, (self.Niter - 2 * i) / (2 * i * self.mu + self.Niter)))

            if m < self.w.shape[0]:
                # Feature importance: |w| for binary, per-row L2 norm for multiclass
                if n_classes == 2:
                    feature_importance = torch.abs(self.w.view(-1))
                else:
                    feature_importance = torch.norm(self.w, p=2, dim=1)

                # Threshold at the m-th largest importance
                sw = -torch.sort(-feature_importance)[0]
                thr_t = sw[m - 1]
                # BUG FIX: check NaN on the tensor element directly instead of
                # re-wrapping a Python float with torch.tensor(...)
                if torch.isnan(thr_t):
                    break
                thr = thr_t.item()

                j = torch.where(feature_importance >= thr)[0]

                # Fallback: guarantee at least m features survive
                if len(j) == 0:
                    _, top_indices = torch.topk(feature_importance, m)
                    j = top_indices

                # Keep only the surviving features; weights must be re-leafed
                # so the fresh optimizer can update them.
                self.idx = self.idx[j]
                self.w = self.w[j].detach().clone()
                self.w.requires_grad = True
                # BUG FIX: the original recreated the optimizer with a
                # hard-coded lr=0.1, silently overriding the user-supplied
                # learning rate (default 0.01). Use self.lr consistently.
                optimizer = optim.SGD([self.w, self.w0], lr=self.lr)

        return self


def select_features_fsa(X_train, y_train, k, **kwargs):
    """
    High-level function to perform FSA feature selection.

    Parameters:
    -----------
    X_train : pandas.DataFrame or array-like
        Training features
    y_train : pandas.Series or array-like
        Training labels
    k : int
        Number of features to select
    **kwargs : dict
        Additional parameters (e.g., num_classes)

    Returns:
    --------
    selected_features : list
        List of selected feature names (or indices if no names available)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Handle pandas DataFrames (anything exposing .values / .columns)
    if hasattr(X_train, 'values'):
        X_val = X_train.values
        feature_names = X_train.columns
    else:
        X_val = X_train
        feature_names = np.array([f'x{i}' for i in range(X_train.shape[1])])

    if hasattr(y_train, 'values'):
        y_val = y_train.values
    else:
        y_val = y_train

    # Convert to PyTorch tensors
    X_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

    # Initialize and fit FSA; infer class count from labels if not given
    fsa = FSA_Multiclass(k=k)
    num_classes = kwargs.get('num_classes', int(y_tensor.max()) + 1)

    try:
        fsa.fit(X_tensor, y_tensor, device, num_classes=num_classes)
        selected_indices = fsa.idx.cpu().numpy()
        return feature_names[selected_indices].tolist()
    except Exception as e:
        # Best-effort API: report the failure and return an empty selection
        print(f" ❌ FSA Failed: {e}")
        return []