🧠 Complete ML Algorithms Visualizer

Interactive 3D Visualizations with Python Implementation

🎓 Machine Learning Fundamentals

Machine Learning is a subset of Artificial Intelligence that enables computers to learn and improve from experience without being explicitly programmed. This interactive guide covers all major ML algorithms with 3D visualizations, mathematical intuition, and Python implementations.

📌 Types of Machine Learning

1. Supervised Learning: Learning with labeled data (Regression & Classification)
2. Unsupervised Learning: Finding patterns in unlabeled data (Clustering & Dimensionality Reduction)
3. Reinforcement Learning: Learning through trial and error with rewards
4. Semi-supervised Learning: A mix of labeled and unlabeled data
🔄 ML Pipeline Overview
📊 Algorithm Selection Guide
# Complete ML Pipeline in Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Data Collection & Exploration
def explore_data(df):
    """Print summary statistics for *df* and draw basic EDA charts.

    Shows shape, dtypes, missing-value counts and describe() output,
    then plots a correlation heatmap and per-column histograms.
    """
    print("Dataset Shape:", df.shape)
    print("\nData Types:\n", df.dtypes)
    print("\nMissing Values:\n", df.isnull().sum())
    print("\nStatistical Summary:\n", df.describe())
    
    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # Correlation heatmap; numeric_only=True avoids a TypeError when the
    # frame contains object (string) columns on pandas >= 2.0
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=axes[0,0])
    axes[0,0].set_title('Correlation Matrix')
    
    # Distribution plots: overlay each numeric column on one axis.
    # (df.hist(ax=...) cannot place multiple columns on a single Axes.)
    for col in df.select_dtypes(include='number').columns:
        axes[0,1].hist(df[col], bins=20, alpha=0.5, label=col)
    axes[0,1].set_title('Feature Distributions')
    axes[0,1].legend()
    
    plt.tight_layout()
    plt.show()

# 2. Feature Engineering
def engineer_features(df):
    """Prepare *df* for modelling: impute, encode and scale.

    Expects a 'target' column. Returns (X_scaled, y) where X_scaled is
    the standardized feature matrix and y is the target column.
    """
    df = df.copy()  # avoid mutating the caller's frame

    # Handle missing values: column mean for numeric features, most
    # frequent value for categoricals. (A bare df.fillna(df.mean())
    # raises on object columns with pandas >= 2.0 and would leave
    # categorical NaNs in place for LabelEncoder to choke on.)
    num_cols = df.select_dtypes(include='number').columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
    obj_cols = df.select_dtypes(include=['object']).columns
    for col in obj_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mode().iloc[0])

    # Encode categorical variables — a fresh encoder per column so one
    # column's fitted mapping cannot leak into the next
    for col in obj_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Scale features to zero mean / unit variance (target excluded)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df.drop('target', axis=1))

    return X_scaled, df['target']

# 3. Model Training & Evaluation
def train_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split, report test-set metrics, and
    return the fitted model."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Headline accuracy plus the full per-class breakdown
    reports = (
        ("Accuracy:", accuracy_score(y_test, predictions)),
        ("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions)),
        ("\nClassification Report:\n", classification_report(y_test, predictions)),
    )
    for label, value in reports:
        print(label, value)

    return model

📈 Linear Regression

Linear Regression models the relationship between independent variables (X) and a dependent variable (y) by fitting a linear equation. It's the foundation of predictive modeling.

🧮 Mathematical Formulation

Hypothesis: \( h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + \theta_n x_n \)

Cost Function (MSE): \( J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 \)

Gradient Descent: \( \theta_j := \theta_j - \alpha \frac{\partial J(\theta)}{\partial \theta_j} \)

Normal Equation: \( \theta = (X^T X)^{-1} X^T y \)

🎮 Interactive Controls

📊 2D Regression Line
🎲 3D Cost Function Surface
0.00
MSE (Mean Squared Error)
0.00
RMSE
0.00
R² Score
0.00
θ₀ (Intercept)
0.00
θ₁ (Slope)
# Linear Regression - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Generate synthetic data: one feature with Gaussian noise (std=10);
# random_state fixes the sample so the demo is reproducible
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)

# Split data: 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Method 1: Using sklearn (Quick) — ordinary least squares fit
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Learned slope/intercept and goodness of fit on the held-out split
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")

# Method 2: Manual Gradient Descent (Educational)
# Method 2: Manual Gradient Descent (Educational)
class ManualLinearRegression:
    """Linear regression fit by batch gradient descent.

    Educational reimplementation of least squares: ``theta`` holds
    [intercept, coefficients] and ``cost_history`` records the cost
    J(theta) at every iteration so convergence can be plotted.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.theta = None        # shape (n_features + 1, 1) after fit()
        self.cost_history = []   # per-iteration cost of the last fit()

    def fit(self, X, y):
        """Run gradient descent on (X, y). Returns self for chaining."""
        m, n = X.shape
        # Prepend a column of ones so theta[0] acts as the bias term
        X_b = np.c_[np.ones((m, 1)), X]
        # Deterministic zero init: the MSE cost is convex, so the
        # starting point only affects the path, never the optimum.
        # (The previous random init made results non-reproducible.)
        self.theta = np.zeros((n + 1, 1))
        # Reset so repeated fit() calls don't accumulate stale history
        self.cost_history = []

        y_col = y.reshape(-1, 1)  # hoist: column view used every step
        for _ in range(self.n_iter):
            # Predictions with the current parameters
            y_pred = X_b.dot(self.theta)

            # Gradient of J(theta) = (1/2m) * sum((y_pred - y)^2)
            gradient = (1/m) * X_b.T.dot(y_pred - y_col)

            # Gradient-descent update
            self.theta -= self.lr * gradient

            # Record the cost measured *before* this update took effect
            cost = (1/(2*m)) * np.sum((y_pred - y_col)**2)
            self.cost_history.append(cost)

        return self

    def predict(self, X):
        """Return predictions for X as a flat 1-D array."""
        m = X.shape[0]
        X_b = np.c_[np.ones((m, 1)), X]
        return X_b.dot(self.theta).flatten()

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Scatter plot with regression line: actual test points vs the
# fitted line evaluated at the same x positions
axes[0,0].scatter(X_test, y_test, color='blue', alpha=0.5, label='Actual')
axes[0,0].plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
axes[0,0].set_xlabel('X')
axes[0,0].set_ylabel('y')
axes[0,0].legend()
axes[0,0].set_title('Linear Regression Fit')

# 2. Residuals plot: for a well-specified linear model the residuals
# should scatter randomly around the zero line with no visible pattern
residuals = y_test - y_pred
axes[0,1].scatter(y_pred, residuals, alpha=0.6)
axes[0,1].axhline(y=0, color='red', linestyle='--')
axes[0,1].set_xlabel('Predicted')
axes[0,1].set_ylabel('Residuals')
axes[0,1].set_title('Residual Plot')

# 3. Cost function convergence of the hand-rolled gradient descent;
# the curve should decrease monotonically toward a plateau
manual_model = ManualLinearRegression(learning_rate=0.01, n_iterations=500)
manual_model.fit(X_train, y_train)
axes[1,0].plot(manual_model.cost_history)
axes[1,0].set_xlabel('Iteration')
axes[1,0].set_ylabel('Cost (MSE)')
axes[1,0].set_title('Gradient Descent Convergence')

# 4. Feature importance (for multiple features) — note: no random_state
# is passed here, so this panel's data differs on every run
X_multi, y_multi = make_regression(n_samples=100, n_features=5, noise=10)
model_multi = LinearRegression()
model_multi.fit(X_multi, y_multi)
axes[1,1].bar(range(5), model_multi.coef_)
axes[1,1].set_xlabel('Feature Index')
axes[1,1].set_ylabel('Coefficient')
axes[1,1].set_title('Feature Importance')

plt.tight_layout()
plt.show()

🔮 Logistic Regression

Logistic Regression is a classification algorithm used to predict the probability of a binary or categorical outcome. Unlike Linear Regression, it uses the sigmoid function to constrain outputs between 0 and 1.

🧮 Mathematical Formulation

Sigmoid Function: \( \sigma(z) = \frac{1}{1 + e^{-z}} \)

Hypothesis: \( h_\theta(x) = \sigma(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}} \)

Cost Function (Log Loss): \( J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} [y^{(i)} \log(h_\theta(x^{(i)})) + (1-y^{(i)}) \log(1-h_\theta(x^{(i)}))] \)

Gradient: \( \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)} \)

🎮 Interactive Controls

📊 Decision Boundary
📈 Sigmoid Function 3D
0.00
Accuracy
0.00
Precision
0.00
Recall
0.00
F1 Score
# Logistic Regression - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification, load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc

# Generate binary classification data: 2 informative features, no
# redundant ones, a single cluster per class
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0, 
                          n_informative=2, n_clusters_per_class=1, random_state=42)

# Split data: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression (C is the inverse regularization strength)
model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train, y_train)

# Predictions: hard labels plus P(class=1), needed for the ROC curve
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# ROC Curve: true-positive vs false-positive rate over all thresholds
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Decision Boundary: evaluate P(class=1) on a dense grid, then shade
h = 0.02  # grid step
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = axes[0,0].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='black')
axes[0,0].set_title('Decision Boundary')

# 2. Sigmoid Curve: the link function mapping raw scores into (0, 1)
x_sigmoid = np.linspace(-10, 10, 100)
y_sigmoid = 1 / (1 + np.exp(-x_sigmoid))
axes[0,1].plot(x_sigmoid, y_sigmoid, linewidth=3)
axes[0,1].axhline(y=0.5, color='red', linestyle='--')  # decision threshold
axes[0,1].set_title('Sigmoid Function')
axes[0,1].grid(True)

# 3. ROC Curve (the diagonal is the random-classifier baseline)
axes[1,0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
axes[1,0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
axes[1,0].set_xlabel('False Positive Rate')
axes[1,0].set_ylabel('True Positive Rate')
axes[1,0].set_title('ROC Curve')
axes[1,0].legend()

# 4. Probability Distribution per true class: well-separated histograms
# indicate the model is confident on both classes
axes[1,1].hist(y_pred_proba[y_test==0], bins=20, alpha=0.5, label='Class 0', color='red')
axes[1,1].hist(y_pred_proba[y_test==1], bins=20, alpha=0.5, label='Class 1', color='blue')
axes[1,1].set_xlabel('Predicted Probability')
axes[1,1].set_title('Probability Distribution')
axes[1,1].legend()

plt.tight_layout()
plt.show()

🌳 Decision Trees

Decision Trees split data recursively based on feature values to create a tree-like model of decisions. They are intuitive, easy to interpret, and can handle both numerical and categorical data.

🧮 Splitting Criteria

Gini Impurity: \( Gini = 1 - \sum_{i=1}^{c} (p_i)^2 \)

Entropy: \( H = -\sum_{i=1}^{c} p_i \log_2(p_i) \)

Information Gain: \( IG = H_{parent} - \sum_{j} \frac{n_j}{n} H_j \)

🎮 Tree Parameters

🌲 Decision Tree Structure
📊 Decision Boundaries

🔍 How Decision Trees Work

1. Select Best Feature: Calculate the Information Gain for each feature
2. Split Data: Divide the dataset based on the best feature's threshold
3. Recursion: Repeat the process for each subset until a stopping criterion is met
4. Pruning: Remove branches that don't improve validation accuracy
# Decision Tree - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris dataset
iris = load_iris()
X, y = iris.data[:, :2], iris.target  # Use only 2 features for visualization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Decision Tree; max_depth=3 keeps the tree small enough to plot
dt = DecisionTreeClassifier(max_depth=3, criterion='gini', min_samples_split=2)
dt.fit(X_train, y_train)

# Predictions
y_pred = dt.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Decision Tree Plot: each node shows its split rule, impurity,
# sample count and class distribution
plot_tree(dt, feature_names=iris.feature_names[:2], 
          class_names=iris.target_names, filled=True, ax=axes[0,0])
axes[0,0].set_title('Decision Tree Structure')

# 2. Decision Boundary: predict on a dense grid; axis-aligned splits
# produce the characteristic rectangular regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = dt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[0,1].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = axes[0,1].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,1].set_xlabel(iris.feature_names[0])
axes[0,1].set_ylabel(iris.feature_names[1])
axes[0,1].set_title('Decision Boundaries')

# 3. Feature Importance (impurity decrease accumulated over splits)
importances = dt.feature_importances_
axes[1,0].barh(iris.feature_names[:2], importances)
axes[1,0].set_xlabel('Importance')
axes[1,0].set_title('Feature Importance')

# 4. Tree Depth vs Accuracy: a widening train/test gap at larger
# depths is the classic signature of overfitting
depths = range(1, 11)
train_scores = []
test_scores = []

for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

axes[1,1].plot(depths, train_scores, 'o-', label='Training')
axes[1,1].plot(depths, test_scores, 's-', label='Testing')
axes[1,1].set_xlabel('Tree Depth')
axes[1,1].set_ylabel('Accuracy')
axes[1,1].set_title('Depth vs Accuracy (Overfitting Check)')
axes[1,1].legend()
axes[1,1].grid(True)

plt.tight_layout()
plt.show()

# Text representation of tree (human-readable if/else rules)
tree_rules = export_text(dt, feature_names=iris.feature_names[:2])
print(tree_rules)

🎯 K-Means Clustering

K-Means is an unsupervised learning algorithm that partitions data into K clusters by minimizing the within-cluster sum of squares. It's the most popular clustering algorithm due to its simplicity and efficiency.

🧮 Mathematical Formulation

Objective Function: \( J = \sum_{i=1}^{k} \sum_{x \in C_i} ||x - \mu_i||^2 \)

Centroid Update: \( \mu_i = \frac{1}{|C_i|} \sum_{x \in C_i} x \)

Assignment Step: \( C_i = \{x : ||x - \mu_i||^2 \leq ||x - \mu_j||^2, \forall j\} \)

Elbow Method: Find K where marginal gain drops significantly

🎮 Clustering Controls

📊 Clustering Result
📈 Elbow Method
0.00
Inertia (WCSS)
0.00
Silhouette Score
0
Iterations
# K-Means Clustering - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs, make_moons, load_iris
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Generate sample data: 4 well-separated Gaussian blobs
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Elbow Method to find optimal K: fit K-Means for each K and record the
# inertia (within-cluster sum of squares) and silhouette score
inertias = []
silhouettes = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X, kmeans.labels_))

# Apply K-Means with optimal K
optimal_k = 4  # Based on elbow plot
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Clustering Result: points colored by assigned cluster
scatter = axes[0,0].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.7)
axes[0,0].scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, marker='X', edgecolors='black', label='Centroids')
axes[0,0].set_title(f'K-Means Clustering (K={optimal_k})')
axes[0,0].legend()

# 2. Elbow Plot: pick the K where the inertia curve visibly bends
axes[0,1].plot(K_range, inertias, 'bo-')
axes[0,1].set_xlabel('Number of Clusters (K)')
axes[0,1].set_ylabel('Inertia (WCSS)')
axes[0,1].set_title('Elbow Method')
axes[0,1].grid(True)

# 3. Silhouette Score: higher is better (upper bound 1.0)
axes[1,0].plot(K_range, silhouettes, 'go-')
axes[1,0].set_xlabel('Number of Clusters (K)')
axes[1,0].set_ylabel('Silhouette Score')
axes[1,0].set_title('Silhouette Analysis')
axes[1,0].grid(True)

# 4. Voronoi Diagram (Decision Boundaries): predict the nearest
# centroid for every grid point and shade the resulting regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1,1].imshow(Z, interpolation='nearest', extent=(x_min, x_max, y_min, y_max), 
                cmap='viridis', aspect='auto', origin='lower', alpha=0.4)
axes[1,1].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, edgecolors='k')
axes[1,1].scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, marker='X')
axes[1,1].set_title('Voronoi Diagram (Cluster Regions)')

plt.tight_layout()
plt.show()

# Print cluster statistics
print(f"Final Inertia: {kmeans.inertia_:.2f}")
print(f"Silhouette Score: {silhouette_score(X, labels):.4f}")
print(f"Iterations to converge: {kmeans.n_iter_}")
print(f"\nCluster sizes: {np.bincount(labels)}")

🌲 Random Forest

Random Forest is an ensemble learning method that constructs multiple decision trees during training and outputs the mode of the classes (classification) or mean prediction (regression) of the individual trees. It reduces overfitting and improves accuracy.

🧮 Mathematical Foundation

Bootstrap Aggregation (Bagging): Train M trees on bootstrap samples

Random Subspace: Select m features randomly at each split (typically m = √p)

Prediction: \( \hat{y} = \frac{1}{M} \sum_{i=1}^{M} f_i(x) \) for regression

Majority Vote: \( \hat{y} = \text{mode}\{f_1(x), f_2(x), ..., f_M(x)\} \) for classification

Out-of-Bag Error: \( \text{OOB} = \frac{1}{N} \sum_{i=1}^{N} L(y_i, \hat{y}_i^{oob}) \)

🎮 Forest Parameters

🌳 Forest Visualization
📊 Feature Importance
0.00
OOB Accuracy
100
Active Trees
10
Avg Depth

🔍 Why Random Forest Works

1. Decorrelated Trees: Random feature selection makes the trees diverse
2. Wisdom of Crowds: Many weak learners combine into a strong predictor
3. Built-in Validation: The OOB error estimates generalization without cross-validation
# Random Forest - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# NOTE: load_boston was removed in scikit-learn 1.2, so importing it
# raises an error; it was unused here and has been dropped.
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Classification with Random Forest: 10 features, of which 5 are
# informative and 3 are linear combinations (redundant)
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
                           n_redundant=3, n_classes=3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest; max_features='sqrt' decorrelates the trees and
# oob_score=True gives a free generalization estimate from the
# bootstrap samples each tree never saw
rf = RandomForestClassifier(n_estimators=100, max_depth=10, 
                            max_features='sqrt', oob_score=True,
                            random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"OOB Score: {rf.oob_score_:.4f}")

# Feature Importance, sorted descending
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Feature Importance (mean impurity decrease across the forest)
axes[0,0].bar(range(10), importances[indices])
axes[0,0].set_xticks(range(10))
axes[0,0].set_xticklabels([f'Feature {i}' for i in indices], rotation=45)
axes[0,0].set_title('Feature Importance (Gini)')
axes[0,0].set_ylabel('Importance')

# 2. Trees vs Accuracy: accuracy typically plateaus as trees are added
n_estimators_range = [1, 5, 10, 25, 50, 100, 200]
train_scores = []
test_scores = []

for n in n_estimators_range:
    rf_temp = RandomForestClassifier(n_estimators=n, random_state=42)
    rf_temp.fit(X_train, y_train)
    train_scores.append(rf_temp.score(X_train, y_train))
    test_scores.append(rf_temp.score(X_test, y_test))

axes[0,1].plot(n_estimators_range, train_scores, 'o-', label='Training')
axes[0,1].plot(n_estimators_range, test_scores, 's-', label='Testing')
axes[0,1].set_xlabel('Number of Trees')
axes[0,1].set_ylabel('Accuracy')
axes[0,1].set_title('Ensemble Size Effect')
axes[0,1].legend()
axes[0,1].grid(True)

# 3. Decision Boundary (using 2 features so it can be drawn in 2D)
X_viz, y_viz = make_classification(n_samples=200, n_features=2, n_redundant=0,
                                   n_clusters_per_class=1, random_state=42)
rf_viz = RandomForestClassifier(n_estimators=50, random_state=42)
rf_viz.fit(X_viz, y_viz)

x_min, x_max = X_viz[:, 0].min() - 1, X_viz[:, 0].max() + 1
y_min, y_max = X_viz[:, 1].min() - 1, X_viz[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = rf_viz.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[1,0].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
axes[1,0].scatter(X_viz[:, 0], X_viz[:, 1], c=y_viz, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[1,0].set_title('Decision Boundary (2D)')

# 4. Tree Depth Distribution across the fitted ensemble
depths = [est.tree_.max_depth for est in rf.estimators_]
axes[1,1].hist(depths, bins=20, edgecolor='black')
axes[1,1].set_xlabel('Tree Depth')
axes[1,1].set_ylabel('Frequency')
axes[1,1].set_title('Distribution of Tree Depths')

plt.tight_layout()
plt.show()

⚡ Support Vector Machine (SVM)

Support Vector Machine finds the optimal hyperplane that maximally separates classes with the largest margin. It uses the kernel trick to handle non-linear decision boundaries and is effective in high-dimensional spaces.

🧮 Mathematical Formulation

Decision Function: \( f(x) = w^T x + b \)

Margin Maximization: \( \min_{w,b} \frac{1}{2} ||w||^2 \) subject to \( y_i(w^T x_i + b) \geq 1 \)

Soft Margin (C): \( \min \frac{1}{2} ||w||^2 + C \sum_{i} \max(0, 1 - y_i(w^T x_i + b)) \)

Kernel Trick: \( K(x_i, x_j) = \phi(x_i)^T \phi(x_j) \)

RBF Kernel: \( K(x, x') = \exp(-\gamma ||x - x'||^2) \)

🎮 SVM Parameters

📊 Decision Boundary & Support Vectors
📈 Kernel Comparison
0.00
Accuracy
0
Support Vectors
0.00
Margin Width
# SVM - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate non-linear dataset: concentric circles, which no linear
# boundary can separate
X, y = make_circles(n_samples=300, noise=0.1, factor=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train SVM with RBF kernel; probability=True enables predict_proba via
# Platt scaling, and random_state makes that calibration reproducible
svm = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm.fit(X_train, y_train)

# Predictions
y_pred = svm.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Support Vectors: {svm.n_support_}")  # count per class

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Decision Boundary with Support Vectors: decision_function returns
# the signed distance to the separating surface; the -1/0/+1 contours
# are the margin edges and the boundary itself
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
axes[0,0].contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], 
             alpha=0.5, linestyles=['--', '-', '--'])
axes[0,0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1], 
             s=100, linewidth=1, facecolors='none', edgecolors='k', label='Support Vectors')
axes[0,0].set_title('SVM Decision Boundary')
axes[0,0].legend()

# 2. Kernel Comparison: linear should underperform on circular data
kernels = ['linear', 'rbf', 'poly']
accuracies = []
for kernel in kernels:
    svm_k = SVC(kernel=kernel, random_state=42)
    svm_k.fit(X_train, y_train)
    accuracies.append(svm_k.score(X_test, y_test))

axes[0,1].bar(kernels, accuracies, color=['#00ffff', '#ff00ff', '#ffff00'])
axes[0,1].set_ylabel('Accuracy')
axes[0,1].set_title('Kernel Performance Comparison')
axes[0,1].set_ylim([0, 1])

# 3. C Parameter Effect: small C = wide margin (more regularization),
# large C = fit training points harder (overfitting risk)
C_values = [0.01, 0.1, 1, 10, 100]
train_scores = []
test_scores = []

for C in C_values:
    svm_c = SVC(kernel='rbf', C=C, random_state=42)
    svm_c.fit(X_train, y_train)
    train_scores.append(svm_c.score(X_train, y_train))
    test_scores.append(svm_c.score(X_test, y_test))

axes[1,0].semilogx(C_values, train_scores, 'o-', label='Training')
axes[1,0].semilogx(C_values, test_scores, 's-', label='Testing')
axes[1,0].set_xlabel('C (Regularization Parameter)')
axes[1,0].set_ylabel('Accuracy')
axes[1,0].set_title('Effect of Regularization')
axes[1,0].legend()
axes[1,0].grid(True)

# 4. Probability Contours: svm was already trained with
# probability=True, so reuse it instead of refitting an identical model
proba = svm.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
proba = proba.reshape(xx.shape)

contour = axes[1,1].contourf(xx, yy, proba, levels=50, alpha=0.8, cmap='RdYlBu')
axes[1,1].scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='k')
axes[1,1].set_title('Probability Contours')
plt.colorbar(contour, ax=axes[1,1])

plt.tight_layout()
plt.show()

👥 K-Nearest Neighbors (KNN)

KNN is a non-parametric, lazy learning algorithm that classifies data points based on the majority class of their k nearest neighbors. It's simple, intuitive, and makes no assumptions about data distribution.

🧮 Mathematical Formulation

Distance Metrics: Euclidean: \( d(x, x') = \sqrt{\sum_{i=1}^{n} (x_i - x'_i)^2} \)

Manhattan: \( d(x, x') = \sum_{i=1}^{n} |x_i - x'_i| \)

Minkowski: \( d(x, x') = (\sum_{i=1}^{n} |x_i - x'_i|^p)^{1/p} \)

Prediction: \( \hat{y} = \text{mode}\{y_1, y_2, ..., y_k\} \) or \( \hat{y} = \frac{1}{k} \sum_{i=1}^{k} y_i \)

🎮 KNN Parameters

📊 Decision Boundary
📈 K vs Accuracy
# KNN - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate data: 2 informative features, one cluster per class
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Find optimal K: sweep K and record train/test accuracy
# (small K -> flexible/overfit; large K -> smooth/underfit)
k_range = range(1, 31)
train_scores = []
test_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

# Best K (argmax is 0-based while k_range starts at 1, hence the +1)
optimal_k = np.argmax(test_scores) + 1
print(f"Optimal K: {optimal_k}")

# Train with optimal K
knn_best = KNeighborsClassifier(n_neighbors=optimal_k)
knn_best.fit(X_train, y_train)

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. K vs Accuracy with the chosen K marked
axes[0,0].plot(k_range, train_scores, 'o-', label='Training')
axes[0,0].plot(k_range, test_scores, 's-', label='Testing')
axes[0,0].axvline(optimal_k, color='red', linestyle='--', label=f'Optimal K={optimal_k}')
axes[0,0].set_xlabel('K (Number of Neighbors)')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].set_title('K Selection (Elbow Method)')
axes[0,0].legend()
axes[0,0].grid(True)

# 2. Decision Boundary: classify every point of a dense grid
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = knn_best.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[0,1].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
axes[0,1].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,1].set_title(f'Decision Boundary (K={optimal_k})')

# 3. Distance Metric Comparison at a fixed K=5
metrics = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
metric_scores = []
for metric in metrics:
    knn_m = KNeighborsClassifier(n_neighbors=5, metric=metric)
    knn_m.fit(X_train, y_train)
    metric_scores.append(knn_m.score(X_test, y_test))

axes[1,0].bar(metrics, metric_scores, color=['#00ffff', '#ff00ff', '#ffff00', '#00ff00'])
axes[1,0].set_ylabel('Accuracy')
axes[1,0].set_title('Distance Metric Comparison')
axes[1,0].tick_params(axis='x', rotation=45)

# 4. Weight Comparison: 'distance' weights closer neighbors more heavily
weights = ['uniform', 'distance']
weight_scores = []
for weight in weights:
    knn_w = KNeighborsClassifier(n_neighbors=5, weights=weight)
    knn_w.fit(X_train, y_train)
    weight_scores.append(knn_w.score(X_test, y_test))

axes[1,1].bar(weights, weight_scores, color=['#00ffff', '#ff00ff'])
axes[1,1].set_ylabel('Accuracy')
axes[1,1].set_title('Weighting Scheme Comparison')

plt.tight_layout()
plt.show()

📊 Naive Bayes

Naive Bayes classifiers are probabilistic classifiers based on Bayes' theorem with strong (naive) independence assumptions. Despite their simplicity, they work well for text classification, spam filtering, and sentiment analysis.

🧮 Bayes Theorem

Bayes Rule: \( P(y|x) = \frac{P(x|y) P(y)}{P(x)} \)

Naive Assumption: Features are conditionally independent: \( P(x|y) = \prod_{i=1}^{n} P(x_i|y) \)

Classification: \( \hat{y} = \arg\max_y P(y) \prod_{i=1}^{n} P(x_i|y) \)

Gaussian NB: \( P(x_i|y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\left(-\frac{(x_i - \mu_y)^2}{2\sigma_y^2}\right) \)

🎮 Naive Bayes Variants

# Naive Bayes - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.datasets import load_iris, make_classification, fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

# Gaussian NB for continuous data (models each feature per class as a
# one-dimensional Gaussian)
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print(f"Gaussian NB Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Class-conditional densities for 2 features: refit on the first
# two features only so the decision boundary can be drawn in the plane
X_2d = X[:, :2]
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(X_2d, y, test_size=0.3, random_state=42)

gnb_2d = GaussianNB()
gnb_2d.fit(X_train_2d, y_train_2d)

x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = gnb_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axes[0,0].contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)
scatter = axes[0,0].scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].set_xlabel(iris.feature_names[0])
axes[0,0].set_ylabel(iris.feature_names[1])
axes[0,0].set_title('Naive Bayes Decision Boundary')

# 2. Prior probabilities P(y), estimated from training class frequencies
axes[0,1].bar(iris.target_names, gnb.class_prior_, color=['#00ffff', '#ff00ff', '#ffff00'])
axes[0,1].set_ylabel('Prior Probability')
axes[0,1].set_title('Class Prior Probabilities')

# 3. Feature means per class (gnb.theta_ holds the fitted Gaussian means)
x_pos = np.arange(len(iris.feature_names))
width = 0.25  # bar width; the three classes sit side by side
for i, class_name in enumerate(iris.target_names):
    axes[1,0].bar(x_pos + i*width, gnb.theta_[i], width, label=class_name)
axes[1,0].set_xticks(x_pos + width)
axes[1,0].set_xticklabels(iris.feature_names, rotation=45)
axes[1,0].set_ylabel('Mean Feature Value')
axes[1,0].set_title('Class-conditional Means')
axes[1,0].legend()

# 4. Variance per class (gnb.var_ holds the fitted Gaussian variances)
for i, class_name in enumerate(iris.target_names):
    axes[1,1].bar(x_pos + i*width, gnb.var_[i], width, label=class_name)
axes[1,1].set_xticks(x_pos + width)
axes[1,1].set_xticklabels(iris.feature_names, rotation=45)
axes[1,1].set_ylabel('Variance')
axes[1,1].set_title('Class-conditional Variances')
axes[1,1].legend()

plt.tight_layout()
plt.show()

# Text classification with Multinomial NB
# (fetch_20newsgroups downloads the corpus over the network on first use)
categories = ['alt.atheism', 'sci.space', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer(max_features=1000)
X_text = vectorizer.fit_transform(newsgroups.data)
y_text = newsgroups.target

mnb = MultinomialNB()
mnb.fit(X_text, y_text)
# NOTE: this scores the same data the model was fit on, so it reports
# training accuracy, not generalization performance
print(f"\nText Classification Accuracy: {mnb.score(X_text, y_text):.4f}")

🧠 Neural Networks (Deep Learning)

Neural Networks are computing systems inspired by biological neural networks. They consist of interconnected nodes (neurons) organized in layers. Deep Learning uses many hidden layers to learn complex patterns and representations from data.

🧮 Neural Network Mathematics

Neuron Output: \( z = \sum_{i=1}^{n} w_i x_i + b = W^T x + b \)

Activation: \( a = \sigma(z) \) where σ can be:

- Sigmoid: \( \sigma(z) = \frac{1}{1 + e^{-z}} \)

- ReLU: \( \sigma(z) = \max(0, z) \)

- Tanh: \( \sigma(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}} \)

Backpropagation: \( \frac{\partial L}{\partial w} = \frac{\partial L}{\partial a} \cdot \frac{\partial a}{\partial z} \cdot \frac{\partial z}{\partial w} \)

Weight Update: \( w_{new} = w_{old} - \eta \frac{\partial L}{\partial w} \)

🎮 Network Architecture

🧠 Network Architecture
📉 Training Progress
# Neural Network demo with scikit-learn's MLPClassifier
# (NOTE: the original header said "TensorFlow/Keras", but the code below uses
# sklearn.neural_network.MLPClassifier throughout — no TF/Keras is involved.)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Binary classification on the two-interleaving-half-moons toy dataset
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit on train only, so no test statistics leak into
# the preprocessing step
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a 2-hidden-layer MLP (64 then 32 ReLU units) with the Adam optimizer
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                    solver='adam', learning_rate_init=0.001,
                    max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

print(f"Training Accuracy: {mlp.score(X_train, y_train):.4f}")
print(f"Test Accuracy: {mlp.score(X_test, y_test):.4f}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Decision Boundary: evaluate P(class=1) on a 200x200 grid covering the
# training data (padded by 0.5 on each side) and draw filled contours
x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = mlp.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
axes[0,0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].set_title('Neural Network Decision Boundary')

# 2. Loss Curve: per-iteration training loss recorded by the solver
axes[0,1].plot(mlp.loss_curve_)
axes[0,1].set_xlabel('Iterations')
axes[0,1].set_ylabel('Loss')
axes[0,1].set_title('Training Loss Curve')
axes[0,1].grid(True)

# 3. Activation Functions: ReLU, sigmoid and tanh over [-5, 5] for reference
x = np.linspace(-5, 5, 100)
axes[1,0].plot(x, np.maximum(0, x), label='ReLU', linewidth=2)
axes[1,0].plot(x, 1/(1+np.exp(-x)), label='Sigmoid', linewidth=2)
axes[1,0].plot(x, np.tanh(x), label='Tanh', linewidth=2)
axes[1,0].set_xlabel('Input')
axes[1,0].set_ylabel('Output')
axes[1,0].set_title('Activation Functions')
axes[1,0].legend()
axes[1,0].grid(True)

# 4. Architecture Diagram: neurons drawn as circles, fully-connected edges
# between consecutive layers (2 inputs -> 64 -> 32 -> 1 output)
layer_sizes = [2, 64, 32, 1]
max_neurons = max(layer_sizes)
for i, size in enumerate(layer_sizes):
    x_pos = i
    y_positions = np.linspace(-size/2, size/2, size)
    for j, y_pos in enumerate(y_positions):
        circle = plt.Circle((x_pos, y_pos), 0.2, color='cyan', alpha=0.7)
        axes[1,1].add_patch(circle)
        if i < len(layer_sizes) - 1:
            # Edge from this neuron to every neuron in the next layer
            next_y_positions = np.linspace(-layer_sizes[i+1]/2, layer_sizes[i+1]/2, layer_sizes[i+1])
            for next_y in next_y_positions:
                axes[1,1].plot([x_pos, x_pos+1], [y_pos, next_y], 'gray', alpha=0.3, linewidth=0.5)

axes[1,1].set_xlim(-0.5, len(layer_sizes)-0.5)
axes[1,1].set_ylim(-max_neurons/2-1, max_neurons/2+1)
axes[1,1].set_aspect('equal')
axes[1,1].axis('off')
axes[1,1].set_title('Network Architecture: 2-64-32-1')

plt.tight_layout()
plt.show()

📉 Principal Component Analysis (PCA)

PCA is a dimensionality reduction technique that transforms data into a new coordinate system where the greatest variance lies on the first coordinate (first principal component), the second greatest variance on the second coordinate, and so on.

🧮 Mathematical Foundation

Covariance Matrix: \( \Sigma = \frac{1}{n-1} X^T X \) (where \( X \) is the mean-centered data matrix — each column has had its mean subtracted)

Eigenvalue Problem: \( \Sigma v = \lambda v \)

Principal Components: Eigenvectors sorted by eigenvalues (descending)

Transformation: \( Z = X W \) where the columns of W are the selected eigenvectors (principal components)

Explained Variance: \( \frac{\lambda_i}{\sum_{j=1}^{p} \lambda_j} \)

🎮 PCA Parameters

📊 2D PCA Projection
📈 Explained Variance
# PCA - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_digits, load_wine
from sklearn.preprocessing import StandardScaler

# Load and standardize data
iris = load_iris()
X, y = iris.data, iris.target

# Standardization is crucial for PCA: otherwise features with large numeric
# ranges dominate the principal components
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA keeping all components, so the full variance spectrum is visible
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Fraction of total variance captured by each component, and its running sum
explained_var = pca.explained_variance_ratio_
cumulative_var = np.cumsum(explained_var)

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. 2D projection onto the first two principal components, one color per class
colors = ['red', 'green', 'blue']
for i, target_name in enumerate(iris.target_names):
    axes[0,0].scatter(X_pca[y == i, 0], X_pca[y == i, 1], 
                     color=colors[i], label=target_name, alpha=0.7, s=50)
axes[0,0].set_xlabel(f'PC1 ({explained_var[0]:.1%} variance)')
axes[0,0].set_ylabel(f'PC2 ({explained_var[1]:.1%} variance)')
axes[0,0].set_title('PCA 2D Projection')
axes[0,0].legend()
axes[0,0].grid(True)

# 2. Scree plot: per-component and cumulative explained variance
axes[0,1].bar(range(1, len(explained_var) + 1), explained_var, alpha=0.7, label='Individual')
axes[0,1].plot(range(1, len(explained_var) + 1), cumulative_var, 'ro-', label='Cumulative')
axes[0,1].set_xlabel('Principal Component')
axes[0,1].set_ylabel('Explained Variance Ratio')
axes[0,1].set_title('Scree Plot')
axes[0,1].legend()
axes[0,1].grid(True)

# 3. Biplot: scores plus the original feature axes projected into PC space
# (arrows scaled x3 for visibility only)
feature_vectors = pca.components_[:2].T
axes[1,0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.5)
for i, feature in enumerate(iris.feature_names):
    axes[1,0].arrow(0, 0, feature_vectors[i, 0]*3, feature_vectors[i, 1]*3,
                  head_width=0.1, head_length=0.1, fc='red', ec='red')
    axes[1,0].text(feature_vectors[i, 0]*3.2, feature_vectors[i, 1]*3.2, feature, color='red')
axes[1,0].set_xlabel('PC1')
axes[1,0].set_ylabel('PC2')
axes[1,0].set_title('Biplot with Feature Vectors')
axes[1,0].grid(True)

# 4. 3D PCA (first 3 components)
# BUG FIX: fig.add_subplot(2, 2, 4, projection='3d') does not replace the 2D
# axes created by plt.subplots — it stacks a new 3D axes on top, leaving the
# empty 2D frame (ticks/spines) visible underneath. Remove the placeholder
# axes first so only the 3D plot occupies that slot.
axes[1,1].remove()
ax3d = fig.add_subplot(2, 2, 4, projection='3d')
for i, target_name in enumerate(iris.target_names):
    ax3d.scatter(X_pca[y == i, 0], X_pca[y == i, 1], X_pca[y == i, 2],
                color=colors[i], label=target_name, alpha=0.7, s=30)
ax3d.set_xlabel('PC1')
ax3d.set_ylabel('PC2')
ax3d.set_zlabel('PC3')
ax3d.set_title('3D PCA Projection')
ax3d.legend()

plt.tight_layout()
plt.show()

# Optimal number of components: first index where the cumulative ratio crosses
# 95% (safe here because a full PCA's cumulative ratio always reaches 1.0)
n_components_95 = np.argmax(cumulative_var >= 0.95) + 1
print(f"Components needed for 95% variance: {n_components_95}")
print(f"Variance explained by first 2 PCs: {cumulative_var[1]:.2%}")

🚀 Gradient Boosting (XGBoost/LightGBM)

Gradient Boosting builds models sequentially, with each new model correcting errors made by previous ones. It combines weak learners into a strong predictor using gradient descent in function space. XGBoost and LightGBM are optimized implementations.

🧮 Boosting Mathematics

Additive Model: \( F_M(x) = \sum_{m=1}^{M} \alpha_m h_m(x) \)

Gradient Descent: Fit new model to negative gradient: \( h_m = -\frac{\partial L(y, F_{m-1}(x))}{\partial F_{m-1}(x)} \)

Shrinkage: \( F_m(x) = F_{m-1}(x) + \nu \cdot h_m(x) \) where ν is learning rate

Regularization: \( \Omega(f) = \gamma T + \frac{1}{2} \lambda \sum_{j=1}^{T} w_j^2 \) (tree complexity)

🎮 Boosting Parameters

📉 Training Progress
📊 Feature Importance
# Gradient Boosting - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Synthetic binary classification problem: 10 features, 6 informative
X, y = make_classification(n_samples=1000, n_features=10, n_informative=6,
                           n_redundant=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Gradient Boosting. subsample=0.8 enables stochastic gradient
# boosting, which also makes the out-of-bag estimates used in panel 4 available.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, subsample=0.8,
                                 random_state=42)
gb.fit(X_train, y_train)

# Predictions
y_pred = gb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Staged predictions: accuracy after each boosting stage, for learning curves
train_scores = []
test_scores = []
for stage_pred in gb.staged_predict(X_train):
    train_scores.append(accuracy_score(y_train, stage_pred))
for stage_pred in gb.staged_predict(X_test):
    test_scores.append(accuracy_score(y_test, stage_pred))

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Train/test accuracy vs number of boosting iterations
# (the plotted values are accuracies from staged_predict, not deviance/loss —
# the original comment labelled this a "Training Deviance (Loss Curve)")
axes[0,0].plot(np.arange(1, len(train_scores) + 1), train_scores, 'b-', label='Training')
axes[0,0].plot(np.arange(1, len(test_scores) + 1), test_scores, 'r-', label='Testing')
axes[0,0].set_xlabel('Boosting Iterations')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].set_title('Training Progress')
axes[0,0].legend()
axes[0,0].grid(True)

# 2. Feature Importance, sorted ascending (most important bar ends up on top)
feature_importance = gb.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
axes[0,1].barh(pos, feature_importance[sorted_idx], align='center')
axes[0,1].set_yticks(pos)
axes[0,1].set_yticklabels([f'Feature {i}' for i in sorted_idx])
axes[0,1].set_xlabel('Importance')
axes[0,1].set_title('Feature Importance')

# 3. Learning Rate vs N Estimators Heatmap
# NOTE: this grid retrains 4x4 = 16 boosted models, so it dominates runtime
learning_rates = [0.01, 0.05, 0.1, 0.5]
n_estimators_range = [50, 100, 200, 300]
heatmap_data = []

for lr in learning_rates:
    row = []
    for n_est in n_estimators_range:
        gb_temp = GradientBoostingClassifier(learning_rate=lr, n_estimators=n_est, random_state=42)
        gb_temp.fit(X_train, y_train)
        row.append(gb_temp.score(X_test, y_test))
    heatmap_data.append(row)

im = axes[1,0].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
axes[1,0].set_xticks(range(len(n_estimators_range)))
axes[1,0].set_xticklabels(n_estimators_range)
axes[1,0].set_yticks(range(len(learning_rates)))
axes[1,0].set_yticklabels(learning_rates)
axes[1,0].set_xlabel('N Estimators')
axes[1,0].set_ylabel('Learning Rate')
axes[1,0].set_title('Parameter Grid Search')
plt.colorbar(im, ax=axes[1,0])

# 4. Out-of-bag improvement (attribute exists only when subsample < 1)
if hasattr(gb, 'oob_improvement_') and gb.subsample < 1.0:
    axes[1,1].plot(np.arange(1, len(gb.oob_improvement_) + 1), gb.oob_improvement_)
    axes[1,1].set_xlabel('Iteration')
    axes[1,1].set_ylabel('OOB Improvement')
    axes[1,1].set_title('Out-of-Bag Loss Improvement')
else:
    # Fallback placeholder when OOB estimates are unavailable
    axes[1,1].text(0.5, 0.5, 'Partial Dependence\n(Requires sklearn.inspection)',
                 ha='center', va='center', fontsize=12)
    axes[1,1].axis('off')

plt.tight_layout()
plt.show()

# Comparison with Random Forest (bagging vs boosting on the same split)
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

print(f"\nRandom Forest Accuracy: {rf.score(X_test, y_test):.4f}")
print(f"Gradient Boosting Accuracy: {gb.score(X_test, y_test):.4f}")

💾 All ML Datasets Collection

Complete collection of classic machine learning datasets with interactive visualizations, statistical summaries, and ready-to-use Python code for each dataset.

🌸 Iris Dataset

The most famous dataset in machine learning history. Contains 150 samples of iris flowers with 4 features: sepal length, sepal width, petal length, petal width. Three classes: Setosa, Versicolor, Virginica.

📊 Pairplot - Feature Relationships
🌸 3D Visualization
# Iris Dataset - Complete Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the iris data into a labelled DataFrame
iris = load_iris()
X, y = iris.data, iris.target
iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df['target'] = y

# Quick statistical overview
print("Dataset Shape:", iris_df.shape)
print("\nClass Distribution:")
print(iris_df['target'].value_counts().sort_index())

# Four-panel summary figure
fig, panels = plt.subplots(2, 2, figsize=(15, 12))

# 1. Sepal scatter, one color per class
class_colors = ['red', 'green', 'blue']
for label, (color, name) in enumerate(zip(class_colors, iris.target_names)):
    mask = y == label
    panels[0,0].scatter(X[mask, 0], X[mask, 1], c=color, label=name, alpha=0.6)
panels[0,0].set_xlabel('Sepal Length')
panels[0,0].set_ylabel('Sepal Width')
panels[0,0].legend()
panels[0,0].set_title('Sepal Dimensions')

# 2. Per-feature box plots split by class (long-form data for seaborn)
long_form = iris_df.melt(id_vars=['target'], var_name='feature', value_name='value')
sns.boxplot(data=long_form, x='feature', y='value', hue='target', ax=panels[0,1])
panels[0,1].set_title('Feature Distributions by Class')
panels[0,1].tick_params(axis='x', rotation=45)

# 3. Pairwise correlation of all numeric columns (including the target code)
sns.heatmap(iris_df.corr(), annot=True, cmap='coolwarm', center=0, ax=panels[1,0])
panels[1,0].set_title('Feature Correlation Matrix')

# 4. Project onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
for label, (color, name) in enumerate(zip(class_colors, iris.target_names)):
    mask = y == label
    panels[1,1].scatter(X_pca[mask, 0], X_pca[mask, 1], c=color, label=name, alpha=0.6)
panels[1,1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
panels[1,1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
panels[1,1].legend()
panels[1,1].set_title('PCA 2D Projection')

plt.tight_layout()
plt.show()

# Dataset info
print(f"\nExplained variance ratio (first 2 PCs): {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")

🤖 Artificial Intelligence Fundamentals

A comprehensive introduction to AI concepts, history, and applications. Learn about intelligent agents, search algorithms, knowledge representation, and the foundations of modern AI systems.

📚 Course Modules

1
Introduction to AI: History, applications, and AI types
2
Intelligent Agents: Agent architectures and environments
3
Search Algorithms: BFS, DFS, A*, heuristic search
4
Knowledge Representation: Logic, ontologies, semantic networks
5
Planning & Reasoning: Automated planning systems
# AI Fundamentals - Search Algorithm Example (A* Search)
import heapq

class Node:
    """Search-tree node: a state plus the bookkeeping A* needs.

    Attributes:
        state: The problem state this node wraps (must be hashable).
        parent: The Node we reached this state from (None for the root),
            used to reconstruct the final path.
        g: Exact cost accumulated from the start state.
        h: Heuristic estimate of the remaining cost to the goal.
        f: Priority key g + h; heapq pops the smallest f first.
    """
    def __init__(self, state, parent=None, g=0, h=0):
        self.state = state
        self.parent = parent
        self.g = g  # Cost from start
        self.h = h  # Heuristic to goal
        self.f = g + h
    
    def __lt__(self, other):
        # Ordering used by heapq: lower estimated total cost = higher priority
        return self.f < other.f

def astar_search(start, goal, neighbors_func, heuristic):
    """A* Search Algorithm Implementation.

    Args:
        start: Initial state (hashable).
        goal: Goal state (hashable).
        neighbors_func: Callable mapping a state to an iterable of
            (neighbor_state, step_cost) pairs.
        heuristic: Callable (state, goal) -> estimated remaining cost.
            Must be admissible for the returned path to be optimal.

    Returns:
        List of states from start to goal inclusive, or None if the goal
        is unreachable.
    """
    open_set = [Node(start, g=0, h=heuristic(start, goal))]
    closed_set = set()
    
    while open_set:
        current = heapq.heappop(open_set)
        
        # BUG FIX: states can sit in the heap multiple times (there is no
        # decrease-key), and the original code never checked popped nodes
        # against closed_set — so already-expanded states were re-expanded,
        # which can blow up the search. Skip stale entries (lazy deletion).
        if current.state in closed_set:
            continue
        
        if current.state == goal:
            # Reconstruct path by walking parent links back to the root
            path = []
            while current:
                path.append(current.state)
                current = current.parent
            return path[::-1]
        
        closed_set.add(current.state)
        
        for neighbor, cost in neighbors_func(current.state):
            if neighbor in closed_set:
                continue
            
            g = current.g + cost
            h = heuristic(neighbor, goal)
            heapq.heappush(open_set, Node(neighbor, current, g, h))
    
    return None  # No path found

# Example: Grid pathfinding
def manhattan_distance(a, b):
    """Return the L1 (city-block) distance between 2D grid points a and b."""
    return sum(abs(p - q) for p, q in zip(a, b))

# Usage
start = (0, 0)
goal = (5, 5)
# NOTE(review): this snippet only announces the endpoints — astar_search is
# never actually invoked here because no grid neighbors_func is defined.
print(f"A* Search from {start} to {goal}")

🔬 Data Science with Python

Master data manipulation, analysis, and visualization using Python. Learn pandas, numpy, matplotlib, seaborn, and real-world data science workflows from data collection to insights.

📚 Course Modules

1
Python for Data Science: NumPy, Pandas basics
2
Data Manipulation: Cleaning, transformation, merging
3
Data Visualization: Matplotlib, Seaborn, Plotly
4
Statistical Analysis: Descriptive & inferential stats
5
Real-world Projects: EDA on real datasets
# Data Science with Python - Complete EDA Example
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Load the iris data and attach human-readable species labels
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = [iris.target_names[t] for t in iris.target]

# Data Overview
print("Dataset Shape:", df.shape)
print("\nData Info:")
# BUG FIX: DataFrame.info() prints directly and returns None, so the original
# print(df.info()) emitted the summary followed by a stray "None" line.
df.info()

# Statistical Summary
print("\nDescriptive Statistics:")
print(df.describe())

# Data Cleaning Example
# Check for missing values (the bundled iris data is complete, so all zeros)
print("\nMissing Values:")
print(df.isnull().sum())

# Visualization: four-panel EDA summary
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Distribution plots
sns.histplot(data=df, x='sepal length (cm)', hue='species', kde=True, ax=axes[0,0])
axes[0,0].set_title('Sepal Length Distribution')

# 2. Correlation heatmap (numeric columns only; 'species' holds strings)
numeric_df = df.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=axes[0,1])
axes[0,1].set_title('Feature Correlation Matrix')

# 3. Box plots
sns.boxplot(data=df, x='species', y='petal length (cm)', ax=axes[1,0])
axes[1,0].set_title('Petal Length by Species')

# 4. Pairplot subset
sns.scatterplot(data=df, x='sepal length (cm)', y='petal length (cm)', 
                hue='species', style='species', s=100, ax=axes[1,1])
axes[1,1].set_title('Sepal vs Petal Length')

plt.tight_layout()
plt.show()

# Advanced: Feature Engineering (petal length is never 0 in iris, so the
# ratio is always finite)
df['sepal_petal_ratio'] = df['sepal length (cm)'] / df['petal length (cm)']
print(f"\nNew feature created: sepal_petal_ratio")

🧬 Machine Learning for Research

Advanced ML techniques for scientific research applications. Learn how to apply ML to genomics, drug discovery, climate science, physics simulations, and other research domains with case studies.

📚 Course Modules

1
ML in Genomics: DNA sequencing, gene expression analysis
2
Drug Discovery: Molecular property prediction
3
Climate Science: Weather prediction, climate modeling
4
Physics Simulations: Surrogate modeling, data-driven physics
5
Research Methodology: Reproducibility, peer review
# ML for Research - Genomics Example (Gene Expression Classification)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Simulated genomics dataset
# Features: gene expression levels for 1000 genes
# Target: disease subtype (0, 1, 2)
np.random.seed(42)
n_samples = 500
n_genes = 1000

# Generate synthetic gene expression data (i.i.d. standard normal)
X = np.random.randn(n_samples, n_genes)
# Create disease subtypes based on gene expression patterns.
# NOTE(review): the mean of 50 i.i.d. N(0,1) values has std ~ 1/sqrt(50) ~ 0.14,
# so the 0.5 threshold is a ~3.5-sigma event (class 1 will almost certainly
# have zero members) and 0.3 is ~2.1 sigma (class 2 ~2% of samples). If
# roughly balanced subtypes were intended, the thresholds need to be much
# lower — confirm the intent before relying on this demo's class structure.
y = np.zeros(n_samples, dtype=int)
for i in range(n_samples):
    if X[i, :50].mean() > 0.5:
        y[i] = 1
    elif X[i, 50:100].mean() > 0.3:
        y[i] = 2

print(f"Dataset: {n_samples} samples, {n_genes} genes")
print(f"Classes: {np.bincount(y)}")

# Split data. stratify preserves subtype proportions; it requires at least
# 2 samples of every class that actually occurs in y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Preprocessing: Standardize (fit on train only to avoid test-set leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dimensionality Reduction for visualization (2 components for a 2D scatter)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Cross-validation (5-fold, on the training split only)
cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print(f"\nCross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Test performance
y_pred = rf.predict(X_test_scaled)
print(f"\nTest Accuracy: {rf.score(X_test_scaled, y_test):.4f}")

# Feature importance: indices of the 20 top-ranked genes, most important first
feature_importance = rf.feature_importances_
top_genes = np.argsort(feature_importance)[-20:][::-1]
print(f"\nTop 20 important genes: {top_genes}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 1. PCA scatter of the training set, colored by subtype (empty classes
# simply contribute no points)
colors = ['red', 'green', 'blue']
for i in range(3):
    mask = y_train == i
    axes[0].scatter(X_pca[mask, 0], X_pca[mask, 1], 
                     c=colors[i], label=f'Disease Type {i}', alpha=0.6)
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
axes[0].set_title('Gene Expression PCA')
axes[0].legend()
axes[0].grid(True)

# 2. Importance bar chart for the 20 top-ranked genes
axes[1].barh(range(20), feature_importance[top_genes])
axes[1].set_yticks(range(20))
axes[1].set_yticklabels([f'Gene {i}' for i in top_genes])
axes[1].set_xlabel('Importance')
axes[1].set_title('Top 20 Important Genes')

plt.tight_layout()
plt.show()