Interactive 3D Visualizations with Python Implementation
# Complete ML Pipeline in Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# 1. Data Collection & Exploration
def explore_data(df):
    print("Dataset Shape:", df.shape)
    print("\nData Types:\n", df.dtypes)
    print("\nMissing Values:\n", df.isnull().sum())
    print("\nStatistical Summary:\n", df.describe())
    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Correlation heatmap (numeric columns only)
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=axes[0,0])
    axes[0,0].set_title('Correlation Matrix')
    # Overlaid histograms of the numeric features
    df.select_dtypes(include=np.number).plot(kind='hist', ax=axes[0,1], alpha=0.5)
    axes[0,1].set_title('Feature Distributions')
    plt.tight_layout()
    plt.show()
# 2. Feature Engineering
def engineer_features(df):
    # Handle missing values (numeric columns filled with the column mean)
    df = df.fillna(df.mean(numeric_only=True))
    # Encode categorical variables
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df.drop('target', axis=1))
    return X_scaled, df['target']
# 3. Model Training & Evaluation
def train_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return model
Hypothesis: \( h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + \theta_n x_n \)
Cost Function (MSE): \( J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 \)
Gradient Descent: \( \theta_j := \theta_j - \alpha \frac{\partial J(\theta)}{\partial \theta_j} \)
Normal Equation: \( \theta = (X^T X)^{-1} X^T y \)
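Before the full example below, a minimal NumPy sketch of the normal equation (the toy data and variable names here are illustrative, not part of the sklearn pipeline): for small feature counts, θ can be solved in closed form instead of by gradient descent.
# Normal-equation sketch on assumed toy data; pinv is the numerically safer inverse
import numpy as np
X_demo = np.random.randn(100, 2)                           # 100 samples, 2 features
y_demo = 4 + X_demo @ np.array([3.0, -2.0]) + 0.5 * np.random.randn(100)
X_b = np.c_[np.ones((100, 1)), X_demo]                     # prepend a bias column
theta = np.linalg.pinv(X_b.T @ X_b) @ X_b.T @ y_demo       # theta = (X^T X)^{-1} X^T y
print(theta)                                               # approximately [4, 3, -2]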
# Linear Regression - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Generate synthetic data
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Method 1: Using sklearn (Quick)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Coefficient: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
# Method 2: Manual Gradient Descent (Educational)
class ManualLinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.theta = None
        self.cost_history = []

    def fit(self, X, y):
        m, n = X.shape
        # Add bias term
        X_b = np.c_[np.ones((m, 1)), X]
        self.theta = np.random.randn(n + 1, 1)
        for i in range(self.n_iter):
            # Predictions
            y_pred = X_b.dot(self.theta)
            # Gradient
            gradient = (1/m) * X_b.T.dot(y_pred - y.reshape(-1, 1))
            # Update
            self.theta -= self.lr * gradient
            # Cost
            cost = (1/(2*m)) * np.sum((y_pred - y.reshape(-1, 1))**2)
            self.cost_history.append(cost)
        return self

    def predict(self, X):
        m = X.shape[0]
        X_b = np.c_[np.ones((m, 1)), X]
        return X_b.dot(self.theta).flatten()
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Scatter plot with regression line
axes[0,0].scatter(X_test, y_test, color='blue', alpha=0.5, label='Actual')
axes[0,0].plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
axes[0,0].set_xlabel('X')
axes[0,0].set_ylabel('y')
axes[0,0].legend()
axes[0,0].set_title('Linear Regression Fit')
# 2. Residuals plot
residuals = y_test - y_pred
axes[0,1].scatter(y_pred, residuals, alpha=0.6)
axes[0,1].axhline(y=0, color='red', linestyle='--')
axes[0,1].set_xlabel('Predicted')
axes[0,1].set_ylabel('Residuals')
axes[0,1].set_title('Residual Plot')
# 3. Cost function convergence
manual_model = ManualLinearRegression(learning_rate=0.01, n_iterations=500)
manual_model.fit(X_train, y_train)
axes[1,0].plot(manual_model.cost_history)
axes[1,0].set_xlabel('Iteration')
axes[1,0].set_ylabel('Cost (MSE)')
axes[1,0].set_title('Gradient Descent Convergence')
# 4. Feature importance (for multiple features)
X_multi, y_multi = make_regression(n_samples=100, n_features=5, noise=10)
model_multi = LinearRegression()
model_multi.fit(X_multi, y_multi)
axes[1,1].bar(range(5), model_multi.coef_)
axes[1,1].set_xlabel('Feature Index')
axes[1,1].set_ylabel('Coefficient')
axes[1,1].set_title('Feature Importance')
plt.tight_layout()
plt.show()
Sigmoid Function: \( \sigma(z) = \frac{1}{1 + e^{-z}} \)
Hypothesis: \( h_\theta(x) = \sigma(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}} \)
Cost Function (Log Loss): \( J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} [y^{(i)} \log(h_\theta(x^{(i)})) + (1-y^{(i)}) \log(1-h_\theta(x^{(i)}))] \)
Gradient: \( \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)} \)
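The example below relies on sklearn's LogisticRegression; as a rough sketch of how the log-loss gradient above drives learning, batch gradient descent can be written directly in NumPy (function and variable names here are illustrative):
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logistic_gradient_descent(X, y, lr=0.1, n_iter=1000):
    # X: (m, n) features, y: (m,) labels in {0, 1}; a bias column is prepended here
    X_b = np.c_[np.ones((X.shape[0], 1)), X]
    theta = np.zeros(X_b.shape[1])
    m = X_b.shape[0]
    for _ in range(n_iter):
        h = sigmoid(X_b @ theta)             # h_theta(x)
        theta -= lr * (X_b.T @ (h - y)) / m  # gradient of the log loss
    return theta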
# Logistic Regression - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification, load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
# Generate binary classification data
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1, random_state=42)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Logistic Regression
model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Decision Boundary
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = axes[0,0].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='black')
axes[0,0].set_title('Decision Boundary')
# 2. Sigmoid Curve
x_sigmoid = np.linspace(-10, 10, 100)
y_sigmoid = 1 / (1 + np.exp(-x_sigmoid))
axes[0,1].plot(x_sigmoid, y_sigmoid, linewidth=3)
axes[0,1].axhline(y=0.5, color='red', linestyle='--')
axes[0,1].set_title('Sigmoid Function')
axes[0,1].grid(True)
# 3. ROC Curve
axes[1,0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
axes[1,0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
axes[1,0].set_xlabel('False Positive Rate')
axes[1,0].set_ylabel('True Positive Rate')
axes[1,0].set_title('ROC Curve')
axes[1,0].legend()
# 4. Probability Distribution
axes[1,1].hist(y_pred_proba[y_test==0], bins=20, alpha=0.5, label='Class 0', color='red')
axes[1,1].hist(y_pred_proba[y_test==1], bins=20, alpha=0.5, label='Class 1', color='blue')
axes[1,1].set_xlabel('Predicted Probability')
axes[1,1].set_title('Probability Distribution')
axes[1,1].legend()
plt.tight_layout()
plt.show()
Gini Impurity: \( Gini = 1 - \sum_{i=1}^{c} (p_i)^2 \)
Entropy: \( H = -\sum_{i=1}^{c} p_i \log_2(p_i) \)
Information Gain: \( IG = H_{parent} - \sum_{j} \frac{n_j}{n} H_j \)
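A short sketch of these impurity measures in NumPy (the DecisionTreeClassifier used below computes them internally; the helper names are illustrative):
import numpy as np

def gini(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(parent, left, right):
    n = len(parent)
    weighted = (len(left) / n) * entropy(left) + (len(right) / n) * entropy(right)
    return entropy(parent) - weighted

# Splitting [0,0,0,1,1,1] into pure halves yields the maximum gain of 1 bit
print(information_gain(np.array([0,0,0,1,1,1]), np.array([0,0,0]), np.array([1,1,1])))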
# Decision Tree - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load Iris dataset
iris = load_iris()
X, y = iris.data[:, :2], iris.target # Use only 2 features for visualization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Decision Tree
dt = DecisionTreeClassifier(max_depth=3, criterion='gini', min_samples_split=2)
dt.fit(X_train, y_train)
# Predictions
y_pred = dt.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# 1. Decision Tree Plot
plot_tree(dt, feature_names=iris.feature_names[:2],
class_names=iris.target_names, filled=True, ax=axes[0,0])
axes[0,0].set_title('Decision Tree Structure')
# 2. Decision Boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = dt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[0,1].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = axes[0,1].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,1].set_xlabel(iris.feature_names[0])
axes[0,1].set_ylabel(iris.feature_names[1])
axes[0,1].set_title('Decision Boundaries')
# 3. Feature Importance
importances = dt.feature_importances_
axes[1,0].barh(iris.feature_names[:2], importances)
axes[1,0].set_xlabel('Importance')
axes[1,0].set_title('Feature Importance')
# 4. Tree Depth vs Accuracy
depths = range(1, 11)
train_scores = []
test_scores = []
for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))
axes[1,1].plot(depths, train_scores, 'o-', label='Training')
axes[1,1].plot(depths, test_scores, 's-', label='Testing')
axes[1,1].set_xlabel('Tree Depth')
axes[1,1].set_ylabel('Accuracy')
axes[1,1].set_title('Depth vs Accuracy (Overfitting Check)')
axes[1,1].legend()
axes[1,1].grid(True)
plt.tight_layout()
plt.show()
# Text representation of tree
tree_rules = export_text(dt, feature_names=iris.feature_names[:2])
print(tree_rules)
Objective Function: \( J = \sum_{i=1}^{k} \sum_{x \in C_i} ||x - \mu_i||^2 \)
Centroid Update: \( \mu_i = \frac{1}{|C_i|} \sum_{x \in C_i} x \)
Assignment Step: \( C_i = \{x : ||x - \mu_i||^2 \leq ||x - \mu_j||^2, \forall j\} \)
Elbow Method: Find K where marginal gain drops significantly
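As a sketch of the two alternating steps above, one Lloyd iteration can be written directly in NumPy; sklearn's KMeans below adds initialization, convergence checks, and multiple restarts, and this sketch assumes no cluster ends up empty:
import numpy as np

def kmeans_step(X, centroids):
    # Assignment step: nearest centroid by squared Euclidean distance
    distances = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    labels = distances.argmin(axis=1)
    # Update step: mean of the points assigned to each cluster
    # (assumes every cluster keeps at least one point; sklearn handles empty clusters)
    new_centroids = np.array([X[labels == k].mean(axis=0) for k in range(len(centroids))])
    return labels, new_centroids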
# K-Means Clustering - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs, make_moons, load_iris
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# Generate sample data
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Elbow Method to find optimal K
inertias = []
silhouettes = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X, kmeans.labels_))
# Apply K-Means with optimal K
optimal_k = 4 # Based on elbow plot
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Clustering Result
scatter = axes[0,0].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.7)
axes[0,0].scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, marker='X', edgecolors='black', label='Centroids')
axes[0,0].set_title(f'K-Means Clustering (K={optimal_k})')
axes[0,0].legend()
# 2. Elbow Plot
axes[0,1].plot(K_range, inertias, 'bo-')
axes[0,1].set_xlabel('Number of Clusters (K)')
axes[0,1].set_ylabel('Inertia (WCSS)')
axes[0,1].set_title('Elbow Method')
axes[0,1].grid(True)
# 3. Silhouette Score
axes[1,0].plot(K_range, silhouettes, 'go-')
axes[1,0].set_xlabel('Number of Clusters (K)')
axes[1,0].set_ylabel('Silhouette Score')
axes[1,0].set_title('Silhouette Analysis')
axes[1,0].grid(True)
# 4. Voronoi Diagram (Decision Boundaries)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[1,1].imshow(Z, interpolation='nearest', extent=(x_min, x_max, y_min, y_max),
cmap='viridis', aspect='auto', origin='lower', alpha=0.4)
axes[1,1].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, edgecolors='k')
axes[1,1].scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, marker='X')
axes[1,1].set_title('Voronoi Diagram (Cluster Regions)')
plt.tight_layout()
plt.show()
# Print cluster statistics
print(f"Final Inertia: {kmeans.inertia_:.2f}")
print(f"Silhouette Score: {silhouette_score(X, labels):.4f}")
print(f"Iterations to converge: {kmeans.n_iter_}")
print(f"\nCluster sizes: {np.bincount(labels)}")
Bootstrap Aggregation (Bagging): Train M trees on bootstrap samples
Random Subspace: Select m features randomly at each split (typically m = √p)
Prediction: \( \hat{y} = \frac{1}{M} \sum_{i=1}^{M} f_i(x) \) for regression
Majority Vote: \( \hat{y} = \text{mode}\{f_1(x), f_2(x), ..., f_M(x)\} \) for classification
Out-of-Bag Error: \( \text{OOB} = \frac{1}{N} \sum_{i=1}^{N} L(y_i, \hat{y}_i^{oob}) \)
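To make the bagging idea concrete, a minimal sketch using scikit-learn decision trees and a majority vote follows (helper names are illustrative; RandomForestClassifier below additionally samples a random feature subset at each split and tracks the OOB score):
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagged_trees(X, y, n_trees=25, random_state=42):
    rng = np.random.default_rng(random_state)
    trees = []
    for _ in range(n_trees):
        idx = rng.integers(0, len(X), size=len(X))      # bootstrap sample (with replacement)
        trees.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return trees

def bagged_predict(trees, X):
    votes = np.stack([t.predict(X) for t in trees]).astype(int)  # (n_trees, n_samples), integer labels assumed
    # Majority vote across trees for each sample
    return np.array([np.bincount(col).argmax() for col in votes.T])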
# Random Forest - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_iris, make_classification  # load_boston was removed from recent scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Classification with Random Forest
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
n_redundant=3, n_classes=3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=10,
max_features='sqrt', oob_score=True,
random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
# Predictions
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"OOB Score: {rf.oob_score_:.4f}")
# Feature Importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Feature Importance
axes[0,0].bar(range(10), importances[indices])
axes[0,0].set_xticks(range(10))
axes[0,0].set_xticklabels([f'Feature {i}' for i in indices], rotation=45)
axes[0,0].set_title('Feature Importance (Gini)')
axes[0,0].set_ylabel('Importance')
# 2. Trees vs Accuracy
n_estimators_range = [1, 5, 10, 25, 50, 100, 200]
train_scores = []
test_scores = []
for n in n_estimators_range:
    rf_temp = RandomForestClassifier(n_estimators=n, random_state=42)
    rf_temp.fit(X_train, y_train)
    train_scores.append(rf_temp.score(X_train, y_train))
    test_scores.append(rf_temp.score(X_test, y_test))
axes[0,1].plot(n_estimators_range, train_scores, 'o-', label='Training')
axes[0,1].plot(n_estimators_range, test_scores, 's-', label='Testing')
axes[0,1].set_xlabel('Number of Trees')
axes[0,1].set_ylabel('Accuracy')
axes[0,1].set_title('Ensemble Size Effect')
axes[0,1].legend()
axes[0,1].grid(True)
# 3. Decision Boundary (using 2 features)
X_viz, y_viz = make_classification(n_samples=200, n_features=2, n_redundant=0,
n_clusters_per_class=1, random_state=42)
rf_viz = RandomForestClassifier(n_estimators=50, random_state=42)
rf_viz.fit(X_viz, y_viz)
x_min, x_max = X_viz[:, 0].min() - 1, X_viz[:, 0].max() + 1
y_min, y_max = X_viz[:, 1].min() - 1, X_viz[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = rf_viz.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[1,0].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
axes[1,0].scatter(X_viz[:, 0], X_viz[:, 1], c=y_viz, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[1,0].set_title('Decision Boundary (2D)')
# 4. Tree Depth Distribution
depths = [est.tree_.max_depth for est in rf.estimators_]
axes[1,1].hist(depths, bins=20, edgecolor='black')
axes[1,1].set_xlabel('Tree Depth')
axes[1,1].set_ylabel('Frequency')
axes[1,1].set_title('Distribution of Tree Depths')
plt.tight_layout()
plt.show()
Decision Function: \( f(x) = w^T x + b \)
Margin Maximization: \( \min_{w,b} \frac{1}{2} ||w||^2 \) subject to \( y_i(w^T x_i + b) \geq 1 \)
Soft Margin (C): \( \min \frac{1}{2} ||w||^2 + C \sum_{i} \max(0, 1 - y_i(w^T x_i + b)) \)
Kernel Trick: \( K(x_i, x_j) = \phi(x_i)^T \phi(x_j) \)
RBF Kernel: \( K(x, x') = \exp(-\gamma ||x - x'||^2) \)
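A small sketch of the RBF kernel matrix from the formula above (SVC(kernel='rbf') computes this internally; the function name is illustrative):
import numpy as np

def rbf_kernel_matrix(X1, X2, gamma=1.0):
    # Squared Euclidean distances between every pair of rows, then exp(-gamma * d^2)
    sq_dists = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(axis=2)
    return np.exp(-gamma * sq_dists)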
# SVM - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate non-linear dataset
X, y = make_circles(n_samples=300, noise=0.1, factor=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train SVM with RBF kernel
svm = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True)
svm.fit(X_train, y_train)
# Predictions
y_pred = svm.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Support Vectors: {svm.n_support_}")
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Decision Boundary with Support Vectors
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
axes[0,0].contour(xx, yy, Z, colors='k', levels=[-1, 0, 1],
alpha=0.5, linestyles=['--', '-', '--'])
axes[0,0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1],
s=100, linewidth=1, facecolors='none', edgecolors='k', label='Support Vectors')
axes[0,0].set_title('SVM Decision Boundary')
axes[0,0].legend()
# 2. Kernel Comparison
kernels = ['linear', 'rbf', 'poly']
accuracies = []
for kernel in kernels:
    svm_k = SVC(kernel=kernel, random_state=42)
    svm_k.fit(X_train, y_train)
    accuracies.append(svm_k.score(X_test, y_test))
axes[0,1].bar(kernels, accuracies, color=['#00ffff', '#ff00ff', '#ffff00'])
axes[0,1].set_ylabel('Accuracy')
axes[0,1].set_title('Kernel Performance Comparison')
axes[0,1].set_ylim([0, 1])
# 3. C Parameter Effect
C_values = [0.01, 0.1, 1, 10, 100]
train_scores = []
test_scores = []
for C in C_values:
    svm_c = SVC(kernel='rbf', C=C, random_state=42)
    svm_c.fit(X_train, y_train)
    train_scores.append(svm_c.score(X_train, y_train))
    test_scores.append(svm_c.score(X_test, y_test))
axes[1,0].semilogx(C_values, train_scores, 'o-', label='Training')
axes[1,0].semilogx(C_values, test_scores, 's-', label='Testing')
axes[1,0].set_xlabel('C (Regularization Parameter)')
axes[1,0].set_ylabel('Accuracy')
axes[1,0].set_title('Effect of Regularization')
axes[1,0].legend()
axes[1,0].grid(True)
# 4. Probability Contours
svm_proba = SVC(kernel='rbf', probability=True, random_state=42)
svm_proba.fit(X_train, y_train)
proba = svm_proba.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
proba = proba.reshape(xx.shape)
contour = axes[1,1].contourf(xx, yy, proba, levels=50, alpha=0.8, cmap='RdYlBu')
axes[1,1].scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='k')
axes[1,1].set_title('Probability Contours')
plt.colorbar(contour, ax=axes[1,1])
plt.tight_layout()
plt.show()
Distance Metrics: Euclidean: \( d(x, x') = \sqrt{\sum_{i=1}^{n} (x_i - x'_i)^2} \)
Manhattan: \( d(x, x') = \sum_{i=1}^{n} |x_i - x'_i| \)
Minkowski: \( d(x, x') = (\sum_{i=1}^{n} |x_i - x'_i|^p)^{1/p} \)
Prediction: \( \hat{y} = \text{mode}\{y_1, y_2, ..., y_k\} \) or \( \hat{y} = \frac{1}{k} \sum_{i=1}^{k} y_i \)
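A brute-force sketch of KNN prediction from these definitions (KNeighborsClassifier below uses efficient neighbor-search structures instead; integer class labels are assumed):
import numpy as np

def knn_predict(X_train, y_train, X_query, k=5):
    preds = []
    for x in X_query:
        dists = np.sqrt(((X_train - x) ** 2).sum(axis=1))    # Euclidean distances to all training points
        nearest = np.argsort(dists)[:k]                       # indices of the k closest points
        preds.append(np.bincount(y_train[nearest]).argmax())  # majority vote
    return np.array(preds)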
# KNN - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Generate data
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
n_clusters_per_class=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Find optimal K
k_range = range(1, 31)
train_scores = []
test_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))
# Best K
optimal_k = np.argmax(test_scores) + 1
print(f"Optimal K: {optimal_k}")
# Train with optimal K
knn_best = KNeighborsClassifier(n_neighbors=optimal_k)
knn_best.fit(X_train, y_train)
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. K vs Accuracy
axes[0,0].plot(k_range, train_scores, 'o-', label='Training')
axes[0,0].plot(k_range, test_scores, 's-', label='Testing')
axes[0,0].axvline(optimal_k, color='red', linestyle='--', label=f'Optimal K={optimal_k}')
axes[0,0].set_xlabel('K (Number of Neighbors)')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].set_title('K Selection (Elbow Method)')
axes[0,0].legend()
axes[0,0].grid(True)
# 2. Decision Boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
Z = knn_best.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[0,1].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
axes[0,1].scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,1].set_title(f'Decision Boundary (K={optimal_k})')
# 3. Distance Metric Comparison
metrics = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
metric_scores = []
for metric in metrics:
    knn_m = KNeighborsClassifier(n_neighbors=5, metric=metric)
    knn_m.fit(X_train, y_train)
    metric_scores.append(knn_m.score(X_test, y_test))
axes[1,0].bar(metrics, metric_scores, color=['#00ffff', '#ff00ff', '#ffff00', '#00ff00'])
axes[1,0].set_ylabel('Accuracy')
axes[1,0].set_title('Distance Metric Comparison')
axes[1,0].tick_params(axis='x', rotation=45)
# 4. Weight Comparison
weights = ['uniform', 'distance']
weight_scores = []
for weight in weights:
    knn_w = KNeighborsClassifier(n_neighbors=5, weights=weight)
    knn_w.fit(X_train, y_train)
    weight_scores.append(knn_w.score(X_test, y_test))
axes[1,1].bar(weights, weight_scores, color=['#00ffff', '#ff00ff'])
axes[1,1].set_ylabel('Accuracy')
axes[1,1].set_title('Weighting Scheme Comparison')
plt.tight_layout()
plt.show()
Bayes Rule: \( P(y|x) = \frac{P(x|y) P(y)}{P(x)} \)
Naive Assumption: Features are conditionally independent: \( P(x|y) = \prod_{i=1}^{n} P(x_i|y) \)
Classification: \( \hat{y} = \arg\max_y P(y) \prod_{i=1}^{n} P(x_i|y) \)
Gaussian NB: \( P(x_i|y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\left(-\frac{(x_i - \mu_y)^2}{2\sigma_y^2}\right) \)
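A compact sketch of Gaussian Naive Bayes in log space, directly from the formulas above (GaussianNB below is the production implementation; the class and attribute names here are illustrative):
import numpy as np

class TinyGaussianNB:
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.priors_ = np.array([(y == c).mean() for c in self.classes_])
        self.means_ = np.array([X[y == c].mean(axis=0) for c in self.classes_])
        # Small epsilon guards against zero variance for constant features
        self.vars_ = np.array([X[y == c].var(axis=0) for c in self.classes_]) + 1e-9
        return self

    def predict(self, X):
        # log P(y) + sum_i log P(x_i | y) for each class, then take the argmax
        log_probs = []
        for prior, mu, var in zip(self.priors_, self.means_, self.vars_):
            ll = -0.5 * np.log(2 * np.pi * var) - (X - mu) ** 2 / (2 * var)
            log_probs.append(np.log(prior) + ll.sum(axis=1))
        return self.classes_[np.argmax(log_probs, axis=0)]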
# Naive Bayes - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.datasets import load_iris, make_classification, fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
# Gaussian NB for continuous data
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(f"Gaussian NB Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Class-conditional densities for 2 features
X_2d = X[:, :2]
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(X_2d, y, test_size=0.3, random_state=42)
gnb_2d = GaussianNB()
gnb_2d.fit(X_train_2d, y_train_2d)
x_min, x_max = X_2d[:, 0].min() - 0.5, X_2d[:, 0].max() + 0.5
y_min, y_max = X_2d[:, 1].min() - 0.5, X_2d[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = gnb_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
axes[0,0].contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)
scatter = axes[0,0].scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].set_xlabel(iris.feature_names[0])
axes[0,0].set_ylabel(iris.feature_names[1])
axes[0,0].set_title('Naive Bayes Decision Boundary')
# 2. Prior probabilities
axes[0,1].bar(iris.target_names, gnb.class_prior_, color=['#00ffff', '#ff00ff', '#ffff00'])
axes[0,1].set_ylabel('Prior Probability')
axes[0,1].set_title('Class Prior Probabilities')
# 3. Feature means per class
x_pos = np.arange(len(iris.feature_names))
width = 0.25
for i, class_name in enumerate(iris.target_names):
    axes[1,0].bar(x_pos + i*width, gnb.theta_[i], width, label=class_name)
axes[1,0].set_xticks(x_pos + width)
axes[1,0].set_xticklabels(iris.feature_names, rotation=45)
axes[1,0].set_ylabel('Mean Feature Value')
axes[1,0].set_title('Class-conditional Means')
axes[1,0].legend()
# 4. Variance per class
for i, class_name in enumerate(iris.target_names):
    axes[1,1].bar(x_pos + i*width, gnb.var_[i], width, label=class_name)
axes[1,1].set_xticks(x_pos + width)
axes[1,1].set_xticklabels(iris.feature_names, rotation=45)
axes[1,1].set_ylabel('Variance')
axes[1,1].set_title('Class-conditional Variances')
axes[1,1].legend()
plt.tight_layout()
plt.show()
# Text classification with Multinomial NB
categories = ['alt.atheism', 'sci.space', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer(max_features=1000)
X_text = vectorizer.fit_transform(newsgroups.data)
y_text = newsgroups.target
mnb = MultinomialNB()
mnb.fit(X_text, y_text)
print(f"\nText Classification Accuracy: {mnb.score(X_text, y_text):.4f}")
Neuron Output: \( z = \sum_{i=1}^{n} w_i x_i + b = W^T x + b \)
Activation: \( a = \sigma(z) \) where σ can be:
- Sigmoid: \( \sigma(z) = \frac{1}{1 + e^{-z}} \)
- ReLU: \( \sigma(z) = \max(0, z) \)
- Tanh: \( \sigma(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}} \)
Backpropagation: \( \frac{\partial L}{\partial w} = \frac{\partial L}{\partial a} \cdot \frac{\partial a}{\partial z} \cdot \frac{\partial z}{\partial w} \)
Weight Update: \( w_{new} = w_{old} - \eta \frac{\partial L}{\partial w} \)
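As a sketch of one training step implied by these equations, a single forward and backward pass for a 1-hidden-layer network (ReLU hidden layer, sigmoid output, log loss) can be written in NumPy; MLPClassifier below automates this, and the shapes and names here are illustrative:
import numpy as np

def train_step(X, y, W1, b1, W2, b2, lr=0.01):
    m = X.shape[0]
    # Forward pass
    z1 = X @ W1 + b1                  # (m, hidden)
    a1 = np.maximum(0, z1)            # ReLU activation
    z2 = a1 @ W2 + b2                 # (m, 1)
    a2 = 1 / (1 + np.exp(-z2))        # sigmoid output = predicted probability
    # Backward pass (dL/dz2 for sigmoid + log loss simplifies to a2 - y)
    dz2 = (a2 - y.reshape(-1, 1)) / m
    dW2, db2 = a1.T @ dz2, dz2.sum(axis=0)
    dz1 = (dz2 @ W2.T) * (z1 > 0)     # chain rule through the ReLU
    dW1, db1 = X.T @ dz1, dz1.sum(axis=0)
    # Gradient descent update: w := w - eta * dL/dw
    return W1 - lr * dW1, b1 - lr * db1, W2 - lr * dW2, b2 - lr * db2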
# Neural Network with scikit-learn's MLPClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
# Binary classification with moons dataset
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
solver='adam', learning_rate_init=0.001,
max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
print(f"Training Accuracy: {mlp.score(X_train, y_train):.4f}")
print(f"Test Accuracy: {mlp.score(X_test, y_test):.4f}")
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Decision Boundary
x_min, x_max = X_train[:, 0].min() - 0.5, X_train[:, 0].max() + 0.5
y_min, y_max = X_train[:, 1].min() - 0.5, X_train[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
Z = mlp.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
axes[0,0].contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
axes[0,0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.RdYlBu, edgecolors='k')
axes[0,0].set_title('Neural Network Decision Boundary')
# 2. Loss Curve
axes[0,1].plot(mlp.loss_curve_)
axes[0,1].set_xlabel('Iterations')
axes[0,1].set_ylabel('Loss')
axes[0,1].set_title('Training Loss Curve')
axes[0,1].grid(True)
# 3. Activation Functions
x = np.linspace(-5, 5, 100)
axes[1,0].plot(x, np.maximum(0, x), label='ReLU', linewidth=2)
axes[1,0].plot(x, 1/(1+np.exp(-x)), label='Sigmoid', linewidth=2)
axes[1,0].plot(x, np.tanh(x), label='Tanh', linewidth=2)
axes[1,0].set_xlabel('Input')
axes[1,0].set_ylabel('Output')
axes[1,0].set_title('Activation Functions')
axes[1,0].legend()
axes[1,0].grid(True)
# 4. Architecture Diagram
layer_sizes = [2, 64, 32, 1]
max_neurons = max(layer_sizes)
for i, size in enumerate(layer_sizes):
    x_pos = i
    y_positions = np.linspace(-size/2, size/2, size)
    for j, y_pos in enumerate(y_positions):
        circle = plt.Circle((x_pos, y_pos), 0.2, color='cyan', alpha=0.7)
        axes[1,1].add_patch(circle)
        if i < len(layer_sizes) - 1:
            next_y_positions = np.linspace(-layer_sizes[i+1]/2, layer_sizes[i+1]/2, layer_sizes[i+1])
            for next_y in next_y_positions:
                axes[1,1].plot([x_pos, x_pos+1], [y_pos, next_y], 'gray', alpha=0.3, linewidth=0.5)
axes[1,1].set_xlim(-0.5, len(layer_sizes)-0.5)
axes[1,1].set_ylim(-max_neurons/2-1, max_neurons/2+1)
axes[1,1].set_aspect('equal')
axes[1,1].axis('off')
axes[1,1].set_title('Network Architecture: 2-64-32-1')
plt.tight_layout()
plt.show()
Covariance Matrix (for mean-centered X): \( \Sigma = \frac{1}{n-1} X^T X \)
Eigenvalue Problem: \( \Sigma v = \lambda v \)
Principal Components: Eigenvectors sorted by eigenvalues (descending)
Transformation: \( Z = X W \) where W is matrix of eigenvectors
Explained Variance: \( \frac{\lambda_i}{\sum_{j=1}^{p} \lambda_j} \)
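A manual PCA sketch following these steps (scikit-learn's PCA below uses an SVD but yields the same components up to sign; the function name is illustrative):
import numpy as np

def pca_manual(X, n_components=2):
    X_centered = X - X.mean(axis=0)
    cov = np.cov(X_centered, rowvar=False)       # (p, p) covariance matrix
    eigvals, eigvecs = np.linalg.eigh(cov)       # eigh: for symmetric matrices
    order = np.argsort(eigvals)[::-1]            # sort eigenvalues descending
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    explained_ratio = eigvals / eigvals.sum()
    Z = X_centered @ eigvecs[:, :n_components]   # project onto the top components
    return Z, explained_ratio[:n_components]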
# PCA - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_digits, load_wine
from sklearn.preprocessing import StandardScaler
# Load and standardize data
iris = load_iris()
X, y = iris.data, iris.target
# Standardization is crucial for PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
# Explained variance ratio
explained_var = pca.explained_variance_ratio_
cumulative_var = np.cumsum(explained_var)
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
colors = ['red', 'green', 'blue']
for i, target_name in enumerate(iris.target_names):
    axes[0,0].scatter(X_pca[y == i, 0], X_pca[y == i, 1],
                      color=colors[i], label=target_name, alpha=0.7, s=50)
axes[0,0].set_xlabel(f'PC1 ({explained_var[0]:.1%} variance)')
axes[0,0].set_ylabel(f'PC2 ({explained_var[1]:.1%} variance)')
axes[0,0].set_title('PCA 2D Projection')
axes[0,0].legend()
axes[0,0].grid(True)
# Scree plot
axes[0,1].bar(range(1, len(explained_var) + 1), explained_var, alpha=0.7, label='Individual')
axes[0,1].plot(range(1, len(explained_var) + 1), cumulative_var, 'ro-', label='Cumulative')
axes[0,1].set_xlabel('Principal Component')
axes[0,1].set_ylabel('Explained Variance Ratio')
axes[0,1].set_title('Scree Plot')
axes[0,1].legend()
axes[0,1].grid(True)
# Biplot - show original feature directions
feature_vectors = pca.components_[:2].T
axes[1,0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.5)
for i, feature in enumerate(iris.feature_names):
    axes[1,0].arrow(0, 0, feature_vectors[i, 0]*3, feature_vectors[i, 1]*3,
                    head_width=0.1, head_length=0.1, fc='red', ec='red')
    axes[1,0].text(feature_vectors[i, 0]*3.2, feature_vectors[i, 1]*3.2, feature, color='red')
axes[1,0].set_xlabel('PC1')
axes[1,0].set_ylabel('PC2')
axes[1,0].set_title('Biplot with Feature Vectors')
axes[1,0].grid(True)
# 3D PCA (first 3 components); replace the unused 2D axes with a 3D one
axes[1,1].remove()
ax3d = fig.add_subplot(2, 2, 4, projection='3d')
for i, target_name in enumerate(iris.target_names):
    ax3d.scatter(X_pca[y == i, 0], X_pca[y == i, 1], X_pca[y == i, 2],
                 color=colors[i], label=target_name, alpha=0.7, s=30)
ax3d.set_xlabel('PC1')
ax3d.set_ylabel('PC2')
ax3d.set_zlabel('PC3')
ax3d.set_title('3D PCA Projection')
ax3d.legend()
plt.tight_layout()
plt.show()
# Optimal number of components
n_components_95 = np.argmax(cumulative_var >= 0.95) + 1
print(f"Components needed for 95% variance: {n_components_95}")
print(f"Variance explained by first 2 PCs: {cumulative_var[1]:.2%}")
Additive Model: \( F_M(x) = \sum_{m=1}^{M} \alpha_m h_m(x) \)
Gradient Descent: Fit new model to negative gradient: \( h_m = -\frac{\partial L(y, F_{m-1}(x))}{\partial F_{m-1}(x)} \)
Shrinkage: \( F_m(x) = F_{m-1}(x) + \nu \cdot h_m(x) \) where ν is learning rate
Regularization: \( \Omega(f) = \gamma T + \frac{1}{2} \lambda \sum_{j=1}^{T} w_j^2 \) (tree complexity)
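For regression with squared error, this recipe reduces to fitting each new tree to the current residuals (the negative gradient of the loss) and adding it with shrinkage ν. A minimal sketch using scikit-learn regression trees (names are illustrative; GradientBoostingClassifier below is the classification counterpart):
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_gb_regressor(X, y, n_estimators=100, nu=0.1, max_depth=3):
    f0 = y.mean()                              # initial constant prediction
    pred = np.full(len(y), f0)
    trees = []
    for _ in range(n_estimators):
        residuals = y - pred                   # negative gradient of 0.5 * (y - F)^2
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, residuals)
        pred += nu * tree.predict(X)           # F_m = F_{m-1} + nu * h_m
        trees.append(tree)
    return f0, trees

def predict_gb(f0, trees, X, nu=0.1):
    # nu must match the value used during fitting
    return f0 + nu * sum(tree.predict(X) for tree in trees)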
# Gradient Boosting - Complete Implementation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Load dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=6,
n_redundant=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
max_depth=3, subsample=0.8,
random_state=42)
gb.fit(X_train, y_train)
# Predictions
y_pred = gb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Staged predictions for learning curves
train_scores = []
test_scores = []
for stage_pred in gb.staged_predict(X_train):
    train_scores.append(accuracy_score(y_train, stage_pred))
for stage_pred in gb.staged_predict(X_test):
    test_scores.append(accuracy_score(y_test, stage_pred))
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Staged accuracy over boosting iterations
axes[0,0].plot(np.arange(1, len(train_scores) + 1), train_scores, 'b-', label='Training')
axes[0,0].plot(np.arange(1, len(test_scores) + 1), test_scores, 'r-', label='Testing')
axes[0,0].set_xlabel('Boosting Iterations')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].set_title('Training Progress')
axes[0,0].legend()
axes[0,0].grid(True)
# 2. Feature Importance
feature_importance = gb.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
axes[0,1].barh(pos, feature_importance[sorted_idx], align='center')
axes[0,1].set_yticks(pos)
axes[0,1].set_yticklabels([f'Feature {i}' for i in sorted_idx])
axes[0,1].set_xlabel('Importance')
axes[0,1].set_title('Feature Importance')
# 3. Learning Rate vs N Estimators Heatmap
learning_rates = [0.01, 0.05, 0.1, 0.5]
n_estimators_range = [50, 100, 200, 300]
heatmap_data = []
for lr in learning_rates:
    row = []
    for n_est in n_estimators_range:
        gb_temp = GradientBoostingClassifier(learning_rate=lr, n_estimators=n_est, random_state=42)
        gb_temp.fit(X_train, y_train)
        row.append(gb_temp.score(X_test, y_test))
    heatmap_data.append(row)
im = axes[1,0].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
axes[1,0].set_xticks(range(len(n_estimators_range)))
axes[1,0].set_xticklabels(n_estimators_range)
axes[1,0].set_yticks(range(len(learning_rates)))
axes[1,0].set_yticklabels(learning_rates)
axes[1,0].set_xlabel('N Estimators')
axes[1,0].set_ylabel('Learning Rate')
axes[1,0].set_title('Parameter Grid Search')
plt.colorbar(im, ax=axes[1,0])
# 4. Out-of-bag improvement (if subsample < 1)
if hasattr(gb, 'oob_improvement_') and gb.subsample < 1.0:
    axes[1,1].plot(np.arange(1, len(gb.oob_improvement_) + 1), gb.oob_improvement_)
    axes[1,1].set_xlabel('Iteration')
    axes[1,1].set_ylabel('OOB Improvement')
    axes[1,1].set_title('Out-of-Bag Loss Improvement')
else:
    # Partial Dependence Plot (simplified)
    axes[1,1].text(0.5, 0.5, 'Partial Dependence\n(Requires sklearn.inspection)',
                   ha='center', va='center', fontsize=12)
    axes[1,1].axis('off')
plt.tight_layout()
plt.show()
# Comparison with Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print(f"\nRandom Forest Accuracy: {rf.score(X_test, y_test):.4f}")
print(f"Gradient Boosting Accuracy: {gb.score(X_test, y_test):.4f}")
The most famous dataset in machine learning history. Contains 150 samples of iris flowers with 4 features: sepal length, sepal width, petal length, petal width. Three classes: Setosa, Versicolor, Virginica.
# Iris Dataset - Complete Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y
# Statistical summary
print("Dataset Shape:", df.shape)
print("\nClass Distribution:")
print(df['target'].value_counts().sort_index())
# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# 1. Scatter plot of sepal dimensions by class
colors = ['red', 'green', 'blue']
for i in range(3):
    axes[0,0].scatter(X[y==i, 0], X[y==i, 1], c=colors[i], label=iris.target_names[i], alpha=0.6)
axes[0,0].set_xlabel('Sepal Length')
axes[0,0].set_ylabel('Sepal Width')
axes[0,0].legend()
axes[0,0].set_title('Sepal Dimensions')
# 2. Box plots
df_melted = df.melt(id_vars=['target'], var_name='feature', value_name='value')
sns.boxplot(data=df_melted, x='feature', y='value', hue='target', ax=axes[0,1])
axes[0,1].set_title('Feature Distributions by Class')
axes[0,1].tick_params(axis='x', rotation=45)
# 3. Correlation heatmap
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0, ax=axes[1,0])
axes[1,0].set_title('Feature Correlation Matrix')
# 4. PCA 2D projection
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
for i in range(3):
    axes[1,1].scatter(X_pca[y==i, 0], X_pca[y==i, 1], c=colors[i], label=iris.target_names[i], alpha=0.6)
axes[1,1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
axes[1,1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
axes[1,1].legend()
axes[1,1].set_title('PCA 2D Projection')
plt.tight_layout()
plt.show()
# Dataset info
print(f"\nExplained variance ratio (first 2 PCs): {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")
# AI Fundamentals - Search Algorithm Example (A* Search)
import heapq
class Node:
    def __init__(self, state, parent=None, g=0, h=0):
        self.state = state
        self.parent = parent
        self.g = g  # Cost from start
        self.h = h  # Heuristic to goal
        self.f = g + h

    def __lt__(self, other):
        return self.f < other.f

def astar_search(start, goal, neighbors_func, heuristic):
    """A* Search Algorithm Implementation"""
    open_set = [Node(start, g=0, h=heuristic(start, goal))]
    closed_set = set()
    while open_set:
        current = heapq.heappop(open_set)
        if current.state == goal:
            # Reconstruct path
            path = []
            while current:
                path.append(current.state)
                current = current.parent
            return path[::-1]
        closed_set.add(current.state)
        for neighbor, cost in neighbors_func(current.state):
            if neighbor in closed_set:
                continue
            g = current.g + cost
            h = heuristic(neighbor, goal)
            node = Node(neighbor, current, g, h)
            heapq.heappush(open_set, node)
    return None  # No path found
# Example: Grid pathfinding
def manhattan_distance(a, b):
    return abs(a[0] - b[0]) + abs(a[1] - b[1])
# Usage
start = (0, 0)
goal = (5, 5)
print(f"A* Search from {start} to {goal}")
# Data Science with Python - Complete EDA Example
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
# Load data
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = [iris.target_names[t] for t in iris.target]
# Data Overview
print("Dataset Shape:", df.shape)
print("\nData Info:")
print(df.info())
# Statistical Summary
print("\nDescriptive Statistics:")
print(df.describe())
# Data Cleaning Example
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 1. Distribution plots
sns.histplot(data=df, x='sepal length (cm)', hue='species', kde=True, ax=axes[0,0])
axes[0,0].set_title('Sepal Length Distribution')
# 2. Correlation heatmap
numeric_df = df.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=axes[0,1])
axes[0,1].set_title('Feature Correlation Matrix')
# 3. Box plots
sns.boxplot(data=df, x='species', y='petal length (cm)', ax=axes[1,0])
axes[1,0].set_title('Petal Length by Species')
# 4. Pairplot subset
sns.scatterplot(data=df, x='sepal length (cm)', y='petal length (cm)',
hue='species', style='species', s=100, ax=axes[1,1])
axes[1,1].set_title('Sepal vs Petal Length')
plt.tight_layout()
plt.show()
# Advanced: Feature Engineering
df['sepal_petal_ratio'] = df['sepal length (cm)'] / df['petal length (cm)']
print(f"\nNew feature created: sepal_petal_ratio")
# ML for Research - Genomics Example (Gene Expression Classification)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
# Simulated genomics dataset
# Features: gene expression levels for 1000 genes
# Target: disease subtype (0, 1, 2)
np.random.seed(42)
n_samples = 500
n_genes = 1000
# Generate synthetic gene expression data
X = np.random.randn(n_samples, n_genes)
# Create disease subtypes from the mean expression of two gene blocks
# (thresholds chosen so the three classes come out roughly balanced)
y = np.zeros(n_samples, dtype=int)
for i in range(n_samples):
    if X[i, :50].mean() > 0.05:
        y[i] = 1
    elif X[i, 50:100].mean() > 0.0:
        y[i] = 2
print(f"Dataset: {n_samples} samples, {n_genes} genes")
print(f"Classes: {np.bincount(y)}")
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Preprocessing: Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Dimensionality Reduction for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
# Cross-validation
cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print(f"\nCross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Test performance
y_pred = rf.predict(X_test_scaled)
print(f"\nTest Accuracy: {rf.score(X_test_scaled, y_test):.4f}")
# Feature importance (top genes)
feature_importance = rf.feature_importances_
top_genes = np.argsort(feature_importance)[-20:][::-1]
print(f"\nTop 20 important genes: {top_genes}")
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 1. PCA visualization
colors = ['red', 'green', 'blue']
for i in range(3):
    mask = y_train == i
    axes[0].scatter(X_pca[mask, 0], X_pca[mask, 1],
                    c=colors[i], label=f'Disease Type {i}', alpha=0.6)
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
axes[0].set_title('Gene Expression PCA')
axes[0].legend()
axes[0].grid(True)
# 2. Feature importance
axes[1].barh(range(20), feature_importance[top_genes])
axes[1].set_yticks(range(20))
axes[1].set_yticklabels([f'Gene {i}' for i in top_genes])
axes[1].set_xlabel('Importance')
axes[1].set_title('Top 20 Important Genes')
plt.tight_layout()
plt.show()