Skip to content
Rain Hu's Workspace
Go back

[AI] 3-4. 線性迴歸

Rain Hu

目標

線性迴歸

sample

暴力解

import sys

# Split the dataset columns: areas (feature) and prices (target).
# NOTE(review): `data` is assumed to be loaded earlier in the post — confirm.
areas = data[:,0]
prices = data[:,1]

def compute_loss(y_pred, y):
    """Squared error between one prediction and its true value."""
    return (y_pred - y)**2

best_w = 0.
best_b = 0.
min_loss = sys.float_info.max

# Exhaustive grid search:
#   w in [30.0, 49.9] with step 0.1 (200 points)
#   b in [200, 599] with step 1 (400 points)
for i in range(200):
    w = 30 + i*0.1  # loop-invariant for the inner loop: hoisted out of it
    for j in range(400):
        b = 200 + j*1
        loss = 0.
        # Sum the squared error over every (area, price) sample.
        for area, price in zip(areas, prices):
            y_pred = w * area + b
            loss += compute_loss(y_pred, price)
        if loss < min_loss:
            min_loss = loss
            best_w = w
            best_b = b

線性代數解法

梯度下降(gradient descent)

import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from matplotlib import cm

# 1. Data normalization helper
def normalize_data(data):
    """Standardize *data* column-wise to zero mean and unit variance."""
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    return (data - mu) / sigma

# 2. Build and train a one-neuron linear model, recording parameters per epoch
def train_linear_regression(x_norm, y_norm, learning_rate=0.01, epochs=10):
    """Fit y = w*x + b on normalized data with SGD.

    Returns (model, history) where history maps 'w', 'b' and 'loss' to
    lists holding one snapshot per epoch plus the final trained state.
    """
    # Single dense unit = linear regression
    model = keras.Sequential([
        keras.layers.Dense(1, input_shape=(1,))
    ])
    model.compile(optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
                  loss='mse')

    # Training-trajectory record shared with the callback below
    history = {'w': [], 'b': [], 'loss': []}

    def _snapshot():
        # Append the current weight, bias, and full-dataset loss.
        weights, bias = model.layers[0].get_weights()
        history['w'].append(weights[0][0])
        history['b'].append(bias[0])
        history['loss'].append(model.evaluate(x_norm, y_norm, verbose=0))

    class ParameterHistory(keras.callbacks.Callback):
        def on_epoch_begin(self, epoch, logs=None):
            _snapshot()

    model.fit(x_norm, y_norm, epochs=epochs, verbose=0,
              callbacks=[ParameterHistory()])

    # One more snapshot: the parameters after the final update
    _snapshot()

    return model, history

# 3. Visualization helper
def plot_training_process(x_raw, y_raw, x_norm, y_norm, history):
    """Plot (left) the loss contour with the gradient-descent path and
    (right) the raw data with one regression line per recorded epoch.

    x_raw/y_raw and x_norm/y_norm are 1-D arrays; *history* is the dict
    returned by train_linear_regression, with 'w'/'b' in normalized scale.
    """
    # Two side-by-side axes
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Convert the normalized coefficients back to the original data scale
    w_raw_history = [w * np.std(y_raw) / np.std(x_raw) for w in history['w']]
    b_raw_history = [(b * np.std(y_raw) + np.mean(y_raw) - 
                     w * np.std(y_raw) * np.mean(x_raw) / np.std(x_raw))
                     for w, b in zip(history['w'], history['b'])]
    
    # Contour grid padded 50% beyond the visited (w, b) range
    margin_w = (max(w_raw_history) - min(w_raw_history)) * 0.5
    margin_b = (max(b_raw_history) - min(b_raw_history)) * 0.5
    w_raw_range = np.linspace(min(w_raw_history)-margin_w, max(w_raw_history)+margin_w, 100)
    b_raw_range = np.linspace(min(b_raw_history)-margin_b, max(b_raw_history)+margin_b, 100)
    W_RAW, B_RAW = np.meshgrid(w_raw_range, b_raw_range)
    Z = np.zeros_like(W_RAW)
    
    # MSE at every grid point (in the original scale)
    for i in range(W_RAW.shape[0]):
        for j in range(W_RAW.shape[1]):
            y_pred = W_RAW[i,j] * x_raw + B_RAW[i,j]
            Z[i,j] = np.mean((y_pred - y_raw) ** 2)
    
    CS = ax1.contour(W_RAW, B_RAW, Z, levels=20)
    ax1.clabel(CS, inline=True, fontsize=8)
    ax1.plot(w_raw_history, b_raw_history, 'r.-', label='Training path')
    ax1.set_xlabel('w (原始尺度)')
    ax1.set_ylabel('b (原始尺度)')
    ax1.set_title('Contour Plot with Training Path (原始尺度)')
    ax1.legend()
    
    # Raw data scatter plot with regression lines
    ax2.scatter(x_raw, y_raw, alpha=0.5, label='Raw data')
    # NOTE(review): hard-coded y-limits assume prices lie in 700-2300 —
    # confirm this range for other datasets.
    ax2.set_ylim(700, 2300)
    
    # One regression line per recorded epoch, colored early->late by rainbow
    x_plot = np.linspace(min(x_raw), max(x_raw), 100)
    colors = cm.rainbow(np.linspace(0, 1, len(w_raw_history)))
    
    for i, (w, b) in enumerate(zip(w_raw_history, b_raw_history)):
        y_plot = w * x_plot + b
        ax2.plot(x_plot, y_plot, color=colors[i], alpha=0.3)
    
    ax2.set_xlabel('Area (坪)')
    ax2.set_ylabel('Price (萬)')
    ax2.set_title('Raw Data with Regression Lines')
    
    plt.tight_layout()
    plt.show()

# Load the data
data = load_data()
x_raw, y_raw = data[:, 0], data[:, 1]

# Reshape to column vectors for Keras
x_raw = x_raw.reshape(-1, 1)
y_raw = y_raw.reshape(-1, 1)

# Normalize the data
x_norm = normalize_data(x_raw)
y_norm = normalize_data(y_raw)

# Train the model
model, history = train_linear_regression(x_norm, y_norm)

# Visualize the results
plot_training_process(x_raw.flatten(), y_raw.flatten(), 
                        x_norm.flatten(), y_norm.flatten(), history)

# Final parameters (normalized scale)
final_w = history['w'][-1]
final_b = history['b'][-1]
final_loss = history['loss'][-1]

# Convert the coefficients back to the original scale
w_raw = final_w * np.std(y_raw) / np.std(x_raw)
b_raw = (final_b * np.std(y_raw) + np.mean(y_raw) - 
        final_w * np.std(y_raw) * np.mean(x_raw) / np.std(x_raw))

# BUG FIX: w_raw and b_raw are 0-d scalars (scalar * scalar), so the
# original `w_raw[0]` / `b_raw[0]` indexing would raise; format directly.
print(f"Final equation: y = {float(w_raw):.2f}x + {float(b_raw):.2f}")
print(f"Final normalized loss: {final_loss:.6f}")

sample_with_gradient_descent

批次訓練(Batch)的概念

import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    """1-D linear regression y = w*x + b trained by gradient descent on
    MSE, with full-batch, mini-batch and stochastic training variants."""

    def __init__(self, learning_rate=0.0000001):
        self.w = 0.0
        self.b = 0.0
        self.lr = learning_rate
        self.loss_history = []  # full-dataset MSE, sampled every 100 epochs

    def predict(self, X):
        """Return the model's predictions w*X + b."""
        return self.w * X + self.b

    def compute_loss(self, X, y):
        """Mean squared error over (X, y)."""
        y_pred = self.predict(X)
        return np.mean((y_pred - y) ** 2)

    def compute_gradients(self, X, y):
        """Return (dMSE/dw, dMSE/db) over (X, y)."""
        error = self.predict(X) - y
        dw = np.mean(2 * error * X)
        db = np.mean(2 * error)
        return dw, db

    def _record(self, epoch, X, y):
        # Shared loss logging: sample the full-dataset MSE every 100 epochs.
        if epoch % 100 == 0:
            loss = self.compute_loss(X, y)
            self.loss_history.append(loss)
            print(f"Epoch {epoch}, Loss: {loss:.2f}")

    def train_batch(self, X, y, epochs=3000):
        """Full batch gradient descent"""
        for epoch in range(epochs):
            # One update per epoch using the gradient over all samples
            dw, db = self.compute_gradients(X, y)
            self.w -= self.lr * dw
            self.b -= self.lr * db
            self._record(epoch, X, y)

    def train_mini_batch(self, X, y, batch_size=2, epochs=3000):
        """Mini-batch gradient descent"""
        n_samples = len(X)
        for epoch in range(epochs):
            # Reshuffle each epoch so batches differ between epochs
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            # One update per mini-batch
            for i in range(0, n_samples, batch_size):
                dw, db = self.compute_gradients(X_shuffled[i:i+batch_size],
                                                y_shuffled[i:i+batch_size])
                self.w -= self.lr * dw
                self.b -= self.lr * db
            self._record(epoch, X, y)

    def train_sgd(self, X, y, epochs=3000):
        """Stochastic gradient descent"""
        # SGD is exactly mini-batch training with batch_size=1: the
        # original duplicated loop produced identical updates.
        self.train_mini_batch(X, y, batch_size=1, epochs=epochs)

# Train the three gradient-descent variants on the same dataset
(areas, prices) = load_data()
models = {
    'Batch': LinearRegression(learning_rate=1e-7),
    'Mini-batch': LinearRegression(learning_rate=1e-7),
    'SGD': LinearRegression(learning_rate=5e-8)
}

models['Batch'].train_batch(areas, prices)
models['Mini-batch'].train_mini_batch(areas, prices)
models['SGD'].train_sgd(areas, prices)

# Loss curves: one sample every 100 epochs, matching loss_history's sampling
plt.figure(figsize=(10, 6))
for name, model in models.items():
    plt.plot(range(0, 3000, 100), model.loss_history, label=name)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss Comparison')
plt.legend()
plt.grid(True)
plt.show()

for name, model in models.items():
    print(f"\n{name} Results:")
    print(f"w = {model.w:.6f}")
    print(f"b = {model.b:.6f}")
    # BUG FIX: was `model.loss_histroy` (typo) -> AttributeError at runtime
    print(f"Final Loss = {model.loss_history[-1]:.2f}")

損失函數(loss function)

L1/L2 正則化(L1/L2 Regularization)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Synthetic data generation
np.random.seed(42)

def generate_synthetic_data(n_samples=100):
    """Create a (n_samples, 4) feature matrix and its target vector.

    X1 (area) and X2 (house age) are independent standard normals;
    X3 (e.g. room count) is highly collinear with X1; X4 is pure noise.
    The target is y = 3*X1 + 2*X2 + 0.5*X3 + small Gaussian noise, so
    X4 has no influence at all.
    """
    X1 = np.random.normal(0, 1, n_samples)
    X2 = np.random.normal(0, 1, n_samples)

    # Collinear feature: mostly X1 plus a little fresh noise
    X3 = 0.8 * X1 + 0.2 * np.random.normal(0, 1, n_samples)

    # Completely irrelevant feature
    X4 = np.random.normal(0, 1, n_samples)

    features = np.column_stack([X1, X2, X3, X4])
    targets = 3 * X1 + 2 * X2 + 0.5 * X3 + np.random.normal(0, 0.1, n_samples)
    return features, targets

class RegularizedRegression:
    """1-D linear regression with optional L1/L2 weight regularization,
    trained by (mini-)batch gradient descent on the regularized MSE."""

    def __init__(self, learning_rate=1e-7, reg_type='l2', lambda_reg=0.1):
        self.w = 0.
        self.b = 0.
        self.lr = learning_rate
        self.reg_type = reg_type      # 'l1', 'l2', anything else disables it
        self.lambda_reg = lambda_reg  # regularization strength
        self.loss_history = []        # regularized loss, every 100 epochs

    def predict(self, X):
        """Return w*X + b."""
        return self.w * X + self.b

    def compute_loss(self, X, y):
        """MSE plus the L1/L2 penalty on w (the bias is not penalized)."""
        y_pred = self.predict(X)
        mse = np.mean((y_pred - y) ** 2)

        if self.reg_type == 'l1':
            reg_term = self.lambda_reg * np.abs(self.w)
        elif self.reg_type == 'l2':
            reg_term = self.lambda_reg * (self.w ** 2)
        else:
            reg_term = 0

        return mse + reg_term

    def compute_gradients(self, X, y):
        """Return (dw, db) of the regularized loss over (X, y)."""
        y_pred = self.predict(X)
        error = y_pred - y

        # Gradient of the MSE term
        dw_mse = np.mean(2 * error * X)
        db = np.mean(2 * error)

        # Gradient of the regularization term (w only)
        if self.reg_type == 'l1':
            dw_reg = self.lambda_reg * np.sign(self.w)
        elif self.reg_type == 'l2':
            dw_reg = self.lambda_reg * 2 * self.w
        else:
            dw_reg = 0

        return dw_mse + dw_reg, db

    # BUG FIX: the class previously defined `train` TWICE; the first
    # (full-batch, no loss recording) was dead code silently shadowed by
    # this second definition. The dead duplicate has been removed.
    def train(self, X, y, batch_size=None, epochs=3000):
        """Mini-batch gradient descent; batch_size=None means full batch."""
        n_samples = len(X)
        if batch_size is None:
            batch_size = n_samples
        for epoch in range(epochs):
            # Reshuffle each epoch so mini-batches vary
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            # One parameter update per mini-batch
            for i in range(0, n_samples, batch_size):
                dw, db = self.compute_gradients(X_shuffled[i:i+batch_size],
                                                y_shuffled[i:i+batch_size])
                self.w -= self.lr * dw
                self.b -= self.lr * db

            # Sample the full-dataset regularized loss every 100 epochs
            if epoch % 100 == 0:
                loss = self.compute_loss(X, y)
                self.loss_history.append(loss)
                print(f"Epoch {epoch}, Loss: {loss:.2f}")

regularization

激活函數(activation function)

import numpy as np
import matplotlib.pyplot as plt

class Activation:
    """Common activation functions and their first derivatives."""

    @staticmethod
    def sigmoid(x):
        """Logistic function 1 / (1 + e^-x); maps to (0, 1)."""
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def sigmoid_derivative(x):
        """sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))."""
        s = Activation.sigmoid(x)
        return s * (1 - s)

    @staticmethod
    def relu(x):
        """Rectified linear unit: max(0, x)."""
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        """Step function: 1 where x > 0, else 0."""
        return np.where(x > 0, 1, 0)

    @staticmethod
    def tanh(x):
        """Hyperbolic tangent; maps to (-1, 1)."""
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        """tanh'(x) = 1 - tanh(x)^2."""
        return 1 - np.tanh(x) ** 2
import numpy as np
import matplotlib.pyplot as plt

class Activation:
    """Activation functions and derivatives used by NeuralNetwork below."""

    @staticmethod
    def sigmoid(x):
        # 1 / (1 + e^-x): squashes any real input into (0, 1)
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def sigmoid_derivative(x):
        # d/dx sigmoid = s * (1 - s)
        value = Activation.sigmoid(x)
        return value * (1 - value)

    @staticmethod
    def relu(x):
        # Zero out the negative part
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        # Step function: slope 1 for positive inputs, 0 otherwise
        return np.where(x > 0, 1, 0)

    @staticmethod
    def tanh(x):
        # Squashes any real input into (-1, 1)
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        # d/dx tanh = 1 - tanh(x)^2
        return 1 - np.tanh(x)**2

class NeuralNetwork:
    """Tiny dense network (2 -> 4 -> 1) trained with full-batch gradient
    descent on MSE; used below to learn XOR with different activations."""

    def __init__(self, activation='sigmoid'):
        # Architecture: 2 -> 4 -> 1
        self.W1 = np.random.randn(2, 4) * 0.1  # input -> hidden weights
        self.b1 = np.zeros((1, 4))             # hidden-layer bias
        self.W2 = np.random.randn(4, 1) * 0.1  # hidden -> output weights
        self.b2 = np.zeros((1, 1))             # output-layer bias
        
        # Select the activation function (applied to BOTH layers, see forward)
        if activation == 'sigmoid':
            self.activation = Activation.sigmoid
            self.activation_derivative = Activation.sigmoid_derivative
        elif activation == 'relu':
            self.activation = Activation.relu
            self.activation_derivative = Activation.relu_derivative
        elif activation == 'tanh':
            self.activation = Activation.tanh
            self.activation_derivative = Activation.tanh_derivative
        
        self.loss_history = []  # MSE recorded once per training epoch
    
    def forward(self, X):
        """Forward pass; caches z1/a1/z2/a2 on self for backward()."""
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.activation(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.activation(self.z2)
        return self.a2
    
    def backward(self, X, y, learning_rate=0.1):
        """One gradient-descent step using the cached forward-pass values."""
        m = X.shape[0]
        
        # Output-layer gradients.
        # NOTE(review): dz2 = a2 - y omits the output activation's
        # derivative; for the MSE loss used in train() this is the exact
        # gradient only for a linear output (or for sigmoid paired with
        # cross-entropy). Confirm this simplification is intended.
        dz2 = self.a2 - y
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        
        # Hidden-layer gradients (chain rule through the activation)
        dz1 = np.dot(dz2, self.W2.T) * self.activation_derivative(self.z1)
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        
        # Update all weights and biases
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
    
    def train(self, X, y, epochs=10000, learning_rate=0.1):
        """Full-batch training: records MSE every epoch, prints every 1000."""
        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)
            
            # MSE loss (computed before this epoch's update)
            loss = np.mean((output - y) ** 2)
            self.loss_history.append(loss)
            
            # Backward pass and parameter update
            self.backward(X, y, learning_rate)
            
            if epoch % 1000 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def predict(self, X):
        """Round the network output to a hard 0/1 class label."""
        return np.round(self.forward(X))

# XOR truth table: four input pairs and their targets
X = np.array([[0,0], [0,1], [1,0], [1,1]])
y = np.array([[0], [1], [1], [0]])

# Train one model per activation function
activation_functions = ['sigmoid', 'relu', 'tanh']
models = {}

for activation in activation_functions:
    print(f"\nTraining with {activation} activation:")
    model = NeuralNetwork(activation=activation)
    model.train(X, y)
    models[activation] = model

# Compare the training-loss curves
plt.figure(figsize=(10, 6))
for activation, model in models.items():
    plt.plot(model.loss_history, label=activation)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training Loss with Different Activation Functions')
plt.legend()
plt.grid(True)
plt.show()

# Print per-input predictions for each trained model
print("\nPrediction Results:")
for activation, model in models.items():
    print(f"\n{activation} activation:")
    predictions = model.predict(X)
    for x, y_true, y_pred in zip(X, y, predictions):
        print(f"Input: {x}, True: {y_true[0]}, Predicted: {y_pred[0]}")

# Visualize each model's decision boundary
plt.figure(figsize=(15, 5))
for i, (activation, model) in enumerate(models.items()):
    plt.subplot(1, 3, i+1)
    
    # Grid of points covering the unit square with a 0.5 margin
    xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                        np.linspace(-0.5, 1.5, 100))
    grid = np.c_[xx.ravel(), yy.ravel()]
    
    # Predict a class for every grid point
    Z = model.predict(grid)
    Z = Z.reshape(xx.shape)
    
    # Shade the predicted regions and overlay the four XOR points
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
    
    plt.title(f'{activation} Decision Boundary')
    plt.xlabel('Input 1')
    plt.ylabel('Input 2')

plt.tight_layout()
plt.show()

loss_plot prediction_plot

Sigmoid

適用時機:

不建議用在:

ReLU

適用時機:

主要優點:

Tanh

適用時機:

實際應用建議:

  1. 常見的最佳實踐組合
class NeuralNetwork:
    def __init__(self):
        self.hidden_activation = ReLU    # 隱藏層使用ReLU
        self.output_activation = Sigmoid  # 二分類輸出層使用Sigmoid
  1. 根據任務選擇
  1. 特殊情況

其它變體

  1. Leaky ReLU
    f(x) = x if x > 0 else αx # (α通常為0.01)
  1. ELU (Exponential Linear Unit)
    f(x) = x if x > 0 else α(exp(x) - 1)
  1. SELU (Scaled ELU)
    f(x) = λ(x if x > 0 else α(exp(x) - 1))
  1. GELU(Gaussian Error Linear Unit)
    f(x) = x * P(X ≤ x)
  1. Swish
    f(x) = x * sigmoid(βx)

Summary

  1. 先嘗試 ReLU:

    • 最簡單且通常效果不錯
    • 計算效率高
    • 容易優化
  2. 如果遇到問題,按順序嘗試:

    • Dead ReLU問題 → Leaky ReLU
    • 需要自歸一化 → SELU
    • 用於Transformer → GELU
    • 追求極致性能 → Swish
  3. 特殊情況:

    • 需要處理時序數據 → ELU或SELU
    • 計算資源受限 → 堅持使用ReLU
    • 特別關注梯度流動 → Leaky ReLU或ELU

優化器(Optimizers)


Share this post on:

Previous
[AI] 3-5. 邏輯斯迴歸(logistic regression)
Next
[AI] 3-3. 使用 TensorFlow 與 Keras 函式庫