Finding the Minimum of a Convex Function with Gradient Descent

1. Procedure

  1. Compute the derivative (gradient) of the function.
  2. Update x: x_new = x - learning_rate * gradient.
  3. Check for convergence: |gradient| < tolerance (restated in symbols below).
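
In symbols (a minimal restatement of the loop above, with η as the learning rate and ε as the tolerance):

x_{k+1} = x_k - \eta \, f'(x_k), \qquad \text{stop when } |f'(x_k)| < \varepsilon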

2. Why does moving along the (negative) gradient direction necessarily reach the minimum?

This can be derived with a Taylor expansion; a short sketch is given below.
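
A minimal sketch: for a small step size η > 0, the first-order Taylor expansion of f around x gives

f(x - \eta f'(x)) \approx f(x) - \eta \, f'(x)^2 \le f(x),

so a step along the negative gradient never increases f, and strictly decreases it whenever f'(x) ≠ 0. For a differentiable convex function, the only point where f'(x) = 0 is the global minimizer, so the iterates keep descending until they reach (a neighborhood of) the minimum.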

3. Implementation

import numpy as np
import matplotlib.pyplot as plt


def f(x):
    """目标函数: f(x) = (x+3)^2 + 1"""
    return (x + 3) ** 2 + 1


def grad_f(x):
    """梯度(导数): f'(x) = 2(x+3)"""
    return 2 * (x + 3)


def gradient_descent(learning_rate=0.1, max_iters=100, tolerance=1e-6, x0=0):
    """
    Gradient descent.

    Parameters:
    learning_rate: step size (learning rate)
    max_iters: maximum number of iterations
    tolerance: convergence tolerance on |gradient|
    x0: initial point

    Returns:
    x_history: history of x values
    f_history: history of f(x) values
    """
    x = x0
    x_history = [x]
    f_history = [f(x)]

    print(f"初始值: x0 = {x0:.4f}, f(x0) = {f(x0):.4f}")
    print("-" * 50)

    for i in range(max_iters):
        # Compute the gradient at the current point
        gradient = grad_f(x)

        # Gradient step: move against the gradient
        x_new = x - learning_rate * gradient

        # Record the new iterate and its function value
        x_history.append(x_new)
        f_history.append(f(x_new))

        # Print progress for the first 10 iterations, then every 10th
        if i < 10 or i % 10 == 9 or i == max_iters - 1:
            print(f"Iteration {i + 1:3d}: x = {x_new:8.6f}, f(x) = {f(x_new):8.6f}, gradient = {gradient:8.6f}")

        # Check convergence (gradient at the previous iterate)
        if abs(gradient) < tolerance:
            print(f"\nConverged after {i + 1} iterations!")
            break

        x = x_new

    return x_history, f_history


def plot_results(x_history, f_history):
    """绘制优化过程"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Function curve and optimization path
    x_vals = np.linspace(-5, 1, 100)
    y_vals = f(x_vals)

    ax1.plot(x_vals, y_vals, 'b-', label='f(x) = (x+3)² + 1', linewidth=2)
    ax1.plot(x_history, f_history, 'ro-', label='Optimization path', markersize=4)
    ax1.set_xlabel('x')
    ax1.set_ylabel('f(x)')
    ax1.set_title('Gradient descent optimization')
    ax1.legend()
    ax1.grid(True)

    # Convergence of the function value (log scale)
    ax2.semilogy(range(len(f_history)), f_history, 'g-o', markersize=4)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('f(x) (log scale)')
    ax2.set_title('Convergence of the function value')
    ax2.grid(True)

    plt.tight_layout()
    plt.show()


# Run gradient descent
print("Gradient descent for the minimum of f(x) = (x+3)² + 1")
print("=" * 50)

# Compare different learning rates
learning_rates = [0.01, 0.1, 0.3]
results = {}

for lr in learning_rates:
    print(f"\n学习率 α = {lr}:")
    x_hist, f_hist = gradient_descent(learning_rate=lr, x0=0)
    results[lr] = (x_hist, f_hist)
    print(f"最终结果: x = {x_hist[-1]:.6f}, f(x) = {f_hist[-1]:.6f}")

# Plot the results for learning rate 0.1
plot_results(results[0.1][0], results[0.1][1])

# Compare convergence speed across learning rates
print("\n" + "=" * 50)
print("Convergence comparison for different learning rates:")
print("=" * 50)

for lr in learning_rates:
    x_hist, f_hist = results[lr]
    iterations = len(x_hist) - 1
    final_x = x_hist[-1]
    final_f = f_hist[-1]
    print(f"α = {lr:.2f}: {iterations:2d} 次迭代, x = {final_x:7.4f}, f(x) = {final_f:7.4f}")

# Theoretical minimum
print("\nTheoretical minimum: x = -3.0000, f(x) = 1.0000")

 
