一、什么是深度学习?
1. 深度学习的定义
深度学习是机器学习的一个子领域,它受人脑神经元连接方式的启发,使用多层非线性变换来学习数据的层次化表示。其核心思想是让机器自动地从数据中学习特征表示,而不是依赖人工设计的特征。
# 深度学习与传统机器学习的对比
import matplotlib.pyplot as plt
# 可视化对比
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# 传统机器学习
axes[0].text(0.5, 0.5, '特征工程 → 模型训练\n\n人工特征设计\n+ 传统算法\n= 有限表现力',
ha='center', va='center', fontsize=12, fontweight='bold')
axes[0].set_title('传统机器学习', fontsize=14, fontweight='bold')
axes[0].axis('off')
# 深度学习
axes[1].text(0.5, 0.5, '原始数据 → 深度学习\n\n自动特征学习\n+ 多层神经网络\n= 强大表现力',
ha='center', va='center', fontsize=12, fontweight='bold')
axes[1].set_title('深度学习', fontsize=14, fontweight='bold')
axes[1].axis('off')
plt.suptitle('传统机器学习 vs 深度学习', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
2. 深度学习的历史里程碑
# 深度学习发展时间线
timeline_data = {
'1943': 'McCulloch & Pitts提出人工神经元模型',
'1958': 'Frank Rosenblatt发明感知机',
'1969': 'Minsky & Papert指出感知机的局限性',
    '1986': 'Rumelhart、Hinton与Williams系统阐述并推广反向传播算法',
'1998': 'Yann LeCun提出LeNet-5(卷积神经网络)',
'2006': 'Geoffrey Hinton提出深度信念网络',
'2012': 'AlexNet在ImageNet比赛中大获成功',
'2014': '生成对抗网络(GAN)被提出',
'2015': 'ResNet解决深度网络训练难题',
'2017': 'Transformer架构革命性突破',
'2020': 'GPT-3展现强大语言能力',
'2022': '扩散模型引领图像生成革命'
}
print("深度学习发展里程碑:")
for year, event in timeline_data.items():
print(f"{year}: {event}")
二、神经网络基础
1. 人工神经元模型
import numpy as np
class ArtificialNeuron:
"""实现基本的人工神经元"""
def __init__(self, input_size, activation='sigmoid'):
"""
初始化神经元
Args:
input_size: 输入特征数量
activation: 激活函数类型
"""
# 初始化权重和偏置
self.weights = np.random.randn(input_size) * 0.1
self.bias = np.random.randn() * 0.1
self.activation_type = activation
def activate(self, x):
"""前向传播计算"""
# 线性组合
z = np.dot(x, self.weights) + self.bias
# 应用激活函数
if self.activation_type == 'sigmoid':
return self._sigmoid(z)
elif self.activation_type == 'relu':
return self._relu(z)
elif self.activation_type == 'tanh':
return self._tanh(z)
else:
return z # 线性激活
def _sigmoid(self, x):
"""Sigmoid激活函数"""
return 1 / (1 + np.exp(-x))
def _relu(self, x):
"""ReLU激活函数"""
return np.maximum(0, x)
def _tanh(self, x):
"""Tanh激活函数"""
return np.tanh(x)
def __call__(self, x):
"""使神经元可调用"""
return self.activate(x)
# 使用示例
neuron = ArtificialNeuron(3, activation='sigmoid')
inputs = np.array([0.5, -0.2, 0.8])
output = neuron(inputs)
print(f"神经元输出: {output:.4f}")
2. 常用激活函数及其特性
import numpy as np
import matplotlib.pyplot as plt
def plot_activation_functions():
"""绘制常用激活函数"""
x = np.linspace(-5, 5, 100)
# 定义激活函数
functions = {
'Sigmoid': lambda x: 1 / (1 + np.exp(-x)),
'ReLU': lambda x: np.maximum(0, x),
'Leaky ReLU': lambda x: np.where(x > 0, x, 0.01 * x),
'Tanh': lambda x: np.tanh(x),
'Swish': lambda x: x * (1 / (1 + np.exp(-x))),
'ELU': lambda x: np.where(x > 0, x, np.exp(x) - 1),
'Softplus': lambda x: np.log(1 + np.exp(x))
}
# 创建子图
fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.ravel()
for idx, (name, func) in enumerate(functions.items()):
ax = axes[idx]
y = func(x)
ax.plot(x, y, 'b-', linewidth=2)
ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
ax.axvline(x=0, color='k', linestyle='-', alpha=0.3)
ax.grid(True, alpha=0.3)
ax.set_title(name, fontsize=12, fontweight='bold')
# 添加特性标注
if name == 'Sigmoid':
ax.text(0, 0.5, '输出范围: (0,1)\n平滑、可微\n存在梯度消失问题',
fontsize=9, ha='center')
elif name == 'ReLU':
ax.text(0, 2.5, '稀疏激活\n计算高效\n存在"死神经元"问题',
fontsize=9, ha='center')
    # 隐藏多余的空白子图(共8个子图位,只有7个激活函数)
    axes[-1].axis('off')
    # 调整布局
    plt.tight_layout()
    plt.suptitle('常用激活函数', fontsize=16, fontweight='bold', y=1.02)
plt.show()
# 绘制激活函数
plot_activation_functions()
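上图中为Sigmoid和ReLU标注了"梯度消失"和"死神经元"问题。作为补充,下面用几行NumPy代码粗略演示这两个现象:Sigmoid的导数最大只有0.25,且在大输入下迅速趋近于0;ReLU在负区间导数恒为0。数值仅作示意。
import numpy as np

def sigmoid_grad(z):
    """Sigmoid的导数: s(z) * (1 - s(z)),最大值仅为0.25"""
    s = 1 / (1 + np.exp(-z))
    return s * (1 - s)

def relu_grad(z):
    """ReLU的导数:正区间为1,负区间为0(对应"死神经元"问题)"""
    return 1.0 if z > 0 else 0.0

for z in [0.0, 2.0, 5.0, 10.0, -5.0]:
    print(f"z = {z:>5}: sigmoid' = {sigmoid_grad(z):.6f}, relu' = {relu_grad(z):.0f}")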
三、深度学习架构类型
1. 前馈神经网络(FNN)
class FeedForwardNeuralNetwork:
"""实现简单的前馈神经网络"""
def __init__(self, layer_sizes, activations=None):
"""
初始化神经网络
Args:
layer_sizes: 每层神经元数量列表,如[10, 20, 5, 1]
activations: 每层的激活函数列表
"""
self.layer_sizes = layer_sizes
self.num_layers = len(layer_sizes) - 1
# 初始化权重和偏置
self.weights = []
self.biases = []
for i in range(self.num_layers):
# Xavier/Glorot初始化
scale = np.sqrt(2.0 / (layer_sizes[i] + layer_sizes[i+1]))
w = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * scale
b = np.zeros(layer_sizes[i+1])
self.weights.append(w)
self.biases.append(b)
# 设置激活函数
if activations is None:
activations = ['relu'] * (self.num_layers - 1) + ['sigmoid']
self.activations = activations
def forward(self, X):
"""前向传播"""
self.activations_history = [X]
self.z_history = []
a = X
for i in range(self.num_layers):
z = np.dot(a, self.weights[i]) + self.biases[i]
self.z_history.append(z)
if self.activations[i] == 'sigmoid':
a = 1 / (1 + np.exp(-z))
elif self.activations[i] == 'relu':
a = np.maximum(0, z)
elif self.activations[i] == 'tanh':
a = np.tanh(z)
else:
a = z # 线性激活
self.activations_history.append(a)
return a
def predict(self, X):
"""预测"""
return self.forward(X)
def summary(self):
"""打印网络结构信息"""
print("=" * 50)
print("神经网络结构摘要")
print("=" * 50)
print(f"层数: {self.num_layers}")
print(f"神经元配置: {self.layer_sizes}")
print(f"激活函数: {self.activations}")
total_params = 0
for i, (w, b) in enumerate(zip(self.weights, self.biases)):
params = w.size + b.size
total_params += params
print(f"层 {i+1}: {w.shape[0]} → {w.shape[1]} "
f"(权重: {w.shape}, 偏置: {b.shape}) "
f"参数数: {params:,}")
print(f"总参数数: {total_params:,}")
print("=" * 50)
# 创建并测试神经网络
nn = FeedForwardNeuralNetwork(
layer_sizes=[10, 20, 15, 1],
activations=['relu', 'relu', 'sigmoid']
)
nn.summary()
# 测试前向传播
X_test = np.random.randn(5, 10) # 5个样本,10个特征
output = nn.predict(X_test)
print(f"\n输入形状: {X_test.shape}")
print(f"输出形状: {output.shape}")
2. 卷积神经网络(CNN)
class SimpleCNN:
"""实现简化的卷积神经网络"""
def __init__(self, input_shape=(28, 28, 1)):
"""初始化CNN"""
self.input_shape = input_shape
self.layers = []
def add_conv_layer(self, filters=32, kernel_size=3, activation='relu'):
"""添加卷积层"""
layer_info = {
'type': 'conv',
'filters': filters,
'kernel_size': kernel_size,
'activation': activation
}
self.layers.append(layer_info)
return self
def add_pooling_layer(self, pool_size=2, stride=2):
"""添加池化层"""
layer_info = {
'type': 'pool',
'pool_size': pool_size,
'stride': stride
}
self.layers.append(layer_info)
return self
def add_dense_layer(self, units, activation='relu'):
"""添加全连接层"""
layer_info = {
'type': 'dense',
'units': units,
'activation': activation
}
self.layers.append(layer_info)
return self
def add_flatten_layer(self):
"""添加展平层"""
self.layers.append({'type': 'flatten'})
return self
def forward(self, X):
"""前向传播(简化版)"""
# 模拟卷积操作
output = X
for layer in self.layers:
if layer['type'] == 'conv':
# 简化的卷积操作
output = self._conv_forward(output, layer)
elif layer['type'] == 'pool':
# 简化的池化操作
output = self._pool_forward(output, layer)
elif layer['type'] == 'flatten':
# 展平操作
output = output.reshape(output.shape[0], -1)
elif layer['type'] == 'dense':
# 全连接层
output = self._dense_forward(output, layer)
return output
def _conv_forward(self, X, layer_params):
"""简化的卷积前向传播"""
# 在实际实现中,这里会有实际的卷积计算
# 这里我们返回一个简化的结果
batch_size, height, width, channels = X.shape
filters = layer_params['filters']
# 简化的输出形状计算
kernel_size = layer_params['kernel_size']
output_height = height - kernel_size + 1
output_width = width - kernel_size + 1
return np.random.randn(batch_size, output_height, output_width, filters)
def _pool_forward(self, X, layer_params):
"""简化的池化前向传播"""
batch_size, height, width, channels = X.shape
pool_size = layer_params['pool_size']
# 简化的输出形状计算
output_height = height // pool_size
output_width = width // pool_size
return np.random.randn(batch_size, output_height, output_width, channels)
def _dense_forward(self, X, layer_params):
"""简化的全连接前向传播"""
units = layer_params['units']
activation = layer_params['activation']
# 线性变换
output = np.random.randn(X.shape[0], units)
        # 激活函数
        if activation == 'relu':
            output = np.maximum(0, output)
        elif activation == 'sigmoid':
            output = 1 / (1 + np.exp(-output))
        elif activation == 'softmax':
            # 与add_dense_layer(units=10, activation='softmax')对应
            exp_out = np.exp(output - np.max(output, axis=1, keepdims=True))
            output = exp_out / np.sum(exp_out, axis=1, keepdims=True)
        return output
def summary(self):
"""打印网络结构信息"""
print("=" * 60)
print("卷积神经网络结构摘要")
print("=" * 60)
print(f"输入形状: {self.input_shape}")
current_shape = self.input_shape
for i, layer in enumerate(self.layers):
layer_type = layer['type'].upper()
if layer_type == 'CONV':
filters = layer['filters']
kernel_size = layer['kernel_size']
print(f"层 {i+1}: 卷积层 "
f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
f"过滤器: {filters}, 核大小: {kernel_size}x{kernel_size}")
# 更新形状(简化计算)
current_shape = (
current_shape[0] - kernel_size + 1,
current_shape[1] - kernel_size + 1,
filters
)
elif layer_type == 'POOL':
pool_size = layer['pool_size']
print(f"层 {i+1}: 池化层 "
f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
f"池化大小: {pool_size}x{pool_size}")
# 更新形状
current_shape = (
current_shape[0] // pool_size,
current_shape[1] // pool_size,
current_shape[2]
)
elif layer_type == 'FLATTEN':
flattened_size = np.prod(current_shape)
print(f"层 {i+1}: 展平层 "
f"({current_shape[0]}x{current_shape[1]}x{current_shape[2]}) → "
f"({flattened_size},)")
current_shape = (flattened_size,)
elif layer_type == 'DENSE':
units = layer['units']
print(f"层 {i+1}: 全连接层 "
f"({current_shape[0] if isinstance(current_shape, tuple) else current_shape} ) → "
f"({units},)")
current_shape = (units,)
print("=" * 60)
# 创建并测试CNN
cnn = SimpleCNN(input_shape=(28, 28, 1))
cnn.add_conv_layer(filters=32, kernel_size=3, activation='relu')
cnn.add_pooling_layer(pool_size=2)
cnn.add_conv_layer(filters=64, kernel_size=3, activation='relu')
cnn.add_pooling_layer(pool_size=2)
cnn.add_flatten_layer()
cnn.add_dense_layer(units=128, activation='relu')
cnn.add_dense_layer(units=10, activation='softmax')
cnn.summary()
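需要说明的是,上面SimpleCNN的_conv_forward只是返回随机数的占位实现。下面给出一个用NumPy手写的最小单通道2D卷积示意(valid模式、步长为1、无padding),只为说明卷积"滑动窗口 + 逐元素相乘求和"的计算过程,并非高效实现;其中的边缘检测核仅作演示。
import numpy as np

def conv2d_single_channel(image, kernel):
    """最小的单通道2D卷积示意:valid模式,步长为1"""
    ih, iw = image.shape
    kh, kw = kernel.shape
    oh, ow = ih - kh + 1, iw - kw + 1
    output = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            # 取出与卷积核对齐的局部区域,逐元素相乘后求和
            region = image[i:i+kh, j:j+kw]
            output[i, j] = np.sum(region * kernel)
    return output

# 用一个简单的竖直边缘检测核做演示
image = np.random.rand(6, 6)
edge_kernel = np.array([[1, 0, -1],
                        [1, 0, -1],
                        [1, 0, -1]])
feature_map = conv2d_single_channel(image, edge_kernel)
print(f"输入形状: {image.shape}, 输出形状: {feature_map.shape}")  # (6, 6) -> (4, 4)
输出尺寸为 (H - K + 1) × (W - K + 1),与上面summary()中的形状计算一致。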
3. 循环神经网络(RNN)
class SimpleRNN:
"""实现简化的循环神经网络"""
def __init__(self, input_size, hidden_size, output_size):
"""初始化RNN"""
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
# 初始化参数
# 输入到隐藏层的权重
self.W_xh = np.random.randn(input_size, hidden_size) * 0.01
# 隐藏层到隐藏层的权重
self.W_hh = np.random.randn(hidden_size, hidden_size) * 0.01
# 隐藏层到输出层的权重
self.W_hy = np.random.randn(hidden_size, output_size) * 0.01
# 偏置项
self.b_h = np.zeros(hidden_size)
self.b_y = np.zeros(output_size)
# 缓存
self.history = {}
def forward(self, X):
"""
前向传播
Args:
X: 输入序列,形状为(seq_length, batch_size, input_size)
Returns:
输出序列
"""
seq_length, batch_size, _ = X.shape
# 初始化隐藏状态
h = np.zeros((batch_size, self.hidden_size))
# 存储历史值
self.history['h_states'] = [h]
self.history['inputs'] = X
# 存储输出
outputs = []
for t in range(seq_length):
# 当前时间步的输入
x_t = X[t]
# 更新隐藏状态
h = np.tanh(np.dot(x_t, self.W_xh) + np.dot(h, self.W_hh) + self.b_h)
self.history['h_states'].append(h)
# 计算输出
y_t = np.dot(h, self.W_hy) + self.b_y
outputs.append(y_t)
# 堆叠所有时间步的输出
return np.stack(outputs)
def backward(self, d_outputs):
"""反向传播(简化版)"""
# 在实际实现中,这里会有详细的反向传播计算
# 这里我们返回梯度的简化版本
gradients = {
'dW_xh': np.random.randn(*self.W_xh.shape) * 0.01,
'dW_hh': np.random.randn(*self.W_hh.shape) * 0.01,
'dW_hy': np.random.randn(*self.W_hy.shape) * 0.01,
'db_h': np.random.randn(*self.b_h.shape) * 0.01,
'db_y': np.random.randn(*self.b_y.shape) * 0.01
}
return gradients
def summary(self):
"""打印网络结构信息"""
print("=" * 50)
print("循环神经网络结构摘要")
print("=" * 50)
print(f"输入大小: {self.input_size}")
print(f"隐藏层大小: {self.hidden_size}")
print(f"输出大小: {self.output_size}")
print(f"总参数数: {self.W_xh.size + self.W_hh.size + self.W_hy.size + self.b_h.size + self.b_y.size:,}")
print("=" * 50)
def generate_sequence(self, seed, length=20):
"""生成序列(简化版)"""
# 在实际实现中,这里会有序列生成逻辑
generated = [seed]
h = np.zeros((1, self.hidden_size))
for i in range(length):
# 简化的序列生成
x = generated[-1]
h = np.tanh(np.dot(x, self.W_xh) + np.dot(h, self.W_hh) + self.b_h)
y = np.dot(h, self.W_hy) + self.b_y
# 添加一些随机性
next_item = y + np.random.randn(*y.shape) * 0.1
generated.append(next_item)
return np.stack(generated)
# 创建并测试RNN
rnn = SimpleRNN(input_size=10, hidden_size=20, output_size=5)
rnn.summary()
# 测试前向传播
seq_length = 15
batch_size = 8
X_test = np.random.randn(seq_length, batch_size, 10)
output = rnn.forward(X_test)
print(f"\n输入形状: {X_test.shape}")
print(f"输出形状: {output.shape}")
四、深度学习框架比较
1. 主要框架对比
import pandas as pd
# 创建框架对比表格
frameworks_data = {
'框架': ['TensorFlow', 'PyTorch', 'Keras', 'MXNet', 'JAX', 'PaddlePaddle'],
'发布年份': [2015, 2016, 2015, 2015, 2018, 2016],
'开发者': ['Google', 'Facebook', 'François Chollet', 'Amazon', 'Google', '百度'],
'主要语言': ['Python/C++', 'Python/C++', 'Python', 'Python/C++', 'Python', 'Python'],
'易用性': [3, 5, 5, 3, 4, 4],
'灵活性': [5, 5, 3, 5, 5, 4],
'部署能力': [5, 4, 3, 5, 4, 5],
'社区规模': [5, 5, 4, 3, 3, 3],
'主要优势': [
'生产部署、生态完善',
'动态图、研究友好',
'API简洁、快速原型',
'分布式训练、多语言',
'函数式、自动微分',
'中文文档、国产框架'
]
}
frameworks_df = pd.DataFrame(frameworks_data)
# 格式化输出
print("深度学习框架对比")
print("=" * 100)
print(frameworks_df.to_string(index=False))
print("\n" + "=" * 100)
# 创建选择指南
print("\n选择指南:")
print("1. TensorFlow: 适合生产部署,企业级应用,需要强大生态系统")
print("2. PyTorch: 适合学术研究,快速原型开发,动态计算图")
print("3. Keras: 适合初学者,快速上手,高级API")
print("4. MXNet: 适合分布式训练,多语言支持")
print("5. JAX: 适合函数式编程,数值计算,自动微分")
print("6. PaddlePaddle: 国产框架,中文文档丰富,工业级应用")
2. 框架安装与简单示例
def get_framework_setup_guide():
"""获取框架安装和使用指南"""
guides = {
'TensorFlow': {
'安装': 'pip install tensorflow',
'导入': 'import tensorflow as tf',
'简单示例': '''# 创建简单的神经网络
model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])''',
'最新版本': '2.x (2023年)'
},
'PyTorch': {
'安装': 'pip install torch torchvision',
'导入': 'import torch',
'简单示例': '''# 创建简单的神经网络
class Net(torch.nn.Module):
def __init__(self):
super().__init__()
self.fc1 = torch.nn.Linear(784, 64)
self.fc2 = torch.nn.Linear(64, 10)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = self.fc2(x)
return x''',
'最新版本': '2.0+ (2023年)'
},
'Keras': {
'安装': 'pip install keras',
'导入': 'from keras import layers, models',
'简单示例': '''# 创建简单的神经网络
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(784,)))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])''',
'最新版本': '3.0 (2023年)'
}
}
return guides
# 打印框架指南
guides = get_framework_setup_guide()
for framework, info in guides.items():
print(f"\n{'='*60}")
print(f"{framework} 快速指南")
print(f"{'='*60}")
for key, value in info.items():
print(f"{key}: {value}")
五、深度学习的关键概念
1. 损失函数
import numpy as np
import matplotlib.pyplot as plt
class LossFunctions:
"""实现常见的损失函数"""
@staticmethod
def mse(y_true, y_pred):
"""均方误差 (Mean Squared Error)"""
return np.mean((y_true - y_pred) ** 2)
@staticmethod
def mae(y_true, y_pred):
"""平均绝对误差 (Mean Absolute Error)"""
return np.mean(np.abs(y_true - y_pred))
@staticmethod
def binary_crossentropy(y_true, y_pred, epsilon=1e-7):
"""二分类交叉熵"""
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
@staticmethod
def categorical_crossentropy(y_true, y_pred, epsilon=1e-7):
"""多分类交叉熵"""
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
return -np.mean(np.sum(y_true * np.log(y_pred), axis=-1))
@staticmethod
def huber_loss(y_true, y_pred, delta=1.0):
"""Huber损失 (结合MSE和MAE的优点)"""
error = y_true - y_pred
abs_error = np.abs(error)
quadratic = np.minimum(abs_error, delta)
linear = abs_error - quadratic
return np.mean(0.5 * quadratic ** 2 + delta * linear)
@staticmethod
def contrastive_loss(y_true, y_pred, margin=1.0):
"""对比损失 (用于度量学习)"""
positive_distance = y_true * y_pred ** 2
negative_distance = (1 - y_true) * np.maximum(margin - y_pred, 0) ** 2
return np.mean(positive_distance + negative_distance)
@staticmethod
def visualize_loss_functions():
"""可视化损失函数"""
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
x = np.linspace(-3, 3, 100)
y_true = 0 # 假设真实值为0
# MSE
y_pred = x
loss = (y_true - y_pred) ** 2
axes[0].plot(x, loss, 'b-', linewidth=2)
axes[0].set_title('MSE (均方误差)', fontweight='bold')
axes[0].set_xlabel('预测值')
axes[0].set_ylabel('损失')
axes[0].grid(True, alpha=0.3)
# MAE
loss = np.abs(y_true - y_pred)
axes[1].plot(x, loss, 'r-', linewidth=2)
axes[1].set_title('MAE (平均绝对误差)', fontweight='bold')
axes[1].set_xlabel('预测值')
axes[1].grid(True, alpha=0.3)
# Huber Loss
delta = 1.0
loss = np.where(np.abs(x) <= delta, 0.5 * x ** 2, delta * (np.abs(x) - 0.5 * delta))
axes[2].plot(x, loss, 'g-', linewidth=2)
axes[2].set_title('Huber损失', fontweight='bold')
axes[2].set_xlabel('预测值')
axes[2].grid(True, alpha=0.3)
# Binary Crossentropy (假设真实值为1)
y_true_binary = 1
y_pred_binary = 1 / (1 + np.exp(-x)) # Sigmoid变换
loss = - (y_true_binary * np.log(y_pred_binary) +
(1 - y_true_binary) * np.log(1 - y_pred_binary))
axes[3].plot(x, loss, 'm-', linewidth=2)
axes[3].set_title('二分类交叉熵 (y_true=1)', fontweight='bold')
axes[3].set_xlabel('预测值 (z)')
axes[3].grid(True, alpha=0.3)
# 对比损失
distance = np.abs(x)
margin = 1.0
y_true_contrastive = np.ones_like(x) # 假设是正样本对
loss = y_true_contrastive * distance ** 2
axes[4].plot(x, loss, 'c-', linewidth=2, label='正样本')
y_true_contrastive = np.zeros_like(x) # 假设是负样本对
loss = (1 - y_true_contrastive) * np.maximum(margin - distance, 0) ** 2
axes[4].plot(x, loss, 'y-', linewidth=2, label='负样本')
axes[4].set_title('对比损失', fontweight='bold')
axes[4].set_xlabel('距离')
axes[4].legend()
axes[4].grid(True, alpha=0.3)
# 损失函数应用场景
axes[5].axis('off')
axes[5].text(0.5, 0.5,
'损失函数选择指南:\n\n'
'• MSE: 回归问题,对异常值敏感\n'
'• MAE: 回归问题,对异常值鲁棒\n'
'• Huber: 结合MSE和MAE的优点\n'
'• Binary CE: 二分类问题\n'
'• Categorical CE: 多分类问题\n'
'• Contrastive: 度量学习,相似度计算',
ha='center', va='center', fontsize=11,
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
plt.suptitle('深度学习常用损失函数', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
# 测试损失函数
loss_funcs = LossFunctions()
# 测试数据
y_true = np.array([1, 0, 1, 0])
y_pred = np.array([0.9, 0.2, 0.8, 0.3])
print("损失函数计算结果:")
print(f"MSE: {loss_funcs.mse(y_true, y_pred):.4f}")
print(f"MAE: {loss_funcs.mae(y_true, y_pred):.4f}")
print(f"Binary Crossentropy: {loss_funcs.binary_crossentropy(y_true, y_pred):.4f}")
# 可视化损失函数
LossFunctions.visualize_loss_functions()
2. 优化器
class OptimizerComparison:
"""优化器比较和可视化"""
@staticmethod
def visualize_optimization_path():
"""可视化不同优化器的优化路径"""
# 定义测试函数 (Rosenbrock函数,有全局最小值)
def rosenbrock(x, y):
return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
# 生成网格
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = rosenbrock(X, Y)
# 优化器模拟
optimizers = {
'SGD': {
'lr': 0.01,
'momentum': 0.0
},
'SGD with Momentum': {
'lr': 0.01,
'momentum': 0.9
},
'Adam': {
'lr': 0.01,
'beta1': 0.9,
'beta2': 0.999
},
'RMSprop': {
'lr': 0.01,
'rho': 0.9
},
'Adagrad': {
'lr': 0.1
}
}
# 创建图形
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
for idx, (name, params) in enumerate(optimizers.items()):
ax = axes[idx]
# 绘制等高线
ax.contour(X, Y, Z, levels=np.logspace(-1, 3, 20), alpha=0.5)
# 初始化参数
x_pos, y_pos = -1.5, 2.5
# 存储轨迹
trajectory = [(x_pos, y_pos)]
# 模拟优化过程
for step in range(100):
                # 计算Rosenbrock函数的梯度
                grad_x = -2 * (1 - x_pos) - 400 * x_pos * (y_pos - x_pos ** 2)
                grad_y = 200 * (y_pos - x_pos ** 2)
                # 梯度裁剪,避免SGD等在陡峭区域数值发散
                grad_x = np.clip(grad_x, -10, 10)
                grad_y = np.clip(grad_y, -10, 10)
                # 应用不同优化器更新规则
                if name == 'SGD':
                    x_pos -= params['lr'] * grad_x
                    y_pos -= params['lr'] * grad_y
                elif name == 'SGD with Momentum':
                    # 简化的动量实现
                    if step == 0:
                        vx, vy = 0, 0
                    vx = params['momentum'] * vx + params['lr'] * grad_x
                    vy = params['momentum'] * vy + params['lr'] * grad_y
                    x_pos -= vx
                    y_pos -= vy
                elif name == 'Adam':
                    # 简化的Adam实现
                    if step == 0:
                        m1x, m1y = 0, 0
                        m2x, m2y = 0, 0
                    m1x = params['beta1'] * m1x + (1 - params['beta1']) * grad_x
                    m1y = params['beta1'] * m1y + (1 - params['beta1']) * grad_y
                    m2x = params['beta2'] * m2x + (1 - params['beta2']) * grad_x ** 2
                    m2y = params['beta2'] * m2y + (1 - params['beta2']) * grad_y ** 2
                    # 偏置校正
                    m1x_hat = m1x / (1 - params['beta1'] ** (step + 1))
                    m1y_hat = m1y / (1 - params['beta1'] ** (step + 1))
                    m2x_hat = m2x / (1 - params['beta2'] ** (step + 1))
                    m2y_hat = m2y / (1 - params['beta2'] ** (step + 1))
                    x_pos -= params['lr'] * m1x_hat / (np.sqrt(m2x_hat) + 1e-8)
                    y_pos -= params['lr'] * m1y_hat / (np.sqrt(m2y_hat) + 1e-8)
                elif name == 'RMSprop':
                    # 简化的RMSprop实现:维护梯度平方的滑动平均
                    if step == 0:
                        sx, sy = 0, 0
                    sx = params['rho'] * sx + (1 - params['rho']) * grad_x ** 2
                    sy = params['rho'] * sy + (1 - params['rho']) * grad_y ** 2
                    x_pos -= params['lr'] * grad_x / (np.sqrt(sx) + 1e-8)
                    y_pos -= params['lr'] * grad_y / (np.sqrt(sy) + 1e-8)
                elif name == 'Adagrad':
                    # 简化的Adagrad实现:累积历史梯度平方
                    if step == 0:
                        gx_acc, gy_acc = 0, 0
                    gx_acc += grad_x ** 2
                    gy_acc += grad_y ** 2
                    x_pos -= params['lr'] * grad_x / (np.sqrt(gx_acc) + 1e-8)
                    y_pos -= params['lr'] * grad_y / (np.sqrt(gy_acc) + 1e-8)
trajectory.append((x_pos, y_pos))
# 绘制轨迹
trajectory = np.array(trajectory)
ax.plot(trajectory[:, 0], trajectory[:, 1], 'ro-', linewidth=2, markersize=3)
ax.plot(trajectory[0, 0], trajectory[0, 1], 'go', markersize=8, label='起点')
ax.plot(trajectory[-1, 0], trajectory[-1, 1], 'bo', markersize=8, label='终点')
ax.set_title(name, fontweight='bold')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend()
ax.grid(True, alpha=0.3)
# 优化器选择指南
axes[5].axis('off')
axes[5].text(0.5, 0.5,
'优化器选择指南:\n\n'
'• SGD: 简单,收敛慢,可能震荡\n'
'• SGD+Momentum: 减少震荡,加速收敛\n'
'• Adam: 自适应学习率,通常表现好\n'
'• RMSprop: 适合非平稳目标\n'
'• Adagrad: 适合稀疏数据\n\n'
'一般推荐: Adam (默认选择)',
ha='center', va='center', fontsize=11,
bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))
plt.suptitle('优化器比较:在Rosenbrock函数上的优化路径',
fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
@staticmethod
def optimizer_summary():
"""优化器特性总结"""
optimizers_info = {
'SGD': {
'公式': 'θ = θ - η·∇J(θ)',
'优点': '简单,理论基础强',
'缺点': '收敛慢,易震荡',
'适用场景': '凸优化问题'
},
'Momentum': {
'公式': 'v = βv + η·∇J(θ)\nθ = θ - v',
'优点': '加速收敛,减少震荡',
'缺点': '需要调节β参数',
'适用场景': '深度学习训练'
},
'Adam': {
'公式': '复杂,结合动量和自适应学习率',
'优点': '自适应学习率,通常表现优异',
'缺点': '内存占用稍大',
'适用场景': '深度学习(默认推荐)'
},
'RMSprop': {
'公式': 'E[g²] = ρE[g²] + (1-ρ)g²\nθ = θ - η·g/√(E[g²]+ε)',
'优点': '自适应学习率,适合非平稳目标',
'缺点': '需要调节ρ参数',
'适用场景': 'RNN训练'
},
'Adagrad': {
'公式': 'G = G + g⊙g\nθ = θ - η·g/√(G+ε)',
'优点': '自适应学习率,适合稀疏数据',
'缺点': '学习率单调递减',
'适用场景': '稀疏特征学习'
}
}
print("=" * 80)
print("深度学习优化器总结")
print("=" * 80)
for name, info in optimizers_info.items():
print(f"\n{name}:")
print(f" 公式: {info['公式']}")
print(f" 优点: {info['优点']}")
print(f" 缺点: {info['缺点']}")
print(f" 适用场景: {info['适用场景']}")
print("\n" + "=" * 80)
# 显示优化器信息
OptimizerComparison.optimizer_summary()
OptimizerComparison.visualize_optimization_path()
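上面的总结中,Adam的公式一栏只写了"复杂"。下面按Kingma & Ba (2015)论文中的更新规则给出一个最小的Adam更新函数示意,并用它最小化一个简单的二次函数;函数名adam_step与示例超参数均为本文自拟,仅作演示。
import numpy as np

def adam_step(theta, grad, state, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """对参数theta执行一步Adam更新,state保存一阶/二阶矩和步数"""
    state['t'] += 1
    state['m'] = beta1 * state['m'] + (1 - beta1) * grad       # 一阶矩(动量)
    state['v'] = beta2 * state['v'] + (1 - beta2) * grad ** 2  # 二阶矩(自适应学习率)
    m_hat = state['m'] / (1 - beta1 ** state['t'])             # 偏置校正
    v_hat = state['v'] / (1 - beta2 ** state['t'])
    return theta - lr * m_hat / (np.sqrt(v_hat) + eps)

# 用Adam最小化 f(θ) = ||θ||²,其梯度为 2θ
theta = np.array([3.0, -2.0])
state = {'m': np.zeros_like(theta), 'v': np.zeros_like(theta), 't': 0}
for i in range(2000):
    grad = 2 * theta
    theta = adam_step(theta, grad, state, lr=0.05)
print(f"优化后的θ: {theta}")  # 应接近 [0, 0]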
六、深度学习训练技巧
1. 正则化技术
class RegularizationTechniques:
"""深度学习正则化技术"""
@staticmethod
def l1_regularization(weights, lambda_l1):
"""L1正则化 (Lasso)"""
return lambda_l1 * np.sum(np.abs(weights))
@staticmethod
def l2_regularization(weights, lambda_l2):
"""L2正则化 (Ridge)"""
return lambda_l2 * np.sum(weights ** 2)
@staticmethod
def elastic_net(weights, lambda_l1, lambda_l2):
"""弹性网络 (结合L1和L2)"""
return (lambda_l1 * np.sum(np.abs(weights)) +
lambda_l2 * np.sum(weights ** 2))
@staticmethod
def dropout(activations, dropout_rate, training=True):
"""Dropout正则化"""
if not training:
return activations
# 生成Dropout掩码
mask = np.random.binomial(1, 1 - dropout_rate, size=activations.shape)
# 应用Dropout并缩放
activations = activations * mask / (1 - dropout_rate)
return activations
@staticmethod
def batch_normalization(x, gamma=1, beta=0, epsilon=1e-5):
"""批量归一化 (简化版)"""
# 计算批次的均值和方差
mean = np.mean(x, axis=0)
variance = np.var(x, axis=0)
# 归一化
x_norm = (x - mean) / np.sqrt(variance + epsilon)
# 缩放和偏移
return gamma * x_norm + beta
@staticmethod
def data_augmentation_examples():
"""数据增强示例"""
techniques = {
'图像数据增强': [
'随机旋转 (±30度)',
'随机缩放 (0.8-1.2倍)',
'随机裁剪',
'随机水平翻转',
'颜色抖动 (亮度、对比度、饱和度)',
'随机噪声添加'
],
'文本数据增强': [
'同义词替换',
'随机插入',
'随机交换',
'随机删除',
'回译 (翻译成其他语言再译回)',
'EDA (Easy Data Augmentation)'
],
'时间序列增强': [
'时间扭曲',
'窗口滑动',
'随机缩放',
'添加噪声',
'通道混洗 (多变量时)'
]
}
print("数据增强技术:")
print("=" * 60)
for category, methods in techniques.items():
print(f"\n{category}:")
for method in methods:
print(f" • {method}")
print("\n" + "=" * 60)
@staticmethod
def early_stopping_callback(patience=10, min_delta=0.001):
"""早停回调函数"""
class EarlyStopping:
def __init__(self, patience=patience, min_delta=min_delta):
self.patience = patience
self.min_delta = min_delta
self.best_loss = float('inf')
self.counter = 0
self.should_stop = False
def __call__(self, current_loss):
if current_loss < self.best_loss - self.min_delta:
self.best_loss = current_loss
self.counter = 0
print(f"损失改善: {current_loss:.4f}")
return False
else:
self.counter += 1
print(f"早停计数: {self.counter}/{self.patience}")
if self.counter >= self.patience:
self.should_stop = True
print("达到早停条件,停止训练")
return self.should_stop
return EarlyStopping()
# 测试正则化技术
reg = RegularizationTechniques()
# 测试Dropout
activations = np.array([[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0]])
dropout_rate = 0.5
print("Dropout示例:")
print(f"原始激活值:\n{activations}")
print(f"Dropout后 (训练模式):\n{reg.dropout(activations, dropout_rate, training=True)}")
print(f"Dropout后 (推理模式):\n{reg.dropout(activations, dropout_rate, training=False)}")
# 显示数据增强技术
reg.data_augmentation_examples()
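early_stopping_callback在上面定义后没有实际演示。下面用一条人工构造的、先下降后停滞的验证损失序列展示它的调用方式(损失数值为虚构,仅说明触发早停的过程)。
# 构造一条先下降、后不再改善的验证损失序列
fake_val_losses = [1.0, 0.8, 0.65, 0.6, 0.58, 0.58, 0.579, 0.578, 0.578, 0.578]

early_stopping = reg.early_stopping_callback(patience=3, min_delta=0.01)
for epoch, val_loss in enumerate(fake_val_losses):
    print(f"轮次 {epoch+1}: 验证损失 = {val_loss:.4f}")
    if early_stopping(val_loss):
        print(f"在第 {epoch+1} 轮提前停止训练")
        break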
2. 超参数调优
class HyperparameterTuning:
"""深度学习超参数调优"""
@staticmethod
def learning_rate_scheduler():
"""学习率调度器"""
schedulers = {
'固定学习率': {
'描述': '整个训练过程使用固定学习率',
'适用场景': '简单任务,小数据集',
'代码示例': 'lr = 0.001'
},
'阶梯下降': {
'描述': '在指定轮次降低学习率',
'适用场景': '大多数深度学习任务',
                '代码示例': '''if epoch > 0 and epoch % 30 == 0:
    lr *= 0.1'''
},
'余弦退火': {
'描述': '学习率按余弦函数从高到低变化',
'适用场景': '需要跳出局部最优的任务',
'公式': 'lr = lr_min + 0.5*(lr_max-lr_min)*(1+cos(epoch/T_max*π))'
},
'循环学习率': {
'描述': '学习率在最小和最大值之间循环变化',
'适用场景': '提高模型泛化能力',
'代码示例': '''cycle = epoch % cycle_length
lr = lr_min + 0.5*(lr_max-lr_min)*(1+cos(cycle/cycle_length*π))'''
},
'热重启': {
'描述': '周期性重启学习率,每次重启后逐渐降低峰值',
'适用场景': '复杂任务,需要精细调优',
'优势': '结合大范围探索和精细调优'
}
}
print("学习率调度策略:")
print("=" * 80)
for name, info in schedulers.items():
print(f"\n{name}:")
print(f" 描述: {info['描述']}")
print(f" 适用场景: {info['适用场景']}")
if '公式' in info:
print(f" 公式: {info['公式']}")
if '代码示例' in info:
print(f" 代码示例: {info['代码示例']}")
print("\n" + "=" * 80)
@staticmethod
def visualize_learning_rates():
"""可视化不同学习率调度策略"""
epochs = 100
# 不同调度策略
strategies = {
'固定学习率': [0.001] * epochs,
'阶梯下降': [],
'指数衰减': [],
'余弦退火': [],
'循环学习率': []
}
# 生成学习率序列
for epoch in range(epochs):
# 阶梯下降 (每30轮降低10倍)
lr = 0.001
if epoch >= 30:
lr *= 0.1
if epoch >= 60:
lr *= 0.1
strategies['阶梯下降'].append(lr)
# 指数衰减
strategies['指数衰减'].append(0.001 * np.exp(-0.05 * epoch))
# 余弦退火
T_max = 50
lr_min = 0.0001
lr_max = 0.01
strategies['余弦退火'].append(
lr_min + 0.5 * (lr_max - lr_min) *
(1 + np.cos(epoch / T_max * np.pi))
)
# 循环学习率
cycle_length = 20
cycle = epoch % cycle_length
strategies['循环学习率'].append(
0.0001 + 0.5 * (0.01 - 0.0001) *
(1 + np.cos(cycle / cycle_length * np.pi))
)
# 绘制图形
plt.figure(figsize=(12, 8))
        for name, lr_sequence in strategies.items():
            plt.plot(lr_sequence, linewidth=2, label=name)
plt.xlabel('训练轮次', fontsize=12)
plt.ylabel('学习率', fontsize=12)
plt.title('不同学习率调度策略比较', fontsize=16, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.yscale('log') # 对数尺度
plt.tight_layout()
plt.show()
@staticmethod
def hyperparameter_search_space():
"""深度学习超参数搜索空间"""
search_space = {
'学习率': {
'范围': [1e-5, 1e-1],
'推荐值': [1e-3, 3e-4, 1e-4],
'搜索策略': '对数均匀采样',
'备注': '最重要的超参数'
},
'批量大小': {
'范围': [16, 256],
'推荐值': [32, 64, 128],
'搜索策略': '均匀采样 (2的幂次)',
'备注': 'GPU内存允许的情况下尽量大'
},
'网络深度': {
'范围': [2, 20],
'推荐值': [3, 5, 8, 12],
'搜索策略': '均匀采样',
'备注': '根据任务复杂度选择'
},
'Dropout率': {
'范围': [0.0, 0.5],
'推荐值': [0.2, 0.3, 0.5],
'搜索策略': '均匀采样',
'备注': '正则化强度,防止过拟合'
},
'权重衰减': {
'范围': [0.0, 0.1],
'推荐值': [1e-4, 1e-5, 0.0],
'搜索策略': '对数均匀采样',
'备注': 'L2正则化强度'
},
'优化器': {
'选项': ['Adam', 'SGD', 'RMSprop', 'Adagrad'],
'推荐值': 'Adam',
'搜索策略': '类别采样',
'备注': 'Adam通常是默认选择'
}
}
print("深度学习超参数搜索空间:")
print("=" * 100)
for param, info in search_space.items():
print(f"\n{param}:")
for key, value in info.items():
print(f" {key}: {value}")
print("\n" + "=" * 100)
print("\n调优策略建议:")
print("1. 先调学习率,固定其他参数")
print("2. 然后调批量大小和网络结构")
print("3. 最后调正则化相关参数")
print("4. 使用贝叶斯优化或随机搜索")
print("5. 早停防止过拟合")
# 显示超参数调优信息
tuning = HyperparameterTuning()
tuning.hyperparameter_search_space()
tuning.learning_rate_scheduler()
tuning.visualize_learning_rates()
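上面的调优策略建议中提到了"对数均匀采样"和"随机搜索",下面用一个最小的随机搜索示意把两者串起来:从前述搜索空间随机采样若干组超参数,并用一个虚构的评分函数代替真实的训练与验证(sample_hyperparameters、fake_validation_score均为本文自拟,实际使用时应替换为完整的训练/评估流程)。
import numpy as np

def sample_hyperparameters(rng):
    """从搜索空间中随机采样一组超参数"""
    return {
        'learning_rate': 10 ** rng.uniform(-5, -1),        # 对数均匀采样
        'batch_size': int(2 ** rng.integers(4, 9)),        # 16~256之间2的幂次
        'dropout_rate': rng.uniform(0.0, 0.5),
        'optimizer': str(rng.choice(['Adam', 'SGD', 'RMSprop']))
    }

def fake_validation_score(params):
    """虚构的评分函数,实际中应训练模型并返回验证集准确率"""
    return float(np.random.rand())

rng = np.random.default_rng(42)
best_score, best_params = -np.inf, None
for trial in range(10):
    params = sample_hyperparameters(rng)
    score = fake_validation_score(params)
    if score > best_score:
        best_score, best_params = score, params
print(f"最佳得分: {best_score:.4f}")
print(f"最佳超参数: {best_params}")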
七、实践项目:手写数字识别
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
class MNISTDigitRecognition:
"""手写数字识别实践项目"""
def __init__(self):
"""初始化"""
self.X_train = None
self.y_train = None
self.X_test = None
self.y_test = None
self.model = None
def load_data(self):
"""加载MNIST数据集"""
print("加载MNIST数据集...")
        # 使用fetch_openml加载MNIST数据集(as_frame=False直接返回NumPy数组,便于后面按行索引)
        mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
        X = mnist.data.astype('float32') / 255.0  # 像素值归一化到[0, 1]
        y = mnist.target.astype(int)
# 划分训练集和测试集
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"训练集大小: {self.X_train.shape}")
print(f"测试集大小: {self.X_test.shape}")
print(f"类别分布: {np.bincount(self.y_train)}")
return self
def visualize_samples(self, n_samples=10):
"""可视化样本"""
plt.figure(figsize=(15, 6))
for i in range(n_samples):
plt.subplot(2, n_samples//2, i+1)
image = self.X_train[i].reshape(28, 28)
plt.imshow(image, cmap='gray')
plt.title(f"标签: {self.y_train[i]}")
plt.axis('off')
plt.suptitle('MNIST手写数字样本', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
def create_simple_model(self):
"""创建简单的神经网络模型"""
class SimpleNN:
"""简单的全连接神经网络"""
def __init__(self, input_size=784, hidden_size=128, output_size=10):
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
# 初始化参数
self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
self.b1 = np.zeros(hidden_size)
self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
self.b2 = np.zeros(output_size)
# 缓存
self.cache = {}
def relu(self, x):
"""ReLU激活函数"""
return np.maximum(0, x)
def softmax(self, x):
"""Softmax激活函数"""
exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def forward(self, X):
"""前向传播"""
# 第一层
z1 = np.dot(X, self.W1) + self.b1
a1 = self.relu(z1)
# 第二层
z2 = np.dot(a1, self.W2) + self.b2
a2 = self.softmax(z2)
# 缓存中间结果
self.cache = {'X': X, 'z1': z1, 'a1': a1, 'z2': z2, 'a2': a2}
return a2
def backward(self, X, y, learning_rate=0.01):
"""反向传播"""
m = X.shape[0]
# 从缓存中获取前向传播的结果
z1 = self.cache['z1']
a1 = self.cache['a1']
a2 = self.cache['a2']
# 将y转换为one-hot编码
y_onehot = np.zeros((m, self.output_size))
y_onehot[np.arange(m), y] = 1
# 计算输出层的梯度
dz2 = a2 - y_onehot
dW2 = np.dot(a1.T, dz2) / m
db2 = np.sum(dz2, axis=0) / m
# 计算隐藏层的梯度
da1 = np.dot(dz2, self.W2.T)
dz1 = da1 * (z1 > 0) # ReLU的梯度
dW1 = np.dot(X.T, dz1) / m
db1 = np.sum(dz1, axis=0) / m
# 更新参数
self.W2 -= learning_rate * dW2
self.b2 -= learning_rate * db2
self.W1 -= learning_rate * dW1
self.b1 -= learning_rate * db1
def predict(self, X):
"""预测"""
probas = self.forward(X)
return np.argmax(probas, axis=1)
def evaluate(self, X, y):
"""评估模型"""
y_pred = self.predict(X)
accuracy = np.mean(y_pred == y)
return accuracy
def summary(self):
"""打印模型信息"""
print("=" * 50)
print("简单神经网络模型")
print("=" * 50)
print(f"输入大小: {self.input_size}")
print(f"隐藏层大小: {self.hidden_size}")
print(f"输出大小: {self.output_size}")
total_params = (self.W1.size + self.b1.size +
self.W2.size + self.b2.size)
print(f"总参数数: {total_params:,}")
print("=" * 50)
self.model = SimpleNN()
return self.model
def train_model(self, epochs=10, batch_size=64, learning_rate=0.01):
"""训练模型"""
n_samples = self.X_train.shape[0]
n_batches = n_samples // batch_size
print(f"开始训练...")
print(f"训练样本数: {n_samples}")
print(f"批次大小: {batch_size}")
print(f"批次数: {n_batches}")
print(f"训练轮次: {epochs}")
train_losses = []
train_accuracies = []
test_accuracies = []
for epoch in range(epochs):
epoch_loss = 0
epoch_accuracy = 0
# 打乱数据
indices = np.random.permutation(n_samples)
X_shuffled = self.X_train[indices]
y_shuffled = self.y_train[indices]
for batch in range(n_batches):
# 获取当前批次数据
start = batch * batch_size
end = start + batch_size
X_batch = X_shuffled[start:end]
y_batch = y_shuffled[start:end]
# 前向传播
y_pred = self.model.forward(X_batch)
# 计算损失(交叉熵)
m = X_batch.shape[0]
y_onehot = np.zeros((m, 10))
y_onehot[np.arange(m), y_batch] = 1
loss = -np.mean(np.sum(y_onehot * np.log(y_pred + 1e-8), axis=1))
epoch_loss += loss
# 计算准确率
batch_pred = np.argmax(y_pred, axis=1)
batch_acc = np.mean(batch_pred == y_batch)
epoch_accuracy += batch_acc
# 反向传播和参数更新
self.model.backward(X_batch, y_batch, learning_rate)
# 计算平均损失和准确率
avg_loss = epoch_loss / n_batches
avg_accuracy = epoch_accuracy / n_batches
# 测试集准确率
test_acc = self.model.evaluate(self.X_test, self.y_test)
train_losses.append(avg_loss)
train_accuracies.append(avg_accuracy)
test_accuracies.append(test_acc)
print(f"轮次 {epoch+1}/{epochs}: "
f"训练损失={avg_loss:.4f}, "
f"训练准确率={avg_accuracy:.4f}, "
f"测试准确率={test_acc:.4f}")
# 可视化训练过程
self.plot_training_history(train_losses, train_accuracies, test_accuracies)
return train_losses, train_accuracies, test_accuracies
def plot_training_history(self, train_losses, train_accuracies, test_accuracies):
"""绘制训练历史"""
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# 损失曲线
axes[0].plot(train_losses, 'b-', linewidth=2, label='训练损失')
axes[0].set_xlabel('训练轮次')
axes[0].set_ylabel('损失')
axes[0].set_title('训练损失曲线')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# 准确率曲线
axes[1].plot(train_accuracies, 'g-', linewidth=2, label='训练准确率')
axes[1].plot(test_accuracies, 'r-', linewidth=2, label='测试准确率')
axes[1].set_xlabel('训练轮次')
axes[1].set_ylabel('准确率')
axes[1].set_title('准确率曲线')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.suptitle('MNIST手写数字识别训练历史', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
def show_predictions(self, n_samples=15):
"""展示预测结果"""
# 随机选择测试样本
indices = np.random.choice(len(self.X_test), n_samples, replace=False)
X_sample = self.X_test[indices]
y_sample = self.y_test[indices]
# 预测
y_pred = self.model.predict(X_sample)
# 可视化
plt.figure(figsize=(15, 10))
n_cols = 5
n_rows = int(np.ceil(n_samples / n_cols))
for i, idx in enumerate(indices):
plt.subplot(n_rows, n_cols, i+1)
image = X_sample[i].reshape(28, 28)
plt.imshow(image, cmap='gray')
# 标记正确/错误
is_correct = y_pred[i] == y_sample[i]
color = 'green' if is_correct else 'red'
plt.title(f"真实: {y_sample[i]}\n预测: {y_pred[i]}", color=color)
plt.axis('off')
accuracy = np.mean(y_pred == y_sample)
plt.suptitle(f'预测结果 (准确率: {accuracy:.2%})', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
# 打印混淆矩阵(简化)
print("\n预测结果统计:")
print(f"样本数: {n_samples}")
print(f"正确数: {np.sum(y_pred == y_sample)}")
print(f"错误数: {np.sum(y_pred != y_sample)}")
print(f"准确率: {accuracy:.2%}")
# 运行MNIST手写数字识别项目
mnist_project = MNISTDigitRecognition()
# 加载数据
mnist_project.load_data()
# 可视化样本
mnist_project.visualize_samples(10)
# 创建模型
model = mnist_project.create_simple_model()
model.summary()
# 训练模型
train_losses, train_accuracies, test_accuracies = mnist_project.train_model(
epochs=20,
batch_size=128,
learning_rate=0.01
)
# 展示预测结果
mnist_project.show_predictions(15)
深度学习正在改变世界。从图像识别到自然语言处理,从自动驾驶到医疗诊断,深度学习的应用无处不在。虽然深度学习技术看起来很复杂,但通过系统的学习和实践,你也可以掌握这项强大的技术。