Adapter微调

什么是Adapter?

Adapter是一种参数高效的微调方法,通过在预训练模型的层之间插入小型神经网络模块来实现任务适配,而不修改原始模型参数。

核心架构

Adapter模块设计

import torch
import torch.nn as nn
 
class AdapterLayer(nn.Module):
    def __init__(self, hidden_size, adapter_size, activation="relu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.adapter_size = adapter_size
        
        # 下投影:降维
        self.down_project = nn.Linear(hidden_size, adapter_size)
        
        # 激活函数
        if activation == "relu":
            self.activation = nn.ReLU()
        elif activation == "gelu":
            self.activation = nn.GELU()
        else:
            self.activation = nn.Identity()
        
        # 上投影:恢复维度
        self.up_project = nn.Linear(adapter_size, hidden_size)
        
        # 初始化:接近恒等映射
        nn.init.zeros_(self.up_project.weight)
        nn.init.zeros_(self.up_project.bias)
    
    def forward(self, x):
        # Adapter前向传播:降维 → 激活 → 升维
        adapter_output = self.down_project(x)
        adapter_output = self.activation(adapter_output)
        adapter_output = self.up_project(adapter_output)
        
        # 残差连接
        return x + adapter_output
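
一个简单的使用示例(隐藏维度取768、瓶颈维度取48,仅为演示取值),可以验证Adapter不改变张量形状,且由于up_project零初始化,初始时等价于恒等映射:

# 使用示例:验证形状保持与近似恒等初始化(维度取值仅为演示)
adapter = AdapterLayer(hidden_size=768, adapter_size=48)
x = torch.randn(2, 16, 768)        # (batch, seq_len, hidden_size)
out = adapter(x)

print(out.shape)                   # torch.Size([2, 16, 768])
print(torch.allclose(out, x))      # True:up_project零初始化,初始输出等于输入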

瓶颈架构

Adapter采用瓶颈(bottleneck)设计:

输入(d) → 下投影(d→r) → 激活函数 → 上投影(r→d) → 残差连接 → 输出(d)

其中 r ≪ d(r为瓶颈维度,d为隐藏维度)。每个Adapter模块只引入约 2dr + r + d 个参数,远小于原始层的 O(d²)级参数量,估算示例见下方代码。
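
以BERT-base为例做一个粗略估算(d=768,取r=48即降维因子16;数字为按上式得到的近似值):

# 粗略估算:单个Adapter的参数量及其占BERT-base(约1.1亿参数)的比例
d, r = 768, 48
adapter_params = (d * r + r) + (r * d + d)   # 下投影 + 上投影(均含偏置),约7.5万
total = adapter_params * 12                  # 每层插入一个,共12层,约90万
print(adapter_params, total / 110_000_000)   # 74544 约0.8%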

插入策略

Transformer中的插入位置

class TransformerWithAdapter(nn.Module):
    def __init__(self, transformer_layer, adapter_config):
        super().__init__()
        self.transformer_layer = transformer_layer
        
        # 在不同位置插入Adapter
        self.adapter_after_attention = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )
        
        self.adapter_after_ffn = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )
    
    def forward(self, x, attention_mask=None):
        # 假设transformer_layer暴露attention与ffn两个子模块;为突出Adapter的插入位置,
        # 这里省略了原始Transformer子层的残差连接与LayerNorm
        attention_output = self.transformer_layer.attention(x, attention_mask)
        
        # 第一个Adapter:插在注意力子层输出之后
        x = self.adapter_after_attention(attention_output)
        
        # 原始FFN
        ffn_output = self.transformer_layer.ffn(x)
        
        # 第二个Adapter
        output = self.adapter_after_ffn(ffn_output)
        
        return output

插入位置选择

  1. 仅在FFN后:即Pfeiffer式配置,最常用,兼顾效果与参数量
  2. 仅在注意力后:适合以注意力调整为主的任务
  3. 双重插入:即Houlsby式配置,在注意力与FFN后各插一个,效果通常最好,但参数量翻倍
  4. 并行插入:Adapter分支与原始子层并行而非串行计算(示意代码见本列表之后)
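
下面给出“并行插入”的一个最小示意(非标准实现,假设 sublayer 是某个注意力或FFN子层,且只接收隐藏状态作为输入):

class ParallelAdapterBlock(nn.Module):
    """并行插入示意:Adapter分支与原始子层并行计算,输出相加(仅为说明)"""
    def __init__(self, sublayer, hidden_size, adapter_size):
        super().__init__()
        self.sublayer = sublayer                      # 原始注意力或FFN子层
        self.down_project = nn.Linear(hidden_size, adapter_size)
        self.up_project = nn.Linear(adapter_size, hidden_size)
        # 与串行Adapter相同,零初始化使并行分支初始输出为0
        nn.init.zeros_(self.up_project.weight)
        nn.init.zeros_(self.up_project.bias)
    
    def forward(self, x):
        # 并行分支:x同时经过原始子层和Adapter瓶颈,两路输出相加
        adapter_branch = self.up_project(torch.relu(self.down_project(x)))
        return self.sublayer(x) + adapter_branch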

实现方法

使用AdapterHub

from transformers import AutoModel
import adapters
from adapters import AdapterConfig, AdapterTrainer
 
# 加载预训练模型
model = AutoModel.from_pretrained("bert-base-uncased")
 
# 为普通transformers模型注入Adapter支持(adapters库需要先初始化)
adapters.init(model)
 
# 配置Adapter
adapter_config = AdapterConfig.load(
    "pfeiffer",  # Adapter架构类型
    reduction_factor=16,  # 降维因子
    non_linearity="relu"
)
 
# 添加Adapter
model.add_adapter("task_adapter", config=adapter_config)
 
# 激活Adapter
model.set_active_adapters("task_adapter")
 
# 冻结原始参数,只训练Adapter
model.train_adapter("task_adapter")
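
上面导入的 AdapterTrainer 用法与 transformers 的 Trainer 基本一致;下面是一个简要示意(training_args、train_dataset 等对象假定已另行准备):

# 用AdapterTrainer只训练Adapter参数(示意:训练参数与数据集需自行准备)
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
 
# 训练完成后可单独保存体积很小的Adapter权重
model.save_adapter("./task_adapter", "task_adapter")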

手动实现Adapter

def add_adapters_to_model(model, adapter_config):
    """为模型添加Adapter层(以GPT-2风格的transformer.h.N.mlp命名为例)"""
    
    # 先收集目标模块名,避免在遍历named_modules()的同时修改模型结构
    target_names = [
        name for name, _ in model.named_modules()
        if "transformer.h" in name and name.endswith(".mlp")
    ]
    
    for name in target_names:
        module = model.get_submodule(name)
        
        # 在MLP层后串接一个Adapter
        adapter = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )
        
        # 用“原模块 + Adapter”的顺序容器替换原始模块
        parent_name = ".".join(name.split(".")[:-1])
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, name.split(".")[-1], nn.Sequential(module, adapter))
    
    return model
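
该函数按GPT-2风格的模块命名(transformer.h.N.mlp)定位插入点,调用方式大致如下;其他模型需按其模块命名调整匹配规则:

from transformers import AutoModelForCausalLM
 
# 以GPT-2为例(模块命名为transformer.h.N.mlp):hidden_size=768,瓶颈维度取48
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = add_adapters_to_model(model, {"hidden_size": 768, "adapter_size": 48})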

训练策略

单任务训练

from transformers import Trainer, TrainingArguments
 
# 训练配置
training_args = TrainingArguments(
    output_dir="./adapter_output",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=1e-3,  # Adapter可以用更大的学习率
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)
 
# 只训练Adapter参数
def freeze_base_model(model):
    for name, param in model.named_parameters():
        if "adapter" not in name:
            param.requires_grad = False
        else:
            param.requires_grad = True
 
freeze_base_model(model)
 
# 训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
 
trainer.train()

多任务训练

class MultiTaskAdapterModel(nn.Module):
    def __init__(self, base_model, task_configs):
        super().__init__()
        self.base_model = base_model
        self.adapters = nn.ModuleDict()
        
        # 为每个任务创建独立的Adapter
        for task_name, config in task_configs.items():
            self.adapters[task_name] = AdapterLayer(
                hidden_size=config["hidden_size"],
                adapter_size=config["adapter_size"]
            )
    
    def forward(self, x, task_name):
        # 基础模型前向传播(这里假设base_model直接返回隐藏状态张量)
        base_output = self.base_model(x)
        
        # 应用任务特定的Adapter
        if task_name in self.adapters:
            adapted_output = self.adapters[task_name](base_output)
            return adapted_output
        else:
            return base_output
 
# 多任务训练循环
def multi_task_training(model, task_dataloaders, optimizer, compute_loss, num_epochs):
    for epoch in range(num_epochs):
        for task_name, dataloader in task_dataloaders.items():
            for batch in dataloader:
                # 前向传播:使用该任务对应的Adapter
                outputs = model(batch["input"], task_name=task_name)
                loss = compute_loss(outputs, batch["labels"])
                
                # 反向传播,只更新Adapter参数
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
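
一个简要的组装与调用示意(base_model、各任务的DataLoader和损失函数 compute_loss 假定已另行准备):

# 多任务Adapter的组装与训练(示意)
task_configs = {
    "sentiment": {"hidden_size": 768, "adapter_size": 48},
    "ner":       {"hidden_size": 768, "adapter_size": 96},
}
model = MultiTaskAdapterModel(base_model, task_configs)
 
# 只优化各任务的Adapter参数
optimizer = torch.optim.AdamW(model.adapters.parameters(), lr=1e-3)
multi_task_training(model, task_dataloaders, optimizer, compute_loss, num_epochs=5)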

高级技术

AdapterFusion

class AdapterFusion(nn.Module):
    def __init__(self, adapter_names, hidden_size):
        super().__init__()
        self.adapter_names = adapter_names
        self.num_adapters = len(adapter_names)
        
        # 注意力权重计算
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=8,
            batch_first=True
        )
        
        # 融合权重
        self.fusion_weights = nn.Parameter(
            torch.ones(self.num_adapters) / self.num_adapters
        )
    
    def forward(self, base_output, adapter_outputs):
        # 简化实现:假设base_output与每个adapter输出均为(batch, hidden_size)的池化表示
        # (原始AdapterFusion以层输出作为query、各Adapter输出作为key/value)
        stacked_outputs = torch.stack(adapter_outputs, dim=1)  # (batch, num_adapters, hidden)
        
        # 在“adapter维度”上做自注意力融合
        fused_output, _ = self.attention(
            stacked_outputs, stacked_outputs, stacked_outputs
        )
        
        # 加权融合
        weighted_output = torch.sum(
            fused_output * self.fusion_weights.view(1, -1, 1), 
            dim=1
        )
        
        return base_output + weighted_output
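
按上面假设的形状(池化后的 (batch, hidden_size) 表示),AdapterFusion 的调用方式大致如下:

# AdapterFusion使用示意:融合三个任务Adapter的输出(张量为随机示例)
fusion = AdapterFusion(["sst2", "mnli", "qqp"], hidden_size=768)
 
base_output = torch.randn(4, 768)                           # 基础模型的池化输出
adapter_outputs = [torch.randn(4, 768) for _ in range(3)]   # 三个Adapter的输出
 
fused = fusion(base_output, adapter_outputs)
print(fused.shape)                                          # torch.Size([4, 768])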

层次化Adapter

class HierarchicalAdapter(nn.Module):
    def __init__(self, hidden_size, adapter_sizes):
        super().__init__()
        self.adapters = nn.ModuleList()
        
        # 创建多层Adapter
        for adapter_size in adapter_sizes:
            self.adapters.append(AdapterLayer(hidden_size, adapter_size))
    
    def forward(self, x):
        # 逐层应用Adapter
        for adapter in self.adapters:
            x = adapter(x)
        return x

性能优化

参数共享

class SharedAdapter(nn.Module):
    def __init__(self, hidden_size, adapter_size, num_layers):
        super().__init__()
        # 所有层共享同一个Adapter
        self.shared_adapter = AdapterLayer(hidden_size, adapter_size)
        self.num_layers = num_layers
    
    def forward(self, layer_outputs):
        # 对所有层输出应用相同的Adapter
        adapted_outputs = []
        for output in layer_outputs:
            adapted_outputs.append(self.shared_adapter(output))
        return adapted_outputs

动态Adapter

class DynamicAdapter(nn.Module):
    def __init__(self, hidden_size, max_adapter_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.max_adapter_size = max_adapter_size
        
        # 可变大小的投影层
        self.down_project = nn.Linear(hidden_size, max_adapter_size)
        self.up_project = nn.Linear(max_adapter_size, hidden_size)
        
        # 大小控制器
        self.size_controller = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # 动态确定Adapter大小:对batch取平均得到标量,映射到[1, max_adapter_size]
        size_logit = self.size_controller(x.mean(dim=1)).mean()
        adapter_size = max(1, int(torch.sigmoid(size_logit).item() * self.max_adapter_size))
        
        # 只保留前adapter_size个瓶颈维度,其余位置补零后再升维
        down_output = self.down_project(x)[:, :, :adapter_size]
        up_output = self.up_project(
            nn.functional.pad(down_output, (0, self.max_adapter_size - adapter_size))
        )
        
        return x + up_output

部署与推理

Adapter切换

class AdapterSwitcher:
    def __init__(self, model, adapter_configs):
        self.model = model
        self.adapters = {}
        
        # 加载所有Adapter
        for name, config in adapter_configs.items():
            adapter_path = config["path"]
            self.adapters[name] = torch.load(adapter_path)
    
    def switch_adapter(self, adapter_name):
        """切换到指定的Adapter"""
        if adapter_name not in self.adapters:
            raise ValueError(f"Adapter {adapter_name} not found")
        
        # 加载Adapter权重
        adapter_weights = self.adapters[adapter_name]
        
        for name, param in self.model.named_parameters():
            if "adapter" in name and name in adapter_weights:
                param.data.copy_(adapter_weights[name])
    
    def inference(self, inputs, adapter_name):
        """使用指定Adapter进行推理(假设inputs为已分词的张量字典)"""
        self.switch_adapter(adapter_name)
        self.model.eval()
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        return outputs
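
一个简要的使用示意(权重文件路径与 tokenizer 均为假设):

# AdapterSwitcher使用示意(路径与tokenizer仅为假设)
switcher = AdapterSwitcher(model, {
    "sentiment": {"path": "adapters/sentiment.pt"},
    "ner": {"path": "adapters/ner.pt"},
})
 
inputs = tokenizer("这部电影非常好看", return_tensors="pt")
outputs = switcher.inference(inputs, adapter_name="sentiment")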

批量推理优化

def batch_inference_with_adapters(model, tokenizer, inputs, adapter_names):
    """批量推理,每个输入使用不同的Adapter"""
    
    results = []
    
    # 按Adapter分组
    adapter_groups = {}
    for i, (input_text, adapter_name) in enumerate(zip(inputs, adapter_names)):
        if adapter_name not in adapter_groups:
            adapter_groups[adapter_name] = []
        adapter_groups[adapter_name].append((i, input_text))
    
    # 分组处理
    for adapter_name, group_inputs in adapter_groups.items():
        # 切换Adapter
        model.set_active_adapters(adapter_name)
        
        # 批量分词后处理
        batch_texts = [text for _, text in group_inputs]
        batch_encodings = tokenizer(batch_texts, padding=True, return_tensors="pt")
        with torch.no_grad():
            batch_outputs = model(**batch_encodings)
        
        # 按样本记录结果(这里以last_hidden_state为例)
        for (original_idx, _), output in zip(group_inputs, batch_outputs.last_hidden_state):
            results.append((original_idx, output))
    
    # 按原始顺序排序
    results.sort(key=lambda x: x[0])
    return [output for _, output in results]

优势与局限

优势

  1. 模块化设计:每个任务独立的Adapter,便于管理
  2. 参数效率:每个任务通常只需约0.5%-5%的额外参数,具体取决于降维因子与插入位置
  3. 任务迁移:Adapter可以在相似任务间迁移
  4. 并行训练:多个任务可以并行训练不同的Adapter
  5. 热插拔:推理时可以动态切换Adapter

局限性

  1. 推理开销:额外的前向传播计算
  2. 架构限制:需要修改模型架构
  3. 任务相关性:对差异很大的任务效果有限
  4. 超参数敏感:Adapter大小需要仔细调优

最佳实践

Adapter大小选择

def choose_adapter_size(hidden_size, task_complexity):
    """根据任务复杂度选择Adapter大小"""
    
    size_ratios = {
        "simple": 1/16,    # 简单任务:hidden_size/16
        "medium": 1/8,     # 中等任务:hidden_size/8  
        "complex": 1/4     # 复杂任务:hidden_size/4
    }
    
    ratio = size_ratios.get(task_complexity, 1/8)
    adapter_size = max(8, int(hidden_size * ratio))  # 最小8
    
    return adapter_size

训练策略

def adapter_training_strategy():
    """Adapter训练最佳实践"""
    
    return {
        "学习率": "1e-3到1e-4,比全参数微调大",
        "训练轮数": "5-10轮,比LoRA多",
        "批次大小": "可以用较大的batch size",
        "正则化": "适度的weight decay",
        "初始化": "接近恒等映射的初始化",
        "梯度裁剪": "防止梯度爆炸"
    }

相关概念