From c081620ffdd70e09766a60f17d4e8e4fad097b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com> Date: Sat, 13 Jun 2026 13:32:04 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20MoE=20Top-1=20=E8=B7=AF=E7=94=B1=20+=20?= =?UTF-8?q?(p1+p2)=20=E6=9D=83=E9=87=8D=E8=A1=A5=E5=81=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 仅路由到 Top-1 expert(节省 50% FFN 计算) - gate 输出 top-2 概率,用 p1+p2 作为输出权重 - 近似 k=2 的输出幅度,避免 PCOC 偏移 - 是参数调整修正,非方案本身错误 --- 代码/code/infer.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/代码/code/infer.py b/代码/code/infer.py index bdf89ae..a4ac157 100644 --- a/代码/code/infer.py +++ b/代码/code/infer.py @@ -320,7 +320,7 @@ class TopKGate(nn.Module): return topk_idx, topk_score, probs class SMoE(nn.Module): - def __init__(self, d_model, dim_ff, num_experts, k=2): + def __init__(self, d_model, dim_ff, num_experts, k=1): super().__init__() self.num_experts = num_experts self.k = k @@ -329,35 +329,34 @@ class SMoE(nn.Module): Expert(d_model, dim_ff) for _ in range(num_experts) ]) - self.gate = TopKGate(d_model, num_experts, k=k) + self.gate = TopKGate(d_model, num_experts, k=2) # gate 内部用 k=2 获取补偿权重 def forward(self, x): # x: [B,S,D] B, S, D = x.shape topk_idx, topk_score, probs = self.gate(x) + # topk_idx: [B, S, 2], topk_score: [B, S, 2] + + # 仅路由到 Top-1 expert,但用 (p1+p2) 作为权重补偿 + route_idx = topk_idx[:, :, :1] # [B, S, 1] — 只取 top-1 + weight_sum = topk_score.sum(dim=-1) # [B, S] — p1 + p2 作为总权重 out = torch.zeros_like(x) - # flatten - x_flat = x.reshape(-1, D) # [B*S, D] - idx_flat = topk_idx.reshape(-1, self.k) # [B*S, k] - score_flat = topk_score.reshape(-1, self.k) - out_flat = out.reshape(-1, D) # 提前 reshape,避免循环内重复 + x_flat = x.reshape(-1, D) + idx_flat = route_idx.reshape(-1) # [B*S] + weight_flat = weight_sum.reshape(-1) # [B*S] + out_flat = out.reshape(-1, D) for i in range(self.num_experts): - # 找到被路由到 expert i 的 token - mask = (idx_flat == i) # [B*S, k] - # 注:k=2 时几乎所有 expert 都分到 token,移除 .any() 检查避免 GPU 同步 - - token_idx, k_idx = mask.nonzero(as_tuple=True) + mask = (idx_flat == i) # [B*S] + token_idx = mask.nonzero(as_tuple=True)[0] if token_idx.numel() == 0: continue - - selected_x = x_flat[token_idx] # [N, D] - expert_out = self.experts[i](selected_x) # [N, D] - weight = score_flat[token_idx, k_idx].unsqueeze(-1) - out_flat[token_idx] += expert_out * weight + selected_x = x_flat[token_idx] + expert_out = self.experts[i](selected_x) + out_flat[token_idx] = expert_out * weight_flat[token_idx].unsqueeze(-1) importance = probs.sum(dim=(0,1)) # [E] moe_loss = (importance.std() / (importance.mean() + 1e-6)) @@ -384,7 +383,7 @@ class TransformerEncoder(nn.Module): self.act = getattr(F, act) self.attention_fn = attention_fn self.moe = nn.ModuleList([ - SMoE(d_model, dim_ff, num_experts=8, k=2) + SMoE(d_model, dim_ff, num_experts=8, k=1) # Top-1 路由 + (p1+p2) 权重补偿 for _ in range(num_layers) ])