From c081620ffdd70e09766a60f17d4e8e4fad097b8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com>
Date: Sat, 13 Jun 2026 13:32:04 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20MoE=20Top-1=20=E8=B7=AF=E7=94=B1=20+=20?=
 =?UTF-8?q?(p1+p2)=20=E6=9D=83=E9=87=8D=E8=A1=A5=E5=81=BF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 每个 token 仅路由到 Top-1 expert（相比原先的 k=2，expert FFN 计算量约减少 50%）
- gate 固定以 k=2 输出 top-2 概率（不随 SMoE 的 k 参数变化），用 p1+p2 作为 Top-1 expert 输出的权重
- 近似 k=2 的输出幅度，避免 PCOC 偏移
- 本次改动属于参数调整层面的修正，并非原方案本身存在错误
---
 代码/code/infer.py | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/代码/code/infer.py b/代码/code/infer.py
index bdf89ae..a4ac157 100644
--- a/代码/code/infer.py
+++ b/代码/code/infer.py
@@ -320,7 +320,7 @@ class TopKGate(nn.Module):
         return topk_idx, topk_score, probs
 
 class SMoE(nn.Module):
-    def __init__(self, d_model, dim_ff, num_experts, k=2):
+    def __init__(self, d_model, dim_ff, num_experts, k=1):
         super().__init__()
         self.num_experts = num_experts
         self.k = k
@@ -329,35 +329,34 @@ class SMoE(nn.Module):
             Expert(d_model, dim_ff) for _ in range(num_experts)
         ])
 
-        self.gate = TopKGate(d_model, num_experts, k=k)
+        self.gate = TopKGate(d_model, num_experts, k=2)  # gate 内部用 k=2 获取补偿权重
 
     def forward(self, x):
         # x: [B,S,D]
         B, S, D = x.shape
 
         topk_idx, topk_score, probs = self.gate(x)
+        # topk_idx: [B, S, 2], topk_score: [B, S, 2]
+
+        # 仅路由到 Top-1 expert，但用 (p1+p2) 作为权重补偿
+        route_idx = topk_idx[:, :, :1]         # [B, S, 1] — 只取 top-1
+        weight_sum = topk_score.sum(dim=-1)     # [B, S] — p1 + p2 作为总权重
 
         out = torch.zeros_like(x)
 
-        # flatten
-        x_flat = x.reshape(-1, D)                # [B*S, D]
-        idx_flat = topk_idx.reshape(-1, self.k)  # [B*S, k]
-        score_flat = topk_score.reshape(-1, self.k)
-        out_flat = out.reshape(-1, D)            # 提前 reshape，避免循环内重复
+        x_flat = x.reshape(-1, D)
+        idx_flat = route_idx.reshape(-1)         # [B*S]
+        weight_flat = weight_sum.reshape(-1)     # [B*S]
+        out_flat = out.reshape(-1, D)
 
         for i in range(self.num_experts):
-            # 找到被路由到 expert i 的 token
-            mask = (idx_flat == i)  # [B*S, k]
-            # 注：k=2 时几乎所有 expert 都分到 token，移除 .any() 检查避免 GPU 同步
-
-            token_idx, k_idx = mask.nonzero(as_tuple=True)
+            mask = (idx_flat == i)  # [B*S]
+            token_idx = mask.nonzero(as_tuple=True)[0]
             if token_idx.numel() == 0:
                 continue
-
-            selected_x = x_flat[token_idx]  # [N, D]
-            expert_out = self.experts[i](selected_x)  # [N, D]
-            weight = score_flat[token_idx, k_idx].unsqueeze(-1)
-            out_flat[token_idx] += expert_out * weight
+            selected_x = x_flat[token_idx]
+            expert_out = self.experts[i](selected_x)
+            out_flat[token_idx] = expert_out * weight_flat[token_idx].unsqueeze(-1)
 
         importance = probs.sum(dim=(0,1))  # [E]
         moe_loss = (importance.std() / (importance.mean() + 1e-6))
@@ -384,7 +383,7 @@ class TransformerEncoder(nn.Module):
         self.act = getattr(F, act)
         self.attention_fn = attention_fn
         self.moe = nn.ModuleList([
-            SMoE(d_model, dim_ff, num_experts=8, k=2)
+            SMoE(d_model, dim_ff, num_experts=8, k=1)  # Top-1 路由 + (p1+p2) 权重补偿
             for _ in range(num_layers)
         ])