revert: MoE k=1 → k=2（PCOC 从 1.059 炸到 2.075，Top-1 破坏输出校准）

保留 inference_mode + torch.compile(default)
2026-06-13 11:50:30 +08:00
parent 47c89cc76d
commit faedab5245
1 changed file with 23 additions and 31 deletions
@@ -320,7 +320,7 @@ class TopKGate(nn.Module):
        return topk_idx, topk_score, probs

 class SMoE(nn.Module):
-    def __init__(self, d_model, dim_ff, num_experts, k=1):
+    def __init__(self, d_model, dim_ff, num_experts, k=2):
        super().__init__()
        self.num_experts = num_experts
        self.k = k
@@ -337,39 +337,31 @@ class SMoE(nn.Module):

        topk_idx, topk_score, probs = self.gate(x)

-        # flatten: [B, S, k] → [B*S, k]
-        x_flat = x.reshape(-1, D)
-        idx_flat = topk_idx.reshape(-1, self.k)
+        out = torch.zeros_like(x)
+
+        # flatten
+        x_flat = x.reshape(-1, D)                # [B*S, D]
+        idx_flat = topk_idx.reshape(-1, self.k)  # [B*S, k]
        score_flat = topk_score.reshape(-1, self.k)

-        if self.k == 1:
-            # Top-1 快速路径：无需二维 mask 和加权累加
-            idx_flat = idx_flat.squeeze(-1)    # [B*S]
-            score_flat = score_flat.squeeze(-1) # [B*S]
-            out = torch.zeros_like(x_flat)
+        for i in range(self.num_experts):
+            # 找到被路由到 expert i 的 token
+            mask = (idx_flat == i)  # [B*S, k]

-            for i in range(self.num_experts):
-                mask = (idx_flat == i)  # [B*S]
-                if not mask.any():
-                    continue
-                selected_x = x_flat[mask]
-                expert_out = self.experts[i](selected_x)
-                out[mask] = expert_out * score_flat[mask].unsqueeze(-1)
+            if not mask.any():
+                continue

-            out = out.reshape(B, S, D)
-        else:
-            # Top-K 通用路径（k > 1）
-            out = torch.zeros_like(x)
-            for i in range(self.num_experts):
-                mask = (idx_flat == i)  # [B*S, k]
-                if not mask.any():
-                    continue
-                token_idx, k_idx = mask.nonzero(as_tuple=True)
-                selected_x = x_flat[token_idx]
-                expert_out = self.experts[i](selected_x)
-                weight = score_flat[token_idx, k_idx].unsqueeze(-1)
-                out_flat = out.reshape(-1, D)
-                out_flat[token_idx] += expert_out * weight
+            # 哪些 token 命中了 expert i
+            token_idx, k_idx = mask.nonzero(as_tuple=True)
+
+            selected_x = x_flat[token_idx]  # [N, D]
+
+            expert_out = self.experts[i](selected_x)  # [N, D]
+
+            weight = score_flat[token_idx, k_idx].unsqueeze(-1)
+
+            out_flat = out.reshape(-1, D)
+            out_flat[token_idx] += expert_out * weight

        importance = probs.sum(dim=(0,1))  # [E]
        moe_loss = (importance.std() / (importance.mean() + 1e-6))
@@ -396,7 +388,7 @@ class TransformerEncoder(nn.Module):
        self.act = getattr(F, act)
        self.attention_fn = attention_fn
        self.moe = nn.ModuleList([
-            SMoE(d_model, dim_ff, num_experts=8, k=1)  # Top-1 gating: 每个 token 仅激活 1 个 expert
+            SMoE(d_model, dim_ff, num_experts=8, k=2)
            for _ in range(num_layers)
        ])