From d5c327dc970fe3698588ef3c50ce60ce6f78f221 Mon Sep 17 00:00:00 2001 From: OwnerSunshine530 Date: Mon, 15 Jun 2026 15:07:29 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20chunk=5Fusers=20=E9=BB=98=E8=AE=A4=204(?= =?UTF-8?q?=E6=9C=AC=E5=9C=B0=E6=9C=80=E5=BF=AB6.18s);=E6=B3=A8=E6=84=8F?= =?UTF-8?q?=E5=8A=9Bchunk=E6=94=B6=E7=9B=8A=E5=B7=B2=E9=80=92=E5=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 --- 代码/code/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/代码/code/infer.py b/代码/code/infer.py index 6bad1a1..339e765 100644 --- a/代码/code/infer.py +++ b/代码/code/infer.py @@ -44,7 +44,7 @@ CONFIG = { # sdpa 是评测端验证最快(89.96s/58.86)。flex/compile/小batch/varlen 在评测端都更差。 # attn: "chunked"(按用户分块SDPA,降O(S²),本地14.25->7.92s) / "sdpa"(稠密mask) / 其它对照 "attn": "chunked", - "chunk_users": 8, # chunked 每块用户数(本地 8 比 16 更快;切小拼接序列降注意力O(S²)) + "chunk_users": 4, # chunked 每块用户数(本地 4 最快 6.18s;再小收益递减) # 稠密MoE去掉了 model(batch) 内唯一的同步点(MoE循环的.nonzero())。若评测计时不 # synchronize,去掉同步点可能让被计时的 model(batch) 大幅缩短。本地force-sync看不出, # 须靠提交验证。AUC中性、MoE仅占2%算力故风险极低。