perf: dedup_embedding 默认开启 — 本地耗时 7.80s -> 6.49s(耗时减少约17%),AUC逐位不变

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
OwnerSunshine530
2026-06-15 14:21:45 +08:00
parent 2268fa6cf3
commit a358dfd0a3
+1 -1
View File
@@ -52,7 +52,7 @@ CONFIG = {
"fuse_embedding": True, # True=28个slot的查表+池化融合为1次(减per-batch kernel启动) "fuse_embedding": True, # True=28个slot的查表+池化融合为1次(减per-batch kernel启动)
"syncfree_mask": True, # True=用searchsorted构造因果mask(无同步)False=repeat_interleave(同步) "syncfree_mask": True, # True=用searchsorted构造因果mask(无同步)False=repeat_interleave(同步)
"emb_fp16": True, # True=Embedding表转FP16(查表带宽减半,实测AUC 0.75932≈无损) "emb_fp16": True, # True=Embedding表转FP16(查表带宽减半,实测AUC 0.75932≈无损)
"dedup_embedding": False, # True=查表前对sign去重(只查唯一值再展开),减少大表随机访存。数学等价 "dedup_embedding": True, # True=查表前对sign去重(只查唯一值再展开),本地7.80->6.49s,AUC逐位等价
"compile": False, # 是否 torch.compile(实测慢5×,勿开) "compile": False, # 是否 torch.compile(实测慢5×,勿开)
} }