revert: collate_dedup 默认关闭(评测 33.44 > 33.00:per_sample_weights 走更慢的加权 kernel,且评测重复率不够,盖过带宽节省)。锁定 71.34
This commit is contained in:
+2
-1
@@ -168,7 +168,8 @@ CONFIG = {
|
|||||||
"syncfree_mask": True, # True=用searchsorted构造因果mask(无同步);False=repeat_interleave(同步)
|
"syncfree_mask": True, # True=用searchsorted构造因果mask(无同步);False=repeat_interleave(同步)
|
||||||
"emb_fp16": True, # True=Embedding表转FP16(查表带宽减半,实测AUC 0.75932≈无损)
|
"emb_fp16": True, # True=Embedding表转FP16(查表带宽减半,实测AUC 0.75932≈无损)
|
||||||
"use_embedding_bag": True, # F.embedding_bag 融合查表+池化(单kernel,消dedup的unique同步,AUC≈无损)
|
"use_embedding_bag": True, # F.embedding_bag 融合查表+池化(单kernel,消dedup的unique同步,AUC≈无损)
|
||||||
"collate_dedup": True, # collate(不计时)段内去重+计数→embedding_bag per_sample_weights,减查表带宽(本地4.10→3.98,AUC精确不变)
|
# 评测净负33.44>33.00:per_sample_weights走更慢的加权kernel+评测重复率不够,盖过带宽节省。退回。
|
||||||
|
"collate_dedup": False, # True=collate段内去重+计数(本地快评测慢,勿开)
|
||||||
"dedup_embedding": True, # True=查表前对sign去重(只查唯一值再展开),本地7.80->6.49s,AUC逐位等价
|
"dedup_embedding": True, # True=查表前对sign去重(只查唯一值再展开),本地7.80->6.49s,AUC逐位等价
|
||||||
"sparse_pool": False, # True=用(段×唯一)稀疏矩阵乘做池化,避免materialize整个[M,512](段内高重复时省)
|
"sparse_pool": False, # True=用(段×唯一)稀疏矩阵乘做池化,避免materialize整个[M,512](段内高重复时省)
|
||||||
"compile": False, # 是否 torch.compile(实测慢5×,勿开)
|
"compile": False, # 是否 torch.compile(实测慢5×,勿开)
|
||||||
|
|||||||
Reference in New Issue
Block a user