9d5a5a52f2
- infer.py: 模块级 CONFIG(fp16/keep_fp32_modules/expert_merge/ merge_threshold/signid_mode/sync_timing),默认值=当前最优行为; load_model 按 CONFIG 控制半精度/FP32敏感层/expert合并; RepEncoder 支持 clamp/modulo 两种 sign-id 处理; 新增 _force_fp32_io 钩子让敏感层在FP16模型里以FP32 IO 计算。 - bench.py: 设置 CONFIG → 跑推理 → cuda.synchronize 真实计时 → _cal_score 打印 AUC/PCOC/延迟/总分,支持配置/batch扫描。不进提交包。 - EXPERIMENTS.md: 实验记录表。 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
111 lines
3.8 KiB
Python
111 lines
3.8 KiB
Python
"""本地测量闭环:设置 infer.CONFIG,跑推理,同步计时,打印 AUC/PCOC/延迟/总分。
|
||
|
||
不进提交包。在 AI Studio notebook(带 dataset/ 与 ckpt.pt)里运行:
|
||
|
||
%cd /home/aistudio/code
|
||
!python bench.py # 默认配置基准
|
||
|
||
或在 notebook cell 里逐配置扫描:
|
||
|
||
import bench
|
||
bench.run_once({"fp16": False, "expert_merge": False}) # FP32 参考跑
|
||
bench.run_once({"signid_mode": "modulo"}) # 取模 vs clamp
|
||
"""
|
||
import time
|
||
from pathlib import Path
|
||
|
||
import torch
|
||
from torch.utils.data import DataLoader
|
||
|
||
import infer # 同目录
|
||
|
||
|
||
def run_once(config_override=None, batch_size=50, max_batches=None, max_feasign_per_slot=None):
|
||
"""跑一次本地推理并打分。
|
||
|
||
Args:
|
||
config_override: 覆盖 infer.CONFIG 的字典(如 {"fp16": False})
|
||
batch_size: DataLoader 的 batch 大小(本地参考;评测端可能自有设定)
|
||
max_batches: 只跑前 N 个 batch(快速冒烟用),None=全量
|
||
max_feasign_per_slot: 传给 CTRTestSeqDataset 的截断字典,None=不截断;
|
||
默认沿用 baseline 的 {1: 2}
|
||
Returns:
|
||
infer._cal_score 的结果 dict
|
||
"""
|
||
if config_override is None:
|
||
config_override = {}
|
||
if max_feasign_per_slot is None:
|
||
max_feasign_per_slot = {1: 2}
|
||
|
||
infer.CONFIG.update(config_override)
|
||
infer.CONFIG["sync_timing"] = True
|
||
|
||
cur = Path(__file__).parent
|
||
ref = cur / "dataset"
|
||
history = ref / "history"
|
||
test_csv = ref / "test.csv"
|
||
label_file = ref / "label_data.txt"
|
||
|
||
# ----- 加载数据 -----
|
||
files = (sorted(history.glob("*.csv")) if history.exists() else []) + [test_csv]
|
||
item_dict, user_seq = infer.load_sample_files(files)
|
||
test_logids = infer.load_logids_from_file(test_csv)
|
||
ds = infer.CTRTestSeqDataset(
|
||
test_logids_ordered=list(test_logids),
|
||
item_dict=item_dict,
|
||
user_seq=user_seq,
|
||
max_feasign_per_slot=max_feasign_per_slot,
|
||
max_ctx_len=None,
|
||
)
|
||
loader = DataLoader(
|
||
ds, batch_size=batch_size, shuffle=False, num_workers=0,
|
||
collate_fn=infer.make_collate_fn(ds.max_slot_id),
|
||
)
|
||
batches = []
|
||
for b in loader:
|
||
batches.append(infer.move_batch_to_device(b, torch.device("cpu")))
|
||
if max_batches is not None and len(batches) >= max_batches:
|
||
break
|
||
|
||
# ----- 加载模型 -----
|
||
model, dev = infer.load_model(ckpt_path=None)
|
||
|
||
# ----- 推理 + 同步计时 -----
|
||
logid2p = {}
|
||
t_sum = 0.0
|
||
cuda = (dev.type == "cuda")
|
||
with torch.inference_mode():
|
||
for b in batches:
|
||
b = infer.move_batch_to_device(b, dev)
|
||
pm = b["pred_mask"].bool()
|
||
if cuda:
|
||
torch.cuda.synchronize()
|
||
t0 = time.time()
|
||
logits, _ = model(b)
|
||
probs = torch.sigmoid(logits.squeeze(-1))
|
||
if cuda:
|
||
torch.cuda.synchronize()
|
||
t_sum += time.time() - t0
|
||
for lid, p in zip(b["logid"][pm].cpu().tolist(), probs[pm].cpu().tolist()):
|
||
logid2p[lid] = p
|
||
|
||
# ----- 按 test.csv 顺序写 predict.txt 并打分 -----
|
||
order = [int(l.split(",")[0]) for l in open(test_csv) if l.strip()]
|
||
pred_path = cur / "predict.txt"
|
||
with open(pred_path, "w") as f:
|
||
for lid in order:
|
||
f.write(f"{logid2p[lid]}\n")
|
||
|
||
res = infer._cal_score(pred_path, label_file, default_latency=t_sum)
|
||
print(
|
||
f"[BENCH] cfg={config_override} bs={batch_size}"
|
||
f"{'' if max_batches is None else f' (first {max_batches} batches)'}"
|
||
f" -> AUC={res['auc']:.5f} PCOC={res['pcoc']:.4f}"
|
||
f" lat={res['latency']:.2f}s score={res['score_all']:.2f}"
|
||
)
|
||
return res
|
||
|
||
|
||
if __name__ == "__main__":
|
||
run_once({}) # 默认配置基准
|