feat: infer.py 接入 CONFIG 实验开关 + 新增 bench.py 测量闭环
- infer.py: 模块级 CONFIG(fp16/keep_fp32_modules/expert_merge/merge_threshold/signid_mode/sync_timing),默认值=当前最优行为; load_model 按 CONFIG 控制半精度/FP32敏感层/expert合并; RepEncoder 支持 clamp/modulo 两种 sign-id 处理; 新增 _force_fp32_io 钩子让敏感层在FP16模型里以FP32 IO 计算。
- bench.py: 设置 CONFIG → 跑推理 → cuda.synchronize 真实计时 → _cal_score 打印 AUC/PCOC/延迟/总分,支持配置/batch扫描。不进提交包。
- EXPERIMENTS.md: 实验记录表。

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,19 @@
|
|||||||
|
# 实验记录
|
||||||
|
|
||||||
|
> 在 AI Studio notebook 里跑 `bench.py` 后,把每次配置的实测值填进表里。
|
||||||
|
> 「本地分」用本地 test.csv + label_data.txt 算(仅作方向参考);「提交分」是验证集真实分数。
|
||||||
|
> 本文件可入 git,但**不进提交包**(打包只含 infer.py / requirements.txt / build_env.sh)。
|
||||||
|
|
||||||
|
| 任务 | 配置 | AUC | PCOC | 延迟(同步) | 本地分 | 提交分 |
|
||||||
|
|------|------|-----|------|-----------|--------|--------|
|
||||||
|
| 基线 | 默认(当前最优: fp16+merge0.90+clamp) | _待测_ | _待测_ | _待测_ | _待测_ | 58.86 |
|
||||||
|
|
||||||
|
## 待跑(按计划顺序)
|
||||||
|
|
||||||
|
- [ ] Task 2: `python bench.py` 默认配置 → 填上面「基线」行的本地实测
|
||||||
|
- [ ] **Task 3(最关键)**: `bench.run_once({"fp16": False, "expert_merge": False, "signid_mode": "clamp"})` → FP32 天花板 AUC,判定 80+ 是否有 AUC 空间
|
||||||
|
- [ ] Task 4: clamp vs modulo(先查 max_sign_id 是否超 5M)
|
||||||
|
- [ ] Task 5: 混合精度 keep_fp32_modules 扫描
|
||||||
|
- [ ] Task 6: expert_merge 开/关的 AUC 代价
|
||||||
|
- [ ] Task 7: 特征截断 + 上下文完整性核查
|
||||||
|
- [ ] Task 8: 锁定阶段 A 配置并提交一次
|
||||||
@@ -0,0 +1,110 @@
|
|||||||
|
"""本地测量闭环:设置 infer.CONFIG,跑推理,同步计时,打印 AUC/PCOC/延迟/总分。
|
||||||
|
|
||||||
|
不进提交包。在 AI Studio notebook(带 dataset/ 与 ckpt.pt)里运行:
|
||||||
|
|
||||||
|
%cd /home/aistudio/code
|
||||||
|
!python bench.py # 默认配置基准
|
||||||
|
|
||||||
|
或在 notebook cell 里逐配置扫描:
|
||||||
|
|
||||||
|
import bench
|
||||||
|
bench.run_once({"fp16": False, "expert_merge": False}) # FP32 参考跑
|
||||||
|
bench.run_once({"signid_mode": "modulo"}) # 取模 vs clamp
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
import infer # 同目录
|
||||||
|
|
||||||
|
|
||||||
|
def run_once(config_override=None, batch_size=50, max_batches=None, max_feasign_per_slot=None):
    """Run one local inference pass under a CONFIG override and score it.

    The override is applied to ``infer.CONFIG`` only for the duration of this
    call; the previous CONFIG contents are restored afterwards (also on
    error), so successive calls in a sweep do not inherit each other's
    settings.

    Args:
        config_override: dict of ``infer.CONFIG`` keys to override for this
            run (e.g. ``{"fp16": False}``). ``None`` means no override.
        batch_size: DataLoader batch size (local reference only).
        max_batches: only run the first N batches (quick smoke test);
            ``None`` runs everything. In smoke-test mode the test logids that
            were not predicted are written as 0.5, so the printed metrics are
            NOT comparable with a full run.
        max_feasign_per_slot: truncation dict forwarded to
            ``infer.CTRTestSeqDataset``. ``None`` falls back to the baseline
            value ``{1: 2}``.
            NOTE(review): to disable truncation, presumably pass an empty
            dict -- confirm against CTRTestSeqDataset.

    Returns:
        The object returned by ``infer._cal_score``; its ``"auc"``,
        ``"pcoc"``, ``"latency"`` and ``"score_all"`` entries are printed.

    Raises:
        ValueError: if ``config_override`` contains a key that is not
            already present in ``infer.CONFIG`` (most likely a typo).
        KeyError: in a full run, if a logid from test.csv has no prediction.
    """
    if config_override is None:
        config_override = {}
    if max_feasign_per_slot is None:
        max_feasign_per_slot = {1: 2}

    # A mistyped key would silently benchmark the default behaviour while
    # being logged as a different configuration, so reject it up front.
    unknown = [key for key in config_override if key not in infer.CONFIG]
    if unknown:
        raise ValueError(
            f"unknown infer.CONFIG key(s): {unknown}; "
            f"valid keys: {list(infer.CONFIG)}"
        )

    # infer reads the module-level dict object, so mutate it in place and
    # put the previous contents back when the run is over.
    saved_config = dict(infer.CONFIG)
    infer.CONFIG.update(config_override)
    infer.CONFIG["sync_timing"] = True
    try:
        res = _bench_score_current_config(batch_size, max_batches, max_feasign_per_slot)
    finally:
        infer.CONFIG.clear()
        infer.CONFIG.update(saved_config)

    print(
        f"[BENCH] cfg={config_override} bs={batch_size}"
        f"{'' if max_batches is None else f' (first {max_batches} batches)'}"
        f" -> AUC={res['auc']:.5f} PCOC={res['pcoc']:.4f}"
        f" lat={res['latency']:.2f}s score={res['score_all']:.2f}"
    )
    return res


def _bench_score_current_config(batch_size, max_batches, max_feasign_per_slot):
    """Load data and model, predict, write predict.txt and return the score."""
    cur = Path(__file__).parent
    ref = cur / "dataset"
    test_csv = ref / "test.csv"
    label_file = ref / "label_data.txt"

    batches = _bench_load_batches(ref, test_csv, batch_size, max_batches, max_feasign_per_slot)
    model, dev = infer.load_model(ckpt_path=None)
    logid2p, elapsed = _bench_timed_predict(model, dev, batches)

    pred_path = cur / "predict.txt"
    _bench_write_predictions(test_csv, pred_path, logid2p, partial=max_batches is not None)
    return infer._cal_score(pred_path, label_file, default_latency=elapsed)


def _bench_load_batches(ref, test_csv, batch_size, max_batches, max_feasign_per_slot):
    """Build the test dataset and return its (optionally truncated) batches on CPU."""
    history = ref / "history"
    history_files = sorted(history.glob("*.csv")) if history.exists() else []
    item_dict, user_seq = infer.load_sample_files(history_files + [test_csv])
    test_logids = infer.load_logids_from_file(test_csv)
    ds = infer.CTRTestSeqDataset(
        test_logids_ordered=list(test_logids),
        item_dict=item_dict,
        user_seq=user_seq,
        max_feasign_per_slot=max_feasign_per_slot,
        max_ctx_len=None,
    )
    loader = DataLoader(
        ds, batch_size=batch_size, shuffle=False, num_workers=0,
        collate_fn=infer.make_collate_fn(ds.max_slot_id),
    )
    cpu = torch.device("cpu")
    batches = []
    for batch in loader:
        batches.append(infer.move_batch_to_device(batch, cpu))
        if max_batches is not None and len(batches) >= max_batches:
            break
    return batches


def _bench_timed_predict(model, dev, batches):
    """Run the model over all batches; return ({logid: prob}, forward seconds)."""
    logid2p = {}
    elapsed = 0.0
    on_cuda = dev.type == "cuda"
    with torch.inference_mode():
        for batch in batches:
            batch = infer.move_batch_to_device(batch, dev)
            pred_mask = batch["pred_mask"].bool()
            # Synchronize on both sides so only the forward pass (plus the
            # sigmoid) is timed, not queued host-to-device transfers.
            if on_cuda:
                torch.cuda.synchronize()
            start = time.perf_counter()
            logits, _ = model(batch)
            probs = torch.sigmoid(logits.squeeze(-1))
            if on_cuda:
                torch.cuda.synchronize()
            elapsed += time.perf_counter() - start
            logids = batch["logid"][pred_mask].cpu().tolist()
            logid2p.update(zip(logids, probs[pred_mask].cpu().tolist()))
    return logid2p, elapsed


def _bench_write_predictions(test_csv, pred_path, logid2p, partial):
    """Write one prediction per test.csv row, in file order, to ``pred_path``."""
    with open(test_csv) as f:
        order = [int(line.split(",")[0]) for line in f if line.strip()]

    missing = 0
    with open(pred_path, "w") as f:
        for lid in order:
            if partial and lid not in logid2p:
                # Smoke-test run: rows beyond the first N batches were never
                # predicted; keep the file aligned with test.csv with a
                # neutral filler instead of crashing.
                missing += 1
                prob = 0.5
            else:
                # Full run: a missing logid is a real bug -> KeyError.
                prob = logid2p[lid]
            f.write(f"{prob}\n")
    if missing:
        print(
            f"[BENCH][WARN] {missing}/{len(order)} test rows were not predicted "
            f"(max_batches set); filled with 0.5 - metrics are smoke-test only"
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: benchmark the default configuration (no overrides).
    run_once()
|
||||||
+53
-3
@@ -18,6 +18,41 @@ from torch.utils.data import Dataset, DataLoader
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
# Experiment switchboard
# Keep the defaults below equal to the current best behaviour when submitting:
# the evaluation system does not touch them and runs with the defaults.
# bench.py overrides these values with infer.CONFIG.update(...) after import.
# ============================================================
CONFIG = dict(
    # True = half-precision inference; False = FP32 reference run
    # (used to establish the AUC ceiling).
    fp16=True,
    # Name prefixes of sub-modules that stay in FP32 even when fp16 is on,
    # e.g. ("linear",).
    keep_fp32_modules=(),
    # Whether to merge experts whose weights are similar.
    expert_merge=True,
    # Cosine-similarity threshold used for the expert merge.
    merge_threshold=0.90,
    # "clamp" or "modulo": how out-of-range sign ids are handled.
    signid_mode="clamp",
    # bench.py sets this to True to time with torch.cuda.synchronize.
    sync_timing=False,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _force_fp32_io(module):
|
||||||
|
"""让某个模块在 FP16 模型里以 FP32 计算:输入转 FP32、输出转回 FP16。
|
||||||
|
用于 keep_fp32_modules 指定的精度敏感层(如最终输出头、LayerNorm)。"""
|
||||||
|
module.float()
|
||||||
|
|
||||||
|
def _pre(m, args):
|
||||||
|
return tuple(
|
||||||
|
a.float() if torch.is_tensor(a) and a.is_floating_point() else a
|
||||||
|
for a in args
|
||||||
|
)
|
||||||
|
|
||||||
|
def _post(m, args, output):
|
||||||
|
if torch.is_tensor(output) and output.is_floating_point():
|
||||||
|
return output.half()
|
||||||
|
return output
|
||||||
|
|
||||||
|
module.register_forward_pre_hook(_pre)
|
||||||
|
module.register_forward_hook(_post)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 数据加载(来自 train/dataset.py)
|
# 数据加载(来自 train/dataset.py)
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -263,6 +298,9 @@ class RepEncoder(nn.Module):
|
|||||||
for i in range(self.slot_num):
|
for i in range(self.slot_num):
|
||||||
values, offsets = batch[i + 1]
|
values, offsets = batch[i + 1]
|
||||||
offsets = offsets.to(values.device)
|
offsets = offsets.to(values.device)
|
||||||
|
if CONFIG["signid_mode"] == "modulo":
|
||||||
|
values = values % self.emb.num_embeddings # 取模哈希(与训练一致时用)
|
||||||
|
else:
|
||||||
values = values.clamp(0, max_idx) # 超出 vocab_size 的 sign id 截断,避免越界
|
values = values.clamp(0, max_idx) # 超出 vocab_size 的 sign id 截断,避免越界
|
||||||
sign_emb = self.emb(values).to(target_dtype)
|
sign_emb = self.emb(values).to(target_dtype)
|
||||||
res = torch.segment_reduce(sign_emb, reduce='sum', offsets=offsets, initial=0)
|
res = torch.segment_reduce(sign_emb, reduce='sum', offsets=offsets, initial=0)
|
||||||
@@ -496,13 +534,25 @@ def load_model(ckpt_path, device='cuda:0'):
|
|||||||
model.load_state_dict(ckpt['model_state_dict'])
|
model.load_state_dict(ckpt['model_state_dict'])
|
||||||
print(f"[INFO] Loaded checkpoint from {ckpt_path} (epoch={ckpt.get('epoch', '?')})")
|
print(f"[INFO] Loaded checkpoint from {ckpt_path} (epoch={ckpt.get('epoch', '?')})")
|
||||||
|
|
||||||
# === FP16 量化:模型参数转半精度,Embedding 保留 FP32 ===
|
if CONFIG["fp16"]:
|
||||||
model = model.half()
|
model = model.half()
|
||||||
|
# Embedding 始终保留 FP32(int 索引查表,不受浮点精度影响)
|
||||||
model.rep_encoder.emb = model.rep_encoder.emb.to(torch.float32)
|
model.rep_encoder.emb = model.rep_encoder.emb.to(torch.float32)
|
||||||
print("[INFO] Model converted to FP16 (embedding kept in FP32)")
|
# 额外保留 FP32 的精度敏感模块(输入/输出自动转换)
|
||||||
|
for name, module in model.named_modules():
|
||||||
|
if name and any(name.startswith(p) for p in CONFIG["keep_fp32_modules"]):
|
||||||
|
_force_fp32_io(module)
|
||||||
|
print(f"[INFO] FP16 on; FP32-kept: "
|
||||||
|
f"{('rep_encoder.emb',) + tuple(CONFIG['keep_fp32_modules'])}")
|
||||||
|
else:
|
||||||
|
model = model.float()
|
||||||
|
print("[INFO] FP32 reference (no half)")
|
||||||
|
|
||||||
# === 按 Expert 权重相似度合并冗余 expert ===
|
# === 按 Expert 权重相似度合并冗余 expert ===
|
||||||
_merge_experts(model, sim_threshold=0.90)
|
if CONFIG["expert_merge"]:
|
||||||
|
_merge_experts(model, sim_threshold=CONFIG["merge_threshold"])
|
||||||
|
else:
|
||||||
|
print("[INFO] expert_merge off")
|
||||||
else:
|
else:
|
||||||
print(f"[WARNING] Checkpoint {ckpt_path} not found, using random weights")
|
print(f"[WARNING] Checkpoint {ckpt_path} not found, using random weights")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user