diff --git a/代码/code/EXPERIMENTS.md b/代码/code/EXPERIMENTS.md
new file mode 100644
index 0000000..d4229ff
--- /dev/null
+++ b/代码/code/EXPERIMENTS.md
@@ -0,0 +1,19 @@
+# 实验记录
+
+> 在 AI Studio notebook 里跑 `bench.py` 后,把每次配置的实测值填进表里。
+> 「本地分」用本地 test.csv + label_data.txt 算(仅作方向参考);「提交分」是验证集真实分数。
+> 本文件可入 git,但**不进提交包**(打包只含 infer.py / requirements.txt / build_env.sh)。
+
+| 任务 | 配置 | AUC | PCOC | 延迟(同步) | 本地分 | 提交分 |
+|------|------|-----|------|-----------|--------|--------|
+| 基线 | 默认(当前最优: fp16+merge0.90+clamp) | _待测_ | _待测_ | _待测_ | _待测_ | 58.86 |
+
+## 待跑(按计划顺序)
+
+- [ ] Task 2: `python bench.py` 默认配置 → 填上面「基线」行的本地实测
+- [ ] **Task 3(最关键)**: `bench.run_once({"fp16": False, "expert_merge": False, "signid_mode": "clamp"})` → FP32 天花板 AUC,判定 80+ 是否有 AUC 空间
+- [ ] Task 4: clamp vs modulo(先查 max_sign_id 是否超 5M)
+- [ ] Task 5: 混合精度 keep_fp32_modules 扫描
+- [ ] Task 6: expert_merge 开/关的 AUC 代价
+- [ ] Task 7: 特征截断 + 上下文完整性核查
+- [ ] Task 8: 锁定阶段 A 配置并提交一次
diff --git a/代码/code/bench.py b/代码/code/bench.py
new file mode 100644
index 0000000..272197c
--- /dev/null
+++ b/代码/code/bench.py
@@ -0,0 +1,143 @@
+"""Local measurement loop: apply infer.CONFIG overrides, run inference, time it
+with device synchronization, and print AUC / PCOC / latency / total score.
+
+Not part of the submission package. Run it inside the AI Studio notebook (with
+dataset/ and ckpt.pt available):
+
+    %cd /home/aistudio/code
+    !python bench.py                      # benchmark the default config
+
+Or sweep configs one by one from a notebook cell (overrides apply to a single
+call only; infer.CONFIG is restored afterwards):
+
+    import bench
+    bench.run_once({"fp16": False, "expert_merge": False})  # FP32 reference run
+    bench.run_once({"signid_mode": "modulo"})               # modulo vs clamp
+"""
+import time
+from pathlib import Path
+
+import torch
+from torch.utils.data import DataLoader
+
+import infer  # same directory
+
+
+def _run_with_config(config_override, batch_size, max_batches, max_feasign_per_slot):
+    """Run inference and scoring under the infer.CONFIG already set by run_once."""
+    cur = Path(__file__).parent
+    ref = cur / "dataset"
+    history = ref / "history"
+    test_csv = ref / "test.csv"
+    label_file = ref / "label_data.txt"
+
+    # ----- load data -----
+    files = (sorted(history.glob("*.csv")) if history.exists() else []) + [test_csv]
+    item_dict, user_seq = infer.load_sample_files(files)
+    test_logids = infer.load_logids_from_file(test_csv)
+    ds = infer.CTRTestSeqDataset(
+        test_logids_ordered=list(test_logids),
+        item_dict=item_dict,
+        user_seq=user_seq,
+        max_feasign_per_slot=max_feasign_per_slot,
+        max_ctx_len=None,
+    )
+    loader = DataLoader(
+        ds, batch_size=batch_size, shuffle=False, num_workers=0,
+        collate_fn=infer.make_collate_fn(ds.max_slot_id),
+    )
+    batches = []
+    for b in loader:
+        batches.append(infer.move_batch_to_device(b, torch.device("cpu")))
+        if max_batches is not None and len(batches) >= max_batches:
+            break
+
+    # ----- load model -----
+    model, dev = infer.load_model(ckpt_path=None)
+
+    # ----- inference + synchronized timing -----
+    logid2p = {}
+    t_sum = 0.0
+    cuda = (dev.type == "cuda")
+    with torch.inference_mode():
+        for b in batches:
+            b = infer.move_batch_to_device(b, dev)
+            pm = b["pred_mask"].bool()
+            if cuda:
+                torch.cuda.synchronize()
+            t0 = time.perf_counter()  # monotonic clock, meant for intervals
+            logits, _ = model(b)
+            probs = torch.sigmoid(logits.squeeze(-1))
+            if cuda:
+                torch.cuda.synchronize()
+            t_sum += time.perf_counter() - t0
+            for lid, p in zip(b["logid"][pm].cpu().tolist(), probs[pm].cpu().tolist()):
+                logid2p[lid] = p
+
+    # ----- write predict.txt in test.csv order, then score -----
+    with open(test_csv) as f:
+        order = [int(line.split(",")[0]) for line in f if line.strip()]
+    missing = [lid for lid in order if lid not in logid2p]
+    if missing and max_batches is None:
+        raise KeyError(
+            f"{len(missing)} logids of {test_csv} have no prediction, e.g. {missing[0]}"
+        )
+    # A smoke run (max_batches set) only predicts part of test.csv; pad the rest
+    # with the mean prediction so predict.txt still has one line per test row.
+    fallback = sum(logid2p.values()) / len(logid2p) if logid2p else 0.0
+    if missing:
+        print(
+            f"[BENCH] {len(missing)} logids not predicted (max_batches={max_batches});"
+            f" padded with {fallback:.4f}, metrics below are indicative only"
+        )
+    pred_path = cur / "predict.txt"
+    with open(pred_path, "w") as f:
+        f.writelines(f"{logid2p.get(lid, fallback)}\n" for lid in order)
+
+    res = infer._cal_score(pred_path, label_file, default_latency=t_sum)
+    print(
+        f"[BENCH] cfg={config_override} bs={batch_size}"
+        f"{'' if max_batches is None else f' (first {max_batches} batches)'}"
+        f" -> AUC={res['auc']:.5f} PCOC={res['pcoc']:.4f}"
+        f" lat={res['latency']:.2f}s score={res['score_all']:.2f}"
+    )
+    return res
+
+
+def run_once(config_override=None, batch_size=50, max_batches=None, max_feasign_per_slot=None):
+    """Run one local inference pass and score it.
+
+    Args:
+        config_override: dict of infer.CONFIG overrides (e.g. {"fp16": False}).
+            Applied for this call only; infer.CONFIG is restored on exit.
+        batch_size: DataLoader batch size (local reference; the evaluation
+            side may use its own setting).
+        max_batches: only run the first N batches (quick smoke test); None
+            runs everything. Unpredicted rows are padded, so the metrics of
+            a smoke run are indicative only.
+        max_feasign_per_slot: truncation dict passed to CTRTestSeqDataset;
+            None falls back to the baseline setting {1: 2}.
+    Returns:
+        The result dict of infer._cal_score.
+    Raises:
+        KeyError: a full run left some test.csv logid without a prediction.
+    """
+    if config_override is None:
+        config_override = {}
+    if max_feasign_per_slot is None:
+        max_feasign_per_slot = {1: 2}
+
+    # Snapshot CONFIG so this run's overrides cannot leak into the next
+    # run_once call of a notebook sweep.
+    saved_config = dict(infer.CONFIG)
+    infer.CONFIG.update(config_override)
+    infer.CONFIG["sync_timing"] = True
+    try:
+        return _run_with_config(config_override, batch_size, max_batches, max_feasign_per_slot)
+    finally:
+        infer.CONFIG.clear()
+        infer.CONFIG.update(saved_config)
+
+
+if __name__ == "__main__":
+    run_once({})  # benchmark the default config
diff --git a/代码/code/infer.py b/代码/code/infer.py
index 1745d7d..7d3d131 100644
--- a/代码/code/infer.py
+++ b/代码/code/infer.py
@@ -18,6 +18,56 @@ from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm
 
 
+# ============================================================
+# Experiment config switchboard
+# Keep the defaults below equal to the current best behavior when submitting;
+# the evaluation system never touches them and runs with the defaults.
+# bench.py overrides them via infer.CONFIG.update(...) after import.
+# ============================================================
+CONFIG = {
+    "fp16": True,             # True = half-precision inference; False = FP32 reference run (AUC ceiling)
+    "keep_fp32_modules": (),  # submodule name prefixes kept in FP32 under fp16, e.g. ("linear",)
+    "expert_merge": True,     # whether to merge experts by weight similarity
+    "merge_threshold": 0.90,  # cosine-similarity threshold for merging
+    "signid_mode": "clamp",   # "clamp" or "modulo": how out-of-range sign ids are handled
+    "sync_timing": False,     # bench sets True to time with torch.cuda.synchronize
+}
+
+
+def _force_fp32_io(module):
+    """Make one module of an FP16 model compute in FP32.
+
+    The module's parameters are cast to FP32, floating-point positional
+    inputs are cast to FP32 before forward, and a floating-point tensor output
+    is cast back to FP16. Used for the precision-sensitive layers named by
+    keep_fp32_modules (e.g. the final output head, LayerNorm).
+
+    A module nested inside an already wrapped ancestor is left alone: its own
+    hooks would round every intermediate result back to FP16 inside the
+    FP32 region. This relies on ancestors being wrapped first, which is the
+    order model.named_modules() yields them in.
+    """
+    if getattr(module, "_fp32_io_wrapped", False):
+        return
+    module.float()
+    for sub in module.modules():  # includes module itself
+        sub._fp32_io_wrapped = True
+
+    def _pre(m, args):
+        return tuple(
+            a.float() if torch.is_tensor(a) and a.is_floating_point() else a
+            for a in args
+        )
+
+    def _post(m, args, output):
+        if torch.is_tensor(output) and output.is_floating_point():
+            return output.half()
+        return output
+
+    module.register_forward_pre_hook(_pre)
+    module.register_forward_hook(_post)
+
+
 # ============================================================
 # 数据加载(来自 train/dataset.py)
 # ============================================================
@@ -263,7 +313,10 @@ class RepEncoder(nn.Module):
         for i in range(self.slot_num):
             values, offsets = batch[i + 1]
             offsets = offsets.to(values.device)
-            values = values.clamp(0, max_idx) # 超出 vocab_size 的 sign id 截断,避免越界
+            if CONFIG["signid_mode"] == "modulo":
+                values = values % self.emb.num_embeddings # 取模哈希(与训练一致时用)
+            else:
+                values = values.clamp(0, max_idx) # 超出 vocab_size 的 sign id 截断,避免越界
             sign_emb = self.emb(values).to(target_dtype)
             res = torch.segment_reduce(sign_emb, reduce='sum', offsets=offsets, initial=0)
             pooled_embs.append(res)
@@ -496,13 +549,25 @@ def load_model(ckpt_path, device='cuda:0'):
         model.load_state_dict(ckpt['model_state_dict'])
         print(f"[INFO] Loaded checkpoint from {ckpt_path} (epoch={ckpt.get('epoch', '?')})")
 
-        # === FP16 量化:模型参数转半精度,Embedding 保留 FP32 ===
-        model = model.half()
-        model.rep_encoder.emb = model.rep_encoder.emb.to(torch.float32)
-        print("[INFO] Model converted to FP16 (embedding kept in FP32)")
+        if CONFIG["fp16"]:
+            model = model.half()
+            # Embedding 始终保留 FP32(int 索引查表,不受浮点精度影响)
+            model.rep_encoder.emb = model.rep_encoder.emb.to(torch.float32)
+            # 额外保留 FP32 的精度敏感模块(输入/输出自动转换)
+            for name, module in model.named_modules():
+                if name and any(name.startswith(p) for p in CONFIG["keep_fp32_modules"]):
+                    _force_fp32_io(module)
+            print(f"[INFO] FP16 on; FP32-kept: "
+                  f"{('rep_encoder.emb',) + tuple(CONFIG['keep_fp32_modules'])}")
+        else:
+            model = model.float()
+            print("[INFO] FP32 reference (no half)")
 
         # === 按 Expert 权重相似度合并冗余 expert ===
-        _merge_experts(model, sim_threshold=0.90)
+        if CONFIG["expert_merge"]:
+            _merge_experts(model, sim_threshold=CONFIG["merge_threshold"])
+        else:
+            print("[INFO] expert_merge off")
     else:
         print(f"[WARNING] Checkpoint {ckpt_path} not found, using random weights")
 