2026-06-15 12:33:32 +08:00
3 changed files with 185 additions and 6 deletions
@@ -0,0 +1,19 @@
+# 实验记录
+
+> 在 AI Studio notebook 里跑 `bench.py` 后，把每次配置的实测值填进表里。
+> 「本地分」用本地 test.csv + label_data.txt 算（仅作方向参考）；「提交分」是验证集真实分数。
+> 本文件可入 git，但**不进提交包**（打包只含 infer.py / requirements.txt / build_env.sh）。
+
+| 任务 | 配置 | AUC | PCOC | 延迟(同步) | 本地分 | 提交分 |
+|------|------|-----|------|-----------|--------|--------|
+| 基线 | 默认(当前最优: fp16+merge0.90+clamp) | _待测_ | _待测_ | _待测_ | _待测_ | 58.86 |
+
+## 待跑（按计划顺序）
+
+- [ ] Task 2: `python bench.py` 默认配置 → 填上面「基线」行的本地实测
+- [ ] **Task 3（最关键）**: `bench.run_once({"fp16": False, "expert_merge": False, "signid_mode": "clamp"})` → FP32 天花板 AUC，判定 80+ 是否有 AUC 空间
+- [ ] Task 4: clamp vs modulo（先查 max_sign_id 是否超 5M）
+- [ ] Task 5: 混合精度 keep_fp32_modules 扫描
+- [ ] Task 6: expert_merge 开/关的 AUC 代价
+- [ ] Task 7: 特征截断 + 上下文完整性核查
+- [ ] Task 8: 锁定阶段 A 配置并提交一次
@@ -0,0 +1,110 @@
+"""本地测量闭环：设置 infer.CONFIG，跑推理，同步计时，打印 AUC/PCOC/延迟/总分。
+
+不进提交包。在 AI Studio notebook（带 dataset/ 与 ckpt.pt）里运行：
+
+    %cd /home/aistudio/code
+    !python bench.py                       # 默认配置基准
+
+或在 notebook cell 里逐配置扫描：
+
+    import bench
+    bench.run_once({"fp16": False, "expert_merge": False})   # FP32 参考跑
+    bench.run_once({"signid_mode": "modulo"})                # 取模 vs clamp
+"""
+import time
+from pathlib import Path
+
+import torch
+from torch.utils.data import DataLoader
+
+import infer  # 同目录
+
+
+def run_once(config_override=None, batch_size=50, max_batches=None, max_feasign_per_slot=None):
+    """跑一次本地推理并打分。
+
+    Args:
+        config_override: 覆盖 infer.CONFIG 的字典（如 {"fp16": False}）
+        batch_size: DataLoader 的 batch 大小（本地参考；评测端可能自有设定）
+        max_batches: 只跑前 N 个 batch（快速冒烟用），None=全量
+        max_feasign_per_slot: 传给 CTRTestSeqDataset 的截断字典，None=不截断；
+                              默认沿用 baseline 的 {1: 2}
+    Returns:
+        infer._cal_score 的结果 dict
+    """
+    if config_override is None:
+        config_override = {}
+    if max_feasign_per_slot is None:
+        max_feasign_per_slot = {1: 2}
+
+    infer.CONFIG.update(config_override)
+    infer.CONFIG["sync_timing"] = True
+
+    cur = Path(__file__).parent
+    ref = cur / "dataset"
+    history = ref / "history"
+    test_csv = ref / "test.csv"
+    label_file = ref / "label_data.txt"
+
+    # ----- 加载数据 -----
+    files = (sorted(history.glob("*.csv")) if history.exists() else []) + [test_csv]
+    item_dict, user_seq = infer.load_sample_files(files)
+    test_logids = infer.load_logids_from_file(test_csv)
+    ds = infer.CTRTestSeqDataset(
+        test_logids_ordered=list(test_logids),
+        item_dict=item_dict,
+        user_seq=user_seq,
+        max_feasign_per_slot=max_feasign_per_slot,
+        max_ctx_len=None,
+    )
+    loader = DataLoader(
+        ds, batch_size=batch_size, shuffle=False, num_workers=0,
+        collate_fn=infer.make_collate_fn(ds.max_slot_id),
+    )
+    batches = []
+    for b in loader:
+        batches.append(infer.move_batch_to_device(b, torch.device("cpu")))
+        if max_batches is not None and len(batches) >= max_batches:
+            break
+
+    # ----- 加载模型 -----
+    model, dev = infer.load_model(ckpt_path=None)
+
+    # ----- 推理 + 同步计时 -----
+    logid2p = {}
+    t_sum = 0.0
+    cuda = (dev.type == "cuda")
+    with torch.inference_mode():
+        for b in batches:
+            b = infer.move_batch_to_device(b, dev)
+            pm = b["pred_mask"].bool()
+            if cuda:
+                torch.cuda.synchronize()
+            t0 = time.time()
+            logits, _ = model(b)
+            probs = torch.sigmoid(logits.squeeze(-1))
+            if cuda:
+                torch.cuda.synchronize()
+            t_sum += time.time() - t0
+            for lid, p in zip(b["logid"][pm].cpu().tolist(), probs[pm].cpu().tolist()):
+                logid2p[lid] = p
+
+    # ----- 按 test.csv 顺序写 predict.txt 并打分 -----
+    order = [int(l.split(",")[0]) for l in open(test_csv) if l.strip()]
+    pred_path = cur / "predict.txt"
+    with open(pred_path, "w") as f:
+        for lid in order:
+            f.write(f"{logid2p[lid]}\n")
+
+    res = infer._cal_score(pred_path, label_file, default_latency=t_sum)
+    print(
+        f"[BENCH] cfg={config_override} bs={batch_size}"
+        f"{'' if max_batches is None else f' (first {max_batches} batches)'}"
+        f" -> AUC={res['auc']:.5f} PCOC={res['pcoc']:.4f}"
+        f" lat={res['latency']:.2f}s score={res['score_all']:.2f}"
+    )
+    return res
+
+
+if __name__ == "__main__":
+    run_once({})  # baseline run with the default configuration
@@ -18,6 +18,41 @@ from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm


+# ============================================================
+# Experiment configuration switchboard
+# For submission, keep the defaults below equal to the current best behavior;
+# the evaluation system does not touch them and runs with the defaults.
+# bench.py overrides these values after import via infer.CONFIG.update(...).
+# ============================================================
+CONFIG = {
+    "fp16": True,             # True = half-precision inference; False = FP32 reference run (establishes the AUC ceiling)
+    "keep_fp32_modules": (),  # submodule name prefixes kept in FP32 under fp16, e.g. ("linear",)
+    "expert_merge": True,     # whether to merge experts by weight similarity
+    "merge_threshold": 0.90,  # cosine-similarity threshold for merging
+    "signid_mode": "clamp",   # "clamp" or "modulo": how out-of-range sign ids are handled
+    "sync_timing": False,     # bench sets this to True for real timing with torch.cuda.synchronize
+}
+
+
+def _force_fp32_io(module):
+    """让某个模块在 FP16 模型里以 FP32 计算：输入转 FP32、输出转回 FP16。
+    用于 keep_fp32_modules 指定的精度敏感层（如最终输出头、LayerNorm）。"""
+    module.float()
+
+    def _pre(m, args):
+        return tuple(
+            a.float() if torch.is_tensor(a) and a.is_floating_point() else a
+            for a in args
+        )
+
+    def _post(m, args, output):
+        if torch.is_tensor(output) and output.is_floating_point():
+            return output.half()
+        return output
+
+    module.register_forward_pre_hook(_pre)
+    module.register_forward_hook(_post)
+
+
 # ============================================================
 # 数据加载（来自 train/dataset.py）
 # ============================================================
@@ -263,6 +298,9 @@ class RepEncoder(nn.Module):
        for i in range(self.slot_num):
            values, offsets = batch[i + 1]
            offsets = offsets.to(values.device)
+            if CONFIG["signid_mode"] == "modulo":
+                values = values % self.emb.num_embeddings  # 取模哈希（与训练一致时用）
+            else:
                values = values.clamp(0, max_idx)  # 超出 vocab_size 的 sign id 截断，避免越界
            sign_emb = self.emb(values).to(target_dtype)
            res = torch.segment_reduce(sign_emb, reduce='sum', offsets=offsets, initial=0)
@@ -496,13 +534,25 @@ def load_model(ckpt_path, device='cuda:0'):
        model.load_state_dict(ckpt['model_state_dict'])
        print(f"[INFO] Loaded checkpoint from {ckpt_path} (epoch={ckpt.get('epoch', '?')})")

-        # === FP16 量化：模型参数转半精度，Embedding 保留 FP32 ===
+        if CONFIG["fp16"]:
            model = model.half()
+            # Embedding 始终保留 FP32（int 索引查表，不受浮点精度影响）
            model.rep_encoder.emb = model.rep_encoder.emb.to(torch.float32)
-        print("[INFO] Model converted to FP16 (embedding kept in FP32)")
+            # 额外保留 FP32 的精度敏感模块（输入/输出自动转换）
+            for name, module in model.named_modules():
+                if name and any(name.startswith(p) for p in CONFIG["keep_fp32_modules"]):
+                    _force_fp32_io(module)
+            print(f"[INFO] FP16 on; FP32-kept: "
+                  f"{('rep_encoder.emb',) + tuple(CONFIG['keep_fp32_modules'])}")
+        else:
+            model = model.float()
+            print("[INFO] FP32 reference (no half)")

        # === 按 Expert 权重相似度合并冗余 expert ===
-        _merge_experts(model, sim_threshold=0.90)
+        if CONFIG["expert_merge"]:
+            _merge_experts(model, sim_threshold=CONFIG["merge_threshold"])
+        else:
+            print("[INFO] expert_merge off")
    else:
        print(f"[WARNING] Checkpoint {ckpt_path} not found, using random weights")