fix: Phase B 实测回归(flex+dense慢5-6x)，默认回退 sdpa+loop；bench 加 --profile

实测 A800：sdpa+loop=15.15s，flex+dense=98s，+compile=82s。模型是开销瓶颈非算力瓶颈(30TFLOP应0.15s却跑15s)，FlexAttention解决的算力问题非此处瓶颈、反增开销。默认改回已验证最快的 sdpa+loop。新增 bench --profile 用 torch.profiler 定位真正的开销来源(算子级)。

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 00:25:53 +08:00
parent c1d8b91fb2
commit 9eaf5f5511
2 changed files with 51 additions and 3 deletions
@@ -160,6 +160,47 @@ def run_diag(rebuild=False):
          f"超界sign占比={over}/{tot}={(over / max(tot, 1)):.2%}")
 def run_profile(config_override=None, n=20, batch_size=50, rebuild=False):
    """Profile the first batches of local inference with torch.profiler.

    Prints an operator table (top 25 rows) sorted by total CUDA time when the
    model runs on a CUDA device, otherwise by total CPU time.

    Args:
        config_override: Optional dict merged into ``infer.CONFIG`` before the
            model is loaded. ``None`` means no overrides.
        n: Number of batches to profile. At least one batch is always
            collected, even when ``n`` is zero or negative.
        batch_size: Batch size passed to the ``DataLoader``.
        rebuild: Forwarded to ``_get_data`` as its ``rebuild`` argument.

    Raises:
        RuntimeError: If the test loader yields no batches at all.
    """
    if config_override is None:
        config_override = {}
    infer.CONFIG.update(config_override)
    cur = Path(__file__).parent
    ref = cur / "dataset"
    test_csv = ref / "test.csv"
    item_dict, user_seq = _get_data(cur, ref, rebuild=rebuild)
    test_logids = infer.load_logids_from_file(test_csv)
    ds = infer.CTRTestSeqDataset(
        test_logids_ordered=list(test_logids), item_dict=item_dict,
        user_seq=user_seq, max_feasign_per_slot={1: 2}, max_ctx_len=None)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0,
                        collate_fn=infer.make_collate_fn(ds.max_slot_id))
    # Materialize the batches up front so the dataset objects can be dropped
    # before the model is loaded.
    batches = []
    for b in loader:
        batches.append(infer.move_batch_to_device(b, torch.device("cpu")))
        if len(batches) >= n:
            break
    del item_dict, user_seq, ds, loader
    import gc
    gc.collect()
    # Fail early with a clear message instead of loading the model and then
    # hitting an IndexError on batches[0] during warm-up.
    if not batches:
        raise RuntimeError(
            f"run_profile: no batches to profile (empty test set at {test_csv}?)")
    model, dev = infer.load_model(ckpt_path=None)
    cuda = (dev.type == "cuda")
    from torch.profiler import profile, ProfilerActivity
    acts = [ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if cuda else [])
    with torch.inference_mode():
        # Warm-up pass outside the profiler (triggers any first-call compilation).
        warm = infer.move_batch_to_device(batches[0], dev)
        model(warm)
        if cuda:
            torch.cuda.synchronize()
        with profile(activities=acts) as prof:
            for b in batches:
                b = infer.move_batch_to_device(b, dev)
                model(b)
                if cuda:
                    # Wait for the queued GPU work of this batch before the next one.
                    torch.cuda.synchronize()
    sort_key = "cuda_time_total" if cuda else "cpu_time_total"
    print(prof.key_averages().table(sort_by=sort_key, row_limit=25))
 def run_once(config_override=None, batch_size=50, max_batches=None,
             max_feasign_per_slot=None, rebuild=False):
    """跑一次本地推理并打分。返回 infer._cal_score 的结果 dict。"""
@@ -255,6 +296,8 @@ def _parse_args():
    ap.add_argument("--moe", choices=["dense", "loop"], default=None,
                    help="MoE实现：dense=向量化(新), loop=逐expert循环(原)")
    ap.add_argument("--compile", action="store_true", help="开启 torch.compile")
    ap.add_argument("--profile", type=int, default=None, metavar="N",
                    help="剖析前 N 个 batch，打印按 CUDA 耗时排序的算子表（定位瓶颈）")
    ap.add_argument("--rebuild", action="store_true", help="强制重建过滤缓存")
    return ap.parse_args()
@@ -284,5 +327,8 @@ if __name__ == "__main__":
        cfg["vectorize_moe"] = (a.moe == "dense")
    if a.compile:
        cfg["compile"] = True
    if a.profile is not None:
        run_profile(cfg, n=a.profile, batch_size=a.bs, rebuild=a.rebuild)
        sys.exit(0)
    mf = None if a.feasign_none else {1: 2}
    run_once(cfg, batch_size=a.bs, max_batches=a.smoke, max_feasign_per_slot=mf, rebuild=a.rebuild)
@@ -40,9 +40,11 @@ CONFIG = {
    "signid_mode": "clamp",   # "clamp" 或 "modulo"：处理超界 sign id 的方式
    "sync_timing": False,     # bench 里设 True，做 torch.cuda.synchronize 真实计时
    "filter_test_users": True,  # 只处理含测试样本的用户（跳过会被丢弃的用户，省算力）
-    "use_flex_attn": "auto",  # "auto"(SM80+用flex,否则SDPA) / True / False
+    # 实测：FlexAttention + 稠密MoE 在本模型上反而慢 5-6 倍（模型是开销瓶颈非算力瓶颈），
-    "vectorize_moe": True,    # True=稠密向量化MoE（无Python循环/同步）；False=原逐expert循环
+    # 故默认回到已验证最快的 sdpa + loop；flex/dense 仅作 bench 对照选项。
-    "compile": False,         # 是否 torch.compile（图理干净后再开）
+    "use_flex_attn": False,   # "auto"(SM80+用flex,否则SDPA) / True / False
    "vectorize_moe": False,   # True=稠密向量化MoE；False=原逐expert循环（默认，已验证更快）
    "compile": False,         # 是否 torch.compile
 }