From bc6e8307c5dffa714511c03693c231e2e61803f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com>
Date: Fri, 12 Jun 2026 22:02:40 +0800
Subject: [PATCH] =?UTF-8?q?revert:=20=E7=A7=BB=E9=99=A4=20torch.compile?=
 =?UTF-8?q?=EF=BC=88=E5=8A=A8=E6=80=81=20batch=20=E5=BD=A2=E7=8A=B6?=
 =?UTF-8?q?=E5=AF=BC=E8=87=B4=E5=8F=8D=E5=A4=8D=E9=87=8D=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=EF=BC=8C=E5=8F=8D=E8=80=8C=E6=85=A2=E4=BA=8E=E4=B8=8D=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sequence Packing 使每个 batch 的序列长度各不相同；torch.compile 的 reduce-overhead 模式依赖 CUDA Graph，输入形状一变就要反复重新编译并重新捕获 CUDA Graph。
Flash Attention + FP16 是目前最优组合（94.5s, 56.98 分）。
---
 代码/code/infer.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/代码/code/infer.py b/代码/code/infer.py
index 691f95d..09e3a44 100644
--- a/代码/code/infer.py
+++ b/代码/code/infer.py
@@ -511,11 +511,6 @@ def load_model(ckpt_path, device='cuda:0'):
 
     model.to(dev)
     model.eval()
-
-    # === torch.compile：算子融合 + 减少 kernel launch 开销 ===
-    model = torch.compile(model, mode="reduce-overhead")
-    print("[INFO] torch.compile applied (mode=reduce-overhead)")
-
     print(f"[INFO] Model ready. Device: {dev}")
 
     return model, dev