From bc6e8307c5dffa714511c03693c231e2e61803f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com>
Date: Fri, 12 Jun 2026 22:02:40 +0800
Subject: [PATCH] =?UTF-8?q?revert:=20=E7=A7=BB=E9=99=A4=20torch.compile?=
 =?UTF-8?q?=EF=BC=88=E5=8A=A8=E6=80=81=20batch=20=E5=BD=A2=E7=8A=B6?=
 =?UTF-8?q?=E5=AF=BC=E8=87=B4=E5=8F=8D=E5=A4=8D=E9=87=8D=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=EF=BC=8C=E5=8F=8D=E8=80=8C=E6=85=A2=E4=BA=8E=E4=B8=8D=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sequence Packing 使每个 batch 序列长度不同,CUDA Graph 需反复重编译。
Flash Attention + FP16 是目前最优组合(94.5s, 56.98 分)。
---
 代码/code/infer.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/代码/code/infer.py b/代码/code/infer.py
index 691f95d..09e3a44 100644
--- a/代码/code/infer.py
+++ b/代码/code/infer.py
@@ -511,11 +511,6 @@ def load_model(ckpt_path, device='cuda:0'):
     model.to(dev)
     model.eval()
 
-
-    # === torch.compile:算子融合 + 减少 kernel launch 开销 ===
-    model = torch.compile(model, mode="reduce-overhead")
-    print("[INFO] torch.compile applied (mode=reduce-overhead)")
-
     print(f"[INFO] Model ready. Device: {dev}")
 
     return model, dev