diff --git a/代码/code/build_env.sh b/代码/code/build_env.sh index 52e2890..5ade4cd 100644 --- a/代码/code/build_env.sh +++ b/代码/code/build_env.sh @@ -1,7 +1,17 @@ #!/bin/bash set -e -# 安装 Python 依赖(评测系统使用阿里云 PyPI 镜像) -pip install -r requirements.txt +# 预热 torch inductor,避免推理时编译 +python -c " +import torch + +@torch.compile(mode='reduce-overhead') +def _warmup(x): + return x * 2 + +x = torch.randn(100, 100, device='cuda') +_warmup(x) +print('Inductor cache ready') +" echo "build env success" diff --git a/代码/code/infer.py b/代码/code/infer.py index 74ba35e..691f95d 100644 --- a/代码/code/infer.py +++ b/代码/code/infer.py @@ -274,14 +274,18 @@ class RepEncoder(nn.Module): def scaled_dot_product(q, k, v, extension): - d = q.size(-1) - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d) + """使用 PyTorch SDPA 后端(自动启用 Flash Attention / Memory Efficient Attention)""" if extension is not None and "mask" in extension: - mask = extension["mask"] - scores = scores.masked_fill(mask == 0, float("-inf")) - attn = torch.softmax(scores, dim=-1) - out = torch.matmul(attn, v) - return out + attn_mask = extension["mask"].to(device=q.device) + else: + attn_mask = None + + return F.scaled_dot_product_attention( + q, k, v, + attn_mask=attn_mask, + dropout_p=0.0, + is_causal=False, + ) class Expert(nn.Module): @@ -507,6 +511,11 @@ def load_model(ckpt_path, device='cuda:0'): model.to(dev) model.eval() + + # === torch.compile:算子融合 + 减少 kernel launch 开销 === + model = torch.compile(model, mode="reduce-overhead") + print("[INFO] torch.compile applied (mode=reduce-overhead)") + print(f"[INFO] Model ready. Device: {dev}") return model, dev