diff --git a/requirements.txt b/requirements.txt
index aadfb81..d06171e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
-torch>=2.0.1
-torchvision>=0.15.2
-faster-coco-eval>=1.6.6
-PyYAML
-tensorboard
-scipy
-calflops
-transformers
-loguru
+torch==2.9.0  # NOTE(review): this patch also imports torch_npu, which is not listed in this file; presumably a torch_npu build matching this torch version is required - confirm
+torchvision==0.24.0
+faster-coco-eval==1.7.2
+PyYAML==6.0.3
+tensorboard==2.20.0
+scipy==1.17.1
+calflops==0.3.2
+transformers==4.57.6
+loguru==0.7.3
+matplotlib==3.10.3
diff --git a/src/misc/dist_utils.py b/src/misc/dist_utils.py
index 959ded6..0d12794 100644
--- a/src/misc/dist_utils.py
+++ b/src/misc/dist_utils.py
@@ -13,6 +13,7 @@ import time
 
 import numpy as np
 import torch
+import torch_npu
 import torch.backends.cudnn
 import torch.distributed
 import torch.nn as nn
@@ -20,6 +21,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn.parallel import DataParallel as DP
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.data import DistributedSampler
+from torch_npu.contrib import transfer_to_npu  # noqa: F401 - imported only for its import-time side effect; presumably redirects torch.cuda / "cuda" device usage to the NPU (confirm against torch_npu docs)
 
 # from torch.utils.data.dataloader import DataLoader
 from ..data import DataLoader
@@ -44,8 +46,7 @@ def setup_distributed(
         WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
 
         # torch.distributed.init_process_group(backend=backend, init_method='env://')
-        torch.distributed.init_process_group(init_method="env://")
-        torch.distributed.barrier()
+        torch.distributed.init_process_group(backend="hccl", init_method="env://")  # global rank / world size come from the RANK / WORLD_SIZE env vars; LOCAL_RANK is per-node and must not be used as the global rank
 
         rank = torch.distributed.get_rank()
         torch.cuda.set_device(rank)
@@ -100,7 +101,6 @@ def is_dist_available_and_initialized():
 def cleanup():
     """cleanup distributed environment"""
     if is_dist_available_and_initialized():
-        torch.distributed.barrier()
         torch.distributed.destroy_process_group()
 
 
diff --git a/src/misc/logger.py b/src/misc/logger.py
index 0c1ca18..9a1fa20 100644
--- a/src/misc/logger.py
+++ b/src/misc/logger.py
@@ -40,7 +40,7 @@ class SmoothedValue(object):
         """
         if not is_dist_available_and_initialized():
             return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+        t = torch.tensor([self.count, self.total], dtype=torch.float32, device="cuda")  # NOTE(review): float32 represents integers exactly only up to 2**24; confirm the reduced count/total stay within that range
         tdist.barrier()
         tdist.all_reduce(t)
         t = t.tolist()
diff --git a/src/zoo/dfine/utils.py b/src/zoo/dfine/utils.py
index dffdcb5..936828d 100644
--- a/src/zoo/dfine/utils.py
+++ b/src/zoo/dfine/utils.py
@@ -53,8 +53,8 @@ def deformable_attention_core_func(
         sampling_grid_l_ = sampling_grids[:, :, :, level].permute(0, 2, 1, 3, 4).flatten(0, 1)
         # N_*M_, D_, Lq_, P_
         sampling_value_l_ = F.grid_sample(
-            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
-        )
+            value_l_.to(torch.float16), sampling_grid_l_.to(torch.float16), mode="bilinear", padding_mode="zeros", align_corners=False
+        ).to(value_l_.dtype)  # sample in float16, then restore the input dtype instead of forcing float32
         sampling_value_list.append(sampling_value_l_)
     # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
     attention_weights = attention_weights.permute(0, 2, 1, 3, 4).reshape(
@@ -108,8 +108,8 @@ def deformable_attention_core_func_v2(
 
         if method == "default":
             sampling_value_l = F.grid_sample(
-                value_l, sampling_grid_l, mode="bilinear", padding_mode="zeros", align_corners=False
-            )
+                value_l.to(torch.float16), sampling_grid_l.to(torch.float16), mode="bilinear", padding_mode="zeros", align_corners=False
+            ).to(value_l.dtype)  # sample in float16, then restore the input dtype instead of forcing float32
 
         elif method == "discrete":
             # n * m, seq, n, 2
diff --git a/tools/deployment/export_onnx.py b/tools/deployment/export_onnx.py
index 58a12cd..5b1677c 100644
--- a/tools/deployment/export_onnx.py
+++ b/tools/deployment/export_onnx.py
@@ -55,7 +55,7 @@ def main(
 
     model = Model()
 
-    data = torch.rand(32, 3, 640, 640)
+    data = torch.rand(1, 3, 640, 640)  # dummy input with batch size 1 (was 32); fed to the model() call below
     size = torch.tensor([[640, 640]])
     _ = model(data, size)
 
@@ -74,10 +74,11 @@ def main(
         output_file,
         input_names=["images", "orig_target_sizes"],
         output_names=["labels", "boxes", "scores"],
-        dynamic_axes=dynamic_axes,
         opset_version=16,
         verbose=False,
-        do_constant_folding=True,
+        do_constant_folding=False,  # constant folding is turned off by this patch (was True)
+        dynamo=False,  # use the TorchScript-based exporter rather than the dynamo/torch.export one
+        keep_initializers_as_inputs=False,  # initializers are not exposed as graph inputs
     )
 
     if args.check:
@@ -122,7 +123,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--simplify",
         action="store_true",
-        default=True,
+        default=False,  # args.simplify is now False unless --simplify is passed (with default=True the flag had no effect)
     )
     args = parser.parse_args()
     main(args)
diff --git a/tools/inference/torch_inf.py b/tools/inference/torch_inf.py
index c5ba0ef..6daf57d 100644
--- a/tools/inference/torch_inf.py
+++ b/tools/inference/torch_inf.py
@@ -8,9 +8,11 @@ import sys
 import cv2  # Added for video processing
 import numpy as np
 import torch
+import torch_npu
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image, ImageDraw
+from torch_npu.contrib import transfer_to_npu  # noqa: F401 - imported only for its import-time side effect; presumably redirects torch.cuda / "cuda" device usage to the NPU (confirm against torch_npu docs)
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
 from src.core import YAMLConfig