@@ -1,9 +1,11 @@
-torch>=2.0.1
-torchvision>=0.15.2
-faster-coco-eval>=1.6.6
-PyYAML
-tensorboard
-scipy
-calflops
-transformers
-loguru
+torch==2.9.0
+torchvision==0.24.0
+faster-coco-eval==1.7.2
+PyYAML==6.0.3
+tensorboard==2.20.0
+scipy==1.17.1
+calflops==0.3.2
+transformers==4.57.6
+loguru==0.7.3
+matplotlib==3.10.3
+# NOTE: the code now imports torch_npu unconditionally, but it is not pinned here; install a torch_npu build compatible with torch==2.9.0 separately (see the Ascend PyTorch adapter installation guide).
@@ -13,6 +13,7 @@ import time
import numpy as np
import torch
+import torch_npu
import torch.backends.cudnn
import torch.distributed
import torch.nn as nn
@@ -20,6 +21,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.nn.parallel import DataParallel as DP
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DistributedSampler
+from torch_npu.contrib import transfer_to_npu
# from torch.utils.data.dataloader import DataLoader
from ..data import DataLoader
@@ -44,8 +46,7 @@ def setup_distributed(
WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
# torch.distributed.init_process_group(backend=backend, init_method='env://')
- torch.distributed.init_process_group(init_method="env://")
- torch.distributed.barrier()
+ torch.distributed.init_process_group(backend='hccl', init_method="env://")  # env:// takes the global RANK/WORLD_SIZE from the launcher; LOCAL_RANK is only unique within one node, so it must not be passed as rank
rank = torch.distributed.get_rank()
torch.cuda.set_device(rank)
@@ -100,7 +101,6 @@ def is_dist_available_and_initialized():
def cleanup():
"""cleanup distributed environment"""
if is_dist_available_and_initialized():
- torch.distributed.barrier()
torch.distributed.destroy_process_group()
@@ -40,7 +40,7 @@ class SmoothedValue(object):
"""
if not is_dist_available_and_initialized():
return
- t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+ t = torch.tensor([self.count, self.total], dtype=torch.float32, device="cuda")
tdist.barrier()
tdist.all_reduce(t)
t = t.tolist()
@@ -53,8 +53,8 @@ def deformable_attention_core_func(
sampling_grid_l_ = sampling_grids[:, :, :, level].permute(0, 2, 1, 3, 4).flatten(0, 1)
# N_*M_, D_, Lq_, P_
sampling_value_l_ = F.grid_sample(
- value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
- )
+ value_l_.to(torch.float16), sampling_grid_l_.to(torch.float16), mode="bilinear", padding_mode="zeros", align_corners=False
+ ).to(torch.float32)
sampling_value_list.append(sampling_value_l_)
# (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
attention_weights = attention_weights.permute(0, 2, 1, 3, 4).reshape(
@@ -108,8 +108,8 @@ def deformable_attention_core_func_v2(
if method == "default":
sampling_value_l = F.grid_sample(
- value_l, sampling_grid_l, mode="bilinear", padding_mode="zeros", align_corners=False
- )
+ value_l.to(torch.float16), sampling_grid_l.to(torch.float16), mode="bilinear", padding_mode="zeros", align_corners=False
+ ).to(torch.float32)
elif method == "discrete":
# n * m, seq, n, 2
@@ -55,7 +55,7 @@ def main(
model = Model()
- data = torch.rand(32, 3, 640, 640)
+ data = torch.rand(1, 3, 640, 640)
size = torch.tensor([[640, 640]])
_ = model(data, size)
@@ -74,10 +74,11 @@ def main(
output_file,
input_names=["images", "orig_target_sizes"],
output_names=["labels", "boxes", "scores"],
- dynamic_axes=dynamic_axes,
opset_version=16,
verbose=False,
- do_constant_folding=True,
+ do_constant_folding=False,
+ dynamo=False,
+ keep_initializers_as_inputs=False,
)
if args.check:
@@ -122,7 +123,7 @@ if __name__ == "__main__":
parser.add_argument(
"--simplify",
action="store_true",
- default=True,
+ default=False,
)
args = parser.parse_args()
main(args)
@@ -8,9 +8,11 @@ import sys
import cv2 # Added for video processing
import numpy as np
import torch
+import torch_npu
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image, ImageDraw
+from torch_npu.contrib import transfer_to_npu
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from src.core import YAMLConfig