# ops-transformer_8242/experimental/mamba/utils/utils.py - code preview - ops-transformer_8242: CANN-based advanced operator library for transformer-style large models - AtomGit

# ----------------------------------------------------------------------------
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of 
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# ----------------------------------------------------------------------------


import logging
import torch
import torch_npu

# Configure the root logger when this module is imported so that INFO-level
# messages are emitted. basicConfig() does nothing if the root logger already
# has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by check_diff() and profiling() below.
logger = logging.getLogger(__name__)


def check_diff(x, y):
    """Log the maximum absolute and relative difference between two tensors.

    The relative figure is the largest element-wise absolute difference
    divided by the largest absolute value found in ``x``; ``x`` therefore
    acts as the reference.

    Args:
        x: Reference tensor.
        y: Tensor compared against ``x``; ``x - y`` must be a valid operation.

    Returns:
        None. The result is reported through the module logger at INFO level.
    """
    max_diff = torch.abs(x - y).max().item()
    # Normalise by the largest magnitude in the reference. Taking abs() of
    # x.max() instead would shrink the denominator whenever the
    # largest-magnitude entry of x is negative, inflating the relative error.
    ref_scale = torch.abs(x).max().item()

    if ref_scale != 0:
        rel_diff = max_diff / ref_scale
    elif max_diff == 0:
        # All-zero reference and no difference: report a clean zero.
        rel_diff = 0.0
    else:
        # All-zero reference but a non-zero difference: the relative error
        # is unbounded.
        rel_diff = float("inf")

    logger.info(
        "Max diff: %.04f | Relative max diff: %.05f",
        max_diff,
        rel_diff,
    )


def profiling(model, inputs, mode, *, repeat=10, warmup=5):
    """Profile ``model`` on the NPU and log its mean latency per call.

    The model is invoked ``warmup`` times untimed, then ``repeat`` times
    between two NPU timing events. Everything runs inside a single
    ``torch_npu`` profiler step whose trace handler is created for the
    ``./{mode}_profile_results`` directory.

    NOTE: the timed loop executes while the profiler is active, so the logged
    latency may include profiler overhead.

    Args:
        model: Callable invoked as ``model(*inputs)``.
        inputs: Iterable of positional arguments unpacked into ``model``.
        mode: Label used in the log message and in the trace directory name.
        repeat: Number of timed invocations; must be positive because the
            elapsed time is divided by it.
        warmup: Number of untimed invocations executed before timing starts.

    Returns:
        None. The mean latency is reported through the module logger.

    Raises:
        ValueError: If ``repeat`` is not positive.
    """
    # Validate before entering the profiler so a bad argument does not start
    # a profiling session.
    if repeat <= 0:
        raise ValueError("repeat must be a positive integer")

    npu_profiler = torch_npu.profiler
    with npu_profiler.profile(
        activities=[
            npu_profiler.ProfilerActivity.CPU,
            npu_profiler.ProfilerActivity.NPU,
        ],
        # One active step and no wait/warmup steps: the single prof.step()
        # call at the end closes the only profiled step.
        schedule=npu_profiler.schedule(wait=0, warmup=0, active=1, repeat=1, skip_first=0),
        on_trace_ready=npu_profiler.tensorboard_trace_handler(f'./{mode}_profile_results'),
    ) as prof:
        for _ in range(warmup):
            model(*inputs)

        start_event = torch.npu.Event(enable_timing=True)
        end_event = torch.npu.Event(enable_timing=True)
        start_event.record()
        for _ in range(repeat):
            model(*inputs)
        end_event.record()
        # Block until the end event has completed so the elapsed time covers
        # all queued work rather than just the host-side launch.
        end_event.synchronize()
        elapsed_ms = start_event.elapsed_time(end_event)

        # elapsed_ms is the total for all timed calls; the mean is scaled by
        # 1000 to match the "us" unit in the message.
        logger.info(
            ">>>> %s IMPL TIME ELAPSED: %.1f us",
            mode,
            elapsed_ms / repeat * 1000,
        )

        prof.step()