/**
* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

/*!
 * \file device_atomic_functions_debug.cpp
 * \brief Non-atomic stand-ins for the atomic* functions, compiled only when ASCENDC_CPU_DEBUG is defined.
 */
#if defined(ASCENDC_CPU_DEBUG)
#include <bitset>
#include <string>
#include "kernel_utils.h"
#include "kernel_simt_cpu.h"
#include "stub_def.h"

/*!
 * \brief CPU-side model of a fetch-and-add on *address.
 * \param address Location whose value is read and then replaced by the sum.
 * \param val Amount added to the stored value.
 * \return The value held at *address before the addition.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicAddCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = previous + val;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-subtract on *address.
 * \param address Location whose value is read and then replaced by the difference.
 * \param val Amount subtracted from the stored value.
 * \return The value held at *address before the subtraction.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicSubCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = previous - val;
    return previous;
}

/*!
 * \brief CPU-side model of an exchange: writes val to *address.
 * \param address Location that is read and then overwritten.
 * \param val Value stored at *address.
 * \return The value held at *address before the store.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicExchCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = val;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-max on *address.
 * \param address Location whose value is read and then replaced by the larger operand.
 * \param val Candidate value compared against the stored one using operator>.
 * \return The value held at *address before the update.
 * \note The stored value is kept whenever "stored > val" is true; otherwise val is written.
 *       Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicMaxCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    const T larger = (previous > val) ? previous : val;
    *address = larger;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-min on *address.
 * \param address Location whose value is read and then replaced by the smaller operand.
 * \param val Candidate value compared against the stored one using operator<.
 * \return The value held at *address before the update.
 * \note The stored value is kept whenever "stored < val" is true; otherwise val is written.
 *       Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicMinCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    const T smaller = (previous < val) ? previous : val;
    *address = smaller;
    return previous;
}

/*!
 * \brief CPU-side model of a wrapping increment on *address.
 * \param address Location whose value is read and then updated.
 * \param val Wrap threshold: once the stored value is >= val, the next value is 0.
 * \return The value held at *address before the update.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicIncCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    if (previous >= val) {
        // Threshold reached or exceeded: restart from zero.
        *address = 0;
    } else {
        *address = previous + 1;
    }
    return previous;
}

/*!
 * \brief CPU-side model of a wrapping decrement on *address.
 * \param address Location whose value is read and then updated.
 * \param val Reload value: written when the stored value is 0 or greater than val.
 * \return The value held at *address before the update.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicDecCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    const bool reload = (previous == 0) || (previous > val);
    if (reload) {
        *address = val;
    } else {
        *address = previous - 1;
    }
    return previous;
}

/*!
 * \brief CPU-side model of compare-and-swap on *address.
 * \param address Location that is read and conditionally overwritten.
 * \param compare Value the stored one is tested against with operator==.
 * \param val Replacement stored when the comparison succeeds.
 * \return The value held at *address before the operation, whether or not the swap happened.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicCasCPU(__gm__ T *address, T compare, T val)
{
    const T previous = *address;
    const bool matched = (previous == compare);
    *address = matched ? val : previous;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-AND on *address.
 * \param address Location whose value is read and then replaced by the bitwise AND.
 * \param val Mask combined with the stored value using operator&.
 * \return The value held at *address before the update.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicAndCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = previous & val;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-OR on *address.
 * \param address Location whose value is read and then replaced by the bitwise OR.
 * \param val Bits combined with the stored value using operator|.
 * \return The value held at *address before the update.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicOrCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = previous | val;
    return previous;
}

/*!
 * \brief CPU-side model of a fetch-and-XOR on *address.
 * \param address Location whose value is read and then replaced by the bitwise XOR.
 * \param val Bits combined with the stored value using operator^.
 * \return The value held at *address before the update.
 * \note Implemented as an ordinary load followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicXorCPU(__gm__ T *address, T val)
{
    const T previous = *address;
    *address = previous ^ val;
    return previous;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicAddCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_ADD(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicAddCPU<VT>(dst, val); \
    }

// atomicAdd overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_ADD(atomicAdd, uint32_t)
REGISTER_SIMT_ATOMIC_ADD(atomicAdd, int32_t)
REGISTER_SIMT_ATOMIC_ADD(atomicAdd, uint64_t)
REGISTER_SIMT_ATOMIC_ADD(atomicAdd, int64_t)
REGISTER_SIMT_ATOMIC_ADD(atomicAdd, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicSubCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_SUB(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicSubCPU<VT>(dst, val); \
    }

// atomicSub overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_SUB(atomicSub, uint32_t)
REGISTER_SIMT_ATOMIC_SUB(atomicSub, int32_t)
REGISTER_SIMT_ATOMIC_SUB(atomicSub, uint64_t)
REGISTER_SIMT_ATOMIC_SUB(atomicSub, int64_t)
REGISTER_SIMT_ATOMIC_SUB(atomicSub, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicExchCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_EXCH(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicExchCPU<VT>(dst, val); \
    }

// atomicExch overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_EXCH(atomicExch, uint32_t)
REGISTER_SIMT_ATOMIC_EXCH(atomicExch, int32_t)
REGISTER_SIMT_ATOMIC_EXCH(atomicExch, uint64_t)
REGISTER_SIMT_ATOMIC_EXCH(atomicExch, int64_t)
REGISTER_SIMT_ATOMIC_EXCH(atomicExch, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicMaxCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_MAX(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicMaxCPU<VT>(dst, val); \
    }

// atomicMax overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_MAX(atomicMax, uint32_t)
REGISTER_SIMT_ATOMIC_MAX(atomicMax, int32_t)
REGISTER_SIMT_ATOMIC_MAX(atomicMax, uint64_t)
REGISTER_SIMT_ATOMIC_MAX(atomicMax, int64_t)
REGISTER_SIMT_ATOMIC_MAX(atomicMax, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicMinCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_MIN(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicMinCPU<VT>(dst, val); \
    }

// atomicMin overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_MIN(atomicMin, uint32_t)
REGISTER_SIMT_ATOMIC_MIN(atomicMin, int32_t)
REGISTER_SIMT_ATOMIC_MIN(atomicMin, uint64_t)
REGISTER_SIMT_ATOMIC_MIN(atomicMin, int64_t)
REGISTER_SIMT_ATOMIC_MIN(atomicMin, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicIncCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_INC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicIncCPU<VT>(dst, val); \
    }

// atomicInc is instantiated for the unsigned 32/64-bit types only, and only when
// __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_INC(atomicInc, uint32_t)
REGISTER_SIMT_ATOMIC_INC(atomicInc, uint64_t)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicDecCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_DEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicDecCPU<VT>(dst, val); \
    }

// atomicDec is instantiated for the unsigned 32/64-bit types only, and only when
// __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_DEC(atomicDec, uint32_t)
REGISTER_SIMT_ATOMIC_DEC(atomicDec, uint64_t)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val1, VT val2), a non-template function that
 *        returns AtomicCasCPU<VT>(dst, val1, val2); val1 is forwarded as the compare
 *        operand and val2 as the replacement.
 */
#define REGISTER_SIMT_ATOMIC_CAS(FN, VT) \
    VT FN(__gm__ VT *dst, VT val1, VT val2) \
    { \
        return AtomicCasCPU<VT>(dst, val1, val2); \
    }

// atomicCAS overloads are emitted only when __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_CAS(atomicCAS, uint32_t)
REGISTER_SIMT_ATOMIC_CAS(atomicCAS, int32_t)
REGISTER_SIMT_ATOMIC_CAS(atomicCAS, uint64_t)
REGISTER_SIMT_ATOMIC_CAS(atomicCAS, int64_t)
REGISTER_SIMT_ATOMIC_CAS(atomicCAS, float)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicAndCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_AND(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicAndCPU<VT>(dst, val); \
    }

// atomicAnd is instantiated for the 32/64-bit integer types only, and only when
// __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_AND(atomicAnd, uint32_t)
REGISTER_SIMT_ATOMIC_AND(atomicAnd, int32_t)
REGISTER_SIMT_ATOMIC_AND(atomicAnd, uint64_t)
REGISTER_SIMT_ATOMIC_AND(atomicAnd, int64_t)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicOrCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_OR(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicOrCPU<VT>(dst, val); \
    }

// atomicOr is instantiated for the 32/64-bit integer types only, and only when
// __NPU_ARCH__ is defined as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_OR(atomicOr, uint32_t)
REGISTER_SIMT_ATOMIC_OR(atomicOr, int32_t)
REGISTER_SIMT_ATOMIC_OR(atomicOr, uint64_t)
REGISTER_SIMT_ATOMIC_OR(atomicOr, int64_t)
#endif

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicXorCPU<VT>(dst, val).
 */
#define REGISTER_SIMT_ATOMIC_XOR(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicXorCPU<VT>(dst, val); \
    }

// Instantiated for the 32/64-bit integer types only, and only when __NPU_ARCH__ is
// defined as 3510 or 5102.
// NOTE(review): the generated name is spelled "atomicXOr" (capital O), unlike atomicOr /
// atomicAnd - confirm this matches the declaration callers actually use.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_XOR(atomicXOr, uint32_t)
REGISTER_SIMT_ATOMIC_XOR(atomicXOr, int32_t)
REGISTER_SIMT_ATOMIC_XOR(atomicXOr, uint64_t)
REGISTER_SIMT_ATOMIC_XOR(atomicXOr, int64_t)
#endif

/*!
 * \brief Placeholder for an add on a vector element type: reads *address and returns it.
 * \param address Location that is read; it is not written.
 * \param val Ignored.
 * \return The value currently held at *address.
 * \note NOTE(review): no addition is applied to memory here - confirm whether this is
 *       an intentional stub.
 */
template <typename T>
T AtomicAddVecCPU(__gm__ T *address, T val)
{
    (void)val;  // operand is not applied in this placeholder
    return *address;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicAddVecCPU<VT>(dst, val).
 */
#define REGISTER_ATOMIC_ADD_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicAddVecCPU<VT>(dst, val); \
    }

// half2 / bfloat16x2_t atomicAdd overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_ATOMIC_ADD_VEC(atomicAdd, half2)
REGISTER_ATOMIC_ADD_VEC(atomicAdd, bfloat16x2_t)
#endif

/*!
 * \brief Placeholder for a subtract on a vector element type: reads *address and returns it.
 * \param address Location that is read; it is not written.
 * \param val Ignored.
 * \return The value currently held at *address.
 * \note NOTE(review): no subtraction is applied to memory here - confirm whether this is
 *       an intentional stub.
 */
template <typename T>
T AtomicSubVecCPU(__gm__ T *address, T val)
{
    (void)val;  // operand is not applied in this placeholder
    return *address;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicSubVecCPU<VT>(dst, val).
 */
#define REGISTER_ATOMIC_SUB_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicSubVecCPU<VT>(dst, val); \
    }

// half2 / bfloat16x2_t atomicSub overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_ATOMIC_SUB_VEC(atomicSub, half2)
REGISTER_ATOMIC_SUB_VEC(atomicSub, bfloat16x2_t)
#endif

/*!
 * \brief Exchange on a vector element type: stores val at *address.
 * \param address Location that is read and then overwritten.
 * \param val Value written to *address.
 * \return The value held at *address before the store.
 * \note Only copy construction and copy assignment of T are required, so this works for
 *       any element type without lane-wise operators. Implemented as an ordinary load
 *       followed by a store; no synchronization is performed here.
 */
template <typename T>
T AtomicExchVecCPU(__gm__ T *address, T val)
{
    T ret = *address;
    // Perform the actual exchange; without this store the caller's value would be dropped.
    *address = val;
    return ret;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicExchVecCPU<VT>(dst, val).
 */
#define REGISTER_ATOMIC_EXCH_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicExchVecCPU<VT>(dst, val); \
    }

// half2 / bfloat16x2_t atomicExch overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_ATOMIC_EXCH_VEC(atomicExch, half2)
REGISTER_ATOMIC_EXCH_VEC(atomicExch, bfloat16x2_t)
#endif

/*!
 * \brief Placeholder for a max on a vector element type: reads *address and returns it.
 * \param address Location that is read; it is not written.
 * \param val Ignored.
 * \return The value currently held at *address.
 * \note NOTE(review): no maximum is computed or stored here - confirm whether this is
 *       an intentional stub.
 */
template <typename T>
T AtomicMaxVecCPU(__gm__ T *address, T val)
{
    (void)val;  // operand is not applied in this placeholder
    return *address;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicMaxVecCPU<VT>(dst, val).
 */
#define REGISTER_ATOMIC_MAX_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicMaxVecCPU<VT>(dst, val); \
    }

// half2 / bfloat16x2_t atomicMax overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_ATOMIC_MAX_VEC(atomicMax, half2)
REGISTER_ATOMIC_MAX_VEC(atomicMax, bfloat16x2_t)
#endif

/*!
 * \brief Placeholder for a min on a vector element type: reads *address and returns it.
 * \param address Location that is read; it is not written.
 * \param val Ignored.
 * \return The value currently held at *address.
 * \note NOTE(review): no minimum is computed or stored here - confirm whether this is
 *       an intentional stub.
 */
template <typename T>
T AtomicMinVecCPU(__gm__ T *address, T val)
{
    (void)val;  // operand is not applied in this placeholder
    return *address;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val), a non-template function that
 *        returns AtomicMinVecCPU<VT>(dst, val).
 */
#define REGISTER_ATOMIC_MIN_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val) \
    { \
        return AtomicMinVecCPU<VT>(dst, val); \
    }

// half2 / bfloat16x2_t atomicMin overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_ATOMIC_MIN_VEC(atomicMin, half2)
REGISTER_ATOMIC_MIN_VEC(atomicMin, bfloat16x2_t)
#endif

/*!
 * \brief Placeholder for compare-and-swap on a vector element type: reads *address and
 *        returns it.
 * \param address Location that is read; it is not written.
 * \param compare Ignored.
 * \param val Ignored.
 * \return The value currently held at *address.
 * \note NOTE(review): neither the comparison nor the swap is performed here - confirm
 *       whether this is an intentional stub.
 */
template <typename T>
T AtomicCasVecCPU(__gm__ T *address, T compare, T val)
{
    (void)compare;  // operands are not applied in this placeholder
    (void)val;
    return *address;
}

/*!
 * \brief Generates FN(__gm__ VT *dst, VT val1, VT val2), a non-template function that
 *        returns AtomicCasVecCPU<VT>(dst, val1, val2); val1 is forwarded as the compare
 *        operand and val2 as the replacement.
 */
#define REGISTER_SIMT_ATOMIC_CAS_VEC(FN, VT) \
    VT FN(__gm__ VT *dst, VT val1, VT val2) \
    { \
        return AtomicCasVecCPU<VT>(dst, val1, val2); \
    }

// half2 / bfloat16x2_t atomicCAS overloads, emitted only when __NPU_ARCH__ is defined
// as 3510 or 5102.
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
REGISTER_SIMT_ATOMIC_CAS_VEC(atomicCAS, half2)
REGISTER_SIMT_ATOMIC_CAS_VEC(atomicCAS, bfloat16x2_t)
#endif

#endif