/**
* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

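/*!
 * \file kernel_simt_atomic_impl.h
 * \brief SIMT atomic operation implementations (CAS, add, sub, exchange, max, min, inc, dec, and, or, xor),
 *        with a lock-based variant for ASCENDC_CPU_DEBUG builds and a device variant otherwise.
 */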
#ifndef IMPL_SIMT_API_CPP_DAV_C310_KERNEL_SIMT_ATOMIC_IMPL_H
#define IMPL_SIMT_API_CPP_DAV_C310_KERNEL_SIMT_ATOMIC_IMPL_H

#if defined(ASCENDC_CPU_DEBUG)
#include "kernel_process_lock.h"
#include "kernel_utils.h"
#include "kernel_simt_cpu.h"
#include "stub_def.h"
#endif

namespace AscendC {
namespace Simt {
#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug compare-and-swap on a __gm__ location.
 *
 * Acquires the process lock in write mode, snapshots *address, then hands the conditional
 * store to AtomicOp of the ThreadBlock returned by GetBlockInstance(), and finally unlocks.
 *
 * \param address location to update
 * \param compare value *address must equal for the store to take place
 * \param val value written when the comparison succeeds
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicCasImpl(__gm__ T *address, T compare, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, compare, val]() {
        // Leave the location untouched unless it currently holds the expected value.
        if (!(*address == compare)) {
            return;
        }
        *address = val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Compare-and-swap on a __ubuf__ location; forwards to atomicCAS.
 * \param address location to update
 * \param compare expected value passed to atomicCAS
 * \param val replacement value passed to atomicCAS
 * \return the result of atomicCAS(address, compare, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicCasImpl(__ubuf__ T *address, T compare, T val)
{
    T casResult = atomicCAS(address, compare, val);
    return casResult;
}

/*!
 * \brief Compare-and-swap on a __gm__ location; forwards to atomicCAS.
 * \param address location to update
 * \param compare expected value passed to atomicCAS
 * \param val replacement value passed to atomicCAS
 * \return the result of atomicCAS(address, compare, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicCasImpl(__gm__ T *address, T compare, T val)
{
    T casResult = atomicCAS(address, compare, val);
    return casResult;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug add on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then runs `*address += val`
 * through AtomicOp of the ThreadBlock returned by GetBlockInstance().
 *
 * \param address location to update
 * \param val amount added to *address
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicAddImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        *address += val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Add on a __ubuf__ location via atomicAdd.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, float> holds, the result of atomicAdd is
 * returned directly. Otherwise the result of atomicAdd is not used and a separate read of
 * *address, made after the call, is returned.
 * NOTE(review): that second read is a distinct statement from the atomicAdd call, so in this
 * branch the function yields the content after the update rather than before it - confirm intended.
 *
 * \param address location to update
 * \param val amount added to *address
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicAddImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicResult = SupportTypeSimtInternel<T, int32_t, uint32_t, float>;
    if constexpr (!useAtomicResult) {
        atomicAdd(address, val);
        return *address;
    } else {
        return atomicAdd(address, val);
    }
}

/*!
 * \brief Add on a __gm__ location via atomicAdd.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>
 * holds, the result of atomicAdd is returned directly. Otherwise the result of atomicAdd is not
 * used and a separate read of *address, made after the call, is returned.
 * NOTE(review): that second read is a distinct statement from the atomicAdd call, so in this
 * branch the function yields the content after the update rather than before it - confirm intended.
 *
 * \param address location to update
 * \param val amount added to *address
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicAddImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicResult =
        SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>;
    if constexpr (!useAtomicResult) {
        atomicAdd(address, val);
        return *address;
    } else {
        return atomicAdd(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug subtract on a __gm__ location, implemented as an add of the negated value.
 *
 * \param address location to update
 * \param val amount subtracted from *address
 * \return whatever AtomicAddImpl returns for (address, 0 - val)
 */
template <typename T>
T AtomicSubImpl(__gm__ T *address, T val)
{
    // For types narrower than int, `T(0) - val` is promoted to int. Passing that straight to
    // AtomicAddImpl would make template deduction see T from `address` and int from the value,
    // which fails to compile. Convert back to T and name T explicitly to avoid the conflict.
    const T negated = static_cast<T>(static_cast<T>(0) - val);
    return AtomicAddImpl<T>(address, negated);
}
#else
/*!
 * \brief Subtract on a __ubuf__ location.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, float> holds, the result of atomicSub is
 * returned directly. Otherwise -val is added with atomicAdd, its result is not used, and a
 * separate read of *address made after the call is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val amount subtracted from *address
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicSubImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicSub = SupportTypeSimtInternel<T, int32_t, uint32_t, float>;
    if constexpr (!useAtomicSub) {
        atomicAdd(address, -val);
        return *address;
    } else {
        return atomicSub(address, val);
    }
}

/*!
 * \brief Subtract on a __gm__ location.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>
 * holds, the result of atomicSub is returned directly. Otherwise -val is added with atomicAdd,
 * its result is not used, and a separate read of *address made after the call is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val amount subtracted from *address
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicSubImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicSub =
        SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>;
    if constexpr (!useAtomicSub) {
        atomicAdd(address, -val);
        return *address;
    } else {
        return atomicSub(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug exchange on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then stores val through AtomicOp
 * of the ThreadBlock returned by GetBlockInstance().
 *
 * \param address location to update
 * \param val value written to *address
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicExchImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        *address = val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Exchange on a __ubuf__ location; forwards to atomicExch.
 * \param address location to update
 * \param val value passed to atomicExch
 * \return the result of atomicExch(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicExchImpl(__ubuf__ T *address, T val)
{
    T exchResult = atomicExch(address, val);
    return exchResult;
}

/*!
 * \brief Exchange on a __gm__ location; forwards to atomicExch.
 * \param address location to update
 * \param val value passed to atomicExch
 * \return the result of atomicExch(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicExchImpl(__gm__ T *address, T val)
{
    T exchResult = atomicExch(address, val);
    return exchResult;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug maximum on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then, through AtomicOp of the
 * ThreadBlock returned by GetBlockInstance(), stores val if *address is less than val.
 *
 * \param address location to update
 * \param val candidate maximum
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicMaxImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        // Only a strictly larger candidate replaces the stored value.
        if (!(*address < val)) {
            return;
        }
        *address = val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Maximum on a __ubuf__ location via atomicMax.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, float> holds, the result of atomicMax is
 * returned directly. Otherwise its result is not used and a separate read of *address, made
 * after the call, is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val candidate maximum
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicMaxImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicResult = SupportTypeSimtInternel<T, int32_t, uint32_t, float>;
    if constexpr (!useAtomicResult) {
        atomicMax(address, val);
        return *address;
    } else {
        return atomicMax(address, val);
    }
}

/*!
 * \brief Maximum on a __gm__ location via atomicMax.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>
 * holds, the result of atomicMax is returned directly. Otherwise its result is not used and a
 * separate read of *address, made after the call, is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val candidate maximum
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicMaxImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicResult =
        SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>;
    if constexpr (!useAtomicResult) {
        atomicMax(address, val);
        return *address;
    } else {
        return atomicMax(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug minimum on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then, through AtomicOp of the
 * ThreadBlock returned by GetBlockInstance(), stores val if *address is greater than val.
 *
 * \param address location to update
 * \param val candidate minimum
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicMinImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        // Only a strictly smaller candidate replaces the stored value.
        if (!(*address > val)) {
            return;
        }
        *address = val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Minimum on a __ubuf__ location via atomicMin.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, float> holds, the result of atomicMin is
 * returned directly. Otherwise its result is not used and a separate read of *address, made
 * after the call, is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val candidate minimum
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicMinImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicResult = SupportTypeSimtInternel<T, int32_t, uint32_t, float>;
    if constexpr (!useAtomicResult) {
        atomicMin(address, val);
        return *address;
    } else {
        return atomicMin(address, val);
    }
}

/*!
 * \brief Minimum on a __gm__ location via atomicMin.
 *
 * When SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>
 * holds, the result of atomicMin is returned directly. Otherwise its result is not used and a
 * separate read of *address, made after the call, is returned.
 * NOTE(review): in the fallback branch the returned value is read after the update - confirm intended.
 *
 * \param address location to update
 * \param val candidate minimum
 * \return see the description above
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicMinImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicResult =
        SupportTypeSimtInternel<T, int32_t, uint32_t, int64_t, uint64_t, float, half2, bfloat16x2_t>;
    if constexpr (!useAtomicResult) {
        atomicMin(address, val);
        return *address;
    } else {
        return atomicMin(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug wrapping increment on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then, through AtomicOp of the
 * ThreadBlock returned by GetBlockInstance(), resets *address to 0 when it is >= val
 * and otherwise adds 1 to it.
 *
 * \param address location to update
 * \param val limit at or above which the stored value is reset to 0
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicIncImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        // Wrap around once the stored value has reached the limit.
        if (*address >= val) {
            *address = (T)0;
            return;
        }
        *address += (T)1;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Wrapping increment built from a compare-and-swap retry loop.
 *
 * Reads *address, computes the next value (0 when the observed value is >= val, otherwise
 * observed + 1) and tries to install it with AtomicCasImpl. The loop repeats with the value
 * returned by AtomicCasImpl until that value equals the one the computation was based on.
 *
 * \param address location to update
 * \param val limit at or above which the stored value is reset to 0
 * \return the last value returned by AtomicCasImpl, i.e. the value the final update was based on
 */
template <typename DstType, typename SrcType>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline DstType AtomicIncImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    for (;;) {
        DstType expected = observed;
        DstType desired;
        if (expected >= val) {
            desired = (DstType)0;
        } else {
            desired = expected + (DstType)1;
        }
        observed = AtomicCasImpl(address, expected, desired);
        // Done once the CAS reports the same value the update was computed from.
        if (!(expected != observed)) {
            break;
        }
    }
    return observed;
}

/*!
 * \brief Wrapping increment on a __ubuf__ location.
 *
 * Uses atomicInc when SupportTypeSimtInternel<T, uint32_t> holds; every other type goes
 * through the CAS-loop helper AtomicIncImpl_.
 *
 * \param address location to update
 * \param val limit forwarded to atomicInc / AtomicIncImpl_
 * \return the result of the selected implementation
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicIncImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicInc = SupportTypeSimtInternel<T, uint32_t>;
    if constexpr (!useAtomicInc) {
        return AtomicIncImpl_<T, __ubuf__ T>(address, val);
    } else {
        return atomicInc(address, val);
    }
}

/*!
 * \brief Wrapping increment on a __gm__ location.
 *
 * Uses atomicInc when SupportTypeSimtInternel<T, uint32_t, uint64_t> holds; every other type
 * goes through the CAS-loop helper AtomicIncImpl_.
 *
 * \param address location to update
 * \param val limit forwarded to atomicInc / AtomicIncImpl_
 * \return the result of the selected implementation
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicIncImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicInc = SupportTypeSimtInternel<T, uint32_t, uint64_t>;
    if constexpr (!useAtomicInc) {
        return AtomicIncImpl_<T, __gm__ T>(address, val);
    } else {
        return atomicInc(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug wrapping decrement on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then, through AtomicOp of the
 * ThreadBlock returned by GetBlockInstance(), stores val when *address is 0 or greater
 * than val and otherwise subtracts 1 from it.
 *
 * \param address location to update
 * \param val value the location is reloaded with when it is 0 or above val
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicDecImpl(__gm__ T *address, T val)
{
    ProcessLock::GetProcessLock()->Write();
    T previous = *address;
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() {
        // Plain decrement only while the stored value lies in (0, val].
        if (!(*address == (T)0 || *address > val)) {
            *address -= (T)1;
            return;
        }
        *address = val;
    });
    ProcessLock::GetProcessLock()->Unlock();
    return previous;
}
#else
/*!
 * \brief Wrapping decrement built from a compare-and-swap retry loop.
 *
 * Reads *address, computes the next value (val when the observed value is 0 or greater than
 * val, otherwise observed - 1) and tries to install it with AtomicCasImpl. The loop repeats
 * with the value returned by AtomicCasImpl until that value equals the one the computation
 * was based on.
 *
 * \param address location to update
 * \param val value the location is reloaded with when it is 0 or above val
 * \return the last value returned by AtomicCasImpl, i.e. the value the final update was based on
 */
template <typename DstType, typename SrcType>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline DstType AtomicDecImpl_(SrcType *address, DstType val)
{
    DstType observed = *address;
    for (;;) {
        DstType expected = observed;
        DstType desired;
        if (expected == (DstType)0 || expected > val) {
            desired = val;
        } else {
            desired = expected - (DstType)1;
        }
        observed = AtomicCasImpl(address, expected, desired);
        // Done once the CAS reports the same value the update was computed from.
        if (!(expected != observed)) {
            break;
        }
    }
    return observed;
}

/*!
 * \brief Wrapping decrement on a __ubuf__ location.
 *
 * Uses atomicDec when SupportTypeSimtInternel<T, uint32_t> holds; every other type goes
 * through the CAS-loop helper AtomicDecImpl_.
 *
 * \param address location to update
 * \param val limit forwarded to atomicDec / AtomicDecImpl_
 * \return the result of the selected implementation
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicDecImpl(__ubuf__ T *address, T val)
{
    constexpr bool useAtomicDec = SupportTypeSimtInternel<T, uint32_t>;
    if constexpr (!useAtomicDec) {
        return AtomicDecImpl_<T, __ubuf__ T>(address, val);
    } else {
        return atomicDec(address, val);
    }
}

/*!
 * \brief Wrapping decrement on a __gm__ location.
 *
 * Uses atomicDec when SupportTypeSimtInternel<T, uint32_t, uint64_t> holds; every other type
 * goes through the CAS-loop helper AtomicDecImpl_.
 *
 * \param address location to update
 * \param val limit forwarded to atomicDec / AtomicDecImpl_
 * \return the result of the selected implementation
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicDecImpl(__gm__ T *address, T val)
{
    constexpr bool useAtomicDec = SupportTypeSimtInternel<T, uint32_t, uint64_t>;
    if constexpr (!useAtomicDec) {
        return AtomicDecImpl_<T, __gm__ T>(address, val);
    } else {
        return atomicDec(address, val);
    }
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise AND on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then runs the AND through AtomicOp
 * of the ThreadBlock returned by GetBlockInstance().
 *
 * \param address location to update
 * \param val mask ANDed into *address
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicAndImpl(__gm__ T *address, T val)
{
    T ret;
    ProcessLock::GetProcessLock()->Write();
    ret = *address;
    // Combine with the value found inside the AtomicOp callback (as the add/max/min variants
    // do) instead of the earlier snapshot, so an update made in between cannot be overwritten.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address & val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise AND on a __ubuf__ location; forwards to atomicAnd.
 * \param address location to update
 * \param val mask passed to atomicAnd
 * \return the result of atomicAnd(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicAndImpl(__ubuf__ T *address, T val)
{
    T andResult = atomicAnd(address, val);
    return andResult;
}

/*!
 * \brief Bitwise AND on a __gm__ location; forwards to atomicAnd.
 * \param address location to update
 * \param val mask passed to atomicAnd
 * \return the result of atomicAnd(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicAndImpl(__gm__ T *address, T val)
{
    T andResult = atomicAnd(address, val);
    return andResult;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise OR on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then runs the OR through AtomicOp
 * of the ThreadBlock returned by GetBlockInstance().
 *
 * \param address location to update
 * \param val mask ORed into *address
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicOrImpl(__gm__ T *address, T val)
{
    T ret;
    ProcessLock::GetProcessLock()->Write();
    ret = *address;
    // Combine with the value found inside the AtomicOp callback (as the add/max/min variants
    // do) instead of the earlier snapshot, so an update made in between cannot be overwritten.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address | val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise OR on a __ubuf__ location; forwards to atomicOr.
 * \param address location to update
 * \param val mask passed to atomicOr
 * \return the result of atomicOr(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicOrImpl(__ubuf__ T *address, T val)
{
    T orResult = atomicOr(address, val);
    return orResult;
}

/*!
 * \brief Bitwise OR on a __gm__ location; forwards to atomicOr.
 * \param address location to update
 * \param val mask passed to atomicOr
 * \return the result of atomicOr(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicOrImpl(__gm__ T *address, T val)
{
    T orResult = atomicOr(address, val);
    return orResult;
}
#endif

#if defined(ASCENDC_CPU_DEBUG)
/*!
 * \brief CPU-debug bitwise XOR on a __gm__ location.
 *
 * Under the process write lock, snapshots *address and then runs the XOR through AtomicOp
 * of the ThreadBlock returned by GetBlockInstance().
 *
 * \param address location to update
 * \param val mask XORed into *address
 * \return the snapshot of *address taken before AtomicOp is invoked
 */
template <typename T>
T AtomicXorImpl(__gm__ T *address, T val)
{
    T ret;
    ProcessLock::GetProcessLock()->Write();
    ret = *address;
    // Combine with the value found inside the AtomicOp callback (as the add/max/min variants
    // do) instead of the earlier snapshot, so an update made in between cannot be overwritten.
    ThreadBlock::GetBlockInstance().AtomicOp([address, val]() { *address = *address ^ val; });
    ProcessLock::GetProcessLock()->Unlock();
    return ret;
}
#else
/*!
 * \brief Bitwise XOR on a __ubuf__ location; forwards to atomicXOr.
 * \param address location to update
 * \param val mask passed to atomicXOr
 * \return the result of atomicXOr(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicXorImpl(__ubuf__ T *address, T val)
{
    T xorResult = atomicXOr(address, val);
    return xorResult;
}

/*!
 * \brief Bitwise XOR on a __gm__ location; forwards to atomicXOr.
 * \param address location to update
 * \param val mask passed to atomicXOr
 * \return the result of atomicXOr(address, val)
 */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AtomicXorImpl(__gm__ T *address, T val)
{
    T xorResult = atomicXOr(address, val);
    return xorResult;
}
#endif

}  // namespace Simt
}  // namespace AscendC
#endif  // IMPL_SIMT_API_CPP_DAV_C310_KERNEL_SIMT_ATOMIC_IMPL_H