/**
 * Copyright (c) 2026 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#include <array>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "tikicpulib.h"
#include "data_utils.h"

#include "../../../op_kernel/atan2.cpp"

using namespace std;

/**
 * Test fixture for the atan2 kernel unit tests.
 *
 * Suite-level setup copies the atan2_data directory (data generation and comparison scripts)
 * into the current working directory and makes it accessible, so the individual test cases
 * can invoke the scripts and read/write the binary files under ./atan2_data/.
 */
class Atan2Test : public testing::Test {
protected:
    /**
     * Stages ./atan2_data/ once for the whole suite.
     *
     * The shell commands' exit codes are checked so that a wrong working directory or a
     * missing source tree is reported here instead of surfacing later as missing files.
     */
    static void SetUpTestCase()
    {
        // std::endl flushes the banner before the child processes below start writing.
        std::cout << "atan2_test SetUp" << std::endl;

        // The path is relative to the build directory and reaches directly into the
        // experimental source tree.
        const std::string copyCmd = "cp -rf ../experimental/math/atan2/tests/ut/op_kernel/atan2_data ./";
        ASSERT_EQ(std::system(copyCmd.c_str()), 0) << "failed to stage test data with: " << copyCmd;

        // Non-fatal: the copied files may already carry usable permissions.
        EXPECT_EQ(std::system("chmod -R 755 ./atan2_data/"), 0) << "failed to chmod ./atan2_data/";
    }

    /** Suite-level teardown; only logs, the staged data directory is left in place. */
    static void TearDownTestCase()
    {
        std::cout << "atan2_test TearDown" << std::endl;
    }
};

/**
 * Rounds `a` up to the nearest multiple of `b`.
 *
 * @param a value to align
 * @param b alignment step; must be non-zero
 * @return the smallest multiple of `b` that is >= `a` (for non-negative integral inputs),
 *         converted back to T1
 */
template <typename T1, typename T2>
inline T1 CeilAlign(T1 a, T2 b)
{
    // Number of whole `b`-sized steps needed to cover `a`.
    const auto stepCount = (a + b - 1) / b;
    return stepCount * b;
}

/**
 * Runs the atan2 kernel on a (128, 64) float16 input pair on the CPU simulator.
 *
 * Flow: generate input/golden data with gen_data.py, load both inputs into GM buffers,
 * launch the kernel with a hand-written tiling, dump the output to a .bin file and let
 * compare_data.py check it. The exit codes of both Python scripts are verified so the
 * test fails when data generation or the precision comparison fails.
 */
TEST_F(Atan2Test, test_case_float16_1)
{
    // Generate the test data with Python. Checked before any allocation so that an early
    // abort cannot leak GM buffers.
    ASSERT_EQ(std::system("cd ./atan2_data/ && python3 gen_data.py '(128, 64)' 'float16'"), 0)
        << "gen_data.py failed; input .bin files are not available";

    const uint32_t blockDim = 40;
    const uint32_t dataCount = 128 * 64;

    // Kept non-const: it is also passed as ReadFile's file-size argument, which may be
    // taken by non-const reference -- TODO confirm against data_utils.h.
    size_t inputByteSize = dataCount * sizeof(half);
    const size_t outputByteSize = dataCount * sizeof(half);

    // Buffers are allocated with their size rounded up to a multiple of 256 bytes.
    const std::string x1_fileName = "./atan2_data/float16_input_x1_t_atan2.bin";
    uint8_t* x1 = static_cast<uint8_t*>(AscendC::GmAlloc(CeilAlign(inputByteSize, 256)));
    ReadFile(x1_fileName, inputByteSize, x1, inputByteSize);

    const std::string x2_fileName = "./atan2_data/float16_input_x2_t_atan2.bin";
    uint8_t* x2 = static_cast<uint8_t*>(AscendC::GmAlloc(CeilAlign(inputByteSize, 256)));
    ReadFile(x2_fileName, inputByteSize, x2, inputByteSize);

    uint8_t* y = static_cast<uint8_t*>(AscendC::GmAlloc(CeilAlign(outputByteSize, 256)));

    const size_t workspaceSize = 16 * 1024 * 1024;
    uint8_t* workspace = static_cast<uint8_t*>(AscendC::GmAlloc(workspaceSize));
    uint8_t* tiling = static_cast<uint8_t*>(AscendC::GmAlloc(sizeof(Atan2TilingData)));

    // Zero the tiling buffer first so any field not assigned below has a defined value
    // instead of whatever the allocation happened to contain.
    std::memset(tiling, 0, sizeof(Atan2TilingData));
    Atan2TilingData* tilingData = reinterpret_cast<Atan2TilingData*>(tiling);

    // The tile sizes in the tiling are deliberately shrunk so the 192KB UB memory is not
    // exhausted.
    tilingData->smallCoreDataNum = 128;
    tilingData->bigCoreDataNum = 144;
    tilingData->finalBigTileNum = 1;
    tilingData->finalSmallTileNum = 1;
    tilingData->tileDataNum = 256;      // reduced: originally 4080
    tilingData->smallTailDataNum = 128;
    tilingData->bigTailDataNum = 144;
    tilingData->tailBlockNum = 0;
    tilingData->atanTmpSize = 8192;     // reduced: originally 130560

    AscendC::SetKernelMode(KernelMode::AIV_MODE);
    auto func = atan2<ELEMENTWISE_TPL_SCH_MODE_0>;
    ICPU_RUN_KF(func, blockDim, x1, x2, y, workspace, tiling);

    // Persist the kernel output for the comparison script.
    const std::string fileName = "./atan2_data/float16_output_t_atan2.bin";
    WriteFile(fileName, y, outputByteSize);

    AscendC::GmFree(static_cast<void*>(x1));
    AscendC::GmFree(static_cast<void*>(x2));
    AscendC::GmFree(static_cast<void*>(y));
    AscendC::GmFree(static_cast<void*>(workspace));
    AscendC::GmFree(static_cast<void*>(tiling));

    // Run the Python precision comparison; a non-zero exit status fails the test.
    // NOTE(review): assumes compare_data.py exits non-zero on a mismatch -- confirm.
    EXPECT_EQ(std::system("cd ./atan2_data/ && python3 compare_data.py 'float16'"), 0)
        << "compare_data.py reported a failure for float16";
}