/**
* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/**
* NOTE: Portions of this code were AI-generated and have been
* technically reviewed for functional accuracy and security
*/
/**
* @file test_aclnn_population_count.cpp
* @brief ACLNN invocation example for the PopulationCount operator (two-phase interface)
*/
#include <iostream>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>
#include "acl/acl.h"
#include "aclnn_population_count.h"
// Evaluates `expr` once and compares the result with ACL_SUCCESS. On mismatch it
// writes the expression text, the returned code and the file/line to std::cerr,
// then jumps to a label named `cleanup`, so every function that uses this macro
// must define that label. Wrapped in do { } while (0) so it behaves as a single
// statement.
#define CHECK_ACL(expr) \
do { \
auto _ret = (expr); \
if (_ret != ACL_SUCCESS) { \
std::cerr << "ACL Error: " << #expr << " returned " << _ret \
<< " at " << __FILE__ << ":" << __LINE__ << std::endl; \
goto cleanup; \
} \
} while (0)
/**
 * @brief CPU reference implementation: counts the set bits of a 16-bit value.
 *
 * @param v Value whose 1-bits are counted.
 * @return Number of bits set in v, in the range 0..16.
 */
static inline uint8_t CpuPopcount16(uint16_t v) {
    uint32_t remaining = v;
    uint8_t ones = 0;
    // Each round clears the lowest set bit, so the loop runs once per 1-bit.
    while (remaining != 0U) {
        remaining &= remaining - 1U;
        ++ones;
    }
    return ones;
}
/**
 * @brief Runs PopulationCount on the device for one input dtype and checks the
 *        result against the CPU reference.
 *
 * Copies elemCount 16-bit values to the device, calls the two-phase
 * aclnnPopulationCount interface (GetWorkspaceSize, then execute) on the given
 * stream, copies the uint8 output back and prints a per-element comparison
 * table.
 *
 * @param label     Name printed in the report and in error messages.
 * @param dtype     ACL data type used for the input tensor; the output tensor
 *                  is always created as ACL_UINT8.
 * @param rawU16    Host buffer holding elemCount raw 16-bit patterns; must not
 *                  be null.
 * @param elemCount Number of elements to process; must be positive.
 * @param stream    Stream the operator is launched on and then synchronized.
 * @return 0 when every element matches the CPU reference; 1 on a mismatch, an
 *         invalid argument, or any ACL failure.
 */
static int RunOneCase(const char* label,
                      aclDataType dtype,
                      const uint16_t* rawU16,
                      int64_t elemCount,
                      aclrtStream stream)
{
    // Streaming a null const char* is undefined, so fall back to a placeholder.
    const char* const caseName = (label != nullptr) ? label : "(unnamed)";

    // A null source or a non-positive count would make the host-side range copy
    // below undefined and would wrap when converted to an unsigned byte count.
    // Nothing has been allocated yet, so a plain return is sufficient here.
    if (rawU16 == nullptr || elemCount <= 0) {
        std::cerr << "[" << caseName << "] invalid input: rawU16 must be non-null and "
                  << "elemCount must be positive (got " << elemCount << ")" << std::endl;
        return 1;
    }

    // Every local that the cleanup section reads is declared before the first
    // CHECK_ACL so that no `goto cleanup` jumps over an initialization.
    const int64_t shape[] = {elemCount};
    const int64_t strides[] = {1};
    constexpr int64_t ndim = 1;
    int32_t ret = 1;
    void *devX = nullptr;
    void *devY = nullptr;
    void *workspace = nullptr;
    aclTensor *xTensor = nullptr;
    aclTensor *yTensor = nullptr;
    const size_t elemTotal = static_cast<size_t>(elemCount);
    const size_t inputBytes = elemTotal * sizeof(uint16_t);
    const size_t outputBytes = elemTotal * sizeof(uint8_t);

    // Host copy of the input plus the CPU reference result for each element.
    std::vector<uint16_t> hostX(rawU16, rawU16 + elemCount);
    std::vector<uint8_t> expected(elemTotal, 0);
    for (size_t i = 0; i < elemTotal; ++i) {
        expected[i] = CpuPopcount16(hostX[i]);
    }

    CHECK_ACL(aclrtMalloc(&devX, inputBytes, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc(&devY, outputBytes, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMemcpy(devX, inputBytes, hostX.data(), inputBytes, ACL_MEMCPY_HOST_TO_DEVICE));
    // Zero the output buffer before the operator writes to it.
    CHECK_ACL(aclrtMemset(devY, outputBytes, 0, outputBytes));

    xTensor = aclCreateTensor(shape, ndim, dtype, strides, 0,
                              ACL_FORMAT_ND, shape, ndim, devX);
    yTensor = aclCreateTensor(shape, ndim, ACL_UINT8, strides, 0,
                              ACL_FORMAT_ND, shape, ndim, devY);
    if (xTensor == nullptr || yTensor == nullptr) {
        std::cerr << "[" << caseName << "] aclCreateTensor failed" << std::endl;
        goto cleanup;
    }

    {
        // Phase 1: query the workspace size and obtain the executor.
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor = nullptr;
        CHECK_ACL(aclnnPopulationCountGetWorkspaceSize(
            xTensor, yTensor, &workspaceSize, &executor));
        if (workspaceSize > 0) {
            CHECK_ACL(aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
        }

        // Phase 2: launch on the stream and wait before reading the output back.
        CHECK_ACL(aclnnPopulationCount(workspace, workspaceSize, executor, stream));
        CHECK_ACL(aclrtSynchronizeStream(stream));

        std::vector<uint8_t> hostY(elemTotal, 0);
        CHECK_ACL(aclrtMemcpy(hostY.data(), outputBytes, devY, outputBytes,
                              ACL_MEMCPY_DEVICE_TO_HOST));

        std::cout << "PopulationCount Example [" << caseName << "] (shape: [" << elemCount
                  << "], y: uint8)" << std::endl;
        std::cout << "  y[i] = popcount(x[i])  (16-bit)" << std::endl;
        std::cout << "-----------------------------------------------------------------" << std::endl;
        std::printf("  %4s | %11s | %7s | %6s | %6s | %s\n",
                    "Idx", "x(u16)", "x(hex)", "NPU", "CPU", "Status");
        std::cout << "-----------------------------------------------------------------" << std::endl;

        int64_t passCount = 0;
        for (size_t i = 0; i < elemTotal; ++i) {
            const bool pass = (hostY[i] == expected[i]);
            passCount += pass ? 1 : 0;
            std::printf("  %4lld | %11u | 0x%04X | %6u | %6u | %s\n",
                        static_cast<long long>(i),
                        static_cast<unsigned int>(hostX[i]),
                        static_cast<unsigned int>(hostX[i]),
                        static_cast<unsigned int>(hostY[i]),
                        static_cast<unsigned int>(expected[i]),
                        pass ? "PASS" : "FAIL");
        }

        std::cout << "-----------------------------------------------------------------" << std::endl;
        std::cout << "[" << caseName << "] Result: " << passCount << "/" << elemCount << " passed";
        if (passCount == elemCount) {
            std::cout << " -- ALL PASS" << std::endl;
            ret = 0;
        } else {
            std::cout << " -- FAILED" << std::endl;
            ret = 1;
        }
    }

cleanup:
    // Release only what was actually created.
    if (xTensor) aclDestroyTensor(xTensor);
    if (yTensor) aclDestroyTensor(yTensor);
    if (workspace) { aclrtFree(workspace); workspace = nullptr; }
    if (devX) aclrtFree(devX);
    if (devY) aclrtFree(devY);
    return ret;
}
/**
 * @brief Entry point: runs the PopulationCount example for an INT16 and a
 *        UINT16 input set on device 0 and prints an overall verdict.
 *
 * @return 0 when both cases pass; 1 if either case fails or ACL setup fails.
 */
int main()
{
    constexpr int64_t ELEM_COUNT = 16;
    constexpr int32_t DEVICE_ID = 0;

    // Signed test patterns. Values that do not fit in int16_t are written with
    // an explicit cast so the intended 16-bit pattern is what gets stored.
    const int16_t int16Raw[ELEM_COUNT] = {
        0,
        1,
        2,
        3,
        7,
        15,
        255,
        256,
        21845,                        // 0x5555
        static_cast<int16_t>(43690),  // 0xAAAA
        static_cast<int16_t>(0x7FFF),
        static_cast<int16_t>(0x8000),
        -1,
        -2,
        16,
        4660,                         // 0x1234
    };

    // RunOneCase takes raw 16-bit patterns, so reinterpret the signed values.
    uint16_t int16RawU16[ELEM_COUNT];
    for (int64_t i = 0; i < ELEM_COUNT; ++i) {
        int16RawU16[i] = static_cast<uint16_t>(int16Raw[i]);
    }

    const uint16_t uint16Raw[ELEM_COUNT] = {
        0x0000,
        0x0001,
        0x8000,
        0x5555,
        0xAAAA,
        0xFFFF,
        0x7FFF,
        0x0F0F,
        0xF0F0,
        0x00FF,
        0xFF00,
        0x1234,
        0xDEAD,
        0xBEEF,
        0xC3C3,
        0x3C3C,
    };

    // All locals read by the cleanup section are declared before the first
    // CHECK_ACL so that no `goto cleanup` jumps over an initialization.
    int32_t ret = 1;
    aclrtStream stream = nullptr;
    bool aclReady = false;     // set once aclInit has succeeded
    bool deviceBound = false;  // set once aclrtSetDevice has succeeded

    CHECK_ACL(aclInit(nullptr));
    aclReady = true;
    CHECK_ACL(aclrtSetDevice(DEVICE_ID));
    deviceBound = true;
    CHECK_ACL(aclrtCreateStream(&stream));

    {
        const int retInt16 = RunOneCase("INT16", ACL_INT16, int16RawU16, ELEM_COUNT, stream);
        std::cout << std::endl;
        const int retUint16 = RunOneCase("UINT16", ACL_UINT16, uint16Raw, ELEM_COUNT, stream);
        const bool allPassed = (retInt16 == 0 && retUint16 == 0);

        std::cout << "=================================================================" << std::endl;
        std::cout << "Overall: " << (allPassed ? "ALL PASS" : "FAILED")
                  << std::endl;
        std::cout << "=================================================================" << std::endl;
        ret = allPassed ? 0 : 1;
    }

cleanup:
    // Tear down in reverse order, and only the stages that were set up.
    if (stream) aclrtDestroyStream(stream);
    if (deviceBound) aclrtResetDevice(DEVICE_ID);
    if (aclReady) aclFinalize();
    return ret;
}