yuanrong-datasystem/src/datasystem/common/device/device_manager_factory.h-代码预览-yuanrong-datasystem:基于 Sphinx 的文档生成项目 - AtomGit

gcw_sJBnCQsZfix: enable GPU-only(no CANN toolkit) compilation
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Description: Device manager factory with runtime device detection.
 * Both NPU and GPU backends are compiled in (when USE_NPU/USE_GPU defined).
 * The actual backend is selected at runtime by probing /dev/davinci* or /dev/nvidia*.
 */
#ifndef DATASYSTEM_COMMON_DEVICE_DEVICE_MANAGER_FACTORY_H
#define DATASYSTEM_COMMON_DEVICE_DEVICE_MANAGER_FACTORY_H

#include "datasystem/common/device/device_manager_base.h"
#include "datasystem/common/log/log.h"
#if defined(USE_NPU) || defined(USE_GPU)
#include "datasystem/common/util/file_util.h"
#endif
#if defined(USE_NPU) || defined(WITH_TESTS)
#include "datasystem/common/device/ascend/acl_device_manager.h"
#endif
#ifdef USE_GPU
#include "datasystem/common/device/nvidia/cuda_device_manager.h"
#endif

#if defined(USE_NPU) || defined(USE_GPU) || defined(WITH_TESTS)
#include <glob.h>
#include <cstdlib>
#endif

namespace datasystem {

/**
 * @brief Accelerator backend kinds reported by the device probing logic.
 */
enum class DeviceBackend {
    NPU,      // Ascend NPU backend
    GPU,      // NVIDIA GPU backend
    UNKNOWN,  // no supported accelerator was detected
};

/**
 * @brief Device Manager Factory with runtime device detection.
 *
 * When both USE_NPU and USE_GPU are defined (universal wheel build),
 * the factory probes /dev/davinci* and /dev/nvidia* at runtime to
 * decide which backend to use. Only the selected backend's plugin
 * is dlopen'd.
 *
 * Usage (the result may be nullptr when no supported accelerator is detected):
 *   DeviceManagerBase* mgr = DeviceManagerFactory::GetDeviceManager();
 */
class DeviceManagerFactory {
public:
    /**
     * @brief Get the device manager for the current process.
     *
     * The test override (if any) is re-evaluated on every call; otherwise the
     * result of the first detection is cached in a function-local static and
     * reused for all later calls.
     *
     * @return The selected device manager, or nullptr when no backend could be selected.
     */
    static DeviceManagerBase *GetDeviceManager()
    {
        if (auto *overrideMgr = DetectTestOverrideManager(); overrideMgr != nullptr) {
            return overrideMgr;
        }
        static DeviceManagerBase *inst = Detect();
        return inst;
    }

    /**
     * @brief Report which backend is in effect, honouring the test override.
     * @return DeviceBackend::NPU when the Ascend mock is forced in a test build,
     *         otherwise the result of ProbePhysicalBackend().
     */
    static DeviceBackend ProbeBackend()
    {
#ifdef WITH_TESTS
        if (IsTestAscendMockForced()) {
            return DeviceBackend::NPU;
        }
#endif
        return ProbePhysicalBackend();
    }

    /**
     * @brief Report the backend found by probing the host; the probe runs once and is cached.
     * @return The detected backend, or DeviceBackend::UNKNOWN when nothing was found.
     */
    static DeviceBackend ProbePhysicalBackend()
    {
        static DeviceBackend backend = DetectPhysicalBackend();
        return backend;
    }

private:
    // Pure static utility: instances are never created.
    DeviceManagerFactory() = delete;
    ~DeviceManagerFactory() = delete;

    /**
     * @brief Return the Ascend device manager when a test build forces it via the environment.
     * @return The ACL device manager if forced (WITH_TESTS only), otherwise nullptr.
     */
    static DeviceManagerBase *DetectTestOverrideManager()
    {
#ifdef WITH_TESTS
        if (IsTestAscendMockForced()) {
            return acl::AclDeviceManager::Instance();
        }
#endif
        return nullptr;
    }

    /**
     * @brief Probe for NPU and GPU devices and pick a backend.
     *
     * Both probes always run so that the "both present" situation can be
     * logged; NPU wins when both kinds of device are found.
     */
    static DeviceBackend DetectPhysicalBackend()
    {
        const bool hasNpu = HasNpuDevice();
        const bool hasGpu = HasGpuDevice();
        if (hasNpu && hasGpu) {
            LOG(WARNING) << "Both NPU (/dev/davinci*) and GPU (/dev/nvidia*) devices detected. "
                            "By policy, NPU backend is preferred.";
            return DeviceBackend::NPU;
        }
        if (hasNpu) {
            return DeviceBackend::NPU;
        }
        if (hasGpu) {
            return DeviceBackend::GPU;
        }
        return DeviceBackend::UNKNOWN;
    }

    /**
     * @brief Check whether an Ascend device node exists (always false without USE_NPU).
     */
    static bool HasNpuDevice()
    {
#ifdef USE_NPU
        // "[0-9]*" requires a digit right after the prefix and then accepts any suffix,
        // so davinci0, davinci7, davinci12, ... all match. The former "[0-16]" was a
        // character class containing only '0', '1' and '6' (not the numeric range 0..16),
        // which missed nodes such as /dev/davinci2 when lower-numbered nodes are absent.
        // NOTE(review): assumes Glob() follows POSIX glob(7) bracket semantics - confirm.
        return HasDevNode("/dev/davinci[0-9]*");
#else
        return false;
#endif
    }

    /**
     * @brief Check whether an NVIDIA GPU is visible (always false without USE_GPU).
     *
     * A device node match is tried first; nvidia-smi is only invoked when no
     * node matches.
     */
    static bool HasGpuDevice()
    {
#ifdef USE_GPU
        return HasDevNode("/dev/nvidia[0-9]*") || HasNvidiaSmi();
#else
        return false;
#endif
    }

    /**
     * @brief Check whether at least one path matches the given glob pattern.
     * @param[in] pattern Glob pattern passed to Glob().
     * @return true if Glob() succeeds and yields at least one path; false on
     *         error (logged as a warning), on no match, or when neither
     *         USE_NPU nor USE_GPU is defined.
     */
    static bool HasDevNode(const char *pattern)
    {
#if defined(USE_NPU) || defined(USE_GPU)
        std::vector<std::string> paths;
        Status rc = Glob(pattern, paths);
        if (rc.IsError()) {
            LOG(WARNING) << "Glob failed for pattern " << pattern << ", rc: " << rc.ToString();
            return false;
        }
        return !paths.empty();
#else
        (void)pattern;
        return false;
#endif
    }

    /**
     * @brief Check whether `nvidia-smi -L` runs successfully (always false without USE_GPU).
     * @return true only when system() reports a zero status for the command.
     */
    static bool HasNvidiaSmi()
    {
#ifdef USE_GPU
        // Check if nvidia-smi command is available and can detect GPUs
        // This works in both native Linux and WSL2 environments
        // NOTE(review): the command is resolved through the shell's PATH by system().
        int ret = system("nvidia-smi -L > /dev/null 2>&1");
        return (ret == 0);
#else
        return false;
#endif
    }

    /**
     * @brief Check whether DS_TEST_FORCE_ASCEND_DEVICE_MANAGER requests the Ascend manager.
     * @return true in WITH_TESTS builds when the variable is set, non-empty and
     *         does not start with '0'; false otherwise.
     */
    static bool IsTestAscendMockForced()
    {
#ifdef WITH_TESTS
        const char *forceAscendMock = std::getenv("DS_TEST_FORCE_ASCEND_DEVICE_MANAGER");
        return forceAscendMock != nullptr && forceAscendMock[0] != '\0' && forceAscendMock[0] != '0';
#else
        return false;
#endif
    }

    /**
     * @brief Log which probes were attempted when no accelerator was found.
     *
     * The message is assembled at compile time from adjacent string literals,
     * so only the probes that are compiled in are listed.
     */
    static void LogNoAcceleratorDetected()
    {
        LOG(INFO) << "No accelerator device detected. "
                     "Checked:"
#ifdef USE_NPU
                     " /dev/davinci[0-9]*"
#endif
#ifdef USE_GPU
                     " /dev/nvidia[0-9]* and nvidia-smi"
#endif
                     ". Ensure the device driver is installed and "
                     "the device node exists.";
    }

    /**
     * @brief Map a detected backend to its device manager singleton.
     * @param[in] backend Backend reported by the probe.
     * @return The matching manager, or nullptr when the backend is UNKNOWN or
     *         support for it was not compiled in.
     */
    static DeviceManagerBase *SelectManager(DeviceBackend backend)
    {
        switch (backend) {
            case DeviceBackend::NPU:
#if defined(USE_NPU) || defined(WITH_TESTS)
                LOG(INFO) << "Detected NPU device (/dev/davinci*), using Ascend ACL backend.";
                return acl::AclDeviceManager::Instance();
#endif
                break;
            case DeviceBackend::GPU:
#ifdef USE_GPU
                LOG(INFO) << "Detected GPU device (/dev/nvidia*), using CUDA backend.";
                return cuda::CudaDeviceManager::Instance();
#endif
                break;
            case DeviceBackend::UNKNOWN:
            default:
                LogNoAcceleratorDetected();
                break;
        }
        return nullptr;
    }

    /**
     * @brief One-shot detection used to initialise the cached manager.
     *
     * Order: test override first; then, in test builds without BUILD_HETERO,
     * an unconditional fallback to the Ascend manager; otherwise the manager
     * for the physically probed backend.
     */
    static DeviceManagerBase *Detect()
    {
        if (auto *overrideMgr = DetectTestOverrideManager(); overrideMgr != nullptr) {
            return overrideMgr;
        }
#if defined(WITH_TESTS) && !defined(BUILD_HETERO)
        LOG(INFO) << "BUILD_HETERO is OFF in test build, fallback to Ascend device manager for mock-based tests.";
        return acl::AclDeviceManager::Instance();
#else
        return SelectManager(ProbePhysicalBackend());
#endif
    }
};

}  // namespace datasystem
#endif  // DATASYSTEM_COMMON_DEVICE_DEVICE_MANAGER_FACTORY_H