# Commit 05360171, created on 2022-03-18 (commit history)
# BSD 3-Clause License
#
# Copyright (c) 2017 xxxx
# All rights reserved.
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ============================================================================
# My version of nvgpu because nvgpu didn't have all the information I was looking for.
import re
import subprocess
import shutil
import os

# Keys added to every gpu dictionary from the nvidia-smi summary table.
_STAT_KEYS = ('fan_spd', 'temp', 'pwr_used', 'pwr_cap', 'mem_used', 'mem_total', 'util')


def _leading_int(token: str):
    """
    Returns the leading run of decimal digits in `token` as an int, or None when
    the token does not start with a digit (e.g. 'N/A' or 'ERR!').
    """
    digits = re.match(r'[0-9]+', token)
    return int(digits.group()) if digits else None


def _parse_stat_row(line: str):
    """
    Parses one line of the nvidia-smi summary table. Returns a dictionary holding
    the _STAT_KEYS values if the line is a per-gpu stats row, otherwise None.
    """
    # A stats row has exactly three cells, e.g.
    # | 30%   45C    P8    20W / 250W |    500MiB / 11019MiB |      5%      Default |
    cells = [cell.strip() for cell in line.split('|')[1:-1]]
    if len(cells) != 3:
        return None

    pw_tmp_info, mem_info, util_info = cells

    # The "<used>MiB / <total>MiB" cell is what tells a stats row apart from the
    # header rows and the per-gpu name rows, which also have three cells.
    memory = re.fullmatch(r'([0-9]+)MiB\s*/\s*([0-9]+)MiB', mem_info)
    if memory is None:
        return None

    # Whitespace-separated tokens: fan, temp, perf state, power used, '/', power cap.
    tokens = pw_tmp_info.split()
    fan_speed, temperature, pwr_used, pwr_cap = [
        _leading_int(tokens[pos]) if pos < len(tokens) else None
        for pos in (0, 1, 3, 5)
    ]

    # The first token of the last cell is the utilization percentage.
    util_tokens = util_info.split()
    utilization = _leading_int(util_tokens[0]) if util_tokens else None

    return {
        'fan_spd': fan_speed,
        'temp': temperature,
        'pwr_used': pwr_used,
        'pwr_cap': pwr_cap,
        'mem_used': int(memory.group(1)),
        'mem_total': int(memory.group(2)),
        'util': utilization,
    }


def gpu_info() -> list:
    """
    Returns a list with one dictionary of stats mined from nvidia-smi per gpu.
    Adapted from nvgpu: https://pypi.org/project/nvgpu/, but mine has more info.

    Each dictionary has the keys:
        idx, name, uuid          - from `nvidia-smi -L` (idx is an int)
        fan_spd, temp            - fan speed and temperature as printed, unit stripped
        pwr_used, pwr_cap        - power draw and power cap as printed, unit stripped
        mem_used, mem_total      - memory usage in MiB
        util                     - gpu utilization percentage

    A stat that nvidia-smi does not report as a number (e.g. 'N/A' for the fan of
    a passively cooled card) is stored as None. If the summary table has fewer
    stats rows than there are gpus, the remaining gpus get None for every stat.

    Raises whatever _run_cmd raises when nvidia-smi cannot be executed.
    """
    # Lines of `nvidia-smi -L` that are not "GPU <n>: <name> (UUID: <uuid>)" are
    # skipped instead of crashing on a failed match.
    gpu_pattern = re.compile(r'GPU ([0-9]+): ([^(]+) \(UUID: ([^)]+)\)')
    gpu_infos = []
    for line in _run_cmd(['nvidia-smi', '-L']):
        found = gpu_pattern.match(line)
        if found is None:
            continue
        idx, name, uuid = found.groups()
        gpu_infos.append({'idx': int(idx), 'name': name, 'uuid': uuid})

    # Stats rows are located by their shape rather than by a fixed line offset,
    # so the number of header lines and of lines per gpu does not matter.
    parsed_rows = (_parse_stat_row(line) for line in _run_cmd(['nvidia-smi']))
    stat_rows = [row for row in parsed_rows if row is not None]

    # The n-th stats row is taken to belong to the n-th gpu listed by `-L`.
    for position, info in enumerate(gpu_infos):
        if position < len(stat_rows):
            info.update(stat_rows[position])
        else:
            info.update(dict.fromkeys(_STAT_KEYS))

    return gpu_infos

def nvsmi_available() -> bool:
    """ Tells whether an nvidia-smi executable can be located through this system's PATH. """
    smi_path = shutil.which('nvidia-smi')
    found = smi_path is not None
    return found


def visible_gpus() -> list:
    """
    Returns a list of the indexes of all the gpus visible to pytorch.

    If CUDA_VISIBLE_DEVICES is not set, every gpu reported by gpu_info() is listed.
    Otherwise the variable is read as a comma-separated list of integer indexes.
    Blank entries are skipped, so an empty value (used to hide every gpu) gives an
    empty list and a trailing comma is harmless.

    Raises ValueError if a non-blank entry is not an integer (e.g. a gpu UUID).
    """
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is None:
        return list(range(len(gpu_info())))

    # int() tolerates surrounding whitespace, so only blank entries need filtering.
    return [int(entry) for entry in visible.split(',') if entry.strip()]




def _run_cmd(cmd:list) -> list:
    """
    Executes `cmd` and hands back its standard output as a list of lines.

    The output is decoded as UTF-8 and split on '\\n', so output that ends with a
    newline yields a trailing empty string.
    """
    raw_bytes = subprocess.check_output(cmd)
    return raw_bytes.decode('UTF-8').split('\n')