#include <c10d/reducer_timer.hpp>
#include <c10d/debug.h>
#include "torch_npu/csrc/npu/Event.h"
#include "torch_npu/csrc/core/npu/NPUGuard.h"
namespace c10d {
namespace {
// Number of nanoseconds in one millisecond; used to scale event durations.
constexpr int kMilliSecondToNanosSecond = 1000 * 1000;
// c10d::Timer implementation that timestamps reducer phases with NPU events.
//
// One NPUEvent is kept per Timer::Event kind. record() records the matching
// event on `device`, and measureDifference() reports the elapsed time between
// two of those events in nanoseconds.
class NpuTimer : public c10d::Timer {
public:
    // Binds the timer to the device on which its events are recorded.
    explicit NpuTimer(c10::Device dev) : device(dev) {}

    // Runs the base-class bookkeeping for `event`, then records the matching
    // NPU event while an NPUGuard for `device` is alive.
    void record(Event event) override
    {
        Timer::record(event);
        c10_npu::NPUGuard g(device);
        getEvent(event).record();
    }

    // Returns the time elapsed between `start` and `end` in nanoseconds.
    //
    // Returns c10::nullopt when:
    //   * the debug level is not DebugLevel::Detail,
    //   * either event has not been created,
    //   * elapsed_time() throws a std::exception, or
    //   * the reported duration is negative or NaN.
    c10::optional<int64_t> measureDifference(Event start, Event end) override
    {
        if (debug_level() != DebugLevel::Detail) {
            return c10::nullopt;
        }

        c10_npu::NPUGuard g(device);
        c10_npu::NPUEvent& start_event = getEvent(start);
        c10_npu::NPUEvent& end_event = getEvent(end);

        // Skip events that were never created before touching them.
        // NOTE(review): presumably an event is created on its first record(),
        // so a phase that did not run this iteration lands here -- confirm.
        if (!start_event.isCreated() || !end_event.isCreated()) {
            return c10::nullopt;
        }

        start_event.synchronize();
        end_event.synchronize();

        // Start from the failure sentinel so the value is never indeterminate.
        float milliseconds = -1.0f;
        try {
            milliseconds = start_event.elapsed_time(end_event);
        } catch (const std::exception&) {
            // Best effort: a failed query is reported as "no measurement"
            // rather than propagated to the caller.
            milliseconds = -1.0f;
        }

        // Written as a negated >= so that NaN is rejected along with negative
        // values; converting NaN to an integer would be undefined behaviour.
        if (!(milliseconds >= 0.0f)) {
            return c10::nullopt;
        }

        // Scale in double precision: a float product would round the
        // nanosecond value to 24 significant bits.
        const double nanoseconds = static_cast<double>(milliseconds) * kMilliSecondToNanosSecond;
        return static_cast<int64_t>(nanoseconds);
    }

private:
    c10::Device device;

    // One event per Timer::Event kind.
    // NOTE(review): ACL_EVENT_TIME_LINE presumably enables timestamp capture
    // so elapsed_time() can be queried -- confirm against the ACL docs.
    c10_npu::NPUEvent forward_start = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE);
    c10_npu::NPUEvent backward_compute_start = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE);
    c10_npu::NPUEvent backward_compute_end = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE);
    c10_npu::NPUEvent backward_comm_start = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE);
    c10_npu::NPUEvent backward_comm_end = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE);

    // Maps a Timer::Event kind to the NPU event that stores its timestamp.
    // Any other enumerator value trips an internal assertion.
    c10_npu::NPUEvent& getEvent(Event event)
    {
        switch (event) {
            case Event::kForwardStart:
                return forward_start;
            case Event::kBackwardComputeStart:
                return backward_compute_start;
            case Event::kBackwardComputeEnd:
                return backward_compute_end;
            case Event::kBackwardCommStart:
                return backward_comm_start;
            case Event::kBackwardCommEnd:
                return backward_comm_end;
            default:
                TORCH_INTERNAL_ASSERT(false);
        }
    }
};
// Register NpuTimer in TimerRegistry under the c10::kPrivateUse1 key.
// NOTE(review): presumably this is how the reducer finds a timer for NPU
// devices (NPU being exposed through the PrivateUse1 device type) -- confirm.
C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kPrivateUse1, NpuTimer);
}
}