* NPU Memory Model for CPU Simulation
*
* Provides UB, L1, L0A, L0B, L0C memory buffers sized per NPU architecture.
* TASSIGN maps tiles to offsets within these buffers based on TileType.
*
* Each thread gets its own independent NPUMemoryModel instance via
* thread_local storage, accurately modeling the hardware where each
* AICore has physically separate UB/L0 memory.
*
* Memory mapping:
* - Vec tiles → UB (Unified Buffer)
* - Mat tiles → L1
* - Left tiles → L0A
* - Right tiles → L0B
* - Acc tiles → L0C
*/
#ifndef PTO_NPU_MEMORY_MODEL_HPP
#define PTO_NPU_MEMORY_MODEL_HPP
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>
namespace pto {

/**
 * Capacity, in bytes, of each simulated memory region for one architecture.
 * Member order matters: the constants below use positional aggregate
 * initialization.
 */
struct ArchMemorySizes {
    std::size_t ubSize;  // UB (Unified Buffer)
    std::size_t l1Size;  // L1
    std::size_t l0aSize; // L0A
    std::size_t l0bSize; // L0B
    std::size_t l0cSize; // L0C
};

/** Region capacities selected by NPUArch::A2A3. */
inline constexpr ArchMemorySizes kA2A3MemorySizes = {
    192 * 1024, // ubSize
    512 * 1024, // l1Size
    64 * 1024,  // l0aSize
    64 * 1024,  // l0bSize
    128 * 1024  // l0cSize
};

/** Region capacities selected by NPUArch::A5. */
inline constexpr ArchMemorySizes kA5MemorySizes = {
    256 * 1024, // ubSize
    512 * 1024, // l1Size
    64 * 1024,  // l0aSize
    64 * 1024,  // l0bSize
    256 * 1024  // l0cSize
};

/** Architectures for which a set of region capacities is defined. */
enum class NPUArch
{
    A2A3,
    A5
};

/** Identifies one of the five simulated buffers owned by NPUMemoryModel. */
enum class MemoryRegion
{
    UB,
    L1,
    L0A,
    L0B,
    L0C
};

/**
 * Per-thread set of byte buffers standing in for the UB, L1, L0A, L0B and L0C
 * memories.
 *
 * Instances are only reachable through Instance(); the buffers are allocated
 * lazily on first access using the process-wide default architecture, or
 * explicitly through Initialize().
 */
class NPUMemoryModel {
public:
    // The constructor is private, but the implicitly generated copy operations
    // would still be public. Deleting them prevents `auto m = Instance();`
    // from silently duplicating every buffer and then operating on a detached
    // copy instead of the thread's real instance.
    NPUMemoryModel(const NPUMemoryModel &) = delete;
    NPUMemoryModel &operator=(const NPUMemoryModel &) = delete;

    /**
     * Returns the calling thread's instance (function-local thread_local, so
     * every thread observes a distinct object).
     */
    static NPUMemoryModel &Instance()
    {
        thread_local NPUMemoryModel instance;
        return instance;
    }

    /**
     * Sets the architecture used by lazy initialization.
     *
     * Only instances that have not been initialized yet are affected. The
     * value is a plain static shared by all threads and is not synchronized
     * here, so callers must not race this with lazy initialization on another
     * thread.
     */
    static void SetDefaultArch(NPUArch arch)
    {
        defaultArch_ = arch;
    }

    /**
     * Selects the capacities for @p arch and sizes every buffer accordingly.
     *
     * Buffers are resized, not reassigned: when the instance is already
     * initialized, bytes that fit in the new size keep their current values
     * and only newly added bytes are zero. Call Clear() (or Reset() first)
     * when a fully zeroed state is required.
     */
    void Initialize(NPUArch arch)
    {
        switch (arch) {
            case NPUArch::A2A3:
                sizes_ = kA2A3MemorySizes;
                break;
            case NPUArch::A5:
                sizes_ = kA5MemorySizes;
                break;
        }
        ubBuffer_.resize(sizes_.ubSize, 0);
        l1Buffer_.resize(sizes_.l1Size, 0);
        l0aBuffer_.resize(sizes_.l0aSize, 0);
        l0bBuffer_.resize(sizes_.l0bSize, 0);
        l0cBuffer_.resize(sizes_.l0cSize, 0);
        arch_ = arch;
        initialized_ = true;
    }

    /** Initializes with the default architecture if nothing has done so yet. */
    void EnsureInitialized()
    {
        if (!initialized_) {
            Initialize(defaultArch_);
        }
    }

    /**
     * Returns a typed pointer @p byteOffset bytes into @p region.
     *
     * @tparam T          Pointee type of the returned pointer.
     * @param region      Buffer to address.
     * @param byteOffset  Offset from the start of the buffer, in bytes. May be
     *                    at most the region's capacity (an offset equal to the
     *                    capacity yields a one-past-the-end pointer).
     * @return            Pointer into the region's buffer.
     * @throws std::out_of_range if @p byteOffset exceeds the region's capacity.
     *
     * Only the starting offset is validated; this function does not know how
     * many bytes the caller will access, nor does it check alignment for T.
     * Triggers lazy initialization.
     */
    template <typename T>
    T *GetPointer(MemoryRegion region, std::size_t byteOffset)
    {
        EnsureInitialized();
        char *base = nullptr;
        std::size_t regionSize = 0;
        switch (region) {
            case MemoryRegion::UB:
                base = ubBuffer_.data();
                regionSize = sizes_.ubSize;
                break;
            case MemoryRegion::L1:
                base = l1Buffer_.data();
                regionSize = sizes_.l1Size;
                break;
            case MemoryRegion::L0A:
                base = l0aBuffer_.data();
                regionSize = sizes_.l0aSize;
                break;
            case MemoryRegion::L0B:
                base = l0bBuffer_.data();
                regionSize = sizes_.l0bSize;
                break;
            case MemoryRegion::L0C:
                base = l0cBuffer_.data();
                regionSize = sizes_.l0cSize;
                break;
        }
        // Forming a pointer beyond one-past-the-end is undefined behaviour and
        // would let a bad tile offset silently corrupt unrelated heap memory.
        if (byteOffset > regionSize) {
            throw std::out_of_range("NPUMemoryModel::GetPointer: byteOffset exceeds region size");
        }
        return reinterpret_cast<T *>(base + byteOffset);
    }

    /** Start of the UB buffer; triggers lazy initialization. */
    char *GetUBBase()
    {
        EnsureInitialized();
        return ubBuffer_.data();
    }

    /** Start of the L1 buffer; triggers lazy initialization. */
    char *GetL1Base()
    {
        EnsureInitialized();
        return l1Buffer_.data();
    }

    /** Start of the L0A buffer; triggers lazy initialization. */
    char *GetL0ABase()
    {
        EnsureInitialized();
        return l0aBuffer_.data();
    }

    /** Start of the L0B buffer; triggers lazy initialization. */
    char *GetL0BBase()
    {
        EnsureInitialized();
        return l0bBuffer_.data();
    }

    /** Start of the L0C buffer; triggers lazy initialization. */
    char *GetL0CBase()
    {
        EnsureInitialized();
        return l0cBuffer_.data();
    }

    /**
     * Capacities chosen by the most recent Initialize(). Does not trigger lazy
     * initialization, so all fields are zero until the instance has been
     * initialized at least once.
     */
    const ArchMemorySizes &GetSizes() const
    {
        return sizes_;
    }

    /**
     * Architecture passed to the most recent Initialize(); NPUArch::A2A3 if
     * Initialize() has never run. Does not trigger lazy initialization.
     */
    NPUArch GetArch() const
    {
        return arch_;
    }

    /** True between a call to Initialize() and the next Reset(). */
    bool IsInitialized() const
    {
        return initialized_;
    }

    /** Zeroes every buffer in place; does nothing when not initialized. */
    void Clear()
    {
        if (initialized_) {
            std::fill(ubBuffer_.begin(), ubBuffer_.end(), 0);
            std::fill(l1Buffer_.begin(), l1Buffer_.end(), 0);
            std::fill(l0aBuffer_.begin(), l0aBuffer_.end(), 0);
            std::fill(l0bBuffer_.begin(), l0bBuffer_.end(), 0);
            std::fill(l0cBuffer_.begin(), l0cBuffer_.end(), 0);
        }
    }

    /**
     * Empties every buffer and marks the instance uninitialized, so the next
     * access re-initializes it. The recorded sizes and architecture are left
     * as they were. Pointers previously obtained from this instance must not
     * be used afterwards.
     */
    void Reset()
    {
        ubBuffer_.clear();
        l1Buffer_.clear();
        l0aBuffer_.clear();
        l0bBuffer_.clear();
        l0cBuffer_.clear();
        initialized_ = false;
    }

private:
    NPUMemoryModel() = default;

    // Architecture used by EnsureInitialized(); shared by all threads.
    static inline NPUArch defaultArch_ = NPUArch::A2A3;

    std::vector<char> ubBuffer_;
    std::vector<char> l1Buffer_;
    std::vector<char> l0aBuffer_;
    std::vector<char> l0bBuffer_;
    std::vector<char> l0cBuffer_;
    ArchMemorySizes sizes_{};
    NPUArch arch_ = NPUArch::A2A3;
    bool initialized_ = false;
};

} // namespace pto
#endif