TVM VTA (TSIM) ソースコード解析
Verilog HDL/Chisel のコードを
Verilator(+DPI) で繋いで、
Pythonからどのように使っているのか?
Created date:2020.03.15

@Vengineer



TVM TSIM
vta-hw/hardware/dpi/tsim_device.cc
module TestAccel(
input clock,
input reset,
input sim_clock,
output sim_wait
);
clock
reset
dpi/module.cc
SimDPI
MemDPI
HostDPI
VTADeviceRun
tsim/tsim_driver.cc
CommandQueue::
Synchronize
SystemVerilog
de10nano/de10nano_driver.cc
pynq/pynq_driver.cc
tsim/tsim_driver.cc
sim/sim_driver.cc
VTASimDPI
VTAMemDPI
VTAHostDPI
Verilator DPI
Accel
TVM TSIM
VTASimDPI
VTAMemDPI
VTAHostDPI
vta-hw/hardware/chisel/src/main/scala/test/Test.scala
VTAShell
dpi/module.cc
SimDPI
MemDPI
HostDPI
VTADeviceRun
tsim/tsim_driver.cc
CommandQueue::
Synchronize
Verilator DPI
Chisel
de10nano/de10nano_driver.cc
pynq/pynq_driver.cc
tsim/tsim_driver.cc
sim/sim_driver.cc
SimShell Test
TVM TSIM
VTADeviceRun
int VTADeviceRun(VTADeviceHandle handle,
vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
return static_cast<vta::tsim::Device*>(handle)->Run(
insn_phy_addr,
insn_count,
wait_cycles);
}
TVM TSIM
VTADevicevta::tsim::Device::Run
int Run(vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
this->Init();
this->Launch(insn_phy_addr,
insn_count,
wait_cycles);
this->WaitForCompletion(wait_cycles);
return 0;
}
TVM TSIM
VTADevicevta::tsim::Device::Init
void Init() {
dpi_ = loader_->Get();
dpi_->SimResume();
}
Device() {
loader_ = DPILoader::Global();
prof_ = Profiler::Global();
}
class DPILoader {
DPIModuleNode* Get() {
return static_cast<DPIModuleNode*>(mod_.operator->());
}
void Init(Module module) {
mod_ = module;
dpi_ = this->Get();
dpi_->SimLaunch();
dpi_->SimWait();
}
TVM_REGISTER_GLOBAL("vta.tsim.init")
.set_body([](TVMArgs args, TVMRetValue* rv) {
Module m = args[0];
DPILoader::Global()->Init(m);
});
TVM TSIM
VTADevice::vta::tsim::Device::Launch
void Launch(vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
dpi_->WriteReg(0x08, insn_count);
dpi_->WriteReg(0x0c, insn_phy_addr);
dpi_->WriteReg(0x10, 0);
dpi_->WriteReg(0x14, 0);
dpi_->WriteReg(0x18, 0);
dpi_->WriteReg(0x1c, 0);
dpi_->WriteReg(0x20, 0);
// start
dpi_->WriteReg(0x00, 0x1);
}
TVM TSIM
VTADevicevta::tsim::Device::WaitForCompletion
void WaitForCompletion (uint32_t wait_cycles) {
uint32_t i, val;
for (i = 0; i < wait_cycles; i++) {
val = dpi_->ReadReg(0x00);
val &= 0x2;
if (val == 0x2) break; // finish
}
prof_->Update(0, dpi_->ReadReg(0x04));
dpi_->SimWait();
}
テストコード
vta-hw/apps/tsim_example/tests/python/verilog_accel.py
vta-hw/apps/tsim_example/tests/python/chisel_accel.py
TVM TSIM : apps/python
vta-hw/apps/tsim_example/tests/python/verilog_accel.py
if __name__ == "__main__":
tsim.init("verilog") => Verilog HDL モデルの初期化
for i in range(10):
test_accel()
vta-hw/apps/tsim_example/tests/python/chisel_accel.py
if __name__ == "__main__":
tsim.init("chisel") => Chisel モデルの初期化
for i in range(10):
test_accel()
TVM TSIM : apps/python
vta-hw/apps/tsim_example/python/tsim.py
def init(hw_backend):
"""Init hardware and software shared library for accelerator
Parameters
------------
hw_backend : str
Hardware backend can be verilog or chisel
"""
cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
hw_libname = "libhw" + get_ext()
if hw_backend in ("verilog", "chisel"):
hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build",
hw_libname)
load_sw()
m = tvm.runtime.load_module(hw_lib, "vta-tsim")
f = tvm.get_global_func("tvm.vta.tsim.init")
f(m)
TVM TSIM : apps/python (verilog_accel)
vta-hw/apps/tsim_example/tests/python/verilog_accel.py
def test_accel():
rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax)
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module() => 関数の獲得
cycles = f(a, b, c) => 関数の実行
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
np.testing.assert_equal( b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " +
msg)
print("[PASS] " + msg)
TVM TSIM : apps/python (chisel_accel)
vta-hw/apps/tsim_example/tests/python/chisel_accel.py
def test_accel():
rmax = 64
dtype = "uint64"
n = np.random.randint(1, rmax)
c = np.random.randint(0, rmax)
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
f = tsim.load_module() => 関数の獲得
cycles = f(a, b, c) => 関数の実行
msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
np.testing.assert_equal( b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " +
msg)
print("[PASS] " + msg)
TVM TSIM : apps/python (tsim.load_module)
vta-hw/apps/tsim_example/python/tsim.py
def load_module():
"""Return driver function"""
load_sw()
return tvm.get_global_func("tvm.vta.driver")
TVM TSIM : apps/python (tvm.vta.driver)
vta-hw/apps/tsim_example/src/driver.cc
TVM_REGISTER_GLOBAL("tvm.vta.driver")
.set_body([](TVMArgs args, TVMRetValue* rv) {
Device dev_;
DLTensor* A = args[0];
DLTensor* B = args[1];
uint32_t c = static_cast<int>(args[2]);
uint32_t cycles = dev_.Run(c, A, B);
*rv = static_cast<int>(cycles);
});
TVM TSIM : dev_.Run
vta-hw/apps/tsim_example/src/driver.cc
uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
uint32_t cycles;
uint32_t len = a->shape[0];
size_t size = (a->dtype.bits >> 3) * len;
a_ = this->MemAlloc(size);
b_ = this->MemAlloc(size);
this->MemCopyFromHost(a_, a->data, size);
this->Init();
this->Launch(c, len);
cycles = this->WaitForCompletion();
this->MemCopyToHost(b->data, b_, size);
this->MemFree(a_);
this->MemFree(b_);
return cycles;
}
TVM TSIM : Init / Launch
vta-hw/apps/tsim_example/src/driver.cc
void Init() {
dpi_ = loader_->Get();
dpi_->SimResume();
}
void Launch(uint32_t c, uint32_t len) {
dpi_->WriteReg(0x08, c);
dpi_->WriteReg(0x0c, len);
dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
dpi_->WriteReg(0x14, 0);
dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
dpi_->WriteReg(0x1c, 0);
dpi_->WriteReg(0x00, 0x1); // launch
}
TVM TSIM : WaitForCompletion
vta-hw/apps/tsim_example/src/driver.cc
uint32_t WaitForCompletion () {
uint32_t i, val;
for (i = 0; i < wait_cycles_; i++) {
val = dpi_->ReadReg(0x00);
if (val == 2) break; // finish
}
val = dpi_->ReadReg(0x04);
dpi_->SimWait();
return val;
}
シミュレーション制御
シミュレーション制御メソッド
vta-hw/src/dpi/module.cc
void SimLaunch() {
auto frun = [this]() {
(*ftsim_)();
};
tsim_thread_ = std::thread(frun);
}
void SimFinish() {
sim_device_.Exit();
tsim_thread_.join();
}
void SimWait() {
sim_device_.Wait();
}
void SimResume() {
sim_device_.Resume();
}
~DPILoader() {
dpi_->SimResume();
dpi_->SimFinish();
}}
class DPILoader {
void Init(Module module)
{
mod_ = module;
dpi_ = this->Get();
dpi_->SimLaunch();
dpi_->SimWait();
}
シミュレーション制御メソッド
vta-hw/src/tsim/tsim_driver.cc
TVM_REGISTER_GLOBAL("vta.tsim.init")
.set_body([](TVMArgs args, TVMRetValue* rv) {
Module m = args[0];
DPILoader::Global()->Init(m);
});
class DPILoader {
void Init(Module module)
{
mod_ = module;
dpi_ = this->Get();
dpi_->SimLaunch();
dpi_->SimWait();
}
シミュレーション制御メソッド
vta-hw/src/dpi/module.cc
void SimDevice::Wait() {
std::unique_lock<std::mutex> lock(mutex_);
wait_ = true;
}
void SimDevice::Resume() {
std::unique_lock<std::mutex> lock(mutex_);
wait_ = false;
}
void SimDevice::Exit() {
std::unique_lock<std::mutex> lock(mutex_);
exit_ = true;
}
シミュレーション制御メソッド
vta-hw/src/dpi/module.cc
void SimDPI(dpi8_t* wait,
dpi8_t* exit) {
*wait = sim_device_.GetWaitStatus();
*exit = sim_device_.GetExitStatus();
}
bool SimDevice::GetWaitStatus() {
std::unique_lock<std::mutex> lock(mutex_);
return wait_;
}
bool SimDevice::GetExitStatus() {
std::unique_lock<std::mutex> lock(mutex_);
return exit_;
}
SimDPI
MemDPI
Verilator DPI
HostDPI
I am a computer engineer,
not a deep learning craftsman




ありがとうございました。
Thanks
@Vengineer
ソースコード解析職人
Source code analysis craftsman

TVM VTA (TSIM)

  • 1.
    TVM VTA (TSIM)ソースコード解析 Verilog HDL/Chisel のコードを Verilator(+DPI) で繋いで、 Pythonからどのように使っているのか? Created date:2020.03.15
 @Vengineer
 

  • 2.
    TVM TSIM vta-hw/hardware/dpi/tsim_device.cc module TestAccel( input clock, input reset, input sim_clock, output sim_wait ); clock reset dpi/module.cc SimDPI MemDPI HostDPI VTADeviceRun tsim/tsim_driver.cc CommandQueue:: Synchronize SystemVerilog de10nano/de10nano_driver.cc pynq/pynq_driver.cc tsim/tsim_driver.cc sim/sim_driver.cc VTASimDPI VTAMemDPI VTAHostDPI Verilator DPI Accel
  • 3.
  • 4.
    TVM TSIM VTADeviceRun int VTADeviceRun(VTADeviceHandle handle, vta_phy_addr_t insn_phy_addr, uint32_t insn_count, uint32_t wait_cycles) { return static_cast<vta::tsim::Device*>(handle)->Run( insn_phy_addr, insn_count, wait_cycles); }
  • 5.
    TVM TSIM VTADevicevta::tsim::Device::Run int Run(vta_phy_addr_t insn_phy_addr, uint32_t insn_count, uint32_t wait_cycles) { this->Init(); this->Launch(insn_phy_addr, insn_count, wait_cycles); this->WaitForCompletion(wait_cycles); return 0; }
  • 6.
    TVM TSIM VTADevicevta::tsim::Device::Init void Init() { dpi_ = loader_->Get(); dpi_->SimResume(); } Device() { loader_ = DPILoader::Global(); prof_ = Profiler::Global(); } class DPILoader { DPIModuleNode* Get() { return static_cast<DPIModuleNode*>(mod_.operator->()); } void Init(Module module) { mod_ = module; dpi_ = this->Get(); dpi_->SimLaunch(); dpi_->SimWait(); } TVM_REGISTER_GLOBAL("vta.tsim.init") .set_body([](TVMArgs args, TVMRetValue* rv) { Module m = args[0]; DPILoader::Global()->Init(m); });
  • 7.
    TVM TSIM VTADevice::vta::tsim::Device::Launch void Launch(vta_phy_addr_t insn_phy_addr, uint32_t insn_count, uint32_t wait_cycles) { dpi_->WriteReg(0x08, insn_count); dpi_->WriteReg(0x0c, insn_phy_addr); dpi_->WriteReg(0x10, 0); dpi_->WriteReg(0x14, 0); dpi_->WriteReg(0x18, 0); dpi_->WriteReg(0x1c, 0); dpi_->WriteReg(0x20, 0); // start dpi_->WriteReg(0x00, 0x1); }
  • 8.
    TVM TSIM VTADevicevta::tsim::Device::WaitForCompletion void WaitForCompletion(uint32_t wait_cycles) { uint32_t i, val; for (i = 0; i < wait_cycles; i++) { val = dpi_->ReadReg(0x00); val &= 0x2; if (val == 0x2) break; // finish } prof_->Update(0, dpi_->ReadReg(0x04)); dpi_->SimWait(); }
  • 9.
  • 10.
    TVM TSIM :apps/python vta-hw/apps/tsim_example/tests/python/verilog_accel.py if __name__ == "__main__": tsim.init("verilog") => Verilog HDL モデルの初期化 for i in range(10): test_accel() vta-hw/apps/tsim_example/tests/python/chisel_accel.py if __name__ == "__main__": tsim.init("chisel") => Chisel モデルの初期化 for i in range(10): test_accel()
  • 11.
    TVM TSIM :apps/python vta-hw/apps/tsim_example/python/tsim.py def init(hw_backend): """Init hardware and software shared library for accelerator Parameters ------------ hw_backend : str Hardware backend can be verilog or chisel """ cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__))) hw_libname = "libhw" + get_ext() if hw_backend in ("verilog", "chisel"): hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname) load_sw() m = tvm.runtime.load_module(hw_lib, "vta-tsim") f = tvm.get_global_func("tvm.vta.tsim.init") f(m)
  • 12.
    TVM TSIM :apps/python (verilog_accel) vta-hw/apps/tsim_example/tests/python/verilog_accel.py def test_accel(): rmax = 64 dtype = "uint64" n = np.random.randint(1, rmax) c = np.random.randint(0, rmax) ctx = tvm.cpu(0) a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx) b = tvm.nd.array(np.zeros(n).astype(dtype), ctx) f = tsim.load_module() => 関数の獲得 cycles = f(a, b, c) => 関数の実行 msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c) np.testing.assert_equal( b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg) print("[PASS] " + msg)
  • 13.
    TVM TSIM :apps/python (chisel_accel) vta-hw/apps/tsim_example/tests/python/chisel_accel.py def test_accel(): rmax = 64 dtype = "uint64" n = np.random.randint(1, rmax) c = np.random.randint(0, rmax) ctx = tvm.cpu(0) a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx) b = tvm.nd.array(np.zeros(n).astype(dtype), ctx) f = tsim.load_module() => 関数の獲得 cycles = f(a, b, c) => 関数の実行 msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c) np.testing.assert_equal( b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg) print("[PASS] " + msg)
  • 14.
    TVM TSIM :apps/python (tsim.load_module) vta-hw/apps/tsim_example/python/tsim.py def load_module(): """Return driver function""" load_sw() return tvm.get_global_func("tvm.vta.driver")
  • 15.
    TVM TSIM :apps/python (tvm.vta.driver) vta-hw/apps/tsim_example/src/driver.cc TVM_REGISTER_GLOBAL("tvm.vta.driver") .set_body([](TVMArgs args, TVMRetValue* rv) { Device dev_; DLTensor* A = args[0]; DLTensor* B = args[1]; uint32_t c = static_cast<int>(args[2]); uint32_t cycles = dev_.Run(c, A, B); *rv = static_cast<int>(cycles); });
  • 16.
    TVM TSIM :dev_.Run vta-hw/apps/tsim_example/src/driver.cc uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) { uint32_t cycles; uint32_t len = a->shape[0]; size_t size = (a->dtype.bits >> 3) * len; a_ = this->MemAlloc(size); b_ = this->MemAlloc(size); this->MemCopyFromHost(a_, a->data, size); this->Init(); this->Launch(c, len); cycles = this->WaitForCompletion(); this->MemCopyToHost(b->data, b_, size); this->MemFree(a_); this->MemFree(b_); return cycles; }
  • 17.
    TVM TSIM :Init / Launch vta-hw/apps/tsim_example/src/driver.cc void Init() { dpi_ = loader_->Get(); dpi_->SimResume(); } void Launch(uint32_t c, uint32_t len) { dpi_->WriteReg(0x08, c); dpi_->WriteReg(0x0c, len); dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_)); dpi_->WriteReg(0x14, 0); dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_)); dpi_->WriteReg(0x1c, 0); dpi_->WriteReg(0x00, 0x1); // launch }
  • 18.
    TVM TSIM :WaitForCompletion vta-hw/apps/tsim_example/src/driver.cc uint32_t WaitForCompletion () { uint32_t i, val; for (i = 0; i < wait_cycles_; i++) { val = dpi_->ReadReg(0x00); if (val == 2) break; // finish } val = dpi_->ReadReg(0x04); dpi_->SimWait(); return val; }
  • 19.
  • 20.
    シミュレーション制御メソッド vta-hw/src/dpi/module.cc void SimLaunch() { auto frun = [this]() { (*ftsim_)(); }; tsim_thread_ = std::thread(frun); } void SimFinish() { sim_device_.Exit(); tsim_thread_.join(); } void SimWait() { sim_device_.Wait(); } void SimResume() { sim_device_.Resume(); } ~DPILoader() { dpi_->SimResume(); dpi_->SimFinish(); }} class DPILoader { void Init(Module module) { mod_ = module; dpi_ = this->Get(); dpi_->SimLaunch(); dpi_->SimWait(); }
  • 21.
    シミュレーション制御メソッド vta-hw/src/tsim/tsim_driver.cc TVM_REGISTER_GLOBAL("vta.tsim.init") .set_body([](TVMArgs args, TVMRetValue* rv) { Module m = args[0]; DPILoader::Global()->Init(m); }); class DPILoader { void Init(Module module) { mod_ = module; dpi_ = this->Get(); dpi_->SimLaunch(); dpi_->SimWait(); }
  • 22.
    シミュレーション制御メソッド vta-hw/src/dpi/module.cc void SimDevice::Wait() { std::unique_lock<std::mutex> lock(mutex_); wait_ = true; } void SimDevice::Resume() { std::unique_lock<std::mutex> lock(mutex_); wait_ = false; } void SimDevice::Exit() { std::unique_lock<std::mutex> lock(mutex_); exit_ = true; }
  • 23.
    シミュレーション制御メソッド vta-hw/src/dpi/module.cc void SimDPI(dpi8_t* wait, dpi8_t* exit) { *wait = sim_device_.GetWaitStatus(); *exit = sim_device_.GetExitStatus(); } bool SimDevice::GetWaitStatus() { std::unique_lock<std::mutex> lock(mutex_); return wait_; } bool SimDevice::GetExitStatus() { std::unique_lock<std::mutex> lock(mutex_); return exit_; } SimDPI MemDPI Verilator DPI HostDPI
  • 24.
    I am a computer engineer, not a deep learning craftsman
 
 ありがとうございました。 Thanks @Vengineer ソースコード解析職人 Source code analysis craftsman