DAFunctor
Symbolic Translator
from NumPy/PyTorch ND-Array operations to C
Buganini Chiu
PyCon Taiwan 2021
https://github.com/buganini/dafunctor
About me
●
Buganini Chiu, aka Bug
●
Know some C & Python
●
Have done a few deep-learning solution porting tasks for various
platforms
https://github.com/buganini
What? Why? How?
●
Purpose
●
Other approaches
●
DAFunctor
Purpose
●
Porting numpy operations to certain platforms
– Memory limit for embedded systems
●
Intermediate buffers
– Lack of support for C++ and ecosystem (some RTOS; no POSIX API)
– Usage constraints imposed by development guidelines
●
Fragmentation concern
– MISRA C:2004, 20.4 - Dynamic heap memory allocation shall not be used.
– MISRA C++ 2008, 18-4-1 - Dynamic heap memory allocation shall not be used.
– MISRA C:2012, 21.3 The memory allocation and deallocation functions of <stdlib.h> shall not be used
– Interoperability with other modules
●
Maintainability
Other approaches
Given a function like this:
def sample():
matrix = np.array([[4,5],[6,7]])
return matrix * 3 + 5
static PyObject *__pyx_pf_19numpy_sample_cython_sample(CYTHON_UNUSED
PyObject *__pyx_self) {
PyObject *__pyx_v_matrix = NULL;
PyObject *__pyx_r = NULL;
__Pyx_RefNannyDeclarations
PyObject *__pyx_t_1 = NULL;
PyObject *__pyx_t_2 = NULL;
PyObject *__pyx_t_3 = NULL;
PyObject *__pyx_t_4 = NULL;
PyObject *__pyx_t_5 = NULL;
__Pyx_RefNannySetupContext("sample", 0);
/* "numpy_sample_cython.pyx":4
*
* def sample():
* matrix = np.array([[4,5],[6,7]]) # <<<<<<<<<<<<<<
* return matrix * 3 + 5
*
*/
__Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_np); if (unlikely(!__pyx_t_2))
__PYX_ERR(0, 4, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_array); if (unlikely(!
__pyx_t_3)) __PYX_ERR(0, 4, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__pyx_t_2 = PyList_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4,
__pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_INCREF(__pyx_int_4);
__Pyx_GIVEREF(__pyx_int_4);
PyList_SET_ITEM(__pyx_t_2, 0, __pyx_int_4);
__Pyx_INCREF(__pyx_int_5);
__Pyx_GIVEREF(__pyx_int_5);
PyList_SET_ITEM(__pyx_t_2, 1, __pyx_int_5);
__pyx_t_4 = PyList_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 4,
__pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_INCREF(__pyx_int_6);
__Pyx_GIVEREF(__pyx_int_6);
PyList_SET_ITEM(__pyx_t_4, 0, __pyx_int_6);
__Pyx_INCREF(__pyx_int_7);
__Pyx_GIVEREF(__pyx_int_7);
PyList_SET_ITEM(__pyx_t_4, 1, __pyx_int_7);
__pyx_t_5 = PyList_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 4,
__pyx_L1_error)
__Pyx_GOTREF(__pyx_t_5);
__Pyx_GIVEREF(__pyx_t_2);
PyList_SET_ITEM(__pyx_t_5, 0, __pyx_t_2);
__Pyx_GIVEREF(__pyx_t_4);
PyList_SET_ITEM(__pyx_t_5, 1, __pyx_t_4);
__pyx_t_2 = 0;
__pyx_t_4 = 0;
__pyx_t_4 = NULL;
if (CYTHON_UNPACK_METHODS &&
unlikely(PyMethod_Check(__pyx_t_3))) {
__pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
if (likely(__pyx_t_4)) {
PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
__Pyx_INCREF(__pyx_t_4);
__Pyx_INCREF(function);
__Pyx_DECREF_SET(__pyx_t_3, function);
}
}
__pyx_t_1 = (__pyx_t_4) ? __Pyx_PyObject_Call2Args(__pyx_t_3,
__pyx_t_4, __pyx_t_5) : __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5);
__Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
__Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
__pyx_v_matrix = __pyx_t_1;
__pyx_t_1 = 0;
/* "numpy_sample_cython.pyx":5
* def sample():
* matrix = np.array([[4,5],[6,7]])
* return matrix * 3 + 5 # <<<<<<<<<<<<<<
*
* sample()
*/
__Pyx_XDECREF(__pyx_r);
__pyx_t_1 = PyNumber_Multiply(__pyx_v_matrix, __pyx_int_3); if (unlikely(!
__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_3 = __Pyx_PyInt_AddObjC(__pyx_t_1, __pyx_int_5, 5, 0, 0); if
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 5, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_r = __pyx_t_3;
__pyx_t_3 = 0;
goto __pyx_L0;
def sample():
matrix = np.array([[4,5],[6,7]])
return matrix * 3 + 5
Other approaches
Part of Cython output, auto translation
Not for alien systems
Other approaches
NumCpp, manual translation
#include "NumCpp.hpp"
nc::NdArray<int> sample() {
nc::NdArray<int> matrix = { {4, 5}, {6, 7} };
return matrix * 3 + 5;
}
def sample():
matrix = np.array([[4,5],[6,7]])
return matrix * 3 + 5
C++
Memory allocation for every function/operator
Other approaches
NumExpr, just for reference
import numpy as np
import numexpr as ne
def sample():
matrix = np.array([[4,5],[6,7]])
return ne.evaluate("matrix * 3 + 5")
def sample():
matrix = np.array([[4,5],[6,7]])
return matrix * 3 + 5
JIT only, does not generate code
Uses its own DSL for expression
No intermediate buffers
DAFunctor
auto translation
void sample(float array3[4] /* shape=[2, 2] */)
{
const static int d_array_1[] = {4,5,6,7};
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<2;i1+=1)
{
array3[i0*2 + i1] = ((d_array_1[((i0*2)+i1)]*3)+5);
}
}
Exported to memory
def sample():
matrix = np.array([[4,5],[6,7]])
return matrix * 3 + 5
Plotted by DAFunctor
DAFunctor
auto translation
def sample():
A = np.array([[4,5],[6,7]]) * 2
B = np.array([[5,5],[6,6]]) + 3
return A + B
void sample(float array5[4] /* shape=[2, 2] */)
{
const static int d_array_1[] = {4,5,6,7};
const static int d_array_2[] = {5,5,6,6};
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<2;i1+=1)
{
array5[i0*2 + i1] = ((d_array_1[((i0*2)+i1)]*2)+(d_array_2[((i0*2)+i1)]+3));
}
}
Exported to memory
Plotted by DAFunctor
Concepts
● Functor (函子) vs. Function (函數)
– How symbolic translation works
● Generative function (生成) vs. Aggregate function (匯總)
– What kinds of functions are tackled in DAFunctor
Functor vs. Function
Functor (函子)
– Functor() → Function
– Constructs
●
Functions
●
Graphs, NN
– Example
●
functools.partial
●
torch.nn
Function (函數)
– Function() → Value
– Evaluates
– Example
●
Ordinary functions in
imperative programming
languages
Generative function vs. Aggregate function
Generative function (生成函數)
– Create from nothing
●
np.zeros, np.ones, np.arange, np.meshgrid
– 1 : 1
●
np.sqrt, arithmetics
●
np.transpose, np.reshape
– Merge
●
np.concatenate, np.stack
– 1 : N
●
np.repeat
– Slicing
– Most of them have common properties so that
they can be merged together.
Aggregate function (匯總函數)
– N : 1
●
np.max, np.min, np.mean,
np.std
Others
– Complex functions
– Branched logic
Decomposition of generative functions
The common properties
Take #1
– Consider functions like
●
np.ones, np.reshape,
arithmetics
for Didx in OutBuf:
OutBuf[Didx] = F( I(Didx) )
●
Intuitive
Decomposition of generative functions
The common properties
Take #1 cont.
– Consider functions like
●
np.ones, np.reshape,
arithmetics
●
np.stack, np.concatenate
for Didx in OutBuf:
k = find_source_partition()
OutBuf[Didx] = Fk( Ik(Didx) )
●
Intuitive
●
Branch in loop
Decomposition of generative functions
The common properties
Take #2
– Consider functions like
●
np.ones, np.reshape,
arithmetics
●
np.stack, np.concatenate
# stack/concat-ed partitions
for partition in partitions:
for Sidx in partition:
OutBuf[ J(Sidx) ] = F( Sidx )
●
No branch in loop
Decomposition of generative functions
The common properties
Take #2 cont.
– Consider functions like
●
np.ones, np.reshape,
arithmetics
●
np.stack, np.concatenate
●
np.repeat
# stack/concat-ed partitions
for partition in partitions:
for Sidx in partition:
OutBuf[ J(Sidx) ] = F( Sidx )
●
No branch in loop
●
Unable to handle scattering
(one-to-many index mapping)
Decomposition of generative functions
The common properties
Take #3
– Consider functions like
●
np.ones, np.reshape,
arithmetics
●
np.stack, np.concatenate
●
np.repeat
# stack/concat-ed partitions
for partition in partitions:
for Sidx in partition:
Didx = J( Sidx )
v = F( Sidx )
for Kidx in scatters(Didx):
OutBuf[ Kidx ] = v
Decomposition of generative functions
The common properties
●
F : Value expression
– evaluate from index/data to value
●
J : Index expression
– evaluate from sub-functor index to index
for this functor
●
Scatters: Scatter expression
– map one index to multiple indices
●
Partitions : Sub-functors
●
Shape
– Start, num, step (for slicing)
# stack/concat-ed partitions
for partition in partitions:
for Sidx in partition:
Didx = J( Sidx )
v = F( Sidx )
for Kidx in scatters(Didx):
OutBuf[ Kidx ] = v
vexpr - ones, full
def ones(shape):
return NumpyFunctor(
shape,
vexpr = 1,
desc = "ones",
opdesc = f"ones({shape})",
)
def full(shape, fill_value):
return NumpyFunctor(
shape,
vexpr = fill_value,
desc = "full",
opdesc = f"full({shape}, {fill_value})",
)
vexpr - arange
shape = [int(math.ceil((end - start) / step))]
return NumpyFunctor(
shape,
vexpr = ["+",["d0",["*",["i0","d2"]]]], # start + i * step
data = [start, end, step],
desc = "arange",
opdesc = f"arange({start}, {end}, {step})",
)
def f():
a = [[1,2,3,4]]
return np.reshape( np.array(a), (2,2) )
iexpr - reshape
def reshape(cls, a, shape):
offset = ["+",[
["*",
[f"i{i}"] + [a.shape[j] for j in range(i+1,len(a.shape))]
] for i in rangel(a.shape)]]
iexpr = []
for i in rangel(shape):
iexpr.append(
ast_strip([
"//",
[
["%",
[offset]+[
[
"*",
shape[j:]
] for j in range(i+1)
]
],
["*", shape[i+1:]]]]))
Functor: #7 reshape3
reshape((2, 2))_array((1, 4))
shape=((0, 2, 1), (0, 2, 1))
partitions=[[(0, 1, 1), (0, 4, 1)]]
iexpr=[
['//', [['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]]]], 2]]
['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]], 2]]
]
Functor[0]: #6 array6
array((1, 4))
shape=((0, 1, 1), (0, 4, 1))
vexpr=['ref', ['d', ['+', [['*', ['i0', 4]], 'i1']]]]
data=[1, 2, 3, 4]
def f():
a = [[1,2,3,4]]
return np.reshape( np.array(a), (2,2) )
iexpr - reshape
void gen_reshape(float reshape3[4] /* shape=[2, 2] */)
{
const static int d_array_3[] = {1,2,3,4};
for(int i0=0;i0<1;i0+=1)
for(int i1=0;i1<4;i1+=1)
{
// reshape((2, 2))
const int i0_0_1 = ((((i0*4)+i1)%(2*2))/2);
const int i1_0_1 = (((i0*4)+i1)%(2*2)%2);
reshape3[i0_0_1*2 + i1_0_1] = d_array_3[((i0*4)+i1)];
}
}
Functor: #7 reshape3
reshape((2, 2))_array((1, 4))
shape=((0, 2, 1), (0, 2, 1))
partitions=[[(0, 1, 1), (0, 4, 1)]]
iexpr=[
['//', [['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]]]], 2]]
['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]], 2]]
]
Functor[0]: #6 array6
array((1, 4))
shape=((0, 1, 1), (0, 4, 1))
vexpr=['ref', ['d', ['+', [['*', ['i0', 4]], 'i1']]]]
data=[1, 2, 3, 4]
void gen_reshape(float reshape2[6] /* shape=[3, 2] */) {
const static int d_array_2[] = {1,2,3,4,5,6};
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<3;i1+=1) {
// reshape((1, 2, 1, 3, 1))
const int i0_0_1 = ((((i0*3)+i1)%(1*2*1*3*1))/(2*1*3*1));
const int i1_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1))/(1*3*1));
const int i2_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1))/(3*1));
const int i3_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1)%(3*1))/1);
const int i4_0_1 = (((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1)%(3*1)%1);
// reshape((3, 2))
const int i0_0_2 = ((((i0_0_1*2*1*3*1)+(i1_0_1*1*3*1)+(i2_0_1*3*1)+(i3_0_1*1)+i4_0_1)%(3*2))/2);
const int i1_0_2 = (((i0_0_1*2*1*3*1)+(i1_0_1*1*3*1)+(i2_0_1*3*1)+(i3_0_1*1)+i4_0_1)%(3*2)%2);
reshape2[i0_0_2*2 + i1_0_2] = d_array_2[((i0*3)+i1)];
}
}
def f():
a = [[1,2,3],[4,5,6]]
return np.reshape( np.reshape( np.array(a), (1,2,1,3,1) ), (3,2) )
reshape - merged
def f():
return np.transpose(
np.array([[4,5],[6,7]]), (1,0)
) + 3
iexpr + vexpr
Functor: #3 transpose
add
shape=((0, 2, 1), (0, 2, 1))
vexpr=['+', ['v0', 3]]
Functor[0]: #2 array2
transposed_array((2, 2))
shape=((0, 2, 1), (0, 2, 1))
partitions=[[(0, 2, 1), (0, 2, 1)]]
iexpr=[
i1
i0
]
Functor[0]: #1 array1
array((2, 2))
shape=((0, 2, 1), (0, 2, 1))
vexpr=['ref', ['d', ['+', [['*', ['i0', 2]], 'i1']]]]
data=[4, 5, 6, 7]
void gen_transpose(float transpose[4] /* shape=[2, 2] */)
{
const static int d_array_1[] = {4,5,6,7};
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<2;i1+=1)
{
// transpose((1, 0))
const int i0_0_1 = i1;
const int i1_0_1 = i0;
transpose[i0_0_1*2 + i1_0_1] = (d_array_1[((i0*2)+i1)]+3);
}
}
def f():
return np.repeat(
np.array( [[1,2,3],[4,5,6]]),
3, axis=1)
sexpr - repeat
shape = list(a.shape)
shape[axis] *= repeats
iexpr = [f"i{i}" for i in rangel(shape)]
iexpr[axis] = ["*", [f"i{axis}", repeats]]
sexpr = (axis, 0, repeats, 1) # axis, start, num, step
return NumpyFunctor(
shape,
partitions = [[(0,s,1) for s in a.shape]],
iexpr = iexpr,
sexpr = sexpr,
desc = f"repeat_{repeats}",
opdesc = f"repeat({repeats})",
subs = [a]
)
Functor: #2 repeat_axis_1
repeat_3
shape=((0, 2, 1), (0, 9, 1))
partitions=[[(0, 2, 1), (0, 3, 1)]]
sexpr=(1, 0, 3, 1)
iexpr=[
i0
['*', ['i1', 3]]
]
Functor[0]: #1 array1
array((2, 3))
shape=((0, 2, 1), (0, 3, 1))
vexpr=['ref', ['d', ['+', [['*', ['i0', 3]], 'i1']]]]
data=[1, 2, 3, 4, 5, 6]
def f():
return np.repeat(
np.array( [[1,2,3],[4,5,6]]),
3, axis=1)
sexpr - repeat
Functor: #2 repeat_axis_1
repeat_3
shape=((0, 2, 1), (0, 9, 1))
partitions=[[(0, 2, 1), (0, 3, 1)]]
sexpr=(1, 0, 3, 1)
iexpr=[
i0
['*', ['i1', 3]]
]
Functor[0]: #1 array1
array((2, 3))
shape=((0, 2, 1), (0, 3, 1))
vexpr=['ref', ['d', ['+', [['*', ['i0', 3]], 'i1']]]]
data=[1, 2, 3, 4, 5, 6]
void gen_repeat_axis_1(float repeat_axis_1[18] /* shape=[2, 9] */)
{
const static int d_array_1[] = {1,2,3,4,5,6};
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<3;i1+=1)
{
// repeat(3)
const int i0_0_1 = i0;
const int i1_0_1 = (i1*3);
const float v0 = d_array_1[((i0*3)+i1)];
const int i0_1_1 = i0_0_1;
for(int i1_1_1=0+i1_0_1;i1_1_1<3+i1_0_1;i1_1_1+=1)
{
repeat_axis_1[i0_1_1*9 + i1_1_1] = v0;
}
}
}
Python Black Magic
●
Shadow object: data view wrapper
●
__assign__: let the object know its name
– To maintain readability
●
@jit decorator: wraps all the steps that translate a Python
function into a native function
Shadow object: data view wrapper
●
Shapes (1,12), (3,4), (6,2)
have no difference in
memory
●
Share everything except
for shape
– Once-only export
●
Override magic methods
– __getattr__
– __setattr__
class Reshaper(Functor):
def __init__(self, functor, shape):
self.functor = functor
if type(self.functor) is Reshaper:
self.functor = self.functor.functor
self.shape = Shape(shape)
def __getattr__(self, name):
if name in ("functor", "shape"):
return object.__getattr__(self, name)
else:
return getattr(self.functor, name)
def __setattr__(self, name, value):
if name in ("functor", "shape"):
object.__setattr__(self, name, value)
else:
setattr(self.functor, name, value)
__assign__: let the object know its name
Long story
def f(assignee):
a = assignee
b, c = assignee, assignee
arr = [assignee, assignee]
d, e = arr
f, (g, h) = assignee, (assignee, assignee)
i = [0,0]
i[1] = assignee
j = [0,0]
x = 1
j[x] = assignee
k = [0,0]
x = [1]
z = 0
k[x[z]] = assignee
https://github.com/buganini/DAFunctor/blob/master/dafunctor/assign.py
__assign__: let the object know its name
First thought
import inspect
def f():
a = set()
print("locals():", locals())
print("This frame:", inspect.currentframe().f_locals)
print("Outer frame:", inspect.currentframe().f_back.f_locals)
def f2():
b = set()
f()
f2()
●
Symbol table
– NumExpr, SymPy
●
Cannot be applied from the
outside of functions
Output:
locals(): {'a': set()}
This frame: {'a': set()}
Outer frame: {'b': set()}
__assign__: let the object know its name
Short story
varname = obj
varname = obj
if hasattr(obj, "__assign__"):
obj.__assign__("varname", None)
array[3] = obj
array[3] = obj
if hasattr(obj, "__assign__"):
obj.__assign__("array", 3)
__assign__: let the object know its name
1. Get the source code
2. Source code → AST (Abstract Syntax Tree)
3. Patch AST
4. Compile patched AST to bytecode
5. Execute bytecode
__assign__: let the object know its name
Get the source code
import inspect
def f():
a = set()
print(inspect.getsource(f))
def f():
a = set()
__assign__: let the object know its name
Source → AST (Abstract Syntax Tree)
import inspect
import ast
def f():
a = set()
src = inspect.getsource(f)
node = ast.parse(src)
# print(ast.dump(node, indent=4)) # Python 3.9+
print(ast.dump(node))
Module(
body=[
FunctionDef(
name='f',
args=arguments(
posonlyargs=[],
args=[],
kwonlyargs=[],
kw_defaults=[],
defaults=[]),
body=[
Assign(
targets=[
Name(id='a', ctx=Store())],
value=Call(
func=Name(id='set', ctx=Load()),
args=[],
keywords=[]))],
decorator_list=[])],
type_ignores=[])
__assign__: let the object know its name
Patch AST
class AssignTransformer(ast.NodeTransformer):
def visit_Assign(self, node):
...
def visit_FunctionDef(self, func):
…
trans = AssignTransformer()
new_node = trans.visit(node)
# print(ast.unparse(new_node)) # Python 3.9+
ast.fix_missing_locations(new_node)
print(ast.dump(new_node))
__assign__: let the object know its name
Patch AST
def f():
a = set()
Module(
body=[
FunctionDef(
name='f',
args=arguments(
posonlyargs=[],
args=[],
kwonlyargs=[],
kw_defaults=[],
defaults=[]),
body=[
Assign(
targets=[
Name(id='a', ctx=Store())],
value=Call(
func=Name(id='set', ctx=Load()),
args=[],
keywords=[]))],
decorator_list=[])],
type_ignores=[])
Assign(
targets=[
Name(id='a', ctx=Store())],
value=Call(
func=Name(id='set', ctx=Load()),
args=[],
keywords=[])),
If(
test=Call(
func=Name(id='hasattr', ctx=Load()),
args=[
Name(id='a', ctx=Load()),
Constant(value='__assign__')],
keywords=[]),
body=[
Expr(
value=Call(
func=Attribute(
value=Name(id='a', ctx=Load()),
attr='__assign__',
ctx=Load()),
args=[
Constant(value='a'),
Constant(value=None)],
keywords=[]))],
orelse=[])],
__assign__: let the object know its name
Patch AST
def f():
a = set()
# print(ast.unparse(new_node)) # Python 3.9+
def f():
if True:
a = set()
if hasattr(a, '__assign__'):
a.__assign__('a', None)
NodeTransformer does node-to-node transformation,
so a dummy `if` is used to wrap the two nodes.
__assign__: let the object know its name
Compile patched AST to bytecode
Execute bytecode
patched_code = compile(new_node, "__assign__", "exec")
local_vars = {}
exec(patched_code, global_vars, local_vars)
patched_func = local_vars[func.__name__]
__assign__: let the object know its name
def f(assignee):
a = assignee
b, c = assignee, assignee
arr = [assignee,assignee]
d, e = arr
f, (g, h) = assignee, (assignee,assignee)
i = [0,0]
i[1] = assignee
j = [0,0]
x = 1
j[x] = assignee
k = [0,0]
x = [1]
z = 0
k[x[z]] = assignee
Assign <__main__.A object at 0x7f5fa0ecc550> as assignee idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as a idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as b idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as c idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as d idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as e idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as f idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as g idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as h idx None
Assign <__main__.A object at 0x7f5fa0ecc550> as i idx 1
Assign <__main__.A object at 0x7f5fa0ecc550> as j idx 1
Assign <__main__.A object at 0x7f5fa0ecc550> as k idx 1
def __assign__(self, name, idx):
print("Assign", self, "as", name, "idx", idx)
@jit decorator
1. Generate C code for wrapped python function
2. Compile C code into .so (shared object)
3. Load native function from .so with ctypes or cffi
4. Wrap native function with data conversion wrapper
5. Call wrapper function when invoked
@jit decorator
Plain python function
func
@jit decorator
Add a dummy decorator
func
Decorator
@jit decorator
Generate C code for wrapped python function
func
Decorator
C
@jit decorator
Compile C code into .so (shared object)
func
Decorator
.so
C
@jit decorator
Load native function from .so with ctypes or cffi
func
Decorator
native
func
.so
C
@jit decorator
Wrap native function with data conversion wrapper
func
Decorator
native
func
pre post
.so
C
@jit decorator
Call wrapper function when invoked
func
Decorator
native
func
pre post
.so
C
@jit decorator
Python function has no power here
func
Decorator
native
func
pre post
.so
C
Python Black Magic
Pitfalls
●
Decorators come with the function source code obtained with
inspect.getsource() and stay attached to the function in the AST
– Beware of decorator recursion during bytecode execution
●
Indentation comes with the function source code obtained with
inspect.getsource()
– Need to strip extra indentation before ast.parse()
●
AST structure differs between Python versions
– Tested with 3.8, 3.9
The rest of the transpiler
sketchy version
Trace output nodes to get DAG (Directed Acyclic Graph)
– Unrelated parts are ignored
The rest of the transpiler
sketchy version
Split branched DAG (partitions)
# stack/concat-ed partitions
for partition in partitions:
for Sidx in partition:
Didx = J( Sidx )
v = F( Sidx )
for Kidx in scatters(Didx):
OutBuf[ Kidx ] = v
The rest of the transpiler
sketchy version
Assemble CFG (Control-Flow Graph) and AST according to
Iexpr/Vexpr/Sexpr for each unbranched DAG
['autobuf', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1)],
['for_shape', ((0, 2, 1), (0, 2, 1)), 0, 0],
['scope',
[
[
'=',
Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1),
['*', [['ref', 'd_array_1', ('+', [['*', [['idx', 0, 0, 0], 2]], ['idx', 1, 0, 0]])], 2]], 0, 0
]
]
],
['newline'],
['comment', 'end of A'],
The rest of the transpiler
sketchy version
Generate C code
['autobuf', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1)],
['for_shape', ((0, 2, 1), (0, 2, 1)), 0, 0],
['scope',
[
[
'=',
Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1),
['*', [['ref', 'd_array_1', ('+', [['*', [['idx', 0, 0, 0], 2]], ['idx', 1, 0, 0]])], 2]], 0, 0
]
]
],
['newline'],
['comment', 'end of A'],
AUTOBUF float A[2 * 2]; // [2, 2] multiply
for(int i0=0;i0<2;i0+=1)
for(int i1=0;i1<2;i1+=1)
{
A[i0*2 + i1] = (d_array_1[((i0*2)+i1)]*2);
}
// end of A
The rest of the transpiler
A bit more detail
Index tailoring
def f_range_step2(np):
a = list(range(20))
return np.array(a)[2:-2:2][3:15:3]
void gen_getitem_range_step2(float getitem_range_step2[2] /* shape=[2] */)
{
const static int d_array_1[] = {0,1,2,3,4,5,6,7,8,9,10,
11,12,13,14,15,16,17,18,19};
for(int i0=8;i0<20;i0+=6)
{
// [(slice(2, -2, 2),)]
const int i0_0_1 = ((i0-2)/2);
// [(slice(3, 15, 3),)]
const int i0_0_2 = ((i0_0_1-3)/3);
getitem_range_step2[i0_0_2] = d_array_1[i0];
}
}
Functor: #3 getitem_range_step2
array((20,))[(slice(2, -2, 2),)][(slice(3, 15, 3),)]
shape=((0, 2, 1),)
partitions=[[(3, 2, 3)]]
iexpr=[
['//', [['-', ['i0', 3]], 3]]
]
Functor[0]: #2 array2
array((20,))[(slice(2, -2, 2),)]
shape=((0, 8, 1),)
partitions=[[(2, 8, 2)]]
iexpr=[
['//', [['-', ['i0', 2]], 2]]
]
Functor[0]: #1 array1
array((20,))
shape=((0, 20, 1),)
vexpr=['ref', ['d', 'i0']]
data=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Testing System
NumPy/Torch
vs.
Functor I/V/S expr
vs.
Generated C code
Functor + Transpiler = Fragile Development
Testing System
Golden Result
Compare
(Verify functor) Compare
(Verify transpiler)
test
NumPy
functors
DaFunctor
Result
eval
I/V/S expr
C
transpile
.so
compile
Result2
execute
Testing System
Dependency Injection
– np
●
numpy
●
dafunctor.numpy
from tester_numpy import *
def sample(np):
A = np.array([[4,5],[6,7]]) * 2
B = np.array([[5,5],[6,6]]) + 3
return A + B
test_func("sample", sample)
Testing System
Transpiled evaluation
– As previously described
Pure Python evaluation
– Per-node evaluation
– Standalone interpreter
Testing System
import dafunctor.numpy as np
s = np.meshgrid([1,2],[3,4,5])
s.eval()
array([[[1., 2.],
[1., 2.],
[1., 2.]],
[[3., 3.],
[4., 4.],
[5., 5.]]])
f = s.jit()
f()
array([[[1., 2.],
[1., 2.],
[1., 2.]],
[[3., 3.],
[4., 4.],
[5., 5.]]], dtype=float32)
More python
importlib
– Test suite loader
pygments
– Syntax highlighter
importlib
Test suite loader
import importlib
for fn in sorted(os.listdir(os.path.dirname(os.path.abspath(__file__)))):
if fn.startswith("numpy_"):
try:
importlib.import_module(os.path.splitext(fn)[0])
except:
print("Error running test", fn)
# raise
pygments
Syntax highlighter
pygments
Syntax highlighter
from pygments import highlight
from pygments.formatters import TerminalFormatter as Formatter
from pygments.lexers import CLexer as Lexer
code = open(cfile).read()
print(highlight(code, Lexer(), Formatter()))
from pygments import highlight
from pygments.formatters import TerminalFormatter as Formatter
from pygments.lexers import CLexer as Lexer
code = open(cfile).read()
print(highlight(code, Lexer(), Formatter()))
https://extensions.libreoffice.org/en/extensions/show/code-highlighter
References
● https://devguide.python.org/compiler/
Design of CPython’s Compiler
– Concepts for transpiler design
● https://github.com/RyanKung/assign
– The origin of the __assign__ magic
Source of Inspiration
DaFunctor
– Generates C code
– Compiled to machine code
– Execute machine code
Python
– Generates bytecode
– Interpret bytecode
Julia (LLVM based JIT)
– Generates LLVM IR
– Compiled to machine code
– Execute machine code
Thank You

Dafunctor

  • 1.
    DAFunctor Symbolic Translator from NumPy/PyTorch ND-Array operations to C Buganini Chiu PyCon Taiwan 2021 https://github.com/buganini/dafunctor
  • 2.
    About me ● Buganini Chiu, aka Bug ● Know some C & Python ● Had done a few deep learning solution porting tasks to various platforms https://github.com/buganini
  • 3.
    What? Why? How? ● Purpose ● Other approaches ● DAFunctor
  • 4.
    Purpose ● Porting numpy operations to certain platforms – Memory limit for embedded systems ● Intermediate buffers – Lack of support for C++ and ecosystem (some RTOS; no POSIX API) – Usage constraints ruled by development guidelines ● Fragmentation concern – MISRA C:2004, 20.4 - Dynamic heap memory allocation shall not be used. – MISRA C++ 2008, 18-4-1 - Dynamic heap memory allocation shall not be used. – MISRA C:2012, 21.3 The memory allocation and deallocation functions of <stdlib.h> shall not be used – Interoperability with other modules ● Maintainability
  • 5.
    Other approaches Given a function like this : def sample(): matrix = np.array([[4,5],[6,7]]) return matrix * 3 + 5
  • 6.
    static PyObject *__pyx_pf_19numpy_sample_cython_sample(CYTHON_UNUSED PyObject*__pyx_self) { PyObject *__pyx_v_matrix = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; PyObject *__pyx_t_2 = NULL; PyObject *__pyx_t_3 = NULL; PyObject *__pyx_t_4 = NULL; PyObject *__pyx_t_5 = NULL; __Pyx_RefNannySetupContext("sample", 0); /* "numpy_sample_cython.pyx":4 * * def sample(): * matrix = np.array([[4,5],[6,7]]) # <<<<<<<<<<<<<< * return matrix * 3 + 5 * */ __Pyx_GetModuleGlobalName(__pyx_t_2, __pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_array); if (unlikely(! __pyx_t_3)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = PyList_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_int_4); __Pyx_GIVEREF(__pyx_int_4); PyList_SET_ITEM(__pyx_t_2, 0, __pyx_int_4); __Pyx_INCREF(__pyx_int_5); __Pyx_GIVEREF(__pyx_int_5); PyList_SET_ITEM(__pyx_t_2, 1, __pyx_int_5); __pyx_t_4 = PyList_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_INCREF(__pyx_int_6); __Pyx_GIVEREF(__pyx_int_6); PyList_SET_ITEM(__pyx_t_4, 0, __pyx_int_6); __Pyx_INCREF(__pyx_int_7); __Pyx_GIVEREF(__pyx_int_7); PyList_SET_ITEM(__pyx_t_4, 1, __pyx_int_7); __pyx_t_5 = PyList_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_GIVEREF(__pyx_t_2); PyList_SET_ITEM(__pyx_t_5, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_4); PyList_SET_ITEM(__pyx_t_5, 1, __pyx_t_4); __pyx_t_2 = 0; __pyx_t_4 = 0; __pyx_t_4 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); if (likely(__pyx_t_4)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); __Pyx_INCREF(__pyx_t_4); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, 
function); } } __pyx_t_1 = (__pyx_t_4) ? __Pyx_PyObject_Call2Args(__pyx_t_3, __pyx_t_4, __pyx_t_5) : __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_matrix = __pyx_t_1; __pyx_t_1 = 0; /* "numpy_sample_cython.pyx":5 * def sample(): * matrix = np.array([[4,5],[6,7]]) * return matrix * 3 + 5 # <<<<<<<<<<<<<< * * sample() */ __Pyx_XDECREF(__pyx_r); __pyx_t_1 = PyNumber_Multiply(__pyx_v_matrix, __pyx_int_3); if (unlikely(! __pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = __Pyx_PyInt_AddObjC(__pyx_t_1, __pyx_int_5, 5, 0, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; def sample(): matrix = np.array([[4,5],[6,7]]) return matrix * 3 + 5 Other approaches Part of Cython output, auto translation Not for alien systems
  • 7.
    Other approaches NumCpp, manual translation #include "NumCpp.hpp" nc::NdArray<int> sample() { nc::NdArray<int> matrix = { {4, 5}, {6, 7} }; return matrix * 3 + 5; } def sample(): matrix = np.array([[4,5],[6,7]]) return matrix * 3 + 5 C++ Memory allocation for every function/operators
  • 8.
    Other approaches NumExpr, just for reference import numpy as np import numexpr as ne def sample(): matrix = np.array([[4,5],[6,7]]) return ne.evaluate("matrix * 3 + 5") def sample(): matrix = np.array([[4,5],[6,7]]) return matrix * 3 + 5 JIT only, does not generate code Uses its own DSL for expression No intermediate buffers
  • 9.
    DAFunctor auto translation void sample(float array3[4] /* shape=[2, 2] */) { const static int d_array_1[] = {4,5,6,7}; for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<2;i1+=1) { array3[i0*2 + i1] = ((d_array_1[((i0*2)+i1)]*3)+5); } } Exported to memory def sample(): matrix = np.array([[4,5],[6,7]]) return matrix * 3 + 5 Plotted by DAFunctor
  • 10.
    DAFunctor auto translation def sample(): A= np.array([[4,5],[6,7]]) * 2 B = np.array([[5,5],[6,6]]) + 3 return A + B void sample(float array5[4] /* shape=[2, 2] */) { const static int d_array_1[] = {4,5,6,7}; const static int d_array_2[] = {5,5,6,6}; for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<2;i1+=1) { array5[i0*2 + i1] = ((d_array_1[((i0*2)+i1)]*2)+(d_array_2[((i0*2)+i1)]+3)); } } Exported to memory Plotted by DAFunctor
  • 11.
    Concepts ● Functor (函子) vs. Function (函數) – How symbolic translation works ● Generative function (生成) vs. Aggregate function (匯總) – What kinds of functions are tackled in DAFunctor
  • 12.
    Functor vs. Function Functor(函子) – Functor() → Function – Constructs ● Functions ● Graphs, NN – Example ● functools.partial ● torch.nn Function (函數) – Function() → Value – Evaluates – Example ● Ordinary functions in imperative programming languages
  • 13.
    Generative function vs. Aggregate function Generative function (生成函數) – Create from nothing ● np.zeros, np.ones, np.arange, np.meshgrid – 1 : 1 ● np.sqrt, arithmetics ● np.transpose, np.reshape – Merge ● np.concatenate, np.stack – 1 : N ● np.repeat – Slicing – Most of them have common properties so that they can be merged together. Aggregate function (匯總函數) – N : 1 ● np.max, np.min, np.mean, np.std Others – Complex functions – Branched logic
  • 14.
    Decomposition of generative functions The common properties Take #1 – Consider functions like ● np.ones, np.reshape, arithmetics for Didx in OutBuf: OutBuf[Didx] = F( I(Didx) ) ● Intuitive
  • 15.
    Decomposition of generative functions The common properties Take #1 cont. – Consider functions like ● np.ones, np.reshape, arithmetics ● np.stack, np.concatenate for Didx in OutBuf: k = find_source_partition() OutBuf[Didx] = Fk( Ik(Didx) ) ● Intuitive ● Branch in loop
  • 16.
    Decomposition of generativefunctions The common properties Take #2 – Consider functions like ● np.ones, np.reshape, arithmetics ● np.stack, np.concatenate # stack/concat-ed partitions for partition in partitions: for Sidx in partition: OutBuf[ J(Sidx) ] = F( Sidx ) ● No branch in loop
  • 17.
    Decomposition of generative functions The common properties Take #2 cont. – Consider functions like ● np.ones, np.reshape, arithmetics ● np.stack, np.concatenate ● np.repeat # stack/concat-ed partitions for partition in partitions: for Sidx in partition: OutBuf[ J(Sidx) ] = F( Sidx ) ● No branch in loop ● Unable to handle scattering (one-to-many index mapping)
  • 18.
    Decomposition of generativefunctions The common properties Take #3 – Consider functions like ● np.ones, np.reshape, arithmetics ● np.stack, np.concatenate ● np.repeat # stack/concat-ed partitions for partition in partitions: for Sidx in partition: Didx = J( Sidx ) v = F( Sidx ) for Kidx in scatters(Didx): OutBuf[ Kidx ] = v
  • 19.
    Decomposition of generativefunctions The common properties ● F : Value expression – evaluate from index/data to value ● J : Index expression – evaluate from sub-functor index to index for this functor ● Scatters: Scatter expression – map one index to multiple indices ● Partitions : Sub-functors ● Shape – Start, num, step (for slicing) # stack/concat-ed partitions for partition in partitions: for Sidx in partition: Didx = J( Sidx ) v = F( Sidx ) for Kidx in scatters(Didx): OutBuf[ Kidx ] = v
  • 20.
    vexpr - ones,full def ones(shape): return NumpyFunctor( shape, vexpr = 1, desc = "ones", opdesc = f"ones({shape})", ) def full(shape, fill_value): return NumpyFunctor( shape, vexpr = fill_value, desc = "full", opdesc = f"full({shape}, {fill_value})", )
  • 21.
    vexpr - arange shape= [int(math.ceil((end - start) / step))] return NumpyFunctor( shape, vexpr = ["+",["d0",["*",["i0","d2"]]]], # start + i * step data = [start, end, step], desc = "arange", opdesc = f"arange({start}, {end}, {step})", )
  • 22.
    def f(): a =[[1,2,3,4]] return np.reshape( np.array(a), (2,2) ) iexpr - reshape def reshape(cls, a, shape): offset = ["+",[ ["*", [f"i{i}"] + [a.shape[j] for j in range(i+1,len(a.shape))] ] for i in rangel(a.shape)]] iexpr = [] for i in rangel(shape): iexpr.append( ast_strip([ "//", [ ["%", [offset]+[ [ "*", shape[j:] ] for j in range(i+1) ] ], ["*", shape[i+1:]]]])) Functor: #7 reshape3 reshape((2, 2))_array((1, 4)) shape=((0, 2, 1), (0, 2, 1)) partitions=[[(0, 1, 1), (0, 4, 1)]] iexpr=[ ['//', [['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]]]], 2]] ['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]], 2]] ] Functor[0]: #6 array6 array((1, 4)) shape=((0, 1, 1), (0, 4, 1)) vexpr=['ref', ['d', ['+', [['*', ['i0', 4]], 'i1']]]] data=[1, 2, 3, 4]
  • 23.
    def f(): a =[[1,2,3,4]] return np.reshape( np.array(a), (2,2) ) iexpr - reshape void gen_reshape(float reshape3[4] /* shape=[2, 2] */) { const static int d_array_3[] = {1,2,3,4}; for(int i0=0;i0<1;i0+=1) for(int i1=0;i1<4;i1+=1) { // reshape((2, 2)) const int i0_0_1 = ((((i0*4)+i1)%(2*2))/2); const int i1_0_1 = (((i0*4)+i1)%(2*2)%2); reshape3[i0_0_1*2 + i1_0_1] = d_array_3[((i0*4)+i1)]; } } Functor: #7 reshape3 reshape((2, 2))_array((1, 4)) shape=((0, 2, 1), (0, 2, 1)) partitions=[[(0, 1, 1), (0, 4, 1)]] iexpr=[ ['//', [['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]]]], 2]] ['%', [['+', [['*', ['i0', 4]], 'i1']], ['*', [2, 2]], 2]] ] Functor[0]: #6 array6 array((1, 4)) shape=((0, 1, 1), (0, 4, 1)) vexpr=['ref', ['d', ['+', [['*', ['i0', 4]], 'i1']]]] data=[1, 2, 3, 4]
  • 24.
    void gen_reshape(float reshape2[6]/* shape=[3, 2] */) { const static int d_array_2[] = {1,2,3,4,5,6}; for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<3;i1+=1) { // reshape((1, 2, 1, 3, 1)) const int i0_0_1 = ((((i0*3)+i1)%(1*2*1*3*1))/(2*1*3*1)); const int i1_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1))/(1*3*1)); const int i2_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1))/(3*1)); const int i3_0_1 = ((((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1)%(3*1))/1); const int i4_0_1 = (((i0*3)+i1)%(1*2*1*3*1)%(2*1*3*1)%(1*3*1)%(3*1)%1); // reshape((3, 2)) const int i0_0_2 = ((((i0_0_1*2*1*3*1)+(i1_0_1*1*3*1)+(i2_0_1*3*1)+(i3_0_1*1)+i4_0_1)%(3*2))/2); const int i1_0_2 = (((i0_0_1*2*1*3*1)+(i1_0_1*1*3*1)+(i2_0_1*3*1)+(i3_0_1*1)+i4_0_1)%(3*2)%2); reshape2[i0_0_2*2 + i1_0_2] = d_array_2[((i0*3)+i1)]; } } def f(): a = [[1,2,3],[4,5,6]] return np.reshape( np.reshape( np.array(a), (1,2,1,3,1) ), (3,2) ) reshape - merged
  • 25.
    def f(): return np.transpose( np.array([[4,5],[6,7]]),(1,0) ) + 3 iexpr + vexpr Functor: #3 transpose add shape=((0, 2, 1), (0, 2, 1)) vexpr=['+', ['v0', 3]] Functor[0]: #2 array2 transposed_array((2, 2)) shape=((0, 2, 1), (0, 2, 1)) partitions=[[(0, 2, 1), (0, 2, 1)]] iexpr=[ i1 i0 ] Functor[0]: #1 array1 array((2, 2)) shape=((0, 2, 1), (0, 2, 1)) vexpr=['ref', ['d', ['+', [['*', ['i0', 2]], 'i1']]]] data=[4, 5, 6, 7] void gen_transpose(float transpose[4] /* shape=[2, 2] */) { const static int d_array_1[] = {4,5,6,7}; for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<2;i1+=1) { // transpose((1, 0)) const int i0_0_1 = i1; const int i1_0_1 = i0; transpose[i0_0_1*2 + i1_0_1] = (d_array_1[((i0*2)+i1)]+3); } }
  • 26.
    def f(): return np.repeat( np.array([[1,2,3],[4,5,6]]), 3, axis=1) sexpr - repeat shape = list(a.shape) shape[axis] *= repeats iexpr = [f"i{i}" for i in rangel(shape)] iexpr[axis] = ["*", [f"i{axis}", repeats]] sexpr = (axis, 0, repeats, 1) # axis, start, num, step return NumpyFunctor( shape, partitions = [[(0,s,1) for s in a.shape]], iexpr = iexpr, sexpr = sexpr, desc = f"repeat_{repeats}", opdesc = f"repeat({repeats})", subs = [a] ) Functor: #2 repeat_axis_1 repeat_3 shape=((0, 2, 1), (0, 9, 1)) partitions=[[(0, 2, 1), (0, 3, 1)]] sexpr=(1, 0, 3, 1) iexpr=[ i0 ['*', ['i1', 3]] ] Functor[0]: #1 array1 array((2, 3)) shape=((0, 2, 1), (0, 3, 1)) vexpr=['ref', ['d', ['+', [['*', ['i0', 3]], 'i1']]]] data=[1, 2, 3, 4, 5, 6]
  • 27.
    def f(): return np.repeat( np.array([[1,2,3],[4,5,6]]), 3, axis=1) sexpr - repeat Functor: #2 repeat_axis_1 repeat_3 shape=((0, 2, 1), (0, 9, 1)) partitions=[[(0, 2, 1), (0, 3, 1)]] sexpr=(1, 0, 3, 1) iexpr=[ i0 ['*', ['i1', 3]] ] Functor[0]: #1 array1 array((2, 3)) shape=((0, 2, 1), (0, 3, 1)) vexpr=['ref', ['d', ['+', [['*', ['i0', 3]], 'i1']]]] data=[1, 2, 3, 4, 5, 6] void gen_repeat_axis_1(float repeat_axis_1[18] /* shape=[2, 9] */) { const static int d_array_1[] = {1,2,3,4,5,6}; for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<3;i1+=1) { // repeat(3) const int i0_0_1 = i0; const int i1_0_1 = (i1*3); const float v0 = d_array_1[((i0*3)+i1)]; const int i0_1_1 = i0_0_1; for(int i1_1_1=0+i1_0_1;i1_1_1<3+i1_0_1;i1_1_1+=1) { repeat_axis_1[i0_1_1*9 + i1_1_1] = v0; } } }
  • 28.
    Python Black Magic ● Shadow object: data view wrapper ● __assign__: let the object know its name – To maintain readability ● @jit decorator: wrap all the procedures translating python function to native function
  • 29.
    Shadow object: dataview wrapper ● Shape (1,12), (3,4), (6,2) has no difference in memory ● Share everything except for shape – Once-only export ● Override magic methods – __getattr__ – __setattr__ class Reshaper(Functor): def __init__(self, functor, shape): self.functor = functor if type(self.functor) is Reshaper: self.functor = self.functor.functor self.shape = Shape(shape) def __getattr__(self, name): if name in ("functor", "shape"): return object.__getattr__(self, name) else: return getattr(self.functor, name) def __setattr__(self, name, value): if name in ("functor", "shape"): object.__setattr__(self, name, value) else: setattr(self.functor, name, value)
  • 30.
    __assign__: let theobject know its name Long story def f(assignee): a = assignee b, c = assignee, assignee arr = [assignee, assignee] d, e = arr f, (g, h) = assignee, (assignee, assignee) i = [0,0] i[1] = assignee j = [0,0] x = 1 j[x] = assignee k = [0,0] x = [1] z = 0 k[x[z]] = assignee https://github.com/buganini/DAFunctor/blob/master/dafunctor/assign.py
  • 31.
    __assign__: let theobject know its name First thought import inspect def f(): a = set() print("locals():", locals()) print("This frame:", inspect.currentframe().f_locals) print("Outer frame:", inspect.currentframe().f_back.f_locals) def f2(): b = set() f() f2() ● Symbol table – NumExpr, SymPy ● Cannot be applied from the outside of functions Output: locals(): {'a': set()} This frame: {'a': set()} Outer frame: {'b': set()}
  • 32.
    __assign__: let theobject know its name Short story varname = obj varname = obj if hasattr(obj, "__assign__"): obj.__assign__("varname", None) array[3] = obj array[3] = obj if hasattr(obj, "__assign__"): obj.__assign__("array", 3)
  • 33.
    __assign__: let the object know its name 1. Get the source code 2. Source code → AST (Abstract Syntax Tree) 3. Patch AST 4. Compile patched AST to bytecode 5. Execute bytecode
  • 34.
    __assign__: let the object know its name Get the source code import inspect def f(): a = set() print(inspect.getsource(f)) def f(): a = set()
  • 35.
    __assign__: let theobject know its name Source → AST (Abstract Syntax Tree) import inspect import ast def f(): a = set() src = inspect.getsource(f) node = ast.parse(src) # print(ast.dump(node, indent=4)) # Python 3.9+ print(ast.dump(node)) Module( body=[ FunctionDef( name='f', args=arguments( posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]), body=[ Assign( targets=[ Name(id='a', ctx=Store())], value=Call( func=Name(id='set', ctx=Load()), args=[], keywords=[]))], decorator_list=[])], type_ignores=[])
  • 36.
    __assign__: let theobject know its name Patch AST class AssignTransformer(ast.NodeTransformer): def visit_Assign(self, node): ... def visit_FunctionDef(self, func): … trans = AssignTransformer() new_node = trans.visit(node) # print(ast.unparse(new_node)) # Python 3.9+ ast.fix_missing_locations(new_node) print(ast.dump(new_node))
  • 37.
    __assign__: let theobject know its name Patch AST def f(): a = set() Module( body=[ FunctionDef( name='f', args=arguments( posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]), body=[ Assign( targets=[ Name(id='a', ctx=Store())], value=Call( func=Name(id='set', ctx=Load()), args=[], keywords=[]))], decorator_list=[])], type_ignores=[]) Assign( targets=[ Name(id='a', ctx=Store())], value=Call( func=Name(id='set', ctx=Load()), args=[], keywords=[])), If( test=Call( func=Name(id='hasattr', ctx=Load()), args=[ Name(id='a', ctx=Load()), Constant(value='__assign__')], keywords=[]), body=[ Expr( value=Call( func=Attribute( value=Name(id='a', ctx=Load()), attr='__assign__', ctx=Load()), args=[ Constant(value='a'), Constant(value=None)], keywords=[]))], orelse=[])],
  • 38.
    __assign__: let theobject know its name Patch AST def f(): a = set() # print(ast.unparse(new_node)) # Python 3.9+ def f(): if True: a = set() if hasattr(a, '__assign__'): a.__assign__('a', None) NodeTransformer does node-to-node transformation, use a dummy IF to wrap two nodes.
  • 39.
    __assign__: let theobject know its name Compile patched AST to bytecode Execute bytecode patched_code = compile(new_node, "__assign__", "exec") local_vars = {} exec(patched_code, global_vars, local_vars) patched_func = local_vars[func.__name__]
  • 40.
    __assign__: let theobject know its name def f(assignee): a = assignee b, c = assignee, assignee arr = [assignee,assignee] d, e = arr f, (g, h) = assignee, (assignee,assignee) i = [0,0] i[1] = assignee j = [0,0] x = 1 j[x] = assignee k = [0,0] x = [1] z = 0 k[x[z]] = assignee Assign <__main__.A object at 0x7f5fa0ecc550> as assignee idx None Assign <__main__.A object at 0x7f5fa0ecc550> as a idx None Assign <__main__.A object at 0x7f5fa0ecc550> as b idx None Assign <__main__.A object at 0x7f5fa0ecc550> as c idx None Assign <__main__.A object at 0x7f5fa0ecc550> as d idx None Assign <__main__.A object at 0x7f5fa0ecc550> as e idx None Assign <__main__.A object at 0x7f5fa0ecc550> as f idx None Assign <__main__.A object at 0x7f5fa0ecc550> as g idx None Assign <__main__.A object at 0x7f5fa0ecc550> as h idx None Assign <__main__.A object at 0x7f5fa0ecc550> as i idx 1 Assign <__main__.A object at 0x7f5fa0ecc550> as j idx 1 Assign <__main__.A object at 0x7f5fa0ecc550> as k idx 1 def __assign__(self, name, idx): print("Assign", self, "as", name, "idx", idx)
  • 41.
    @jit decorator 1. Generate C code for wrapped python function 2. Compile C code into .so (shared object) 3. Load native function from .so with ctypes or cffi 4. Wrap native function with data conversion wrapper 5. Call wrapper function when invoked
  • 42.
  • 43.
    @jit decorator Add a dummy decorator func Decorator
  • 44.
    @jit decorator Generate C code for wrapped python function func Decorator C
  • 45.
    @jit decorator Compile C code into .so (shared object) func Decorator .so C
  • 46.
    @jit decorator Load native function from .so with ctypes or cffi func Decorator native func .so C
  • 47.
    @jit decorator Wrap native function with data conversion wrapper func Decorator native func pre post .so C
  • 48.
    @jit decorator Call wrapper function when invoked func Decorator native func pre post .so C
  • 49.
    @jit decorator Python function has no power here func Decorator native func pre post .so C
  • 50.
    Python Black Magic Pitfalls ● Decorators come with function source code obtained with inspect.getsource() and attach on function in AST – Beware of decorator recursion during bytecode execution ● Indentations come with function source code obtained with inspect.getsource() – Need to strip extra indentation before ast.parse() ● AST structure differs with python versions – Tested with 3.8, 3.9
  • 51.
    The rest of the transpiler sketchy version Trace output nodes to get DAG (Directed Acyclic Graph) – Unrelated parts are ignored
  • 52.
    The rest of the transpiler sketchy version Split branched DAG (partitions) # stack/concat-ed partitions for partition in partitions: for Sidx in partition: Didx = J( Sidx ) v = F( Sidx ) for Kidx in scatters(Didx): OutBuf[ Kidx ] = v
  • 53.
    The rest ofthe transpiler sketchy version Assemble CFG (Control-Flow Graph) and AST according to Iexpr/Vexpr/Sexpr for each unbranched DAG ['autobuf', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1)], ['for_shape', ((0, 2, 1), (0, 2, 1)), 0, 0], ['scope', [ [ '=', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1), ['*', [['ref', 'd_array_1', ('+', [['*', [['idx', 0, 0, 0], 2]], ['idx', 1, 0, 0]])], 2]], 0, 0 ] ] ], ['newline'], ['comment', 'end of A'],
  • 54.
    The rest ofthe transpiler sketchy version Generate C code ['autobuf', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1)], ['for_shape', ((0, 2, 1), (0, 2, 1)), 0, 0], ['scope', [ [ '=', Functor(id=2, name=A, desc=multiply, shape=((0, 2, 1), (0, 2, 1)), subs=1), ['*', [['ref', 'd_array_1', ('+', [['*', [['idx', 0, 0, 0], 2]], ['idx', 1, 0, 0]])], 2]], 0, 0 ] ] ], ['newline'], ['comment', 'end of A'], AUTOBUF float A[2 * 2]; // [2, 2] multiply for(int i0=0;i0<2;i0+=1) for(int i1=0;i1<2;i1+=1) { A[i0*2 + i1] = (d_array_1[((i0*2)+i1)]*2); } // end of A
  • 55.
    The rest ofthe transpiler A bit more detail Index tailoring def f_range_step2(np): a = list(range(20)) return np.array(a)[2:-2:2][3:15:3] void gen_getitem_range_step2(float getitem_range_step2[2] /* shape=[2] */) { const static int d_array_1[] = {0,1,2,3,4,5,6,7,8,9,10, 11,12,13,14,15,16,17,18,19}; for(int i0=8;i0<20;i0+=6) { // [(slice(2, -2, 2),)] const int i0_0_1 = ((i0-2)/2); // [(slice(3, 15, 3),)] const int i0_0_2 = ((i0_0_1-3)/3); getitem_range_step2[i0_0_2] = d_array_1[i0]; } } Functor: #3 getitem_range_step2 array((20,))[(slice(2, -2, 2),)][(slice(3, 15, 3),)] shape=((0, 2, 1),) partitions=[[(3, 2, 3)]] iexpr=[ ['//', [['-', ['i0', 3]], 3]] ] Functor[0]: #2 array2 array((20,))[(slice(2, -2, 2),)] shape=((0, 8, 1),) partitions=[[(2, 8, 2)]] iexpr=[ ['//', [['-', ['i0', 2]], 2]] ] Functor[0]: #1 array1 array((20,)) shape=((0, 20, 1),) vexpr=['ref', ['d', 'i0']] data=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
  • 56.
    Testing System NumPy/Torch vs. Functor I/V/S expr vs. Generated C code Functor + Transpiler = Fragile Development
  • 57.
    Testing System Golden Result Compare (Verify functor) Compare (Verify transpiler) test NumPy functors DaFunctor Result eval I/V/S expr C transpile .so compile Result2 execute
  • 58.
    Testing System Dependency Injection – np ● numpy ● dafunctor.numpy from tester_numpy import * def sample(np): A = np.array([[4,5],[6,7]]) * 2 B = np.array([[5,5],[6,6]]) + 3 return A + B test_func("sample", sample)
  • 59.
    Testing System Transpiled evaluation – As previously described Pure Python evaluation – Per-node evaluation – Standalone interpreter
  • 60.
    Testing System import dafunctor.numpy as np s = np.meshgrid([1,2],[3,4,5]) s.eval() array([[[1., 2.], [1., 2.], [1., 2.]], [[3., 3.], [4., 4.], [5., 5.]]]) f = s.jit() f() array([[[1., 2.], [1., 2.], [1., 2.]], [[3., 3.], [4., 4.], [5., 5.]]], dtype=float32)
  • 61.
    More python importlib – Test suite loader pygments – Syntax highlighter
  • 62.
    importlib Test suite loader import importlib for fn in sorted(os.listdir(os.path.dirname(os.path.abspath(__file__)))): if fn.startswith("numpy_"): try: importlib.import_module(os.path.splitext(fn)[0]) except: print("Error running test", fn) # raise
  • 63.
  • 64.
    pygments Syntax highlighter from pygments import highlight from pygments.formatters import TerminalFormatter as Formatter from pygments.lexers import CLexer as Lexer code = open(cfile).read() print(highlight(code, Lexer(), Formatter())) from pygments import highlight from pygments.formatters import TerminalFormatter as Formatter from pygments.lexers import CLexer as Lexer code = open(cfile).read() print(highlight(code, Lexer(), Formatter())) https://extensions.libreoffice.org/en/extensions/show/code-highlighter
  • 65.
    References ● https://devguide.python.org/compiler/ Design of CPython’s Compiler – Concepts for transpiler design ● https://github.com/RyanKung/assign – The origin of the __assign__ magic
  • 66.
    Source of Inspiration DaFunctor – Generates C code – Compiled to machine code – Execute machine code Python – Generates bytecode – Interpret bytecode Julia (LLVM based JIT) – Generates LLVM IR – Compiled to machine code – Execute machine code
  • 67.