Introduction to TOPI
This is an introductory tutorial to the TVM Operator Inventory (TOPI). TOPI provides numpy-style generic operations and schedules with a higher level of abstraction than TVM. In this tutorial we will see how TOPI saves us from writing boilerplate code in TVM.
import tvm
import tvm.testing
from tvm import te
from tvm import topi
import numpy as np
Basic example
Let's revisit the row sum operation (equivalent to B = numpy.sum(A, axis=1)). To compute the sum of the rows of a two-dimensional TVM tensor A, we have to specify the symbolic operation as well as a schedule, as follows:
n = te.var("n")
m = te.var("m")
A = te.placeholder((n, m), name="A")
k = te.reduce_axis((0, m), "k")
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
s = te.create_schedule(B.op)
To see the generated IR code in human-readable format, we can do:
print(tvm.lower(s, [A], simple_mode=True))
Output:
@main = primfn(A_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto")}
  buffer_map = {A_1: A}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n, m: int32], [stride, stride_1: int32], type="auto")} {
  allocate(B: Pointer(global float32), float32, [n]), storage_scope = global;
  for (i: int32, 0, n) {
    B_1: Buffer(B, float32, [n], [])[i] = 0f32
    for (k: int32, 0, m) {
      B_1[i] = (B_1[i] + A[((i*stride) + (k*stride_1))])
    }
  }
}
However, for such a common operation we had to define the reduce axis ourselves and write the explicit computation with te.compute. Fortunately, we can replace those two lines with a single call to topi.sum (similar to numpy.sum):
C = topi.sum(A, axis=1)
ts = te.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))
Output:
@main = primfn(A_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto")}
  buffer_map = {A_1: A}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n, m: int32], [stride, stride_1: int32], type="auto")} {
  allocate(A_red: Pointer(global float32), float32, [n]), storage_scope = global;
  for (ax0: int32, 0, n) {
    A_red_1: Buffer(A_red, float32, [n], [])[ax0] = 0f32
    for (k1: int32, 0, m) {
      A_red_1[ax0] = (A_red_1[ax0] + A[((ax0*stride) + (k1*stride_1))])
    }
  }
}
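As a quick sanity check (not part of the original example), the topi.sum schedule can be built and run just like a hand-written one and compared against numpy; a minimal sketch, assuming an llvm target and a concrete input shape of (128, 64):

# Build the topi.sum schedule for CPU and compare with numpy.sum(axis=1).
fsum = tvm.build(ts, [A, C], "llvm")
a_np = np.random.uniform(size=(128, 64)).astype(A.dtype)
a_nd = tvm.nd.array(a_np)
c_nd = tvm.nd.array(np.zeros(128, dtype=C.dtype))
fsum(a_nd, c_nd)
tvm.testing.assert_allclose(c_nd.numpy(), a_np.sum(axis=1), rtol=1e-5)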
Numpy-style operator overloading
We can add two tensors using topi.broadcast_add, provided their shapes are broadcastable. To make it even shorter, TOPI provides operator overloading for such common operations. For example:
x, y = 100, 10
a = te.placeholder((x, y, y), name="a")
b = te.placeholder((y, y), name="b")
c = a + b  # equivalent to topi.broadcast_add
d = a * b  # equivalent to topi.broadcast_mul
Overloaded with the same syntax, TOPI handles broadcasting a primitive (int, float) to a tensor, as in d - 3.14.
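For instance, subtracting a Python float from d produces yet another broadcast tensor; a small illustration (the variable name e is ours):

e = d - 3.14    # the scalar is broadcast over d's shape, similar to topi.subtract(d, 3.14)
print(type(e))  # still a TE tensor
print(e.shape)  # [100, 10, 10], same shape as d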
Generic schedules and fusing operations
So far we have seen how TOPI saves us from writing explicit computations with lower-level APIs, but the scheduling process is still the same as before. TOPI also provides higher-level scheduling recipes depending on a given context. For CUDA, for example, we can schedule the following series of operations, which ends with topi.sum, using only topi.generic.schedule_reduce:
e = topi.elemwise_sum([c, d])
f = e / 2.0
g = topi.sum(f)
with tvm.target.cuda():
sg = topi.cuda.schedule_reduce(g)
print(tvm.lower(sg, [a, b], simple_mode=True))
Output:
/workspace/python/tvm/target/target.py:377: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
@main = primfn(a_1: handle, b_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {a: Buffer(a_2: Pointer(float32), float32, [10000], []),
             b: Buffer(b_2: Pointer(float32), float32, [100], [])}
  buffer_map = {a_1: a, b_1: b}
  preflattened_buffer_map = {a_1: a_3: Buffer(a_2, float32, [100, 10, 10], []), b_1: b_3: Buffer(b_2, float32, [10, 10], [])} {
  allocate(T_divide_red: Pointer(global float32), float32, [1]), storage_scope = global;
  attr [IterVar(threadIdx.x: int32, [0:1024], "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024;
  allocate(T_divide_red.rf: Pointer(local float32), float32, [1]), storage_scope = local;
  allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local {
    T_divide_red.rf_1: Buffer(T_divide_red.rf, float32, [1], [], scope="local", align=4)[0] = 0f32
    for (k0.k1.fused.k2.fused.outer: int32, 0, 10) {
      if @tir.likely((((((k0.k1.fused.k2.fused.outer*64) + floordiv(threadIdx.x, 16)) < 625) && (((k0.k1.fused.k2.fused.outer*64) + floordiv(threadIdx.x, 16)) < 625)) && (((k0.k1.fused.k2.fused.outer*64) + floordiv(threadIdx.x, 16)) < 625)), dtype=bool) {
        T_divide_red.rf_1[0] = (T_divide_red.rf_1[0] + (((a[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)] + b[((floordiv(floormod(((k0.k1.fused.k2.fused.outer*12) + floordiv(threadIdx.x, 2)), 50), 5)*10) + floormod(((k0.k1.fused.k2.fused.outer*4) + threadIdx.x), 10))]) + (a[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)]*b[((floordiv(floormod(((k0.k1.fused.k2.fused.outer*12) + floordiv(threadIdx.x, 2)), 50), 5)*10) + floormod(((k0.k1.fused.k2.fused.outer*4) + threadIdx.x), 10))]))*0.5f32))
      }
    }
    attr [meta[tir.CommReducer][0]] "reduce_scope" = @tir.reinterpret(0u64, dtype=handle);
    @tir.tvm_thread_allreduce(1u32, T_divide_red.rf_1[0], True, reduce_temp0_1: Buffer(reduce_temp0, float32, [1], [], scope="local")[0], threadIdx.x, dtype=handle)
    if (threadIdx.x == 0) {
      T_divide_red_1: Buffer(T_divide_red, float32, [1], [], align=4)[0] = reduce_temp0_1[0]
    }
  }
}
As you can see, the scheduled stages of the computation have been accumulated, and we can examine them with:
print(sg.stages)
Output:
[stage(a, placeholder(a, 0x228afb00)), stage(b, placeholder(b, 0x22097c90)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_elemwise_sum, compute(T_elemwise_sum, body=[(T_add[ax0, ax1, ax2] + T_multiply[ax0, ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide, compute(T_divide, body=[(T_elemwise_sum[ax0, ax1, ax2]/2f)], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide_red.rf, compute(T_divide_red.rf, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide[floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10), floormod((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10)]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], where=tir.likely((((floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10), 10) < 100) && (floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)), 10) < 1000)) && ((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer*1024)) < 10000))), value_index=0)], axis=[iter_var(k0.k1.fused.k2.fused.inner, range(min=0, ext=1024))], reduce_axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], tag=, attrs={})), stage(T_divide_red, compute(T_divide_red.repl, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide_red.rf[k0.k1.fused.k2.fused.inner.v]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], where=(bool)1, value_index=0)], axis=[], reduce_axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], tag=, attrs={}))]
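Each entry above is a te.schedule.Stage, so the accumulated stages can also be inspected programmatically; for example (a small illustration, not from the original example), printing just the operation names:

# Print only the operation name behind each accumulated stage.
for stage in sg.stages:
    print(stage.op.name)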
We can test the correctness by comparing with the numpy result as follows:
func = tvm.build(sg, [a, b, g], "cuda")
dev = tvm.cuda(0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, dev)
b_nd = tvm.nd.array(b_np, dev)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
func(a_nd, b_nd, g_nd)
tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
TOPI also provides common neural-network operations such as softmax, with an optimized schedule:
tarray = te.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.Target("cuda"):
sst = topi.cuda.schedule_softmax(softmax_topi)
print(tvm.lower(sst, [tarray], simple_mode=True))
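As with the reduction above, the lowered softmax schedule could be built and checked against a plain numpy reference; a minimal sketch, assuming a CUDA-capable device is available:

# Build the scheduled softmax and compare with a numpy reference over the last axis.
fsoftmax = tvm.build(sst, [tarray, softmax_topi], "cuda")
dev = tvm.cuda(0)
t_np = np.random.uniform(size=(512, 512)).astype(tarray.dtype)
ref_np = np.exp(t_np - t_np.max(axis=-1, keepdims=True))
ref_np = ref_np / ref_np.sum(axis=-1, keepdims=True)
t_nd = tvm.nd.array(t_np, dev)
out_nd = tvm.nd.array(np.zeros((512, 512), dtype=tarray.dtype), dev)
fsoftmax(t_nd, out_nd)
tvm.testing.assert_allclose(out_nd.numpy(), ref_np, rtol=1e-5)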