Auto-tuning a Convolutional Network for x86 CPU
This tutorial shows how to tune a convolutional neural network for x86 CPU.
Note that this tutorial will not run on Windows or recent versions of macOS. To get it to run, wrap the body of this tutorial in an if __name__ == "__main__": block.
import os
import numpy as np
import tvm
from tvm import relay, autotvm
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_executor as runtime
Define the network
First, define the network in the Relay frontend API. You can load a predefined network from relay.testing, or build one with Relay (such as relay.testing.resnet). Models can also be loaded from MXNet, ONNX, and TensorFlow.
This tutorial uses resnet-18 as the tuning example.
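For reference, here is a minimal, hedged sketch of the frontend path for ONNX; the file name "resnet18.onnx" and the input name "0" are placeholder assumptions, not part of this tutorial:

import onnx

# assumption: a local ONNX file whose graph input is named "0"
onnx_model = onnx.load("resnet18.onnx")
shape_dict = {"0": (1, 3, 224, 224)}
mod, params = relay.frontend.from_onnx(onnx_model, shape=shape_dict)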
def get_network(name, batch_size):
    """Get the symbol definition and random weights of a network"""
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    if "resnet" in name:
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer, batch_size=batch_size, dtype=dtype
        )
    elif "vgg" in name:
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.vgg.get_workload(
            num_layers=n_layer, batch_size=batch_size, dtype=dtype
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "squeezenet_v1.1":
        mod, params = relay.testing.squeezenet.get_workload(
            batch_size=batch_size, version="1.1", dtype=dtype
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for MXNet models
        from mxnet.gluon.model_zoo.vision import get_model

        block = get_model("resnet18_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={input_name: input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    else:
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape
# Replace "llvm" with the correct target of your CPU.
# For example, for an AWS EC2 c5 instance with Intel Xeon
# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512".
# For an AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be
# "llvm -mcpu=core-avx2".
target = "llvm"
batch_size = 1
dtype = "float32"
model_name = "resnet-18"
log_file = "%s.log" % model_name
graph_opt_sch_file = "%s_graph_opt.log" % model_name
# Set the input name of the graph.
# For ONNX models, it is typically "0".
input_name = "data"
# Set the number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 1
os.environ["TVM_NUM_THREADS"] = str(num_threads)
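If you prefer to derive the thread count rather than hard-code it, here is a hedged alternative sketch; halving os.cpu_count() to approximate physical cores assumes 2-way hyper-threading, which you should verify for your machine:

logical_cores = os.cpu_count() or 1
num_threads = max(1, logical_cores // 2)  # assumption: 2-way SMT
os.environ["TVM_NUM_THREADS"] = str(num_threads)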
Configure tensor tuning settings and create tasks
To get better kernel execution performance on x86 CPU, we change the data layout of the convolution kernel from "NCHW" to "NCHWc". To handle this case, the conv2d_NCHWc operator is defined in topi, and we tune this operator instead of plain conv2d.
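To make the layout change concrete, the following NumPy-only sketch (an illustration, not a TVM API call) shows how an "NCHW" tensor maps to "NCHW16c": the channel axis is split into blocks of 16, and the inner block becomes the last axis.

x_nchw = np.random.rand(1, 64, 56, 56).astype("float32")  # (N, C, H, W)
# split C=64 into 4 outer blocks of 16 inner channels, then move the
# inner block to the last axis: (N, C//16, H, W, 16)
x_nchw16c = x_nchw.reshape(1, 4, 16, 56, 56).transpose(0, 1, 3, 4, 2)
print(x_nchw16c.shape)  # (1, 4, 56, 56, 16)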
We use local mode for the tuning configuration. RPC tracker mode can be set up similarly to the approach in the Auto-tuning a Convolutional Network for ARM CPU tutorial (a sketch of the RPC variant follows the tuning_option block below).
For a precise measurement, we should repeat the measurement several times and use the average of the results. In addition, we need to flush the cache for weight tensors between repeated measurements, which makes the measured latency of an operator closer to its actual latency during end-to-end inference.
tuning_option = {
    "log_filename": log_file,
    "tuner": "random",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}
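For reference, a hedged sketch of the RPC tracker variant mentioned above; the device key "my-device" and the tracker address 127.0.0.1:9190 are placeholders for your own setup:

measure_option_rpc = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.RPCRunner(
        "my-device",  # placeholder: device key registered with the tracker
        host="127.0.0.1",  # placeholder: tracker host
        port=9190,  # placeholder: tracker port
        number=1,
        repeat=10,
        min_repeat_ms=0,
        enable_cpu_cache_flush=True,
    ),
)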
# You can skip the implementation of this function for this tutorial.
def tune_kernels(
    tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"
):
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == "xgb" or tuner == "xgb-rank":
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = len(task.config_space)
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ],
        )
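As a side note, a tuner can warm-start from an earlier log so that previous measurements guide the new search; a minimal sketch, assuming a task and an existing "tuning.log" file are in scope:

tuner_obj = XGBTuner(task, loss_type="rank")
if os.path.isfile("tuning.log"):
    # reuse earlier measurements instead of starting from scratch
    tuner_obj.load_history(autotvm.record.load_from_file("tuning.log"))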
# Use graph tuner to achieve graph-level optimal schedules.
# Set use_DP=False if it takes too long to finish.
def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
    target_op = [
        relay.op.get("nn.conv2d"),
    ]
    Tuner = DPTuner if use_DP else PBQPTuner
    executor = Tuner(graph, {input_name: dshape}, records, target_op, target)
    executor.benchmark_layout_transform(min_exec_num=2000)
    executor.run()
    executor.write_opt_sch2record_file(opt_sch_file)
Finally, we launch tuning jobs and evaluate the end-to-end performance.
def evaluate_performance(lib, data_shape):
    # upload parameters to device
    dev = tvm.cpu()
    data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
    module = runtime.GraphModule(lib["default"](dev))
    module.set_input(input_name, data_tvm)

    # evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, number=100, repeat=3))
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
    )

    # run tuning tasks
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # compile kernels in default mode
    print("Evaluation of the network compiled in 'default' mode without auto-tuning:")
    with tvm.transform.PassContext(opt_level=3):
        print("Compile...")
        lib = relay.build(mod, target=target, params=params)
        evaluate_performance(lib, data_shape)

    # compile kernels in kernel-tuned-only mode
    print("\nEvaluation of the network tuned at the kernel level:")
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
        evaluate_performance(lib, data_shape)

    # compile kernels with graph-level best records
    print("\nEvaluation of the network tuned at the graph level:")
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)
        evaluate_performance(lib, data_shape)
# We do not run the tuning on our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

# tune_and_evaluate(tuning_option)