I have a module in the linalg dialect that I then super-vectorized into the vector dialect. How can I convert it to the GPU or SPIR-V dialect? I want to end up with a module/kernel in which the arithmetic is vectorized.
The pass and the source I was using:
# Pipeline: bufferize the tensor-level IR (tensor/arith/scf/shape/linalg/func),
# lower linalg to affine loops, super-vectorize them with a virtual vector
# size of 8, then fully unroll the affine loops.  Reads 2.mlir, writes 3.mlir.
mlir-opt --pass-pipeline='
builtin.module(
convert-tensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,scf-bufferize,shape-bufferize,linalg-bufferize,bufferization-bufferize,tensor-bufferize),
func-bufferize,
func.func(convert-linalg-to-affine-loops,affine-super-vectorize{virtual-vector-size=8},
affine-loop-unroll{unroll-full})
)' 2.mlir > 3.mlir
2.mlir
// 2.mlir — tensor-level input module (ReLU exported from torch).
// Identity indexing map: every operand is accessed element-wise.
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "ReLU"} {
// Element-wise ReLU over a 512x640x20x16 f32 tensor:
// out = (in > 0) ? in : 0, using an unordered greater-than compare.
func.func @forward(%arg0: tensor<512x640x20x16xf32>) -> tensor<512x640x20x16xf32> {
%cst = arith.constant 0.000000e+00 : f32
// Destination tensor for the linalg.generic result.
%0 = tensor.empty() : tensor<512x640x20x16xf32>
%1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x640x20x16xf32>) outs(%0 : tensor<512x640x20x16xf32>) {
^bb0(%in: f32, %out: f32):
// ReLU body: select the input when it compares ugt 0.0, else 0.0.
%2 = arith.cmpf ugt, %in, %cst : f32
%3 = arith.select %2, %in, %cst : f32
linalg.yield %3 : f32
} -> tensor<512x640x20x16xf32>
return %1 : tensor<512x640x20x16xf32>
}
// Driver: feeds a splat 1.3 constant tensor through @forward.
func.func @main() {
%0= arith.constant dense<1.3>:tensor<512x640x20x16xf32>
%1 = func.call @forward(%0) : (tensor<512x640x20x16xf32>) -> tensor<512x640x20x16xf32>
return
}
}
3.mlir
// 3.mlir — result of the pipeline above: bufferized, lowered to affine
// loops, and super-vectorized (innermost dimension processed 8 lanes at a
// time via vector.transfer_read / vector.transfer_write).
module attributes {torch.debug_module_name = "ReLU"} {
// The splat 1.3 input constant, bufferized into a private global.
memref.global "private" constant @__constant_512x640x20x16xf32 : memref<512x640x20x16xf32> = dense<1.300000e+00>
func.func @forward(%arg0: memref<512x640x20x16xf32>) -> memref<512x640x20x16xf32> {
%cst = arith.constant 0.000000e+00 : f32
%alloc = memref.alloc() {alignment = 64 : i64} : memref<512x640x20x16xf32>
affine.for %arg1 = 0 to 512 {
affine.for %arg2 = 0 to 640 {
affine.for %arg3 = 0 to 20 {
// Innermost loop steps by the virtual vector size (8); 16 is evenly
// divisible by 8, so both transfers are in_bounds.
affine.for %arg4 = 0 to 16 step 8 {
%cst_0 = arith.constant dense<0.000000e+00> : vector<8xf32>
%cst_1 = arith.constant 0.000000e+00 : f32
// Load 8 contiguous f32 lanes; %cst_1 is the (unused) padding value.
%0 = vector.transfer_read %arg0[%arg1, %arg2, %arg3, %arg4], %cst_1 {in_bounds = [true]} : memref<512x640x20x16xf32>, vector<8xf32>
// Vectorized ReLU: lane-wise select between input and zero.
%1 = arith.cmpf ugt, %0, %cst_0 : vector<8xf32>
%2 = arith.select %1, %0, %cst_0 : vector<8xi1>, vector<8xf32>
vector.transfer_write %2, %alloc[%arg1, %arg2, %arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<512x640x20x16xf32>
}
}
}
}
return %alloc : memref<512x640x20x16xf32>
}
func.func @main() {
%0 = memref.get_global @__constant_512x640x20x16xf32 : memref<512x640x20x16xf32>
%1 = call @forward(%0) : (memref<512x640x20x16xf32>) -> memref<512x640x20x16xf32>
return
}
}
When I try to apply `convert-vector-to-gpu` or `convert-vector-to-spirv` to 3.mlir above, there is no effect. I guess the key point is to lower `vector.transfer_read` and `vector.transfer_write`, but I don't know what they should be lowered to, or how to lower them.
I hope to finally get a module like the one below (note: the operations inside the kernel shown here are still scalar — they will additionally need to be vectorized):
// Target shape: host side allocates/copies via gpu.alloc + memref.copy and
// launches a kernel; device side is a gpu.module with a SPIR-V target env.
// (This dump uses a 512x640x20x15 shape, unlike the 20x16 modules above.)
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
module attributes {gpu.container_module, torch.debug_module_name = "ReLU"} {
memref.global "private" constant @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32> = dense<1.300000e+00>
func.func @forward(%arg0: memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32> {
%c20 = arith.constant 20 : index
%c640 = arith.constant 640 : index
%c512 = arith.constant 512 : index
%cst = arith.constant 0.000000e+00 : f32
%c15 = arith.constant 15 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
// Host-shared buffers: input is copied in, output copied back after launch.
%memref = gpu.alloc host_shared () : memref<512x640x20x15xf32>
memref.copy %arg0, %memref : memref<512x640x20x15xf32> to memref<512x640x20x15xf32>
%memref_0 = gpu.alloc host_shared () : memref<512x640x20x15xf32>
// One block per (d0, d1, d2) index; single thread per block; the innermost
// dimension (15) is looped over inside the kernel.
gpu.launch_func @forward_kernel::@forward_kernel blocks in (%c512, %c640, %c20) threads in (%c1, %c1, %c1) args(%memref : memref<512x640x20x15xf32>, %cst : f32, %memref_0 : memref<512x640x20x15xf32>, %c0 : index, %c15 : index, %c1 : index)
%alloc = memref.alloc() : memref<512x640x20x15xf32>
memref.copy %memref_0, %alloc : memref<512x640x20x15xf32> to memref<512x640x20x15xf32>
gpu.dealloc %memref_0 : memref<512x640x20x15xf32>
gpu.dealloc %memref : memref<512x640x20x15xf32>
return %alloc : memref<512x640x20x15xf32>
}
gpu.module @forward_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @forward_kernel(%arg0: memref<512x640x20x15xf32>, %arg1: f32, %arg2: memref<512x640x20x15xf32>, %arg3: index, %arg4: index, %arg5: index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 512, 640, 20>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
// Block ids map to the three outer loop indices.
%0 = gpu.block_id x
%1 = gpu.block_id y
%2 = gpu.block_id z
// Scalar ReLU loop over the innermost dimension (%arg3=0 to %arg4=15,
// step %arg5=1) — this is the part the question wants vectorized.
scf.for %arg6 = %arg3 to %arg4 step %arg5 {
%3 = memref.load %arg0[%0, %1, %2, %arg6] : memref<512x640x20x15xf32>
%4 = arith.cmpf ugt, %3, %arg1 : f32
%5 = arith.select %4, %3, %arg1 : f32
memref.store %5, %arg2[%0, %1, %2, %arg6] : memref<512x640x20x15xf32>
}
gpu.return
}
}
func.func @main() {
%0 = memref.get_global @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32>
%1 = call @forward(%0) : (memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32>
return
}
}