I found CUDA error after running a very simple vector add (8 elements) program on NVIDIA GPU through greedy parallelizing and converting to GPU dialect.
I’m not very familiar with what exactly the error wants to tell, could someone help this?
Program
func.func @add(%a: memref<?xf32>, %b: memref<?xf32>, %c: memref<?xf32>) {
affine.for %i = 0 to 8 {
%0 = affine.load %a[%i] : memref<?xf32>
%1 = affine.load %b[%i] : memref<?xf32>
%2 = arith.addf %0, %1 : f32
affine.store %2, %c[%i] : memref<?xf32>
}
return
}
func.func @main() {
%a = memref.alloc() : memref<8xf32>
%b = memref.alloc() : memref<8xf32>
%c = memref.alloc() : memref<8xf32>
%cst0 = arith.constant 0.0 : f32
%cst1 = arith.constant 1.0 : f32
%cst2 = arith.constant 2.0 : f32
%cst3 = arith.constant 3.0 : f32
%cst4 = arith.constant 4.0 : f32
%cst5 = arith.constant 5.0 : f32
%cst6 = arith.constant 6.0 : f32
%cst7 = arith.constant 7.0 : f32
%cst8 = arith.constant 8.0 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c6 = arith.constant 6 : index
%c7 = arith.constant 7 : index
%c8 = arith.constant 8 : index
memref.store %cst0, %a[%c0] : memref<8xf32>
memref.store %cst1, %a[%c1] : memref<8xf32>
memref.store %cst2, %a[%c2] : memref<8xf32>
memref.store %cst3, %a[%c3] : memref<8xf32>
memref.store %cst4, %a[%c4] : memref<8xf32>
memref.store %cst5, %a[%c5] : memref<8xf32>
memref.store %cst6, %a[%c6] : memref<8xf32>
memref.store %cst7, %a[%c7] : memref<8xf32>
memref.store %cst8, %a[%c8] : memref<8xf32>
memref.store %cst0, %b[%c0] : memref<8xf32>
memref.store %cst1, %b[%c1] : memref<8xf32>
memref.store %cst2, %b[%c2] : memref<8xf32>
memref.store %cst3, %b[%c3] : memref<8xf32>
memref.store %cst4, %b[%c4] : memref<8xf32>
memref.store %cst5, %b[%c5] : memref<8xf32>
memref.store %cst6, %b[%c6] : memref<8xf32>
memref.store %cst7, %b[%c7] : memref<8xf32>
memref.store %cst8, %b[%c8] : memref<8xf32>
%cast_a = memref.cast %a : memref<8xf32> to memref<?xf32>
%cast_b = memref.cast %b : memref<8xf32> to memref<?xf32>
%cast_c = memref.cast %c : memref<8xf32> to memref<?xf32>
call @add(%cast_a, %cast_b, %cast_c) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
return
}
func.func private @printMemrefF32(%ptr : memref<*xf32>)
Command
$BUILD_DIR/bin/mlir-opt add.mlir \
--convert-linalg-to-parallel-loops \
--affine-parallelize \
--lower-affine \
--canonicalize \
--test-gpu-greedy-parallel-loop-mapping \
--convert-parallel-loops-to-gpu \
--lower-affine \
--convert-scf-to-cf \
--split-input-file \
--verify-diagnostics \
--gpu-kernel-outlining \
--pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin)' \
--gpu-to-llvm | \
$BUILD_DIR/bin/mlir-cpu-runner \
--shared-libs=$BUILD_DIR/lib/libmlir_cuda_runtime.so \
--shared-libs=$BUILD_DIR/lib/libmlir_runner_utils.so \
--shared-libs=$BUILD_DIR/lib/libmlir_c_runner_utils.so \
--entry-point-result=void
Error Message
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
Unranked Memref base@ = 0x555fa5e030f0 rank = 1 offset = 0 sizes = [8] strides = [1] data =
[-3.63856e-16, 3.06254e-41, 0, 0, 1.76939e+25, 1.69308e+22, 6.84356e+25, 1.12704e+21]