GPU Dialect SPMV

Hello,
I am trying to compute SpMV on the GPU with gpu.spmv. I followed the example in mlir/test/Dialect/GPU; gpu.spmv lowers to a call to mgpuSpMV, but that symbol cannot be found in libmlir_cuda_runtime.so or libcusparse.so. How can I find this symbol?
I built LLVM and MLIR as follows.
LLVM:

$ cmake -G Ninja ../llvm \
     -DLLVM_ENABLE_PROJECTS="mlir" \
     -DLLVM_BUILD_EXAMPLES=ON \
     -DLLVM_TARGETS_TO_BUILD="X86;NVPTX;AMDGPU" \
     -DCMAKE_BUILD_TYPE=Release \
     -DMLIR_ENABLE_CUDA_RUNNER=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON
$ ninja check-mlir

MLIR:

$ cmake -G Ninja .. \
    -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
    -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
    -DLLVM_ENABLE_ASSERTIONS=ON \
    -DCMAKE_BUILD_TYPE=RELEASE
$ ninja
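
This is how I searched for the symbol (a quick check of the exported dynamic symbols with nm; the library path assumes the build tree from the commands above):

$ nm -D build/lib/libmlir_cuda_runtime.so | grep -i spmv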

Thanks!

Thanks for trying out our recent GPU extensions for sparsity.
Note that you will also need to set:

MLIR_ENABLE_CUDA_CUSPARSE   : enables cuSPARSE
MLIR_ENABLE_CUDA_CUSPARSELT : enables cuSPARSElt (for 2:4)
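
For example, re-running the LLVM configuration above with the two options set to ON (a sketch based on the cmake invocation from the first post):

$ cmake -G Ninja ../llvm \
     -DLLVM_ENABLE_PROJECTS="mlir" \
     -DLLVM_BUILD_EXAMPLES=ON \
     -DLLVM_TARGETS_TO_BUILD="X86;NVPTX;AMDGPU" \
     -DCMAKE_BUILD_TYPE=Release \
     -DMLIR_ENABLE_CUDA_RUNNER=ON \
     -DMLIR_ENABLE_CUDA_CUSPARSE=ON \
     -DMLIR_ENABLE_CUDA_CUSPARSELT=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON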

Thanks for your solution!
When I configure with -DMLIR_ENABLE_CUDA_CUSPARSE=ON and -DMLIR_ENABLE_CUDA_CUSPARSELT=ON, the mgpuSpMV symbol can be found in libmlir_cuda_runtime.so :grinning:

Hi, when running gpu.spmv, I still have some problems:

  1. I can’t find a create_sparse_env op in the gpu dialect documentation or in GPUOps.td, so I declared mgpuCreateSparseEnv as an llvm.func and call it directly.
  2. The result does not seem right: running the following code prints [0, 0, 0, 0]. I am on the main branch of llvm-project (commit 79786c4d23f1fd7af438e4fd4e33ec109626bee4) with the following code:
module attributes {gpu.container_module} {
    func.func private @printMemrefF32(memref<*xf32>)
    llvm.func @mgpuCreateSparseEnv()

    func.func @main() {
        llvm.call @mgpuCreateSparseEnv(): () -> ()
        %c0 = arith.constant 0 : index
        %c1 = arith.constant 1 : index
        %c2 = arith.constant 2 : index
        %c3 = arith.constant 3 : index
        %c4 = arith.constant 4 : index
        %c5 = arith.constant 5 : index

        %i0 = arith.constant 0 : i32
        %i1 = arith.constant 1 : i32
        %i2 = arith.constant 2 : i32
        %i3 = arith.constant 3 : i32
        %i4 = arith.constant 4 : i32
        %i5 = arith.constant 5 : i32

        %f0 = arith.constant 0.0 : f32
        %f1 = arith.constant 1.0 : f32
        %f2 = arith.constant 2.0 : f32
        %f3 = arith.constant 3.0 : f32
        %f4 = arith.constant 4.0 : f32

        %mat_row = arith.constant 4 : index
        %mat_col = arith.constant 4 : index
        %mat_nnz = arith.constant 4 : index

        // %16 = gpu.create_sparse_env
        %token0 = gpu.wait async
        %h_offset = memref.alloc(%c5) : memref<?xi32>

        // h_offset = {0, 1, 2, 3, 4}
        memref.store %i0, %h_offset[%c0] : memref<?xi32>
        memref.store %i1, %h_offset[%c1] : memref<?xi32>
        memref.store %i2, %h_offset[%c2] : memref<?xi32>
        memref.store %i3, %h_offset[%c3] : memref<?xi32>
        memref.store %i4, %h_offset[%c4] : memref<?xi32>

        %d_offset, %token1 = gpu.alloc async [%token0] (%c5) : memref<?xi32>
        %token2 = gpu.memcpy async [%token1] %d_offset, %h_offset : memref<?xi32>, memref<?xi32>

        // h_indexs = {0, 1, 2, 3}
        %h_indexs = memref.alloc(%c4) : memref<?xi32>
        memref.store %i0, %h_indexs[%c0] : memref<?xi32>
        memref.store %i1, %h_indexs[%c1] : memref<?xi32>
        memref.store %i2, %h_indexs[%c2] : memref<?xi32>
        memref.store %i3, %h_indexs[%c3] : memref<?xi32>
        %d_indexs, %token3 = gpu.alloc async [%token0] (%c4) : memref<?xi32>
        %token4 = gpu.memcpy async [%token3] %d_indexs, %h_indexs : memref<?xi32>, memref<?xi32>

        // h_values is {1.0, 2.0, 3.0, 4.0}
        %h_values = memref.alloc(%c4) : memref<?xf32>
        memref.store %f1, %h_values[%c0] : memref<?xf32>
        memref.store %f2, %h_values[%c1] : memref<?xf32>
        memref.store %f3, %h_values[%c2] : memref<?xf32>
        memref.store %f4, %h_values[%c3] : memref<?xf32>
        %d_values, %token5 = gpu.alloc async [%token0] (%c4) : memref<?xf32>
        %token6 = gpu.memcpy async [%token5] %d_values, %h_values : memref<?xf32>, memref<?xf32> 

        // h_vector = {1.0, 2.0, 3.0, 4.0}
        %h_vector = memref.alloc(%c4) : memref<?xf32>
        memref.store %f1, %h_vector[%c0] : memref<?xf32>
        memref.store %f2, %h_vector[%c1] : memref<?xf32>
        memref.store %f3, %h_vector[%c2] : memref<?xf32>
        memref.store %f4, %h_vector[%c3] : memref<?xf32>
        %d_vector, %token7 = gpu.alloc async [%token0] (%c4) : memref<?xf32>
        %token8 = gpu.memcpy async [%token7] %d_vector, %h_vector : memref<?xf32>, memref<?xf32>

        // malloc h_result, d_result
        %h_result = memref.alloc(%c4) : memref<?xf32>
        memref.store %f0, %h_result[%c0] : memref<?xf32>
        memref.store %f0, %h_result[%c1] : memref<?xf32>
        memref.store %f0, %h_result[%c2] : memref<?xf32>
        memref.store %f0, %h_result[%c3] : memref<?xf32>
        %d_result, %token9 = gpu.alloc async [%token0] (%c4) : memref<?xf32>
        %token10 = gpu.memcpy async [%token9] %d_result, %h_result : memref<?xf32>, memref<?xf32>

        %token11 = gpu.wait async [%token2, %token4, %token6, %token8, %token10]

        // create csr 
        %spmat, %token_1 = gpu.create_csr async [%token11] %mat_row, %mat_col, %mat_nnz, %d_offset, %d_indexs, %d_values : memref<?xi32>, memref<?xi32>, memref<?xf32>

        %dninput, %token_2 = gpu.create_dn_tensor async [%token_1] %d_vector, %mat_row : index into memref<?xf32>

        %dnresult, %token_3 = gpu.create_dn_tensor async [%token_2] %d_result, %mat_row : index into memref<?xf32> 

        %bufferSz, %token_4 = gpu.spmv_buffer_size async [%token_3] %spmat, %dninput, %dninput into f32 // the dn_input here probably does not matter

        %d_buffer, %token_5 = gpu.alloc async [%token_4] (%bufferSz) : memref<?xf32>

        %token_6 = gpu.spmv async [%token_5] %spmat, %dninput, %dnresult, %d_buffer : memref<?xf32> into f32

        %fin_token = gpu.memcpy async [%token_6] %h_result, %d_result : memref<?xf32>, memref<?xf32>
        gpu.wait [%fin_token]

        %unrank_h_result = memref.cast %h_result : memref<?xf32> to memref<*xf32>
        func.call @printMemrefF32(%unrank_h_result) : (memref<*xf32>) -> ()
        return
    }    
}

I am trying to compute
{1, 0, 0, 0; 0, 2, 0, 0; 0, 0, 3, 0; 0, 0, 0, 4} x {1, 2, 3, 4}, so the expected result is {1, 4, 9, 16}, but I get {0, 0, 0, 0}.
I also tried using gpu.host_register to register h_offset, h_indexs, and h_values, and passing those host memrefs directly to create_csr, but that still produces a zero result.
Thanks for your help!

There are a few details missing, for example, with respect to the “breakdown” of the sparse data structures, which is required to get the data properly back from device to host. So, why not let a compiler deal with the hairy details, especially for sparsity (which is why I have been a fierce proponent of sparse compilers since 1996 :wink:).

Let’s start with the dense MV and annotate the matrix as sparse, in CSR form:

#CSR = #sparse_tensor.encoding<{
  lvlTypes = [ "dense", "compressed" ],
  posWidth = 32,
  crdWidth = 32
}>

func.func @matvecCSR(%A: tensor<?x?xf64, #CSR>,
                     %x: tensor<?xf64>,
                     %y_in: tensor<?xf64>) -> tensor<?xf64> {
    %y_out = linalg.matvec
      ins(%A, %x: tensor<?x?xf64, #CSR>, tensor<?xf64>)
      outs(%y_in: tensor<?xf64>) -> tensor<?xf64>
    return %y_out : tensor<?xf64>
}

Then we invoke the sparsifier pipeline of MLIR with GPU acceleration enabled (by default, codegen targets the CPU only):

mlir-opt --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" spmv.mlir
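
To run the result end to end, the output can be piped into mlir-cpu-runner the way the GPU integration tests do (a sketch: it assumes a main entry point that sets up the input tensors, and the shared-library paths depend on your build tree):

$ mlir-opt --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" spmv.mlir | \
  mlir-cpu-runner --entry-point-result=void \
    --shared-libs=build/lib/libmlir_cuda_runtime.so,build/lib/libmlir_c_runner_utils.so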

And, voilà, we get the proper sequence for SpMV (showing the IR right after the GPU part of the sparsifier, since the stages further downstream introduce a lot of implementation details that are harder to read):

  func.func @matvecCSR(%arg0: tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>, %arg1: tensor<?xf64>, %arg2: tensor<?xf64>) -> tensor<?xf64> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %0 = sparse_tensor.number_of_entries %arg0 : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>
    %dim_0 = tensor.dim %arg0, %c1 : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>>
    %1 = sparse_tensor.positions %arg0 {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>> to memref<?xi32>
    %2 = sparse_tensor.coordinates %arg0 {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>> to memref<?xi32>
    %3 = sparse_tensor.values %arg0 : tensor<?x?xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], posWidth = 32, crdWidth = 32 }>> to memref<?xf64> 
    %4 = gpu.wait async
    %dim_1 = memref.dim %1, %c0 : memref<?xi32>
    %memref, %asyncToken = gpu.alloc async [%4] (%dim_1) : memref<?xi32>
    %5 = gpu.memcpy async [%asyncToken] %memref, %1 : memref<?xi32>, memref<?xi32>
    %6 = gpu.wait async
    %dim_2 = memref.dim %2, %c0 : memref<?xi32>
    %memref_3, %asyncToken_4 = gpu.alloc async [%6] (%dim_2) : memref<?xi32>
    %7 = gpu.memcpy async [%asyncToken_4] %memref_3, %2 : memref<?xi32>, memref<?xi32>
    %8 = gpu.wait async
    %dim_5 = memref.dim %3, %c0 : memref<?xf64>
    %memref_6, %asyncToken_7 = gpu.alloc async [%8] (%dim_5) : memref<?xf64>
    %9 = gpu.memcpy async [%asyncToken_7] %memref_6, %3 : memref<?xf64>, memref<?xf64>
    %10 = bufferization.to_memref %arg1 : memref<?xf64>
    %11 = gpu.wait async
    %dim_8 = memref.dim %10, %c0 : memref<?xf64>
    %memref_9, %asyncToken_10 = gpu.alloc async [%11] (%dim_8) : memref<?xf64>
    %12 = gpu.memcpy async [%asyncToken_10] %memref_9, %10 : memref<?xf64>, memref<?xf64>
    %13 = bufferization.to_memref %arg2 : memref<?xf64>
    %14 = gpu.wait async
    %dim_11 = memref.dim %13, %c0 : memref<?xf64>
    %memref_12, %asyncToken_13 = gpu.alloc async [%14] (%dim_11) : memref<?xf64>
    %15 = gpu.memcpy async [%asyncToken_13] %memref_12, %13 : memref<?xf64>, memref<?xf64>
    gpu.wait [%5, %7, %9, %12, %15]
    %16 = gpu.wait async
    %spmat, %asyncToken_14 = gpu.create_csr async [%16] %dim, %dim_0, %0, %memref, %memref_3, %memref_6 : memref<?xi32>, memref<?xi32>, memref<?xf64>
    %dnTensor, %asyncToken_15 = gpu.create_dn_tensor async [%asyncToken_14] %memref_9, %dim_0 : index into memref<?xf64>
    %dnTensor_16, %asyncToken_17 = gpu.create_dn_tensor async [%asyncToken_15] %memref_12, %dim : index into memref<?xf64>
    %bufferSz, %asyncToken_18 = gpu.spmv_buffer_size async [%asyncToken_17] %spmat, %dnTensor, %dnTensor_16 into f64
    %memref_19, %asyncToken_20 = gpu.alloc async [%asyncToken_18] (%bufferSz) : memref<?xi8>
    %17 = gpu.spmv async [%asyncToken_20] %spmat, %dnTensor, %dnTensor_16, %memref_19 : memref<?xi8> into f64
    %18 = gpu.destroy_sp_mat async [%17] %spmat
    %19 = gpu.destroy_dn_tensor async [%18] %dnTensor
    %20 = gpu.destroy_dn_tensor async [%19] %dnTensor_16
    %21 = gpu.dealloc async [%20] %memref : memref<?xi32>
    %22 = gpu.dealloc async [%21] %memref_3 : memref<?xi32>
    %23 = gpu.dealloc async [%22] %memref_6 : memref<?xf64>
    %24 = gpu.dealloc async [%23] %memref_19 : memref<?xi8>
    %25 = gpu.dealloc async [%24] %memref_9 : memref<?xf64>
    %26 = gpu.memcpy async [%25] %13, %memref_12 : memref<?xf64>, memref<?xf64>
    %27 = gpu.dealloc async [%26] %memref_12 : memref<?xf64>
    gpu.wait [%27]
    %28 = bufferization.to_tensor %13 : memref<?xf64>
    return %28 : tensor<?xf64>
  }

Note in particular the gpu.memcpy near the end, which copies the result back from device to host before it is converted back to a tensor; that is the “breakdown” detail mentioned above. I hope this helps!


Thanks for your explanation and example, that’s very helpful! :pray:
