Building with CUDA support, not sure if working

So I’m trying to build llvm-project from scratch at this commit (there is no particular reason for choosing it; it was simply HEAD when I last synced the repo).

I build it with these options

cmake -G Ninja \
	-DLLVM_ENABLE_PROJECTS="mlir" \
	-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
	-DMLIR_ENABLE_CUDA_RUNNER=ON \
	-DCMAKE_BUILD_TYPE=Release \
	-DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc \
	-DCMAKE_CUDA_ARCHITECTURES=86 \
	-DLLVM_ENABLE_ASSERTIONS=ON \
	-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
	-DPython3_EXECUTABLE=$(which python) \
	../llvm

And now I want to see if things are actually working.

I start from this simple "matmul" MLIR module:

// Container module holding one GPU kernel module (@kernels) and the host
// function (@matmul) that launches it.
// NOTE(review): despite the "matmul" naming, no multiplication is performed
// anywhere in this module; the kernel only copies elements from %a to %c.
module attributes {gpu.container_module} {
  // device code
  gpu.module @kernels {
    // Kernel entry point taking three 1024x1024 f32 memrefs.
    // %b is accepted but never read or written in the body below.
    // gpu.known_block_size declares a 32x32x1 thread block, which matches the
    // `threads in` operands of the launch in @matmul.
    gpu.func @matmul_kernel(%a: memref<1024x1024xf32>,
                           %b: memref<1024x1024xf32>,
                           %c: memref<1024x1024xf32>)
        kernel attributes {gpu.known_block_size = array<i32: 32, 32, 1>} {
      // get the thread indices
      // NOTE(review): only thread ids are used (no gpu.block_id), so every
      // block indexes the same [tx, ty] positions — confirm this is intended.
      %tx = gpu.thread_id x
      %ty = gpu.thread_id y

      // load a[tx,ty] and store to c[tx,ty]
      %val = memref.load %a[%tx, %ty] : memref<1024x1024xf32>
      memref.store %val, %c[%tx, %ty] : memref<1024x1024xf32>
      gpu.return
    }
  }

  // host code
  // Launches @kernels::@matmul_kernel on a 32x32x1 grid of 32x32x1 thread
  // blocks, forwarding %a, %b and %c unchanged, then returns %c.
  func.func @matmul(%a: memref<1024x1024xf32>,
                    %b: memref<1024x1024xf32>,
                    %c: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
    // Index constants used as the grid and block dimensions.
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index

    // No async token is requested on this launch.
    gpu.launch_func @kernels::@matmul_kernel
      blocks in (%c32, %c32, %c1)
      threads in (%c32, %c32, %c1)
      args(%a : memref<1024x1024xf32>,
           %b : memref<1024x1024xf32>,
           %c : memref<1024x1024xf32>)

    return %c : memref<1024x1024xf32>
  }
}

Lowered through this pipeline:

mlir-opt matmul_gpu.mlir \
	--gpu-lower-to-nvvm-pipeline="cubin-chip=sm_86 cubin-features=+ptx70" \
	--llvm-legalize-for-export \
	-o lowered_gpu.mlir

Yields:

module attributes {gpu.container_module} {
  gpu.binary @kernels  [#gpu.object<#nvvm.target<chip = "sm_86", features = "+ptx70">, "P\EDU\BA\01\00\10\00\00\0E\00\00\00\00\00\00\02\00\01\01@\00\00\00\A8\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00V\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00y\00\00\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\00\00\00\80\08\00\00\00\00\00\00V\05V\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00.text.matmul_kernel\00.nv.info.matmul_kernel\00.nv.shared.matmul_kernel\00.nv.constant0.matmul_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00matmul_kernel\00.text.matmul_kernel\00.nv.info.matmul_kernel\00.nv.shared.matmul_kernel\00.nv.constant0.matmul_kernel\00_param\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00N\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\92\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B5\00\00\00\03\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\04|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\04\00\00\00\04,\00\00\00\0C\81\80\80(\00\04\FC\FF\FF?\00\00\00\00\00\00\00\04/\08\00\04\00\00\00\08\00\00\00\04#\08\00\04\00\00\00\00\00\00\00\04\12\08\00\04\00\00\00\00\00\00\00\04\11\08\00\04\00\00\00\00\00\00\00\047\04\00y\00\00\00\015\00\00\04\0A\08\00\02\00\00\00`\01\A8\00\03\19\A8\00\04\17\0C\00\00\00\00\00\14\00\A0\00\00\F0!\00\04\17\0C\00\00\00\00\00\13\00\98\00\00\F0!\00\04\17\0C\00\00\00\0
0\00\12\00\90\00\00\F0!\00\04\17\0C\00\00\00\00\00\11\00\88\00\00\F0!\00\04\17\0C\00\00\00\00\00\10\00\80\00\00\F0!\00\04\17\0C\00\00\00\00\00\0F\00x\00\00\F0!\00\04\17\0C\00\00\00\00\00\0E\00p\00\00\F0!\00\04\17\0C\00\00\00\00\00\0D\00h\00\00\F0!\00\04\17\0C\00\00\00\00\00\0C\00`z\01\00\00\0A\00\00\00\0F\00\00\00\C4\0F\00\19y\00\00\00\00\00\00\00!\00\00\00\22\0E\00\02x\05\00\04\00\00\00\00\0F\00\00\00\E2\0F\00\B9z\04\00\00F\00\00\00\0A\00\00\00\E4\0F\00\19y\03\00\00\00\00\00\00\22\00\00\00b\0E\00\19x\00\00\0A\00\00\00\FF\06\00\00\00\C8\1F\00\12r\00\00\03\00\00\00\FF\FC\8E\07\00\CA/\00%v\02\00\00Z\00\00\05\00\8E\07\00\CC\0F\00\81y\03\02\04\00\00\00\00\19\1E\0C\00\A2\0E\00%v\04\00\00v\00\00\05\00\8E\07\00\CA\0F\00\86y\00\04\03\00\00\00\04\19\10\0C\00\E2O\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\00\00\F0\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18yx\00\00\00\00\00\00\00\02\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\A0\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00p\02\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\007\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E0\02\00\00\00\00\00\000\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00T\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\10\03\00\00\00\00\00\00xversion 7.1\0A.target sm_86\0A.address_size 64/\00\FA\19isible .entry matmul_kernel(\0A.param 
.u64\1B\00\11_\19\00?_0,#\00\0E\1F1#\00\0F\1F2#\00\0F\1F3#\00\0F\1F4#\00\0F\1F5#\00\0F\1F6#\00\0F\1F7#\00\0F\1F8#\00\0F\1F9#\00\0F\1F1_\01\11\0F`\01\10\1F1a\01\10\1F1b\01\10\1F1c\01\10\1F1d\01\10\1F1e\01\10\1F1f\01\10\1F1g\01\10\1F1h\01\10\F3\0820\0A)\0A{\0A.reg .b32 %r<5>;\11\00\10f\11\005f<2\11\00\F2\00b64 %rd<8>;\0A\0Ald^\00\22.u\16\00O1, [\88\00\02\F4\035];\0Acvta.to.global3\00!2,9\00\1D;L\00\1F3L\00\05\0FK\00\06\114K\00\813;\0Amov.u\CA\00\981, %tid.x\15\00\132\15\00cy;\0Ashl\F4\00\223,/\00c10;\0Aor\15\00$4,\1A\00\B32;\0Amul.wideF\002d5,\22\00\824;\0Aadd.s\85\00&6,\8B\00\115\D6\00\03\A5\00\02J\01\00\22\01\00&\00\19]5\00&7,\0B\01X5;\0Ast5\00\000\0007],=\00\B0;\0Aret;\0A\0A}\0A\00\00\00\00\00\00\00">]
  llvm.func @matmul(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64) -> !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> {
    %0 = llvm.mlir.constant(32 : index) : i64
    %1 = llvm.mlir.constant(1 : index) : i64
    %2 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %3 = llvm.insertvalue %arg14, %2[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %4 = llvm.insertvalue %arg15, %3[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %5 = llvm.insertvalue %arg16, %4[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %6 = llvm.insertvalue %arg17, %5[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %7 = llvm.insertvalue %arg19, %6[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %8 = llvm.insertvalue %arg18, %7[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    %9 = llvm.insertvalue %arg20, %8[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 
    gpu.launch_func  @kernels::@matmul_kernel blocks in (%0, %0, %1) threads in (%0, %0, %1) : i64 args(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr, %arg2 : i64, %arg3 : i64, %arg4 : i64, %arg5 : i64, %arg6 : i64, %arg7 : !llvm.ptr, %arg8 : !llvm.ptr, %arg9 : i64, %arg10 : i64, %arg11 : i64, %arg12 : i64, %arg13 : i64, %arg14 : !llvm.ptr, %arg15 : !llvm.ptr, %arg16 : i64, %arg17 : i64, %arg18 : i64, %arg19 : i64, %arg20 : i64)
    llvm.return %9 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  }
}

A couple of questions at this stage:

  1. If I have understood this correctly, the embedded binary here should be a valid fatbin, right? Could I theoretically just extract that blob, load it in a CUDA module, and execute it? Or what exactly is it?

  2. If not, could I somehow generate a valid ptx/cubin/fatbin from the pipeline, or would I have to first convert it into llvm-ir and compile it to an object file and so on?

  3. Continuing from question 2, I translated the lowered module to LLVM IR with this:

mlir-translate --mlir-to-llvmir lowered_gpu.mlir -o gpu_program.ll

Gets me:

; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

%0 = type { ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64 }

@kernels_bin_cst = internal constant [3600 x i8] c"P\EDU\BA\01\00\10\00\00\0E\00\00\00\00\00\00\02\00\01\01@\00\00\00\A8\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00V\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00y\00\00\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\00\00\00\80\08\00\00\00\00\00\00V\05V\00@\008\00\03\00@\00\0A\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00.text.matmul_kernel\00.nv.info.matmul_kernel\00.nv.shared.matmul_kernel\00.nv.constant0.matmul_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00matmul_kernel\00.text.matmul_kernel\00.nv.info.matmul_kernel\00.nv.shared.matmul_kernel\00.nv.constant0.matmul_kernel\00_param\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00N\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\92\00\00\00\03\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B5\00\00\00\03\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\04|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\04\00\00\00\04,\00\00\00\0C\81\80\80(\00\04\FC\FF\FF?\00\00\00\00\00\00\00\04/\08\00\04\00\00\00\08\00\00\00\04#\08\00\04\00\00\00\00\00\00\00\04\12\08\00\04\00\00\00\00\00\00\00\04\11\08\00\04\00\00\00\00\00\00\00\047\04\00y\00\00\00\015\00\00\04\0A\08\00\02\00\00\00`\01\A8\00\03\19\A8\00\04\17\0C\00\00\00\00\00\14\00\A0\00\00\F0!\00\04\17\0C\00\00\00\00\00\13\00\98\00\00\F0!\00\04\17\0C\00\00\00\00\00\12\00\90\00\00\F0!\00\04\17\0C\0
0\00\00\00\00\11\00\88\00\00\F0!\00\04\17\0C\00\00\00\00\00\10\00\80\00\00\F0!\00\04\17\0C\00\00\00\00\00\0F\00x\00\00\F0!\00\04\17\0C\00\00\00\00\00\0E\00p\00\00\F0!\00\04\17\0C\00\00\00\00\00\0D\00h\00\00\F0!\00\04\17\0C\00\00\00\00\00\0C\00`z\01\00\00\0A\00\00\00\0F\00\00\00\C4\0F\00\19y\00\00\00\00\00\00\00!\00\00\00\22\0E\00\02x\05\00\04\00\00\00\00\0F\00\00\00\E2\0F\00\B9z\04\00\00F\00\00\00\0A\00\00\00\E4\0F\00\19y\03\00\00\00\00\00\00\22\00\00\00b\0E\00\19x\00\00\0A\00\00\00\FF\06\00\00\00\C8\1F\00\12r\00\00\03\00\00\00\FF\FC\8E\07\00\CA/\00%v\02\00\00Z\00\00\05\00\8E\07\00\CC\0F\00\81y\03\02\04\00\00\00\00\19\1E\0C\00\A2\0E\00%v\04\00\00v\00\00\05\00\8E\07\00\CA\0F\00\86y\00\04\03\00\00\00\04\19\10\0C\00\E2O\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\00\00\F0\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18yx\00\00\00\00\00\00\00\02\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\A0\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00p\02\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\007\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E0\02\00\00\00\00\00\000\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00T\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\10\03\00\00\00\00\00\00xversion 7.1\0A.target sm_86\0A.address_size 64/\00\FA\19isible .entry matmul_kernel(\0A.param 
.u64\1B\00\11_\19\00?_0,#\00\0E\1F1#\00\0F\1F2#\00\0F\1F3#\00\0F\1F4#\00\0F\1F5#\00\0F\1F6#\00\0F\1F7#\00\0F\1F8#\00\0F\1F9#\00\0F\1F1_\01\11\0F`\01\10\1F1a\01\10\1F1b\01\10\1F1c\01\10\1F1d\01\10\1F1e\01\10\1F1f\01\10\1F1g\01\10\1F1h\01\10\F3\0820\0A)\0A{\0A.reg .b32 %r<5>;\11\00\10f\11\005f<2\11\00\F2\00b64 %rd<8>;\0A\0Ald^\00\22.u\16\00O1, [\88\00\02\F4\035];\0Acvta.to.global3\00!2,9\00\1D;L\00\1F3L\00\05\0FK\00\06\114K\00\813;\0Amov.u\CA\00\981, %tid.x\15\00\132\15\00cy;\0Ashl\F4\00\223,/\00c10;\0Aor\15\00$4,\1A\00\B32;\0Amul.wideF\002d5,\22\00\824;\0Aadd.s\85\00&6,\8B\00\115\D6\00\03\A5\00\02J\01\00\22\01\00&\00\19]5\00&7,\0B\01X5;\0Ast5\00\000\0007],=\00\B0;\0Aret;\0A\0A}\0A\00\00\00\00\00\00\00", align 8
@kernels_matmul_kernel_kernel_name = private unnamed_addr constant [14 x i8] c"matmul_kernel\00", align 1

define { ptr, ptr, i64, [2 x i64], [2 x i64] } @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) {
  %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } undef, ptr %14, 0
  %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1
  %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2
  %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0
  %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0
  %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1
  %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1
  %29 = alloca %0, align 8
  %30 = alloca ptr, i64 21, align 8
  %31 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 0
  store ptr %0, ptr %31, align 8
  %32 = getelementptr ptr, ptr %30, i32 0
  store ptr %31, ptr %32, align 8
  %33 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 1
  store ptr %1, ptr %33, align 8
  %34 = getelementptr ptr, ptr %30, i32 1
  store ptr %33, ptr %34, align 8
  %35 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 2
  store i64 %2, ptr %35, align 4
  %36 = getelementptr ptr, ptr %30, i32 2
  store ptr %35, ptr %36, align 8
  %37 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 3
  store i64 %3, ptr %37, align 4
  %38 = getelementptr ptr, ptr %30, i32 3
  store ptr %37, ptr %38, align 8
  %39 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 4
  store i64 %4, ptr %39, align 4
  %40 = getelementptr ptr, ptr %30, i32 4
  store ptr %39, ptr %40, align 8
  %41 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 5
  store i64 %5, ptr %41, align 4
  %42 = getelementptr ptr, ptr %30, i32 5
  store ptr %41, ptr %42, align 8
  %43 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 6
  store i64 %6, ptr %43, align 4
  %44 = getelementptr ptr, ptr %30, i32 6
  store ptr %43, ptr %44, align 8
  %45 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 7
  store ptr %7, ptr %45, align 8
  %46 = getelementptr ptr, ptr %30, i32 7
  store ptr %45, ptr %46, align 8
  %47 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 8
  store ptr %8, ptr %47, align 8
  %48 = getelementptr ptr, ptr %30, i32 8
  store ptr %47, ptr %48, align 8
  %49 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 9
  store i64 %9, ptr %49, align 4
  %50 = getelementptr ptr, ptr %30, i32 9
  store ptr %49, ptr %50, align 8
  %51 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 10
  store i64 %10, ptr %51, align 4
  %52 = getelementptr ptr, ptr %30, i32 10
  store ptr %51, ptr %52, align 8
  %53 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 11
  store i64 %11, ptr %53, align 4
  %54 = getelementptr ptr, ptr %30, i32 11
  store ptr %53, ptr %54, align 8
  %55 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 12
  store i64 %12, ptr %55, align 4
  %56 = getelementptr ptr, ptr %30, i32 12
  store ptr %55, ptr %56, align 8
  %57 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 13
  store i64 %13, ptr %57, align 4
  %58 = getelementptr ptr, ptr %30, i32 13
  store ptr %57, ptr %58, align 8
  %59 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 14
  store ptr %14, ptr %59, align 8
  %60 = getelementptr ptr, ptr %30, i32 14
  store ptr %59, ptr %60, align 8
  %61 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 15
  store ptr %15, ptr %61, align 8
  %62 = getelementptr ptr, ptr %30, i32 15
  store ptr %61, ptr %62, align 8
  %63 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 16
  store i64 %16, ptr %63, align 4
  %64 = getelementptr ptr, ptr %30, i32 16
  store ptr %63, ptr %64, align 8
  %65 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 17
  store i64 %17, ptr %65, align 4
  %66 = getelementptr ptr, ptr %30, i32 17
  store ptr %65, ptr %66, align 8
  %67 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 18
  store i64 %18, ptr %67, align 4
  %68 = getelementptr ptr, ptr %30, i32 18
  store ptr %67, ptr %68, align 8
  %69 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 19
  store i64 %19, ptr %69, align 4
  %70 = getelementptr ptr, ptr %30, i32 19
  store ptr %69, ptr %70, align 8
  %71 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 20
  store i64 %20, ptr %71, align 4
  %72 = getelementptr ptr, ptr %30, i32 20
  store ptr %71, ptr %72, align 8
  %73 = call ptr @mgpuModuleLoad(ptr @kernels_bin_cst, i64 3600)
  %74 = call ptr @mgpuModuleGetFunction(ptr %73, ptr @kernels_matmul_kernel_kernel_name)
  %75 = call ptr @mgpuStreamCreate()
  call void @mgpuLaunchKernel(ptr %74, i64 32, i64 32, i64 1, i64 32, i64 32, i64 1, i32 0, ptr %75, ptr %30, ptr null, i64 21)
  call void @mgpuStreamSynchronize(ptr %75)
  call void @mgpuStreamDestroy(ptr %75)
  call void @mgpuModuleUnload(ptr %73)
  ret { ptr, ptr, i64, [2 x i64], [2 x i64] } %28
}

declare ptr @mgpuModuleLoad(ptr, i64)

declare ptr @mgpuModuleGetFunction(ptr, ptr)

declare ptr @mgpuStreamCreate()

declare void @mgpuLaunchKernel(ptr, i64, i64, i64, i64, i64, i64, i32, ptr, ptr, ptr, i64)

declare void @mgpuStreamSynchronize(ptr)

declare void @mgpuStreamDestroy(ptr)

declare void @mgpuModuleUnload(ptr)

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

And now when I try to compile it with this:

llc -opaque-pointers gpu_program.ll -o gpu_program.o

I get this error:

llc: error: llc: gpu_program.ll:19:32: error: expected type
  %31 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 0

Am I misunderstanding something here, or is this a bug? Am I missing something fundamental in my train of thought?

Any and all help is very appreciated!!

I suspect your llc is “too old” and therefore lacks support for the nuw flag on GEPs.
It expects
%31 = getelementptr inbounds %0, ptr %29, i32 0, i32 0
not
%31 = getelementptr inbounds nuw %0, ptr %29, i32 0, i32 0
and therefore complains.

Update llc, or build it as part of your LLVM build.

1 Like

Ah yes, now that you say it - it seems quite obvious. Thanks!

Ubuntu LLVM version 14.0.0

  Optimized build.
  Default target: x86_64-pc-linux-gnu
  Host CPU: goldmont
  ...

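How about the questions from my original post (whether the embedded blob is a valid fatbin that I could load directly, and how to get a PTX/cubin/fatbin out of the pipeline)?

Would you have any idea here?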

Thanks again!

IR → [llc] → ptx → [ptxas] → cubin → …
Assuming it’s self-contained, you can already load the PTX or cubin on the device.
For the offload JIT, we load the cubin (here is how it’s generated: link; here is the actual load: link).

1 Like