```
backend = LLVMJITBackend([CUDA_RUNTIME_LIB_PATH])
# this doesn't actually …anything (no pipeline) but does generate C API/wrappers
compiled_module = backend.compile(
find_ops(
mod.operation,
lambda x: "transform.target_tag" in x.attributes
and x.attributes["transform.target_tag"].value == "payload",
single=True,
),
Pipeline().add_pass(
"gpu-lower-to-nvvm-pipeline",
**{
"cubin-chip": "sm_80",
"cubin-features": "+ptx76",
"cubin-format": "fatbin",
},
),
)
print(compiled_module)
```
when I run it I get:
```
{
"name": "MlirCompilerError",
"message": "Lowering IR failed with the following diagnostics:
********************************************************************************
Failure while executing pass pipeline:
error: unknown: `ptxas` invocation failed. Log:
ptxas /tmp/mlir-main_kernel-nvptx64-nvidia-cuda-sm_80-12fe81.ptx, line 5; fatal : Unsupported .version 7.6; current version is '7.5'
ptxas fatal : Ptx assembly aborted due to errors
error: unknown: An error happened while serializing the module.
note: unknown: see current operation:
\"gpu.module\"() <{targets = [#nvvm.target<chip = \"sm_80\", features = \"+ptx76\">]}> ({
\"llvm.func\"() <{CConv = #llvm.cconv<ccc>, function_type = !llvm.func<void (ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64)>, linkage = #llvm.linkage<external>, sym_name = \"main_kernel\", visibility_ = 0 : i64}> ({
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64):
%0 = \"llvm.mlir.constant\"() <{value = 1 : i64}> : () -> i64
%1 = \"llvm.mlir.constant\"() <{value = 0 : i64}> : () -> i64
%2 = \"llvm.mlir.constant\"() <{value = 0 : i32}> : () -> i32
%3 = \"llvm.mlir.constant\"() <{value = 16 : index}> : () -> i64
%4 = \"llvm.mlir.constant\"() <{value = 4 : index}> : () -> i64
%5 = \"llvm.mlir.constant\"() <{value = 0 : index}> : () -> i64
%6 = \"llvm.mlir.constant\"() <{value = -1 : index}> : () -> i64
%7 = \"llvm.mlir.constant\"() <{value = 2 : index}> : () -> i64
%8 = \"llvm.mlir.constant\"() <{value = -8 : index}> : () -> i64
%9 = \"llvm.mlir.constant\"() <{value = 1 : index}> : () -> i64
%10 = \"llvm.mlir.constant\"() <{value = 8 : index}> : () -> i64
%11 = \"llvm.mlir.constant\"() <{value = 9 : index}> : () -> i64
\"llvm.br\"()[^bb1] : () -> ()
^bb1: // pred: ^bb0
%12 = \"nvvm.read.ptx.sreg.tid.x\"() : () -> i32
%13 = \"llvm.sext\"(%12) : (i32) -> i64
%14 = \"llvm.icmp\"(%13, %5) <{predicate = 2 : i64}> : (i64, i64) -> i1
%15 = \"llvm.sub\"(%6, %13) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%16 = \"llvm.select\"(%14, %15, %13) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%17 = \"llvm.sdiv\"(%16, %4) : (i64, i64) -> i64
%18 = \"llvm.sub\"(%6, %17) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%19 = \"llvm.select\"(%14, %18, %17) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%20 = \"llvm.mul\"(%13, %7) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%21 = \"llvm.mul\"(%19, %8) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%22 = \"llvm.add\"(%20, %21) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%23 = \"llvm.mul\"(%19, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%24 = \"llvm.add\"(%23, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%25 = \"llvm.getelementptr\"(%arg1, %24) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%26 = \"llvm.load\"(%25) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%27 = \"llvm.add\"(%22, %9) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%28 = \"llvm.add\"(%23, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%29 = \"llvm.getelementptr\"(%arg1, %28) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%30 = \"llvm.load\"(%29) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%31 = \"llvm.add\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%32 = \"llvm.mul\"(%31, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%33 = \"llvm.add\"(%32, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%34 = \"llvm.getelementptr\"(%arg1, %33) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%35 = \"llvm.load\"(%34) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%36 = \"llvm.add\"(%32, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%37 = \"llvm.getelementptr\"(%arg1, %36) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%38 = \"llvm.load\"(%37) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%39 = \"llvm.add\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%40 = \"llvm.add\"(%23, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%41 = \"llvm.getelementptr\"(%arg1, %40) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%42 = \"llvm.load\"(%41) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%43 = \"llvm.add\"(%22, %11) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%44 = \"llvm.add\"(%23, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%45 = \"llvm.getelementptr\"(%arg1, %44) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%46 = \"llvm.load\"(%45) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%47 = \"llvm.add\"(%32, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%48 = \"llvm.getelementptr\"(%arg1, %47) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%49 = \"llvm.load\"(%48) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%50 = \"llvm.add\"(%32, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%51 = \"llvm.getelementptr\"(%arg1, %50) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%52 = \"llvm.load\"(%51) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%53 = \"llvm.mlir.undef\"() : () -> vector<2xf16>
%54 = \"llvm.insertelement\"(%53, %26, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%55 = \"llvm.shufflevector\"(%54, %54) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%56 = \"llvm.insertelement\"(%55, %26, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%57 = \"llvm.insertelement\"(%56, %30, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%58 = \"llvm.insertelement\"(%55, %35, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%59 = \"llvm.insertelement\"(%58, %38, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%60 = \"llvm.insertelement\"(%55, %42, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%61 = \"llvm.insertelement\"(%60, %46, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%62 = \"llvm.insertelement\"(%55, %49, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%63 = \"llvm.insertelement\"(%62, %52, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%64 = \"llvm.mul\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%65 = \"llvm.add\"(%64, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%66 = \"llvm.getelementptr\"(%arg8, %65) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%67 = \"llvm.load\"(%66) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%68 = \"llvm.mul\"(%27, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%69 = \"llvm.add\"(%68, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%70 = \"llvm.getelementptr\"(%arg8, %69) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%71 = \"llvm.load\"(%70) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%72 = \"llvm.mul\"(%39, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%73 = \"llvm.add\"(%72, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%74 = \"llvm.getelementptr\"(%arg8, %73) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%75 = \"llvm.load\"(%74) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%76 = \"llvm.mul\"(%43, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%77 = \"llvm.add\"(%76, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%78 = \"llvm.getelementptr\"(%arg8, %77) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%79 = \"llvm.load\"(%78) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%80 = \"llvm.insertelement\"(%53, %67, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%81 = \"llvm.shufflevector\"(%80, %80) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%82 = \"llvm.insertelement\"(%81, %67, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%83 = \"llvm.insertelement\"(%82, %71, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%84 = \"llvm.insertelement\"(%81, %75, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%85 = \"llvm.insertelement\"(%84, %79, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%86 = \"llvm.mul\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%87 = \"llvm.add\"(%86, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%88 = \"llvm.getelementptr\"(%arg15, %87) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%89 = \"llvm.load\"(%88) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%90 = \"llvm.add\"(%86, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%91 = \"llvm.getelementptr\"(%arg15, %90) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%92 = \"llvm.load\"(%91) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%93 = \"llvm.mul\"(%31, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%94 = \"llvm.add\"(%93, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%95 = \"llvm.getelementptr\"(%arg15, %94) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%96 = \"llvm.load\"(%95) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%97 = \"llvm.add\"(%93, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%98 = \"llvm.getelementptr\"(%arg15, %97) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%99 = \"llvm.load\"(%98) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%100 = \"llvm.insertelement\"(%53, %89, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%101 = \"llvm.shufflevector\"(%100, %100) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%102 = \"llvm.insertelement\"(%101, %89, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%103 = \"llvm.insertelement\"(%102, %92, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%104 = \"llvm.insertelement\"(%101, %96, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%105 = \"llvm.insertelement\"(%104, %99, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%106 = \"nvvm.mma.sync\"(%57, %59, %61, %63, %83, %85, %103, %105) <{layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<f16>, multiplicandBPtxType = #nvvm.mma_type<f16>, operandSegmentSizes = array<i32: 4, 2, 2>, shape = #nvvm.shape<m = 16, n = 8, k = 16>}> : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
%107 = \"llvm.extractvalue\"(%106) <{position = array<i64: 0>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%108 = \"llvm.extractvalue\"(%106) <{position = array<i64: 1>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%109 = \"llvm.extractelement\"(%107, %1) : (vector<2xf16>, i64) -> f16
%110 = \"llvm.extractelement\"(%107, %0) : (vector<2xf16>, i64) -> f16
%111 = \"llvm.extractelement\"(%108, %1) : (vector<2xf16>, i64) -> f16
%112 = \"llvm.extractelement\"(%108, %0) : (vector<2xf16>, i64) -> f16
\"llvm.store\"(%109, %88) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%110, %91) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%111, %95) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%112, %98) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.return\"() : () -> ()
}) {gpu.kernel, gpu.known_block_size = array<i32: 32, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 1, 1>} : () -> ()
\"gpu.module_end\"() : () -> ()
}) {sym_name = \"main_kernel\"} : () -> ()
********************************************************************************
For developers, the error can be reproduced with:
$ mlir-opt -mlir-print-ir-after-all -mlir-disable-threading -pass-pipeline='builtin.module(gpu-lower-to-nvvm-pipeline{ cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=fatbin })' /tmp/UnnammedModule.mlir
",
"stack": "---------------------------------------------------------------------------
MLIRError Traceback (most recent call last)
File ~/miniconda3/envs/mlir-pycuda2/lib/python3.12/site-packages/mlir/extras/runtime/passes.py:58, in run_pipeline(module, pipeline, description, enable_ir_printing, print_pipeline, verify)
56 pm.enable_ir_printing()
---> 58 pm.run(module.operation)
59 except Exception as e:
MLIRError: Failure while executing pass pipeline:
error: unknown: `ptxas` invocation failed. Log:
ptxas /tmp/mlir-main_kernel-nvptx64-nvidia-cuda-sm_80-12fe81.ptx, line 5; fatal : Unsupported .version 7.6; current version is '7.5'
ptxas fatal : Ptx assembly aborted due to errors
error: unknown: An error happened while serializing the module.
note: unknown: see current operation:
\"gpu.module\"() <{targets = [#nvvm.target<chip = \"sm_80\", features = \"+ptx76\">]}> ({
\"llvm.func\"() <{CConv = #llvm.cconv<ccc>, function_type = !llvm.func<void (ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64)>, linkage = #llvm.linkage<external>, sym_name = \"main_kernel\", visibility_ = 0 : i64}> ({
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64):
%0 = \"llvm.mlir.constant\"() <{value = 1 : i64}> : () -> i64
%1 = \"llvm.mlir.constant\"() <{value = 0 : i64}> : () -> i64
%2 = \"llvm.mlir.constant\"() <{value = 0 : i32}> : () -> i32
%3 = \"llvm.mlir.constant\"() <{value = 16 : index}> : () -> i64
%4 = \"llvm.mlir.constant\"() <{value = 4 : index}> : () -> i64
%5 = \"llvm.mlir.constant\"() <{value = 0 : index}> : () -> i64
%6 = \"llvm.mlir.constant\"() <{value = -1 : index}> : () -> i64
%7 = \"llvm.mlir.constant\"() <{value = 2 : index}> : () -> i64
%8 = \"llvm.mlir.constant\"() <{value = -8 : index}> : () -> i64
%9 = \"llvm.mlir.constant\"() <{value = 1 : index}> : () -> i64
%10 = \"llvm.mlir.constant\"() <{value = 8 : index}> : () -> i64
%11 = \"llvm.mlir.constant\"() <{value = 9 : index}> : () -> i64
\"llvm.br\"()[^bb1] : () -> ()
^bb1: // pred: ^bb0
%12 = \"nvvm.read.ptx.sreg.tid.x\"() : () -> i32
%13 = \"llvm.sext\"(%12) : (i32) -> i64
%14 = \"llvm.icmp\"(%13, %5) <{predicate = 2 : i64}> : (i64, i64) -> i1
%15 = \"llvm.sub\"(%6, %13) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%16 = \"llvm.select\"(%14, %15, %13) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%17 = \"llvm.sdiv\"(%16, %4) : (i64, i64) -> i64
%18 = \"llvm.sub\"(%6, %17) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%19 = \"llvm.select\"(%14, %18, %17) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%20 = \"llvm.mul\"(%13, %7) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%21 = \"llvm.mul\"(%19, %8) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%22 = \"llvm.add\"(%20, %21) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%23 = \"llvm.mul\"(%19, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%24 = \"llvm.add\"(%23, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%25 = \"llvm.getelementptr\"(%arg1, %24) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%26 = \"llvm.load\"(%25) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%27 = \"llvm.add\"(%22, %9) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%28 = \"llvm.add\"(%23, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%29 = \"llvm.getelementptr\"(%arg1, %28) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%30 = \"llvm.load\"(%29) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%31 = \"llvm.add\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%32 = \"llvm.mul\"(%31, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%33 = \"llvm.add\"(%32, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%34 = \"llvm.getelementptr\"(%arg1, %33) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%35 = \"llvm.load\"(%34) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%36 = \"llvm.add\"(%32, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%37 = \"llvm.getelementptr\"(%arg1, %36) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%38 = \"llvm.load\"(%37) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%39 = \"llvm.add\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%40 = \"llvm.add\"(%23, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%41 = \"llvm.getelementptr\"(%arg1, %40) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%42 = \"llvm.load\"(%41) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%43 = \"llvm.add\"(%22, %11) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%44 = \"llvm.add\"(%23, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%45 = \"llvm.getelementptr\"(%arg1, %44) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%46 = \"llvm.load\"(%45) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%47 = \"llvm.add\"(%32, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%48 = \"llvm.getelementptr\"(%arg1, %47) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%49 = \"llvm.load\"(%48) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%50 = \"llvm.add\"(%32, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%51 = \"llvm.getelementptr\"(%arg1, %50) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%52 = \"llvm.load\"(%51) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%53 = \"llvm.mlir.undef\"() : () -> vector<2xf16>
%54 = \"llvm.insertelement\"(%53, %26, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%55 = \"llvm.shufflevector\"(%54, %54) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%56 = \"llvm.insertelement\"(%55, %26, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%57 = \"llvm.insertelement\"(%56, %30, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%58 = \"llvm.insertelement\"(%55, %35, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%59 = \"llvm.insertelement\"(%58, %38, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%60 = \"llvm.insertelement\"(%55, %42, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%61 = \"llvm.insertelement\"(%60, %46, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%62 = \"llvm.insertelement\"(%55, %49, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%63 = \"llvm.insertelement\"(%62, %52, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%64 = \"llvm.mul\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%65 = \"llvm.add\"(%64, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%66 = \"llvm.getelementptr\"(%arg8, %65) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%67 = \"llvm.load\"(%66) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%68 = \"llvm.mul\"(%27, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%69 = \"llvm.add\"(%68, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%70 = \"llvm.getelementptr\"(%arg8, %69) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%71 = \"llvm.load\"(%70) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%72 = \"llvm.mul\"(%39, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%73 = \"llvm.add\"(%72, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%74 = \"llvm.getelementptr\"(%arg8, %73) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%75 = \"llvm.load\"(%74) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%76 = \"llvm.mul\"(%43, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%77 = \"llvm.add\"(%76, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%78 = \"llvm.getelementptr\"(%arg8, %77) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%79 = \"llvm.load\"(%78) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%80 = \"llvm.insertelement\"(%53, %67, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%81 = \"llvm.shufflevector\"(%80, %80) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%82 = \"llvm.insertelement\"(%81, %67, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%83 = \"llvm.insertelement\"(%82, %71, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%84 = \"llvm.insertelement\"(%81, %75, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%85 = \"llvm.insertelement\"(%84, %79, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%86 = \"llvm.mul\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%87 = \"llvm.add\"(%86, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%88 = \"llvm.getelementptr\"(%arg15, %87) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%89 = \"llvm.load\"(%88) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%90 = \"llvm.add\"(%86, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%91 = \"llvm.getelementptr\"(%arg15, %90) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%92 = \"llvm.load\"(%91) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%93 = \"llvm.mul\"(%31, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%94 = \"llvm.add\"(%93, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%95 = \"llvm.getelementptr\"(%arg15, %94) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%96 = \"llvm.load\"(%95) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%97 = \"llvm.add\"(%93, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%98 = \"llvm.getelementptr\"(%arg15, %97) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%99 = \"llvm.load\"(%98) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%100 = \"llvm.insertelement\"(%53, %89, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%101 = \"llvm.shufflevector\"(%100, %100) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%102 = \"llvm.insertelement\"(%101, %89, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%103 = \"llvm.insertelement\"(%102, %92, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%104 = \"llvm.insertelement\"(%101, %96, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%105 = \"llvm.insertelement\"(%104, %99, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%106 = \"nvvm.mma.sync\"(%57, %59, %61, %63, %83, %85, %103, %105) <{layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<f16>, multiplicandBPtxType = #nvvm.mma_type<f16>, operandSegmentSizes = array<i32: 4, 2, 2>, shape = #nvvm.shape<m = 16, n = 8, k = 16>}> : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
%107 = \"llvm.extractvalue\"(%106) <{position = array<i64: 0>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%108 = \"llvm.extractvalue\"(%106) <{position = array<i64: 1>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%109 = \"llvm.extractelement\"(%107, %1) : (vector<2xf16>, i64) -> f16
%110 = \"llvm.extractelement\"(%107, %0) : (vector<2xf16>, i64) -> f16
%111 = \"llvm.extractelement\"(%108, %1) : (vector<2xf16>, i64) -> f16
%112 = \"llvm.extractelement\"(%108, %0) : (vector<2xf16>, i64) -> f16
\"llvm.store\"(%109, %88) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%110, %91) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%111, %95) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%112, %98) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.return\"() : () -> ()
}) {gpu.kernel, gpu.known_block_size = array<i32: 32, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 1, 1>} : () -> ()
\"gpu.module_end\"() : () -> ()
}) {sym_name = \"main_kernel\"} : () -> ()
During handling of the above exception, another exception occurred:
MlirCompilerError Traceback (most recent call last)
Cell In[9], line 3
1 backend = LLVMJITBackend([CUDA_RUNTIME_LIB_PATH])
2 # this doesn't actually anything (no pipeline) but does generate C API/wrappers
----> 3 compiled_module = backend.compile(
4 find_ops(
5 mod.operation,
6 lambda x: \"transform.target_tag\" in x.attributes
7 and x.attributes[\"transform.target_tag\"].value == \"payload\",
8 single=True,
9 ),
10 Pipeline().add_pass(
11 \"gpu-lower-to-nvvm-pipeline\",
12 **{
13 \"cubin-chip\": \"sm_80\",
14 \"cubin-features\": \"+ptx76\",
15 \"cubin-format\": \"fatbin\",
16 },
17 ),
18 )
19 print(compiled_module)
File ~/miniconda3/envs/mlir-pycuda2/lib/python3.12/site-packages/mlir/extras/runtime/refbackend.py:289, in LLVMJITBackend.compile(self, module, pipeline, kernel_name, enable_ir_printing, generate_kernel_wrapper, generate_return_consumer, return_consumer, verify)
280 if \"to-llvm\" in pipeline or generate_kernel_wrapper:
281 self.generate_c_api(
282 module,
283 kernel_name,
(...)
286 return_consumer,
287 )
--> 289 return run_pipeline(
290 module,
291 pipeline=pipeline,
292 description=\"Lowering IR\",
293 enable_ir_printing=enable_ir_printing,
294 verify=verify,
295 )
File ~/miniconda3/envs/mlir-pycuda2/lib/python3.12/site-packages/mlir/extras/runtime/passes.py:78, in run_pipeline(module, pipeline, description, enable_ir_printing, print_pipeline, verify)
67 message = f\"\"\"\\
68 {description} failed with the following diagnostics:
69
(...)
75 $ mlir-opt {debug_options} -pass-pipeline='{pipeline}' {filename}
76 \"\"\"
77 trimmed_message = \"\
\".join([m.lstrip() for m in message.split(\"\
\")])
---> 78 raise MlirCompilerError(trimmed_message)
79 finally:
80 sys.stderr = original_stderr
MlirCompilerError: Lowering IR failed with the following diagnostics:
********************************************************************************
Failure while executing pass pipeline:
error: unknown: `ptxas` invocation failed. Log:
ptxas /tmp/mlir-main_kernel-nvptx64-nvidia-cuda-sm_80-12fe81.ptx, line 5; fatal : Unsupported .version 7.6; current version is '7.5'
ptxas fatal : Ptx assembly aborted due to errors
error: unknown: An error happened while serializing the module.
note: unknown: see current operation:
\"gpu.module\"() <{targets = [#nvvm.target<chip = \"sm_80\", features = \"+ptx76\">]}> ({
\"llvm.func\"() <{CConv = #llvm.cconv<ccc>, function_type = !llvm.func<void (ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64, i64, i64, i64, i64)>, linkage = #llvm.linkage<external>, sym_name = \"main_kernel\", visibility_ = 0 : i64}> ({
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: !llvm.ptr, %arg15: !llvm.ptr, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64):
%0 = \"llvm.mlir.constant\"() <{value = 1 : i64}> : () -> i64
%1 = \"llvm.mlir.constant\"() <{value = 0 : i64}> : () -> i64
%2 = \"llvm.mlir.constant\"() <{value = 0 : i32}> : () -> i32
%3 = \"llvm.mlir.constant\"() <{value = 16 : index}> : () -> i64
%4 = \"llvm.mlir.constant\"() <{value = 4 : index}> : () -> i64
%5 = \"llvm.mlir.constant\"() <{value = 0 : index}> : () -> i64
%6 = \"llvm.mlir.constant\"() <{value = -1 : index}> : () -> i64
%7 = \"llvm.mlir.constant\"() <{value = 2 : index}> : () -> i64
%8 = \"llvm.mlir.constant\"() <{value = -8 : index}> : () -> i64
%9 = \"llvm.mlir.constant\"() <{value = 1 : index}> : () -> i64
%10 = \"llvm.mlir.constant\"() <{value = 8 : index}> : () -> i64
%11 = \"llvm.mlir.constant\"() <{value = 9 : index}> : () -> i64
\"llvm.br\"()[^bb1] : () -> ()
^bb1: // pred: ^bb0
%12 = \"nvvm.read.ptx.sreg.tid.x\"() : () -> i32
%13 = \"llvm.sext\"(%12) : (i32) -> i64
%14 = \"llvm.icmp\"(%13, %5) <{predicate = 2 : i64}> : (i64, i64) -> i1
%15 = \"llvm.sub\"(%6, %13) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%16 = \"llvm.select\"(%14, %15, %13) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%17 = \"llvm.sdiv\"(%16, %4) : (i64, i64) -> i64
%18 = \"llvm.sub\"(%6, %17) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%19 = \"llvm.select\"(%14, %18, %17) <{fastmathFlags = #llvm.fastmath<none>}> : (i1, i64, i64) -> i64
%20 = \"llvm.mul\"(%13, %7) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%21 = \"llvm.mul\"(%19, %8) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%22 = \"llvm.add\"(%20, %21) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%23 = \"llvm.mul\"(%19, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%24 = \"llvm.add\"(%23, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%25 = \"llvm.getelementptr\"(%arg1, %24) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%26 = \"llvm.load\"(%25) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%27 = \"llvm.add\"(%22, %9) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%28 = \"llvm.add\"(%23, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%29 = \"llvm.getelementptr\"(%arg1, %28) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%30 = \"llvm.load\"(%29) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%31 = \"llvm.add\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%32 = \"llvm.mul\"(%31, %3) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%33 = \"llvm.add\"(%32, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%34 = \"llvm.getelementptr\"(%arg1, %33) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%35 = \"llvm.load\"(%34) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%36 = \"llvm.add\"(%32, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%37 = \"llvm.getelementptr\"(%arg1, %36) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%38 = \"llvm.load\"(%37) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%39 = \"llvm.add\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%40 = \"llvm.add\"(%23, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%41 = \"llvm.getelementptr\"(%arg1, %40) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%42 = \"llvm.load\"(%41) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%43 = \"llvm.add\"(%22, %11) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%44 = \"llvm.add\"(%23, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%45 = \"llvm.getelementptr\"(%arg1, %44) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%46 = \"llvm.load\"(%45) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%47 = \"llvm.add\"(%32, %39) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%48 = \"llvm.getelementptr\"(%arg1, %47) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%49 = \"llvm.load\"(%48) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%50 = \"llvm.add\"(%32, %43) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%51 = \"llvm.getelementptr\"(%arg1, %50) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%52 = \"llvm.load\"(%51) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%53 = \"llvm.mlir.undef\"() : () -> vector<2xf16>
%54 = \"llvm.insertelement\"(%53, %26, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%55 = \"llvm.shufflevector\"(%54, %54) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%56 = \"llvm.insertelement\"(%55, %26, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%57 = \"llvm.insertelement\"(%56, %30, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%58 = \"llvm.insertelement\"(%55, %35, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%59 = \"llvm.insertelement\"(%58, %38, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%60 = \"llvm.insertelement\"(%55, %42, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%61 = \"llvm.insertelement\"(%60, %46, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%62 = \"llvm.insertelement\"(%55, %49, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%63 = \"llvm.insertelement\"(%62, %52, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%64 = \"llvm.mul\"(%22, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%65 = \"llvm.add\"(%64, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%66 = \"llvm.getelementptr\"(%arg8, %65) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%67 = \"llvm.load\"(%66) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%68 = \"llvm.mul\"(%27, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%69 = \"llvm.add\"(%68, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%70 = \"llvm.getelementptr\"(%arg8, %69) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%71 = \"llvm.load\"(%70) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%72 = \"llvm.mul\"(%39, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%73 = \"llvm.add\"(%72, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%74 = \"llvm.getelementptr\"(%arg8, %73) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%75 = \"llvm.load\"(%74) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%76 = \"llvm.mul\"(%43, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%77 = \"llvm.add\"(%76, %19) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%78 = \"llvm.getelementptr\"(%arg8, %77) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%79 = \"llvm.load\"(%78) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%80 = \"llvm.insertelement\"(%53, %67, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%81 = \"llvm.shufflevector\"(%80, %80) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%82 = \"llvm.insertelement\"(%81, %67, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%83 = \"llvm.insertelement\"(%82, %71, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%84 = \"llvm.insertelement\"(%81, %75, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%85 = \"llvm.insertelement\"(%84, %79, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%86 = \"llvm.mul\"(%19, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%87 = \"llvm.add\"(%86, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%88 = \"llvm.getelementptr\"(%arg15, %87) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%89 = \"llvm.load\"(%88) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%90 = \"llvm.add\"(%86, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%91 = \"llvm.getelementptr\"(%arg15, %90) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%92 = \"llvm.load\"(%91) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%93 = \"llvm.mul\"(%31, %10) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%94 = \"llvm.add\"(%93, %22) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%95 = \"llvm.getelementptr\"(%arg15, %94) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%96 = \"llvm.load\"(%95) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%97 = \"llvm.add\"(%93, %27) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%98 = \"llvm.getelementptr\"(%arg15, %97) <{elem_type = f16, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr, i64) -> !llvm.ptr
%99 = \"llvm.load\"(%98) <{ordering = 0 : i64}> : (!llvm.ptr) -> f16
%100 = \"llvm.insertelement\"(%53, %89, %2) : (vector<2xf16>, f16, i32) -> vector<2xf16>
%101 = \"llvm.shufflevector\"(%100, %100) <{mask = array<i32: 0, 0>}> : (vector<2xf16>, vector<2xf16>) -> vector<2xf16>
%102 = \"llvm.insertelement\"(%101, %89, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%103 = \"llvm.insertelement\"(%102, %92, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%104 = \"llvm.insertelement\"(%101, %96, %1) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%105 = \"llvm.insertelement\"(%104, %99, %0) : (vector<2xf16>, f16, i64) -> vector<2xf16>
%106 = \"nvvm.mma.sync\"(%57, %59, %61, %63, %83, %85, %103, %105) <{layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<f16>, multiplicandBPtxType = #nvvm.mma_type<f16>, operandSegmentSizes = array<i32: 4, 2, 2>, shape = #nvvm.shape<m = 16, n = 8, k = 16>}> : (vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
%107 = \"llvm.extractvalue\"(%106) <{position = array<i64: 0>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%108 = \"llvm.extractvalue\"(%106) <{position = array<i64: 1>}> : (!llvm.struct<(vector<2xf16>, vector<2xf16>)>) -> vector<2xf16>
%109 = \"llvm.extractelement\"(%107, %1) : (vector<2xf16>, i64) -> f16
%110 = \"llvm.extractelement\"(%107, %0) : (vector<2xf16>, i64) -> f16
%111 = \"llvm.extractelement\"(%108, %1) : (vector<2xf16>, i64) -> f16
%112 = \"llvm.extractelement\"(%108, %0) : (vector<2xf16>, i64) -> f16
\"llvm.store\"(%109, %88) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%110, %91) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%111, %95) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.store\"(%112, %98) <{ordering = 0 : i64}> : (f16, !llvm.ptr) -> ()
\"llvm.return\"() : () -> ()
}) {gpu.kernel, gpu.known_block_size = array<i32: 32, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 32, 1, 1>} : () -> ()
\"gpu.module_end\"() : () -> ()
}) {sym_name = \"main_kernel\"} : () -> ()
********************************************************************************
For developers, the error can be reproduced with:
$ mlir-opt -mlir-print-ir-after-all -mlir-disable-threading -pass-pipeline='builtin.module(gpu-lower-to-nvvm-pipeline{ cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=fatbin })' /tmp/UnnammedModule.mlir
"
}
```
It runs fine on Google Colab A 100 GPU that has the compute capability of sm_80 but if I edit that for my GTX 1650 and put sm_70 and +ptx75 the kernel dies, same on Kaggle and same on Colab with any gpu less than 80.
How can I run this locally though? Here is link to notebook successfuly run https://github.com/nyck33/mlir-python-extras-copy/blob/main/colab_a100_cuda_e2e.ipynb