This is just wrapper code that calls our library functions. There should be no vector code in it; the vector code lives only in our library. The library works fine when called from C++. My guess is that this could be an ABI issue (i.e., the compiler is calling the functions in the wrong way).
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Private read-only 4 x 32 x 32 array of float, aligned to 128 bytes. It is passed
; as the fifth argument of every @xsmm_brgemm_invoke call in @_entry.
; NOTE(review): the initializer below is abbreviated with "..." (and its brackets
; do not balance), so this line does not parse as written; the full constant must
; be restored before the module can be fed to LLVM tools.
@__constant_4x32x32xf32 = private constant [4 x [32 x [32 x float]]] [[32 x [32 x float]] [[32 x float] [float 1.000000e+00, ... float 1.000000e+00]]]], align 128
; Function Attrs: mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite)
; Deallocator; used by @_entry and @entry to release buffers obtained from @malloc.
declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #0
; Function Attrs: mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite)
; Allocator; @_entry calls it twice with a size of 32832 bytes.
declare noalias noundef ptr @malloc(i64 noundef) local_unnamed_addr #1
; External print helpers, defined outside this module; @entry uses them to emit
; the averaged timing followed by a newline.
declare void @printNewline() local_unnamed_addr #2
declare void @printF64(double) local_unnamed_addr #2
; @_entry: runs three rounds of eight @xsmm_brgemm_invoke calls that alternate
; between two heap buffers, then returns an 11-field aggregate describing the
; first buffer.
;
;   %0        unused (marked readnone).
;   %1        pointer handed to the first round of invocations as the third argument.
;   %2..%10   unused by the body.
;
; Returns { raw malloc pointer, 64-byte-rounded pointer, 0, [2, 4, 32, 32],
; [4096, 1024, 32, 1] }.
; NOTE(review): this layout looks like an MLIR memref descriptor
; (allocated ptr, aligned ptr, offset, sizes, strides) -- confirm against the
; producer. The caller owns the raw pointer in field 0 (see @entry, which frees it).
; NOTE(review): neither malloc result is checked for null before use.
define { ptr, ptr, i64, [4 x i64], [4 x i64] } @_entry(ptr nocapture readnone %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9, i64 %10) local_unnamed_addr #2 {
body:
  ; First buffer: 32832 bytes, pointer rounded up to the next multiple of 64.
  %ping.raw = tail call dereferenceable_or_null(32832) ptr @malloc(i64 32832)
  %ping.int = ptrtoint ptr %ping.raw to i64
  %ping.plus63 = add i64 %ping.int, 63
  %ping.masked = and i64 %ping.plus63, -64
  %ping = inttoptr i64 %ping.masked to ptr

  ; One dispatch result is reused as the second argument of all 24 invocations.
  %kernel = tail call i64 @xsmm_brgemm_dispatch(i64 1, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 1024, i64 1024, i64 4)

  ; Round 1: %1 -> %ping. The fourth argument is 0 for the first four calls and
  ; 4096 for the last four; the eighth argument steps 0..7168 in units of 1024.
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 0, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 1024, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 2048, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 3072, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 4096, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 5120, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 6144, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %1, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 7168, i64 4)

  ; Second buffer: same size and the same round-up-to-64 adjustment.
  %pong.raw = tail call dereferenceable_or_null(32832) ptr @malloc(i64 32832)
  %pong.int = ptrtoint ptr %pong.raw to i64
  %pong.plus63 = add i64 %pong.int, 63
  %pong.masked = and i64 %pong.plus63, -64
  %pong = inttoptr i64 %pong.masked to ptr

  ; Round 2: %ping -> %pong, same offset pattern as round 1.
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 0, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 1024, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 2048, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 3072, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 4096, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 5120, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 6144, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %ping, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %pong, i64 7168, i64 4)

  ; Round 3: %pong -> %ping, so the final data ends up in the first buffer.
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 0, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 1024, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 2048, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 0, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 3072, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 4096, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 5120, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 6144, i64 4)
  tail call void @xsmm_brgemm_invoke(i64 1, i64 %kernel, ptr %pong, i64 4096, ptr nonnull @__constant_4x32x32xf32, i64 0, ptr %ping, i64 7168, i64 4)

  ; Assemble the return aggregate field by field.
  %ret.f0 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } undef, ptr %ping.raw, 0
  %ret.f1 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f0, ptr %ping, 1
  %ret.f2 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f1, i64 0, 2
  %ret.f3.0 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f2, i64 2, 3, 0
  %ret.f3.1 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f3.0, i64 4, 3, 1
  %ret.f3.2 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f3.1, i64 32, 3, 2
  %ret.f3.3 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f3.2, i64 32, 3, 3
  %ret.f4.0 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f3.3, i64 4096, 4, 0
  %ret.f4.1 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f4.0, i64 1024, 4, 1
  %ret.f4.2 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f4.1, i64 32, 4, 2
  %ret.f4.3 = insertvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f4.2, i64 1, 4, 3

  ; The second buffer is scratch only; release its raw (unrounded) pointer.
  tail call void @free(ptr %pong.raw)
  ret { ptr, ptr, i64, [4 x i64], [4 x i64] } %ret.f4.3
}
; External routines, declared here and defined outside this module. The meaning
; of the individual arguments is not visible from this file.
; NOTE(review): the names suggest LIBXSMM batch-reduce GEMM wrappers -- confirm
; the C-side prototypes match these signatures exactly (all-i64 scalars, opaque
; pointers), since any mismatch would be an ABI bug at the call sites.
; Called 24 times from @_entry with the handle returned by @xsmm_brgemm_dispatch.
declare void @xsmm_brgemm_invoke(i64, i64, ptr, i64, ptr, i64, ptr, i64, i64) local_unnamed_addr #2
; Called once from @_entry; its i64 result is forwarded to every invoke call.
declare i64 @xsmm_brgemm_dispatch(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) local_unnamed_addr #2
; Timer pair used by @entry: start returns an i64 that is later passed to stop,
; which returns a double.
declare double @perf_stop_timer(i64) local_unnamed_addr #2
declare i64 @perf_start_timer() local_unnamed_addr #2
; @entry: benchmark driver. Calls @_entry once between a start/stop timer pair
; whose result is discarded (presumably a warm-up run -- confirm), then times
; ten further calls, divides the measured total by 10.0 and prints it followed
; by a newline. After each call the pointer in field 0 of the returned
; aggregate is freed.
; NOTE(review): @__wrapper_0 is referenced here but is not defined in the
; visible part of this module.
define void @entry() local_unnamed_addr #2 {
start:
  ; Single untimed-in-effect call: the elapsed value is never read.
  %first.t0 = tail call i64 @perf_start_timer()
  %first.res = tail call { ptr, ptr, i64, [4 x i64], [4 x i64] } @_entry(ptr nonnull poison, ptr nonnull @__wrapper_0, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison)
  %first.buf = extractvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %first.res, 0
  tail call void @free(ptr %first.buf)
  %first.elapsed = tail call double @perf_stop_timer(i64 %first.t0)

  %timed.t0 = tail call i64 @perf_start_timer()
  br label %bench

bench:                                            ; preds = %start, %bench
  %iter = phi i64 [ 0, %start ], [ %iter.next, %bench ]
  %run.res = tail call { ptr, ptr, i64, [4 x i64], [4 x i64] } @_entry(ptr nonnull poison, ptr nonnull @__wrapper_0, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison)
  %run.buf = extractvalue { ptr, ptr, i64, [4 x i64], [4 x i64] } %run.res, 0
  tail call void @free(ptr %run.buf)
  %iter.next = add nuw nsw i64 %iter, 1
  ; Loop again while the pre-increment counter is below 9, i.e. 10 iterations total.
  %again = icmp samesign ult i64 %iter, 9
  br i1 %again, label %bench, label %report

report:                                           ; preds = %bench
  %total = tail call double @perf_stop_timer(i64 %timed.t0)
  %mean = fdiv double %total, 1.000000e+01
  tail call void @printF64(double %mean)
  tail call void @printNewline()
  ret void
}
; Attribute group #0 is attached to @free.
attributes #0 = { mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "unsafe-fp-math"="true" }
; Attribute group #1 is attached to @malloc.
attributes #1 = { mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite) "alloc-family"="malloc" "unsafe-fp-math"="true" }
; Attribute group #2 is attached to every other function declared or defined
; in this module; it carries only the "unsafe-fp-math" string attribute.
attributes #2 = { "unsafe-fp-math"="true" }
; Module-level flags: a single entry recording "Debug Info Version" = 3 with
; merge-behavior code 2.
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}