They don’t allow me to upload files because my account is too new, so bear with the long reply.
here is what I get after I run:
hipcc -fpass-plugin=build/instr/libinstr.so -Lbuild/libtrace -ltrace -x hip -std=c++17 -g -emit-llvm -S -o test/managed_clock_test.instr.ll test/managed_clock_test.cpp
This was generated with the line that removes the old functions from the module commented out, so that you can see the contents of the old functions; however, the contents of the new replacement functions stay the same regardless of whether this line is included.
Device/GPU kernel module post instrumentation:
; __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa--gfx90a
; ModuleID = 'test/managed_clock_test.cpp'
source_filename = "test/managed_clock_test.cpp"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.scabbard::trace::device::DeviceTracker" = type { %"struct.scabbard::jobId_t", i64, i64, i64, i8, [128 x %"struct.scabbard::TraceData"] }
%"struct.scabbard::jobId_t" = type { i16, i16 }
%"struct.scabbard::TraceData" = type { i64, i16, [6 x i8], %"union.scabbard::ThreadId", ptr, %"struct.scabbard::LocationMetadata", i64 }
%"union.scabbard::ThreadId" = type <{ %"class.std::thread::id", [16 x i8] }>
%"class.std::thread::id" = type { i64 }
%"struct.scabbard::LocationMetadata" = type { i64, i32, i32 }
$scabbard.trace.device.dummyFunc = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1yE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1zE = comdat any
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1yE = comdat any
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1zE = comdat any
@_ZN8scabbard5trace12_GLOBAL__N_114src_id_reg_tmpE = internal addrspace(1) global i64 84, align 8
@_ZN8scabbard5trace12_GLOBAL__N_115src_id_reg_tmp2E = internal addrspace(1) global i64 84, align 8
@_ZN8scabbard5trace12_GLOBAL__N_115src_id_reg_tmp3E = internal addrspace(1) global i64 84, align 8
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1yE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1zE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1yE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1zE = weak protected local_unnamed_addr addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn
; Kernel carrying the `__old__scabbard_instr_replaced__old__` suffix; according to
; the surrounding post this is the pre-instrumentation version that was kept in the
; module only so its body can be inspected.
;   %0 - global (addrspace 1) pointer to the i64 that is atomically incremented
;   %1 - global pointer indexed as an i64 array; receives the incremented value
;   %2 - global pointer handed (as a generic pointer) to the dummy_work helper
;   %3 - i32 offset added to the work-item x id to form the index into %1
define protected amdgpu_kernel void @_Z15tick_all_kernelPU7_AtomicmPmPli__old__scabbard_instr_replaced__old__(ptr addrspace(1) nocapture %0, ptr addrspace(1) nocapture writeonly %1, ptr addrspace(1) nocapture %2, i32 %3) local_unnamed_addr #0 !dbg !1756 {
; Cast %2 from the global address space to a generic pointer for the helper call.
%5 = addrspacecast ptr addrspace(1) %2 to ptr
tail call fastcc void @_Z10dummy_workPl__old__scabbard_instr_replaced__old__(ptr %5) #10, !dbg !1768
; atomicrmw yields the value before the add, so %7 is the post-increment value.
%6 = atomicrmw add ptr addrspace(1) %0, i64 1 seq_cst, align 8, !dbg !1769
%7 = add i64 %6, 1, !dbg !1769
; index = zext(workitem.id.x + %3); store the post-increment value at %1[index].
%8 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !1770, !range !1782, !noundef !1783
%9 = add i32 %8, %3, !dbg !1784
%10 = zext i32 %9 to i64, !dbg !1785
%11 = getelementptr inbounds i64, ptr addrspace(1) %1, i64 %10, !dbg !1785
store i64 %7, ptr addrspace(1) %11, align 8, !dbg !1786, !tbaa !1787
; Barrier intrinsic bracketed by workgroup-scope release/acquire fences.
fence syncscope("workgroup") release, !dbg !1791
tail call void @llvm.amdgcn.s.barrier(), !dbg !1806
fence syncscope("workgroup") acquire, !dbg !1807
; Second call to the helper with the same pointer, after the barrier.
tail call fastcc void @_Z10dummy_workPl__old__scabbard_instr_replaced__old__(ptr %5) #10, !dbg !1808
ret void, !dbg !1809
}
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(argmem: readwrite)
; Internal (non-kernel) helper carrying the `__old__scabbard_instr_replaced__old__`
; suffix; called twice from the old tick_all_kernel in this module.
;   %0 - generic pointer indexed as an i64 array
; Effect: computes a per-work-item index `idx` and performs %0[idx] += idx.
define internal fastcc void @_Z10dummy_workPl__old__scabbard_instr_replaced__old__(ptr nocapture %0) unnamed_addr #1 !dbg !1810 {
%2 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !1816, !range !1782, !noundef !1783
%3 = zext i32 %2 to i64, !dbg !1816
%4 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr(), !dbg !1817
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !1817
; Load the i32 at offset 0 of the implicit-argument block and compare the workgroup
; x id against it; the result picks i16 element 6 or element 9 of the same block.
; NOTE(review): presumably the x block count and the regular/remainder workgroup
; size in x — confirm against the AMDGPU implicit kernel argument layout.
%6 = load i32, ptr addrspace(4) %4, align 4, !dbg !1817, !tbaa !1818
%7 = icmp ult i32 %5, %6, !dbg !1817
%8 = select i1 %7, i64 6, i64 9, !dbg !1817
%9 = getelementptr inbounds i16, ptr addrspace(4) %4, i64 %8, !dbg !1817
%10 = load i16, ptr addrspace(4) %9, align 2, !dbg !1817, !tbaa !1822
%11 = zext i16 %10 to i64, !dbg !1817
%12 = zext i32 %5 to i64, !dbg !1824
; idx = loaded_i16 * workgroup.id.x + workitem.id.x
%13 = mul nuw nsw i64 %11, %12, !dbg !1825
%14 = add nuw nsw i64 %13, %3, !dbg !1826
; shl 32 followed by ashr 32 sign-extends the low 32 bits of the index to i64.
%15 = shl i64 %14, 32, !dbg !1827
%16 = ashr exact i64 %15, 32, !dbg !1827
; Read-modify-write of element idx: add idx to the stored value.
%17 = getelementptr inbounds i64, ptr %0, i64 %16, !dbg !1828
%18 = load i64, ptr %17, align 8, !dbg !1829, !tbaa !1787
%19 = add nsw i64 %16, %18, !dbg !1829
store i64 %19, ptr %17, align 8, !dbg !1829, !tbaa !1787
ret void, !dbg !1830
}
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn
; Parameterless kernel carrying the `__old__scabbard_instr_replaced__old__` suffix.
; Its whole body is a barrier intrinsic bracketed by workgroup-scope
; release/acquire fences, followed by a return.
define protected amdgpu_kernel void @_Z12dummy_kernelv__old__scabbard_instr_replaced__old__() local_unnamed_addr #0 !dbg !1831 {
fence syncscope("workgroup") release, !dbg !1832
tail call void @llvm.amdgcn.s.barrier(), !dbg !1836
fence syncscope("workgroup") acquire, !dbg !1837
ret void, !dbg !1838
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; Kernel carrying the `__old__scabbard_instr_replaced__old__` suffix.
;   %0 - global (addrspace 1) pointer indexed as an i64 array
; Effect: computes a per-work-item index `idx` and stores 0 to %0[idx].
define protected amdgpu_kernel void @_Z10dummy_initPl__old__scabbard_instr_replaced__old__(ptr addrspace(1) nocapture writeonly %0) local_unnamed_addr #2 !dbg !1839 {
%2 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !1843, !range !1782, !noundef !1783
%3 = zext i32 %2 to i64, !dbg !1843
%4 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr(), !dbg !1844
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !1844
; Load i16 element 6 of the implicit-argument block.
; NOTE(review): presumably the workgroup size in x — confirm against the AMDGPU
; implicit kernel argument layout.
%6 = getelementptr inbounds i16, ptr addrspace(4) %4, i64 6, !dbg !1844
%7 = load i16, ptr addrspace(4) %6, align 4, !dbg !1844, !tbaa !1822
%8 = zext i16 %7 to i64, !dbg !1844
%9 = zext i32 %5 to i64, !dbg !1845
; idx = loaded_i16 * workgroup.id.x + workitem.id.x
%10 = mul nuw nsw i64 %8, %9, !dbg !1846
%11 = add nuw nsw i64 %10, %3, !dbg !1847
; shl 32 followed by ashr 32 sign-extends the low 32 bits of the index to i64.
%12 = shl i64 %11, 32, !dbg !1848
%13 = ashr exact i64 %12, 32, !dbg !1848
%14 = getelementptr inbounds i64, ptr addrspace(1) %0, i64 %13, !dbg !1848
store i64 0, ptr addrspace(1) %14, align 8, !dbg !1849, !tbaa !1787
ret void, !dbg !1850
}
; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.s.barrier() #3
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workitem.id.x() #4
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #4
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workgroup.id.x() #4
; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(readwrite, inaccessiblemem: none)
define amdgpu_kernel void @scabbard.trace.device.dummyFunc(ptr addrspace(1) nocapture %0, i32 %1, float %2, i16 %3, ptr addrspace(1) %4, ptr addrspace(1) nocapture readnone %5, ptr addrspace(1) nocapture readonly %6) local_unnamed_addr #5 comdat {
; This function was added manually by the instrumentation and does not correspond
; to any function in the original code; its body is omitted here to meet the post
; character limit (the `ret void` below is only a placeholder for the elided body).
ret void
}
; Function Attrs: mustprogress nofree noinline norecurse nounwind willreturn memory(argmem: readwrite)
define internal fastcc void @"scabbard.trace.device.trace_append$mem"(ptr nocapture %0, i16 zeroext %1, ptr %2, ptr nocapture readonly %3, i32 %4, i32 %5) unnamed_addr #6 {
; This function was added manually by the instrumentation and does not correspond
; to any function in the original code; its body is omitted here to meet the post
; character limit (the `ret void` below is only a placeholder for the elided body).
ret void
}
; Function Attrs: mustprogress nofree noinline norecurse nounwind willreturn memory(argmem: readwrite)
define internal fastcc void @"scabbard.trace.device.trace_append$alloc"(ptr nocapture %0, i16 zeroext %1, ptr %2, ptr nocapture readonly %3, i32 %4, i32 %5, i64 %6) unnamed_addr #6 {
; This function was added manually by the instrumentation and does not correspond
; to any function in the original code; its body is omitted here to meet the post
; character limit (the `ret void` below is only a placeholder for the elided body).
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #7
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workgroup.id.y() #8
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workgroup.id.z() #8
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workitem.id.y() #8
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workitem.id.z() #8
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #9
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #7
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn
; Kernel under the original (unsuffixed) mangled name. Its parameter list matches
; the `__old__` variant in this module plus one trailing generic `ptr %4`.
; In this dump the body consists solely of `unreachable`: none of the instructions
; present in the `__old__` variant appear here.
define protected amdgpu_kernel void @_Z15tick_all_kernelPU7_AtomicmPmPli(ptr addrspace(1) nocapture %0, ptr addrspace(1) nocapture writeonly %1, ptr addrspace(1) nocapture %2, i32 %3, ptr %4) local_unnamed_addr #0 !dbg !1874 {
unreachable
}
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(argmem: readwrite)
; Helper under the original (unsuffixed) mangled name. Compared with the `__old__`
; variant in this module it has `dso_local` rather than `internal` linkage and one
; extra trailing generic `ptr %1` parameter.
; In this dump the body consists solely of `unreachable`.
define dso_local fastcc void @_Z10dummy_workPl(ptr nocapture %0, ptr %1) unnamed_addr #1 !dbg !1880 {
unreachable
}
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn
; Kernel under the original (unsuffixed) mangled name. Unlike the `__old__` variant
; in this module it takes one generic `ptr %0` parameter, which the body never uses.
; The body is the same fence/barrier/fence/ret sequence as the `__old__` variant;
; no additional instructions are present in this dump.
define protected amdgpu_kernel void @_Z12dummy_kernelv(ptr %0) local_unnamed_addr #0 !dbg !1884 {
fence syncscope("workgroup") release, !dbg !1885
tail call void @llvm.amdgcn.s.barrier(), !dbg !1889
fence syncscope("workgroup") acquire, !dbg !1890
ret void, !dbg !1891
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; Kernel under the original (unsuffixed) mangled name. Its parameter list matches
; the `__old__` variant in this module plus one trailing generic `ptr %1`.
; In this dump the body consists solely of `unreachable`: the index computation and
; zero store present in the `__old__` variant do not appear here.
define protected amdgpu_kernel void @_Z10dummy_initPl(ptr addrspace(1) nocapture writeonly %0, ptr %1) local_unnamed_addr #2 !dbg !1892 {
unreachable
}
attributes #0 = { convergent mustprogress nofree norecurse nounwind willreturn "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
attributes #1 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
attributes #3 = { convergent mustprogress nocallback nofree nounwind willreturn }
attributes #4 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #5 = { mustprogress nofree norecurse nounwind willreturn memory(readwrite, inaccessiblemem: none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
attributes #6 = { mustprogress nofree noinline norecurse nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
attributes #7 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #9 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #10 = { nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1747, !1748, !1749, !1750, !1751, !1752, !1753}
!opencl.ocl.version = !{!1754, !1754}
!llvm.ident = !{!1755, !1755}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.0 23483 7208e8d15fbf218deb74483ea8c549c67ca4985e)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !11, globals: !28, imports: !72, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "test/managed_clock_test.cpp", directory: "/g/g11/osterhou/repos/scabbard", checksumkind: CSK_MD5, checksum: "84927e4bf98b7e7efaf685dc29fe5570")
!2 = !{!3}
!3 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "_Lock_policy", scope: !5, file: !4, line: 49, baseType: !6, size: 32, elements: !7, identifier: "_ZTSN9__gnu_cxx12_Lock_policyE")
; ...
; removed metadata to meet post character limit
; ...
; __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa--gfx90a
I apologize for such a long example module, but the example needed to contain a device function that is called from a global function in order to show that some functions don’t get the `unreachable` treatment.
In this example, the “dummy_work” function (`@_Z10dummy_workPl` / `@_Z10dummy_workPl__old__scabbard_instr_replaced__old__`) is a device function that does not get “optimised out”, or whatever it is that happens after my pass runs.
NOTE:
I can confirm via debug print statements during my pass that these new functions contain appropriately instrumented bodies before my pass finishes, but the output file generated by the command comes out as you see above.
Some other info: my pass should be running after all other optimization passes, given that it is loaded last and registers with the pass manager at the end of the optimization pipeline.
I do return `PreservedAnalyses::none()` after my pass runs, but changing this does not seem to help at all.