Runtime error when increasing the number of iterations on GPU

Hi, I am running a program that uses the gpu dialect, and I find that a segmentation fault occurs when I increase the number of iterations. The problem also happens when running a simple vector-add example:

// Reproducer: repeatedly launches an element-wise "c = a + b + c" kernel over
// three f32 vectors from inside a host-side scf.for loop.
// NOTE(review): @test_init_f32 and @debug are called but not declared in this
// snippet; they are presumably external helpers declared elsewhere.
func.func @main() {
    // Number of f32 elements in each vector.
    %cn = arith.constant 32768: index
    // Buffers allocated through the gpu dialect (d_*) and matching host
    // buffers (h_*), all of dynamic size %cn.
    %d_a = gpu.alloc (%cn) : memref<?xf32>
    %d_b = gpu.alloc (%cn) : memref<?xf32> 
    %d_c = gpu.alloc (%cn) : memref<?xf32>
    %h_a = memref.alloc(%cn) : memref<?xf32>
    %h_b = memref.alloc(%cn) : memref<?xf32>
    %h_c = memref.alloc(%cn) : memref<?xf32>
    // @test_init_f32 takes an unranked memref, so cast the host buffers first.
    %unrank_ha = memref.cast %h_a : memref<?xf32> to memref<*xf32>
    %unrank_hb = memref.cast %h_b : memref<?xf32> to memref<*xf32>
    %unrank_hc = memref.cast %h_c : memref<?xf32> to memref<*xf32>
        
    // Presumably fills each host buffer with %cn initial values -- the helper
    // is not visible here, TODO confirm.
    func.call @test_init_f32(%unrank_ha, %cn) : (memref<*xf32>, index) -> ()
    func.call @test_init_f32(%unrank_hb, %cn) : (memref<*xf32>, index) -> ()
    func.call @test_init_f32(%unrank_hc, %cn) : (memref<*xf32>, index) -> ()

    // gpu.memcpy takes (destination, source): copy host -> device.
    gpu.memcpy %d_a, %h_a : memref<?xf32>, memref<?xf32>
    gpu.memcpy %d_b, %h_b : memref<?xf32>, memref<?xf32>
    gpu.memcpy %d_c, %h_c : memref<?xf32>, memref<?xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index    
    // Number of kernel launches performed by the loop below.
    %iter = arith.constant 80000 : index
    // Launch geometry: 256 blocks x 128 threads = 32768 threads, i.e. exactly
    // one thread per element of a %cn-sized vector. The kernel has no bounds
    // check, so the geometry must be kept in sync with %cn.
    %dim_sz = arith.constant 128 : index
    %grid_sz = arith.constant 256: index

    // One kernel launch per iteration; nothing in this loop explicitly waits
    // on the device between launches.
    scf.for %i = %c0 to %iter step %c1 {
        // Pass the current iteration number (as i32) to @debug, presumably to
        // print it.
        %i_iter = arith.index_cast %i : index to i32
        func.call @debug(%i_iter) : (i32) -> ()
        // 1-D launch: only the x dimensions are larger than 1. The region
        // arguments (%arg1..%arg6, %sz_*, %t*) are not used; the body queries
        // the ids through gpu.thread_id / gpu.block_id instead.
        gpu.launch blocks(%arg1, %arg2, %arg3) in (%sz_x = %grid_sz, %sz_y = %c1, %sz_z = %c1) threads(%arg4, %arg5, %arg6) in (%tx = %dim_sz, %ty = %c1, %tz = %c1) { 
            %threadidx = gpu.thread_id  x
            %blockid = gpu.block_id  x
            // %dim_sz is the host-side constant that is also used as the
            // x block dimension of this launch.
            %block_offset = arith.muli %blockid, %dim_sz : index
            %thread_offset = arith.addi %threadidx, %block_offset : index
            // thread_offset = threadIdx.x + blockDim.x * blockIdx.x
            // (global element index handled by this thread)
            %mem_a = memref.load %d_a[%thread_offset] : memref<?xf32>
            %mem_b = memref.load %d_b[%thread_offset] : memref<?xf32>
            %mem_c = memref.load %d_c[%thread_offset] : memref<?xf32>
            // c[i] = a[i] + b[i] + c[i]
            %add_ab = arith.addf %mem_a, %mem_b : f32
            %add_abc = arith.addf %add_ab, %mem_c : f32
            memref.store %add_abc, %d_c[%thread_offset] : memref<?xf32>
            gpu.terminator
        }
    }

    // Release the gpu.alloc'ed buffers. Note that %d_c is never copied back
    // to %h_c, and this function contains no explicit dealloc of the h_*
    // buffers.
    gpu.dealloc %d_a : memref<?xf32>
    gpu.dealloc %d_b : memref<?xf32>
    gpu.dealloc %d_c : memref<?xf32>

    return
}

The program hits a segmentation fault at iteration 65372, and this does not change when the vector size is changed to 16384 or 65536. The problem can be reproduced on different A100 cards. I am running LLVM at commit 79786c4d23f1fd7af438e4fd4e33ec109626bee4 and use the following pipeline:

# lower-test: run ${BUDDY_OPT} on ${INPUT} with the pass flags listed below,
# applied in the order given, and write the result to ./test.mlir.
# The whole recipe is a single shell command joined by trailing backslashes,
# so no comment lines may be inserted between the flags.
# The leading '@' stops make from echoing the command.
lower-test:
	@${BUDDY_OPT} ${INPUT} \
		--gpu-kernel-outlining \
		-gpu-async-region \
		-buffer-deallocation \
		-memref-expand \
		-convert-scf-to-cf \
		--convert-gpu-to-nvvm --gpu-to-cubin  \
		-convert-index-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -convert-func-to-llvm --gpu-to-llvm \
		-reconcile-unrealized-casts -o ./test.mlir

Thanks for your help!