Nearly all architectures support “speculative” loads as long as the access is fully aligned: a naturally aligned vector cannot straddle a page boundary, so either the first element faults (because the whole page would fault) or the entire vector is loaded successfully.
Of course, that’s just the assembly-level semantics. IIRC, LLVM doesn’t have an instruction that does that… though it could be useful to add one. I’d expect the semantics to be something like this: the load returns poison
in all lanes that are outside of an allocation, but the load itself is only UB if it’s not completely inside the pages containing the allocation. So, e.g., here is a naive strlen
using that kind of speculative load (I didn’t test it, but it probably works except for the TODO):
https://llvm.godbolt.org/z/vEfeW545f
; my_strlen: length of the NUL-terminated byte string at %p, scanned 16 bytes
; at a time with 16-byte-aligned "speculative" vector loads.
;
; Parameters:
;   %p - pointer to the first byte of the string; any alignment is accepted.
; Returns:
;   the number of bytes before the first zero byte.
;
; Strategy: round %p down to the previous 16-byte boundary and load that whole
; aligned chunk. Lanes that precede %p (and any lanes past the end of the
; allocation) are expected to come back as poison from the speculative load;
; they are frozen and then either shifted out (leading lanes) or ignored
; because a real terminator is found first (trailing lanes). After this head
; chunk every further load is naturally aligned.
;
; NOTE(review): this relies on the hypothetical speculative-load semantics
; (poison outside the allocation, UB only outside its pages). With a plain
; `load` standing in for it, reading outside the allocation is UB in LLVM's
; abstract machine even though it does not fault on typical hardware.
; NOTE(review): the bitcast <16 x i1> -> i16 is assumed to map lane 0 to
; bit 0, i.e. a little-endian target - confirm before using elsewhere.
define i64 @my_strlen(ptr %p) {
start:
  ; Distance of %p from the previous 16-byte boundary (0..15).
  %addr = ptrtoint ptr %p to i64
  %misalign = and i64 %addr, 15
  ; Step back to that boundary. Deliberately NOT inbounds: the rounded-down
  ; address may lie before the start of the allocation.
  %back = sub nsw i64 0, %misalign
  %base = getelementptr i8, ptr %p, i64 %back
  %head = call <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %base)
  %head_frozen = freeze <16 x i8> %head
  %head_eq = icmp eq <16 x i8> %head_frozen, zeroinitializer
  %head_bits = bitcast <16 x i1> %head_eq to i16
  ; Drop the lanes in front of %p so that bit 0 corresponds to %p[0].
  %shift = trunc i64 %misalign to i16
  %head_mask = lshr i16 %head_bits, %shift
  %head_idx16 = call i16 @llvm.cttz.i16(i16 %head_mask, i1 false)
  %head_idx = zext i16 %head_idx16 to i64
  ; Number of string bytes covered by the head chunk (1..16).
  %head_len = sub nuw nsw i64 16, %misalign
  ; First fully aligned chunk after the head. Only consumed when the head
  ; holds no terminator, in which case %p + %head_len is still in bounds.
  %body_p = getelementptr inbounds i8, ptr %p, i64 %head_len
  %head_hit = icmp ne i16 %head_mask, 0
  br i1 %head_hit, label %finish, label %fast_path
fast_path:
  %old_len = phi i64 [%head_len, %start], [%len, %fast_path]
  %fast_p = phi ptr [%body_p, %start], [%next_p, %fast_path]
  %bytes = call <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %fast_p)
  ; Freeze so that poison lanes past the allocation become arbitrary but
  ; fixed values instead of poisoning the comparison result.
  %frozen_bytes = freeze <16 x i8> %bytes
  %found_zeros = icmp eq <16 x i8> %frozen_bytes, zeroinitializer
  %found_zeros2 = bitcast <16 x i1> %found_zeros to i16
  ; is_zero_poison = false, so cttz yields 16 when no lane matched.
  %zero_index2 = call i16 @llvm.cttz.i16(i16 %found_zeros2, i1 false)
  %zero_index = zext i16 %zero_index2 to i64
  %len = add i64 %zero_index, %old_len
  %next_p = getelementptr inbounds i8, ptr %fast_p, i64 16
  %done = icmp ne i64 %zero_index, 16
  br i1 %done, label %finish, label %fast_path
finish:
  ; Terminator found either in the head chunk or in an aligned chunk.
  %result = phi i64 [%head_idx, %start], [%len, %fast_path]
  ret i64 %result
}
; Stand-in for a speculative 16 x i8 load from a 16-byte-aligned address.
; LLVM has no such instruction here, so this demo simply performs an
; ordinary aligned vector load and returns the loaded value.
define private <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %p) {
entry:
  %chunk = load <16 x i8>, ptr %p, align 16
  ret <16 x i8> %chunk
}