[RFC] Supporting more early-exit loops

Nearly all architectures support “speculative” loads as long as the load is fully aligned: an aligned vector can only lie within a single page, so either the first element faults because that page would fault, or the entire vector is loaded successfully.

Of course, that’s just the assembly-level semantics; IIRC LLVM doesn’t have an instruction that does that, though it could be useful to add one. I’d expect the semantics to be something like this: the load returns poison in all lanes that are outside of an allocation, and the load itself is UB only if it’s not completely inside the pages containing the allocation. So, e.g., here is a lame strlen using that kind of speculative load (I didn’t test it, but it probably works except for the TODO):

https://llvm.godbolt.org/z/vEfeW545f

; Returns the number of bytes before the first zero byte at or after %p.
;
; The string is scanned 16 bytes at a time with 16-byte-aligned speculative
; loads. Lanes that lie outside the allocation come back as poison, so every
; loaded vector is frozen before it is inspected; such lanes are either shifted
; out (lanes before %p) or lie after the terminator and are ignored by cttz.
;
; TODO: the bitcast+cttz lane mask assumes little-endian lane order
; (lane 0 -> bit 0); big-endian targets need a different mask extraction.
define i64 @my_strlen(ptr %p) {
start:
    %addr = ptrtoint ptr %p to i64
    %misalign = and i64 %addr, 15
    %aligned = icmp eq i64 %misalign, 0
    br i1 %aligned, label %fast_path, label %not_aligned

not_aligned:
    ; Round %p down to the previous 16-byte boundary. No `inbounds`: the result
    ; may point before the start of the allocation.
    %neg_misalign = sub i64 0, %misalign
    %base = getelementptr i8, ptr %p, i64 %neg_misalign
    %head_bytes = call <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %base)
    %head_frozen = freeze <16 x i8> %head_bytes
    %head_zeros = icmp eq <16 x i8> %head_frozen, zeroinitializer
    %head_mask = bitcast <16 x i1> %head_zeros to i16
    ; Drop the lanes that precede %p so that bit j describes byte p[j].
    ; %misalign is in 1..15 on this path, so the shift amount is always valid.
    %shift = trunc i64 %misalign to i16
    %head_mask_from_p = lshr i16 %head_mask, %shift
    %head_found = icmp ne i16 %head_mask_from_p, 0
    %head_index16 = call i16 @llvm.cttz.i16(i16 %head_mask_from_p, i1 false)
    %head_index = zext i16 %head_index16 to i64
    ; If no terminator was seen, 16 - misalign bytes have been consumed and the
    ; next byte to inspect is 16-byte aligned, so the aligned loop can take over.
    %head_len = sub i64 16, %misalign
    %head_next = getelementptr i8, ptr %p, i64 %head_len
    br i1 %head_found, label %finish, label %fast_path

fast_path:
    ; Invariant: %fast_p is 16-byte aligned and %old_len bytes before it are non-zero.
    %old_len = phi i64 [0, %start], [%head_len, %not_aligned], [%len, %fast_path]
    %fast_p = phi ptr [%p, %start], [%head_next, %not_aligned], [%next_p, %fast_path]
    %bytes = call <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %fast_p)
    %frozen_bytes = freeze <16 x i8> %bytes
    %found_zeros = icmp eq <16 x i8> %frozen_bytes, zeroinitializer
    %found_zeros2 = bitcast <16 x i1> %found_zeros to i16
    ; cttz with is_zero_poison=false yields 16 when no lane is zero.
    %zero_index2 = call i16 @llvm.cttz.i16(i16 %found_zeros2, i1 false)
    %zero_index = zext i16 %zero_index2 to i64
    %len = add i64 %zero_index, %old_len
    %next_p = getelementptr inbounds i8, ptr %fast_p, i64 16
    %done = icmp ne i64 %zero_index, 16
    br i1 %done, label %finish, label %fast_path

finish:
    %result = phi i64 [%head_index, %not_aligned], [%len, %fast_path]
    ret i64 %result
}

define private <16 x i8> @llvm_speculative_load_16xi8_align_16(ptr %p) {
entry:
    ; Stand-in for a speculative-load operation that LLVM does not provide:
    ; for demonstration purposes this is just an ordinary 16-byte-aligned
    ; vector load from %p.
    %vec = load <16 x i8>, ptr %p, align 16
    ret <16 x i8> %vec
}