Oh, and you might also want to check that you're actually getting SSE

code; if you accidentally disable SSE somehow, you'll end up with x87

code, which will completely expand the vectors into scalars.

Using <4 x float> types:

faust -llvm -vec -vs 4 -lv 1 plus.dsp -o plus_vec.ll

; llvm_compute: vectorized "plus" kernel.
;
; For every index i in [0, %count) it computes, on <4 x float> vectors:
;   outputs[0][i] = (inputs[2][i] + inputs[3][i]) * (inputs[0][i] + inputs[1][i])
;
; Parameters:
;   %obj     - DSP state pointer; not referenced by this body.
;   %count   - number of <4 x float> elements to process; nothing is done
;              when it is <= 0.
;   %inputs  - array of at least four pointers to <4 x float> buffers.
;   %outputs - array of at least one pointer to a <4 x float> buffer.
;
; All vector loads and the store are tagged `align 16`, so every buffer must
; be 16-byte aligned.
;
; NOTE: the function stores through a pointer loaded from %outputs, so it must
; NOT carry the `readnone` attribute (that attribute promises no memory reads
; or writes and would allow calls to be removed or merged). Only `nounwind`
; and `ssp` are kept.
define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <4 x float>** noalias %inputs, <4 x float>** noalias %outputs) nounwind ssp {
entry:
  ; Fetch the four input channel pointers and the single output channel pointer.
  %input_array_ptr0 = getelementptr inbounds <4 x float>** %inputs, i64 0
  %input0 = load <4 x float>** %input_array_ptr0
  %input_array_ptr1 = getelementptr inbounds <4 x float>** %inputs, i64 1
  %input1 = load <4 x float>** %input_array_ptr1
  %input_array_ptr2 = getelementptr inbounds <4 x float>** %inputs, i64 2
  %input2 = load <4 x float>** %input_array_ptr2
  %input_array_ptr3 = getelementptr inbounds <4 x float>** %inputs, i64 3
  %input3 = load <4 x float>** %input_array_ptr3
  %output_array_ptr0 = getelementptr inbounds <4 x float>** %outputs, i64 0
  %output0 = load <4 x float>** %output_array_ptr0

  ; Skip the loop entirely for a zero or negative count.
  %out = icmp sgt i32 %count, 0
  br i1 %out, label %convert, label %return

convert:
  ; %count is known to be > 0 here, so zero-extension is value-preserving.
  %count_64 = zext i32 %count to i64
  br label %loop0

loop0:
  ; %indvar counts <4 x float> elements, not bytes or scalars.
  %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
  %output_ptr0 = getelementptr <4 x float>* %output0, i64 %indvar

  ; %fVector2 = inputs[0][i] + inputs[1][i]
  %input_ptr1 = getelementptr <4 x float>* %input1, i64 %indvar
  %fVector0 = load <4 x float>* %input_ptr1, align 16
  %input_ptr0 = getelementptr <4 x float>* %input0, i64 %indvar
  %fVector1 = load <4 x float>* %input_ptr0, align 16
  %fVector2 = fadd <4 x float> %fVector1, %fVector0

  ; %fVector5 = inputs[2][i] + inputs[3][i]
  %input_ptr3 = getelementptr <4 x float>* %input3, i64 %indvar
  %fVector3 = load <4 x float>* %input_ptr3, align 16
  %input_ptr2 = getelementptr <4 x float>* %input2, i64 %indvar
  %fVector4 = load <4 x float>* %input_ptr2, align 16
  %fVector5 = fadd <4 x float> %fVector4, %fVector3

  ; outputs[0][i] = %fVector5 * %fVector2
  %fVector6 = fmul <4 x float> %fVector5, %fVector2
  store <4 x float> %fVector6, <4 x float>* %output_ptr0, align 16

  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, %count_64
  br i1 %exitcond, label %return, label %loop0

return:
  ret void
}

Then running `llc -O3 plus_vec.ll` gives:

## ---------------------------------------------------------------------
## _llvm_compute -- x86-64 code emitted by llc for @llvm_compute
## Syntax: AT&T (source, destination operand order).
## Arguments as used below (matches System V AMD64 integer-argument order
## for the IR signature (obj, count, inputs, outputs)):
##   %esi = count, %rdx = inputs, %rcx = outputs; %rdi (obj) is never read
##   as an argument and is reused as a scratch pointer.
## Per iteration, on 4 packed floats:
##   outputs[0][i] = (inputs[2][i] + inputs[3][i]) * (inputs[0][i] + inputs[1][i])
## Uses movaps and memory-operand addps, so all buffers must be 16-byte
## aligned. Clobbers rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags.
## ---------------------------------------------------------------------
_llvm_compute: ## @llvm_compute

## BB#0: ## %entry

testl %esi, %esi ## signed test of count against zero

jle LBB8_3 ## count <= 0: nothing to do

## BB#1: ## %convert

movq (%rcx), %rax ## rax = outputs[0]

movq 24(%rdx), %rcx ## rcx = inputs[3]

movq 16(%rdx), %rdi ## rdi = inputs[2]

movq 8(%rdx), %r8 ## r8 = inputs[1]

movq (%rdx), %rdx ## rdx = inputs[0] (overwrites the inputs array pointer last)

xorl %r9d, %r9d ## r9 = 0: byte offset into every buffer

movl %esi, %esi ## 32-bit self-move zero-extends count into rsi (the IR zext)

.align 4, 0x90 ## pad the loop head with 0x90 (NOP) bytes; presumably 2^4 = 16-byte alignment on this target

LBB8_2: ## %loop0

## Loop Depth 1

## Loop Header

## Inner Loop

movaps (%rdx,%r9), %xmm0 ## xmm0 = inputs[0][i]

movaps (%rdi,%r9), %xmm1 ## xmm1 = inputs[2][i]

addps (%r8,%r9), %xmm0 ## xmm0 += inputs[1][i]

addps (%rcx,%r9), %xmm1 ## xmm1 += inputs[3][i]

mulps %xmm0, %xmm1 ## xmm1 = xmm1 * xmm0

movaps %xmm1, (%rax,%r9) ## outputs[0][i] = xmm1

addq $16, %r9 ## advance one <4 x float> (16 bytes)

decq %rsi ## one fewer element remaining

jne LBB8_2 ## loop until the remaining count reaches zero

LBB8_3: ## %return

ret

So the generated code seems correct.

Thanks

