Hola LLVMers,
I’m working on engaging SSE via the LLVM vector ops on x86. I had some questions a while back that you all helped out on, but I’m seeing similar issues and was hoping you’d have some ideas. Below is the dump of the LLVM IR of a program which is designed to take a vector stored in a float*, build an LLVM vector from it, copy it to another vector, and then take it apart and store it back out in another float*. This will live on the boundary of our system and would be a function designed to promote a raw, potentially unaligned, value into a vector that the LLVM system can work with a whole bunch.
It is dying while trying to store our working vector into one of the LLVM vectors created on the stack. Despite the align-16 directive on the alloca instruction, the stack slot is not always aligned to a 16-byte boundary.
I did a sync and build this morning, so my LLVM is quite fresh.
Thank you for any help!
Chuck.
My program:
; NOTE(review): this layout string starts with 'E' (big-endian in LLVM's
; datalayout syntax) and has no vector alignment entry -- confirm that this is
; what is intended for an x86 target.
target datalayout = “E-p:32:32:32-i1:8:8:8-i8:8:8:8-i32:32:32:32-f32:32:32:32”
; promoteCopyAndReturn: reads four consecutive floats from %toPromote, packs
; them into a <4 x float>, round-trips that vector through two stack slots
; (both requested with align 16), then writes the four lanes out to
; %promoteReturn one float at a time.
;   %promoteReturn - destination; elements 0..3 are written with scalar stores.
;   %toPromote     - source; elements 0..3 are read with scalar loads.
define void @promoteCopyAndReturn(float* %promoteReturn, float* %toPromote) {
Entry:
; Two stack slots for <4 x float> values, each requesting 16-byte alignment.
%Promoted_promoteReturn_Ptr = alloca <4 x float>, align 16 ; <<4 x float>*> [#uses=2]
%Promoted_toPromote_Ptr = alloca <4 x float>, align 16 ; <<4 x float>*> [#uses=2]
; Build the vector lane by lane: scalar load of element i, then insertelement
; into lane i, starting from undef.
%elemPtr = getelementptr float* %toPromote, i32 0 ; <float*> [#uses=1]
%elemLoaded = load float* %elemPtr ; [#uses=1]
%vectorPromotion = insertelement <4 x float> undef, float %elemLoaded, i32 0 ; <<4 x float>> [#uses=1]
%elemPtr1 = getelementptr float* %toPromote, i32 1 ; <float*> [#uses=1]
%elemLoaded2 = load float* %elemPtr1 ; [#uses=1]
%vectorPromotion3 = insertelement <4 x float> %vectorPromotion, float %elemLoaded2, i32 1 ; <<4 x float>> [#uses=1]
%elemPtr4 = getelementptr float* %toPromote, i32 2 ; <float*> [#uses=1]
%elemLoaded5 = load float* %elemPtr4 ; [#uses=1]
%vectorPromotion6 = insertelement <4 x float> %vectorPromotion3, float %elemLoaded5, i32 2 ; <<4 x float>> [#uses=1]
%elemPtr7 = getelementptr float* %toPromote, i32 3 ; <float*> [#uses=1]
%elemLoaded8 = load float* %elemPtr7 ; [#uses=1]
%vectorPromotion9 = insertelement <4 x float> %vectorPromotion6, float %elemLoaded8, i32 3 ; <<4 x float>> [#uses=1]
; Whole-vector store of the assembled value into the first stack slot. The
; store carries no explicit alignment of its own; only the alloca asks for 16.
store <4 x float> %vectorPromotion9, <4 x float>* %Promoted_toPromote_Ptr <<<<<<<<-------- dying when it executes this line (assembly below)
%toPromote10 = load <4 x float>* %Promoted_toPromote_Ptr ; <<4 x float>> [#uses=1]
br label %Body
Body: ; preds = %Entry
; The "work" step: copy the promoted input vector into the result slot.
store <4 x float> %toPromote10, <4 x float>* %Promoted_promoteReturn_Ptr
br label %Exit
Exit: ; preds = %Body
; Demote: reload the result vector, then extract each lane and store it to
; the matching element of %promoteReturn.
%vectorToDemote = load <4 x float>* %Promoted_promoteReturn_Ptr ; <<4 x float>> [#uses=4]
%elemToDemote = extractelement <4 x float> %vectorToDemote, i32 0 ; [#uses=1]
%elemPtr11 = getelementptr float* %promoteReturn, i32 0 ; <float*> [#uses=1]
store float %elemToDemote, float* %elemPtr11
%elemToDemote12 = extractelement <4 x float> %vectorToDemote, i32 1 ; [#uses=1]
%elemPtr13 = getelementptr float* %promoteReturn, i32 1 ; <float*> [#uses=1]
store float %elemToDemote12, float* %elemPtr13
%elemToDemote14 = extractelement <4 x float> %vectorToDemote, i32 2 ; [#uses=1]
%elemPtr15 = getelementptr float* %promoteReturn, i32 2 ; <float*> [#uses=1]
store float %elemToDemote14, float* %elemPtr15
%elemToDemote16 = extractelement <4 x float> %vectorToDemote, i32 3 ; [#uses=1]
%elemPtr17 = getelementptr float* %promoteReturn, i32 3 ; <float*> [#uses=1]
store float %elemToDemote16, float* %elemPtr17
ret void
}
Assembler (intel format):
15c00010 83ec2c sub esp,2Ch
15c00013 8b442434 mov eax,dword ptr [esp+34h]
15c00017 f30f10400c movss xmm0,dword ptr [eax+0Ch]
15c0001c f30f104804 movss xmm1,dword ptr [eax+4]
15c00021 0f14c8 unpcklps xmm1,xmm0
15c00024 f30f104008 movss xmm0,dword ptr [eax+8]
15c00029 f30f1010 movss xmm2,dword ptr [eax]
15c0002d 0f14d0 unpcklps xmm2,xmm0
15c00030 0f14d1 unpcklps xmm2,xmm1
15c00033 0f291424 movaps xmmword ptr [esp],xmm2 ss:0023:0012f238=0012f2580122ef730000000100000000
The relevant registers:
Xmm2 8.000000e+000: 4.000000e+000: 2.000000e+000: 1.000000e+000 // the vector got nicely constructed
Esp 12f238 // but it has no place to go: esp is not 16-byte aligned (0x12f238 ends in 8), so the movaps to [esp] throws a general-protection exception.