Why does clang generate an x87 floating-point instruction?

I’m curious about the x87 floating-point instruction (`fldl`) that clang generates at the end of the function below.

I reckon somebody got confused.

Ubuntu clang version 15.0.7
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

clang++-15 -S tmp.ll -march=native -O3

        # Body of multiply(ptr %0, ptr %1); on this target %rdi holds the first
        # pointer argument and %rsi the second.
        # Broadcast element 0 of the second operand into both halves of %xmm1.
        vmovddup        (%rsi), %xmm1                   # xmm1 = mem[0,0]
        # Broadcast element 0 of the first operand into both halves of %xmm2.
        vmovddup        (%rdi), %xmm2                   # xmm2 = mem[0,0]
        # Result element 0: product of the two element-0 values, left in %xmm0.
        vmulsd  %xmm1, %xmm2, %xmm0
        # Elements 1 and 2 of the first operand, each times element 0 of the second.
        vmulpd  8(%rdi), %xmm1, %xmm1
        # Elements 1 and 2 of the second operand, each times element 0 of the first.
        vmulpd  8(%rsi), %xmm2, %xmm2
        # %xmm1 now holds result elements 1 (low half) and 2 (high half).
        vaddpd  %xmm2, %xmm1, %xmm1
        # Spill result element 2 (high half of %xmm1) to a slot just below %rsp ...
        vmovhpd %xmm1, -8(%rsp)
        # ... and reload it onto the x87 register stack, i.e. into st(0).
        fldl    -8(%rsp)
        # At this point the three doubles sit in %xmm0, the low half of %xmm1, and st(0).
        retq

From my C++ code I’m getting the following IR code:

; ModuleID = 'example'
source_filename = "example"

%DualNumber = type { [3 x double] }

; Multiplies the two %DualNumber values pointed to by %0 and %1 and returns the
; product by value. Result element 0 is x[0] * y[0]; result elements 1 and 2
; are x[0] * y[i] + x[i] * y[0], where x is *%0 and y is *%1.
define %DualNumber @multiply(ptr %0, ptr %1) {
entry:
  ; %3 = x[0]
  %2 = load %DualNumber, ptr %0, align 8
  %3 = extractvalue %DualNumber %2, 0, 0
  ; %5 = y[0]
  %4 = load %DualNumber, ptr %1, align 8
  %5 = extractvalue %DualNumber %4, 0, 0
  ; %6 = x[0] * y[0]
  %6 = fmul double %3, %5
  ; %7 is the stack slot in which the result is assembled; store element 0.
  %7 = alloca %DualNumber, align 8
  %8 = getelementptr %DualNumber, ptr %7, i32 0, i32 0, i32 0
  store double %6, ptr %8, align 8
  ; %9 is the loop counter, kept in memory and initialised to 1.
  %9 = alloca i32, align 4
  store i32 1, ptr %9, align 4
  br label %cond

cond:                                             ; preds = %body, %entry
  ; Run the body while the counter is (unsigned) less than 3.
  %10 = load i32, ptr %9, align 4
  %11 = icmp ult i32 %10, 3
  br i1 %11, label %body, label %end

body:                                             ; preds = %cond
  ; %12 = address of result element %10
  %12 = getelementptr %DualNumber, ptr %7, i32 0, i32 0, i32 %10
  ; %15 = x[%10] * y[0]
  %13 = getelementptr %DualNumber, ptr %0, i32 0, i32 0, i32 %10
  %14 = load double, ptr %13, align 8
  %15 = fmul double %14, %5
  ; %18 = x[0] * y[%10]
  %16 = getelementptr %DualNumber, ptr %1, i32 0, i32 0, i32 %10
  %17 = load double, ptr %16, align 8
  %18 = fmul double %3, %17
  ; result[%10] = x[0] * y[%10] + x[%10] * y[0]
  %19 = fadd double %18, %15
  store double %19, ptr %12, align 8
  ; Increment the counter and re-test the loop condition.
  %20 = add i32 %10, 1
  store i32 %20, ptr %9, align 4
  br label %cond

end:                                              ; preds = %cond
  ; Return the whole three-double aggregate by value.
  %21 = load %DualNumber, ptr %7, align 8
  ret %DualNumber %21
}

which in turn was generated by the following C++ code:

#include <iostream>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"

using namespace llvm;

/// Number of doubles in the array wrapped by the DualNumber struct; also used
/// as the exclusive upper bound of the loop emitted into `multiply`.
static constexpr std::size_t SIZE = 3;

int main(int, char**) 
{
	LLVMContext sContext;
	Module sModule("example", sContext);
	IRBuilder<> sBuilder(sContext);
	const auto pDualNumberType = StructType::create(
		sContext,
		{	ArrayType::get(Type::getDoubleTy(sContext), SIZE)
		},
		"DualNumber"
	);
	// Define a function that multiplies two dual numbers.
	const auto pMultiplyType = FunctionType::get(
		pDualNumberType,
		{	pDualNumberType->getPointerTo(),
			pDualNumberType->getPointerTo()
		},
		false
	);
	const auto pFunction = Function::Create(
		pMultiplyType,
		Function::ExternalLinkage,
		"multiply",
		sModule
	);
	sBuilder.SetInsertPoint(
		BasicBlock::Create(
			sContext,
			"entry",
			pFunction
		)
	);
	const auto pX = pFunction->args().begin();
	const auto pXValue = sBuilder.CreateExtractValue(
		sBuilder.CreateLoad(
			pDualNumberType,
			pX
		),
		{	0,
			0
		}
	);
	const auto pY = std::next(pFunction->args().begin());
	const auto pYValue = sBuilder.CreateExtractValue(
		sBuilder.CreateLoad(
			pDualNumberType,
			pY
		),
		{	0,
			0
		}
	);
	const auto pRetValue = sBuilder.CreateFMul(pXValue, pYValue);
	const auto pResult = sBuilder.CreateAlloca(pDualNumberType);
	const auto pZero = ConstantInt::get(Type::getInt32Ty(sContext), 0);
	sBuilder.CreateStore(
		pRetValue,
		sBuilder.CreateGEP(
			pDualNumberType,
			pResult,
			{	pZero,
				pZero,
				pZero,
			}	
		)
	);
		/// loop variable
	const auto i = sBuilder.CreateAlloca(Type::getInt32Ty(sContext));
		/// initialization of the loop variable
	sBuilder.CreateStore(ConstantInt::get(Type::getInt32Ty(sContext), 1), i);
	const auto pConditionBlock = BasicBlock::Create(sContext, "cond", pFunction);
	const auto pLoopBody = BasicBlock::Create(sContext, "body", pFunction);
	//const auto incBB = BasicBlock::Create(sContext, "inc", pFunction);
	const auto pLoopExit = BasicBlock::Create(sContext, "end", pFunction);
	sBuilder.CreateBr(pConditionBlock);
	sBuilder.SetInsertPoint(pConditionBlock);
	const auto iVal = sBuilder.CreateLoad(Type::getInt32Ty(sContext), i);
	sBuilder.CreateCondBr(
		sBuilder.CreateICmpULT(iVal, ConstantInt::get(Type::getInt32Ty(sContext), SIZE)),
		pLoopBody,
		pLoopExit
	);
	sBuilder.SetInsertPoint(pLoopBody);
	sBuilder.CreateStore(
		sBuilder.CreateFAdd(
			sBuilder.CreateFMul(
				pXValue,
				sBuilder.CreateLoad(Type::getDoubleTy(sContext), sBuilder.CreateGEP(pDualNumberType, pY, {pZero, pZero, iVal}))
			),
			sBuilder.CreateFMul(
				sBuilder.CreateLoad(Type::getDoubleTy(sContext), sBuilder.CreateGEP(pDualNumberType, pX, {pZero, pZero, iVal})),
				pYValue
			)
		),
		sBuilder.CreateGEP(
			pDualNumberType,
			pResult,
			{	pZero,
				pZero,
				iVal
			}
		)
	);
	sBuilder.CreateStore(
		sBuilder.CreateAdd(iVal, ConstantInt::get(Type::getInt32Ty(sContext), 1)),
		i
	);
	sBuilder.CreateBr(pConditionBlock);
	sBuilder.SetInsertPoint(pLoopExit);
	sBuilder.CreateRet(sBuilder.CreateLoad(pDualNumberType, pResult));
	verifyFunction(*pFunction);
	sModule.print(outs(), nullptr);
}

The reason is that large objects cannot be returned directly in registers; instead, they are passed back through an additional pointer parameter, which the caller supplies as the first parameter.

I think what is happening is that the x86-64 ABI handling in the backend partially inherits the x86-32 handling. x86-64 only defines xmm0 and xmm1 for returning doubles. Once we ran out of those registers, we fell back to the x86-32 ABI and started using x87 registers, as that ABI does.

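clang itself never generates IR that directly returns a struct of 3 doubles; for a struct of that size it emits a hidden pointer parameter for the return value instead, so this backend fallback is only reached by hand-written or custom-generated IR like the above.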