Clang/LLVM doesn't emit IR for CUDA template kernel


I’m trying to generate LLVM IR from CUDA C++. This works fine until templates come into play. Using a template device function within a global function works well: the specific instantiations of the template function are generated. However, forwarding a template parameter from the kernel launch code to the device function somehow breaks the transformation process, and an empty .ll file is emitted. It looks like the kernel template is not instantiated at all. Doing the instantiation by hand works again.

The code used:

// Hand-written stand-ins for the CUDA declarations this example relies on.
// They are only active when __CUDACC__ is not defined.
#ifndef __CUDACC__

#include <stddef.h>

// Spell the CUDA qualifiers as attributes.
#define __constant__ __attribute__((constant))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))

// Three-component launch dimension; y and z default to 1.
struct dim3 {
  unsigned x, y, z;
  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
};

typedef struct cudaStream *cudaStream_t;

// Declaration only; no definition is provided in this snippet.
int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                      cudaStream_t stream = 0);

#endif


// Device helper template. All three arguments are taken by reference and
// updated in place; the return value is the updated `a` converted to int.
template <typename T>
__device__ int blubblub(T& a, float& b, double& c)
{
  a = a * b;
  b = b - c;
  c = a * c;  // uses the already-updated value of a
  return a;
}


// Kernel template: forwards its template parameter T to blubblub.
// The value returned by blubblub is stored but not used afterwards.
template <typename T>
__global__ void kernel(T a, float b, double c)
{
  int result = blubblub(a, b, c);
}

// Launches the kernel template with dim3(1) for both grid and block
// (y and z default to 1). T is deduced as int from the literal 5.
int main()
{
  kernel<<<dim3(1), dim3(1)>>>(5, 0.7f, 12.34);
  return 0;
}

The command line to compile:

clang++ -x cuda -S -emit-llvm -target nvptx64 -Xclang -fcuda-is-device -o

clang version 3.5 (trunk 200831)

Any help to fix this problem is highly appreciated.

Michael Haidl