Hi,
I compiled the following code with LLVM 18, and an illegal instruction was encountered at run time. With LLVM 19, the compiler itself fails with an ICE (internal compiler error) when compiling the same code.
The compile options are:
-march=armv8.6-a+sve+sve2+sme -rtlib=compiler-rt -fopenmp
#include <arm_sve.h>
#include <iostream>
#include <arm_sme.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <pthread.h>
// Matrix dimensions passed to gemmSME() from main(); they also size the
// buffers allocated in main().
const int M = 3456;
const int N = 3456;
const int K = 3456;
// Alignment, in bytes, handed to aligned_alloc() in main().
const int align_len = 64;
const int c_align_len = 64;
// Blocking factors; not referenced anywhere in the code shown here.
const int blk_sizei = 128;
const int blk_sizej = 64;
const int blk_sizek = 128;
// Reduced SME kernel used as the reproducer: it only loads part of matC into
// the ZA array from inside an OpenMP parallel loop.
//
// The function is a streaming function (__arm_streaming) that creates fresh
// ZA state (__arm_new("za")). Of its parameters, only matC and N are read in
// the body; matA, matB, M, K and alpha are unused here. The parameters M, N
// and K shadow the file-scope constants of the same names.
__attribute__((noinline)) __arm_new("za") void gemmSME(double* matA,
double* matB, double* matC, int M, int N, int K, double alpha) __arm_streaming
{
/* 2x4 tiles*/
// Predicate over 64-bit lanes, active for lane indices 0..7 (while i < 8).
svbool_t pci = svwhilelt_b64_u32(0, 8);
// The loop has 8 iterations but requests 32 threads. The compiler outlines
// the loop body into a separate function for the OpenMP runtime.
// NOTE(review): per the accompanying report, under LLVM 18 that outlined
// function lacked the streaming-mode attribute -- confirm against the
// generated IR.
#pragma omp parallel for num_threads(32)
for(int ci =0;ci< 8;ci++)
{
// Commented-out manual workaround: enable streaming mode and ZA.
//__asm__ volatile("smstart sm");
//__asm__ volatile("smstart za");
// Load row ci of matC (row stride N) into horizontal slice ci of ZA64 tiles
// 0..3; tile t starts at column 8*t of the row.
// NOTE(review): the 8*t column step presumably assumes 8 doubles per
// streaming vector (SVL = 512 bits) -- TODO confirm.
svld1_hor_za64(0, ci, pci, matC + (ci)*N + (8*0));
svld1_hor_za64(1, ci, pci, matC + (ci)*N + (8*1));
svld1_hor_za64(2, ci, pci, matC + (ci)*N + (8*2));
svld1_hor_za64(3, ci, pci, matC + (ci)*N + (8*3));
// Commented-out manual workaround: disable streaming mode and ZA again.
//__asm__ volatile("smstop sm");
//__asm__ volatile("smstop za");
}
}
// Driver for the reproducer: allocates the matrices and calls gemmSME() once.
// main itself is declared as a streaming function.
int main() __arm_streaming
{
// Six matrices of doubles, sized from M, N and K, with 64-byte alignment.
// The buffers are neither initialised nor freed, and the allocation results
// are not checked. A2, B2 and C2 are never used in the code shown here.
// NOTE(review): aligned_alloc is declared in <stdlib.h>/<cstdlib>, which is
// not included directly by this file -- presumably pulled in transitively.
double* A = (double*)aligned_alloc(align_len,sizeof(double) * M * K);
double* B = (double*)aligned_alloc(align_len,sizeof(double) * K * N);
double* C = (double*)aligned_alloc(c_align_len,sizeof(double) * M * N);
double* A2 = (double*)aligned_alloc(align_len,sizeof(double) * M * K);
double* B2 = (double*)aligned_alloc(align_len,sizeof(double) * K * N);
double* C2 = (double*)aligned_alloc(c_align_len,sizeof(double) * M * N);
// num_threads is unused in the code shown here; alpha is passed through to
// gemmSME().
int num_threads = 1;
double alpha = 1.0;
// SME gemm
gemmSME(A, B, C, M, N, K, alpha);
}
Under LLVM 18, I found that the illegal instruction occurred because the streaming-mode attribute was missing from the function that OpenMP outlines for the parallel loop. So I inserted handwritten assembly before and after the OpenMP loop to turn streaming mode on and then off again:
__asm__ volatile("smstart sm");
__asm__ volatile("smstart za");
and
__asm__ volatile("smstop sm");
__asm__ volatile("smstop za");
With these in place, the code executes correctly.
I am wondering whether anything is missing in the compiler's handling of SME when it is combined with OpenMP.