Hi Jeroen,
Sorry, I missed that. I tried the patch, and this program:
#include <stdint.h>
/*
 * Qualifier that places the pointee type in address space 1, using the
 * compiler's address_space attribute. The attribute keyword must be spelled
 * __attribute__; a bare `attribute(...)` is not valid C and does not compile.
 */
#define __remote __attribute__((address_space(1)))
/* Global pointers to int in address space 1; main() reads both of them. */
__remote int* A;
__remote int* B;
/*
 * Element-wise in-place add: a[i] += b[i] for every i in [0, n).
 *
 * a, b: __remote-qualified int pointers, both __restrict-qualified, i.e. the
 *       caller promises that the two do not alias each other.
 * n:    number of elements to process; the loop body does not run when n <= 0.
 */
void vec_add(__remote int* __restrict a,
__remote int* __restrict b,
int n) {
/* Request that the compiler unroll the following loop by a factor of 4. */
#pragma unroll 4
for(int i=0; i<n; ++i) {
a[i] += b[i];
}
}
/*
 * Performs the same a[i] += b[i] update as vec_add, but inline and with a
 * fixed trip count of 4, on __restrict-qualified local copies of the globals
 * A and B. argc and argv are unused. Always returns 0.
 */
int main(int argc, char** argv) {
/* Local __restrict copies of the global pointers. */
__remote int* __restrict a = A;
__remote int* __restrict b = B;
/* Unroll factor equals the trip count (4). */
#pragma unroll 4
for(int i=0; i<4; ++i) {
a[i] += b[i];
}
return 0;
}
`vec_add` gives the following schedule:
*** Final schedule for %bb.8 ***
SU(0): %33:gpr = LW %56:gpr, -8 :: (load 4 from %ir.scevgep8, !tbaa !14, !noalias !13, addrspace 1)
SU(1): %34:gpr = LW %55:gpr, -8 :: (load 4 from %ir.scevgep14, !tbaa !14, !noalias !13, addrspace 1)
SU(4): %36:gpr = LW %56:gpr, -4 :: (load 4 from %ir.scevgep10, !tbaa !14, !noalias !13, addrspace 1)
SU(5): %37:gpr = LW %55:gpr, -4 :: (load 4 from %ir.scevgep16, !tbaa !14, !noalias !13, addrspace 1)
SU(8): %39:gpr = LW %56:gpr, 0 :: (load 4 from %ir.lsr.iv6, !tbaa !14, !noalias !13, addrspace 1)
SU(9): %40:gpr = LW %55:gpr, 0 :: (load 4 from %ir.lsr.iv12, !tbaa !14, !noalias !13, addrspace 1)
SU(12): %42:gpr = LW %56:gpr, 4 :: (load 4 from %ir.scevgep9, !tbaa !14, !noalias !13, addrspace 1)
SU(13): %43:gpr = LW %55:gpr, 4 :: (load 4 from %ir.scevgep15, !tbaa !14, !noalias !13, addrspace 1)
SU(2): %35:gpr = nsw ADD %34:gpr, %33:gpr
SU(3): SW %35:gpr, %55:gpr, -8 :: (store 4 into %ir.scevgep14, !tbaa !14, !noalias !13, addrspace 1)
SU(6): %38:gpr = nsw ADD %37:gpr, %36:gpr
SU(7): SW %38:gpr, %55:gpr, -4 :: (store 4 into %ir.scevgep16, !tbaa !14, !noalias !13, addrspace 1)
SU(10): %41:gpr = nsw ADD %40:gpr, %39:gpr
SU(11): SW %41:gpr, %55:gpr, 0 :: (store 4 into %ir.lsr.iv12, !tbaa !14, !noalias !13, addrspace 1)
SU(14): %44:gpr = nsw ADD %43:gpr, %42:gpr
SU(15): SW %44:gpr, %55:gpr, 4 :: (store 4 into %ir.scevgep15, !tbaa !14, !noalias !13, addrspace 1)
SU(16): %57:gpr = nuw nsw ADDI %57:gpr, 4
SU(17): %56:gpr = ADDI %56:gpr, 16
SU(18): %55:gpr = ADDI %55:gpr, 16
And `main` gives the following schedule:
*** Final schedule for %bb.0 ***
SU(0): %2:gpr = LUI target-flags(riscv-hi) @A
SU(2): %4:gpr = LUI target-flags(riscv-hi) @B
SU(3): %5:gpr = LW %4:gpr, target-flags(riscv-lo) @B :: (dereferenceable load 4 from @B, !tbaa !9, !noalias !22)
SU(1): %3:gpr = LW %2:gpr, target-flags(riscv-lo) @A :: (dereferenceable load 4 from @A, !tbaa !9, !noalias !22)
SU(4): %6:gpr = LW %5:gpr, 0 :: (load 4 from %ir.3, !tbaa !14, !noalias !22, addrspace 1)
SU(5): %7:gpr = LW %3:gpr, 0 :: (load 4 from %ir.1, !tbaa !14, !noalias !22, addrspace 1)
SU(6): %8:gpr = nsw ADD %7:gpr, %6:gpr
SU(7): SW %8:gpr, %3:gpr, 0 :: (store 4 into %ir.1, !tbaa !14, !noalias !22, addrspace 1)
SU(8): %9:gpr = LW %5:gpr, 4 :: (load 4 from %ir.arrayidx.1, !tbaa !14, !noalias !22, addrspace 1)
SU(9): %10:gpr = LW %3:gpr, 4 :: (load 4 from %ir.arrayidx1.1, !tbaa !14, !noalias !22, addrspace 1)
SU(10): %11:gpr = nsw ADD %10:gpr, %9:gpr
SU(11): SW %11:gpr, %3:gpr, 4 :: (store 4 into %ir.arrayidx1.1, !tbaa !14, !noalias !22, addrspace 1)
SU(12): %12:gpr = LW %5:gpr, 8 :: (load 4 from %ir.arrayidx.2, !tbaa !14, !noalias !22, addrspace 1)
SU(13): %13:gpr = LW %3:gpr, 8 :: (load 4 from %ir.arrayidx1.2, !tbaa !14, !noalias !22, addrspace 1)
SU(14): %14:gpr = nsw ADD %13:gpr, %12:gpr
SU(15): SW %14:gpr, %3:gpr, 8 :: (store 4 into %ir.arrayidx1.2, !tbaa !14, !noalias !22, addrspace 1)
SU(16): %15:gpr = LW %5:gpr, 12 :: (load 4 from %ir.arrayidx.3, !tbaa !14, !noalias !22, addrspace 1)
SU(17): %16:gpr = LW %3:gpr, 12 :: (load 4 from %ir.arrayidx1.3, !tbaa !14, !noalias !22, addrspace 1)
SU(18): %17:gpr = nsw ADD %16:gpr, %15:gpr
SU(20): $x10 = COPY $x0
SU(19): SW %17:gpr, %3:gpr, 12 :: (store 4 into %ir.arrayidx1.3, !tbaa !14, !noalias !22, addrspace 1)
This is great! The memory accesses are marked noalias. I wanted the memory accesses to be annotated as noalias to remove loop-carried dependencies, so that I can reorder them for efficient scheduling. But when I look at the scheduling DAG, the two functions behave differently.
For `vec_add` I see something like this (note BotQ.A: the scheduler can choose any of those nodes, so there is no loop-carried dependence):
- Latency limited.
** ScheduleDAGMILive::schedule picking next node
Queue BotQ.P:
Queue BotQ.A: 16 15 11 7 3
Cand SU(16) ORDER
Pick Bot ORDER
For main, at best I see something like this:
** ScheduleDAGMILive::schedule picking next node
Cycle: 45 BotQ.A
Queue BotQ.P:
Queue BotQ.A: 12 13
Cand SU(12) ORDER
Cand SU(13) ORDER
In theory, the schedules for `vec_add` and `main` should be the same, right? Is there anything else I should do to make `__restrict` remove the loop-carried dependence in `main`?
Attaching IR and scheduler log for reference…

tmp.log (78.1 KB)
tmp.ll (10.7 KB)