RFC Loop Versioning for unit stride

Using the following code:

program bm
  ! Micro-benchmark for func1d (a(i) = a(i) + b(i)).
  !
  ! The same kernel is timed with four different actual arguments for the
  ! second operand: two whole arrays, a contiguous section with an offset,
  ! and a stride-2 section.  Every call goes through assumed-shape dummies,
  ! so func1d itself cannot tell at compile time whether its operands have
  ! unit stride.
  use module, only: func1d

  implicit none
  integer, parameter :: n_elems = 4000
  real*8 :: aa(n_elems)
  real*8 :: bb(n_elems)
  real*8 :: cc(n_elems * 2)   ! twice as long so that cc(::2) still has n_elems elements

  aa = 1
  bb = 2
  cc = 3

  ! NOTE: aa is updated in place by every benchmark, so each run starts from
  ! the values left behind by the previous one.  do_bench derives its expected
  ! result from the current contents of aa, so this is harmless.
  call do_bench("a + b", aa, bb, n_elems)
  call do_bench("a + c", aa, cc, n_elems)
  call do_bench("a + c(2:)", aa, cc(2:), n_elems)
  call do_bench("a + c(::2)", aa, cc(::2), n_elems)

contains

  ! Call func1d(aa, bb, n) `loops` times, verify the accumulated result and
  ! print the elapsed CPU time.
  !
  !   msg : label printed in front of the timing (and of any mismatch report)
  !   aa  : accumulator array, updated in place by func1d
  !   bb  : array added to aa on every iteration; may be a strided section
  !   n   : number of leading elements to process and to check
  !
  ! The verification assumes that all of aa(1:n) hold the same value on entry
  ! and that all of bb(1:n) hold the same value, which is how the main program
  ! initialises them.
  subroutine do_bench(msg, aa, bb, n)
    character(*), intent(in)    :: msg
    real*8,       intent(inout) :: aa(1:)
    real*8,       intent(in)    :: bb(1:)
    integer,      intent(in)    :: n

    integer, parameter :: loops = 250000

    real*8  :: time, time_start, time_end
    real*8  :: expect         ! expected value of each element after the loop
    real*8  :: expect_total   ! expected sum over aa(1:n)
    real*8  :: check
    integer :: i

    ! Must be computed before the timed loop, while aa still holds its
    ! starting value.
    expect = aa(1) + loops * bb(1)
    expect_total = expect * n

    call cpu_time(time_start)
    do i = 1, loops
       call func1d(aa, bb, n)
    end do
    call cpu_time(time_end)
    time = time_end - time_start

    check = checksum(aa, n)
    ! Exact comparison is intentional: with the integer-valued inputs used by
    ! this program every intermediate result is exactly representable.
    if (check /= expect_total) then
       print *, msg, ": Checksum mismatch got ",  check, " Expect:", expect_total
    end if
    print "(A12, F8.5, A2)", msg, time, " s"
  end subroutine do_bench

  ! Return the sum of arr(1:n), accumulated in index order.
  function checksum(arr, n) result(total)
    real*8,  intent(in) :: arr(1:)
    integer, intent(in) :: n
    real*8  :: total
    integer :: i

    total = 0
    do i = 1, n
       total = total + arr(i)
    end do
  end function checksum

end program bm

With module.f90 containing this:

  ! Add the first n elements of b to the first n elements of a, in place.
  !
  !   a : array updated in place, a(i) = a(i) + b(i) for i = 1..n
  !   b : array that is added; only read
  !   n : number of leading elements to process
  !
  ! Both arrays are deliberately plain assumed-shape dummies WITHOUT the
  ! `contiguous` attribute, and the update is written as an explicit do loop:
  ! callers pass strided sections as well as whole arrays, and this loop is
  ! the one the benchmark measures with and without loop versioning.
  subroutine func1d(a, b, n)
    implicit none
    real*8,  intent(inout) :: a(:)
    real*8,  intent(in)    :: b(:)
    integer, intent(in)    :: n
    integer :: i

    do i = 1, n
       a(i) = a(i) + b(i)
    end do
  end subroutine func1d

I get the following results when compiling with -Ofast -fno-loop-versioning:

$ ./bm1d-noloop 
       a + b 0.61709 s
       a + c 0.58340 s
   a + c(2:) 0.58243 s
  a + c(::2) 0.63377 s

And with loop versioning enabled (leaving out the -fno-loop-versioning and using -Ofast):

$ ./bm1d-loop 
       a + b 0.37326 s
       a + c 0.36045 s
   a + c(2:) 0.36667 s
  a + c(::2) 0.63324 s

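That’s roughly a 38% reduction in run time (about a 1.6x speedup) for the three unit-stride cases; the strided `c(::2)` case is unchanged, as expected.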

The benefit shrinks as the array size grows, because the L1 cache hit ratio goes down.

1 Like