Using the following code:
!> Micro-benchmark for func1d.
!>
!> Repeatedly calls func1d(a, b, n) and reports the CPU time for four layouts
!> of the second argument: a same-sized array, a larger array, an offset
!> section (c(2:)) and a stride-2 section (c(::2)).  After each run the result
!> is verified against the analytically expected value.
program bm
  use, intrinsic :: iso_fortran_env, only: dp => real64
  use module, only: func1d
  implicit none

  ! Number of elements processed per func1d call.
  integer, parameter :: n = 4000

  real(dp) :: aa(n)
  real(dp) :: bb(n)
  real(dp) :: cc(2 * n)   ! twice as long so that cc(::2) still has n elements

  aa = 1.0_dp
  bb = 2.0_dp
  cc = 3.0_dp

  ! aa accumulates across the runs; do_bench derives its expected value from
  ! the current contents of aa, so the runs do not need to be reset in between.
  call do_bench("a + b", aa, bb, n)
  call do_bench("a + c", aa, cc, n)
  call do_bench("a + c(2:)", aa, cc(2:), n)
  call do_bench("a + c(::2)", aa, cc(::2), n)

contains

  !> Time `loops` calls of func1d(aa, bb, nelem), verify the result and print
  !> the elapsed CPU time.
  !>
  !> msg   : label printed in front of the timing
  !> aa    : accumulator, updated in place by func1d
  !> bb    : addend; may be a non-contiguous section
  !> nelem : number of leading elements func1d processes
  !>
  !> The verification assumes the first nelem elements of aa all hold the same
  !> value on entry, and likewise for bb (true for every call made above).
  subroutine do_bench(msg, aa, bb, nelem)
    character(len=*), intent(in)    :: msg
    real(dp),         intent(inout) :: aa(:)
    real(dp),         intent(in)    :: bb(:)
    integer,          intent(in)    :: nelem

    integer, parameter :: loops = 250000

    real(dp) :: time_start, time_end, elapsed
    real(dp) :: expect, check
    integer  :: i

    ! Every element receives bb(1) once per iteration.
    expect = aa(1) + loops * bb(1)

    call cpu_time(time_start)
    do i = 1, loops
      call func1d(aa, bb, nelem)
    end do
    call cpu_time(time_end)
    elapsed = time_end - time_start

    check = checksum(aa, nelem)
    ! Exact comparison is intentional: all values involved are integers far
    ! below 2**53, so they are represented exactly in real64.
    if (check /= expect * nelem) then
      print *, msg, ": Checksum mismatch for ", check, " Expect:", nelem * expect
    end if

    print "(A12, F8.5, A2)", msg, elapsed, " s"
  end subroutine do_bench

  !> Sum of the first nelem elements of arr.
  pure function checksum(arr, nelem) result(total)
    real(dp), intent(in) :: arr(:)
    integer,  intent(in) :: nelem
    real(dp) :: total

    total = sum(arr(1:nelem))
  end function checksum

end program bm
With module.f90 containing this:
!> Add the first n elements of b onto a, in place: a(i) = a(i) + b(i).
!>
!> Both array dummies are assumed-shape and deliberately NOT declared
!> contiguous, so callers may pass strided sections such as c(::2).
!> The explicit element loop is the kernel being benchmarked; it is kept as a
!> loop on purpose rather than rewritten as a whole-array expression.
!>
!> a : accumulator, updated in place
!> b : addend, read only
!> n : number of leading elements to process; no bounds check is performed
!>     here, so n must not exceed the extent of a or b
subroutine func1d(a, b, n)
  use, intrinsic :: iso_fortran_env, only: real64
  implicit none
  real(real64), intent(inout) :: a(:)
  real(real64), intent(in)    :: b(:)
  integer,      intent(in)    :: n
  integer :: i

  do i = 1, n
    a(i) = a(i) + b(i)
  end do
end subroutine func1d
I get the following results when compiling with -Ofast -fno-loop-versioning:
$ ./bm1d-noloop
a + b 0.61709 s
a + c 0.58340 s
a + c(2:) 0.58243 s
a + c(::2) 0.63377 s
And with loop versioning enabled (leaving out the -fno-loop-versioning and using -Ofast):
$ ./bm1d-loop
a + b 0.37326 s
a + c 0.36045 s
a + c(2:) 0.36667 s
a + c(::2) 0.63324 s
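That’s roughly a 38% reduction in run time (about a 1.6× speed-up) for the three contiguous cases; the strided `a + c(::2)` case is unchanged.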
The efficiency goes down with larger array sizes, because the L1 cache hit ratio goes down.