私のシステムでは、gfortranでコンパイルしたFortranコードよりも、(numpyを使ってBLAS演算を行う)Pythonコードの方が高速に動作することがわかりました。
$ gfortran -o sn6a sn6a.f90 -O3 -march=native
$ ./sn6a 5500
1.274224153
1.274224153
1.274224153
1.9640001 sec per iteration
$ python ./foo1.py
1.27422415279
1.27422415279
1.27422415279
1.20618661245 sec per iteration
foo1.py:
import numpy
import scipy.linalg
import timeit
def specNormDot(A, n):
    """Print a power-iteration estimate of the spectral norm of ``A``.

    Starting from a vector of ones, the product ``dot(dot(A, u), A)``
    (i.e. ``A^T (A u)``) is applied 20 times in total (10 loop passes, two
    applications per pass).  The printed value is
    ``sqrt(vdot(u, v) / vdot(v, v))`` for the last two iterates.

    Parameters
    ----------
    A : 2-D array with ``n`` rows and ``n`` columns.
    n : length of the start vector; must match the dimensions of ``A``.

    Returns
    -------
    None.  The estimate is written to standard output.
    """
    u = numpy.ones(n)
    # ``range`` and the parenthesised single-argument ``print`` behave the
    # same on Python 2 and Python 3, so the function runs on both.
    for _ in range(10):
        v = numpy.dot(numpy.dot(A, u), A)
        u = numpy.dot(numpy.dot(A, v), A)
    print(numpy.sqrt(numpy.vdot(u, v) / numpy.vdot(v, v)))
# Benchmark driver: build the n x n test matrix and time specNormDot on it.
n = 5500

# 1-based index grids; A[r, c] = 1 / ((ii+jj-2)*(ii+jj-1)/2 + ii).
ii, jj = numpy.meshgrid(numpy.arange(1, n + 1), numpy.arange(1, n + 1))
A = 1. / ((ii + jj - 2.) * (ii + jj - 1.) / 2. + ii)

# The timed statement pulls its names from __main__, so this block has to run
# as the main script.
t = timeit.Timer("specNormDot(A,n)", "from __main__ import specNormDot,A,n")
ntries = 3

# A single pre-formatted string prints the same text on Python 2 and 3
# (a Python 2 ``print a, b`` statement is a syntax error on Python 3).
print("%s sec per iteration" % (t.timeit(ntries) / ntries))
sn6a.f90(spectral_norm6.f90 をごくわずかに変更したもの):
program spectral_norm6
  ! Estimates the spectral norm of the n x n matrix
  !     A(i, j) = 1 / ((i+j-2)*(i+j-1)/2 + i)
  ! by power iteration on A^T A, repeats the computation niters times and
  ! prints the average wall-clock time per repetition.
  !
  ! Usage: sn6a n        (n = matrix dimension, a positive integer)
  !
  ! This uses spectral_norm3 as a starting point, but does not use Fortran's
  ! built-in matmul and dot_product (to make sure it does not call some
  ! optimized BLAS behind the scenes).
  use, intrinsic :: iso_fortran_env, only: error_unit
  implicit none

  integer, parameter :: dp = kind(0d0)
  integer, parameter :: niters = 3          ! timed repetitions
  integer, parameter :: power_steps = 10    ! passes per repetition (2 products each)

  real(dp), allocatable :: A(:, :), u(:), v(:)
  integer :: i, j, n
  character(len=32) :: argv                 ! wide enough for any default integer
  integer :: calc, iter
  integer :: arg_status, io_status, alloc_status

  ! Read and validate the matrix dimension.  A non-zero status means the
  ! argument is missing or did not fit into argv.
  call get_command_argument(1, argv, status=arg_status)
  if (arg_status /= 0) call usage_and_stop()
  n = 0
  read(argv, *, iostat=io_status) n
  if (io_status /= 0) call usage_and_stop()
  if (n < 1) call usage_and_stop()

  allocate(u(n), v(n), A(n, n), stat=alloc_status)
  if (alloc_status /= 0) then
    write(error_unit, '(a)') 'sn6a: could not allocate the work arrays'
    stop 1
  end if

  ! Fill A column by column (first index innermost: contiguous access).
  do j = 1, n
    do i = 1, n
      A(i, j) = Ac(i, j)
    end do
  end do

  call tick(calc)
  do iter = 1, niters
    u = 1.0_dp
    do i = 1, power_steps
      v = AvA(A, u)
      u = AvA(A, v)
    end do
    write(*, "(f0.9)") sqrt(dot_product2(u, v) / dot_product2(v, v))
  end do
  print *, tock(calc)/niters, ' sec per iteration'

contains

  ! Prints a usage message to the error unit and stops with exit code 1.
  subroutine usage_and_stop()
    write(error_unit, '(a)') 'usage: sn6a n   (n = matrix dimension, a positive integer)'
    stop 1
  end subroutine usage_and_stop

  ! Returns the matrix element 1 / ((i+j-2)*(i+j-1)/2 + i).
  pure real(dp) function Ac(i, j) result(r)
    integer, intent(in) :: i, j
    real(dp) :: s
    ! Evaluate in real(dp): the integer product (i+j-2)*(i+j-1) overflows a
    ! default integer once i+j exceeds about 46341.  The product is always
    ! even, so the real-valued result equals the integer one wherever the
    ! latter does not overflow.
    s = real(i + j, dp)
    r = 1._dp / ((s - 2._dp) * (s - 1._dp) / 2._dp + real(i, dp))
  end function Ac

  ! Calculates u = matmul(v, A), i.e. u(i) = sum over k of v(k) * A(k, i).
  ! Written out by hand instead of calling the intrinsic (see program header).
  ! v must have size(A, 1) elements; the result has size(A, 2) elements.
  pure function matmul2(v, A) result(u)
    real(dp), intent(in) :: v(:), A(:, :)
    real(dp) :: u(size(A, 2))
    integer :: i
    do i = 1, size(A, 2)
      u(i) = dot_product2(A(:, i), v)
    end do
  end function matmul2

  ! Calculates w = dot_product(u, v) with an explicit accumulation loop.
  ! v must have at least size(u) elements.
  pure real(dp) function dot_product2(u, v) result(w)
    real(dp), intent(in) :: u(:), v(:)
    integer :: i
    w = 0.0_dp
    do i = 1, size(u)
      w = w + u(i)*v(i)
    end do
  end function dot_product2

  ! Calculates u = matmul(A, v), i.e. u(i) = sum over j of A(i, j) * v(j).
  ! The loop over i is innermost so that A is traversed column by column.
  ! v must have size(A, 2) elements; the result has size(A, 1) elements.
  pure function matmul3(A, v) result(u)
    real(dp), intent(in) :: v(:), A(:, :)
    real(dp) :: u(size(A, 1))
    integer :: i, j
    u = 0.0_dp
    do j = 1, size(A, 2)
      do i = 1, size(A, 1)
        u(i) = u(i) + A(i, j)*v(j)
      end do
    end do
  end function matmul3

  ! Calculates u = matmul2(matmul3(A, v), A), i.e. u = A^T (A v).
  ! In gfortran, this function is slightly faster than calling
  ! matmul2(matmul3(A, v), A) directly.
  pure function AvA(A, v) result(u)
    real(dp), intent(in) :: v(:), A(:, :)
    real(dp) :: u(size(v))
    u = matmul2(matmul3(A, v), A)
  end function AvA

  ! Stores the current system_clock count in t.
  subroutine tick(t)
    integer, intent(out) :: t
    call system_clock(t)
  end subroutine tick

  ! Returns the time in seconds elapsed since the count t obtained from tick.
  ! Returns 0.0 if the processor reports no usable clock (count rate <= 0).
  real function tock(t)
    integer, intent(in) :: t
    integer :: now, clock_rate, clock_max, elapsed
    call system_clock(now, clock_rate, clock_max)
    if (clock_rate <= 0) then
      tock = 0.0
      return
    end if
    elapsed = now - t
    ! The count restarts at 0 after reaching clock_max; undo a single wrap.
    ! Adding clock_max first keeps the intermediate value within range.
    if (elapsed < 0) elapsed = (elapsed + clock_max) + 1
    tock = real(elapsed)/real(clock_rate)
  end function tock

end program spectral_norm6