2019年6月6日木曜日

Octave + cuFFT + OpenACC. Works with GCC-8


I succeeded in running cuFFT + OpenACC + Octave with GCC-8.
I am still waiting for a response from PGI.

$ sudo apt-get install gcc-8-offload-nvptx
$ sudo apt-get install g++-8


$ cat Makefile 
# Builds testFFTGPU.oct, an Octave extension that calls cuFFT on data managed
# by OpenACC, using GCC-8 with OpenACC offloading enabled (-fopenacc).

# Compile and link drivers. -fopenacc is given at both compile and link time.
CC= g++-8 -fopenacc
LD=g++-8 -fopenacc -L/usr/local/cuda-9.2/lib64
C_OPT=-fPIC
C_INC=-I/usr/local/cuda-9.2/include
OCT_INC=-I/usr/include/octave-4.2.2/octave/.. -I/usr/include/octave-4.2.2/octave
OCT_LIBS=-L/usr/lib/x86_64-linux-gnu/octave/4.2.2 -loctinterp -loctave
# Libraries must come AFTER the objects that reference them: a linker that
# defaults to --as-needed (e.g. Ubuntu's) drops a library named before its
# users, leaving the cuFFT symbols unresolved when Octave loads the .oct file.
CUDA_LIBS=-lcufft

OBJS = \
       testFFTGPU.o

TARGET = testFFTGPU.oct

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:
# Disable the built-in suffix rules; the only compile rule is the one below.
.SUFFIXES:

# These names are commands, not files.
.PHONY: all run clean

all: $(TARGET)

# Compile one C++ source into a position-independent object.
%.o: %.cc
	$(CC) $(C_INC) $(OCT_INC) -c $(C_OPT) $< -o $@

# Link the objects into the shared object Octave loads as an oct-file.
$(TARGET): $(OBJS)
	$(LD) -shared -Wl,-Bsymbolic -o $@ $(OBJS) $(OCT_LIBS) $(CUDA_LIBS) -Wl,-Bsymbolic-functions

# Run the test script; build the extension first so it is never stale.
run: $(TARGET)
	./run.sh

clean:
	$(RM) *.o *.d *.mod $(TARGET) *~
	$(RM) work.pc work.pcl *.optrpt log.txt
	$(RM) *.gpu *.ptx *.oct

$ cat run.sh 
# Run the Octave test script with the CUDA runtime and cuFFT libraries preloaded.
# NOTE(review): the two libraries come from different PGI trees
# (linux86-64-nollvm vs linux86-64-llvm) -- confirm this mix is intended.
cudart_lib=/opt/pgi/linux86-64-nollvm/2019/cuda/9.2/lib64/libcudart.so.9.2
cufft_lib=/opt/pgi/linux86-64-llvm/2019/cuda/9.2/lib64/libcufft.so.9.2

LD_PRELOAD="${cudart_lib}:${cufft_lib}" octave callOct.m

$ cat callOct.m 

% Compare the GPU inverse 2-D FFT (testFFTGPU) with Octave's built-in ifft2
% on the same random 10x10 input; the difference is displayed as `ans`.
input_mat = rand(10,10);

gpu_result = testFFTGPU(input_mat);
ref_result = ifft2(input_mat);

gpu_result - ref_result

$ cat testFFTGPU.cc 
#include<math.h>
#include <octave/oct.h>
#include <octave/parse.h>
#include <complex>
#include <chrono>
#include <fftw3.h>
#include "openacc.h"
#include "cufft.h"

void inv_CUFFT(Complex *in_data, Complex *out_data, int nc, int nr, void *stream)
{
    cufftHandle plan;
    cufftResult ResPlan = cufftPlan2d(&plan, nc,nr, CUFFT_Z2Z);
    cufftSetStream(plan, (cudaStream_t)stream);
    cufftResult ResExec = cufftExecZ2Z(plan, 
                                      (cufftDoubleComplex*)in_data, 
                                      (cufftDoubleComplex*)out_data,
                                      CUFFT_INVERSE);
    cufftDestroy(plan);
}


// Octave entry point: out = testFFTGPU(mat)
// Converts the argument to a complex matrix, runs an inverse 2-D FFT on it
// through inv_CUFFT inside an OpenACC data region, divides the result by the
// number of elements, and returns it as a complex matrix.
DEFUN_DLD(testFFTGPU, args, ,
            "main body;")
{
   // Exactly one input is expected; reject anything else before touching args(0).
   if (args.length() != 1)
   {
      print_usage();
      return octave_value_list();
   }

   fprintf(stderr,"device type: %d\n", acc_get_device_type());
   fprintf(stderr,"Num devices: %d\n", acc_get_num_devices(acc_device_nvidia));

   ComplexMatrix Matrix(args(0).complex_matrix_value());
   octave_value_list retval;

   ComplexMatrix out(Matrix.dims());

   // Nothing to transform for an empty input; hand back the empty result.
   if (Matrix.numel() == 0)
   {
      retval(0) = out;
      return retval;
   }

   // View the complex buffers as interleaved (re, im) doubles so OpenACC can
   // map them as plain arrays.
   double *pmat = reinterpret_cast<double *> (Matrix.fortran_vec());
   double *pout = reinterpret_cast<double *> (out.fortran_vec());

   // Must NOT be static: the dimensions have to be re-read on every call,
   // otherwise a later call with a different size reuses the first call's.
   const dim_vector dv = Matrix.dims();

   // Octave storage is column-major: the row index (dv(0)) is contiguous and
   // the column index (dv(1)) is strided. inv_CUFFT forwards its first size
   // to cufftPlan2d as the slowest-varying dimension, so it gets the column
   // count first and the row count second.
   const int Nr = dv(0);
   const int Nc = dv(1);

   const int nvals = Nr*Nc*2;            // number of doubles in each buffer
   const double scale = double(Nr*Nc);   // inverse-FFT normalization factor

   // The input is only read on the device, so copyin suffices for it.
#pragma acc data copyin(pmat[0:nvals]) copy(pout[0:nvals])
{
   void *stream = acc_get_cuda_stream(acc_async_sync);
   // Expose the device addresses of the mapped buffers to the cuFFT call.
#pragma acc host_data use_device(pmat,pout)
{
   inv_CUFFT((Complex*)pmat,(Complex*)pout,Nc,Nr,stream);
}
   // `loop` shares the iterations out; a bare `parallel` region would have
   // every gang run the whole in-place scaling redundantly.
#pragma acc parallel loop present(pout[0:nvals])
   for(int i=0;i<nvals;i++){
      pout[i] = pout[i]/scale;
   }
}

   retval(0) = out;
   return retval;
}


$ sh run.sh 
octave: X11 DISPLAY environment variable not set
octave: disabling GUI features
device type: 5
Num devices: 4
ans =

 Columns 1 and 2:

   0.0000e+00 + 0.0000e+00i   3.4694e-18 + 2.6021e-18i
   6.9389e-18 - 8.6736e-18i   0.0000e+00 - 6.9389e-18i
  -1.9082e-17 - 1.5613e-17i  -1.7347e-18 - 3.4694e-18i
   3.4694e-18 + 1.0408e-17i  -5.4210e-18 + 3.4694e-18i
  -2.7756e-17 - 1.0408e-17i   0.0000e+00 + 0.0000e+00i
   4.5103e-17 + 0.0000e+00i   6.0715e-18 + 0.0000e+00i
  -2.7756e-17 + 1.0408e-17i   0.0000e+00 + 0.0000e+00i
   3.4694e-18 - 1.0408e-17i  -2.3852e-18 - 4.3368e-18i
  -1.9082e-17 + 1.5613e-17i  -2.6021e-18 - 3.4694e-18i
   6.9389e-18 + 8.6736e-18i   3.4694e-18 + 5.2042e-18i

 Columns 3 and 4:

  -1.5613e-17 + 0.0000e+00i  -5.2042e-18 + 6.9389e-18i
   1.0408e-17 - 3.4694e-18i   6.9389e-18 + 3.4694e-18i
   0.0000e+00 - 6.7221e-18i   0.0000e+00 - 6.9389e-18i
  -6.9389e-18 + 7.8063e-18i  -6.9389e-18 + 0.0000e+00i
  -3.4694e-18 + 6.9389e-18i  -1.1276e-17 + 4.4452e-18i
  -3.4694e-18 + 3.4694e-18i  -3.4694e-18 + 0.0000e+00i
  -3.9031e-18 + 0.0000e+00i  -3.4694e-18 + 0.0000e+00i
   6.9389e-18 + 0.0000e+00i  -1.0408e-17 - 3.3314e-18i
   0.0000e+00 - 3.4694e-18i   1.7347e-18 + 1.7347e-18i
  -6.9389e-18 + 1.0408e-17i   0.0000e+00 - 3.4694e-18i

 Columns 5 and 6:

  -1.0408e-17 - 9.3241e-18i  -3.9899e-17 + 0.0000e+00i
   0.0000e+00 - 3.4694e-18i  -5.6379e-18 + 3.8489e-18i
   6.9389e-18 + 6.9389e-18i   6.9389e-18 + 1.1926e-18i
   8.6736e-18 - 3.4694e-18i   0.0000e+00 + 8.6736e-18i
   1.0408e-17 - 1.0408e-17i   6.9389e-18 + 1.7347e-17i
  -1.0408e-17 + 1.0408e-17i  -5.2042e-18 + 0.0000e+00i
   0.0000e+00 + 0.0000e+00i   6.9389e-18 - 1.7347e-17i
   0.0000e+00 + 8.6736e-18i   0.0000e+00 - 8.6736e-18i
   3.4694e-18 + 3.4694e-18i   6.9389e-18 - 1.1926e-18i
   0.0000e+00 + 3.4694e-18i  -5.6379e-18 - 3.8489e-18i

 Columns 7 and 8:

  -1.0408e-17 + 9.3241e-18i  -5.2042e-18 - 6.9389e-18i
   0.0000e+00 - 3.4694e-18i   0.0000e+00 + 3.4694e-18i
   3.4694e-18 - 3.4694e-18i   1.7347e-18 - 1.7347e-18i
   0.0000e+00 - 8.6736e-18i  -1.0408e-17 + 3.3314e-18i
   0.0000e+00 + 0.0000e+00i  -3.4694e-18 + 0.0000e+00i
  -1.0408e-17 - 1.0408e-17i  -3.4694e-18 + 0.0000e+00i
   1.0408e-17 + 1.0408e-17i  -1.1276e-17 - 4.4452e-18i
   8.6736e-18 + 3.4694e-18i  -6.9389e-18 + 0.0000e+00i
   6.9389e-18 - 6.9389e-18i   0.0000e+00 + 6.9389e-18i
   0.0000e+00 + 3.4694e-18i   6.9389e-18 - 3.4694e-18i

 Columns 9 and 10:

  -1.5613e-17 + 0.0000e+00i   3.4694e-18 - 2.6021e-18i
  -6.9389e-18 - 1.0408e-17i   3.4694e-18 - 5.2042e-18i
   0.0000e+00 + 3.4694e-18i  -2.6021e-18 + 3.4694e-18i
   6.9389e-18 + 0.0000e+00i  -2.3852e-18 + 4.3368e-18i
  -3.9031e-18 + 0.0000e+00i   0.0000e+00 + 0.0000e+00i
  -3.4694e-18 - 3.4694e-18i   6.0715e-18 + 0.0000e+00i
  -3.4694e-18 - 6.9389e-18i   0.0000e+00 + 0.0000e+00i
  -6.9389e-18 - 7.8063e-18i  -5.4210e-18 - 3.4694e-18i
   0.0000e+00 + 6.7221e-18i  -1.7347e-18 + 3.4694e-18i
   1.0408e-17 + 3.4694e-18i   0.0000e+00 + 6.9389e-18i