櫛田慶幸の日記@Wien: GPGPU

ラベル GPGPU の投稿を表示しています。すべての投稿を表示

2021年4月22日木曜日

"lto1: error: ‘-fcf-protection=full’ is not supported for this target" on GCC-10

Someone has probably updated the DGX Station system, and the g++-10 compiler now complains:

lto1: error: ‘-fcf-protection=full’ is not supported for this target.

To work around this, I had to pass the following options to the linker:

LD= g++-10 -fopenacc -foffload="-fcf-protection=none -fno-stack-protector"

2020年6月16日火曜日

ROCm on Ubuntu 20.04

Just installed ROCm on Ubuntu 20.04, even though my GPU is a bit (too) old (R9 290).

I basically just followed the site below, up to the line:

sudo apt install rocm-libs hipcub miopen-hip

https://dev.to/shawonashraf/setting-up-your-amd-gpu-for-tensorflow-in-ubuntu-20-04-31f5

One slightly confusing point: we need to run

% . /etc/profile.d/rocm.sh

after the installation of packages.

2019年10月14日月曜日

Accepted at SC19 (WACCPD19)!
See you guys in Denver.

Acceleration in Acoustic Wave Propagation Modelling using OpenACC/OpenMP and its hybrid for the Global Monitoring System

https://waccpd.org/program/

2019年7月23日火曜日

working example of OpenACC enabled Oct file with cuFFT

Zenodoにアップロードしました。

https://zenodo.org/record/3345905#.XTa_7PIzZaQ

2019年6月6日木曜日

Octave + cuFFT + OpenACC. Works with GCC-8

I succeeded in running cuFFT + OpenACC + Octave with GCC-8.
Waiting for a response from PGI.

$ sudo apt-get install gcc-8-offload-nvptx

$ sudo apt-get install g++-8

$ cat Makefile
# Builds testFFTGPU.oct, an Octave oct-file compiled with g++-8 -fopenacc
# and linked against cuFFT from CUDA 9.2.
CC        = g++-8 -fopenacc
LD        = g++-8 -fopenacc
C_OPT     = -fPIC
C_INC     = -I/usr/local/cuda-9.2/include
OCT_INC   = -I/usr/include/octave-4.2.2/octave/.. -I/usr/include/octave-4.2.2/octave
OCT_LIBS  = -L/usr/lib/x86_64-linux-gnu/octave/4.2.2 -loctinterp -loctave
# Libraries are resolved left to right, so cuFFT is named AFTER the objects
# that reference it. When it precedes them, a linker running with
# --as-needed can drop it, and the .oct then only loads with LD_PRELOAD.
CUDA_LIBS = -L/usr/local/cuda-9.2/lib64 -lcufft

OBJS = \
	testFFTGPU.o

TARGET = testFFTGPU.oct

# These targets do not produce files of the same name.
.PHONY: all run clean

# Clear the built-in suffix list, then enable only the .cc -> .o rule.
.SUFFIXES:
.SUFFIXES: .o .cc

all: $(TARGET)

.cc.o:
	$(CC) $(C_INC) $(OCT_INC) -c $(C_OPT) $<

$(TARGET): $(OBJS)
	$(LD) $(OCT_INC) -shared -Wl,-Bsymbolic -o $(TARGET) $(OBJS) $(CUDA_LIBS) $(OCT_LIBS) -Wl,-Bsymbolic-functions

run:
	./run.sh

clean:
	rm -f *.o *.d *.mod $(TARGET) *~
	rm -f work.pc work.pcl *.optrpt log.txt
	rm -f *.gpu *.ptx *.oct

$ cat run.sh

# Run callOct.m in Octave with the CUDA 9.2 runtime (libcudart) and cuFFT
# shared libraries from the PGI 2019 installation preloaded.
# NOTE(review): libcudart comes from the linux86-64-nollvm tree and libcufft
# from the linux86-64-llvm tree -- confirm that mixing the two is intended.
LD_PRELOAD=/opt/pgi/linux86-64-nollvm/2019/cuda/9.2/lib64/libcudart.so.9.2:/opt/pgi/linux86-64-llvm/2019/cuda/9.2/lib64/libcufft.so.9.2 octave callOct.m

$ cat callOct.m

% Compare the testFFTGPU oct-file against Octave's built-in ifft2.

% Random 10x10 real test matrix.
mat = rand(10,10);

% Separate copies for the reference path (mat1) and the GPU path (mat2).
mat1 = mat;

mat2 = mat;

% Result computed by the oct-file.
fftGPU = testFFTGPU(mat2);

% Reference result computed by ifft2.
ifft_mat1 = ifft2(mat1);

% No trailing semicolon: the element-wise difference is displayed.
fftGPU - ifft_mat1

$ cat testFFTGPU.cc

#include<math.h>

#include <octave/oct.h>

#include <octave/parse.h>

#include <complex>

#include <chrono>

#include <fftw3.h>

#include "openacc.h"

#include "cufft.h"

// Run one out-of-place, double-precision complex-to-complex 2-D inverse
// transform (CUFFT_Z2Z, CUFFT_INVERSE) with cuFFT.
//
//   in_data, out_data  buffers handed to cuFFT as cufftDoubleComplex*; the
//                      caller in this file passes them from inside an
//                      "acc host_data use_device" region.
//   nc, nr             forwarded to cufftPlan2d as nx and ny respectively
//                      (nx is the slowest-varying dimension in cuFFT's
//                      row-major layout, ny the contiguous one).
//   stream             CUDA stream the plan is bound to, passed as void*.
//
// The function returns void, so failures are reported on stderr instead of
// being silently discarded. The plan is destroyed on every path once it has
// been created.
void inv_CUFFT(Complex *in_data, Complex *out_data, int nc, int nr, void *stream)
{
  cufftHandle plan;

  cufftResult status = cufftPlan2d(&plan, nc, nr, CUFFT_Z2Z);
  if (status != CUFFT_SUCCESS)
    {
      // No plan was created, so there is nothing to execute or destroy.
      fprintf(stderr, "inv_CUFFT: cufftPlan2d failed (cufftResult %d)\n",
              (int) status);
      return;
    }

  status = cufftSetStream(plan, (cudaStream_t)stream);
  if (status != CUFFT_SUCCESS)
    {
      fprintf(stderr, "inv_CUFFT: cufftSetStream failed (cufftResult %d)\n",
              (int) status);
    }
  else
    {
      status = cufftExecZ2Z(plan,
                            (cufftDoubleComplex*)in_data,
                            (cufftDoubleComplex*)out_data,
                            CUFFT_INVERSE);
      if (status != CUFFT_SUCCESS)
        fprintf(stderr, "inv_CUFFT: cufftExecZ2Z failed (cufftResult %d)\n",
                (int) status);
    }

  cufftDestroy(plan);
}

// Octave entry point: returns the 2-D inverse FFT of its single matrix
// argument, computed with cuFFT through OpenACC-managed device buffers and
// scaled by 1/(rows*columns) so it can be compared with ifft2.
DEFUN_DLD(testFFTGPU, args, ,
          "main body;")
{
  octave_value_list retval;

  if (args.length() != 1)
    {
      error("testFFTGPU: exactly one matrix argument is required");
      return retval;
    }

  fprintf(stderr,"device type: %d\n", acc_get_device_type());
  fprintf(stderr,"Num devices: %d\n", acc_get_num_devices(acc_device_nvidia));

  ComplexMatrix Matrix(args(0).complex_matrix_value());
  ComplexMatrix out(Matrix.dims());

  // Not static: a static local would keep the dimensions of the first call
  // and silently reuse them for every later call.
  const dim_vector dv = Matrix.dims();

  // Octave stores matrices column-major: dv(0) is the row count (contiguous
  // dimension) and dv(1) the column count (slowest-varying dimension).
  int Nr = dv(0);
  int Nc = dv(1);
  int n_elem = Nr * Nc;

  if (n_elem == 0)
    {
      // Nothing to transform; also avoids a zero-sized plan and a division
      // by zero in the scaling loop.
      retval(0) = out;
      return retval;
    }

  // The complex arrays are viewed as interleaved (re, im) doubles so the
  // scaling loop below works on plain doubles.
  int n_doubles = 2 * n_elem;
  double *pmat = reinterpret_cast<double *> (Matrix.fortran_vec());
  double *pout = reinterpret_cast<double *> (out.fortran_vec());

#pragma acc data copy(pmat[0:n_doubles],pout[0:n_doubles])
  {
    void *stream = acc_get_cuda_stream(acc_async_sync);

#pragma acc host_data use_device(pmat,pout)
    {
      // inv_CUFFT forwards (Nc, Nr) to cufftPlan2d as (nx, ny): the column
      // count is the slowest-varying dimension, the row count the contiguous one.
      inv_CUFFT((Complex*)pmat,(Complex*)pout,Nc,Nr,stream);
    }

    // cuFFT transforms are unnormalised, so scale by 1/N here. "parallel
    // loop" shares the iterations; a bare "parallel" would run the whole
    // loop redundantly in every gang.
#pragma acc parallel loop
    for(int i=0;i<n_doubles;i++){
      pout[i] = pout[i]/double(n_elem);
    }
  }
  // The data region is closed before the result is handed back.

  retval(0) = out;

  return retval;
}

$ sh run.sh
octave: X11 DISPLAY environment variable not set
octave: disabling GUI features
device type: 5
Num devices: 4
ans =

Columns 1 and 2:

0.0000e+00 + 0.0000e+00i 3.4694e-18 + 2.6021e-18i
6.9389e-18 - 8.6736e-18i 0.0000e+00 - 6.9389e-18i
-1.9082e-17 - 1.5613e-17i -1.7347e-18 - 3.4694e-18i
3.4694e-18 + 1.0408e-17i -5.4210e-18 + 3.4694e-18i
-2.7756e-17 - 1.0408e-17i 0.0000e+00 + 0.0000e+00i
4.5103e-17 + 0.0000e+00i 6.0715e-18 + 0.0000e+00i
-2.7756e-17 + 1.0408e-17i 0.0000e+00 + 0.0000e+00i
3.4694e-18 - 1.0408e-17i -2.3852e-18 - 4.3368e-18i
-1.9082e-17 + 1.5613e-17i -2.6021e-18 - 3.4694e-18i
6.9389e-18 + 8.6736e-18i 3.4694e-18 + 5.2042e-18i

Columns 3 and 4:

-1.5613e-17 + 0.0000e+00i -5.2042e-18 + 6.9389e-18i
1.0408e-17 - 3.4694e-18i 6.9389e-18 + 3.4694e-18i
0.0000e+00 - 6.7221e-18i 0.0000e+00 - 6.9389e-18i
-6.9389e-18 + 7.8063e-18i -6.9389e-18 + 0.0000e+00i
-3.4694e-18 + 6.9389e-18i -1.1276e-17 + 4.4452e-18i
-3.4694e-18 + 3.4694e-18i -3.4694e-18 + 0.0000e+00i
-3.9031e-18 + 0.0000e+00i -3.4694e-18 + 0.0000e+00i
6.9389e-18 + 0.0000e+00i -1.0408e-17 - 3.3314e-18i
0.0000e+00 - 3.4694e-18i 1.7347e-18 + 1.7347e-18i
-6.9389e-18 + 1.0408e-17i 0.0000e+00 - 3.4694e-18i

Columns 5 and 6:

-1.0408e-17 - 9.3241e-18i -3.9899e-17 + 0.0000e+00i
0.0000e+00 - 3.4694e-18i -5.6379e-18 + 3.8489e-18i
6.9389e-18 + 6.9389e-18i 6.9389e-18 + 1.1926e-18i
8.6736e-18 - 3.4694e-18i 0.0000e+00 + 8.6736e-18i
1.0408e-17 - 1.0408e-17i 6.9389e-18 + 1.7347e-17i
-1.0408e-17 + 1.0408e-17i -5.2042e-18 + 0.0000e+00i
0.0000e+00 + 0.0000e+00i 6.9389e-18 - 1.7347e-17i
0.0000e+00 + 8.6736e-18i 0.0000e+00 - 8.6736e-18i
3.4694e-18 + 3.4694e-18i 6.9389e-18 - 1.1926e-18i
0.0000e+00 + 3.4694e-18i -5.6379e-18 - 3.8489e-18i

Columns 7 and 8:

-1.0408e-17 + 9.3241e-18i -5.2042e-18 - 6.9389e-18i
0.0000e+00 - 3.4694e-18i 0.0000e+00 + 3.4694e-18i
3.4694e-18 - 3.4694e-18i 1.7347e-18 - 1.7347e-18i
0.0000e+00 - 8.6736e-18i -1.0408e-17 + 3.3314e-18i
0.0000e+00 + 0.0000e+00i -3.4694e-18 + 0.0000e+00i
-1.0408e-17 - 1.0408e-17i -3.4694e-18 + 0.0000e+00i
1.0408e-17 + 1.0408e-17i -1.1276e-17 - 4.4452e-18i
8.6736e-18 + 3.4694e-18i -6.9389e-18 + 0.0000e+00i
6.9389e-18 - 6.9389e-18i 0.0000e+00 + 6.9389e-18i
0.0000e+00 + 3.4694e-18i 6.9389e-18 - 3.4694e-18i

Columns 9 and 10:

-1.5613e-17 + 0.0000e+00i 3.4694e-18 - 2.6021e-18i
-6.9389e-18 - 1.0408e-17i 3.4694e-18 - 5.2042e-18i
0.0000e+00 + 3.4694e-18i -2.6021e-18 + 3.4694e-18i
6.9389e-18 + 0.0000e+00i -2.3852e-18 + 4.3368e-18i
-3.9031e-18 + 0.0000e+00i 0.0000e+00 + 0.0000e+00i
-3.4694e-18 - 3.4694e-18i 6.0715e-18 + 0.0000e+00i
-3.4694e-18 - 6.9389e-18i 0.0000e+00 + 0.0000e+00i
-6.9389e-18 - 7.8063e-18i -5.4210e-18 - 3.4694e-18i
0.0000e+00 + 6.7221e-18i -1.7347e-18 + 3.4694e-18i
1.0408e-17 + 3.4694e-18i 0.0000e+00 + 6.9389e-18i

2019年5月25日土曜日

octave + cufft + pgi (openacc). !!!!!!!! This does not work !!!!!!!!!!

I tried to mimic the following page to call cuFFT from "octfile"(octave).

However, PGI compiler (19.4) dies.

https://devtalk.nvidia.com/default/topic/523203/openacc-toolkit/fft-using-openacc/

If you want to try, save the following files and type

$ sh compile.sh

$ octave callOct.m

============ compile.sh ==============

#!/bin/bash
# Build the oct-file from testFFTGPU.cc with mkoctfile, with pgc++ set as
# both the C++ compiler (CXX) and the link driver (DL_LD).
# Each backslash must be the last character on its line and be followed
# directly by the next line: a blank line in between ends the command and
# mkoctfile would run without these settings.
env CXX="pgc++" \
    CXXFLAGS="-fast -acc -ta=tesla,cuda9.2,managed -Minfo=accel -Mcudalib=cufft" \
    XTRA_CXXFLAGS=" " \
    CPPFLAGS=" " \
    DL_LD="pgc++" \
    mkoctfile --verbose testFFTGPU.cc

============ compile.sh ==============
x

=== testFFTGPU.cc ===

#include<math.h>

#include <octave/oct.h>

#include <octave/parse.h>

#include <complex>

#include <chrono>

#include <fftw3.h>

#include "openacc.h"

#include "cufft.h"

// Run one out-of-place, double-precision complex-to-complex 2-D inverse
// transform (CUFFT_Z2Z, CUFFT_INVERSE) with cuFFT.
//
//   in_data, out_data  buffers handed to cuFFT as cufftDoubleComplex*; the
//                      caller in this file passes them from inside an
//                      "acc host_data use_device" region.
//   nc, nr             forwarded to cufftPlan2d as nx and ny respectively
//                      (nx is the slowest-varying dimension in cuFFT's
//                      row-major layout, ny the contiguous one).
//   stream             CUDA stream the plan is bound to, passed as void*.
//
// The function returns void, so failures are reported on stderr instead of
// being silently discarded. The plan is destroyed on every path once it has
// been created.
void inv_CUFFT(Complex *in_data, Complex *out_data, int nc, int nr, void *stream)
{
  cufftHandle plan;

  cufftResult status = cufftPlan2d(&plan, nc, nr, CUFFT_Z2Z);
  if (status != CUFFT_SUCCESS)
    {
      // No plan was created, so there is nothing to execute or destroy.
      fprintf(stderr, "inv_CUFFT: cufftPlan2d failed (cufftResult %d)\n",
              (int) status);
      return;
    }

  status = cufftSetStream(plan, (cudaStream_t)stream);
  if (status != CUFFT_SUCCESS)
    {
      fprintf(stderr, "inv_CUFFT: cufftSetStream failed (cufftResult %d)\n",
              (int) status);
    }
  else
    {
      status = cufftExecZ2Z(plan,
                            (cufftDoubleComplex*)in_data,
                            (cufftDoubleComplex*)out_data,
                            CUFFT_INVERSE);
      if (status != CUFFT_SUCCESS)
        fprintf(stderr, "inv_CUFFT: cufftExecZ2Z failed (cufftResult %d)\n",
                (int) status);
    }

  cufftDestroy(plan);
}

// Octave entry point: returns the 2-D inverse FFT of its single matrix
// argument, computed with cuFFT through OpenACC-managed device buffers and
// scaled by 1/(rows*columns) so it can be compared with ifft2.
DEFUN_DLD(testFFTGPU, args, ,
          "main body;")
{
  octave_value_list retval;

  if (args.length() != 1)
    {
      error("testFFTGPU: exactly one matrix argument is required");
      return retval;
    }

  ComplexMatrix Matrix(args(0).complex_matrix_value());
  ComplexMatrix out(Matrix.dims());

  // Not static: a static local would keep the dimensions of the first call
  // and silently reuse them for every later call.
  const dim_vector dv = Matrix.dims();

  // Octave stores matrices column-major: dv(0) is the row count (contiguous
  // dimension) and dv(1) the column count (slowest-varying dimension).
  int Nr = dv(0);
  int Nc = dv(1);
  int n_elem = Nr * Nc;

  if (n_elem == 0)
    {
      // Nothing to transform; also avoids a zero-sized plan and a division
      // by zero in the scaling loop.
      retval(0) = out;
      return retval;
    }

  Complex *pmat = Matrix.fortran_vec();
  Complex *pout = out.fortran_vec();

#pragma acc data copy(pmat[0:n_elem],pout[0:n_elem])
  {
    void *stream = acc_get_cuda_stream(acc_async_sync);

#pragma acc host_data use_device(pmat,pout)
    {
      // inv_CUFFT forwards (Nc, Nr) to cufftPlan2d as (nx, ny): the column
      // count is the slowest-varying dimension, the row count the contiguous one.
      inv_CUFFT(pmat,pout,Nc,Nr,stream);
    }
  }
  // The data region is closed before the result is read on the host.

  // cuFFT transforms are unnormalised; scale by 1/N on the host so the
  // result matches ifft2.
  for (int i = 0; i < n_elem; i++)
    pout[i] /= double(n_elem);

  retval(0) = out;

  return retval;
}

=== end testFFTGPU.cc ==

=== callOct.m ===

% Compare the testFFTGPU oct-file against Octave's built-in ifft2.

% Random 100x100 real test matrix.
mat = rand(100,100);

% Separate copies for the reference path (mat1) and the GPU path (mat2).
mat1 = mat;

mat2 = mat;

% Result computed by the oct-file.
fftGPU = testFFTGPU(mat2);

% Reference result computed by ifft2.
ifft_mat1 = ifft2(mat1);

% No trailing semicolon: the element-wise difference is displayed.
fftGPU - ifft_mat1

=== end callOct.m ===

2018年3月16日金曜日

openmp offloading のメモ

llvm clangがgpuのoffloadingに対応してるっぽいので確かめている。
上手く行ってないけど。
テスト用のコードをメモ。
#include<stdlib.h>
#include<stdio.h>
#include<omp.h>

/*
 * OpenMP target-offload smoke test.
 *
 * 1. Runs a tiny target region and reports whether it executed on a device
 *    other than the host (omp_is_initial_device() returned 0).
 * 2. Fills a float vector with 0..N-1 on the host, sums it inside a target
 *    region and prints the sum.
 *
 * Returns 0 on success, EXIT_FAILURE if the vector cannot be allocated.
 */
int main(){

    int i;
    int N=100000;
    float *vec;
    float sum=0.0;

    int isDevInit = -1;
#pragma omp target map(from:isDevInit)
    {
        isDevInit = omp_is_initial_device();
    }

    if (!isDevInit) {
        printf("run on acc. %d\n",isDevInit);
    }

    /* Queried and printed on the host instead of inside the target region. */
    fprintf(stdout,"num dev %d\n",omp_get_num_devices());

    vec=(float *)malloc(sizeof(float)*N);
    if (vec == NULL) {
        fprintf(stderr,"malloc of %d floats failed\n",N);
        return EXIT_FAILURE;
    }
    for(i=0;i<N;i++){
        vec[i] = (float)i;
    }

    /* vec is a pointer, so the array section vec[0:N] must be mapped;
       map(to:vec) alone transfers only the pointer value, not the N floats
       it points to. */
#pragma omp target teams distribute parallel for map(to:vec[0:N]) map(tofrom:sum) reduction(+:sum)
    for(i=0;i<N;i++){
        sum += vec[i];
    }

    fprintf(stdout,"sum: %e\n",sum);

    free(vec);

    return 0;
}

2016年7月12日火曜日

Open source OpenCL environment on Ubuntu 16.04

scientific linux 6の環境がようやく捨てられるので、ubuntu 16.04に移行。
ところが、Ubuntu 16.04はAMDのプロプライエタリドライバが使えないらしく、OpenCL環境をどうしたものかと思っていたら、ubuntuのレポジトリにあるものだけでとりあえずは作れるみたい。

apt-get install libclc-dev mesa-opencl-icd ocl-icd-opencl-dev
で必要なものが手に入って、
/usr/lib/x86_64-linux-gnu/libOpenCL.so.1 あたりとリンクすれば動く。

パフォーマンスは気になるけど、面倒じゃないのは助かる。
（ドライバに関しては、AMDが手伝ってるみたいなので少なくとも長期的には問題ない？）

2016年3月27日日曜日

UbuntuでACML

GPGPUされたBLAS/LAPACKを手軽に使いたかったので、Ubuntu (14.04)でACML6.1を使う。

sudo update-alternatives --install /usr/lib/libblas.so.3 libblas.so.3 /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so 60

sudo update-alternatives --install /usr/lib/liblapack.so.3 liblapack.so.3 /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so 60

して、登録後
sudo update-alternatives --config libblas.so.3
sudo update-alternatives --config liblapack.so.3

AMD A8-3850とRadeon R9 390という歪な構成ながら、問題サイズが大きければOpenBLASよりも速いこともあるようなので、まぁまぁ？ACMLの時はtopでみてる感じだとCPU使用率50%ぐらい。OpenBLASの時はもちろんほとんど400%

計算例：
Python:
>>> from numpy import *
>>> from time import *
>>> a=random.rand(10000,10000); t=time(); a=linalg.inv(a); time()-t

ACML: 46.84487009048462
OpenBLAS: 72.2047929763794

OCTAVE:
octave:1> a=rand(10000,10000); b=a;
octave:2> tic; c=a*b; toc;

ACML:4.4 sec

OpenBLAS: 46.8 sec

すばらしい。

追記：
octaveをインストールする時にいろいろ文句を言われたので、下の方がいいかも。
sudo update-alternatives --install /usr/lib/libblas.so.3 libblas.so.3 /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so 60 --slave /usr/lib/libblas.so.3gf libblas.so.3gf /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so

sudo update-alternatives --install /usr/lib/liblapack.so.3 liblapack.so.3 /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so 60 --slave /usr/lib/liblapack.so.3gf liblapack.so.3gf /opt/ACML/6.1.0/gfortran64_mp/lib/libacml_mp.so

2015年8月1日土曜日

clBLAS

clBLASをインストールするにはACMLが必要。5.3.1を使った
srcディレクトリで
$ ccmake .
して、ACMLのincludeディレクトリと、libacml.aを指定する。

2015年7月13日月曜日

Radeon R9 390 on Linux for GPGPU

下記の組み合わせで動いた
RHEL7ではOpenCLのデバイスとして認識されなかった。

OS: Ubuntu 14.04 (LTS)
Driver: catalyst 15.7, (http://support.amd.com/en-us/download/desktop?os=Ubuntu+x86+64)

ちなみに、SDKは3.0beta

2015年7月5日日曜日

GPGPU (radeon r9 390)

研究用にGPGPU出来る環境が欲しくなったので、GPUを買う。
Radeon R9 390.
が、ケースに入らない！ので、3.5インチベイを破壊してスペースを作る。
HDDは5インチベイに入れるようにマウンタを注文した。
あと、電源が8pin *2だったので、変換コネクタも。

小さな切り傷がいっぱい出来てしまった。

登録: 投稿 (Atom)

櫛田慶幸の日記@Wien