2020年8月13日木曜日

FPGA + OpenCL on Coursera (exercise in Week3)

 Again, someone who has no experience may need some help;


SimpleKernel.cl

1
2
3
4
5
6
7
8
9
//ACL Kernel
__kernel void SimpleKernel (__global float* restrict x, __global float* restrict y, __global float* restrict z, uint vectorSize)
{
    int i;

    for(i=0; i<vectorSize; i++){
       z[i] =x[i]*y[i];
    }
}


main.cpp
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <math.h>
#include <fstream>
#include <stdio.h>
#include <string>

#include "CL/cl.hpp"
#include "utility.h"

static const cl_uint vectorSize = 4096; //must be evenly divisible by workSize
static const cl_uint workSize = 256;

//#define EXERCISE1

int main(void)
{
	cl_int err;

	//Setup Platform
	//Get Platform ID
	std::vector<cl::Platform> PlatformList;

	////////////// Exercise 1 Step 2.3
	err = cl::Platform::get(&PlatformList);
	checkErr(err, "Get Platform List");
	checkErr(PlatformList.size()>=1 ? CL_SUCCESS : -1, "cl::Platform::get");
	print_platform_info(&PlatformList);
	//Look for Fast Emulation Platform
	uint current_platform_id=get_platform_id_with_string(&PlatformList, "Emulation");
	printf("Using Platform: %d\n\n", current_platform_id);
	
	//Setup Device
	//Get Device ID
	std::vector<cl::Device> DeviceList;

	////////////// Exercise 1 Step 2.5
	err = PlatformList[current_platform_id].getDevices(CL_DEVICE_TYPE_ALL, &DeviceList);
	checkErr(err, "Get Devices");
	print_device_info(&DeviceList);
	
	//Create Context
	////////////// Exercise 1 Step 2.6 
	cl::Context mycontext = cl::Context(DeviceList);

	checkErr(err, "Context Constructor");
	
	//Create Command queue
	////////////// Exercise 1 Step 2.7
	cl::CommandQueue myqueue = cl::CommandQueue(mycontext , DeviceList[0], 0, &err);
	checkErr(err, "Queue Constructor");

	//Create Buffers for input and output
	////////////// Exercise 1 Step 2.8
	cl::Buffer Buffer_In (mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_In2(mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_Out(mycontext, CL_MEM_WRITE_ONLY,  vectorSize * sizeof(cl_float));

	//Inputs and Outputs to Kernel, X and Y are inputs, Z is output
	//The aligned attribute is used to ensure alignment
	//so that DMA could be used if we were working with a real FPGA board
	cl_float X[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Y[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Z[vectorSize]  __attribute__ ((aligned (64)));

	//Allocates memory with value from 0 to 1000
	cl_float LO= 0;   cl_float HI=1000;
	fill_generate(X, Y, Z, LO, HI, vectorSize);

	//Write data to device
	////////////// Exercise 1 Step 2.9
	err = myqueue.enqueueWriteBuffer(Buffer_In , true, 0, vectorSize * sizeof(cl_float), X);
	checkErr(err, "WriteBuffer");
	err = myqueue.enqueueWriteBuffer(Buffer_In2, true, 0, vectorSize * sizeof(cl_float), Y);
	checkErr(err, "WriteBuffer 2");
	myqueue.finish();

#ifndef EXERCISE1
	// create the kernel
	const char *kernel_name = "SimpleKernel";

	//Read in binaries from file
	std::ifstream aocx_stream("../SimpleKernel.aocx", std::ios::in|std::ios::binary);
	checkErr(aocx_stream.is_open() ? CL_SUCCESS:-1, "SimpleKernel.aocx");
	std::string prog(std::istreambuf_iterator<char>(aocx_stream), (std::istreambuf_iterator<char>()));
	cl::Program::Binaries mybinaries (1, std::make_pair(prog.c_str(), prog.length()));

	// Create the Program from the AOCX file.
	////////////////////// Exercise 2 Step 2.3    ///////////////////
	cl::Program myprogram(mycontext, DeviceList, mybinaries);
	checkErr(err, "Program Constructor");

	// build the program
	//////////////      Compile the Kernel.... For Intel FPGA, nothing is done here, but this conforms to the standard
	//////////////       Exercise 2   Step 2.4    ///////////////////
	err= myprogram.build(DeviceList,NULL);
	checkErr(err, "Build Program");

	// create the kernel
	//////////////       Find Kernel in Program
	//////////////       Exercise 2   Step 2.5    ///////////////////
	cl::Kernel mykernel(myprogram, kernel_name,&err);
	checkErr(err, "Kernel Creation");

	//////////////     Set Arguments to the Kernels
	//////////////       Exercise 2   Step 2.6    ///////////////////
	err = mykernel.setArg(0, Buffer_In);
	checkErr(err, "Arg 0");
	err = mykernel.setArg(1, Buffer_In2);
	checkErr(err, "Arg 1");
	err = mykernel.setArg(2, Buffer_Out);
	checkErr(err, "Arg 2");
	err = mykernel.setArg(3, vectorSize);
	checkErr(err, "Arg 3");


	printf("\nLaunching the kernel...\n");
	
	// Launch Kernel
	//////////////       Exercise 2   Step 2.7    ///////////////////
	err= myqueue.enqueueNDRangeKernel(mykernel,cl::NullRange,cl::NDRange(1),cl::NullRange);
	checkErr(err, "Kernel Execution");

	// read the output
	//////////////       Exercise 2   Step 2.8    ///////////////////
	err= myqueue.enqueueReadBuffer(Buffer_Out , true, 0, vectorSize * sizeof(cl_float), Z);
	checkErr(err, "Read Buffer");

	err=myqueue.finish();
	checkErr(err, "Finish Queue");
	
	float CalcZ[vectorSize];

	for (uint i=0; i<vectorSize; i++)
	{
		//////////////  Equivalent Code running on CPUs
		//////////////       Exercise 2   Step 2.9    ///////////////////
		CalcZ[i] = X[i]*Y[i];
				
	}

	//Print Performance Results
	verification (X, Y, Z, CalcZ, vectorSize);

#endif

    return 1;
}

FPGA + OpenCL on Coursera (exercise in Week2)

 To be honest, the exercise is a bit inconsiderate, and so, I would like to show mine here.


  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <math.h>
#include <fstream>
#include <stdio.h>
#include <string>

#include "CL/cl.hpp"
#include "utility.h"

static const cl_uint vectorSize = 4096; //must be evenly divisible by workSize
static const cl_uint workSize = 256;

#define EXERCISE1

int main(void)
{
	cl_int err;

	//Setup Platform
	//Get Platform ID
	std::vector<cl::Platform> PlatformList;

	////////////// Exercise 1 Step 2.3
	err = cl::Platform::get(&PlatformList);
	checkErr(err, "Get Platform List");
	checkErr(PlatformList.size()>=1 ? CL_SUCCESS : -1, "cl::Platform::get");
	print_platform_info(&PlatformList);
	//Look for Fast Emulation Platform
	uint current_platform_id=get_platform_id_with_string(&PlatformList, "Emulation");
	printf("Using Platform: %d\n\n", current_platform_id);
	
	//Setup Device
	//Get Device ID
	std::vector<cl::Device> DeviceList;

	////////////// Exercise 1 Step 2.5
	err = PlatformList[current_platform_id].getDevices(CL_DEVICE_TYPE_ALL, &DeviceList);
	checkErr(err, "Get Devices");
	print_device_info(&DeviceList);
	
	//Create Context
	////////////// Exercise 1 Step 2.6 
	cl::Context mycontext = cl::Context(DeviceList);

	checkErr(err, "Context Constructor");
	
	//Create Command queue
	////////////// Exercise 1 Step 2.7
	cl::CommandQueue myqueue = cl::CommandQueue(mycontext , DeviceList[0], 0, &err);
	checkErr(err, "Queue Constructor");

	//Create Buffers for input and output
	////////////// Exercise 1 Step 2.8
	cl::Buffer Buffer_In (mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_In2(mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_Out(mycontext, CL_MEM_WRITE_ONLY,  vectorSize * sizeof(cl_float));

	//Inputs and Outputs to Kernel, X and Y are inputs, Z is output
	//The aligned attribute is used to ensure alignment
	//so that DMA could be used if we were working with a real FPGA board
	cl_float X[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Y[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Z[vectorSize]  __attribute__ ((aligned (64)));

	//Allocates memory with value from 0 to 1000
	cl_float LO= 0;   cl_float HI=1000;
	fill_generate(X, Y, Z, LO, HI, vectorSize);

	//Write data to device
	////////////// Exercise 1 Step 2.9
	err = myqueue.enqueueWriteBuffer(Buffer_In , true, 0, vectorSize * sizeof(cl_float), X);
	checkErr(err, "WriteBuffer");
	err = myqueue.enqueueWriteBuffer(Buffer_In2, true, 0, vectorSize * sizeof(cl_float), Y);
	checkErr(err, "WriteBuffer 2");
	myqueue.finish();

#ifndef EXERCISE1
	// create the kernel
	const char *kernel_name = "SimpleKernel";

	//Read in binaries from file
	std::ifstream aocx_stream("../SimpleKernel.aocx", std::ios::in|std::ios::binary);
	checkErr(aocx_stream.is_open() ? CL_SUCCESS:-1, "SimpleKernel.aocx");
	std::string prog(std::istreambuf_iterator<char>(aocx_stream), (std::istreambuf_iterator<char>()));
	cl::Program::Binaries mybinaries (1, std::make_pair(prog.c_str(), prog.length()));

	// Create the Program from the AOCX file.
	////////////////////// Exercise 2 Step 2.3    ///////////////////
	cl::Program
	checkErr(err, "Program Constructor");

	// build the program
	//////////////      Compile the Kernel.... For Intel FPGA, nothing is done here, but this conforms to the standard
	//////////////       Exercise 2   Step 2.4    ///////////////////
	err=
	checkErr(err, "Build Program");

	// create the kernel
	//////////////       Find Kernel in Program
	//////////////       Exercise 2   Step 2.5    ///////////////////
	cl::Kernel
	checkErr(err, "Kernel Creation");

	//////////////     Set Arguments to the Kernels
	//////////////       Exercise 2   Step 2.6    ///////////////////
	err =
	checkErr(err, "Arg 0");
	err =
	checkErr(err, "Arg 1");
	err =
	checkErr(err, "Arg 2");
	err =
	checkErr(err, "Arg 3");


	printf("\nLaunching the kernel...\n");
	
	// Launch Kernel
	//////////////       Exercise 2   Step 2.7    ///////////////////
	err=
	checkErr(err, "Kernel Execution");

	// read the output
	//////////////       Exercise 2   Step 2.8    ///////////////////
	err=
	checkErr(err, "Read Buffer");

	err=myqueue.finish();
	checkErr(err, "Finish Queue");
	
	float CalcZ[vectorSize];

	for (uint i=0; i<vectorSize; i++)
	{
		//////////////  Equivalent Code running on CPUs
		//////////////       Exercise 2   Step 2.9    ///////////////////
		CalcZ[i] =
				
	}

	//Print Performance Results
	verification (X, Y, Z, CalcZ, vectorSize);

#endif

    return 1;
}

FPGA + OpenCL environment building

 An environment should be built by myself, and I am writing down what I have done so far to get it.


(1) Install Centos 7 (this is the requirement by Intel's SDK)
(2) Download SDK from here. Version 19.1 might be the best for the course. (30GB!)
(3) Install everything under /root. Root access is necessary for the emulator.
(4) Install eclipse. Download from here. The latest version might be OK.

I tried to use an eclipse included in SDK, but there was an issue in a menu window.
Actually, no need to stick to the bundled one, at least for the exercise in Week2.

Coursera's FPGA + OpenCL course

As a part of my training duties, I have just started a course on Coursera.
It seems, it is actually just a copy of Intel's one but OK to me.


https://www.coursera.org/learn/opencl-fpga-introduction

2020年8月5日水曜日

ハンガリーで結婚式に参加した

妻の同僚のハンガリー人が結婚式に私も招待してくれたので、参加してきました。
ブダペストから更に来るまで1時間半もかかるようなところなので、
前泊(正確には前々日に移動だけ)して、ブダペスト観光しつつ、当日は新郎新婦の友人に車で送ってもらいました(外国人には公共交通機関での移動は無理っぽい)。

式が14時始まりで、披露宴が終わるのが4AMという死にそうな予定で、もちろん我々は途中で諦めましたが、それでも1時。翌日は新郎新婦の車に乗せてもらってウィーンまで帰ったのですが、途中で逃げた我々より新郎新婦のほうがしっかりしていた。体力の違いか、若さか。

写真は観光に行った、Gerbeaudという喫茶店、Matthias Church, 世界遺産になってるメトロ(M1線)、鎖橋、結婚式の式次第に式の様子。








2020年8月4日火曜日

List of gates available in Q#

I always lose the list of Q#, and so, I would like to put it as memo here.

https://docs.microsoft.com/en-us/qsharp/api/qsharp/microsoft.quantum.intrinsic

Grover's algorithm (Algorithm 1, in Section 2)

Continuation of Q# practice (Bell state).
Grover's algorithm described in Section 2, Algorithm 1.
In this example, Oracle is expressed just with Toffoli (CCNOT) gate, and this is not a general implementation, as far as I understand.

In the literature, they constructed CCNOT with natively supported gates, but Q# has it, and I just used it (control and ancillary are a bit confusing to me though).

By running my sample code the numbers of measurements observed in 1000 trials are;
00:0, 01:0, 10:0, 11:1000
Unlike the previous one, the result is completely the same as the theory.

I feel I should split Q# and C# parts in a better way.
In any case, I am getting used to it, hopefully.

Sample codes are;
Q# (Program.qs)
/// # Summary
/// 
namespace Grover {

    open Microsoft.Quantum.Canon;
    open Microsoft.Quantum.Intrinsic;
    

    operation Grover() : (Int,Int,Int,Int) {
        Message("This is Q# implementation of Algorithm 1 in Quantum Algorithm Implementaions for Beginners");

        mutable n00 = 0;
        mutable n01 = 0;
        mutable n10 = 0;
        mutable n11 = 0;

        for(trial in 1..1000){
        using ( (qubit0, qubit1, qubit2) = (Qubit(), Qubit(),Qubit()) ){
            // qubit0: ancillar
            // qubit1, 2: x1, x2

            //Initializaiton
            X(qubit0);
            H(qubit0);
            H(qubit1);
            H(qubit2);
            // qubit0 is called ancillar, looking at the following site and the article, it should be the target
            // https://quantumcomputing.stackexchange.com/questions/3943/how-do-you-implement-the-toffoli-gate-using-only-single-qubit-and-cnot-gates
            CCNOT(qubit1,qubit2,qubit0); //Toffoli gate

            // Grover diffusion operator
            H(qubit1);
            X(qubit1);
            H(qubit1);

            H(qubit2);
            X(qubit2);

            CNOT(qubit2,qubit1);

            H(qubit1);
            X(qubit1);
            H(qubit1);

            X(qubit2);
            H(qubit2);

            let res0 = M(qubit1);
            let res1 = M(qubit2);
            if(res0 == Zero){
                if(res1 == Zero){
                    set n00 += 1;
                }else{
                    set n01 += 1;
                }
            }else{
                if(res1 == Zero){
                    set n10 += 1;
                }else{
                    set n11 += 1;
                }

            }
            Reset(qubit0);
            Reset(qubit1);
            Reset(qubit2);
        }
        }
        return(n00,n01,n10,n11);
    }
}



C# (Driver.cs)
using System;

using Microsoft.Quantum.Simulation.Core;
using Microsoft.Quantum.Simulation.Simulators;

namespace Grover
{
    class Driver
    {
        static void Main(string[] args)
        {
            
            using (var qsim = new QuantumSimulator())
            {
                long a,b,c,d;
                (a,b,c,d) = Grover.Run(qsim).Result;
                Console.WriteLine("{0},{1},{2},{3}",a,b,c,d);
            }
        }
    }
}