櫛田慶幸の日記@Wien: 8月 2020

2020年8月13日木曜日

FPGA + OpenCL on Coursera (exercise in Week3)

Again, someone who has no experience may need some help;

SimpleKernel.cl

//ACL Kernel
__kernel void SimpleKernel (__global float* restrict x, __global float* restrict y, __global float* restrict z, uint vectorSize)
{
    int i;

    for(i=0; i<vectorSize; i++){
       z[i] =x[i]*y[i];
    }
}

main.cpp

#include <math.h>
#include <fstream>
#include <stdio.h>
#include <string>

#include "CL/cl.hpp"
#include "utility.h"

static const cl_uint vectorSize = 4096; //must be evenly divisible by workSize
static const cl_uint workSize = 256;

//#define EXERCISE1

int main(void)
{
	cl_int err;

	//Setup Platform
	//Get Platform ID
	std::vector<cl::Platform> PlatformList;

	////////////// Exercise 1 Step 2.3
	err = cl::Platform::get(&PlatformList);
	checkErr(err, "Get Platform List");
	checkErr(PlatformList.size()>=1 ? CL_SUCCESS : -1, "cl::Platform::get");
	print_platform_info(&PlatformList);
	//Look for Fast Emulation Platform
	uint current_platform_id=get_platform_id_with_string(&PlatformList, "Emulation");
	printf("Using Platform: %d\n\n", current_platform_id);
	
	//Setup Device
	//Get Device ID
	std::vector<cl::Device> DeviceList;

	////////////// Exercise 1 Step 2.5
	err = PlatformList[current_platform_id].getDevices(CL_DEVICE_TYPE_ALL, &DeviceList);
	checkErr(err, "Get Devices");
	print_device_info(&DeviceList);
	
	//Create Context
	////////////// Exercise 1 Step 2.6 
	cl::Context mycontext = cl::Context(DeviceList);

	checkErr(err, "Context Constructor");
	
	//Create Command queue
	////////////// Exercise 1 Step 2.7
	cl::CommandQueue myqueue = cl::CommandQueue(mycontext , DeviceList[0], 0, &err);
	checkErr(err, "Queue Constructor");

	//Create Buffers for input and output
	////////////// Exercise 1 Step 2.8
	cl::Buffer Buffer_In (mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_In2(mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_Out(mycontext, CL_MEM_WRITE_ONLY,  vectorSize * sizeof(cl_float));

	//Inputs and Outputs to Kernel, X and Y are inputs, Z is output
	//The aligned attribute is used to ensure alignment
	//so that DMA could be used if we were working with a real FPGA board
	cl_float X[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Y[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Z[vectorSize]  __attribute__ ((aligned (64)));

	//Allocates memory with value from 0 to 1000
	cl_float LO= 0;   cl_float HI=1000;
	fill_generate(X, Y, Z, LO, HI, vectorSize);

	//Write data to device
	////////////// Exercise 1 Step 2.9
	err = myqueue.enqueueWriteBuffer(Buffer_In , true, 0, vectorSize * sizeof(cl_float), X);
	checkErr(err, "WriteBuffer");
	err = myqueue.enqueueWriteBuffer(Buffer_In2, true, 0, vectorSize * sizeof(cl_float), Y);
	checkErr(err, "WriteBuffer 2");
	myqueue.finish();

#ifndef EXERCISE1
	// create the kernel
	const char *kernel_name = "SimpleKernel";

	//Read in binaries from file
	std::ifstream aocx_stream("../SimpleKernel.aocx", std::ios::in|std::ios::binary);
	checkErr(aocx_stream.is_open() ? CL_SUCCESS:-1, "SimpleKernel.aocx");
	std::string prog(std::istreambuf_iterator<char>(aocx_stream), (std::istreambuf_iterator<char>()));
	cl::Program::Binaries mybinaries (1, std::make_pair(prog.c_str(), prog.length()));

	// Create the Program from the AOCX file.
	////////////////////// Exercise 2 Step 2.3    ///////////////////
	cl::Program myprogram(mycontext, DeviceList, mybinaries);
	checkErr(err, "Program Constructor");

	// build the program
	//////////////      Compile the Kernel.... For Intel FPGA, nothing is done here, but this conforms to the standard
	//////////////       Exercise 2   Step 2.4    ///////////////////
	err= myprogram.build(DeviceList,NULL);
	checkErr(err, "Build Program");

	// create the kernel
	//////////////       Find Kernel in Program
	//////////////       Exercise 2   Step 2.5    ///////////////////
	cl::Kernel mykernel(myprogram, kernel_name,&err);
	checkErr(err, "Kernel Creation");

	//////////////     Set Arguments to the Kernels
	//////////////       Exercise 2   Step 2.6    ///////////////////
	err = mykernel.setArg(0, Buffer_In);
	checkErr(err, "Arg 0");
	err = mykernel.setArg(1, Buffer_In2);
	checkErr(err, "Arg 1");
	err = mykernel.setArg(2, Buffer_Out);
	checkErr(err, "Arg 2");
	err = mykernel.setArg(3, vectorSize);
	checkErr(err, "Arg 3");


	printf("\nLaunching the kernel...\n");
	
	// Launch Kernel
	//////////////       Exercise 2   Step 2.7    ///////////////////
	err= myqueue.enqueueNDRangeKernel(mykernel,cl::NullRange,cl::NDRange(1),cl::NullRange);
	checkErr(err, "Kernel Execution");

	// read the output
	//////////////       Exercise 2   Step 2.8    ///////////////////
	err= myqueue.enqueueReadBuffer(Buffer_Out , true, 0, vectorSize * sizeof(cl_float), Z);
	checkErr(err, "Read Buffer");

	err=myqueue.finish();
	checkErr(err, "Finish Queue");
	
	float CalcZ[vectorSize];

	for (uint i=0; i<vectorSize; i++)
	{
		//////////////  Equivalent Code running on CPUs
		//////////////       Exercise 2   Step 2.9    ///////////////////
		CalcZ[i] = X[i]*Y[i];
				
	}

	//Print Performance Results
	verification (X, Y, Z, CalcZ, vectorSize);

#endif

    return 1;
}

FPGA + OpenCL on Coursera (exercise in Week2)

To be honest, the exercise is a bit inconsiderate, and so, I would like to show mine here.

#include <math.h>
#include <fstream>
#include <stdio.h>
#include <string>

#include "CL/cl.hpp"
#include "utility.h"

static const cl_uint vectorSize = 4096; //must be evenly divisible by workSize
static const cl_uint workSize = 256;

#define EXERCISE1

int main(void)
{
	cl_int err;

	//Setup Platform
	//Get Platform ID
	std::vector<cl::Platform> PlatformList;

	////////////// Exercise 1 Step 2.3
	err = cl::Platform::get(&PlatformList);
	checkErr(err, "Get Platform List");
	checkErr(PlatformList.size()>=1 ? CL_SUCCESS : -1, "cl::Platform::get");
	print_platform_info(&PlatformList);
	//Look for Fast Emulation Platform
	uint current_platform_id=get_platform_id_with_string(&PlatformList, "Emulation");
	printf("Using Platform: %d\n\n", current_platform_id);
	
	//Setup Device
	//Get Device ID
	std::vector<cl::Device> DeviceList;

	////////////// Exercise 1 Step 2.5
	err = PlatformList[current_platform_id].getDevices(CL_DEVICE_TYPE_ALL, &DeviceList);
	checkErr(err, "Get Devices");
	print_device_info(&DeviceList);
	
	//Create Context
	////////////// Exercise 1 Step 2.6 
	cl::Context mycontext = cl::Context(DeviceList);

	checkErr(err, "Context Constructor");
	
	//Create Command queue
	////////////// Exercise 1 Step 2.7
	cl::CommandQueue myqueue = cl::CommandQueue(mycontext , DeviceList[0], 0, &err);
	checkErr(err, "Queue Constructor");

	//Create Buffers for input and output
	////////////// Exercise 1 Step 2.8
	cl::Buffer Buffer_In (mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_In2(mycontext, CL_MEM_READ_ONLY ,  vectorSize * sizeof(cl_float));
	cl::Buffer Buffer_Out(mycontext, CL_MEM_WRITE_ONLY,  vectorSize * sizeof(cl_float));

	//Inputs and Outputs to Kernel, X and Y are inputs, Z is output
	//The aligned attribute is used to ensure alignment
	//so that DMA could be used if we were working with a real FPGA board
	cl_float X[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Y[vectorSize]  __attribute__ ((aligned (64)));
	cl_float Z[vectorSize]  __attribute__ ((aligned (64)));

	//Allocates memory with value from 0 to 1000
	cl_float LO= 0;   cl_float HI=1000;
	fill_generate(X, Y, Z, LO, HI, vectorSize);

	//Write data to device
	////////////// Exercise 1 Step 2.9
	err = myqueue.enqueueWriteBuffer(Buffer_In , true, 0, vectorSize * sizeof(cl_float), X);
	checkErr(err, "WriteBuffer");
	err = myqueue.enqueueWriteBuffer(Buffer_In2, true, 0, vectorSize * sizeof(cl_float), Y);
	checkErr(err, "WriteBuffer 2");
	myqueue.finish();

#ifndef EXERCISE1
	// create the kernel
	const char *kernel_name = "SimpleKernel";

	//Read in binaries from file
	std::ifstream aocx_stream("../SimpleKernel.aocx", std::ios::in|std::ios::binary);
	checkErr(aocx_stream.is_open() ? CL_SUCCESS:-1, "SimpleKernel.aocx");
	std::string prog(std::istreambuf_iterator<char>(aocx_stream), (std::istreambuf_iterator<char>()));
	cl::Program::Binaries mybinaries (1, std::make_pair(prog.c_str(), prog.length()));

	// Create the Program from the AOCX file.
	////////////////////// Exercise 2 Step 2.3    ///////////////////
	cl::Program
	checkErr(err, "Program Constructor");

	// build the program
	//////////////      Compile the Kernel.... For Intel FPGA, nothing is done here, but this conforms to the standard
	//////////////       Exercise 2   Step 2.4    ///////////////////
	err=
	checkErr(err, "Build Program");

	// create the kernel
	//////////////       Find Kernel in Program
	//////////////       Exercise 2   Step 2.5    ///////////////////
	cl::Kernel
	checkErr(err, "Kernel Creation");

	//////////////     Set Arguments to the Kernels
	//////////////       Exercise 2   Step 2.6    ///////////////////
	err =
	checkErr(err, "Arg 0");
	err =
	checkErr(err, "Arg 1");
	err =
	checkErr(err, "Arg 2");
	err =
	checkErr(err, "Arg 3");


	printf("\nLaunching the kernel...\n");
	
	// Launch Kernel
	//////////////       Exercise 2   Step 2.7    ///////////////////
	err=
	checkErr(err, "Kernel Execution");

	// read the output
	//////////////       Exercise 2   Step 2.8    ///////////////////
	err=
	checkErr(err, "Read Buffer");

	err=myqueue.finish();
	checkErr(err, "Finish Queue");
	
	float CalcZ[vectorSize];

	for (uint i=0; i<vectorSize; i++)
	{
		//////////////  Equivalent Code running on CPUs
		//////////////       Exercise 2   Step 2.9    ///////////////////
		CalcZ[i] =
				
	}

	//Print Performance Results
	verification (X, Y, Z, CalcZ, vectorSize);

#endif

    return 1;
}

FPGA + OpenCL environment building

An environment should be built by myself, and I am writing down what I have done so far to get it.

(1) Install Centos 7 (this is the requirement by Intel's SDK)
(2) Download SDK from here. Version 19.1 might be the best for the course. (30GB!)
(3) Install everything under /root. Root access is necessary for the emulator.
(4) Install eclipse. Download from here. The latest version might be OK.

I tried to use an eclipse included in SDK, but there was an issue in a menu window.
Actually, no need to stick to the bundled one, at least for the exercise in Week2.

Coursera's FPGA + OpenCL course

As a part of my training duties, I have just started a course on Coursera.
It seems, it is actually just a copy of Intel's one but OK to me.

https://www.coursera.org/learn/opencl-fpga-introduction

2020年8月5日水曜日

ハンガリーで結婚式に参加した

妻の同僚のハンガリー人が結婚式に私も招待してくれたので、参加してきました。

ブダペストから更に来るまで1時間半もかかるようなところなので、

前泊（正確には前々日に移動だけ）して、ブダペスト観光しつつ、当日は新郎新婦の友人に車で送ってもらいました（外国人には公共交通機関での移動は無理っぽい）。

式が14時始まりで、披露宴が終わるのが４AMという死にそうな予定で、もちろん我々は途中で諦めましたが、それでも1時。翌日は新郎新婦の車に乗せてもらってウィーンまで帰ったのですが、途中で逃げた我々より新郎新婦のほうがしっかりしていた。体力の違いか、若さか。

写真は観光に行った、Gerbeaudという喫茶店、Matthias Church, 世界遺産になってるメトロ（M1線）、鎖橋、結婚式の式次第に式の様子。

2020年8月4日火曜日

List of gates available in Q#

I always lose the list of Q#, and so, I would like to put it as memo here.

https://docs.microsoft.com/en-us/qsharp/api/qsharp/microsoft.quantum.intrinsic

Grover's algorithm (Algorithm 1, in Section 2)

Continuation of Q# practice (Bell state).

Grover's algorithm described in Section 2, Algorithm 1.

In this example, Oracle is expressed just with Toffoli (CCNOT) gate, and this is not a general implementation, as far as I understand.

In the literature, they constructed CCNOT with natively supported gates, but Q# has it, and I just used it (control and ancillary are a bit confusing to me though).

By running my sample code the numbers of measurements observed in 1000 trials are;

00:0, 01:0, 10:0, 11:1000

Unlike the previous one, the result is completely the same as the theory.

I feel I should split Q# and C# parts in a better way.

In any case, I am getting used to it, hopefully.

Sample codes are;

Q# (Program.qs)

/// # Summary
/// 
namespace Grover {

    open Microsoft.Quantum.Canon;
    open Microsoft.Quantum.Intrinsic;
    

    operation Grover() : (Int,Int,Int,Int) {
        Message("This is Q# implementation of Algorithm 1 in Quantum Algorithm Implementaions for Beginners");

        mutable n00 = 0;
        mutable n01 = 0;
        mutable n10 = 0;
        mutable n11 = 0;

        for(trial in 1..1000){
        using ( (qubit0, qubit1, qubit2) = (Qubit(), Qubit(),Qubit()) ){
            // qubit0: ancillar
            // qubit1, 2: x1, x2

            //Initializaiton
            X(qubit0);
            H(qubit0);
            H(qubit1);
            H(qubit2);
            // qubit0 is called ancillar, looking at the following site and the article, it should be the target
            // https://quantumcomputing.stackexchange.com/questions/3943/how-do-you-implement-the-toffoli-gate-using-only-single-qubit-and-cnot-gates
            CCNOT(qubit1,qubit2,qubit0); //Toffoli gate

            // Grover diffusion operator
            H(qubit1);
            X(qubit1);
            H(qubit1);

            H(qubit2);
            X(qubit2);

            CNOT(qubit2,qubit1);

            H(qubit1);
            X(qubit1);
            H(qubit1);

            X(qubit2);
            H(qubit2);

            let res0 = M(qubit1);
            let res1 = M(qubit2);
            if(res0 == Zero){
                if(res1 == Zero){
                    set n00 += 1;
                }else{
                    set n01 += 1;
                }
            }else{
                if(res1 == Zero){
                    set n10 += 1;
                }else{
                    set n11 += 1;
                }

            }
            Reset(qubit0);
            Reset(qubit1);
            Reset(qubit2);
        }
        }
        return(n00,n01,n10,n11);
    }
}

C# (Driver.cs)

using System;

using Microsoft.Quantum.Simulation.Core;
using Microsoft.Quantum.Simulation.Simulators;

namespace Grover
{
    class Driver
    {
        static void Main(string[] args)
        {
            
            using (var qsim = new QuantumSimulator())
            {
                long a,b,c,d;
                (a,b,c,d) = Grover.Run(qsim).Result;
                Console.WriteLine("{0},{1},{2},{3}",a,b,c,d);
            }
        }
    }
}

登録: 投稿 (Atom)