Again, someone with no prior experience may need some help, so the complete listings follow:
SimpleKernel.cl
//ACL Kernel
//
// Element-wise product of two float vectors: z[i] = x[i] * y[i].
//
// The kernel does not query get_global_id(); every work-item that executes it
// walks the whole range [0, vectorSize) on its own, so it is presumably meant
// to be launched with a single work-item (confirm against the host code).
//
//   x, y        read-only input vectors in global memory
//   z           output vector in global memory
//   vectorSize  number of elements to process; all three buffers are
//               indexed up to vectorSize - 1
//
// `restrict` tells the compiler the three pointers never alias each other.
__kernel void SimpleKernel(__global const float* restrict x,
                           __global const float* restrict y,
                           __global float* restrict z,
                           uint vectorSize)
{
    // Unsigned index so the comparison against the uint bound is not a
    // signed/unsigned mix.
    for (uint i = 0; i < vectorSize; i++) {
        z[i] = x[i] * y[i];
    }
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | #include <math.h> #include <fstream> #include <stdio.h> #include <string> #include "CL/cl.hpp" #include "utility.h" static const cl_uint vectorSize = 4096; //must be evenly divisible by workSize static const cl_uint workSize = 256; //#define EXERCISE1 int main(void) { cl_int err; //Setup Platform //Get Platform ID std::vector<cl::Platform> PlatformList; ////////////// Exercise 1 Step 2.3 err = cl::Platform::get(&PlatformList); checkErr(err, "Get Platform List"); checkErr(PlatformList.size()>=1 ? CL_SUCCESS : -1, "cl::Platform::get"); print_platform_info(&PlatformList); //Look for Fast Emulation Platform uint current_platform_id=get_platform_id_with_string(&PlatformList, "Emulation"); printf("Using Platform: %d\n\n", current_platform_id); //Setup Device //Get Device ID std::vector<cl::Device> DeviceList; ////////////// Exercise 1 Step 2.5 err = PlatformList[current_platform_id].getDevices(CL_DEVICE_TYPE_ALL, &DeviceList); checkErr(err, "Get Devices"); print_device_info(&DeviceList); //Create Context ////////////// Exercise 1 Step 2.6 cl::Context mycontext = cl::Context(DeviceList); checkErr(err, "Context Constructor"); //Create Command queue ////////////// Exercise 1 Step 2.7 cl::CommandQueue myqueue = cl::CommandQueue(mycontext , DeviceList[0], 0, &err); checkErr(err, "Queue Constructor"); //Create Buffers for input and output ////////////// Exercise 1 Step 2.8 cl::Buffer Buffer_In (mycontext, CL_MEM_READ_ONLY , vectorSize * sizeof(cl_float)); cl::Buffer Buffer_In2(mycontext, CL_MEM_READ_ONLY , 
vectorSize * sizeof(cl_float)); cl::Buffer Buffer_Out(mycontext, CL_MEM_WRITE_ONLY, vectorSize * sizeof(cl_float)); //Inputs and Outputs to Kernel, X and Y are inputs, Z is output //The aligned attribute is used to ensure alignment //so that DMA could be used if we were working with a real FPGA board cl_float X[vectorSize] __attribute__ ((aligned (64))); cl_float Y[vectorSize] __attribute__ ((aligned (64))); cl_float Z[vectorSize] __attribute__ ((aligned (64))); //Allocates memory with value from 0 to 1000 cl_float LO= 0; cl_float HI=1000; fill_generate(X, Y, Z, LO, HI, vectorSize); //Write data to device ////////////// Exercise 1 Step 2.9 err = myqueue.enqueueWriteBuffer(Buffer_In , true, 0, vectorSize * sizeof(cl_float), X); checkErr(err, "WriteBuffer"); err = myqueue.enqueueWriteBuffer(Buffer_In2, true, 0, vectorSize * sizeof(cl_float), Y); checkErr(err, "WriteBuffer 2"); myqueue.finish(); #ifndef EXERCISE1 // create the kernel const char *kernel_name = "SimpleKernel"; //Read in binaries from file std::ifstream aocx_stream("../SimpleKernel.aocx", std::ios::in|std::ios::binary); checkErr(aocx_stream.is_open() ? CL_SUCCESS:-1, "SimpleKernel.aocx"); std::string prog(std::istreambuf_iterator<char>(aocx_stream), (std::istreambuf_iterator<char>())); cl::Program::Binaries mybinaries (1, std::make_pair(prog.c_str(), prog.length())); // Create the Program from the AOCX file. ////////////////////// Exercise 2 Step 2.3 /////////////////// cl::Program myprogram(mycontext, DeviceList, mybinaries); checkErr(err, "Program Constructor"); // build the program ////////////// Compile the Kernel.... 
For Intel FPGA, nothing is done here, but this conforms to the standard ////////////// Exercise 2 Step 2.4 /////////////////// err= myprogram.build(DeviceList,NULL); checkErr(err, "Build Program"); // create the kernel ////////////// Find Kernel in Program ////////////// Exercise 2 Step 2.5 /////////////////// cl::Kernel mykernel(myprogram, kernel_name,&err); checkErr(err, "Kernel Creation"); ////////////// Set Arguments to the Kernels ////////////// Exercise 2 Step 2.6 /////////////////// err = mykernel.setArg(0, Buffer_In); checkErr(err, "Arg 0"); err = mykernel.setArg(1, Buffer_In2); checkErr(err, "Arg 1"); err = mykernel.setArg(2, Buffer_Out); checkErr(err, "Arg 2"); err = mykernel.setArg(3, vectorSize); checkErr(err, "Arg 3"); printf("\nLaunching the kernel...\n"); // Launch Kernel ////////////// Exercise 2 Step 2.7 /////////////////// err= myqueue.enqueueNDRangeKernel(mykernel,cl::NullRange,cl::NDRange(1),cl::NullRange); checkErr(err, "Kernel Execution"); // read the output ////////////// Exercise 2 Step 2.8 /////////////////// err= myqueue.enqueueReadBuffer(Buffer_Out , true, 0, vectorSize * sizeof(cl_float), Z); checkErr(err, "Read Buffer"); err=myqueue.finish(); checkErr(err, "Finish Queue"); float CalcZ[vectorSize]; for (uint i=0; i<vectorSize; i++) { ////////////// Equivalent Code running on CPUs ////////////// Exercise 2 Step 2.9 /////////////////// CalcZ[i] = X[i]*Y[i]; } //Print Performance Results verification (X, Y, Z, CalcZ, vectorSize); #endif return 1; } |