__kernel void SpMV(
    __global const unsigned int* iT,   // CSR row pointers (1-based)
    __global const unsigned int* jT,   // CSR column indices (1-based)
    __global const float* T,           // nonzero values
    __global const float* U,           // input vector
    __global float* V,                 // output vector
    const unsigned int iNumElements)   // number of rows
{
    // get index into global data array
    int iGID = get_global_id(0);
    float sum = 0;
    // bound check (equivalent to the limit on a 'for' loop in standard/serial C code)
    if (iGID < iNumElements)
    {
        // row iGID spans entries iT[iGID]..iT[iGID+1]-1 of the 1-based CSR arrays
        for (int i = iT[iGID]; i < iT[iGID+1]; i++)
            sum += T[i-1] * U[jT[i-1]-1];
        V[iGID] = sum;
    }
}
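For reference, a small sketch of the CSR layout this kernel expects; note the 1-based row pointers and column indices implied by the i-1 and jT[i-1]-1 accesses above. The 3x3 matrix used here is made up for illustration:

// Hypothetical 3x3 symmetric matrix in 1-based CSR, as SpMV above assumes:
//        | 4 1 0 |
//    A = | 1 5 2 |
//        | 0 2 6 |
unsigned int iT[] = {1, 3, 6, 8};          // row pointers: row r spans T[iT[r]-1 .. iT[r+1]-2]
unsigned int jT[] = {1, 2, 1, 2, 3, 2, 3}; // 1-based column index of each stored value
float        T[]  = {4, 1, 1, 5, 2, 2, 6}; // nonzero values, stored row by row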
__kernel void sAXPY(
    const unsigned int N,
    const float ALPHA,
    __global const float* X,
    __global float* Y)
{
    // get index into global data array
    unsigned int i = get_global_id(0);
    if (i < N)
        Y[i] = ALPHA * X[i] + Y[i];
}
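A minimal host-side sketch of how such a kernel might be launched. The command queue, kernel object and device buffers d_X, d_Y are assumed to have been created with the usual clCreateBuffer/clCreateKernel calls, and the work-group size of 256 is an assumption, not part of the generated chain:

#include <CL/cl.h>

// Hypothetical host-side launch of sAXPY (names and work-group size assumed)
void launch_saxpy(cl_command_queue queue, cl_kernel kernel,
                  cl_uint N, cl_float alpha, cl_mem d_X, cl_mem d_Y)
{
    size_t local  = 256;                                // assumed work-group size
    size_t global = ((N + local - 1) / local) * local;  // round up to a multiple of local

    clSetKernelArg(kernel, 0, sizeof(cl_uint),  &N);
    clSetKernelArg(kernel, 1, sizeof(cl_float), &alpha);
    clSetKernelArg(kernel, 2, sizeof(cl_mem),   &d_X);
    clSetKernelArg(kernel, 3, sizeof(cl_mem),   &d_Y);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
}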
Device Info:
Extensions: cl_khr_icd cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics
cl_khr_int64_extended_atomics cl_khr_byte_addressable_store
__kernel void sDOT(
    const unsigned int N,
    __global const float* X,
    __global const float* Y,
    __global float* DOT,
    __global int* FLAG,
    __local float* sdata)
{
    // get index into global data array
    unsigned int tid = get_local_id(0);
    unsigned int i = get_global_id(0);
    sdata[tid] = (i < N) ? X[i]*Y[i] : 0;
    if (i == 0) {
        DOT[0] = 0;   // first work-item initializes the accumulator and the lock
        *FLAG = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // do reduction in shared mem
    for (unsigned int s = 1; s < get_local_size(0); s *= 2)
    {
        int index = 2 * s * tid;
        if (index < get_local_size(0))
        {
            sdata[index] += sdata[index + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // write result for this block to global mem
    if (tid == 0) {
        while (atom_cmpxchg(FLAG, 0, 1) == 1);  // spin until the lock is acquired
        DOT[0] += sdata[0];                     // accumulate this work-group's partial sum
        atom_cmpxchg(FLAG, 1, 0);               // release the lock
    }
}
/********************
Gaspard2 MDE
IP name: MultInstance
function name: mult
Parameters: a,b,c
*********************/
//OpenCL Functions
void mult(
__global const float* a,
__global const float* b,
__global float* c)
{
c[0]=a[0]*b[0];
}
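A sketch of how a generated kernel might wrap this elementary IP, one work-item per repetition. The kernel name and the extra size argument are illustrative, not the chain's actual output:

// Hypothetical generated wrapper: each work-item applies the elementary
// IP 'mult' to its own element of the repetition space.
__kernel void MultInstance_kernel(
    __global const float* a,
    __global const float* b,
    __global float* c,
    const unsigned int n)
{
    unsigned int gid = get_global_id(0);
    if (gid < n)
        mult(&a[gid], &b[gid], &c[gid]);  // call the IP on this work-item's pattern
}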
Outline
GPU/OpenCL/Linear Algebra
Gaspard2 Environment
Gaspard2 and MARTE
Part 2 -
Part 1: What's a GPU?
Matrix Vector Multiplication
1980/1990
Gaspard2 and OpenCL
Modeling in UML + MARTE
- Sparse, Square, Symmetric, CSR format
- ~449,000 nonzero elements, N=30880
- CSR format
- Purpose
- Environment
- MDE
- Computation Model: ArrayOL
- Gaspard2 and OpenCL Example
Wendell Rodrigues, PhD Student
DaRT Team - USTL - LIFL - INRIA Lille
M. Jean-Luc Dekeyser and M. Frederic Guyomarc'h
Array-OL: Array Oriented Language
- Specification language for signal processing applications
- Based on multidimensional arrays
- Makes it possible to express the whole potential parallelism of applications
CL Code:
Region Nord-Pas de Calais - Valeo - GPUTech
Potential Parallelism Expression of Application
- task parallelism
- data parallelism
Vector Processor: Cray
Connection Machines: CM-5
Gaspard2 Environment
read instruction and decode it
fetch these 10 numbers
fetch those 10 numbers
fetch another 10 numbers
add and multiply them
put the results here
It is a "massively parallel" hypercubic arrangement of thousands of microprocessors, each with its own 4 kbits of RAM, which together executed in a SIMD fashion
Gaspard2 Purpose
Final Overview
http://gpumodeling.blogspot.com
Gaspard2 Task Parallelism
APPLICATION MODELING
ARCHITECTURE MODELING
GPU: Graphics Processing Unit
GPU Architecture
- Current GPUs are both a vector machine and massively parallel
- More than SIMD, they are SPMD
- But they're always a co-processor
- Programming Languages: Shader Languages (Sh, Cg, etc.); Brook+(CAL); CUDA, OpenCL (NVidia and AMD)
Task Graph:
- blocks: tasks
- connectors: data dependency
- Tagged values: host and device
- Communication Bus: indicates how data should be transferred
- Memory Hierarchy: tagged values (private, local, constant, global)
SAXPY
An environment for applying MDE (Model Driven Engineering) to the design of RTES (Real-Time Embedded Systems).
- model reuse
- human-friendly models
- promotes parallelism to help reduce power consumption and improve performance
- different target platforms
- opensource and free (DaRT Team/INRIA Lille)
- GPUs are good for linear algebra (low cost, good performance)
- Right now, Gaspard2 is able to generate VHDL, OpenMP C and Fortran, SystemC, and synchronous languages
- My work concentrates on the new chain, the OpenCL chain
- The OpenCL chain beta version is almost ready
OpenCL (Open Computing Language)
Scheduling
Application
Apple, NVidia, AMD, Intel, IBM, RapidMind, Electronic Arts
(EA), 3DLABS, Activision Blizzard, ARM, Barco, Broadcom,
Codeplay, Ericsson, Freescale, HI, Imagination Technologies,
Kestrel Institute, Motorola, Movidia, Nokia, QNX, Samsung,
Seaweed, Takumi, TI and Umeå University
Task Graph: v1//v2, k, r1
Kernel Graph: op1, op2
BLAS Level 1 AXPY is a combination of scalar multiplication and vector addition.
Performs y ← αx + y, where α is a scalar and x, y are vectors.
- Papyrus UML/Eclipse
- MARTE Profile (OMG)
- ArrayOL Concepts (tiler, reshape, etc)
Conjugate Gradient
- An open standard managed by the Khronos Group
– Influenced & guided by Apple
– Spec 1.0 approved Dec'08
- A system for executing short "Enhanced C" routines (kernels) across devices
– Built around heterogeneous platforms – Host & Devices (see the host-side sketch below)
– Devices: CPU, GPU, Accelerator (FPGA)
– Samples, Vector Types, etc.
- Offers hybrid execution capability
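To make the host/device split concrete, a minimal host-side setup sketch: it picks the first platform and the first GPU device, then creates one context and one command queue. Error handling is omitted and the device choice is an assumption:

#include <CL/cl.h>

// Minimal OpenCL host setup sketch (first platform, first GPU device)
static cl_command_queue init_opencl(cl_context *ctx_out)
{
    cl_platform_id platform;
    cl_device_id   device;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    *ctx_out = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    return clCreateCommandQueue(*ctx_out, device, 0, NULL);
}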
Tiler Specification
- origin: {0}
- paving: {{1}}
- fitting: {{}}
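To make the tiler above concrete, a small sketch of how its fields address the array for each repetition. The indexing rule follows the usual Array-OL tiler definition (ref = origin + paving·r, element = ref + fitting·i, modulo the array shape); the array size is an assumption. With an empty fitting the pattern is a single scalar, so repetition r simply reads element r:

#include <stdio.h>

// Sketch: enumerate the elements addressed by a 1-D tiler with
// origin {0}, paving {{1}}, fitting {{}} over an array of size n.
int main(void)
{
    const int n = 8;        // assumed array size
    const int origin = 0;   // tiler origin
    const int paving = 1;   // paving matrix (1x1 here)

    for (int r = 0; r < n; r++) {             // repetition space
        int ref = (origin + paving * r) % n;  // pattern reference point
        printf("repetition %d -> array[%d]\n", r, ref);
    }
    return 0;
}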
- CPU controls the main loop and scalar operations
- Many kernel launches in the main loop
for (int i = 0; i < n; i++) do
    y[i] = a*x[i] + y[i];
end for
Future compiler versions will provide optimization levels:
- Memory copies
- Work-Groups and Work-Items Dimension
- Memory Access Coalescing
Gaspard2 Eclipse Environment
Gaspard2 Data Parallelism
Task Repetition
- pattern producers and consumers
Refactored GPGPU
NVidia's CUDA was a disruptive technology
- lets you write C code that runs on the GPU
- extends GPUs to non-traditional usages
- provides synchronization mechanisms
OpenCL deepens and extends the revolution
ASSOCIATION
At this moment, the user must make explicit:
- where and how the tasks will be executed
- where and how the ports (memory allocations) will be placed
Constant, Global, Local (Shared) or Private???
Implementation Details
Dot Product
BLAS Level 1 DOT, also known as the scalar product, is an
operation which takes two vectors over the real numbers
and returns a real-valued scalar.
x · y = Σ xᵢyᵢ = x₁y₁ + x₂y₂ + ... + xₙyₙ
Simple CPU Sequential C Version
Simple CPU Parallel OCL Version
- SpMV
- AXPY
- DotProduct
- Scalar and Loop Control
DEPLOYMENT
At this moment, the user should indicate:
- IP (Intellectual Property): library, piece of code, binary
OpenCL: kernel functions
- Each ELEMENTARY TASK has an IP
void sumvec(float *A, float *B,
            float *C)
{
    for (int i = 0; i < N; i++)   // N: vector size, assumed defined elsewhere
        C[i] = A[i] + B[i];
}
__kernel void sumvec(__global const float *A,
__global const float *B,
__global float *C)
{
int gid = get_global_id(0);
C[gid] = A[gid] + B[gid];
}
Elementary Task
__kernel void sDOT(
    const unsigned int N,
    __global const float* X,
    __global const float* Y,
    __global float* DOT,
    __local float* SharedData)
{
    // get global and local index
    unsigned int i = get_global_id(0);
    unsigned int tid = get_local_id(0);
    SharedData[tid] = (i < N) ? X[i]*Y[i] : 0;  // operates in local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    // do reduction in local shared mem
    for (int s = 1; s < get_local_size(0); s *= 2) {
        int index = 2 * s * tid;
        if (index < get_local_size(0))
            SharedData[index] += SharedData[index + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // first work-item of each work-group writes its partial sum to global memory
    if (tid == 0) DOT[get_group_id(0)] = SharedData[0];
}
for (int i = 0; i < n; i++) do
dot += x[i]*y[i];
end for
Transformation Chain
- Barriers only synchronize work-items within the same work-group
- Max work-group size: 1024
- Vector size = 30880?
- Solutions?
a) another kernel step to compute the final sum
problem: there is no global synchronization across work-groups before the kernel ends
b) the host makes the final reduction (see the sketch below)
30880 elements / 1024 per work-group ≈ 30 partial sums to add
problem: you can't keep the whole loop inside a single kernel
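A sketch of option (b), assuming the partial-sums version of sDOT above writes one value per work-group into a device buffer; the queue and buffer names are assumptions:

#include <CL/cl.h>

// Hypothetical host-side final reduction: read back the per-work-group
// partial sums left by sDOT and add them on the CPU.
float final_dot(cl_command_queue queue, cl_mem d_DOT, cl_uint N, size_t local)
{
    size_t groups = (N + local - 1) / local;   // 30880 / 1024 -> 31 partial sums
    float  partial[64];                        // enough for this problem size
    float  dot = 0.0f;

    clEnqueueReadBuffer(queue, d_DOT, CL_TRUE, 0, groups * sizeof(float),
                        partial, 0, NULL, NULL);
    for (size_t g = 0; g < groups; g++)
        dot += partial[g];
    return dot;
}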
Gaspard2 Transformation Engine
Transformation Library
Gaspard2 MDE/MDA
UML to MARTE
Instances of Ports
Tiler to Task
- process all the tilers in the model
- generates tasks and functions
Local and Global Task Graph
Scheduling
- based on the Task Graph
- assigns one possible scheduling discipline
QVTO (Query/View/Transformation Operational): bit of code
Memory Mapping
- create Memory Mappings
- create data allocations based on flowports
OpenCL: Work-Groups and Work-items
Gaspard2 provides a transformation chain tool to execute different transformations.
Finally, using Acceleo templates, the engine generates the code files.
mapping Allocate::toKernel() : gpuapi::GPUAPI::Kernel {
    init {
        var kern := self.source->selectOne(true).oclAsType(AssemblyPart); // mapped to GPU
        var ntasks := kern.type[StructuredComponent].ownedProperties[AssemblyPart]
            ->select(type[LinkTopologyTask]->isEmpty()).shape->shapeprod()->maxshape(); // threads count
    }
    name := kern.name;
    assemblyPart := kern;
    launchtopology := object gpuapi::GPUAPI::LaunchTopology { // topology x,y,z = (int)total/256, 256, 0
        dim := (ntasks/256 + 0.5).round();
        dim += 256;
        dim += 0;
    };
    // create functions (not tiler tasks) for each kernel
    functions += kern.type[StructuredComponent].ownedProperties[AssemblyPart]
        ->select(type[TilerTask]->isEmpty())
        ->map toIPFunctions(gpuapi::GPUAPI::placingOptions::Device);
    // vars
    variables += kern.ports[PortPart]->map FuncVars(gpuapi::GPUAPI::placingOptions::Device);
}
GPUAPI
- searches associated tasks
- defines kernels, variables, variable links (copyHostToDevice)
- launch topology
Model to Text (Acceleo)
Dot Product (Atomic Functions)
Code Generated
/src/
.cl, .cpp, Makefile
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
Model Driven Architecture (MDA)
Unified Modeling Language (UML)
Meta Object Facility (MOF)
XML Metadata Interchange (XMI)
Modeling and Analysis of Real-time and Embedded systems (MARTE)
Query / View / Transformation (QVT)
Modeling Example (Array Product)
Result:
- roughly 3x slower
- no performance gain
CODE GENERATED
Application Model
Architecture Model
Association Model
Deployment Model
QVT Transformations and Metamodels
(add, delete and transform elements)
Acceleo Model to Text
Linear Algebra and OpenCL
Electric Machines Simulation
- Maxwell's equations
- linear systems
- We need a linear solver (iterative, direct, etc.)
Remarks
The transformation chain is hidden from the designer
This example shows OpenCL code, but the GPUAPI metamodel also makes it possible to generate CUDA.
- changes in Acceleo Model Templates
- There are no BLAS, FFT, etc. libraries for OpenCL (not yet!)
- You have to write the functions you need yourself
- Our problem: Conjugate Gradient with Sparse Matrices (order of 500k or more elements)
Implementing...
Loop Control
- switching between host and device
- data remains in GPU memory (see the sketch below)
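A minimal sketch of such a host-controlled loop: vectors stay in device buffers, kernels are re-launched every iteration, and only the scalar needed for loop control crosses the host/device boundary. The kernel objects, buffer names and the final_dot helper (sketched earlier) are illustrative, not the chain's generated code:

#include <CL/cl.h>

// illustrative helper sketched earlier: host-side final reduction of sDOT
float final_dot(cl_command_queue queue, cl_mem d_DOT, cl_uint N, size_t local);

// Sketch of a host-controlled main loop keeping data on the device
void solver_loop(cl_command_queue queue, cl_kernel k_spmv, cl_kernel k_axpy,
                 cl_mem d_DOT, cl_uint N, float tol, int max_iter)
{
    size_t local  = 1024;
    size_t global = ((N + local - 1) / local) * local;

    for (int it = 0; it < max_iter; it++) {
        // sparse matrix-vector product, entirely on the device
        // (kernel arguments are assumed to have been set before the loop)
        clEnqueueNDRangeKernel(queue, k_spmv, 1, NULL, &global, &local, 0, NULL, NULL);

        // vector update (AXPY), also on the device
        clEnqueueNDRangeKernel(queue, k_axpy, 1, NULL, &global, &local, 0, NULL, NULL);

        // the only host<->device traffic per iteration: the control scalar
        float residual = final_dot(queue, d_DOT, N, local);
        if (residual < tol)
            break;
    }
}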
Remarks
Writing code
- if we don't need to write the code...
- a tool to generate OpenCL