mirror of
https://github.com/autc04/Retro68.git
synced 2024-12-13 03:29:50 +00:00
259 lines
8.5 KiB
C
259 lines
8.5 KiB
C
/*
|
|
|
|
matmul.c : Matrix Multiplication with tiling for openmp4 example
|
|
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
|
|
#define BLOCK_SIZE 16
|
|
/*
|
|
#define BLOCK_SIZE 32
|
|
*/
|
|
#define NSECPERSEC 1000000000L
|
|
|
|
typedef struct {
|
|
int width;
|
|
int height;
|
|
int stride;
|
|
int hpad;
|
|
float* elements;
|
|
} Matrix;
|
|
|
|
/* Correctly extract the number of nanoseconds from the two time structures */
|
|
long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
|
|
long int nanosecs;
|
|
if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
|
|
((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
|
|
( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
|
|
else nanosecs =
|
|
(((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
|
|
( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
|
|
return nanosecs;
|
|
}
|
|
|
|
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
|
|
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
|
|
void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
|
|
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
|
|
void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
|
|
const float* B,const int LDB, const float beta,float* C, const int LDC) ;
|
|
|
|
int verify(float* v_res, float* v_ref, int len) {
|
|
int passed = 1;
|
|
int i;
|
|
for (i = 0; i < len; ++i) {
|
|
if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
|
|
__builtin_abort ();
|
|
}
|
|
}
|
|
return passed;
|
|
}
|
|
|
|
|
|
int main(int argc, char* argv[]){
|
|
|
|
Matrix A,B,Bt,C,Cref;
|
|
int a1,a2,a3,i,j;
|
|
struct timespec start_time1, end_time1;
|
|
struct timespec start_time2, end_time2;
|
|
long int nanosecs,total_ops;
|
|
float gflopsTiled,gflopsCPU;
|
|
|
|
a1 = 35;
|
|
a2 = 28;
|
|
a3 = 47;
|
|
|
|
A.height = a1;
|
|
A.width = a2;
|
|
A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
|
|
|
|
B.height = a2;
|
|
B.width = a3;
|
|
B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
|
|
|
|
/* Bt is same as B but stored in column-major order */
|
|
Bt.height = B.height;
|
|
Bt.width = B.width;
|
|
Bt.stride = B.stride;
|
|
Bt.hpad = B.hpad;
|
|
Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
|
|
|
|
C.height = a1;
|
|
C.width = a3;
|
|
C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
|
|
|
|
Cref.height = a1;
|
|
Cref.width = a3;
|
|
Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
|
|
Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
|
|
|
|
for(i = 0; i < A.hpad ; i++)
|
|
for(j = 0; j < A.stride; j++) {
|
|
if (( j<A.width ) && (i<A.height)) {
|
|
A.elements[i*A.stride + j] = (i % 3);
|
|
} else {
|
|
A.elements[i*A.stride + j] = 0.0;
|
|
}
|
|
}
|
|
|
|
/* Initialize B and Bt */
|
|
for(i = 0; i < B.hpad ; i++)
|
|
for(j = 0; j < B.stride; j++) {
|
|
if (( j<B.width ) && (i<B.height)) {
|
|
B.elements[i*B.stride+j] = (j % 2);
|
|
Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
|
|
} else {
|
|
B.elements[i*B.stride+j] = 0.0;
|
|
Bt.elements[j*Bt.stride+i] = 0.0;
|
|
}
|
|
}
|
|
|
|
/* zero C, and Cref */
|
|
for(i = 0; i < C.hpad; i++)
|
|
for(j = 0; j < C.stride; j++) {
|
|
C.elements[i*C.stride+j] = 0.0;
|
|
Cref.elements[i*Cref.stride+j] = 0.0;
|
|
}
|
|
|
|
simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
|
|
tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
|
|
|
|
verify(C.elements, Cref.elements, C.height * C.stride);
|
|
return 0;
|
|
}
|
|
|
|
void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
|
|
const float* B,const int LDB, const float beta,float* C, const int LDC) {
|
|
/* A,B, and C are in row-major order */
|
|
int c_row,c_col,inner;
|
|
float sum;
|
|
for (c_col = 0 ; c_col<N; c_col++ ) {
|
|
for (c_row = 0 ; c_row<M; c_row++ ) {
|
|
sum = 0.0 ;
|
|
for (inner = 0 ; inner<K; inner++ ) {
|
|
sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
|
|
}
|
|
C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
|
|
}
|
|
}
|
|
}
|
|
|
|
/***************************
|
|
|
|
tiled_sgemm_tt: Tiled matrix multiplication:
|
|
|
|
***************************/
|
|
|
|
void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
|
|
const float*B, const int LDB, const float beta, float*C, const int LDC){
|
|
|
|
#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
|
|
#pragma omp distribute collapse(2)
|
|
for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
|
|
for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
|
|
|
|
// We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
|
|
// The grid global dimensions are M,N,1
|
|
// The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
|
|
|
|
// -------------------------------------------------------------------
|
|
// The rest of this code forms the HSAIL kernel with the
|
|
// pairs of "paralell for collapse(2)" loops repalced with a barrier.
|
|
// The kernel initializes these values
|
|
// C_row_start = get_group_id(0) * BLOCK_SIZE
|
|
// C_col_start = get_group_id(1) * BLOCK_SIZE
|
|
// row=get_local_id(0)
|
|
// col=get_local_id(1)
|
|
// -------------------------------------------------------------------
|
|
|
|
// Each team has a local copy of these mini matrices
|
|
float As[BLOCK_SIZE][BLOCK_SIZE];
|
|
float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
|
float Cs[BLOCK_SIZE][BLOCK_SIZE];
|
|
int C_row, C_col;
|
|
|
|
/* Zero Cs for this BLOCK */
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
// REPLACE NEXT THREE LINES WITH A BARRIER
|
|
#pragma omp parallel for collapse(2)
|
|
for (int row=0 ; row < BLOCK_SIZE ; row++) {
|
|
for (int col=0 ; col < BLOCK_SIZE ; col++) {
|
|
// END BARRIER
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
Cs[row][col] = 0.0;
|
|
}
|
|
}
|
|
|
|
// This kblock loop is run on the master thread of each team
|
|
for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {
|
|
|
|
// Copy global memory values to local memory
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
// REPLACE NEXT THREE LINES WITH A BARRIER
|
|
#pragma omp parallel for collapse(2)
|
|
for (int row=0 ; row < BLOCK_SIZE ; row++) {
|
|
for (int col=0 ; col < BLOCK_SIZE ; col++) {
|
|
// END BARRIER
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
C_row = C_row_start + row;
|
|
C_col = C_col_start + col;
|
|
if ((C_row < M) && (kblock + col < K))
|
|
As[row][col] = A[(C_row*LDA)+ kblock + col];
|
|
else
|
|
As[row][col] = 0;
|
|
if ((kblock + row < K) && C_col < N)
|
|
Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
|
|
else
|
|
Bs[row][col] = 0;
|
|
}
|
|
}
|
|
|
|
// Calculate Cs <- Sum(As X Bs) across all kblocks
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
// REPLACE NEXT THREE LINES WITH A BARRIER
|
|
#pragma omp parallel for collapse(2)
|
|
for (int row=0 ; row < BLOCK_SIZE ; row++) {
|
|
for (int col=0 ; col < BLOCK_SIZE ; col++) {
|
|
// END BARRIER
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
for (int e = 0; e < BLOCK_SIZE; ++e)
|
|
Cs[row][col] += As[row][e] * Bs[e][col];
|
|
}
|
|
}
|
|
|
|
} /* End for kblock .. */
|
|
|
|
|
|
// Scale Update actual C from Cs
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
// REPLACE NEXT THREE LINES WITH A BARRIER
|
|
#pragma omp parallel for collapse(2)
|
|
for (int row=0 ; row < BLOCK_SIZE ; row++) {
|
|
for (int col=0 ; col < BLOCK_SIZE ; col++) {
|
|
// END BARRIER
|
|
// - - - - - - - - - - - - - - - - - - - -
|
|
C_row = C_row_start + row;
|
|
C_col = C_col_start + col;
|
|
if ((C_row < M) && (C_col < N)) {
|
|
C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
|
|
}
|
|
}
|
|
}
|
|
|
|
// -------------------------------------------------------------------
|
|
// This is the end of the kernel
|
|
|
|
}
|
|
}
|
|
|
|
}
|