mirror of https://github.com/commaai/tinygrad.git
195 lines
4.4 KiB
C
195 lines
4.4 KiB
C
// single: clang -O2 -march=native gemm.c
|
|
// multi: clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
|
|
#define _GNU_SOURCE
|
|
|
|
// https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
|
|
#include <stdint.h>
|
|
#include <time.h>
|
|
#include <sched.h>
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
#include <math.h>
|
|
#include <string.h>
|
|
#include <immintrin.h>
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
#include <stdatomic.h>
|
|
|
|
//#define DEBUG
|
|
|
|
#ifdef DEBUG
|
|
#define N 8
|
|
#endif
|
|
|
|
#ifndef N
|
|
// NOTE: if you change this you have to rerun gemm.py
|
|
#define N 512
|
|
#endif
|
|
|
|
#ifndef NTHREADS
|
|
#define NTHREADS 1
|
|
#endif
|
|
|
|
// aligned?
|
|
float A[N*N] __attribute__ ((aligned (64)));
|
|
float B[N*N] __attribute__ ((aligned (64)));
|
|
float C[N*N] __attribute__ ((aligned (64)));
|
|
float val[N*N] __attribute__ ((aligned (64)));
|
|
|
|
__m256 *Am = (__m256*)A;
|
|
__m256 *Bm = (__m256*)B;
|
|
__m256 *Cm = (__m256*)C;
|
|
|
|
uint64_t nanos() {
|
|
struct timespec start;
|
|
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
|
|
return (uint64_t)start.tv_sec*1000000000 + (uint64_t)start.tv_nsec;
|
|
}
|
|
|
|
float Bf[N*N] __attribute__ ((aligned (64)));
|
|
__m256 *Bfm = (__m256*)Bf;
|
|
|
|
#define BLOCK 8
|
|
#define BLOCK_Y 4
|
|
#define BLOCK_X 2
|
|
void matmul(int sy, int ey) {
|
|
// 136.77 GFLOPS on single core numpy
|
|
// 4.9 GHz is max boost for 5950X
|
|
// 32 FLOPS/cycle (16 FMAs, aka 2x 8 single wide / 32 byte FMAs)
|
|
// theoretical max is 156.8 GFLOPS, we see 150
|
|
// multicore theo max = 2508.8 GFLOPS, we see 1501.434299
|
|
|
|
// Bf = (y/8, k, 8)
|
|
for (int y = sy; y < ey; y+=BLOCK_Y) {
|
|
for (int x = 0; x < N; x+=BLOCK*BLOCK_X) {
|
|
|
|
__m256 acc[BLOCK_Y][BLOCK_X] = {};
|
|
for (int k = 0; k < N; k++) {
|
|
for (int iy = 0; iy < BLOCK_Y; iy++) {
|
|
__m256 ta = _mm256_broadcast_ss(&A[(y+iy)*N + k]);
|
|
for (int ix = 0; ix < BLOCK_X; ix++) {
|
|
acc[iy][ix] = _mm256_fmadd_ps(ta, Bfm[((x+ix*BLOCK)*N + k*8)/8], acc[iy][ix]);
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int iy = 0; iy < BLOCK_Y; iy++) {
|
|
for (int ix = 0; ix < BLOCK_X; ix++) {
|
|
Cm[((y+iy)*N + x + ix * BLOCK)/8] = acc[iy][ix];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
|
atomic_int nready = 0;
|
|
atomic_int ndone = 0;
|
|
|
|
void *matmul_thread(void *n) {
|
|
int k = (int)(int64_t)n;
|
|
int sy = (N/NTHREADS) * k;
|
|
int ey = (N/NTHREADS) * (k+1);
|
|
|
|
cpu_set_t set;
|
|
CPU_ZERO(&set);
|
|
CPU_SET(k,&set);
|
|
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
|
|
|
|
nready++;
|
|
|
|
// gotta have main lock once to signal start
|
|
pthread_mutex_lock(&lock);
|
|
pthread_mutex_unlock(&lock);
|
|
|
|
matmul(sy, ey);
|
|
|
|
// we done
|
|
ndone++;
|
|
return NULL;
|
|
}
|
|
|
|
int main() {
|
|
printf("hello with %d threads\n", NTHREADS);
|
|
|
|
#ifdef DEBUG
|
|
for (int i = 0; i < N*N; i++) A[i] = i;
|
|
for (int i = 0; i < N*N; i++) B[i] = i;
|
|
#else
|
|
FILE *f = fopen("/tmp/matmul", "rb");
|
|
if (f == NULL) {
|
|
printf("please pregenerate python /tmp/matmul file\n");
|
|
return -1;
|
|
}
|
|
fread(A, 1, sizeof(float)*N*N, f);
|
|
fread(B, 1, sizeof(float)*N*N, f);
|
|
fread(val, 1, sizeof(float)*N*N, f);
|
|
fclose(f);
|
|
#endif
|
|
|
|
// preswizzle
|
|
for (int y = 0; y < N; y+=8) {
|
|
for (int x = 0; x < N; x++) {
|
|
for (int iy = 0; iy < 8; iy++) {
|
|
Bf[y*N + x*8 + iy] = B[(y+iy)*N + x];
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
memset(C, 0, N*N*sizeof(float));
|
|
|
|
#if NTHREADS != 1
|
|
nready = 0;
|
|
ndone = 0;
|
|
pthread_mutex_lock(&lock);
|
|
pthread_t threads[NTHREADS];
|
|
for (int j = 0; j < NTHREADS; j++) {
|
|
pthread_create(&threads[j], NULL, matmul_thread, (void *)(uint64_t)j);
|
|
}
|
|
while (nready != NTHREADS) usleep(1);
|
|
#endif
|
|
|
|
uint64_t start = nanos();
|
|
#if NTHREADS == 1
|
|
matmul(0, N);
|
|
#else
|
|
// unlocking mutex starts threads
|
|
pthread_mutex_unlock(&lock);
|
|
while (ndone != NTHREADS) usleep(1);
|
|
#endif
|
|
uint64_t end = nanos();
|
|
|
|
#if NTHREADS != 1
|
|
for (int j = 0; j < NTHREADS; j++) {
|
|
pthread_join(threads[j], NULL);
|
|
}
|
|
#endif
|
|
|
|
double gflop = (2.0*N*N*N)*1e-9;
|
|
double s = (end-start)*1e-9;
|
|
printf("%f GFLOP/S -- %.2f ms\n", gflop/s, s*1e3);
|
|
|
|
// hack around throttling
|
|
//if (i%4 == 0) sleep(1);
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
for (int i = 0; i < N*N; i++) {
|
|
if (i%N == 0 && i != 0) printf("\n");
|
|
printf("%f ", C[i]);
|
|
}
|
|
printf("\n");
|
|
#else
|
|
for (int k = 0; k < N*N; k++) {
|
|
if (fabsf(C[k] - val[k]) > 1e-3) {
|
|
printf("MISMATCH AT %d, %f != %f\n", k, C[k], val[k]);
|
|
return -1;
|
|
}
|
|
}
|
|
printf("match\n");
|
|
#endif
|
|
|
|
return 0;
|
|
}
|