// Watch how Luminal searches for the best kernels.
// Computes one output element per thread block. With x = blockIdx.x and
// y = blockIdx.y:
//
//   d[x*64 + y] = a[x*64 + y] + sum over k in [0, 64) of c[x*64 + k] * b[k*64 + y]
//
// a, c : read at offsets derived from x*64
// b    : read at offsets k*64 + y
// d    : written at offset x*64 + y
//
// threadIdx is never read, so every thread of a block computes the same value
// and stores it to the same address. No bounds are checked: the launch grid
// alone decides which elements are touched.
// NOTE(review): this reads like D = A + C * B on row-major buffers with
// 64-element rows, launched with one thread per block — confirm with the caller.
extern "C" __global__ void kernel0(float* a, float* b, float* c, float* d) {
    const int kStride = 64;     // multiplier applied to blockIdx.x and to the loop index
    const int kTermCount = 64;  // number of products folded into each output

    const int bx = blockIdx.x;
    const int by = blockIdx.y;

    // Base pointers for this block; the grouping of the offsets mirrors the
    // order in which they are applied (x*64 first, then y).
    const float* c_base = c + (bx * kStride);
    const float* b_base = b + by;
    const float* a_elem = (a + (bx * kStride)) + by;
    float* d_elem = (d + (bx * kStride)) + by;

    // The running sum starts from the matching element of `a`.
    float acc = *a_elem;
    for (int k = 0; k < kTermCount; ++k) {
        // Product and accumulation are kept as separate statements on purpose
        // so the compiler sees the same multiply/add structure as before.
        const float prod = c_base[k] * b_base[k * kStride];
        acc = prod + acc;
    }

    *d_elem = acc;
}