Luminal

Watch how Luminal searches for the best kernels.

square matmul
Compile

square matmul Kernel

// kernel0: each thread block produces one output element.
//
// For block coordinates (blockIdx.x, blockIdx.y) the kernel computes
//   d[x*64 + y] = a[x*64 + y] + sum over o in [0, 64) of c[x*64 + o] * b[o*64 + y]
// i.e. the element of `a` seeds the accumulator, 64 products of an element
// of `c` and an element of `b` are added to it in order, and the result is
// stored to `d`.
//
// Parameters:
//   a - read once per block; the initial accumulator value.
//   b - read with a stride of 64 floats, starting at offset blockIdx.y.
//   c - read contiguously, starting at offset blockIdx.x * 64.
//   d - written once per block at offset blockIdx.x * 64 + blockIdx.y.
//
// Notes:
//   - threadIdx is never consulted, so every thread of a block performs the
//     same loads and stores the same value to the same address.
//   - The stride and trip count are hard-coded to 64 and there is no bounds
//     check. NOTE(review): presumably launched with a 64 x 64 grid over
//     row-major 64 x 64 buffers (the surrounding page labels this a
//     "square matmul") - confirm against the launcher.
extern "C" __global__ void kernel0(float* a, float* b, float* c, float* d) {
  const int row = blockIdx.x;
  const int col = blockIdx.y;
  const int rowBase = row * 64;

  const float* seedRow = a + rowBase;  // supplies the starting accumulator value
  const float* lhsRow = c + rowBase;   // walked with unit stride
  const float* rhsCol = b + col;       // walked with a stride of 64 floats
  float* outRow = d + rowBase;

  // Start from the value already present in `a`, then accumulate the
  // products in ascending order so the floating-point result is unchanged.
  float acc = seedRow[col];
  for (int step = 0; step < 64; ++step) {
    acc = lhsRow[step] * rhsCol[step * 64] + acc;
  }

  outRow[col] = acc;
}
CUDA (Nvidia)
NVIDIA H100