Merge remote-tracking branch 'origin/wip'

2025-08-02 10:03:33 +02:00
3 changed files with 170 additions and 10 deletions


@@ -1,4 +1,25 @@
import os
import subprocess
testcmd = subprocess.run(["python", "test_py_mods.py"], capture_output=True)
print(testcmd.stderr.decode("utf-8").strip())
assert testcmd.returncode == 0
have_ffprobe = False
try:
    have_ffprobe = subprocess.run(["ffprobe", "--version"], capture_output=True)
    have_ffprobe = have_ffprobe.returncode == 0
except OSError:
    pass
if not have_ffprobe:
    print("warn: ffprobe not installed")
have_pngquant = False
try:
    have_pngquant = subprocess.run(["pngquant", "--version"], capture_output=True)
    have_pngquant = have_pngquant.returncode == 0
except OSError:
    pass
if not have_pngquant:
    print("warn: pngquant not installed")
web_targets = []
@@ -63,18 +84,37 @@ rule cargo_release_bin
    command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
    pool = console
rule touch
    command = touch $out
"""
if have_ffprobe:
    gen += """
rule expect_img_size
    command = eval "[ $$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 $in) = $size ]" && touch $out
rule ffmpeg_compress
    command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error
"""
else:
    gen += """
rule expect_img_size
    command = touch $out
rule ffmpeg_compress
    command = cp $in $out
"""
if have_pngquant:
    gen += """
rule pngquant
    command = pngquant $in -o $out --force --quality $quality
"""
else:
    gen += """
rule pngquant
    command = cp $in $out
"""
gen += """
build build/deploy/coffee.js : python_capture gen_coffee_js.py
@@ -88,7 +128,7 @@ web_targets.append("build/coffee_server")
pages = [x for x in os.listdir("./pages/")]
gen += """
build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
build gen_typst: phony build/pages.typ | """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
"""


@@ -45,7 +45,8 @@
Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
Compute units also have some fast local memory (tens of kilobytes),
main memory access queues, texture units, a scalar unit, and other features.
Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
The main memory (graphics memory) is typically outside of the GPU, and is slow (high-latency) but high-bandwidth memory.
]
@@ -67,6 +68,15 @@
=> waves are really similar to SIMD on modern CPUs
]
#section[
In modern GPUs, instruction execution within a wave is superscalar:
there are multiple execution units for different kinds of instructions,
and several instructions can execute at once when free execution units are
available and the instructions don't depend on each other.
We'll be exploring that in a future article.
]
#section[
== Local memory
The local memory inside GPUs is banked, typically into 32 banks.
@@ -144,6 +154,7 @@
- 48 vector registers of 16x32b per wave
- one scalar unit per CU
- 128 global memory ports
- 16 async task completion "signal" slots per wave
- no fancy out of order or superscalar execution
- support standard 32 bit floating point, without exceptions.
@@ -181,6 +192,7 @@
- `Sreg`: the first element of a vector register, as scalar
- `Sany`: a `Simm` or an `Sreg`
- `dist`: a `Vany`, or an `Sany` broadcast to each element
- `sig`: one of the 16 completion signal slots
]
#section[ #section[
@@ -210,19 +222,120 @@
]
#section[
=== Local memory
- load 32 bit value at each elem where mask is true:
`fn local_load32(out out: Vreg, in mask: M, in addr: Vreg)`
- store 32 bit value at each elem where mask is true:
`fn local_store32(in addr: Vreg, in mask: M, in val: Vany)`
A usage sketch follows below.
]
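#section[
To make the masked semantics concrete, here is a minimal sketch (the
register names `v0`, `v1` and mask `m0` are illustrative, not part of the
ISA above):
```
// store the immediate 42 at each lane's address where m0 is true;
// 42 is an `Sany`, broadcast to every masked element
local_store32(addr = v0, mask = m0, val = 42)
// load the values back into v1 from the same per-lane addresses;
// we assume lanes where m0 is false leave v1 unchanged
local_load32(out = v1, mask = m0, addr = v0)
```
]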
#section[
=== Global (async) memory
- start an async global load, and make the given signal correspond to the completion of the access;
load 32 bit value at each elem where mask is true:
`fn global_load32(out sig: sig, out out: Vreg, in mask: M, in addr: Vreg)`
- the store counterpart; see `global_load32` above and `local_store32`:
`fn global_store32(out sig: sig, in addr: Vreg, in mask: M, in val: Vany)`
- `fn sig_done1(out r: Sreg, in sig: sig)`
- `fn sig_done2(out r: Sreg, in sig1: sig, in sig2: sig)`
- `fn sig_wait(out r: Sreg, in sig: sig)`
- `fn sig_waitall2(out r: Sreg, in sig1: sig, in sig2: sig)`
- `fn sig_waitall3(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig)`
- `fn sig_waitall4(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig, in sig4: sig)`
As a future extension, we could add an instruction that waits for any of the
given signals to complete, and then jumps to a specific location depending on which one completed.
A usage sketch for async loads follows after this list.
]
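#section[
A minimal usage sketch for an async load (an assumed idiom, not mandated by
the spec; register names are illustrative): issue the load early, overlap
independent work, then block on the signal before consuming the result.
```
// start an async load into v2; completion is tracked by signal slot s0
global_load32(sig = s0, out = v2, mask = m0, addr = v1)
// ... independent instructions can execute while the load is in flight ...
// block the whole wave until the load has completed
sig_wait(r = v0, sig = s0)
// v2 now holds the loaded values and is safe to use
```
]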
#section[
=== Control flow (whole wave)
- branch if scalar is zero:
`fn brz(in dest: Simm, in val: Sany)`
- branch if scalar is not zero:
`fn brnz(in dest: Simm, in val: Sany)`
- branch on the whole wave if each element has a true value for the mask:
`fn br_all(in dest: Simm, in cond: M)`
- branch on the whole wave if any element has a true value for the mask:
`fn br_any(in dest: Simm, in cond: M)`
A loop sketch follows below.
]
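#section[
For illustration, a wave-level loop might look like the following sketch
(we assume a `Simm` branch destination can be written as a label; `loop:`
and `m0` are illustrative):
```
loop:
// ... loop body: each lane clears its bit in mask m0 once it is done ...
// keep looping on the whole wave while any lane still has work left
br_any(dest = loop, cond = m0)
```
]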
#section[
= Hand-compiling code
Now that we've decided on a simple compute-only GPU architecture,
we can try hand-compiling an OpenCL program.
I asked an LLM to produce an N*N matmul example (comments written manually):
```c
// a convenient number for our specific hardware
#define TILE_SIZE 8
// this kernel will be launched with dimensions:
// global[2] = { 128,128 } = { N, N };
// local[2] = { 8,8 } = { TILE_SIZE, TILE_SIZE };
__kernel void matmul_tiled(
    __global float* A,
    __global float* B,
    __global float* C,
    const int N)
{
    int row = get_global_id(1); // y
    int col = get_global_id(0); // x
    __local float Asub[TILE_SIZE][TILE_SIZE];
    __local float Bsub[TILE_SIZE][TILE_SIZE];
    float sum = 0.0f;
    for (int t = 0; t < N / TILE_SIZE; ++t) {
        // load tiles into local memory
        int tiledRow = row;
        int tiledCol = t * TILE_SIZE + get_local_id(0);
        if (tiledRow < N && tiledCol < N)
            Asub[get_local_id(1)][get_local_id(0)] = A[tiledRow * N + tiledCol];
        else
            Asub[get_local_id(1)][get_local_id(0)] = 0.0f;
        tiledRow = t * TILE_SIZE + get_local_id(1);
        tiledCol = col;
        if (tiledRow < N && tiledCol < N)
            Bsub[get_local_id(1)][get_local_id(0)] = B[tiledRow * N + tiledCol];
        else
            Bsub[get_local_id(1)][get_local_id(0)] = 0.0f;
        // sync local memory access across the local group
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < TILE_SIZE; ++k)
            sum += Asub[get_local_id(1)][k] * Bsub[k][get_local_id(0)];
        // sync local memory access across the local group
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (row < N && col < N)
        C[row * N + col] = sum;
}
```
]
#section[
First, we have to decide how we want to map the kernel to the hardware.
Since the local dimension of the kernel is 8*8, which is 64 work items,
we can map each local group to one CU, by mapping 32 kernel instances to one wave
and using both waves available on one CU for the local group.
Our global dimension is 128*128, which is 16384 work items, or 256 local groups,
so mapping every group at once would need 256 compute units.
But since we probably don't have 256 compute units,
GPUs, including ours, have an on-hardware task scheduler
that schedules local groups onto compute units as they become free.
See the mapping sketch below.
]
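#section[
To make that mapping concrete, here is one possible lane-to-work-item
assignment (an assumption; the article doesn't fix one). With 32 work items
per wave, each wave covers four rows of the 8x8 local group:
```
// per-lane index math for an 8x8 local group, 32 lanes per wave (sketch):
// local_id.x = lane % 8
// local_id.y = wave_in_group * 4 + lane / 8   (wave_in_group is 0 or 1)
// global_id.x = group_id.x * 8 + local_id.x
// global_id.y = group_id.y * 8 + local_id.y
// (128*128 global items) / (64 items per group) = 256 groups to schedule
```
]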
#section[
= Outro
Modern GPUs are really complex, but designing a simple GPU is not that hard.
Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
]
]

test_py_mods.py Normal file

@@ -0,0 +1,7 @@
import os
import sys
import requests
import json
from feedgen.feed import FeedGenerator
import subprocess
import fontTools