diff --git a/config.py b/config.py index e08d7a0..18bb652 100644 --- a/config.py +++ b/config.py @@ -1,4 +1,25 @@ import os +import subprocess + +testcmd=subprocess.run(["python", "test_py_mods.py"], capture_output=True) +print(testcmd.stderr.decode("utf-8").strip()) +assert testcmd.returncode == 0 + +have_ffprobe=False +try: + have_ffprobe = subprocess.run(["ffprobe", "--version"], capture_output=True) + have_ffprobe = have_ffprobe.returncode == 0 +except:pass +if not have_ffprobe: + print("warn: ffprobe not installed") + +have_pngquant=False +try: + have_pngquant = subprocess.run(["pngquant", "--version"], capture_output=True) + have_pngquant = have_pngquant.returncode == 0 +except:pass +if not have_pngquant: + print("warn: pngquant not installed") web_targets = [] @@ -63,18 +84,37 @@ rule cargo_release_bin command = (cd $in && cargo build --release) && cp $in/target/release/$file $out pool = console +rule touch + command = touch $out +""" + +if have_ffprobe: + gen += """ rule expect_img_size command = eval "[ $$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 $in) = $size ]" && touch $out -rule touch +rule ffmpeg_compress + command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error + """ +else: + gen += """ +rule expect_img_size command = touch $out rule ffmpeg_compress - command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error + command = cp $in $out + """ +if have_pngquant: + gen += """ rule pngquant command = pngquant $in -o $out --force --quality $quality -""" + """ +else: + gen += """ +rule pngquant + command = cp $in $out + """ gen += """ build build/deploy/coffee.js : python_capture gen_coffee_js.py @@ -88,7 +128,7 @@ web_targets.append("build/coffee_server") pages = [x for x in os.listdir("./pages/")] gen += """ -build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ +build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ """+ " 
".join(f"build/{x}.git_rev.txt.iso" for x in pages) +""" build gen_typst: phony build/pages.typ | """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +""" """ diff --git a/pages/article-gpu-arch-1.typ b/pages/article-gpu-arch-1.typ index bd8f067..0e93336 100644 --- a/pages/article-gpu-arch-1.typ +++ b/pages/article-gpu-arch-1.typ @@ -45,7 +45,8 @@ Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp". Compute units also have some fast local memory (tens of kilobytes), - main memory access queues, texture units, a scalar unit, and other features. (see future article) + main memory access queues, texture units, a scalar unit, and other features. + Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles. The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory. ] @@ -67,6 +68,15 @@ => waves are really similar to SIMD on modern CPUs ] +#section[ + In modern GPUs, instruction execution in waves is superscalar, + so there are multiple different execution units for executing different kinds of instructions, + and multiple instructions can be executed at once, if there are free execution units, + and they don't depend on each other. + + We'll be exploring that in a future article. +] + #section[ == Local memory The local memory inside GPUs is banked, typically into 32 banks. @@ -144,6 +154,7 @@ - 48 vector registers of 16x32b per wave - one scalar unit per CU - 128 global memory ports + - 16 async task completion "signal" slots per wave - no fancy out of order or superscalar execution - support standard 32 bit floating point, without exceptions. 
@@ -181,6 +192,7 @@ - `Sreg`: the first element of a vector register, as scalar - `Sany`: a `Simm` or an `Sreg` - `dist`: `Vany`, or a `Sany` broadcasted to each element + - `sig`: one of the 16 completion signal slots ] #section[ @@ -210,19 +222,120 @@ ] #section[ - === Memory - - `fn local_load` - TODO + === Local memory + - load 32 bit value at each elem where mask is true: + `fn local_load32(out out: Vreg, in mask: M, in addr: Vreg)` + - store 32 bit value at each elem where mask is true: + `fn local_store32(in addr: Vreg, in mask: M, in val: Vany)` +] + +#section[ + === Global (async) memory + - start an async global load, and make the given signal correspond to the completion of the access: + load 32 bit value at each elem where mask is true: + `fn global_load32(out sig: sig, out out: Vreg, in mask: M, in addr: Vreg)` + - see above and `local_store32` + `fn global_store32(out sig: sig, in addr: Vreg, in mask: M, in val: Vany)` + - `fn sig_done1(out r: Sreg, in sig: sig)` + - `fn sig_done2(out r: Sreg, in sig1: sig, in sig2: sig)` + - `fn sig_wait(out r: Sreg, in sig: sig)` + - `fn sig_waitall2(out r: Sreg, in sig1: sig, in sig2: sig)` + - `fn sig_waitall3(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig)` + - `fn sig_waitall4(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig, in sig4: sig)` + + As a future extension, we could add an instruction that waits for any of the + given signals to complete, and then jump to a specific location, depending on which of those completed. 
] #section[ === Control flow (whole wave) - TODO + - branch if scalar is zero: + `fn brz(in dest: Simm, in val: Sany)` + - branch if scalar is not zero: + `fn brnz(in dest: Simm, in val: Sany)` + - branch on the whole wave if each element has a true value for the mask: + `fn br_all(in dest: Simm, in cond: M)` + - branch on the whole wave if any element has a true value for the mask: + `fn br_any(in dest: Simm, in cond: M)` ] #section[ = Hand-compiling code - TODO + Now that we decided on a simple compute-only GPU architecture, + we can try hand-compiling an OpenCL program. + + I asked an LLM to produce an N*N matmul example (comments written manually): + ```c + // convenient number for our specific hardware + #define TILE_SIZE 8 + + // this kernel will be launched with dimensions: + // global[2] = { 128,128 } = { N, N }; + // local[2] = { 8,8 } = { TILE_SIZE, TILE_SIZE }; + __kernel void matmul_tiled( + __global float* A, + __global float* B, + __global float* C, + const int N) + { + int row = get_global_id(1); // y + int col = get_global_id(0); // x + + __local float Asub[TILE_SIZE][TILE_SIZE]; + __local float Bsub[TILE_SIZE][TILE_SIZE]; + + float sum = 0.0f; + + for (int t = 0; t < N / TILE_SIZE; ++t) { + // load tiles into local + int tiledRow = row; + int tiledCol = t * TILE_SIZE + get_local_id(0); + if (tiledRow < N && tiledCol < N) + Asub[get_local_id(1)][get_local_id(0)] = A[tiledRow * N + tiledCol]; + else + Asub[get_local_id(1)][get_local_id(0)] = 0.0f; + + tiledRow = t * TILE_SIZE + get_local_id(1); + tiledCol = col; + if (tiledRow < N && tiledCol < N) + Bsub[get_local_id(1)][get_local_id(0)] = B[tiledRow * N + tiledCol]; + else + Bsub[get_local_id(1)][get_local_id(0)] = 0.0f; + + // sync local access across local grp + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < TILE_SIZE; ++k) + sum += Asub[get_local_id(1)][k] * Bsub[k][get_local_id(0)]; + + // sync local access across local grp + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (row < N && col < N) + 
C[row * N + col] = sum; + } + ``` +] + +#section[ + First, we have to decide on how we want to map the kernel to the hardware. + + Since the local dimension of the kernel is 8*8, which is 64, + we can map each local group to one CU, by mapping 32 kernels to one wave, + and using both waves available on one CU for the local group. + + Our global dimension is 128*128, which means that we would need 256 compute units. + But since we probably don't have 256 compute units, + GPUs, including ours, will have an on-hardware task scheduler, + for scheduling tasks onto compute units. +] + +#section[ + = Outro + Modern GPUs are really complex, but designing a simple GPU is not that hard either. + + Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles. ] ] diff --git a/test_py_mods.py b/test_py_mods.py new file mode 100644 index 0000000..b61c34b --- /dev/null +++ b/test_py_mods.py @@ -0,0 +1,7 @@ +import os +import sys +import requests +import json +from feedgen.feed import FeedGenerator +import subprocess +import fontTools