Merge remote-tracking branch 'origin/wip'

2025-08-02 10:03:33 +02:00
3 changed files with 170 additions and 10 deletions


@@ -1,4 +1,25 @@
import os
import subprocess
testcmd = subprocess.run(["python", "test_py_mods.py"], capture_output=True)
print(testcmd.stderr.decode("utf-8").strip())
assert testcmd.returncode == 0
have_ffprobe = False
try:
    have_ffprobe = subprocess.run(["ffprobe", "--version"], capture_output=True)
    have_ffprobe = have_ffprobe.returncode == 0
except OSError:
    pass
if not have_ffprobe:
    print("warn: ffprobe not installed")
have_pngquant = False
try:
    have_pngquant = subprocess.run(["pngquant", "--version"], capture_output=True)
    have_pngquant = have_pngquant.returncode == 0
except OSError:
    pass
if not have_pngquant:
    print("warn: pngquant not installed")
web_targets = []
@@ -63,18 +84,37 @@ rule cargo_release_bin
    command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
    pool = console
rule touch
    command = touch $out
"""
if have_ffprobe:
    gen += """
rule expect_img_size
    command = eval "[ $$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 $in) = $size ]" && touch $out
rule ffmpeg_compress
    command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error
"""
else:
    gen += """
rule expect_img_size
    command = touch $out
rule ffmpeg_compress
    command = cp $in $out
"""
if have_pngquant:
    gen += """
rule pngquant
    command = pngquant $in -o $out --force --quality $quality
"""
else:
    gen += """
rule pngquant
    command = cp $in $out
"""
gen += """
build build/deploy/coffee.js : python_capture gen_coffee_js.py
@@ -88,7 +128,7 @@ web_targets.append("build/coffee_server")
pages = [x for x in os.listdir("./pages/")]
gen += """
build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
build gen_typst: phony build/pages.typ | """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
"""


@@ -45,7 +45,8 @@
Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
Compute units also have some fast local memory (tens of kilobytes),
main memory access queues, texture units, a scalar unit, and other features.
Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
The main memory (graphics memory) is typically outside of the GPU, and is slow (high-latency) but high-bandwidth memory.
]
@@ -67,6 +68,15 @@
=> waves are really similar to SIMD on modern CPUs
]
#section[
In modern GPUs, instruction execution within a wave is superscalar:
there are multiple execution units for different kinds of instructions,
and several instructions can execute at once when free execution units are
available and the instructions don't depend on each other.
We'll be exploring that in a future article.
]
#section[
== Local memory
The local memory inside GPUs is banked, typically into 32 banks.
@@ -144,6 +154,7 @@
- 48 vector registers of 16x32b per wave
- one scalar unit per CU
- 128 global memory ports
- 16 async task completion "signal" slots per wave
- no fancy out of order or superscalar execution
- support standard 32 bit floating point, without exceptions.
@@ -181,6 +192,7 @@
- `Sreg`: the first element of a vector register, as scalar
- `Sany`: a `Simm` or an `Sreg`
- `dist`: a `Vany`, or an `Sany` broadcast to each element
- `sig`: one of the 16 completion signal slots
]
#section[ #section[
@@ -210,19 +222,120 @@
]
#section[
=== Local memory
- load 32 bit value at each elem where mask is true:
`fn local_load32(out out: Vreg, in mask: M, in addr: Vreg)`
- store 32 bit value at each elem where mask is true:
`fn local_store32(in addr: Vreg, in mask: M, in val: Vany)`
A usage sketch follows below.
]
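#section[
To make the masked semantics concrete, here is a minimal sketch (the
register names `v0`, `v1` and mask `m0` are illustrative, not part of the
ISA above):
```
// store the immediate 42 at each lane's address where m0 is true;
// 42 is an `Sany`, broadcast to every masked element
local_store32(addr = v0, mask = m0, val = 42)
// load the values back into v1 from the same per-lane addresses;
// we assume lanes where m0 is false leave v1 unchanged
local_load32(out = v1, mask = m0, addr = v0)
```
]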
#section[
=== Global (async) memory
- start an async global load, and make the given signal correspond to the completion of the access;
load 32 bit value at each elem where mask is true:
`fn global_load32(out sig: sig, out out: Vreg, in mask: M, in addr: Vreg)`
- the store counterpart; see `global_load32` above and `local_store32`:
`fn global_store32(out sig: sig, in addr: Vreg, in mask: M, in val: Vany)`
- `fn sig_done1(out r: Sreg, in sig: sig)`
- `fn sig_done2(out r: Sreg, in sig1: sig, in sig2: sig)`
- `fn sig_wait(out r: Sreg, in sig: sig)`
- `fn sig_waitall2(out r: Sreg, in sig1: sig, in sig2: sig)`
- `fn sig_waitall3(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig)`
- `fn sig_waitall4(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig, in sig4: sig)`
As a future extension, we could add an instruction that waits for any of the
given signals to complete, and then jumps to a specific location depending on which one completed.
A usage sketch for async loads follows after this list.
]
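#section[
A minimal usage sketch for an async load (an assumed idiom, not mandated by
the spec; register names are illustrative): issue the load early, overlap
independent work, then block on the signal before consuming the result.
```
// start an async load into v2; completion is tracked by signal slot s0
global_load32(sig = s0, out = v2, mask = m0, addr = v1)
// ... independent instructions can execute while the load is in flight ...
// block the whole wave until the load has completed
sig_wait(r = v0, sig = s0)
// v2 now holds the loaded values and is safe to use
```
]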
#section[
=== Control flow (whole wave)
- branch if scalar is zero:
`fn brz(in dest: Simm, in val: Sany)`
- branch if scalar is not zero:
`fn brnz(in dest: Simm, in val: Sany)`
- branch on the whole wave if each element has a true value for the mask:
`fn br_all(in dest: Simm, in cond: M)`
- branch on the whole wave if any element has a true value for the mask:
`fn br_any(in dest: Simm, in cond: M)`
A loop sketch follows below.
]
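#section[
For illustration, a wave-level loop might look like the following sketch
(we assume a `Simm` branch destination can be written as a label; `loop:`
and `m0` are illustrative):
```
loop:
// ... loop body: each lane clears its bit in mask m0 once it is done ...
// keep looping on the whole wave while any lane still has work left
br_any(dest = loop, cond = m0)
```
]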
#section[
= Hand-compiling code
Now that we've decided on a simple compute-only GPU architecture,
we can try hand-compiling an OpenCL program.
I asked an LLM to produce an N*N matmul example (comments written manually):
```c
// a convenient number for our specific hardware
#define TILE_SIZE 8
// this kernel will be launched with dimensions:
// global[2] = { 128,128 } = { N, N };
// local[2] = { 8,8 } = { TILE_SIZE, TILE_SIZE };
__kernel void matmul_tiled(
    __global float* A,
    __global float* B,
    __global float* C,
    const int N)
{
    int row = get_global_id(1); // y
    int col = get_global_id(0); // x
    __local float Asub[TILE_SIZE][TILE_SIZE];
    __local float Bsub[TILE_SIZE][TILE_SIZE];
    float sum = 0.0f;
    for (int t = 0; t < N / TILE_SIZE; ++t) {
        // load tiles into local memory
        int tiledRow = row;
        int tiledCol = t * TILE_SIZE + get_local_id(0);
        if (tiledRow < N && tiledCol < N)
            Asub[get_local_id(1)][get_local_id(0)] = A[tiledRow * N + tiledCol];
        else
            Asub[get_local_id(1)][get_local_id(0)] = 0.0f;
        tiledRow = t * TILE_SIZE + get_local_id(1);
        tiledCol = col;
        if (tiledRow < N && tiledCol < N)
            Bsub[get_local_id(1)][get_local_id(0)] = B[tiledRow * N + tiledCol];
        else
            Bsub[get_local_id(1)][get_local_id(0)] = 0.0f;
        // sync local memory access across the local group
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < TILE_SIZE; ++k)
            sum += Asub[get_local_id(1)][k] * Bsub[k][get_local_id(0)];
        // sync local memory access across the local group
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (row < N && col < N)
        C[row * N + col] = sum;
}
```
]
#section[
First, we have to decide how we want to map the kernel to the hardware.
Since the local dimension of the kernel is 8*8, which is 64 work items,
we can map each local group to one CU, by mapping 32 kernel instances to one wave
and using both waves available on one CU for the local group.
Our global dimension is 128*128, which is 16384 work items, or 256 local groups,
so mapping every group at once would need 256 compute units.
But since we probably don't have 256 compute units,
GPUs, including ours, have an on-hardware task scheduler
that schedules local groups onto compute units as they become free.
See the mapping sketch below.
]
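#section[
To make that mapping concrete, here is one possible lane-to-work-item
assignment (an assumption; the article doesn't fix one). With 32 work items
per wave, each wave covers four rows of the 8x8 local group:
```
// per-lane index math for an 8x8 local group, 32 lanes per wave (sketch):
// local_id.x = lane % 8
// local_id.y = wave_in_group * 4 + lane / 8   (wave_in_group is 0 or 1)
// global_id.x = group_id.x * 8 + local_id.x
// global_id.y = group_id.y * 8 + local_id.y
// (128*128 global items) / (64 items per group) = 256 groups to schedule
```
]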
#section[
= Outro
Modern GPUs are really complex, but designing a simple GPU is not that hard.
Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
]
]

test_py_mods.py Normal file

@@ -0,0 +1,7 @@
import os
import sys
import requests
import json
from feedgen.feed import FeedGenerator
import subprocess
import fontTools