mirror of https://github.com/alex-s168/website.git (synced 2025-09-10 09:05:08 +02:00)

Merge remote-tracking branch 'origin/wip'
46 config.py
@@ -1,4 +1,25 @@
 import os
+import subprocess
 
+testcmd=subprocess.run(["python", "test_py_mods.py"], capture_output=True)
+print(testcmd.stderr.decode("utf-8").strip())
+assert testcmd.returncode == 0
+
+have_ffprobe=False
+try:
+    have_ffprobe = subprocess.run(["ffprobe", "--version"], capture_output=True)
+    have_ffprobe = have_ffprobe.returncode == 0
+except:pass
+if not have_ffprobe:
+    print("warn: ffprobe not installed")
+
+have_pngquant=False
+try:
+    have_pngquant = subprocess.run(["pngquant", "--version"], capture_output=True)
+    have_pngquant = have_pngquant.returncode == 0
+except:pass
+if not have_pngquant:
+    print("warn: pngquant not installed")
+
 web_targets = []
 
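
The two availability probes above share one pattern: try to run the tool, and treat a missing binary as "not installed". As a standalone sketch of that pattern (the helper name `tool_available` is mine, not something in config.py):

```python
import subprocess

def tool_available(argv):
    """Return True if running argv succeeds, False if it fails or isn't installed."""
    try:
        return subprocess.run(argv, capture_output=True).returncode == 0
    except FileNotFoundError:
        return False

if not tool_available(["ffprobe", "--version"]):
    print("warn: ffprobe not installed")
```
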
@@ -63,18 +84,37 @@ rule cargo_release_bin
 command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
 pool = console
 
+rule touch
+command = touch $out
+"""
+
+if have_ffprobe:
+    gen += """
 rule expect_img_size
 command = eval "[ $$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 $in) = $size ]" && touch $out
 
-rule touch
+rule ffmpeg_compress
+command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error
+"""
+else:
+    gen += """
+rule expect_img_size
 command = touch $out
 
 rule ffmpeg_compress
-command = ffmpeg -y -i $in -compression_level 100 $out -hide_banner -loglevel error
+command = cp $in $out
+"""
 
+if have_pngquant:
+    gen += """
 rule pngquant
 command = pngquant $in -o $out --force --quality $quality
 """
+else:
+    gen += """
+rule pngquant
+command = cp $in $out
+"""
 
 gen += """
 build build/deploy/coffee.js : python_capture gen_coffee_js.py
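
For orientation (an assumption on my part, since the tail of config.py is not shown in this diff): a string assembled this way is ninja syntax, so the natural endpoint is to write it out and hand it to ninja. A toy version of that round trip:

```python
import subprocess

# Toy stand-in for the `gen` string built up above (note: ninja wants the
# `command =` line of a rule indented).
gen = """
rule touch
  command = touch $out

build stamp : touch
"""

with open("toy.ninja", "w") as f:
    f.write(gen)

# requires the `ninja` binary on PATH
subprocess.run(["ninja", "-f", "toy.ninja"], check=True)
```
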
@@ -88,7 +128,7 @@ web_targets.append("build/coffee_server")
 pages = [x for x in os.listdir("./pages/")]
 
 gen += """
-build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ
+build build/pages.typ build/pages.json : python pages.gen.py | pages.in.typ """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
 
 build gen_typst: phony build/pages.typ | """+ " ".join(f"build/{x}.git_rev.txt.iso" for x in pages) +"""
 """
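
The interesting part of this hunk is the inline `" ".join(...)`, which turns every page directory into an extra implicit dependency of the pages build. With a hypothetical pages list, the expansion looks like this:

```python
pages = ["gpu1", "coffee"]  # hypothetical directory names under ./pages/
deps = " ".join(f"build/{x}.git_rev.txt.iso" for x in pages)
print(deps)  # build/gpu1.git_rev.txt.iso build/coffee.git_rev.txt.iso
```
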
@@ -45,7 +45,8 @@
 Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
 Compute units also have some fast local memory (tens of kilobytes),
-main memory access queues, texture units, a scalar unit, and other features. (see future article)
+main memory access queues, texture units, a scalar unit, and other features.
+Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
 
 The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory.
 ]
@@ -67,6 +68,15 @@
 => waves are really similar to SIMD on modern CPUs
 ]
 
+#section[
+In modern GPUs, instruction execution in waves is superscalar,
+so there are multiple different execution units for executing different kinds of instructions,
+and multiple instructions can be executed at once, if there are free execution units,
+and they don't depend on each other.
+
+We'll be exploring that in a future article.
+]
+
 #section[
 == Local memory
 The local memory inside GPUs is banked, typically into 32 banks.
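
To make "banked" concrete: addresses map to banks round-robin per word, and lanes that hit the same bank in the same cycle serialize. A small Python sketch, assuming 32 banks (as in the text) and 4-byte words:

```python
NUM_BANKS = 32   # from the article: "typically into 32 banks"
WORD_BYTES = 4   # assumed word size

def bank_of(addr):
    # consecutive 32-bit words land in consecutive banks
    return (addr // WORD_BYTES) % NUM_BANKS

def conflicts(addrs):
    banks = [bank_of(a) for a in addrs]
    return len(banks) - len(set(banks))

lane_addrs = [i * WORD_BYTES for i in range(16)]                   # stride 1: all different banks
conflict_addrs = [i * WORD_BYTES * NUM_BANKS for i in range(16)]   # stride 32: all the same bank

print(conflicts(lane_addrs))      # 0  -> serviced in one go
print(conflicts(conflict_addrs))  # 15 -> accesses serialize
```
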
@@ -144,6 +154,7 @@
 - 48 vector registers of 16x32b per wave
 - one scalar unit per CU
 - 128 global memory ports
+- 16 async task completion "signal" slots per wave
 - no fancy out of order or superscalar execution
 - support standard 32 bit floating point, without exceptions.
 
@@ -181,6 +192,7 @@
 - `Sreg`: the first element of a vector register, as scalar
 - `Sany`: a `Simm` or an `Sreg`
 - `dist`: `Vany`, or a `Sany` broadcasted to each element
+- `sig`: one of the 16 completion signal slots
 ]
 
 #section[
@@ -210,19 +222,120 @@
 ]
 
 #section[
-=== Memory
-- `fn local_load`
-TODO
+=== Local memory
+- load 32 bit value at each elem where mask is true:
+`fn local_load32(out out: Vreg, in mask: M, in addr: Vreg)`
+- store 32 bit value at each elem where mask is true:
+`fn local_store32(in addr: Vreg, in mask: M, in val: Vany)`
+]
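
A NumPy sketch of what the masked, per-lane semantics of `local_load32`/`local_store32` amount to (array sizes and names here are illustrative only, not part of the ISA):

```python
import numpy as np

LANES = 16
local_mem = np.arange(256, dtype=np.uint32)   # stand-in for local memory, addressed in 32-bit words

addr = np.arange(LANES, dtype=np.uint32) * 2          # per-lane word addresses
mask = np.array([i % 2 == 0 for i in range(LANES)])   # per-lane execution mask

# local_load32: lanes where mask is true load, the others keep their old register value
out = np.zeros(LANES, dtype=np.uint32)
out[mask] = local_mem[addr[mask]]

# local_store32: only lanes where mask is true write
val = np.full(LANES, 0xDEAD, dtype=np.uint32)
local_mem[addr[mask]] = val[mask]
```
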
+
+#section[
+=== Global (async) memory
+- start an async global load, and make the given signal correspond to the completion of the access:
+load 32 bit value at each elem where mask is true:
+`fn global_load32(out sig: sig, out out: Vreg, in mask: M, in addr: Vreg)`
+- like the above, but a store; see also `local_store32`:
+`fn global_store32(out sig: sig, in addr: Vreg, in mask: M, in val: Vany)`
+- `fn sig_done1(out r: Sreg, in sig: sig)`
+- `fn sig_done2(out r: Sreg, in sig1: sig, in sig2: sig)`
+- `fn sig_wait(out r: Sreg, in sig: sig)`
+- `fn sig_waitall2(out r: Sreg, in sig1: sig, in sig2: sig)`
+- `fn sig_waitall3(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig)`
+- `fn sig_waitall4(out r: Sreg, in sig1: sig, in sig2: sig, in sig3: sig, in sig4: sig)`
+
+As a future extension, we could add an instruction that waits for any of the
+given signals to complete, and then jumps to a specific location, depending on which of those completed.
 ]
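
The point of the signal slots is that a wave can keep executing while a global access is in flight, and only blocks once the data is actually needed. As a plain-Python analogy (not the ISA itself), the issue-then-wait pattern looks like this:

```python
from concurrent.futures import ThreadPoolExecutor

memory = list(range(1024))

def load(addr):
    # stands in for an asynchronous global_load32
    return memory[addr]

with ThreadPoolExecutor() as pool:
    sig_a = pool.submit(load, 16)      # issue the access; the future acts like a signal slot
    sig_b = pool.submit(load, 32)
    acc = sum(range(100))              # independent work overlaps with the outstanding loads
    a, b = sig_a.result(), sig_b.result()  # block on completion, like sig_waitall2

print(acc, a, b)
```
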
 
 #section[
 === Control flow (whole wave)
-TODO
+- branch if scalar is zero:
+`fn brz(in dest: Simm, in val: Sany)`
+- branch if scalar is not zero:
+`fn brnz(in dest: Simm, in val: Sany)`
+- branch on the whole wave if each element has a true value for the mask:
+`fn br_all(in dest: Simm, in cond: M)`
+- branch on the whole wave if any element has a true value for the mask:
+`fn br_any(in dest: Simm, in cond: M)`
 ]
 
 #section[
 = Hand-compiling code
-TODO
+Now that we decided on a simple compute-only GPU architecture,
+we can try hand-compiling an OpenCL program.
+
+I asked an LLM to produce an N*N matmul example (comments written manually):
+```c
+// convenient number for our specific hardware
+#define TILE_SIZE 8
+
+// this kernel will be launched with dimensions:
+// global[2] = { 128,128 } = { N, N };
+// local[2] = { 8,8 } = { TILE_SIZE, TILE_SIZE };
+__kernel void matmul_tiled(
+    __global float* A,
+    __global float* B,
+    __global float* C,
+    const int N)
+{
+    int row = get_global_id(1); // y
+    int col = get_global_id(0); // x
+
+    __local float Asub[TILE_SIZE][TILE_SIZE];
+    __local float Bsub[TILE_SIZE][TILE_SIZE];
+
+    float sum = 0.0f;
+
+    for (int t = 0; t < N / TILE_SIZE; ++t) {
+        // load tiles into local
+        int tiledRow = row;
+        int tiledCol = t * TILE_SIZE + get_local_id(0);
+        if (tiledRow < N && tiledCol < N)
+            Asub[get_local_id(1)][get_local_id(0)] = A[tiledRow * N + tiledCol];
+        else
+            Asub[get_local_id(1)][get_local_id(0)] = 0.0f;
+
+        tiledRow = t * TILE_SIZE + get_local_id(1);
+        tiledCol = col;
+        if (tiledRow < N && tiledCol < N)
+            Bsub[get_local_id(1)][get_local_id(0)] = B[tiledRow * N + tiledCol];
+        else
+            Bsub[get_local_id(1)][get_local_id(0)] = 0.0f;
+
+        // sync local access across local grp
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for (int k = 0; k < TILE_SIZE; ++k)
+            sum += Asub[get_local_id(1)][k] * Bsub[k][get_local_id(0)];
+
+        // sync local access across local grp
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (row < N && col < N)
+        C[row * N + col] = sum;
+}
+```
+]
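
As a sanity check on the tiling (not something the article does), the same computation can be mirrored in NumPy: each pass of the outer loop consumes one TILE_SIZE-wide slice, which is exactly what a work-group stages into local memory.

```python
import numpy as np

N, TILE = 128, 8
A = np.random.rand(N, N).astype(np.float32)
B = np.random.rand(N, N).astype(np.float32)

C = np.zeros((N, N), dtype=np.float32)
for t in range(N // TILE):                 # same role as the kernel's t loop
    Asub = A[:, t*TILE:(t+1)*TILE]         # the A tile a group would stage in local memory
    Bsub = B[t*TILE:(t+1)*TILE, :]
    C += Asub @ Bsub                       # per work-item this is the k loop: sum += Asub[ly][k] * Bsub[k][lx]

assert np.allclose(C, A @ B, rtol=1e-3)
```
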
+
+#section[
+First, we have to decide on how we want to map the kernel to the hardware.
+
+Since the local dimension of the kernel is 8*8, which is 64,
+we can map each local group to one CU, by mapping 32 kernel invocations to one wave,
+and using both waves available on one CU for the local group.
+
+Our global dimension is 128*128 (256 local groups of 64), which means that we would need 256 compute units.
+But since we probably don't have 256 compute units,
+GPUs, including ours, have an on-hardware task scheduler
+for scheduling tasks onto compute units.
+]
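
A quick back-of-the-envelope for that mapping in Python; the CU count at the end is a made-up example, since the article doesn't fix one:

```python
global_dim = (128, 128)
local_dim = (8, 8)
lanes_per_wave = 32   # "mapping 32 kernel invocations to one wave"
waves_per_cu = 2

groups = (global_dim[0] // local_dim[0]) * (global_dim[1] // local_dim[1])   # 256
items_per_group = local_dim[0] * local_dim[1]                                # 64
waves_per_group = items_per_group // lanes_per_wave                          # 2 -> fills one CU

num_cus = 8                       # hypothetical; whatever the chip actually has
rounds = -(-groups // num_cus)    # ceil division: the scheduler runs the groups in batches
print(groups, waves_per_group, rounds)
```
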
+
+#section[
+= Outro
+Modern GPUs are really complex, but designing a simple GPU is not that hard.
+
+Subscribe to the #flink("atom.xml")[Atom feed] to get notified of future articles.
 ]
 
 ]
7 test_py_mods.py (new file)
@@ -0,0 +1,7 @@
+import os
+import sys
+import requests
+import json
+from feedgen.feed import FeedGenerator
+import subprocess
+import fontTools