even better

This commit is contained in:
2025-07-25 21:40:21 +02:00
parent b1220399b1
commit c14d78b2b0
22 changed files with 359 additions and 32 deletions

2
coffee/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
target
Cargo.lock

11
coffee/Cargo.toml Normal file
View File

@@ -0,0 +1,11 @@
[package]
name = "coffee"
version = "0.1.0"
edition = "2024"
[dependencies]
# HTTP server framework; src/main.rs builds a `Router` and serves it through `axum::Server`.
axum = "0.6"
# HTTP client used to download the country page that gets scraped.
reqwest = "0.12.22"
# HTML parsing and CSS selectors for pulling the price out of the downloaded page.
scraper = "0.23.1"
# `derive` is needed for `#[derive(Serialize)]` on the JSON response type.
serde = { version = "1.0.219", features = ["derive"] }
# Async runtime behind `#[tokio::main]`.
tokio = { version = "1.46.1", features = ["full"] }

1
coffee/README Normal file
View File

@@ -0,0 +1 @@
CHATGPT GENERATED BECAUSE LAZY

43
coffee/gen_js.py Normal file
View File

@@ -0,0 +1,43 @@
"""Generate the browser-side coffee price lookup script and print it to stdout.

The script downloads a country listing, flattens it into a
``timezone|country|timezone|country|...`` table and embeds that table into a
JavaScript snippet.  The snippet maps the visitor's timezone to a country name
and asks the local price server (``/price/<country>``) for the coffee price.
"""
import json

import requests

COUNTRIES_URL = "https://gist.githubusercontent.com/erdem/8c7d26765831d0f9a8c62f02782ae00d/raw/248037cd701af0a4957cce340dabb0fd04e38f4c/countries.json"

# Placeholder in JS_TEMPLATE that is replaced by the timezone table literal.
TABLE_PLACEHOLDER = "__TZ_TABLE__"

JS_TEMPLATE = """function userCountry() {
const tz = Intl.DateTimeFormat().resolvedOptions().timeZone;
if(tz==null) return null;
const c=__TZ_TABLE__.split("|");
for(let i=0;i<c.length;i+=2){
if(c[i]===tz){
return c[i+1];
}}
return null;
}
async function byCountry(country) {
const url = `http://127.0.0.1:3000/price/${encodeURIComponent(country)}`;
try {
const response = await fetch(url);
if(!response.ok){throw new Error(`HTTP error ${response.status}`);}
const data = await response.json();
return data.price;
} catch (error) {
console.error("Failed to fetch price:", error);
return null;
}}
const c = userCountry();
if(c!=null){
byCountry(c).then(price => console.log("coffe price: " + price));
}
"""

# A timeout keeps the build from hanging forever if the host is unreachable.
response = requests.get(COUNTRIES_URL, timeout=30)
response.raise_for_status()
countries = json.loads(response.text)

# Even indices hold a timezone, the following odd index holds its country.
# Every entry is terminated by "|", so the JS side sees one trailing empty
# element after split("|"); it never matches a real timezone.
table = "".join(
    f"{timezone}|{country['name']}|"
    for country in countries
    for timezone in country["timezones"]
)

# json.dumps yields a quoted, fully escaped string literal that is also valid
# JavaScript, so quotes, backslashes or non-ASCII characters in country names
# cannot break the generated script.
print(JS_TEMPLATE.replace(TABLE_PLACEHOLDER, json.dumps(table)))

67
coffee/src/main.rs Normal file
View File

@@ -0,0 +1,67 @@
use axum::{
extract::Path,
response::Json,
routing::get,
Router,
};
use reqwest::Client;
use scraper::{Html, Selector};
use serde::Serialize;
use std::net::SocketAddr;
/// JSON body returned by the `/price/:country` route.
#[derive(Serialize)]
struct PriceResponse {
    /// Price parsed from the scraped page text after trimming whitespace and a leading `$`.
    price: f64,
}
async fn by_country(Path(country): Path<String>) -> Json<PriceResponse> {
let url = format!("https://coffeestics.com/countries/{}", country);
// Fetch the page
let response = Client::new()
.get(&url)
.send()
.await
.expect("Failed to fetch page")
.text()
.await
.expect("Failed to get text");
// Parse HTML
let document = Html::parse_document(&response);
// Create selector that matches the element you want
let selector = Selector::parse("body > div:nth-of-type(1) > div:nth-of-type(1) > section:nth-of-type(3) > div > div > div:nth-of-type(1) > div:nth-of-type(3) > a > div:nth-of-type(2)")
.unwrap();
// Extract text and parse as float
let price_str = document
.select(&selector)
.next()
.expect("Element not found")
.text()
.collect::<String>();
let price: f64 = price_str.trim().trim_start_matches('$')
.parse()
.expect("Failed to parse price");
Json(PriceResponse { price })
}
/// Starts the price server on 127.0.0.1:3000 with the single `/price/:country` route.
///
/// Panics if the server stops with an error.
#[tokio::main]
async fn main() {
    // Loopback only: the server is not reachable from other hosts.
    let listen_on = SocketAddr::from((std::net::Ipv4Addr::LOCALHOST, 3000));
    let routes = Router::new().route("/price/:country", get(by_country));

    println!("Listening on {}", listen_on);

    let server = axum::Server::bind(&listen_on).serve(routes.into_make_service());
    server.await.unwrap();
}

View File

@@ -1,5 +1,7 @@
import os
web_targets = []
gen = """
build always: phony
@@ -26,6 +28,7 @@ rule git_inp
rule badges_list
command = typst query $in "<meta-people>" --root . --input query=true --field value --one | jq -r . | jq -r 'to_entries[] | [.key,.value.badge] | @tsv' > $out
build build/badges.txt: badges_list common.typ
rule curl
command = curl $url > $out
@@ -38,28 +41,33 @@ rule cpdir
rule runclean
command = rm -rf build && ninja -t clean
build clean : runclean
rule ttf2woff
command = fonttools ttLib.woff2 compress $in -o $out
rule python_capture
command = python $in > $out
build build/badges.txt: badges_list common.typ
rule minhtml
command = minhtml --minify-js --minify-css $in -o $out
build build.ninja: regen | config.py build/badges.txt res/fonts
build build.ninja: regen | config.py build/badges.txt res pages
build clean : runclean
build build/deploy/coffee.js : python_capture coffee/gen_js.py
rule cargo_release_bin
command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
pool = console
build build/coffee_server : cargo_release_bin coffee
file = coffee
"""
pages = [
"article-make-regex-engine-1.typ",
"project-etc-nand.typ",
"index.typ",
"compiler-pattern-matching.typ",
"article-favicon.typ",
"article-gpu-arch-1.typ",
]
web_targets.append("build/coffee_server")
fonts = [x for x in os.listdir("./res/fonts/")]
pages = [x for x in os.listdir("./pages/")]
fonts = [x for x in os.listdir("./fonts/")]
variants = [
{
@@ -80,18 +88,20 @@ variants = [
},
]
web_targets = []
for page in pages:
gr = "build/" + page + ".git_rev.txt"
gen += "\n"
gen += "build "+gr+" : git_inp pages/" + page + " | build/git_rev.txt"
for var in variants:
tg = "build/" + page + var["suffix"]
web_targets.append(tg)
gen += "\n"
gen += "build "+tg+" : typst " + "pages/" + page + " | "+gr+"\n"
gen += " flags = " + var["args"] + " $$(cat "+gr+")\n"
if tg.endswith(".html"):
gen += "\n"
deploy_tg = f"build/deploy/{page}"+var["suffix"]
web_targets.append(deploy_tg)
gen += f"build {deploy_tg} : minhtml {tg}\n"
if os.path.isfile("build/badges.txt"):
badges = None
@@ -104,38 +114,40 @@ if os.path.isfile("build/badges.txt"):
badge = badge.split("\t")
user = badge[0]
url = badge[1]
tg = "build/res/badges/" + user
tg = "build/deploy/res/badges/" + user
web_targets.append(tg)
gen += "\n"
gen += "build "+tg+": "
if user == "alex":
gen += "cp res/badge.png | build/res/_.txt\n"
gen += "cp res/badge.png\n"
else:
gen += "curl | build/res/_.txt\n"
gen += "curl\n"
gen += " url = "+url+"\n"
for font in fonts:
font = font.replace(".ttf", "")
tg = f"build/res/{font}.woff2"
tg = f"build/deploy/res/{font}.woff2"
web_targets.append(tg)
gen += "\n"
gen += f"build {tg} : ttf2woff res/fonts/{font}.ttf | build/res/_.txt\n"
gen += f"build {tg} : ttf2woff fonts/{font}.ttf\n"
gen += "\n"
gen += "build build/index.html : cp build/index.typ.desktop.html\n"
web_targets.append("build/index.html")
gen += "build build/deploy/index.html : cp build/deploy/index.typ.desktop.html\n"
web_targets.append("build/deploy/index.html")
gen += """
build build/res/_.txt : cpdir res | res/_.txt
outdir = build
"""
web_targets.append("build/res/_.txt")
for root, dirnames, filenames in os.walk("res"):
for file in filenames:
file = os.path.join(root,file)
tg = f"build/deploy/{file}" # file includes "res/"!
gen += "\n"
gen += f"build {tg} : cp {file}"
web_targets.append(tg)
gen += """
build web: phony """+ " ".join(web_targets) +"""
rule pub_cmd
command = rsync -avz build/* root@195.26.251.204:/srv/http/alex
command = rsync -avz build/deploy/* root@195.26.251.204:/srv/http/alex
pool = console
build pub: pub_cmd web

View File

@@ -5,11 +5,11 @@
#simple-page(
gen-table-of-contents: true,
[GPU architecture: SIMD - Alexander Nutz]
[Designing a GPU architecture: Waves]
)[
#section[
#title[GPU Architecture: Compute Cores]
#title[Designing a GPU Architecture: Waves]
#sized-p(small-font-size)[
#rev-and-authors((people.alex,))
@@ -20,9 +20,200 @@
#section[
= Introduction
GPUs consists of multiple (commonly 64) compute units.
In this article, we'll be looking into the hardware of GPUs, and then designing our own.
Specifically GPUs with unified shader architecture.
]
#section[
== Comparison with CPUs
GPUs focus on operating on a lot of data at once (triangles, vertices, pixels, ...),
while CPUs focus on high performance on a single core, and low compute delay.
]
#section[
= GPU Architecture
GPUs consist of multiple (these days at least 32) compute units (= CU).
Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
Compute units also have some fast local memory (tens of kilobytes),
main memory access queues, texture units, a scalar unit, and other features. (see future article)
The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory.
]
#section[
== Waves
A wave is a SIMD processing unit consisting of typically 32 "lanes" (sometimes called threads).
Each wave in a CU has its own control flow, which doesn't have to be related to that of the other waves.
Instructions that waves support:
- arithmetic operations
- cross-lane data movement
- CU local and global memory access: each SIMD lane can access a completely different address. similar to CPU gather / scatter.
- synchronization with other CUs in the work group (see future article)
Since only the whole wave can do control flow, and not each lane, all operations can be masked so that they only apply to specific lanes.
=> waves are really similar to SIMD on modern CPUs
]
#section[
== Local memory
The local memory inside GPUs is banked, typically into 32 banks.
The memory word size is typically 32 bits.
The addresses are interleaved, so for two banks:
- addr 0 => bank 0
- addr 1 => bank 1
- addr 2 => bank 0
- addr 3 => bank 1
- ...
Each bank has a dedicated access port, so for 32 banks, you get 32 access ports.
The lanes of the waves inside a CU get routed to the local memory banks magically.
]
#section[
=== Why are the banks interleaved?
When the whole wave wants to read a contiguous array of `f32`, so when each lane performs `some_f32_array[lane_id()]`,
all 32 banks can be used at the same time.
]
#section[
=== Why multiple waves share the same local memory
A wave doesn't do memory accesses every instruction, but also does computations.
This means that there are cycles where the memory isn't doing anything.
By making multiple waves share the same local memory and access ports, you save resources.
]
#section[
== Global memory
Since global memory reads/writes are really slow, they happen asynchronously.
This means that a wave requests an access, then can continue executing, and then eventually waits for that access to finish.
Because of this, modern compilers automagically start the access before the data is needed, and then wait for the data later on.
]
#section[
== Scalar unit
Most newer GPUs also have a scalar unit for saving energy when performing simple operations.
When the controller sees a scalar instruction in the code running on a wave, it automatically makes the code run on the scalar unit.
The scalar unit can be used for:
- address calculation
- partial reductions
- execution of expensive operations not implemented on SIMD because of costs
]
#section[
= GPU Programming Terminology
- "work item": typically maps to a SIMD lane
- "kernel": the code for a work item
- "work group": consists of multiple work items. typically maps to a CU. the `__local` memory in OpenCL applies to this.
- "compute task": a set of work groups
]
#section[
OpenCL and other APIs let you specify both the number of work groups and work items.
Since a program might specify a higher number of work items per work group than we have available,
the compiler needs to be able to put multiple work items onto one SIMD lane.
]
#section[
= Our own architecture
We'll go with these specs for now:
- N compute units
- 2 waves per CU
- 32 lanes per wave.
- 1KiB local memory per lane => 64 KiB
- 48 vector registers of 16x32b per wave
- one scalar unit per CU
- 128 global memory ports
- no fancy out of order or superscalar execution
- support standard 32 bit floating point, without exceptions.
Note that we won't specify the exact instruction encoding.
]
#section[
== Predefined Constants
We will pre-define 16 constants (as virtual vector registers):
- `zero`
- `one`
- `sid`: 0,1,2,3,4,5,6
- `wave`: the ID of the wave in the compute task, broadcasted to all elements.
- `u8_max`: 255,255,...
- `n2nd`: 1,2,1,2,...
- `n3rd`: 1,2,4,1,...
- `n4th`: 1,2,4,8,1,...
- `lo16`: 1,1,1,... (x16) 0,0,0,... (x16)
- `ch2`: 1,1,0,0,1,1,...
- `ch4`: 1,1,1,1,0,0,0,0,1,...
- `alo8`: 1 (x8) 0 (x8) 1 (x8) 0 (x8)
- a few reserved ones
]
#section[
== Operands
We define the following instruction operands:
- `Vreg`: vector register
- `M`: (read only) vector gp reg as mask (1b).
only first 32 registers can be used as mask.
the operand consists of two masks and-ed together, each of which can conditionally be inverted first.
this means that this operand takes up 12 bits
- `Vany`: `Vreg` or `M`
- `Simm`: immediate scalar value
- `Sreg`: the first element of a vector register, as scalar
- `Sany`: a `Simm` or an `Sreg`
- `dist`: `Vany`, or a `Sany` broadcasted to each element
]
#section[
== Instructions
We will add more instructions in future articles.
]
#section[
=== Data Movement
- `fn mov(out out: Vreg, in wrmask: M, in val: dist)`
- `fn select(out out: Vreg, in select: M, in false: dist, in true: dist)`
- `fn first_where_true(out out: Sreg, in where: M, in values: dist)`:
if none of the elements are true, it doesn't overwrite the previous value in out.
- cross-lane operations: not important for this article
]
#section[
=== Mathematics
- simple (unmasked) `u32`, `i32`, and `f32` elementwise arithmetic and logic operations:
`fn add<u32>(out out: Vreg, in left: Vany, in right: dist)`
- scalar arithmetic and logic operations:
`fn add<u32>(out out: Sreg, in left: Sany, in right: Sany)`
- partial reduction operations:
"chunks" the input with a size of 8, reduces each chunk, and stores it in the first element of the chunk.
this means that every 8th element will contain a partial result.
- and operations to finish that reduction into the first element of the vector
]
#section[
=== Memory
- `fn local_load`
TODO
]
#section[
=== Control flow (whole wave)
TODO
]
#section[
= Hand-compiling code
TODO
]
]

View File