diff --git a/coffee/.gitignore b/coffee/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/coffee/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/coffee/Cargo.toml b/coffee/Cargo.toml new file mode 100644 index 0000000..9d5f00e --- /dev/null +++ b/coffee/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "coffee" +version = "0.1.0" +edition = "2024" + +[dependencies] +axum = "0.6" +reqwest = "0.12.22" +scraper = "0.23.1" +serde = { version = "1.0.219", features = ["derive"] } +tokio = { version = "1.46.1", features = ["full"] } diff --git a/coffee/README b/coffee/README new file mode 100644 index 0000000..7fad626 --- /dev/null +++ b/coffee/README @@ -0,0 +1 @@ +CHATGPT GENERATED BECAUSE LAZY diff --git a/coffee/gen_js.py b/coffee/gen_js.py new file mode 100644 index 0000000..3b57501 --- /dev/null +++ b/coffee/gen_js.py @@ -0,0 +1,43 @@ +import requests +import json + +response = requests.get("https://gist.githubusercontent.com/erdem/8c7d26765831d0f9a8c62f02782ae00d/raw/248037cd701af0a4957cce340dabb0fd04e38f4c/countries.json") +response.raise_for_status() +response = json.loads(response.text) + +out = "" +for item in response: + tz = item["timezones"] + name = item["name"] + for x in tz: + out += f"{x}|{name}|" + +print("""function userCountry() { +const tz = Intl.DateTimeFormat().resolvedOptions().timeZone; +if(tz==null) return null; +const c=\""""+out+"""\".split("|"); +for(let i=0;i console.log("coffe price: " + price)); +} +""") diff --git a/coffee/src/main.rs b/coffee/src/main.rs new file mode 100644 index 0000000..d161238 --- /dev/null +++ b/coffee/src/main.rs @@ -0,0 +1,67 @@ +use axum::{ + extract::Path, + response::Json, + routing::get, + Router, +}; +use reqwest::Client; +use scraper::{Html, Selector}; +use serde::Serialize; +use std::net::SocketAddr; + +#[derive(Serialize)] +struct PriceResponse { + price: f64, +} + +async fn by_country(Path(country): Path) -> Json { + let url = 
format!("https://coffeestics.com/countries/{}", country); + + // Fetch the page + let response = Client::new() + .get(&url) + .send() + .await + .expect("Failed to fetch page") + .text() + .await + .expect("Failed to get text"); + + // Parse HTML + let document = Html::parse_document(&response); + + // Create selector that matches the element you want + let selector = Selector::parse("body > div:nth-of-type(1) > div:nth-of-type(1) > section:nth-of-type(3) > div > div > div:nth-of-type(1) > div:nth-of-type(3) > a > div:nth-of-type(2)") + .unwrap(); + + // Extract text and parse as float + let price_str = document + .select(&selector) + .next() + .expect("Element not found") + .text() + .collect::(); + + let price: f64 = price_str.trim().trim_start_matches('$') + .parse() + .expect("Failed to parse price"); + + Json(PriceResponse { price }) +} + +#[tokio::main] +async fn main() { + // Build our router + let app = Router::new() + .route("/price/:country", get(by_country)); + + // Run server + let addr = SocketAddr::from(([127, 0, 0, 1], 3000)); + println!("Listening on {}", addr); + + axum::Server::bind(&addr) + .serve(app.into_make_service()) + .await + .unwrap(); +} + diff --git a/config.py b/config.py index 48dddaf..6d409fb 100644 --- a/config.py +++ b/config.py @@ -1,5 +1,7 @@ import os +web_targets = [] + gen = """ build always: phony @@ -26,6 +28,7 @@ rule git_inp rule badges_list command = typst query $in "" --root . --input query=true --field value --one | jq -r . 
| jq -r 'to_entries[] | [.key,.value.badge] | @tsv' > $out +build build/badges.txt: badges_list common.typ rule curl command = curl $url > $out @@ -38,28 +41,33 @@ rule cpdir rule runclean command = rm -rf build && ninja -t clean +build clean : runclean rule ttf2woff command = fonttools ttLib.woff2 compress $in -o $out +rule python_capture + command = python $in > $out -build build/badges.txt: badges_list common.typ +rule minhtml + command = minhtml --minify-js --minify-css $in -o $out -build build.ninja: regen | config.py build/badges.txt res/fonts +build build.ninja: regen | config.py build/badges.txt res pages -build clean : runclean +build build/deploy/coffee.js : python_capture coffee/gen_js.py + +rule cargo_release_bin + command = (cd $in && cargo build --release) && cp $in/target/release/$file $out + pool = console + +build build/coffee_server : cargo_release_bin coffee + file = coffee """ -pages = [ - "article-make-regex-engine-1.typ", - "project-etc-nand.typ", - "index.typ", - "compiler-pattern-matching.typ", - "article-favicon.typ", - "article-gpu-arch-1.typ", -] +web_targets.append("build/coffee_server") -fonts = [x for x in os.listdir("./res/fonts/")] +pages = [x for x in os.listdir("./pages/")] +fonts = [x for x in os.listdir("./fonts/")] variants = [ { @@ -80,18 +88,20 @@ variants = [ }, ] -web_targets = [] - for page in pages: gr = "build/" + page + ".git_rev.txt" gen += "\n" gen += "build "+gr+" : git_inp pages/" + page + " | build/git_rev.txt" for var in variants: tg = "build/" + page + var["suffix"] - web_targets.append(tg) gen += "\n" gen += "build "+tg+" : typst " + "pages/" + page + " | "+gr+"\n" gen += " flags = " + var["args"] + " $$(cat "+gr+")\n" + if tg.endswith(".html"): + gen += "\n" + deploy_tg = f"build/deploy/{page}"+var["suffix"] + web_targets.append(deploy_tg) + gen += f"build {deploy_tg} : minhtml {tg}\n" if os.path.isfile("build/badges.txt"): badges = None @@ -104,38 +114,40 @@ if os.path.isfile("build/badges.txt"): badge = 
badge.split("\t") user = badge[0] url = badge[1] - tg = "build/res/badges/" + user + tg = "build/deploy/res/badges/" + user web_targets.append(tg) gen += "\n" gen += "build "+tg+": " if user == "alex": - gen += "cp res/badge.png | build/res/_.txt\n" + gen += "cp res/badge.png\n" else: - gen += "curl | build/res/_.txt\n" + gen += "curl\n" gen += " url = "+url+"\n" for font in fonts: font = font.replace(".ttf", "") - tg = f"build/res/{font}.woff2" + tg = f"build/deploy/res/{font}.woff2" web_targets.append(tg) gen += "\n" - gen += f"build {tg} : ttf2woff res/fonts/{font}.ttf | build/res/_.txt\n" + gen += f"build {tg} : ttf2woff fonts/{font}.ttf\n" gen += "\n" -gen += "build build/index.html : cp build/index.typ.desktop.html\n" -web_targets.append("build/index.html") +gen += "build build/deploy/index.html : cp build/deploy/index.typ.desktop.html\n" +web_targets.append("build/deploy/index.html") -gen += """ -build build/res/_.txt : cpdir res | res/_.txt - outdir = build -""" -web_targets.append("build/res/_.txt") +for root, dirnames, filenames in os.walk("res"): + for file in filenames: + file = os.path.join(root,file) + tg = f"build/deploy/{file}" # file includes "res/"! 
+ gen += "\n" + gen += f"build {tg} : cp {file}" + web_targets.append(tg) gen += """ build web: phony """+ " ".join(web_targets) +""" rule pub_cmd - command = rsync -avz build/* root@195.26.251.204:/srv/http/alex + command = rsync -avz build/deploy/* root@195.26.251.204:/srv/http/alex pool = console build pub: pub_cmd web diff --git a/res/fonts/DejaVuMathTeXGyre.ttf b/fonts/DejaVuMathTeXGyre.ttf similarity index 100% rename from res/fonts/DejaVuMathTeXGyre.ttf rename to fonts/DejaVuMathTeXGyre.ttf diff --git a/res/fonts/DejaVuSans-Bold.ttf b/fonts/DejaVuSans-Bold.ttf similarity index 100% rename from res/fonts/DejaVuSans-Bold.ttf rename to fonts/DejaVuSans-Bold.ttf diff --git a/res/fonts/DejaVuSans-BoldOblique.ttf b/fonts/DejaVuSans-BoldOblique.ttf similarity index 100% rename from res/fonts/DejaVuSans-BoldOblique.ttf rename to fonts/DejaVuSans-BoldOblique.ttf diff --git a/res/fonts/DejaVuSans-ExtraLight.ttf b/fonts/DejaVuSans-ExtraLight.ttf similarity index 100% rename from res/fonts/DejaVuSans-ExtraLight.ttf rename to fonts/DejaVuSans-ExtraLight.ttf diff --git a/res/fonts/DejaVuSans-Oblique.ttf b/fonts/DejaVuSans-Oblique.ttf similarity index 100% rename from res/fonts/DejaVuSans-Oblique.ttf rename to fonts/DejaVuSans-Oblique.ttf diff --git a/res/fonts/DejaVuSans.ttf b/fonts/DejaVuSans.ttf similarity index 100% rename from res/fonts/DejaVuSans.ttf rename to fonts/DejaVuSans.ttf diff --git a/res/fonts/DejaVuSansMono-Bold.ttf b/fonts/DejaVuSansMono-Bold.ttf similarity index 100% rename from res/fonts/DejaVuSansMono-Bold.ttf rename to fonts/DejaVuSansMono-Bold.ttf diff --git a/res/fonts/DejaVuSansMono-BoldOblique.ttf b/fonts/DejaVuSansMono-BoldOblique.ttf similarity index 100% rename from res/fonts/DejaVuSansMono-BoldOblique.ttf rename to fonts/DejaVuSansMono-BoldOblique.ttf diff --git a/res/fonts/DejaVuSansMono-Oblique.ttf b/fonts/DejaVuSansMono-Oblique.ttf similarity index 100% rename from res/fonts/DejaVuSansMono-Oblique.ttf rename to 
fonts/DejaVuSansMono-Oblique.ttf diff --git a/res/fonts/DejaVuSansMono.ttf b/fonts/DejaVuSansMono.ttf similarity index 100% rename from res/fonts/DejaVuSansMono.ttf rename to fonts/DejaVuSansMono.ttf diff --git a/res/fonts/DejaVuSerif-Bold.ttf b/fonts/DejaVuSerif-Bold.ttf similarity index 100% rename from res/fonts/DejaVuSerif-Bold.ttf rename to fonts/DejaVuSerif-Bold.ttf diff --git a/res/fonts/DejaVuSerif-BoldItalic.ttf b/fonts/DejaVuSerif-BoldItalic.ttf similarity index 100% rename from res/fonts/DejaVuSerif-BoldItalic.ttf rename to fonts/DejaVuSerif-BoldItalic.ttf diff --git a/res/fonts/DejaVuSerif-Italic.ttf b/fonts/DejaVuSerif-Italic.ttf similarity index 100% rename from res/fonts/DejaVuSerif-Italic.ttf rename to fonts/DejaVuSerif-Italic.ttf diff --git a/res/fonts/DejaVuSerif.ttf b/fonts/DejaVuSerif.ttf similarity index 100% rename from res/fonts/DejaVuSerif.ttf rename to fonts/DejaVuSerif.ttf diff --git a/pages/article-gpu-arch-1.typ b/pages/article-gpu-arch-1.typ index f5c61ff..32e4fee 100644 --- a/pages/article-gpu-arch-1.typ +++ b/pages/article-gpu-arch-1.typ @@ -5,11 +5,11 @@ #simple-page( gen-table-of-contents: true, - [GPU architecture: SIMD - Alexander Nutz] + [Designing a GPU architecture: Waves] )[ #section[ - #title[GPU Architecture: Compute Cores] + #title[Designing a GPU Architecture: Waves] #sized-p(small-font-size)[ #rev-and-authors((people.alex,)) @@ -20,9 +20,200 @@ #section[ = Introduction - GPUs consists of multiple (commonly 64) compute units. - + In this article, we'll be looking into the hardware of GPUs, and then designing our own. + Specifically GPUs with unified shader architecture. ] +#section[ + == Comparison with CPUs + GPUs focus on operating on a lot of data at once (triangles, vertices, pixels, ...), + while CPUs focus on high performance on a single core, and low compute delay. +] + +#section[ + = GPU Architecture + GPUs consist of multiple (these days at least 32) compute units (= CU). 
+ + Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp". + Compute units also have some fast local memory (tens of kilobytes), + main memory access queues, texture units, a scalar unit, and other features. (see future article) + + The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory. +] + +#section[ + == Waves + A wave is a SIMD processing unit consisting of typically 32 "lanes" (sometimes called threads). + + Each wave in a CU has separate control flow, and doesn't have to be related. + + Instructions that waves support: + - arithmetic operations + - cross-lane data movement + - CU local and global memory access: each SIMD lane can access a completely different address. similar to CPU gather / scatter. + - synchronization with other CUs in the work group (see future article) + + Since only the whole wave can do control flow, and not each lane, all operations can be masked so that they only apply to specific lanes. + + => waves are really similar to SIMD on modern CPUs +] + +#section[ + == Local memory + The local memory inside GPUs is banked, typically into 32 banks. + The memory word size is typically 32 bits. + + The addresses are interleaved, so for two banks: + - addr 0 => bank 0 + - addr 1 => bank 1 + - addr 2 => bank 0 + - addr 3 => bank 1 + - ... + + Each bank has a dedicated access port, so for 32 banks, you get 32 access ports. + + The lanes of the waves inside a CU get routed to the local memory banks magically. +] + +#section[ + === Why are the banks interleaved? + When the whole wave wants to read a contiguous array of `f32`, so when each lane performs `some_f32_array[lane_id()]`, + all 32 banks can be used at the same time. +] + +#section[ + === Why multiple waves share the same local memory + A wave doesn't do memory accesses every instruction, but also does computations. + This means that there are cycles where the memory isn't doing anything. 
+ + By making multiple waves share the same local memory and access ports, you save resources. +] + +#section[ + == Global memory + Since global memory reads/writes are really slow, they happen asynchronously. + + This means that a wave requests an access, then can continue executing, and then eventually waits for that access to finish. + + Because of this, modern compilers automagically start the access before the data is needed, and then wait for the data later on. +] + +#section[ + == Scalar unit + Most newer GPUs also have a scalar unit for saving energy when performing simple operations. + + When the controller sees a scalar instruction in the code running on a wave, it automatically makes the code run on the scalar unit. + + The scalar unit can be used for: + - address calculation + - partial reductions + - execution of expensive operations not implemented on SIMD because of costs +] + +#section[ + = GPU Programming Terminology + - "work item": typically maps to a SIMD lane + - "kernel": the code for a work item + - "work group": consists of multiple work items. typically maps to a CU. the `__local` memory in OpenCL applies to this. + - "compute task": a set of work groups +] + +#section[ + OpenCL and other APIs let you specify both the number of work groups and work items. + + Since a program might specify a higher number of work items per work group than we have available, + the compiler needs to be able to put multiple work items onto one SIMD lane. +] + +#section[ + = Our own architecture + We'll go with these specs for now: + - N compute units + - 2 waves per CU + - 32 lanes per wave. + - 1KiB local memory per lane => 64 KiB + - 48 vector registers of 16x32b per wave + - one scalar unit per CU + - 128 global memory ports + - no fancy out of order or superscalar execution + - support standard 32 bit floating point, without exceptions. + + Note that we won't specify the exact instruction encoding. 
+] + +#section[ + == Predefined Constants + We will pre-define 16 constants (as virtual vector registers): + - `zero` + - `one` + - `sid`: 0,1,2,3,4,5,6 + - `wave`: the ID of the wave in the compute task, broadcasted to all elements. + - `u8_max`: 255,255,... + - `n2nd`: 1,2,1,2,... + - `n3rd`: 1,2,4,1,... + - `n4th`: 1,2,4,8,1,... + - `lo16`: 1,1,1,... (x16) 0,0,0,... (x16) + - `ch2`: 1,1,0,0,1,1,... + - `ch4`: 1,1,1,1,0,0,0,0,1,... + - `alo8`: 1 (x8) 0 (x8) 1 (x8) 0 (x8) + - a few reserved ones +] + +#section[ + == Operands + We define the following instruction operands: + - `Vreg`: vector register + - `M`: (read only) vector gp reg as mask (1b). + only first 32 registers can be used as mask. + the operand consists of two masks and-ed together, each of which can conditionally be inverted first. + this means that this operand takes up 12 bits + - `Vany`: `Vreg` or `M` + - `Simm`: immediate scalar value + - `Sreg`: the first element of a vector register, as scalar + - `Sany`: a `Simm` or an `Sreg` + - `dist`: `Vany`, or a `Sany` broadcasted to each element +] + +#section[ + == Instructions + We will add more instructions in future articles. +] + +#section[ + === Data Movement + - `fn mov(out out: Vreg, in wrmask: M, in val: dist)` + - `fn select(out out: Vreg, in select: M, in false: dist, in true: dist)` + - `fn first_where_true(out out: Sreg, in where: M, in values: dist)`: + if none of the elements are true, it doesn't overwrite the previous value in out. + - cross-lane operations: not important for this article +] + +#section[ + === Mathematics + - simple (unmasked) `u32`, `i32`, and `f32` elementwise arithmetic and logic operations: + `fn add(out out: Vreg, in left: Vany, in right: dist)` + - scalar arithmetic and logic operations: + `fn add(out out: Sreg, in left: Sany, in right: Sany)` + - partial reduction operations: + "chunks" the input with a size of 8, reduces each chunk, and stores it in the first element of the chunk. 
+ this means that every 8th element will contain a partial result. + - and operations to finish that reduction into the first element of the vector +] + +#section[ + === Memory + - `fn local_load` + TODO +] + +#section[ + === Control flow (whole wave) + TODO +] + +#section[ + = Hand-compiling code + TODO +] ] diff --git a/res/_.txt b/res/_.txt deleted file mode 100644 index e69de29..0000000