even better

2025-09-09 17:05:07 +02:00 · 2025-07-25 21:40:21 +02:00
parent b1220399b1
commit c14d78b2b0
22 changed files with 359 additions and 32 deletions
--- a/coffee/.gitignore
+++ b/coffee/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
--- a/coffee/Cargo.toml
+++ b/coffee/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "coffee"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+axum = "0.6"
+reqwest = "0.12.22"
+scraper = "0.23.1"
+serde = { version = "1.0.219", features = ["derive"] }
+tokio = { version = "1.46.1", features = ["full"] }
--- a/coffee/README
+++ b/coffee/README
@@ -0,0 +1 @@
+CHATGPT GENERATED BECAUSE LAZY
--- a/coffee/gen_js.py
+++ b/coffee/gen_js.py
@@ -0,0 +1,43 @@
+import requests
+import json
+
+response = requests.get("https://gist.githubusercontent.com/erdem/8c7d26765831d0f9a8c62f02782ae00d/raw/248037cd701af0a4957cce340dabb0fd04e38f4c/countries.json")
+response.raise_for_status()
+response = json.loads(response.text)
+
+out = ""
+for item in response:
+    tz = item["timezones"]
+    name = item["name"]
+    for x in tz:
+        out += f"{x}|{name}|"
+
+print("""function userCountry() {
+const tz = Intl.DateTimeFormat().resolvedOptions().timeZone;
+if(tz==null) return null;
+const c=\""""+out+"""\".split("|");
+for(let i=0;i<c.length;i+=2){
+if(c[i]===timezone){
+return c[i+1];
+}}
+return null;
+}
+
+async function byCountry(country) {
+const url = `http://127.0.0.1:3000/price/${encodeURIComponent(country)}`;
+
+try {
+const response = await fetch(url);
+if(!response.ok){throw new Error(`HTTP error ${response.status}`);}
+const data = await response.json();
+return data.price;
+} catch (error) {
+console.error("Failed to fetch price:", error);
+return null;
+}}
+
+const c = userCountry();
+if(c!=null){
+byCountry(c).then(price => console.log("coffe price: " + price));
+}
+""")
--- a/coffee/src/main.rs
+++ b/coffee/src/main.rs
@@ -0,0 +1,67 @@
+use axum::{
+    extract::Path,
+    response::Json,
+    routing::get,
+    Router,
+};
+use reqwest::Client;
+use scraper::{Html, Selector};
+use serde::Serialize;
+use std::net::SocketAddr;
+
+/// JSON body returned by the `by_country` handler.
+#[derive(Serialize)]
+struct PriceResponse {
+    /// Price parsed from the scraped page text (a leading `$` is stripped before parsing).
+    price: f64,
+}
+
+/// Extracts the coffee price from the HTML of a coffeestics.com country page.
+///
+/// Returns `None` when the selector is rejected, the expected element is
+/// missing, or its text (after trimming and stripping a leading `$`) is not a
+/// valid number. Kept as a plain synchronous helper so HTML parsing is
+/// separate from the network I/O in the handler.
+fn parse_price(page: &str) -> Option<f64> {
+    let document = Html::parse_document(page);
+
+    // Positional selector for the price element on the country page.
+    let selector = Selector::parse("body > div:nth-of-type(1) > div:nth-of-type(1) > section:nth-of-type(3) > div > div > div:nth-of-type(1) > div:nth-of-type(3) > a > div:nth-of-type(2)")
+        .ok()?;
+
+    let price_text = document
+        .select(&selector)
+        .next()?
+        .text()
+        .collect::<String>();
+
+    price_text.trim().trim_start_matches('$').parse().ok()
+}
+
+/// `GET /price/:country` — scrapes the coffee price for `country`.
+///
+/// # Errors
+///
+/// Instead of panicking, failures are reported as HTTP status codes:
+/// - `404 Not Found` when the upstream site has no page for `country`;
+/// - `502 Bad Gateway` when the upstream request fails, answers with another
+///   non-success status, or the page does not contain a parsable price;
+/// - `500 Internal Server Error` when the upstream URL cannot be built.
+async fn by_country(
+    Path(country): Path<String>,
+) -> Result<Json<PriceResponse>, axum::http::StatusCode> {
+    use axum::http::StatusCode;
+
+    // `country` comes straight from the request path. Push it as a single
+    // percent-encoded path segment so it cannot introduce extra segments,
+    // a query string or a fragment into the upstream URL.
+    let mut url = reqwest::Url::parse("https://coffeestics.com/countries/")
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+    url.path_segments_mut()
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
+        .pop_if_empty()
+        .push(&country);
+
+    // Fetch the page
+    let upstream = Client::new()
+        .get(url)
+        .send()
+        .await
+        .map_err(|_| StatusCode::BAD_GATEWAY)?;
+
+    if upstream.status() == reqwest::StatusCode::NOT_FOUND {
+        return Err(StatusCode::NOT_FOUND);
+    }
+    if !upstream.status().is_success() {
+        return Err(StatusCode::BAD_GATEWAY);
+    }
+
+    let page = upstream
+        .text()
+        .await
+        .map_err(|_| StatusCode::BAD_GATEWAY)?;
+
+    // Parse HTML and extract the price; no `.await` happens after this point.
+    let price = parse_price(&page).ok_or(StatusCode::BAD_GATEWAY)?;
+
+    Ok(Json(PriceResponse { price }))
+}
+
+/// Entry point: serves the `/price/:country` route on 127.0.0.1:3000.
+#[tokio::main]
+async fn main() {
+    // Loopback address the server listens on.
+    let listen_on: SocketAddr = ([127, 0, 0, 1], 3000).into();
+
+    // The only route: GET /price/:country
+    let routes = Router::new().route("/price/:country", get(by_country));
+
+    println!("Listening on {}", listen_on);
+
+    // Serve until the server future resolves; a server error aborts the process.
+    let server = axum::Server::bind(&listen_on).serve(routes.into_make_service());
+    server.await.unwrap();
+}
+
--- a/config.py
+++ b/config.py
@@ -1,5 +1,7 @@
 import os

+web_targets = []
+
 gen = """
 build always: phony

@@ -26,6 +28,7 @@ rule git_inp

 rule badges_list
  command = typst query $in "<meta-people>" --root . --input query=true --field value --one | jq -r . | jq -r 'to_entries[] | [.key,.value.badge] | @tsv' > $out
+build build/badges.txt: badges_list common.typ

 rule curl
  command = curl $url > $out
@@ -38,28 +41,33 @@ rule cpdir

 rule runclean
  command = rm -rf build && ninja -t clean
+build clean : runclean

 rule ttf2woff
  command = fonttools ttLib.woff2 compress $in -o $out

+rule python_capture
+  command = python $in > $out

-build build/badges.txt: badges_list common.typ
+rule minhtml
+  command = minhtml --minify-js --minify-css $in -o $out

-build build.ninja: regen | config.py build/badges.txt res/fonts
+build build.ninja: regen | config.py build/badges.txt res pages

-build clean : runclean
+build build/deploy/coffee.js : python_capture coffee/gen_js.py
+
+rule cargo_release_bin
+  command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
+  pool = console
+
+build build/coffee_server : cargo_release_bin coffee
+  file = coffee
 """

-pages = [
-    "article-make-regex-engine-1.typ",
-    "project-etc-nand.typ",
-    "index.typ",
-    "compiler-pattern-matching.typ",
-    "article-favicon.typ",
-    "article-gpu-arch-1.typ",
-]
+web_targets.append("build/coffee_server")

-fonts = [x for x in os.listdir("./res/fonts/")]
+pages = [x for x in os.listdir("./pages/")]
+fonts = [x for x in os.listdir("./fonts/")]

 variants = [
    {
@@ -80,18 +88,20 @@ variants = [
    },
 ]

-web_targets = []
-
 for page in pages:
    gr = "build/" + page + ".git_rev.txt"
    gen += "\n"
    gen += "build "+gr+" : git_inp pages/" + page + " | build/git_rev.txt"
    for var in variants:
        tg = "build/" + page + var["suffix"]
-        web_targets.append(tg)
        gen += "\n"
        gen += "build "+tg+" : typst " + "pages/" + page + " | "+gr+"\n"
        gen += "  flags = " + var["args"] + " $$(cat "+gr+")\n"
+        if tg.endswith(".html"):
+            gen += "\n"
+            deploy_tg = f"build/deploy/{page}"+var["suffix"]
+            web_targets.append(deploy_tg)
+            gen += f"build {deploy_tg} : minhtml {tg}\n"

 if os.path.isfile("build/badges.txt"):
    badges = None
@@ -104,38 +114,40 @@ if os.path.isfile("build/badges.txt"):
        badge = badge.split("\t")
        user = badge[0]
        url = badge[1]
-        tg = "build/res/badges/" + user
+        tg = "build/deploy/res/badges/" + user
        web_targets.append(tg)
        gen += "\n"
        gen += "build "+tg+": "
        if user == "alex":
-            gen += "cp res/badge.png | build/res/_.txt\n"
+            gen += "cp res/badge.png\n"
        else:
-            gen += "curl | build/res/_.txt\n"
+            gen += "curl\n"
            gen += "  url = "+url+"\n"

 for font in fonts:
    font = font.replace(".ttf", "")
-    tg = f"build/res/{font}.woff2"
+    tg = f"build/deploy/res/{font}.woff2"
    web_targets.append(tg)
    gen += "\n"
-    gen += f"build {tg} : ttf2woff res/fonts/{font}.ttf | build/res/_.txt\n"
+    gen += f"build {tg} : ttf2woff fonts/{font}.ttf\n"

 gen += "\n"
-gen += "build build/index.html : cp build/index.typ.desktop.html\n"
-web_targets.append("build/index.html")
+gen += "build build/deploy/index.html : cp build/deploy/index.typ.desktop.html\n"
+web_targets.append("build/deploy/index.html")

-gen += """
-build build/res/_.txt : cpdir res | res/_.txt
-  outdir = build
-"""
-web_targets.append("build/res/_.txt")
+for root, dirnames, filenames in os.walk("res"):
+    for file in filenames:
+        file = os.path.join(root,file)
+        tg = f"build/deploy/{file}"  # file includes "res/"!
+        gen += "\n"
+        gen += f"build {tg} : cp {file}"
+        web_targets.append(tg)

 gen += """
 build web: phony """+ " ".join(web_targets) +"""

 rule pub_cmd
-  command = rsync -avz build/* root@195.26.251.204:/srv/http/alex
+  command = rsync -avz build/deploy/* root@195.26.251.204:/srv/http/alex
  pool = console
 build pub: pub_cmd web

--- a/res/fonts/DejaVuMathTeXGyre.ttf
+++ b/res/fonts/DejaVuMathTeXGyre.ttf
--- a/res/fonts/DejaVuSans-Bold.ttf
+++ b/res/fonts/DejaVuSans-Bold.ttf
--- a/res/fonts/DejaVuSans-BoldOblique.ttf
+++ b/res/fonts/DejaVuSans-BoldOblique.ttf
--- a/res/fonts/DejaVuSans-ExtraLight.ttf
+++ b/res/fonts/DejaVuSans-ExtraLight.ttf
--- a/res/fonts/DejaVuSans-Oblique.ttf
+++ b/res/fonts/DejaVuSans-Oblique.ttf
--- a/res/fonts/DejaVuSans.ttf
+++ b/res/fonts/DejaVuSans.ttf
--- a/res/fonts/DejaVuSansMono-Bold.ttf
+++ b/res/fonts/DejaVuSansMono-Bold.ttf
--- a/res/fonts/DejaVuSansMono-BoldOblique.ttf
+++ b/res/fonts/DejaVuSansMono-BoldOblique.ttf
--- a/res/fonts/DejaVuSansMono-Oblique.ttf
+++ b/res/fonts/DejaVuSansMono-Oblique.ttf
--- a/res/fonts/DejaVuSansMono.ttf
+++ b/res/fonts/DejaVuSansMono.ttf
--- a/res/fonts/DejaVuSerif-Bold.ttf
+++ b/res/fonts/DejaVuSerif-Bold.ttf
--- a/res/fonts/DejaVuSerif-BoldItalic.ttf
+++ b/res/fonts/DejaVuSerif-BoldItalic.ttf
--- a/res/fonts/DejaVuSerif-Italic.ttf
+++ b/res/fonts/DejaVuSerif-Italic.ttf
--- a/res/fonts/DejaVuSerif.ttf
+++ b/res/fonts/DejaVuSerif.ttf
--- a/pages/article-gpu-arch-1.typ
+++ b/pages/article-gpu-arch-1.typ
@@ -5,11 +5,11 @@

 #simple-page(
  gen-table-of-contents: true,
-  [GPU architecture: SIMD - Alexander Nutz]
+  [Designing a GPU architecture: Waves]
 )[

 #section[
-  #title[GPU Architecture: Compute Cores]
+  #title[Designing a GPU Architecture: Waves]

  #sized-p(small-font-size)[
    #rev-and-authors((people.alex,))
@@ -20,9 +20,200 @@

 #section[
  = Introduction
-  GPUs consists of multiple (commonly 64) compute units.
-
+  In this article, we'll be looking into the hardware of GPUs, and then designing our own.
+  Specifically GPUs with unified shader architecture.
 ]

+#section[
+  == Comparison with CPUs
+  GPUs focus on operating on a lot of data at once (triangles, vertices, pixels, ...),
+  while CPUs focus on high performance on a single core, and low compute delay.
+]
+
+#section[
+  = GPU Architecture
+  GPUs consist of multiple (these days at least 32) compute units (= CU).
+
+  Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
+  Compute units also have some fast local memory (tens of kilobytes),
+  main memory access queues, texture units, a scalar unit, and other features. (see future article)
+
+  The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory.
+]
+
+#section[
+  == Waves
+  A wave is a SIMD processing unit consisting of typically 32 "lanes" (sometimes called threads).
+
+  Each wave in a CU has separate control flow, and the waves don't have to be related.
+
+  Instructions that waves support:
+  - arithmetic operations
+  - cross-lane data movement
+  - CU local and global memory access: each SIMD lane can access a completely different address. similar to CPU gather / scatter.
+  - synchronization with other CUs in the work group (see future article)
+
+  Since only the whole wave can do control flow, and not each lane, all operations can be masked so that they only apply to specific lanes.
+
+  => waves are really similar to SIMD on modern CPUs
+]
+
+#section[
+  == Local memory
+  The local memory inside GPUs is banked, typically into 32 banks.
+  The memory word size is typically 32 bits.
+
+  The addresses are interleaved, so for two banks:
+  - addr 0 => bank 0
+  - addr 1 => bank 1
+  - addr 2 => bank 0
+  - addr 3 => bank 1
+  - ...
+
+  Each bank has a dedicated access port, so for 32 banks, you get 32 access ports.
+
+  The lanes of the waves inside a CU get routed to the local memory banks magically.
+]
+
+#section[
+  === Why are the banks interleaved?
+  When the whole wave wants to read a contiguous array of `f32`, so when each lane performs `some_f32_array[lane_id()]`,
+  all 32 banks can be used at the same time.
+]
+
+#section[
+  === Why multiple waves share the same local memory
+  A wave doesn't do memory accesses every instruction, but also does computations.
+  This means that there are cycles where the memory isn't doing anything.
+
+  By making multiple waves share the same local memory and access ports, you save resources.
+]
+
+#section[
+  == Global memory
+  Since global memory reads/writes are really slow, they happen asynchronously.
+
+  This means that a wave requests an access, then can continue executing, and then eventually waits for that access to finish.
+
+  Because of this, modern compilers automagically start the access before the data is needed, and then wait for the data later on.
+]
+
+#section[
+  == Scalar unit
+  Most newer GPUs also have a scalar unit for saving energy when performing simple operations.
+
+  When the controller sees a scalar instruction in the code running on a wave, it automatically makes the code run on the scalar unit.
+
+  The scalar unit can be used for:
+  - address calculation
+  - partial reductions
+  - execution of expensive operations not implemented on SIMD because of costs
+]
+
+#section[
+  = GPU Programming Terminology
+  - "work item": typically maps to a SIMD lane
+  - "kernel": the code for a work item
+  - "work group": consists of multiple work items. typically maps to a CU. the `__local` memory in OpenCL applies to this.
+  - "compute task": a set of work groups
+]
+
+#section[
+  OpenCL and other APIs let you specify both the number of work groups and work items.
+
+  Since a program might specify a higher number of work items per work group than we have available,
+  the compiler needs to be able to put multiple work items onto one SIMD lane.
+]
+
+#section[
+  = Our own architecture
+  We'll go with these specs for now:
+  - N compute units
+  - 2 waves per CU
+  - 32 lanes per wave.
+  - 1KiB local memory per lane => 64 KiB per CU
+  - 48 vector registers of 16x32b per wave
+  - one scalar unit per CU
+  - 128 global memory ports
+  - no fancy out of order or superscalar execution
+  - support standard 32 bit floating point, without exceptions.
+
+  Note that we won't specify the exact instruction encoding.
+]
+
+#section[
+  == Predefined Constants
+  We will pre-define 16 constants (as virtual vector registers):
+  - `zero`
+  - `one`
+  - `sid`: 0,1,2,3,4,5,6
+  - `wave`: the ID of the wave in the compute task, broadcasted to all elements.
+  - `u8_max`: 255,255,...
+  - `n2nd`: 1,2,1,2,...
+  - `n3rd`: 1,2,4,1,...
+  - `n4th`: 1,2,4,8,1,...
+  - `lo16`: 1,1,1,... (x16) 0,0,0,... (x16)
+  - `ch2`: 1,1,0,0,1,1,...
+  - `ch4`: 1,1,1,1,0,0,0,0,1,...
+  - `alo8`: 1 (x8)  0 (x8)  1 (x8)  0 (x8)
+  - a few reserved ones
+]
+
+#section[
+  == Operands
+  We define the following instruction operands:
+  - `Vreg`: vector register
+  - `M`:  (read only) vector gp reg as mask (1b).
+          only first 32 registers can be used as mask. 
+          the operand consists of two masks and-ed together, each of which can conditionally be inverted first.
+          this means that this operand takes up 12 bits
+  - `Vany`: `Vreg` or `M`
+  - `Simm`: immediate scalar value
+  - `Sreg`: the first element of a vector register, as scalar
+  - `Sany`: a `Simm` or an `Sreg`
+  - `dist`: `Vany`, or a `Sany` broadcasted to each element
+]
+
+#section[
+  == Instructions
+  We will add more instructions in future articles.
+]
+
+#section[
+  === Data Movement
+  - `fn mov(out out: Vreg, in wrmask: M, in val: dist)`
+  - `fn select(out out: Vreg, in select: M, in false: dist, in true: dist)`
+  - `fn first_where_true(out out: Sreg, in where: M, in values: dist)`:
+    if none of the elements are true, it doesn't overwrite the previous value in out.
+  - cross-lane operations: not important for this article
+]
+
+#section[
+  === Mathematics
+  - simple (unmasked) `u32`, `i32`, and `f32` elementwise arithmetic and logic operations:
+    `fn add<u32>(out out: Vreg, in left: Vany, in right: dist)`
+  - scalar arithmetic and logic operations:
+    `fn add<u32>(out out: Sreg, in left: Sany, in right: Sany)`
+  - partial reduction operations:
+    "chunks" the input with a size of 8, reduces each chunk, and stores it in the first element of the chunk.
+    this means that every 8th element will contain a partial result.
+  - and operations to finish that reduction into the first element of the vector
+]
+
+#section[
+  === Memory
+  - `fn local_load`
+  TODO
+]
+
+#section[
+  === Control flow (whole wave)
+  TODO
+]
+
+#section[
+  = Hand-compiling code
+  TODO
+]

 ]
--- a/res/_.txt
+++ b/res/_.txt