mirror of
https://github.com/alex-s168/website.git
synced 2025-09-09 17:05:07 +02:00
even better
This commit is contained in:
2
coffee/.gitignore
vendored
Normal file
2
coffee/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
target
|
||||
Cargo.lock
|
11
coffee/Cargo.toml
Normal file
11
coffee/Cargo.toml
Normal file
@@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "coffee"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
axum = "0.6"
|
||||
reqwest = "0.12.22"
|
||||
scraper = "0.23.1"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
tokio = { version = "1.46.1", features = ["full"] }
|
1
coffee/README
Normal file
1
coffee/README
Normal file
@@ -0,0 +1 @@
|
||||
CHATGPT GENERATED BECAUSE LAZY
|
43
coffee/gen_js.py
Normal file
43
coffee/gen_js.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Generate the client-side JavaScript that looks up the coffee price.

The script downloads a list of countries with their time zones, flattens it
into a "timezone|country|timezone|country|..." table and prints a JavaScript
snippet to stdout. The snippet maps the browser's time zone to a country name
and asks the local price server for that country's coffee price.
"""
import json

import requests

COUNTRIES_URL = "https://gist.githubusercontent.com/erdem/8c7d26765831d0f9a8c62f02782ae00d/raw/248037cd701af0a4957cce340dabb0fd04e38f4c/countries.json"

# A timeout keeps the build from hanging forever if the host is unreachable.
response = requests.get(COUNTRIES_URL, timeout=30)
response.raise_for_status()
countries = json.loads(response.text)

# Flat table: even indices are time zones, odd indices are country names.
# The trailing "|" is kept so the emitted table has the same layout as before.
out = "".join(
    f"{tz}|{item['name']}|"
    for item in countries
    for tz in item["timezones"]
)

JS_TEMPLATE = """function userCountry() {
  const tz = Intl.DateTimeFormat().resolvedOptions().timeZone;
  if(tz==null) return null;
  const c=__TABLE__.split("|");
  for(let i=0;i<c.length;i+=2){
    if(c[i]===tz){
      return c[i+1];
    }
  }
  return null;
}

async function byCountry(country) {
  const url = `http://127.0.0.1:3000/price/${encodeURIComponent(country)}`;

  try {
    const response = await fetch(url);
    if(!response.ok){throw new Error(`HTTP error ${response.status}`);}
    const data = await response.json();
    return data.price;
  } catch (error) {
    console.error("Failed to fetch price:", error);
    return null;
  }
}

const c = userCountry();
if(c!=null){
  byCountry(c).then(price => console.log("coffe price: " + price));
}
"""

# json.dumps yields a properly escaped, double-quoted string literal, so quotes
# or backslashes in a country name cannot break the generated script.
print(JS_TEMPLATE.replace("__TABLE__", json.dumps(out)))
|
67
coffee/src/main.rs
Normal file
67
coffee/src/main.rs
Normal file
@@ -0,0 +1,67 @@
|
||||
use axum::{
|
||||
extract::Path,
|
||||
response::Json,
|
||||
routing::get,
|
||||
Router,
|
||||
};
|
||||
use reqwest::Client;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::Serialize;
|
||||
use std::net::SocketAddr;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct PriceResponse {
|
||||
price: f64,
|
||||
}
|
||||
|
||||
async fn by_country(Path(country): Path<String>) -> Json<PriceResponse> {
|
||||
let url = format!("https://coffeestics.com/countries/{}", country);
|
||||
|
||||
// Fetch the page
|
||||
let response = Client::new()
|
||||
.get(&url)
|
||||
.send()
|
||||
.await
|
||||
.expect("Failed to fetch page")
|
||||
.text()
|
||||
.await
|
||||
.expect("Failed to get text");
|
||||
|
||||
// Parse HTML
|
||||
let document = Html::parse_document(&response);
|
||||
|
||||
// Create selector that matches the element you want
|
||||
let selector = Selector::parse("body > div:nth-of-type(1) > div:nth-of-type(1) > section:nth-of-type(3) > div > div > div:nth-of-type(1) > div:nth-of-type(3) > a > div:nth-of-type(2)")
|
||||
.unwrap();
|
||||
|
||||
// Extract text and parse as float
|
||||
let price_str = document
|
||||
.select(&selector)
|
||||
.next()
|
||||
.expect("Element not found")
|
||||
.text()
|
||||
.collect::<String>();
|
||||
|
||||
let price: f64 = price_str.trim().trim_start_matches('$')
|
||||
.parse()
|
||||
.expect("Failed to parse price");
|
||||
|
||||
Json(PriceResponse { price })
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
// Build our router
|
||||
let app = Router::new()
|
||||
.route("/price/:country", get(by_country));
|
||||
|
||||
// Run server
|
||||
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
|
||||
println!("Listening on {}", addr);
|
||||
|
||||
axum::Server::bind(&addr)
|
||||
.serve(app.into_make_service())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
68
config.py
68
config.py
@@ -1,5 +1,7 @@
|
||||
import os
|
||||
|
||||
web_targets = []
|
||||
|
||||
gen = """
|
||||
build always: phony
|
||||
|
||||
@@ -26,6 +28,7 @@ rule git_inp
|
||||
|
||||
rule badges_list
|
||||
command = typst query $in "<meta-people>" --root . --input query=true --field value --one | jq -r . | jq -r 'to_entries[] | [.key,.value.badge] | @tsv' > $out
|
||||
build build/badges.txt: badges_list common.typ
|
||||
|
||||
rule curl
|
||||
command = curl $url > $out
|
||||
@@ -38,28 +41,33 @@ rule cpdir
|
||||
|
||||
rule runclean
|
||||
command = rm -rf build && ninja -t clean
|
||||
build clean : runclean
|
||||
|
||||
rule ttf2woff
|
||||
command = fonttools ttLib.woff2 compress $in -o $out
|
||||
|
||||
rule python_capture
|
||||
command = python $in > $out
|
||||
|
||||
build build/badges.txt: badges_list common.typ
|
||||
rule minhtml
|
||||
command = minhtml --minify-js --minify-css $in -o $out
|
||||
|
||||
build build.ninja: regen | config.py build/badges.txt res/fonts
|
||||
build build.ninja: regen | config.py build/badges.txt res pages
|
||||
|
||||
build clean : runclean
|
||||
build build/deploy/coffee.js : python_capture coffee/gen_js.py
|
||||
|
||||
rule cargo_release_bin
|
||||
command = (cd $in && cargo build --release) && cp $in/target/release/$file $out
|
||||
pool = console
|
||||
|
||||
build build/coffee_server : cargo_release_bin coffee
|
||||
file = coffee
|
||||
"""
|
||||
|
||||
pages = [
|
||||
"article-make-regex-engine-1.typ",
|
||||
"project-etc-nand.typ",
|
||||
"index.typ",
|
||||
"compiler-pattern-matching.typ",
|
||||
"article-favicon.typ",
|
||||
"article-gpu-arch-1.typ",
|
||||
]
|
||||
web_targets.append("build/coffee_server")
|
||||
|
||||
fonts = [x for x in os.listdir("./res/fonts/")]
|
||||
pages = [x for x in os.listdir("./pages/")]
|
||||
fonts = [x for x in os.listdir("./fonts/")]
|
||||
|
||||
variants = [
|
||||
{
|
||||
@@ -80,18 +88,20 @@ variants = [
|
||||
},
|
||||
]
|
||||
|
||||
web_targets = []
|
||||
|
||||
for page in pages:
|
||||
gr = "build/" + page + ".git_rev.txt"
|
||||
gen += "\n"
|
||||
gen += "build "+gr+" : git_inp pages/" + page + " | build/git_rev.txt"
|
||||
for var in variants:
|
||||
tg = "build/" + page + var["suffix"]
|
||||
web_targets.append(tg)
|
||||
gen += "\n"
|
||||
gen += "build "+tg+" : typst " + "pages/" + page + " | "+gr+"\n"
|
||||
gen += " flags = " + var["args"] + " $$(cat "+gr+")\n"
|
||||
if tg.endswith(".html"):
|
||||
gen += "\n"
|
||||
deploy_tg = f"build/deploy/{page}"+var["suffix"]
|
||||
web_targets.append(deploy_tg)
|
||||
gen += f"build {deploy_tg} : minhtml {tg}\n"
|
||||
|
||||
if os.path.isfile("build/badges.txt"):
|
||||
badges = None
|
||||
@@ -104,38 +114,40 @@ if os.path.isfile("build/badges.txt"):
|
||||
badge = badge.split("\t")
|
||||
user = badge[0]
|
||||
url = badge[1]
|
||||
tg = "build/res/badges/" + user
|
||||
tg = "build/deploy/res/badges/" + user
|
||||
web_targets.append(tg)
|
||||
gen += "\n"
|
||||
gen += "build "+tg+": "
|
||||
if user == "alex":
|
||||
gen += "cp res/badge.png | build/res/_.txt\n"
|
||||
gen += "cp res/badge.png\n"
|
||||
else:
|
||||
gen += "curl | build/res/_.txt\n"
|
||||
gen += "curl\n"
|
||||
gen += " url = "+url+"\n"
|
||||
|
||||
for font in fonts:
|
||||
font = font.replace(".ttf", "")
|
||||
tg = f"build/res/{font}.woff2"
|
||||
tg = f"build/deploy/res/{font}.woff2"
|
||||
web_targets.append(tg)
|
||||
gen += "\n"
|
||||
gen += f"build {tg} : ttf2woff res/fonts/{font}.ttf | build/res/_.txt\n"
|
||||
gen += f"build {tg} : ttf2woff fonts/{font}.ttf\n"
|
||||
|
||||
gen += "\n"
|
||||
gen += "build build/index.html : cp build/index.typ.desktop.html\n"
|
||||
web_targets.append("build/index.html")
|
||||
gen += "build build/deploy/index.html : cp build/deploy/index.typ.desktop.html\n"
|
||||
web_targets.append("build/deploy/index.html")
|
||||
|
||||
gen += """
|
||||
build build/res/_.txt : cpdir res | res/_.txt
|
||||
outdir = build
|
||||
"""
|
||||
web_targets.append("build/res/_.txt")
|
||||
for root, dirnames, filenames in os.walk("res"):
|
||||
for file in filenames:
|
||||
file = os.path.join(root,file)
|
||||
tg = f"build/deploy/{file}" # file includes "res/"!
|
||||
gen += "\n"
|
||||
gen += f"build {tg} : cp {file}"
|
||||
web_targets.append(tg)
|
||||
|
||||
gen += """
|
||||
build web: phony """+ " ".join(web_targets) +"""
|
||||
|
||||
rule pub_cmd
|
||||
command = rsync -avz build/* root@195.26.251.204:/srv/http/alex
|
||||
command = rsync -avz build/deploy/* root@195.26.251.204:/srv/http/alex
|
||||
pool = console
|
||||
build pub: pub_cmd web
|
||||
|
||||
|
@@ -5,11 +5,11 @@
|
||||
|
||||
#simple-page(
|
||||
gen-table-of-contents: true,
|
||||
[GPU architecture: SIMD - Alexander Nutz]
|
||||
[Designing a GPU architecture: Waves]
|
||||
)[
|
||||
|
||||
#section[
|
||||
#title[GPU Architecture: Compute Cores]
|
||||
#title[Designing a GPU Architecture: Waves]
|
||||
|
||||
#sized-p(small-font-size)[
|
||||
#rev-and-authors((people.alex,))
|
||||
@@ -20,9 +20,200 @@
|
||||
|
||||
#section[
|
||||
= Introduction
|
||||
GPUs consists of multiple (commonly 64) compute units.
|
||||
|
||||
In this article, we'll be looking into the hardware of GPUs, and then designing our own.
|
||||
Specifically GPUs with unified shader architecture.
|
||||
]
|
||||
|
||||
#section[
|
||||
== Comparison with CPUs
|
||||
GPUs focus on operating on a lot of data at once (triangles, vertices, pixels, ...),
|
||||
while CPUs focus on high performance on a single core, and low compute delay.
|
||||
]
|
||||
|
||||
#section[
|
||||
= GPU Architecture
|
||||
GPUs consist of multiple (these days at least 32) compute units (= CU).
|
||||
|
||||
Each compute unit has multiple SIMD units, also called "wave", "wavefront" or "warp".
|
||||
Compute units also have some fast local memory (tens of kilobytes),
|
||||
main memory access queues, texture units, a scalar unit, and other features. (see future article)
|
||||
|
||||
The main memory (graphics memory) is typically outside of the GPU, and is slow, but high-bandwidth memory.
|
||||
]
|
||||
|
||||
#section[
|
||||
== Waves
|
||||
A wave is a SIMD processing unit consisting of typically 32 "lanes" (sometimes called threads).
|
||||
|
||||
Each wave in a CU has separate control flow, and the waves don't have to be related.
|
||||
|
||||
Instructions that waves support:
|
||||
- arithmetic operations
|
||||
- cross-lane data movement
|
||||
- CU local and global memory access: each SIMD lane can access a completely different address. similar to CPU gather / scatter.
|
||||
- synchronization with other CUs in the work group (see future article)
|
||||
|
||||
Since only the whole wave can do control flow, and not each lane, all operations can be masked so that they only apply to specific lanes.
|
||||
|
||||
=> waves are really similar to SIMD on modern CPUs
|
||||
]
|
||||
|
||||
#section[
|
||||
== Local memory
|
||||
The local memory inside GPUs is banked, typically into 32 banks.
|
||||
The memory word size is typically 32 bits.
|
||||
|
||||
The addresses are interleaved, so for two banks:
|
||||
- addr 0 => bank 0
|
||||
- addr 1 => bank 1
|
||||
- addr 2 => bank 0
|
||||
- addr 3 => bank 1
|
||||
- ...
|
||||
|
||||
Each bank has a dedicated access port, so for 32 banks, you get 32 access ports.
|
||||
|
||||
The lanes of the waves inside a CU get routed to the local memory banks magically.
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Why are the banks interleaved?
|
||||
When the whole wave wants to read a contiguous array of `f32`, so when each lane performs `some_f32_array[lane_id()]`,
|
||||
all 32 banks can be used at the same time.
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Why multiple waves share the same local memory
|
||||
A wave doesn't do memory accesses every instruction, but also does computations.
|
||||
This means that there are cycles where the memory isn't doing anything.
|
||||
|
||||
By making multiple waves share the same local memory and access ports, you save resources.
|
||||
]
|
||||
|
||||
#section[
|
||||
== Global memory
|
||||
Since global memory reads/writes are really slow, they happen asynchronously.
|
||||
|
||||
This means that a wave requests an access, then can continue executing, and then eventually waits for that access to finish.
|
||||
|
||||
Because of this, modern compilers automagically start the access before the data is needed, and then wait for the data later on.
|
||||
]
|
||||
|
||||
#section[
|
||||
== Scalar unit
|
||||
Most newer GPUs also have a scalar unit for saving energy when performing simple operations.
|
||||
|
||||
When the controller sees a scalar instruction in the code running on a wave, it automatically makes the code run on the scalar unit.
|
||||
|
||||
The scalar unit can be used for:
|
||||
- address calculation
|
||||
- partial reductions
|
||||
- execution of expensive operations not implemented on SIMD because of costs
|
||||
]
|
||||
|
||||
#section[
|
||||
= GPU Programming Terminology
|
||||
- "work item": typically maps to a SIMD lane
|
||||
- "kernel": the code for a work item
|
||||
- "work group": consists of multiple work items. typically maps to a CU. the `__local` memory in OpenCL applies to this.
|
||||
- "compute task": a set of work groups
|
||||
]
|
||||
|
||||
#section[
|
||||
OpenCL and other APIs let you specify both the number of work groups and work items.
|
||||
|
||||
Since a program might specify a higher number of work items per work group than we have available,
|
||||
the compiler needs to be able to put multiple work items onto one SIMD lane.
|
||||
]
|
||||
|
||||
#section[
|
||||
= Our own architecture
|
||||
We'll go with these specs for now:
|
||||
- N compute units
|
||||
- 2 waves per CU
|
||||
- 32 lanes per wave.
|
||||
- 1KiB local memory per lane => 64 KiB
|
||||
- 48 vector registers of 16x32b per wave
|
||||
- one scalar unit per CU
|
||||
- 128 global memory ports
|
||||
- no fancy out of order or superscalar execution
|
||||
- support standard 32 bit floating point, without exceptions.
|
||||
|
||||
Note that we won't specify the exact instruction encoding.
|
||||
]
|
||||
|
||||
#section[
|
||||
== Predefined Constants
|
||||
We will pre-define 16 constants (as virtual vector registers):
|
||||
- `zero`
|
||||
- `one`
|
||||
- `sid`: 0,1,2,3,4,5,6
|
||||
- `wave`: the ID of the wave in the compute task, broadcasted to all elements.
|
||||
- `u8_max`: 255,255,...
|
||||
- `n2nd`: 1,2,1,2,...
|
||||
- `n3rd`: 1,2,4,1,...
|
||||
- `n4th`: 1,2,4,8,1,...
|
||||
- `lo16`: 1,1,1,... (x16) 0,0,0,... (x16)
|
||||
- `ch2`: 1,1,0,0,1,1,...
|
||||
- `ch4`: 1,1,1,1,0,0,0,0,1,...
|
||||
- `alo8`: 1 (x8) 0 (x8) 1 (x8) 0 (x8)
|
||||
- a few reserved ones
|
||||
]
|
||||
|
||||
#section[
|
||||
== Operands
|
||||
We define the following instruction operands:
|
||||
- `Vreg`: vector register
|
||||
- `M`: (read only) vector gp reg as mask (1b).
|
||||
only first 32 registers can be used as mask.
|
||||
the operand consists of two masks and-ed together, each of which can conditionally be inverted first.
|
||||
this means that this operand takes up 12 bits
|
||||
- `Vany`: `Vreg` or `M`
|
||||
- `Simm`: immediate scalar value
|
||||
- `Sreg`: the first element of a vector register, as scalar
|
||||
- `Sany`: a `Simm` or an `Sreg`
|
||||
- `dist`: `Vany`, or a `Sany` broadcasted to each element
|
||||
]
|
||||
|
||||
#section[
|
||||
== Instructions
|
||||
We will add more instructions in future articles.
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Data Movement
|
||||
- `fn mov(out out: Vreg, in wrmask: M, in val: dist)`
|
||||
- `fn select(out out: Vreg, in select: M, in false: dist, in true: dist)`
|
||||
- `fn first_where_true(out out: Sreg, in where: M, in values: dist)`:
|
||||
if none of the elements are true, it doesn't overwrite the previous value in out.
|
||||
- cross-lane operations: not important for this article
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Mathematics
|
||||
- simple (unmasked) `u32`, `i32`, and `f32` elementwise arithmetic and logic operations:
|
||||
`fn add<u32>(out out: Vreg, in left: Vany, in right: dist)`
|
||||
- scalar arithmetic and logic operations:
|
||||
`fn add<u32>(out out: Sreg, in left: Sany, in right: Sany)`
|
||||
- partial reduction operations:
|
||||
"chunks" the input with a size of 8, reduces each chunk, and stores it in the first element of the chunk.
|
||||
this means that every 8th element will contain a partial result.
|
||||
- and operations to finish that reduction into the first element of the vector
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Memory
|
||||
- `fn local_load`
|
||||
TODO
|
||||
]
|
||||
|
||||
#section[
|
||||
=== Control flow (whole wave)
|
||||
TODO
|
||||
]
|
||||
|
||||
#section[
|
||||
= Hand-compiling code
|
||||
TODO
|
||||
]
|
||||
|
||||
]
|
||||
|
Reference in New Issue
Block a user