From 8c50ef8b7cead87fab2ffadc409dadb46f63dcb4 Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Thu, 29 Aug 2024 20:22:39 +0800 Subject: [PATCH] start uop docs (#6291) * start uop docs * only need show_labels * sink comes first * hotfix: invalid * touchups * 2 space indent works * limit some buffer uops * better BARRIER doc, Op -> UOp when it makes sense. * make KernelInfo optional * more work relative links don't work * this can be local in multi reduce+pads * add UOps.SHAPETRACKER details * UOps.CONST both types * nit: local buffer isn't device Buffer, habit * nit2: dtype -> DType --- docs/developer/uop.md | 11 +++ mkdocs.yml | 1 + tinygrad/ops.py | 167 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 168 insertions(+), 11 deletions(-) create mode 100644 docs/developer/uop.md diff --git a/docs/developer/uop.md b/docs/developer/uop.md new file mode 100644 index 00000000..d529aff9 --- /dev/null +++ b/docs/developer/uop.md @@ -0,0 +1,11 @@ +::: tinygrad.ops.UOp + options: + members: false + members_order: source + show_labels: false + +::: tinygrad.ops.UOps + options: + members: true + members_order: source + show_labels: false diff --git a/mkdocs.yml b/mkdocs.yml index 6512cd7b..871ca8a1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -23,6 +23,7 @@ nav: - Developer: - Intro: developer/developer.md - Function (autodiff): developer/function.md + - UOp: developer/uop.md - Runtime: - developer/runtime.md - HCQ: developer/hcq.md diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 7adcdaa3..3d3b00fd 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -40,20 +40,165 @@ def identity_element(op:BinaryOps, dt:DType): return dtypes.as_const({BinaryOps. # the order of these UOps controls the order of the toposort class UOps(Enum): - # ops that aren't rendered - SINK = auto(); EXT = auto(); EXPAND = auto(); CONTRACT = auto(); SHAPETRACKER = auto(); SWIZZLE = auto() # noqa: E702 - DEFINE_GLOBAL = auto(); DEFINE_VAR = auto(); DEFINE_LOCAL = auto(); DEFINE_ACC = auto() # noqa: E702 - CONST = auto(); SPECIAL = auto() # noqa: E702 - NOOP = auto(); GEP = auto() # noqa: E702 + # uops that aren't rendered + SINK = auto() + """ + Holds `UOps.STORE`. SINK defines the AST for a Kernel. + + - **`dtype`**: `None` + - **`src`**: `Tuple[UOp, ...]`, Only local STOREs are allowed. + - **`arg`**: `Optional[KernelInfo]` + + NOTE: `ScheduleItem` ASTs do not have the `KernelInfo` arg, `Kernel` inserts this to the SINK later. + """ + EXT = auto() + """ + Holds a single MetaOp. EXT UOps do not need a Kernel. + + - **`dtype`**: Output DType + - **`src`**: `Tuple[]` + - **`arg`**: (`MetaOps.CUSTOM | MetaOps.COPY | MetaOps.EMPTY | MetaOps.VIEW`, LazyBuffer arg) + """ + EXPAND = auto() + CONTRACT = auto() + SHAPETRACKER = auto() + """ + Defines the ShapeTracker for a buffer UOp `UOps.LOAD`, `UOps.STORE` or `UOps.CONST`. + + - **`dtype`**: `None` + - **`src`**: `Tuple[]` + - **`arg`**: `ShapeTracker` + """ + SWIZZLE = auto() + DEFINE_GLOBAL = auto() + DEFINE_VAR = auto() + DEFINE_LOCAL = auto() + DEFINE_ACC = auto() + CONST = auto() + """ + Defines a single scalar constant value. + + - **`dtype`**: The scalar DType of the value. + + - **`src`**: + The scheduler creates a CONST with a single SHAPETRACKER UOp src: `Tuple[UOp]`. + + The Lowerer replaces the SHAPETRACKER with an empty src. + It uses the ShapeTracker valid to create a `WHERE` UOp mask with sources: `(The actual CONST UOp, CONST 0, 0.0 or False)` + + - **`arg`**: The value. + """ + SPECIAL = auto() + NOOP = auto() + GEP = auto() # math ops - CAST = auto(); BITCAST = auto(); VECTORIZE = auto() # noqa: E702 - ALU = auto(); REDUCE = auto(); REDUCE_AXIS = auto(); WMMA = auto() # noqa: E702 + CAST = auto() + """ + - **`dtype`**: The casted scalar DType + - **`src`**: `Tuple[UOp]` + - **`arg`**: `None` + """ + BITCAST = auto() + """ + - **`dtype`**: The bitcasted scalar DType + - **`src`**: `Tuple[UOp]` + - **`arg`**: `None` + """ + VECTORIZE = auto() + """ + - **`dtype`**: The upcasted vector DType + - **`src`**: `Tuple[UOp, ...]` + - **`arg`**: `None` + + NOTE: Length of sources must match `dtype.count` + """ + ALU = auto() + """ + - **`dtype`**: Output DType + - **`src`**: `Tuple[UOp] | Tuple[UOp, UOp] | Tuple[UOp, UOp, UOp]` + - **`arg`**: `UnaryOps | BinaryOps | TernaryOps` + """ + REDUCE = auto() + REDUCE_AXIS = auto() + """ + - **`dtype`**: Output DType + - **`src`**: Input to reduce `Tuple[UOp]` + - **`arg`**: `(BinaryOps.ADD | BinaryOps.MUL | BinaryOps.MAX, Tuple[int, ...])` + """ + WMMA = auto() # memory/assignment ops - LOAD = auto(); STORE = auto(); PHI = auto() # noqa: E702 + LOAD = auto() + """ + - **`dtype`**: Output DType + - **`src`**: + + The scheduler and Kernel create LOADs with a SHAPETRACKER uop in src. + + - Normal LOAD: `Tuple[UOp, UOp]` + - Buffer UOp `UOps.DEFINE_GLOBAL`. + - SHAPETRACKER UOp. + + - Local LOAD: `Tuple[UOp, UOp, UOp]` + - Buffer UOp `UOps.DEFINE_LOCAL`. + - SHAPETRACKER UOp. + - Local UOps.STORE to the same local buffer. We will barrier this later. + + The Lowerer replaces the SHAPETRACKER with an indexing uop and gates the LOAD if needed. + + - Normal LOAD: `Tuple[UOp, UOp]` + - Buffer UOp `UOps.DEFINE_GLOBAL`. + - Indexing UOp, can only return `dtypes.int32`. + - Gated LOAD: `Tuple[UOp, UOp, UOp, UOp]` + - Buffer UOp `UOps.DEFINE_GLOBAL`. + - Indexing UOp, can only return `dtypes.int32`. + - Gate UOp, can only return `dtypes.bool`. + - Value if gate is `False`, can only be a `UOps.CONST` with arg 0, 0.0 or `False`. + - Barriered LOAD: `Tuple[UOp, UOp, UOp, UOp]` + - Buffer UOp `UOps.DEFINE_LOCAL`. + - Indexing UOp, can only return `dtypes.int32`. + - Gate UOp, can only return `dtypes.bool`. + - Barrier UOp `UOps.BARRIER`. + - **`arg`**: `None` + """ + STORE = auto() + """ + - **`dtype`**: `None` + - **`src`**: + + Similar to LOAD, the scheduler and Kernel create STOREs with a SHAPETRACKER uop in src: + + - Buffer UOp `UOps.DEFINE_GLOBAL` or `UOps.DEFINE_LOCAL`. + - SHAPETRACKER UOp. + - Value to store. + + The Lowerer replaces the SHAPETRACKER with an indexing uop and gates the STORE if needed. + + - Normal STORE: `Tuple[UOp, UOp, UOp]` + - Buffer UOp `UOps.DEFINE_GLOBAL` or `UOps.DEFINE_LOCAL`. + - Indexing Op, can only return `dtypes.int32`. + - Value to store. + - Gated STORE: `Tuple[UOp, UOp, UOp, UOp]` + - Buffer UOp `UOps.DEFINE_GLOBAL` or `UOps.DEFINE_LOCAL`. + - Indexing UOp, can only return `dtypes.int32`. + - Value to store. + - Gate UOp, can only return `dtypes.bool`. + - **`arg`**: `None` + """ + PHI = auto() # control flow ops - BARRIER = auto(); IF = auto(); RANGE = auto() # noqa: E702 - # these two are not graph nodes - ENDRANGE = auto(); ENDIF = auto() # noqa: E702 + BARRIER = auto() + """ + Inserts a warp sync between local stores and local loads. + + - **`dtype`**: `None` + - **`src`**: `Tuple[UOp, ...]`, Only local STOREs are allowed. + - **`arg`**: `None` + """ + IF = auto() + RANGE = auto() + # ops that are not graph nodes + ENDRANGE = auto() + ENDIF = auto() BUFFER_UOPS = {UOps.LOAD, UOps.STORE, UOps.CONST}