mirror of https://github.com/commaai/tinygrad.git
cloud device [pr] (#6964)
* first try at cloud device [pr]
* real separation
* we're free
* clang works
* unhappy with timeout
* better timeouts and free
* unrelated
* use http verbs + add test
* lines + better test
* fix DELETE
* shorter cloud
* split key
* fix sending renderer
* PTXRenderer serialization
* add sessions
* http.client
* minor timeout bump
* fix keep-alive
* inc server timeout
* real fix timeout
* that one too
This commit is contained in:
parent 23c09f4b4c
commit f50d0e0ee0
@@ -229,6 +229,9 @@ jobs:
     - if: ${{ matrix.task == 'onnx' }}
       name: Test ONNX (CLANG)
       run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Run CLOUD=1 Test
+      run: CLOUDDEV=CLANG CLOUD=1 python3 test/test_ops.py TestOps.test_tiny_add
     - if: ${{ matrix.task == 'onnx' }}
       name: Test Action Space
       run: PYTHONPATH="." GPU=1 python3 extra/optimization/get_action_space.py
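The new CI step above is the quickest way to exercise the device end to end. A minimal local sketch of the same path (hedged: it assumes tinygrad is installed and the CLANG backend works on this machine; the env vars must be set before tinygrad is imported so CLOUD becomes the default device and the frontend auto-spawns a local cloud_server):

# hedged sketch, not part of the diff: run a tiny add through the CLOUD device locally
import os
os.environ["CLOUD"] = "1"         # make CLOUD the default device
os.environ["CLOUDDEV"] = "CLANG"  # backend the spawned cloud_server should drive
from tinygrad import Tensor

print((Tensor([1.0]) + Tensor([2.0])).item())  # compile, alloc and launch all cross the HTTP boundary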
@@ -85,6 +85,11 @@ class TestPickle(unittest.TestCase):
     sched_pk = pickle.loads(pk)
     assert_equiv_uops(sched_pk[-1].ast, sched[-1].ast)
 
+  def test_pickle_renderer(self):
+    from tinygrad.device import Device
+    pk = pickle.dumps(Device.default.renderer)
+    pickle.loads(pk)
+
 class TestPickleJIT(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
@@ -180,7 +180,7 @@ class CompileError(Exception): pass
 
 class Compiler:
   def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
-  def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
+  def compile(self, src:str) -> bytes: return src.encode()  # NOTE: empty compiler is the default
   def compile_cached(self, src:str) -> bytes:
     if self.cachekey is None or (lib := diskcache_get(self.cachekey, src)) is None:
       assert not getenv("ASSERT_COMPILE"), f"tried to compile with ASSERT_COMPILE set\n{src}"
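This pass-through default is what lets the frontend ship raw source across the HTTP boundary: CloudDevice (in the new file below) is constructed with a bare Compiler(), so compile() just returns the encoded source and the real compilation happens server-side with the remote device's compiler. A small sketch of that behavior (hedged: the kernel string is made up for illustration):

from tinygrad.device import Compiler

src = "void add(float* a) { }"  # hypothetical kernel source
lib = Compiler().compile(src)   # no cachekey, no real compiler: pass-through
assert lib == src.encode()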
@@ -89,4 +89,5 @@ class Renderer:
   extra_matcher: Any = None
   code_for_op: Dict[Op, Callable] = {}
 
+  def __reduce__(self): return self.__class__, ()
   def render(self, name:str, uops:List[UOp]) -> str: raise NotImplementedError("needs a renderer")
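__reduce__ is the standard pickle hook returning (callable, args); the base Renderer reduces to its class with no args, and the PTX/CUDA renderers below add their constructor args. The cloud handshake reuses the same hook without pickle: the server JSON-encodes (module, class name, args) and the client re-imports the class and calls it. A standalone sketch of that round-trip (hedged: ExampleRenderer and the importlib call stand in for a real Renderer subclass and tinygrad's fromimport helper):

import json, importlib

class ExampleRenderer:  # hypothetical stand-in for a Renderer subclass
  def __init__(self, arch="sm_80", device="CUDA"): self.arch, self.device = arch, device
  def __reduce__(self): return self.__class__, (self.arch, self.device)

cls, args = ExampleRenderer().__reduce__()
wire = json.dumps((cls.__module__, cls.__name__, args))       # what GET /renderer would return
mod, name, args = json.loads(wire)
rebuilt = getattr(importlib.import_module(mod), name)(*args)  # client-side reconstruction
assert (rebuilt.arch, rebuilt.device) == ("sm_80", "CUDA")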
@@ -65,7 +65,9 @@ class PTXRenderer(Renderer):
   tensor_cores = [tc for tc in CUDARenderer.tensor_cores if tc.dtype_in == dtypes.half]
   code_for_op = asm_for_op
   extra_matcher = ptx_matcher
-  def __init__(self, arch:str, device="CUDA"): self.device, self.tensor_cores = device, PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else []
+  def __init__(self, arch:str, device="CUDA"):
+    self.device, self.tensor_cores, self.arch = device, PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+  def __reduce__(self): return self.__class__, (self.arch, self.device)
 
   # language options
   kernel_prefix = """.version VERSION
@@ -313,7 +313,8 @@ class CUDARenderer(CStyleLanguage):
     st1_pattern=(((1,1),(1,0),(0,2),(0,3),(0,4)),((1,3),(1,5),(1,2),(0,0),(0,1),(1,4))),
     st2_pattern=(((1,1),(1,0),(1,4),(0,0),(0,1)),((0,4),(0,2),(1,5),(0,3),(1,3),(1,2))), reduce_axes=[(0,8),(1,2)],
     upcast_axes=([(0,8)],[(2,2),(3,2)],[(3,2),(2,2)])) for di, do in ([(dtypes.half,dtypes.float),(dtypes.bfloat16,dtypes.float)])]
-  def __init__(self, arch:str): self.tensor_cores = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else []
+  def __init__(self, arch:str): self.tensor_cores, self.arch = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else [], arch
+  def __reduce__(self): return self.__class__, (self.arch,)
 
   # language options
   kernel_prefix = "extern \"C\" __global__ "
@@ -0,0 +1,160 @@
+# the CLOUD=1 device is a process boundary between the frontend/runtime
+# normally tinygrad is    frontend <-> middleware <-> runtime <-> hardware
+# with CLOUD tinygrad is  frontend <-> middleware <-> CloudDevice ///HTTP/// cloud_server <-> runtime <-> hardware
+# this client and server can be on the same machine, same network, or just same internet
+# it should be a secure (example: no use of pickle) boundary. HTTP is used for RPC
+
+from __future__ import annotations
+from typing import Tuple, Optional, Dict, Any, DefaultDict
+from collections import defaultdict
+import multiprocessing, functools, http.client, hashlib, json, time, contextlib, os, binascii
+from dataclasses import dataclass, field
+from tinygrad.helpers import getenv, DEBUG, fromimport, unwrap
+from tinygrad.device import Compiled, Allocator, Compiler, Device
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+# ***** backend *****
+
+@dataclass
+class CloudSession:
+  programs: Dict[Tuple[str, str], Any] = field(default_factory=dict)
+  buffers: Dict[int, Tuple[Any, int]] = field(default_factory=dict)
+  buffer_num = 0
+
+class CloudHandler(BaseHTTPRequestHandler):
+  protocol_version = 'HTTP/1.1'
+  dname: str
+  sessions: DefaultDict[str, CloudSession] = defaultdict(CloudSession)
+
+  def setup(self):
+    super().setup()
+    print(f"connection established with {self.client_address}, socket: {self.connection.fileno()}")
+
+  def get_data(self):
+    content_len = self.headers.get('Content-Length')
+    assert content_len is not None
+    return self.rfile.read(int(content_len))
+  def get_json(self): return json.loads(self.get_data())
+
+  def _fail(self):
+    self.send_response(404)
+    self.end_headers()
+    return 0
+
+  def _do(self, method):
+    session = CloudHandler.sessions[unwrap(self.headers.get("Cookie")).split("session=")[1]]
+    ret = b""
+    if self.path == "/renderer" and method == "GET":
+      cls, args = Device[CloudHandler.dname].renderer.__reduce__()
+      ret = json.dumps((cls.__module__, cls.__name__, args)).encode()
+    elif self.path.startswith("/alloc") and method == "POST":
+      size = int(self.path.split("=")[-1])
+      session.buffer_num += 1
+      session.buffers[session.buffer_num] = (Device[CloudHandler.dname].allocator.alloc(size), size)
+      ret = str(session.buffer_num).encode()
+    elif self.path.startswith("/buffer"):
+      key = int(self.path.split("/")[-1])
+      buf,sz = session.buffers[key]
+      if method == "GET": Device[CloudHandler.dname].allocator.copyout(memoryview(ret:=bytearray(sz)), buf)
+      elif method == "PUT": Device[CloudHandler.dname].allocator.copyin(buf, memoryview(bytearray(self.get_data())))
+      elif method == "DELETE":
+        Device[CloudHandler.dname].allocator.free(buf,sz)
+        del session.buffers[key]
+      else: return self._fail()
+    elif self.path.startswith("/program"):
+      name, hsh = self.path.split("/")[-2:]
+      if method == "PUT":
+        src = self.get_data()
+        assert hashlib.sha256(src).hexdigest() == hsh
+        lib = Device[CloudHandler.dname].compiler.compile_cached(src.decode())
+        session.programs[(name, hsh)] = Device[CloudHandler.dname].runtime(name, lib)
+      elif method == "POST":
+        j = self.get_json()
+        bufs = [session.buffers[x][0] for x in j['bufs']]
+        del j['bufs']
+        r = session.programs[(name, hsh)](*bufs, **j)
+        if r is not None: ret = str(r).encode()
+      elif method == "DELETE": del session.programs[(name, hsh)]
+      else: return self._fail()
+    else: return self._fail()
+    self.send_response(200)
+    self.send_header('Content-Length', str(len(ret)))
+    self.end_headers()
+    return self.wfile.write(ret)
+
+  def do_GET(self): return self._do("GET")
+  def do_POST(self): return self._do("POST")
+  def do_PUT(self): return self._do("PUT")
+  def do_DELETE(self): return self._do("DELETE")
+
+def cloud_server(port:int):
+  multiprocessing.current_process().name = "MainProcess"
+  CloudHandler.dname = getenv("CLOUDDEV", "METAL") if Device.DEFAULT == "CLOUD" else Device.DEFAULT
+  print(f"start cloud server on {port} with device {CloudHandler.dname}")
+  server = HTTPServer(('', port), CloudHandler)
+  server.serve_forever()
+
+# ***** frontend *****
+
+class CloudAllocator(Allocator):
+  def __init__(self, device:CloudDevice):
+    self.device = device
+    super().__init__()
+  def _alloc(self, size:int, options) -> int: return int(self.device.send("POST", f"alloc?size={size}"))
+  def _free(self, opaque, options):
+    with contextlib.suppress(ConnectionRefusedError, http.client.CannotSendRequest, http.client.RemoteDisconnected):
+      self.device.send("DELETE", f"buffer/{opaque}", data=b"")
+  def copyin(self, dest:int, src:memoryview): self.device.send("PUT", f"buffer/{dest}", data=bytes(src))
+  def copyout(self, dest:memoryview, src:int):
+    resp = self.device.send("GET", f"buffer/{src}")
+    assert len(resp) == len(dest), f"buffer length mismatch {len(resp)} != {len(dest)}"
+    dest[:] = resp
+
+class CloudProgram:
+  def __init__(self, device:CloudDevice, name:str, lib:bytes):
+    self.device = device
+    self.prgid = f"{name}/{hashlib.sha256(lib).hexdigest()}"
+    self.device.send("PUT", "program/"+self.prgid, lib)
+    super().__init__()
+  def __del__(self): self.device.send("DELETE", "program/"+self.prgid)
+
+  def __call__(self, *bufs, global_size=None, local_size=None, vals:Tuple[int, ...]=(), wait=False):
+    args = {"bufs": bufs, "vals": vals, "wait": wait}
+    if global_size is not None: args["global_size"] = global_size
+    if local_size is not None: args["local_size"] = local_size
+    ret = self.device.send("POST", "program/"+self.prgid, json.dumps(args).encode())
+    if wait: return float(ret)
+
+class CloudDevice(Compiled):
+  def __init__(self, device:str):
+    if (host:=getenv("HOST", "")) != "":
+      self.host = host
+    else:
+      p = multiprocessing.Process(target=cloud_server, args=(6667,))
+      p.daemon = True
+      p.start()
+      self.host = "127.0.0.1:6667"
+    self.cookie = binascii.hexlify(os.urandom(0x10)).decode()
+    if DEBUG >= 1: print(f"cloud with host {self.host}")
+    while 1:
+      try:
+        self.conn = http.client.HTTPConnection(self.host, timeout=60.0)
+        clouddev = json.loads(self.send("GET", "renderer").decode())
+        break
+      except Exception as e:
+        print(e)
+        time.sleep(0.1)
+    if DEBUG >= 1: print(f"remote has device {clouddev}")
+    # TODO: how do we have BEAM be cached on the backend? this should just send a specification of the compute. rethink what goes in Renderer
+    assert clouddev[0].startswith("tinygrad.renderer."), f"bad renderer {clouddev}"
+    renderer = fromimport(clouddev[0], clouddev[1])(*clouddev[2])
+    super().__init__(device, CloudAllocator(self), renderer, Compiler(), functools.partial(CloudProgram, self))
+
+  def send(self, method, path, data:Optional[bytes]=None) -> bytes:
+    # TODO: retry logic
+    self.conn.request(method, "/"+path, data, headers={"Cookie": f"session={self.cookie}"})
+    response = self.conn.getresponse()
+    assert response.status == 200, f"failed on {method} {path}"
+    return response.read()
+
+if __name__ == "__main__": cloud_server(getenv("PORT", 6667))
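For a real client/server split the same file can be run standalone: the __main__ block above starts cloud_server on PORT (default 6667), and a client points CloudDevice at it with HOST instead of spawning a local server. A hedged client-side sketch (the address is made up for illustration; as before, env vars must be set before tinygrad is imported):

import os
os.environ["CLOUD"] = "1"
os.environ["HOST"] = "192.0.2.10:6667"  # hypothetical machine already running cloud_server
from tinygrad import Tensor

print((Tensor([1.0, 2.0]) + Tensor([3.0, 4.0])).tolist())  # allocs, copies and program launches all go over HTTP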