# tinygrad/test/test_optim.py


import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import Adam, SGD, AdamW
import pytest
pytestmark = pytest.mark.exclude_cuda
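
# fixed random initial values shared by the tinygrad and torch versions of the net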
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
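
# minimal network: x and W are the trainable parameters, m is a constant tensor used in the loss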
class TinyNet:
  def __init__(self, tensor):
    self.x = tensor(x_init.copy(), requires_grad=True)
    self.W = tensor(W_init.copy(), requires_grad=True)
    self.m = tensor(m_init.copy())

  def forward(self):
    out = self.x.matmul(self.W).relu()
    # print(out.detach().numpy())
    out = out.log_softmax(1)
    out = out.mul(self.m).add(self.m).sum()
    return out
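
# build a TinyNet from the given tensor class, run `steps` optimizer updates,
# and return the updated x and W as numpy arrays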
def step(tensor, optim, steps=1, kwargs={}):
  net = TinyNet(tensor)
  optim = optim([net.x, net.W], **kwargs)
  for _ in range(steps):
    out = net.forward()
    optim.zero_grad()
    out.backward()
    optim.step()
  return net.x.detach().numpy(), net.W.detach().numpy()
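
# each test runs the same training loop with a tinygrad optimizer and the matching
# torch optimizer, then checks that the updated parameters agree within atol/rtol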
class TestOptim(unittest.TestCase):
  def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):
    for x,y in zip(step(Tensor, tinygrad_optim, steps, kwargs=opts),
                   step(torch.tensor, torch_optim, steps, kwargs=opts)):
      np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)

  def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)
  def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)
  def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)

  def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)
  def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)
  def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)

  def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)
  def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)
  def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)

  def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)
  def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)

  def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)
  def test_multistep_sgd_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)
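
  # Adam and AdamW, single- and multi-step; tolerances are looser than for the SGD tests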
  def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)
  def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)
  def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)
  def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)

  def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-4, 5e-4)

  def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)
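
  # passing the same parameter to an optimizer twice should give the same result as passing it once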
  def test_duped_weights(self):
    for Opt in [Adam, AdamW, SGD]:
      losses = []
      for i in range(2):
        w = Tensor(x_init.copy())
        opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)
        loss = None
        for _ in range(3):
          loss = w.sum()
          opt.zero_grad()
          loss.backward()
          opt.step()
        losses.append(loss.numpy())
      np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)

if __name__ == '__main__':
  unittest.main()