# tinygrad/test/test_optim.py

import numpy as np
import torch
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.nn.optim import Adam, SGD, AdamW
from tinygrad.helpers import CI
from test.helpers import is_dtype_supported

# Fixed seed so the initial values below are identical on every run.
np.random.seed(1337)
# Draw order is part of the fixture: x (1x4), then W (4x4), then m (1x4), all as float32.
x_init, W_init, m_init = (np.random.randn(*shape).astype(np.float32) for shape in ((1,4), (4,4), (1,4)))

class TeenyNet:
  """Minimal model with two trainable values whose output is the sum of their elementwise product."""
  def __init__(self, tensor):
    # `tensor` is the constructor to build parameters with (the tests pass Tensor or torch.tensor)
    def param(init): return tensor(init.copy(), requires_grad=True)
    self.x = param(x_init)
    self.W = param(W_init)
  def forward(self):
    # x is multiplied elementwise (with broadcasting) against W, then everything is reduced to one scalar
    product = self.x * self.W
    return product.sum()

class TinyNet:
  """Small model: matmul -> relu -> log_softmax, then scaled and shifted by a fixed tensor `m` and summed."""
  def __init__(self, tensor):
    # copy the shared initial arrays so each network owns its own data
    x0, W0, m0 = x_init.copy(), W_init.copy(), m_init.copy()
    self.x = tensor(x0, requires_grad=True)
    self.W = tensor(W0, requires_grad=True)
    # m is built without requires_grad, unlike x and W
    self.m = tensor(m0)

  def forward(self):
    hidden = self.x.matmul(self.W).relu()
    log_probs = hidden.log_softmax(1)
    scaled = log_probs.mul(self.m)
    return scaled.add(self.m).sum()

def step(tensor, optim, steps=1, teeny=False, **kwargs):
  """Train a fresh test network for `steps` optimizer updates and return its parameters.

  tensor: constructor used to build the network's values (e.g. Tensor or torch.tensor).
  optim: optimizer class; it is instantiated with [net.x, net.W] and **kwargs.
  teeny: build TeenyNet when true, TinyNet otherwise.
  Returns a (x, W) tuple of numpy arrays taken from the detached parameters.
  """
  model_cls = TeenyNet if teeny else TinyNet
  net = model_cls(tensor)
  optimizer = optim([net.x, net.W], **kwargs)
  for _ in range(steps):
    # the forward pass is built first, then old grads are cleared before backprop
    loss = net.forward()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  final_x = net.x.detach().numpy()
  final_W = net.W.detach().numpy()
  return final_x, final_W

@unittest.skipIf(CI and Device.DEFAULT == "CUDA", "slow")
class TestOptim(unittest.TestCase):
  """Compare tinygrad's SGD/Adam/AdamW against the corresponding torch.optim classes.

  Each comparison test runs `step` once with tinygrad (Tensor) and once with torch (torch.tensor),
  then checks the resulting parameter arrays with np.testing.assert_allclose.
  The whole class is skipped when CI is set and the default device is CUDA ("slow").
  """

  def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):
    """Run `steps` updates with both optimizer classes and assert the returned arrays match.

    opts is forwarded to `step` as keyword arguments, so next to optimizer options (lr, momentum, ...)
    it may contain 'teeny': True to select the smaller network.
    atol/rtol are passed straight to np.testing.assert_allclose.
    """
    for x,y in zip(step(Tensor, tinygrad_optim, steps, **opts),
                   step(torch.tensor, torch_optim, steps, **opts)):
      np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)

  # thin wrappers that pair each tinygrad optimizer with its torch counterpart
  def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)
  def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)
  def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)

  def test_multistep_sgd_high_lr_teeny(self): self._test_sgd(2, {'lr': 1.1, 'teeny': True}, 1e-6, 1e-5)
  def test_multistep_adam_high_lr_teeny(self): self._test_adam(2, {'lr': 1.1, 'teeny': True}, 2e-4, 5e-4)

  def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)
  def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)
  def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)

  def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)
  def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)
  def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)

  def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)
  def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)

  def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)
  def test_multistep_sgd_nesterov_momentum_wd(self):
    self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum_wd(self):
    self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)

  def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)
  def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)
  def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)
  def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)

  def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-3, 5e-4)

  def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)

  def test_duped_weights(self):
    """Passing the same tensor twice to an optimizer must give the same loss as passing it once."""
    for Opt in [Adam, AdamW, SGD]:
      losses = []
      for i in range(2):
        w = Tensor(x_init.copy())
        # first pass: parameter listed once; second pass: the same tensor listed twice
        opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)

        loss = None
        for _ in range(3):
          loss = w.sum()
          opt.zero_grad()
          loss.backward()
          opt.step()
        losses.append(loss.numpy())

      np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)

  @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
  def test_mixed_precision(self):
    """Run the optimizer comparisons with dtypes.default_float temporarily set to half."""
    old_default_float, dtypes.default_float = dtypes.default_float, dtypes.half
    try:
      # weight update would overflow without upcasting
      self._test_sgd(10, {'lr': 1e10}, 1e-6, 3e-4)
      self._test_adam(1, {'lr': 1e10}, 1e-4, 1e-4)
      self._test_adamw(1, {'lr': 1e10}, 1e-4, 1e-4)
    finally:
      # restore the global default even when an assertion above fails, so later tests are not run in half
      dtypes.default_float = old_default_float
# Run the tests with unittest's own runner when this file is executed as a script.
if __name__ == '__main__':
  unittest.main()
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00			`import numpy as np`
			`import torch`
			`import unittest`
use at least float32 for optim.lr (#4297) * use at least float32 for optim.lr when doing mixed precision training (float32 weight, default_float=half), still use float32 to store lr. it would have been upcasted later in actual weight update, but would have lost precision. this improved resnet convergence significantly * undo type annotation 2024-04-26 02:42:28 +08:00			`from tinygrad import Tensor, Device, dtypes`
remove RMSprop, nobody uses it anymore 2023-03-21 03:31:34 +08:00			`from tinygrad.nn.optim import Adam, SGD, AdamW`
Remove pytest markers (#2831) * remove pytest marker * fix some, skip some * tweak * fix * skip slow * skip more 2023-12-19 07:53:28 +08:00			`from tinygrad.helpers import CI`
use at least float32 for optim.lr (#4297) * use at least float32 for optim.lr when doing mixed precision training (float32 weight, default_float=half), still use float32 to store lr. it would have been upcasted later in actual weight update, but would have lost precision. this improved resnet convergence significantly * undo type annotation 2024-04-26 02:42:28 +08:00			`from test.helpers import is_dtype_supported`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`np.random.seed(1337)`
			`x_init = np.random.randn(1,4).astype(np.float32)`
			`W_init = np.random.randn(4,4).astype(np.float32)`
			`m_init = np.random.randn(1,4).astype(np.float32)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
remove realize from optimizer (#2880) * remove realize from optimizer * one still needed * opt realize 2023-12-21 08:42:41 +08:00			`class TeenyNet:`
			`def __init__(self, tensor):`
			`self.x = tensor(x_init.copy(), requires_grad=True)`
			`self.W = tensor(W_init.copy(), requires_grad=True)`
			`def forward(self):`
			`return (self.x * self.W).sum()`

use class Foo: instead of class Foo(): (#1797) * use class Foo: instead of class Foo(): * add ruff linter, copy settings from .flake8 to ruff.toml 2023-09-07 03:20:25 +08:00			`class TinyNet:`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def __init__(self, tensor):`
			`self.x = tensor(x_init.copy(), requires_grad=True)`
			`self.W = tensor(W_init.copy(), requires_grad=True)`
			`self.m = tensor(m_init.copy())`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
			`def forward(self):`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`out = self.x.matmul(self.W).relu()`
Add weight decay to SGD (#883) * feat: add weight decay to sgd * fix: fix tests 2023-06-02 04:13:18 +08:00			`# print(out.detach().numpy())`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`out = out.log_softmax(1)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00			`out = out.mul(self.m).add(self.m).sum()`
			`return out`

remove realize from optimizer (#2880) * remove realize from optimizer * one still needed * opt realize 2023-12-21 08:42:41 +08:00			`def step(tensor, optim, steps=1, teeny=False, **kwargs):`
			`net = TeenyNet(tensor) if teeny else TinyNet(tensor)`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`optim = optim([net.x, net.W], **kwargs)`
			`for _ in range(steps):`
			`out = net.forward()`
			`optim.zero_grad()`
			`out.backward()`
			`optim.step()`
			`return net.x.detach().numpy(), net.W.detach().numpy()`

Remove pytest markers (#2831) * remove pytest marker * fix some, skip some * tweak * fix * skip slow * skip more 2023-12-19 07:53:28 +08:00			`@unittest.skipIf(CI and Device.DEFAULT == "CUDA", "slow")`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`class TestOptim(unittest.TestCase):`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):`
remove realize from optimizer (#2880) * remove realize from optimizer * one still needed * opt realize 2023-12-21 08:42:41 +08:00			`for x,y in zip(step(Tensor, tinygrad_optim, steps, **opts),`
			`step(torch.tensor, torch_optim, steps, **opts)):`
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)`
			`def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)`
add AdamW optimizer (#716) * add AdamW optimizer * one liner Adam optimizer 2023-03-20 03:51:06 +08:00			`def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)`
Dedup params in `Optimizer` (#1047) * Dedup params in optimizer * Passing the same tensor multiple times in the set of learnable params passed to optimizers can result in models completely failing to learn, but no errors are produced. This dedups tensors to avoid the problem. * Fix types * Use new variable to satisfy linter * Use `helpers.dedup` instead of `set()` to dedup params * Add test for duped params in optimizers 2023-06-26 15:49:23 +08:00
remove realize from optimizer (#2880) * remove realize from optimizer * one still needed * opt realize 2023-12-21 08:42:41 +08:00			`def test_multistep_sgd_high_lr_teeny(self): self._test_sgd(2, {'lr': 1.1, 'teeny': True}, 1e-6, 1e-5)`
			`def test_multistep_adam_high_lr_teeny(self): self._test_adam(2, {'lr': 1.1, 'teeny': True}, 2e-4, 5e-4)`

multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)`
			`def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)`
Add weight decay to SGD (#883) * feat: add weight decay to sgd * fix: fix tests 2023-06-02 04:13:18 +08:00			`def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)`
			`def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)`
			`def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)`
Add weight decay to SGD (#883) * feat: add weight decay to sgd * fix: fix tests 2023-06-02 04:13:18 +08:00			`def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)`
			`def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)`
multistep optim tests passing 2023-03-12 09:49:53 +08:00
			`def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)`
			`def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)`
Add weight decay to SGD (#883) * feat: add weight decay to sgd * fix: fix tests 2023-06-02 04:13:18 +08:00			`def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)`
			`def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)`
multistep optim tests passing 2023-03-12 09:49:53 +08:00
			`def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)`
			`def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)`
ruff checks the max line length is 150 (#2734) * ruff checks the max line length is 150 * fix tensor.py * a lot more * done 2023-12-13 09:34:47 +08:00			`def test_multistep_sgd_nesterov_momentum_wd(self):`
			`self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)`
			`def test_multistep_sgd_high_lr_nesterov_momentum_wd(self):`
			`self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)`
Consistent testing (#137) * Consistent GPU classes Convert the existing GPU classes into one standard format. Remove duplicated functions in `test_mnist` and create a TestMNISTGPU class. This reduces line count and ensures consistency. Use `@unittest.skipUnless(GPU, "Requires GPU")` instead of `if GPU:` to skip GPU testing. This will ensure that skipped tests are displayed accordingly in the pytest output. * Optim Testing now supports GPU * Tensor testing now supports GPU jacobian and gradcheck auto skipped until GPU float64 support added. * GPU support for custom constructor methods * Remove GPU flag from Model constructors It was requested that the `gpu` kwarg be removed from the model constructor. GPU conversion is now handled in the train function. This also required the conversion of Optimizer parameters as they are constructed prior to execution of the `train` function and are dependant on the model GPU state. * Fix typo: float32->float64 * Clean `get_parameters` utility Just a quick refactor w/ the new support for optimizers. * Remove GPU kwarg from TinyNet Remove `gpu` kwarg from tiny net to match test_mnist `train` function. 2020-12-09 18:25:27 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)`
Add WEBGPU tests to CI (#1463) * webgpu tests * assert device is webgpu * missed env set * exclude failing ci tests * ignore test file * changed acc for adam test 2023-08-07 01:32:01 +08:00			`def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)`
add AdamW optimizer (#716) * add AdamW optimizer * one liner Adam optimizer 2023-03-20 03:51:06 +08:00			`def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)`
Add WEBGPU tests to CI (#1463) * webgpu tests * assert device is webgpu * missed env set * exclude failing ci tests * ignore test file * changed acc for adam test 2023-08-07 01:32:01 +08:00			`def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
multistep optim tests passing 2023-03-12 09:49:53 +08:00			`def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)`
Use exec_alu for lazy const folding (#4039) 2024-04-03 08:52:05 +08:00			`def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-3, 5e-4)`
remove RMSprop, nobody uses it anymore 2023-03-21 03:31:34 +08:00
add AdamW optimizer (#716) * add AdamW optimizer * one liner Adam optimizer 2023-03-20 03:51:06 +08:00			`def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)`
Remove POW llop and add SQRT llop (#1104) * fixed division by zero for fast operations * made et closer to 0 * replace POW llop with SQRT * updated mlops to swap SQRT and POW llops * updated hlops to swap POW and SQRT * added sqrt llop to cpu runtime * added sqrt llop to cstyle codegen * added POW llop to llvm ir codegen * added SQRT llop to torch runtime * moved pow from mlops to hlops * found a better way to do reverse pow * fixed indentation * added SQRT llop to triton * update docs to match new llops * removed POW operator from assembly codegen * added sqrt and rsqrt to pow hlop * rewrote pow function in tensor.py * Adjust tolerance * Adjust for adamw * Reduce for Adam too * removed accidental leftover code * removed all of accidental code * added rsqrt test * removed pow from mlops again it was added back when resolving merge conflicts --------- Co-authored-by: Jacky Lee <jla524@sfu.ca> 2023-07-06 09:07:58 +08:00			`def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)`
efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00
Dedup params in `Optimizer` (#1047) * Dedup params in optimizer * Passing the same tensor multiple times in the set of learnable params passed to optimizers can result in models completely failing to learn, but no errors are produced. This dedups tensors to avoid the problem. * Fix types * Use new variable to satisfy linter * Use `helpers.dedup` instead of `set()` to dedup params * Add test for duped params in optimizers 2023-06-26 15:49:23 +08:00			`def test_duped_weights(self):`
			`for Opt in [Adam, AdamW, SGD]:`
			`losses = []`
			`for i in range(2):`
			`w = Tensor(x_init.copy())`
			`opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)`

			`loss = None`
			`for _ in range(3):`
			`loss = w.sum()`
			`opt.zero_grad()`
			`loss.backward()`
			`opt.step()`
			`losses.append(loss.numpy())`

			`np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)`

use at least float32 for optim.lr (#4297) * use at least float32 for optim.lr when doing mixed precision training (float32 weight, default_float=half), still use float32 to store lr. it would have been upcasted later in actual weight update, but would have lost precision. this improved resnet convergence significantly * undo type annotation 2024-04-26 02:42:28 +08:00			`@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")`
			`def test_mixed_precision(self):`
			`old_default_float, dtypes.default_float = dtypes.default_float, dtypes.half`
			`# weight update would overflow without upcasting`
			`self._test_sgd(10, {'lr': 1e10}, 1e-6, 3e-4)`
			`self._test_adam(1, {'lr': 1e10}, 1e-4, 1e-4)`
			`self._test_adamw(1, {'lr': 1e10}, 1e-4, 1e-4)`
			`dtypes.default_float = old_default_float`

efficient version of adam (#20) * counteracted bias initialization * test new adam * add optimizer tests * rename helper function names to fix the test * remove redundant import 2020-10-28 06:54:40 +08:00			`if __name__ == '__main__':`
Over 90% on CIFAR with examples/hlb_cifar10.py (#1073) * fix eval, lr decay, best eval * 82.27 * 82.64 * 82.79, reproducable * add lr sched, 85.26 * 87.42 * 87.94 * 87.42 * tta with flip * training flip aug * refactor * using Tensor for LR is faster * 89.5 * refactor, flip only train set * 90.01 * 90.64 * eval jit * refactor * only JIT model * fix eval JIT * fix eval JIT * 90.82 * STEPS=900 reaches 90.22 * TTA envvar * TTA default 0 * fully jit training * refactor optim * fix sched * add label smoothing * param changes * patial gelu * OneCycle with pause * gelu maybe works * 90.12 * remove pause lr * maybe fix lr schedulers * scheduler test passing * comments * try mixup * shuffle! * add back the missing last eval * fix shuffle bugs * add mixup prob * fix mixup prob * 90.19 * correct mixup * correct mixup * correct mixup * 90.24 * 90.33 * refactor, add type hints * add gradient clipping * maybe fix test * full JIT * back to relu for now * pass mixup prob as param * add typehints * maybe CI works * try erf gelu * CI, types * remove useless import/ * refactor optim * refactor optim * try leakyrelu * try celu * gelu * 90.67 * remove grad clip * remove grad clip tests * revert params * add test for OneCycleLR * 90.62 * fix eval timing * fix eval timing again * so where i calculate mixup_prob matters --------- Co-authored-by: Kunwar Raj Singh <kunwar31@pop-os.localdomain> 2023-07-07 11:46:22 +08:00			`unittest.main()`