import unittest
from tinygrad import Tensor, Device, Variable
from examples.gpt2 import Transformer
from tinygrad.nn.state import get_state_dict

class TestMethodCache(unittest.TestCase):
  """Realize a computation, remove the default device's compiler, then realize a similar one.

  Every test first runs an expression with the compiler in place, then sets
  `Device[Device.DEFAULT].compiler` to None and runs a structurally matching expression.
  NOTE(review): presumably the second run can only succeed if nothing new has to be
  compiled, i.e. the previously built kernels are reused (the "method cache" in the
  test names) -- confirm against the device implementation.
  """

  def setUp(self):
    """Remember the default device's compiler so tearDown can put it back."""
    self.backup_compiler = Device[Device.DEFAULT].compiler

  def tearDown(self):
    """Reinstall the compiler saved in setUp, undoing the removal done inside each test."""
    Device[Device.DEFAULT].compiler = self.backup_compiler

  def _drop_compiler(self):
    """Set the default device's compiler to None for the remainder of the current test."""
    Device[Device.DEFAULT].compiler = None

  def test_simple_methodcache(self):
    """One add with the compiler present, then an add of two other tensors without it."""
    w, x, y, z = [Tensor([value]) for value in (1, 2, 3, 4)]
    (w+x).realize()
    self._drop_compiler()
    (y+z).realize()

  def test_nested_methodcache(self):
    """A sum of two identical adds, repeated on a second pair of tensors without a compiler."""
    w, x, y, z = [Tensor([value]) for value in (1, 2, 3, 4)]
    ((w+x)+(w+x)).realize()
    self._drop_compiler()
    ((y+z)+(y+z)).realize()

  def test_nested_methodcache_swap(self):
    """A sum of two different adds, then the same sum with its operands swapped and no compiler."""
    w, x, y, z = [Tensor([value]) for value in (1, 2, 3, 4)]
    ((w+x)+(y+z)).realize()
    self._drop_compiler()
    ((y+z)+(w+x)).realize()

  def test_small_transformer(self):
    """Run a tiny GPT-2 Transformer for three positions, twice with and once without a compiler."""
    tiny_config = {
      "dim": 16,
      "n_heads": 8,
      "n_layers": 8,
      "norm_eps": 1e-05,
      "vocab_size": 10,
    }
    model = Transformer(**tiny_config)
    # replace every entry of the state dict with a realized empty tensor of matching shape and dtype
    for weight in get_state_dict(model).values():
      weight.assign(Tensor.empty(*weight.shape, dtype=weight.dtype).realize())

    def run_three_positions():
      # same four-token input each time; start_pos is bound to 0, 1, 2 in turn
      for pos in range(3):
        model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(pos)).realize()

    # NOTE: the pass with the compiler present has to be done twice due to the k-v cache
    run_three_positions()
    run_three_positions()
    self._drop_compiler()
    run_three_positions()

# When this file is executed directly (rather than imported), hand control to
# unittest's command-line runner.
if __name__ == '__main__':
  unittest.main()