fix optimizer

This commit is contained in:
George Hotz 2023-01-29 00:23:06 -08:00
parent a9cabce791
commit ebdec2b72f
3 changed files with 13 additions and 7 deletions

View File

@@ -20,7 +20,7 @@ def get_random_intervention(k):
while 1:
up_axis = random.randint(0, k.shape_len-1)
amount = random.choice([4, 8])
if not all(x[up_axis] == 1 or x[up_axis]%amount == 0 for x in k.shapes): continue
if not all(st.shape[up_axis] == 1 or st.shape[up_axis]%amount == 0 for st in k.sts): continue
return 1, up_axis, amount
def apply_intervention(k, typ, *dat):
@@ -63,16 +63,16 @@ def search(ast):
# TODO: support upcasting, splitting, and local grouping for reduce
CL.time_sum = 0
k.codegen()(*k.bufs)
if CL.time_sum < best_time:
print(f"accepting {inter} with time {best_time} -> {CL.time_sum}")
if CL.time_sum < best_time * 0.95:
print(f"accepting {inter} with time {best_time:.2f} -> {CL.time_sum:.2f} ratio {best_time/CL.time_sum:.2f}x")
best_time = CL.time_sum
winning_interventions.append(inter)
for i in range(200):
for i in range(100):
try:
test()
except Exception as e:
#traceback.print_exc()
traceback.print_exc()
pass
# run best
@@ -138,6 +138,12 @@ if __name__ == "__main__":
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 512, 512, 1, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (512, 512))
elif int(os.getenv("FASTCONV", "0")):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(32, 1, 32, 32, 32, 64, 3, 3), views=[View((32, 1, 32, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 64, 34, 34), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(32, 1, 32, 32, 32, 64, 3, 3), views=[View((32, 1, 32, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 64, 3, 3), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (32, 1, 32, 32, 32, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 32, 32, 32))
else:
# reduce
buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))

View File

@@ -139,7 +139,7 @@ class TestSpeed(unittest.TestCase):
helper_test_generic_square('mul_sum', 4096, f, f)
def test_add(self):
for N in [1024, 4096]:
for N in [1, 1024, 4096]:
def f(a, b): return a + b
helper_test_generic_square('add', N, f, f)

View File

@@ -356,7 +356,7 @@ class GPUBuffer(ExplicitExecAST):
def exec_ast(cls, ast:LazyOp):
k = CLASTKernel(ast)
k.codegen()(*k.bufs)
if PRINT_AST == "1" or PRINT_AST == k.fxn.name:
if PRINT_AST == "1" or (hasattr(k, "fxn") and PRINT_AST == k.fxn.name):
print(k.fxn.name)
k.print()
if TEST_AST: