add RESET_STEP in bert to control reset (#6818)

Same approach as the one already used for resnet.
This commit is contained in:
chenyu 2024-09-30 09:39:04 -04:00 committed by GitHub
parent 0c24fec9f4
commit f59517754e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 2 additions and 2 deletions

View File

@@ -802,7 +802,7 @@ def train_bert():
if i % eval_step_freq == 0 or (BENCHMARK and i == BENCHMARK):
if MLLOGGER and RUNMLPERF:
MLLOGGER.start(key=mllog_constants.EVAL_START, value=None, metadata={"epoch_num": 1, "epoch_count": 1, "step_num": i})
- train_step_bert.reset()
+ if getenv("RESET_STEP", 1): train_step_bert.reset()
eval_lm_losses = []
eval_clsf_losses = []
eval_lm_accs = []
@@ -840,7 +840,7 @@ def train_bert():
MLLOGGER.event(key=mllog_constants.INIT_STOP, value=None)
return
- eval_step_bert.reset()
+ if getenv("RESET_STEP", 1): eval_step_bert.reset()
del eval_data, eval_result
avg_lm_loss = sum(eval_lm_losses) / len(eval_lm_losses)
avg_clsf_loss = sum(eval_clsf_losses) / len(eval_clsf_losses)