From 79de0191a8006d914514dee59bbf1c1e8a566dda Mon Sep 17 00:00:00 2001 From: "Harle, Antoine (Contracteur)" Date: Wed, 12 Feb 2020 13:43:44 -0500 Subject: [PATCH] LR scheduler + Resolution pb ResNet50/WRN --- higher/smart_aug/datasets.py | 2 +- higher/smart_aug/dataug.py | 2 + higher/smart_aug/test_dataug.py | 25 ++++++++---- higher/smart_aug/train_utils.py | 70 +++++++++++++++++++++++++++------ 4 files changed, 78 insertions(+), 21 deletions(-) diff --git a/higher/smart_aug/datasets.py b/higher/smart_aug/datasets.py index 99a3a5c..c640f9c 100755 --- a/higher/smart_aug/datasets.py +++ b/higher/smart_aug/datasets.py @@ -7,7 +7,7 @@ from torch.utils.data.dataset import ConcatDataset import torchvision #Train/Validation batch size. -BATCH_SIZE = 300 +BATCH_SIZE = 512 #Test batch size. TEST_SIZE = BATCH_SIZE #TEST_SIZE = 10000 #legerement +Rapide / + Consomation memoire ! diff --git a/higher/smart_aug/dataug.py b/higher/smart_aug/dataug.py index 7c85473..8cc91e2 100755 --- a/higher/smart_aug/dataug.py +++ b/higher/smart_aug/dataug.py @@ -958,6 +958,8 @@ class Augmented_model(nn.Module): model.step(loss) + Does not support LR scheduler. + See ''run_simple_smartaug'' for a complete example. Args: diff --git a/higher/smart_aug/test_dataug.py b/higher/smart_aug/test_dataug.py index 67f042b..24ec173 100755 --- a/higher/smart_aug/test_dataug.py +++ b/higher/smart_aug/test_dataug.py @@ -82,7 +82,7 @@ if __name__ == "__main__": } #Parameters n_inner_iter = 1 - epochs = 150 + epochs = 200 dataug_epoch_start=0 optim_param={ 'Meta':{ @@ -91,10 +91,11 @@ if __name__ == "__main__": }, 'Inner':{ 'optim': 'SGD', - 'lr':1e-2, #1e-2 + 'lr':1e-1, #1e-2/1e-1 'momentum':0.9, #0.9 - 'decay':0.0001, + 'decay':0.0005, #0.0005 'nesterov':True, + 'scheduler':'exponential', #None, 'cosine', 'multiStep', 'exponential' } } @@ -103,21 +104,26 @@ if __name__ == "__main__": #model = ResNet(num_classes=10) import torchvision.models as models #model=models.resnet18() - model_name = 'resnet50' #'wide_resnet50_2' #'resnet18' #str(model) - model = getattr(models.resnet, model_name)(pretrained=False) + model_name = 'resnet18' #'wide_resnet50_2' #'resnet18' #str(model) + model = getattr(models.resnet, model_name)(pretrained=False, num_classes=len(dl_train.dataset.classes)) #### Classic #### if 'classic' in tasks: + torch.cuda.reset_max_memory_allocated() #reset_peak_stats + torch.cuda.reset_max_memory_cached() #reset_peak_stats t0 = time.perf_counter() + model = model.to(device) print("{} on {} for {} epochs".format(model_name, device_name, epochs)) + #print("RandAugment(N{}-M{:.2f})-{} on {} for {} epochs".format(rand_aug['N'],rand_aug['M'],model_name, device_name, epochs)) log= train_classic(model=model, opt_param=optim_param, epochs=epochs, print_freq=10) #log= train_classic_higher(model=model, epochs=epochs) exec_time=time.perf_counter() - t0 - max_cached = torch.cuda.max_memory_cached()/(1024.0 * 1024.0) #torch.cuda.max_memory_reserved() + max_allocated = torch.cuda.max_memory_allocated()/(1024.0 * 1024.0) + max_cached = torch.cuda.max_memory_cached()/(1024.0 * 1024.0) #torch.cuda.max_memory_reserved() #MB #### print('-'*9) times = [x["time"] for x in log] @@ -125,10 +131,13 @@ if __name__ == "__main__": "Time": (np.mean(times),np.std(times), exec_time), 'Optimizer': optim_param['Inner'], "Device": device_name, - "Memory": max_cached, + "Memory": [max_allocated, max_cached], + #"Rand_Aug": rand_aug, "Log": log} print(model_name,": acc", out["Accuracy"], "in:", out["Time"][0], "+/-", out["Time"][1]) filename = "{}-{} 
epochs".format(model_name,epochs) + #print("RandAugment-",model_name,": acc", out["Accuracy"], "in:", out["Time"][0], "+/-", out["Time"][1]) + #filename = "RandAugment(N{}-M{:.2f})-{}-{} epochs".format(rand_aug['N'],rand_aug['M'],model_name,epochs) with open("../res/log/%s.json" % filename, "w+") as f: try: json.dump(out, f, indent=True) @@ -163,7 +172,7 @@ if __name__ == "__main__": inner_it=n_inner_iter, dataug_epoch_start=dataug_epoch_start, opt_param=optim_param, - print_freq=10, + print_freq=1, unsup_loss=1, hp_opt=False, save_sample_freq=None) diff --git a/higher/smart_aug/train_utils.py b/higher/smart_aug/train_utils.py index 680c8a6..519b838 100755 --- a/higher/smart_aug/train_utils.py +++ b/higher/smart_aug/train_utils.py @@ -143,6 +143,8 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): (list) Logs of training. Each items is a dict containing results of an epoch. """ device = next(model.parameters()).device + + #Optimizer #opt = torch.optim.Adam(model.parameters(), lr=1e-3) optim = torch.optim.SGD(model.parameters(), lr=opt_param['Inner']['lr'], @@ -150,11 +152,28 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): weight_decay=opt_param['Inner']['decay'], nesterov=opt_param['Inner']['nesterov']) #lr=1e-2 / momentum=0.9 + #Scheduler + inner_scheduler=None + if opt_param['Inner']['scheduler']=='cosine': + inner_scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=epochs, eta_min=0.) + elif opt_param['Inner']['scheduler']=='multiStep': + #Multistep milestones inspired by AutoAugment + inner_scheduler=torch.optim.lr_scheduler.MultiStepLR(optim, + milestones=[int(epochs/3), int(epochs*2/3), int(epochs*2.7/3)], + gamma=0.1) + elif opt_param['Inner']['scheduler']=='exponential': + #inner_scheduler=torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.1) #Wrong gamma + inner_scheduler=torch.optim.lr_scheduler.LambdaLR(optim, lambda epoch: (1 - epoch / epochs) ** 0.9) + elif opt_param['Inner']['scheduler'] is not None: + raise ValueError("Lr scheduler unknown : %s"%opt_param['Inner']['scheduler']) + + #Training model.train() dl_val_it = iter(dl_val) log = [] for epoch in range(epochs): #print_torch_mem("Start epoch") + #print(optim.param_groups[0]['lr']) t0 = time.perf_counter() for i, (features, labels) in enumerate(dl_train): #viz_sample_data(imgs=features, labels=labels, fig_name='../samples/data_sample_epoch{}_noTF'.format(epoch)) @@ -168,6 +187,10 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): loss.backward() optim.step() + + if inner_scheduler is not None: + inner_scheduler.step() + #### Tests #### tf = time.perf_counter() @@ -175,15 +198,6 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): accuracy, f1 =test(model) model.train() - #### Print #### - if(print_freq and epoch%print_freq==0): - print('-'*9) - print('Epoch : %d/%d'%(epoch,epochs)) - print('Time : %.00f'%(tf - t0)) - print('Train loss :',loss.item(), '/ val loss', val_loss.item()) - print('Accuracy max:', accuracy) - print('F1 :', ["{0:0.4f}".format(i) for i in f1]) - #### Log #### data={ "epoch": epoch, @@ -196,6 +210,14 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): "param": None, } log.append(data) + #### Print #### + if(print_freq and epoch%print_freq==0): + print('-'*9) + print('Epoch : %d/%d'%(epoch,epochs)) + print('Time : %.00f'%(tf - t0)) + print('Train loss :',loss.item(), '/ val loss', val_loss.item()) + print('Accuracy max:', max([x["acc"] for x in log])) + print('F1 :', ["{0:0.4f}".format(i) for i in f1]) return log @@ 
-236,8 +258,8 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
     ## Optimizers ##
     #Inner Opt
-    optim = torch.optim.SGD(model.parameters(),
-                            lr=opt_param['Inner']['lr'],
+    inner_opt = torch.optim.SGD(model['model']['original'].parameters(),
+                            lr=opt_param['Inner']['lr'],
                             momentum=opt_param['Inner']['momentum'],
                             weight_decay=opt_param['Inner']['decay'],
                             nesterov=opt_param['Inner']['nesterov']) #lr=1e-2 / momentum=0.9
@@ -247,6 +269,21 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
                             grad_callback=(lambda grads: clip_norm(grads, max_norm=10)),
                             track_higher_grads=high_grad_track)
 
+    #Scheduler (attached to inner_opt, the non-differentiable optimizer)
+    inner_scheduler=None
+    if opt_param['Inner']['scheduler']=='cosine':
+        inner_scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(inner_opt, T_max=epochs, eta_min=0.)
+    elif opt_param['Inner']['scheduler']=='multiStep':
+        #Multistep milestones inspired by AutoAugment
+        inner_scheduler=torch.optim.lr_scheduler.MultiStepLR(inner_opt,
+            milestones=[int(epochs/3), int(epochs*2/3), int(epochs*2.7/3)],
+            gamma=0.1)
+    elif opt_param['Inner']['scheduler']=='exponential':
+        #inner_scheduler=torch.optim.lr_scheduler.ExponentialLR(inner_opt, gamma=0.1) #Wrong gamma
+        inner_scheduler=torch.optim.lr_scheduler.LambdaLR(inner_opt, lambda epoch: (1 - epoch / epochs) ** 0.9)
+    elif opt_param['Inner']['scheduler'] is not None:
+        raise ValueError("Lr scheduler unknown : %s"%opt_param['Inner']['scheduler'])
+
     #Meta Opt
     hyper_param = list(model['data_aug'].parameters())
     if hp_opt :
@@ -286,7 +323,7 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
                 #print_graph(loss) #to visualize computational graph
                 #t = time.process_time()
-                diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
+                diffopt.step(loss)#(opt.zero_grad, loss.backward, opt.step)
 
                 #print(len(model['model']['functional']._fast_params),"step", time.process_time()-t)
 
@@ -318,6 +355,13 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
         tf = time.perf_counter()
 
+        if inner_scheduler is not None:
+            inner_scheduler.step()
+            #Transfer inner_opt lr to diffopt
+            for diff_param_group in diffopt.param_groups:
+                for param_group in inner_opt.param_groups:
+                    diff_param_group['lr'] = param_group['lr']
+
         if (save_sample_freq and epoch%save_sample_freq==0): #Data sample saving
             try:
                 viz_sample_data(imgs=xs, labels=ys, fig_name='../samples/data_sample_epoch{}_noTF'.format(epoch))
@@ -396,6 +440,8 @@ def run_simple_smartaug(model, opt_param, epochs=1, inner_it=1, print_freq=1, un
     Training loss can either be computed directly from augmented inputs (unsup_loss=0).
     However, it is recommended to use the mixed loss computation, which combine original and augmented inputs to compute the loss (unsup_loss>0).
 
+    Does not support LR scheduler.
+
     Args:
         model (nn.Module): Augmented model to train.
        opt_param (dict): Dictionnary containing optimizers parameters.
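
Note on the test_dataug.py change above: the torchvision model is now built with num_classes taken from the training set instead of the 1000-class ImageNet default, which appears to be the ResNet50/WRN problem referenced in the subject line. The following is a minimal standalone sketch of that construction, not code from the patch; CIFAR10 is assumed as the dataset (the patch reads the class count from dl_train.dataset.classes, which is equivalent).

import torchvision
import torchvision.models as models

# Any torchvision dataset exposing a .classes list works; CIFAR10 is assumed here.
train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)

model_name = 'resnet50'  # or 'wide_resnet50_2', 'resnet18', ...
# pretrained=False is required when overriding num_classes.
model = getattr(models.resnet, model_name)(pretrained=False,
                                           num_classes=len(train_set.classes))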
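The scheduler selection added to train_classic and run_dist_dataugV3 is duplicated in both functions and could be factored into a small helper. The sketch below is not code from the patch: the helper name make_inner_scheduler is assumed, and it mirrors the 'cosine' / 'multiStep' / 'exponential' choices above (the 'exponential' branch is really a polynomial decay of power 0.9 implemented with LambdaLR).

import torch

def make_inner_scheduler(optimizer, opt_param, epochs):
    """Build the inner LR scheduler described by opt_param['Inner']['scheduler']."""
    name = opt_param['Inner'].get('scheduler')
    if name is None:
        return None
    if name == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0.)
    if name == 'multiStep':
        # Milestones at 1/3, 2/3 and 0.9 of training, as in the patch (AutoAugment-inspired).
        return torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(epochs / 3), int(epochs * 2 / 3), int(epochs * 2.7 / 3)],
            gamma=0.1)
    if name == 'exponential':
        # Polynomial decay (power 0.9) of the initial LR over `epochs`.
        return torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: (1 - epoch / epochs) ** 0.9)
    raise ValueError("LR scheduler unknown: %s" % name)

Each call site would then reduce to inner_scheduler = make_inner_scheduler(inner_opt, opt_param, epochs) (or optim in train_classic).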
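In run_dist_dataugV3 the scheduler is attached to a plain SGD optimizer (inner_opt) while the inner updates actually go through higher's differentiable optimizer (diffopt), so after each scheduler step the new learning rate has to be copied into diffopt's param groups. Below is a minimal sketch of that pattern with a toy model; it assumes higher's generic monkeypatch/get_diff_optim API rather than the repository's Augmented_model wrapper, and a single param group (the patch's nested loops over both param_groups lists are equivalent in that case).

import torch
import torch.nn as nn
import higher

model = nn.Linear(10, 2)
inner_opt = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9)
inner_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(inner_opt, T_max=10)

# Functional copy of the model and a differentiable optimizer wrapping inner_opt.
fmodel = higher.patch.monkeypatch(model, copy_initial_weights=True)
diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(), fmodel=fmodel)

x, y = torch.randn(4, 10), torch.randint(0, 2, (4,))
for epoch in range(10):
    loss = nn.functional.cross_entropy(fmodel(x), y)
    diffopt.step(loss)  # differentiable update of fmodel's fast weights

    inner_scheduler.step()  # advances the LR on the plain optimizer only
    # Mirror the scheduled LR into the differentiable optimizer's param groups.
    for diff_group, group in zip(diffopt.param_groups, inner_opt.param_groups):
        diff_group['lr'] = group['lr']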