From 79de0191a8006d914514dee59bbf1c1e8a566dda Mon Sep 17 00:00:00 2001 From: "Harle, Antoine (Contracteur)" Date: Wed, 12 Feb 2020 13:43:44 -0500 Subject: [PATCH] LR scheduler + Resolution pb ResNet50/WRN --- higher/smart_aug/datasets.py | 2 +- higher/smart_aug/dataug.py | 2 + higher/smart_aug/test_dataug.py | 25 ++++++++---- higher/smart_aug/train_utils.py | 70 +++++++++++++++++++++++++++------ 4 files changed, 78 insertions(+), 21 deletions(-) diff --git a/higher/smart_aug/datasets.py b/higher/smart_aug/datasets.py index 99a3a5c..c640f9c 100755 --- a/higher/smart_aug/datasets.py +++ b/higher/smart_aug/datasets.py @@ -7,7 +7,7 @@ from torch.utils.data.dataset import ConcatDataset import torchvision #Train/Validation batch size. -BATCH_SIZE = 300 +BATCH_SIZE = 512 #Test batch size. TEST_SIZE = BATCH_SIZE #TEST_SIZE = 10000 #legerement +Rapide / + Consomation memoire ! diff --git a/higher/smart_aug/dataug.py b/higher/smart_aug/dataug.py index 7c85473..8cc91e2 100755 --- a/higher/smart_aug/dataug.py +++ b/higher/smart_aug/dataug.py @@ -958,6 +958,8 @@ class Augmented_model(nn.Module): model.step(loss) + Does not support LR scheduler. + See ''run_simple_smartaug'' for a complete example. Args: diff --git a/higher/smart_aug/test_dataug.py b/higher/smart_aug/test_dataug.py index 67f042b..24ec173 100755 --- a/higher/smart_aug/test_dataug.py +++ b/higher/smart_aug/test_dataug.py @@ -82,7 +82,7 @@ if __name__ == "__main__": } #Parameters n_inner_iter = 1 - epochs = 150 + epochs = 200 dataug_epoch_start=0 optim_param={ 'Meta':{ @@ -91,10 +91,11 @@ if __name__ == "__main__": }, 'Inner':{ 'optim': 'SGD', - 'lr':1e-2, #1e-2 + 'lr':1e-1, #1e-2/1e-1 'momentum':0.9, #0.9 - 'decay':0.0001, + 'decay':0.0005, #0.0005 'nesterov':True, + 'scheduler':'exponential', #None, 'cosine', 'multiStep', 'exponential' } } @@ -103,21 +104,26 @@ if __name__ == "__main__": #model = ResNet(num_classes=10) import torchvision.models as models #model=models.resnet18() - model_name = 'resnet50' #'wide_resnet50_2' #'resnet18' #str(model) - model = getattr(models.resnet, model_name)(pretrained=False) + model_name = 'resnet18' #'wide_resnet50_2' #'resnet18' #str(model) + model = getattr(models.resnet, model_name)(pretrained=False, num_classes=len(dl_train.dataset.classes)) #### Classic #### if 'classic' in tasks: + torch.cuda.reset_max_memory_allocated() #reset_peak_stats + torch.cuda.reset_max_memory_cached() #reset_peak_stats t0 = time.perf_counter() + model = model.to(device) print("{} on {} for {} epochs".format(model_name, device_name, epochs)) + #print("RandAugment(N{}-M{:.2f})-{} on {} for {} epochs".format(rand_aug['N'],rand_aug['M'],model_name, device_name, epochs)) log= train_classic(model=model, opt_param=optim_param, epochs=epochs, print_freq=10) #log= train_classic_higher(model=model, epochs=epochs) exec_time=time.perf_counter() - t0 - max_cached = torch.cuda.max_memory_cached()/(1024.0 * 1024.0) #torch.cuda.max_memory_reserved() + max_allocated = torch.cuda.max_memory_allocated()/(1024.0 * 1024.0) + max_cached = torch.cuda.max_memory_cached()/(1024.0 * 1024.0) #torch.cuda.max_memory_reserved() #MB #### print('-'*9) times = [x["time"] for x in log] @@ -125,10 +131,13 @@ if __name__ == "__main__": "Time": (np.mean(times),np.std(times), exec_time), 'Optimizer': optim_param['Inner'], "Device": device_name, - "Memory": max_cached, + "Memory": [max_allocated, max_cached], + #"Rand_Aug": rand_aug, "Log": log} print(model_name,": acc", out["Accuracy"], "in:", out["Time"][0], "+/-", out["Time"][1]) filename = "{}-{} 
epochs".format(model_name,epochs) + #print("RandAugment-",model_name,": acc", out["Accuracy"], "in:", out["Time"][0], "+/-", out["Time"][1]) + #filename = "RandAugment(N{}-M{:.2f})-{}-{} epochs".format(rand_aug['N'],rand_aug['M'],model_name,epochs) with open("../res/log/%s.json" % filename, "w+") as f: try: json.dump(out, f, indent=True) @@ -163,7 +172,7 @@ if __name__ == "__main__": inner_it=n_inner_iter, dataug_epoch_start=dataug_epoch_start, opt_param=optim_param, - print_freq=10, + print_freq=1, unsup_loss=1, hp_opt=False, save_sample_freq=None) diff --git a/higher/smart_aug/train_utils.py b/higher/smart_aug/train_utils.py index 680c8a6..519b838 100755 --- a/higher/smart_aug/train_utils.py +++ b/higher/smart_aug/train_utils.py @@ -143,6 +143,8 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): (list) Logs of training. Each items is a dict containing results of an epoch. """ device = next(model.parameters()).device + + #Optimizer #opt = torch.optim.Adam(model.parameters(), lr=1e-3) optim = torch.optim.SGD(model.parameters(), lr=opt_param['Inner']['lr'], @@ -150,11 +152,28 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): weight_decay=opt_param['Inner']['decay'], nesterov=opt_param['Inner']['nesterov']) #lr=1e-2 / momentum=0.9 + #Scheduler + inner_scheduler=None + if opt_param['Inner']['scheduler']=='cosine': + inner_scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=epochs, eta_min=0.) + elif opt_param['Inner']['scheduler']=='multiStep': + #Multistep milestones inspired by AutoAugment + inner_scheduler=torch.optim.lr_scheduler.MultiStepLR(optim, + milestones=[int(epochs/3), int(epochs*2/3), int(epochs*2.7/3)], + gamma=0.1) + elif opt_param['Inner']['scheduler']=='exponential': + #inner_scheduler=torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.1) #Wrong gamma + inner_scheduler=torch.optim.lr_scheduler.LambdaLR(optim, lambda epoch: (1 - epoch / epochs) ** 0.9) + elif opt_param['Inner']['scheduler'] is not None: + raise ValueError("Lr scheduler unknown : %s"%opt_param['Inner']['scheduler']) + + #Training model.train() dl_val_it = iter(dl_val) log = [] for epoch in range(epochs): #print_torch_mem("Start epoch") + #print(optim.param_groups[0]['lr']) t0 = time.perf_counter() for i, (features, labels) in enumerate(dl_train): #viz_sample_data(imgs=features, labels=labels, fig_name='../samples/data_sample_epoch{}_noTF'.format(epoch)) @@ -168,6 +187,10 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): loss.backward() optim.step() + + if inner_scheduler is not None: + inner_scheduler.step() + #### Tests #### tf = time.perf_counter() @@ -175,15 +198,6 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): accuracy, f1 =test(model) model.train() - #### Print #### - if(print_freq and epoch%print_freq==0): - print('-'*9) - print('Epoch : %d/%d'%(epoch,epochs)) - print('Time : %.00f'%(tf - t0)) - print('Train loss :',loss.item(), '/ val loss', val_loss.item()) - print('Accuracy max:', accuracy) - print('F1 :', ["{0:0.4f}".format(i) for i in f1]) - #### Log #### data={ "epoch": epoch, @@ -196,6 +210,14 @@ def train_classic(model, opt_param, epochs=1, print_freq=1): "param": None, } log.append(data) + #### Print #### + if(print_freq and epoch%print_freq==0): + print('-'*9) + print('Epoch : %d/%d'%(epoch,epochs)) + print('Time : %.00f'%(tf - t0)) + print('Train loss :',loss.item(), '/ val loss', val_loss.item()) + print('Accuracy max:', max([x["acc"] for x in log])) + print('F1 :', ["{0:0.4f}".format(i) for i in f1]) return log @@ 
-236,8 +258,8 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
     ## Optimizers ##
     #Inner Opt
-    optim = torch.optim.SGD(model.parameters(),
-                            lr=opt_param['Inner']['lr'],
+    inner_opt = torch.optim.SGD(model['model']['original'].parameters(),
+                            lr=opt_param['Inner']['lr'],
                             momentum=opt_param['Inner']['momentum'],
                             weight_decay=opt_param['Inner']['decay'],
                             nesterov=opt_param['Inner']['nesterov']) #lr=1e-2 / momentum=0.9
@@ -247,6 +269,21 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
                             grad_callback=(lambda grads: clip_norm(grads, max_norm=10)),
                             track_higher_grads=high_grad_track)
 
+    #Scheduler (attached to inner_opt, the non-differentiable optimizer)
+    inner_scheduler=None
+    if opt_param['Inner']['scheduler']=='cosine':
+        inner_scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(inner_opt, T_max=epochs, eta_min=0.)
+    elif opt_param['Inner']['scheduler']=='multiStep':
+        #Multistep milestones inspired by AutoAugment
+        inner_scheduler=torch.optim.lr_scheduler.MultiStepLR(inner_opt,
+            milestones=[int(epochs/3), int(epochs*2/3), int(epochs*2.7/3)],
+            gamma=0.1)
+    elif opt_param['Inner']['scheduler']=='exponential':
+        #inner_scheduler=torch.optim.lr_scheduler.ExponentialLR(inner_opt, gamma=0.1) #Wrong gamma
+        inner_scheduler=torch.optim.lr_scheduler.LambdaLR(inner_opt, lambda epoch: (1 - epoch / epochs) ** 0.9)
+    elif opt_param['Inner']['scheduler'] is not None:
+        raise ValueError("Lr scheduler unknown : %s"%opt_param['Inner']['scheduler'])
+
     #Meta Opt
     hyper_param = list(model['data_aug'].parameters())
     if hp_opt :
@@ -286,7 +323,7 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
                 #print_graph(loss) #to visualize computational graph
                 #t = time.process_time()
-                diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
+                diffopt.step(loss)#(opt.zero_grad, loss.backward, opt.step)
 
                 #print(len(model['model']['functional']._fast_params),"step", time.process_time()-t)
 
@@ -318,6 +355,13 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=1, dataug_epoch_start
 
         tf = time.perf_counter()
 
+        if inner_scheduler is not None:
+            inner_scheduler.step()
+            #Transfer inner_opt lr to diffopt
+            for diff_param_group in diffopt.param_groups:
+                for param_group in inner_opt.param_groups:
+                    diff_param_group['lr'] = param_group['lr']
+
         if (save_sample_freq and epoch%save_sample_freq==0): #Data sample saving
             try:
                 viz_sample_data(imgs=xs, labels=ys, fig_name='../samples/data_sample_epoch{}_noTF'.format(epoch))
@@ -396,6 +440,8 @@ def run_simple_smartaug(model, opt_param, epochs=1, inner_it=1, print_freq=1, un
     Training loss can either be computed directly from augmented inputs (unsup_loss=0).
     However, it is recommended to use the mixed loss computation, which combine original and augmented inputs to compute the loss (unsup_loss>0).
 
+    Does not support LR scheduler.
+
     Args:
         model (nn.Module): Augmented model to train.
        opt_param (dict): Dictionnary containing optimizers parameters.
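
Note on the test_dataug.py change above: the torchvision model is now built with num_classes taken from the training set instead of the 1000-class ImageNet default, which appears to be the ResNet50/WRN problem referenced in the subject line. The following is a minimal standalone sketch of that construction, not code from the patch; CIFAR10 is assumed as the dataset (the patch reads the class count from dl_train.dataset.classes, which is equivalent).

import torchvision
import torchvision.models as models

# Any torchvision dataset exposing a .classes list works; CIFAR10 is assumed here.
train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)

model_name = 'resnet50'  # or 'wide_resnet50_2', 'resnet18', ...
# pretrained=False is required when overriding num_classes.
model = getattr(models.resnet, model_name)(pretrained=False,
                                           num_classes=len(train_set.classes))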
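The scheduler selection added to train_classic and run_dist_dataugV3 is duplicated in both functions and could be factored into a small helper. The sketch below is not code from the patch: the helper name make_inner_scheduler is assumed, and it mirrors the 'cosine' / 'multiStep' / 'exponential' choices above (the 'exponential' branch is really a polynomial decay of power 0.9 implemented with LambdaLR).

import torch

def make_inner_scheduler(optimizer, opt_param, epochs):
    """Build the inner LR scheduler described by opt_param['Inner']['scheduler']."""
    name = opt_param['Inner'].get('scheduler')
    if name is None:
        return None
    if name == 'cosine':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0.)
    if name == 'multiStep':
        # Milestones at 1/3, 2/3 and 0.9 of training, as in the patch (AutoAugment-inspired).
        return torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(epochs / 3), int(epochs * 2 / 3), int(epochs * 2.7 / 3)],
            gamma=0.1)
    if name == 'exponential':
        # Polynomial decay (power 0.9) of the initial LR over `epochs`.
        return torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: (1 - epoch / epochs) ** 0.9)
    raise ValueError("LR scheduler unknown: %s" % name)

Each call site would then reduce to inner_scheduler = make_inner_scheduler(inner_opt, opt_param, epochs) (or optim in train_classic).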
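In run_dist_dataugV3 the scheduler is attached to a plain SGD optimizer (inner_opt) while the inner updates actually go through higher's differentiable optimizer (diffopt), so after each scheduler step the new learning rate has to be copied into diffopt's param groups. Below is a minimal sketch of that pattern with a toy model; it assumes higher's generic monkeypatch/get_diff_optim API rather than the repository's Augmented_model wrapper, and a single param group (the patch's nested loops over both param_groups lists are equivalent in that case).

import torch
import torch.nn as nn
import higher

model = nn.Linear(10, 2)
inner_opt = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9)
inner_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(inner_opt, T_max=10)

# Functional copy of the model and a differentiable optimizer wrapping inner_opt.
fmodel = higher.patch.monkeypatch(model, copy_initial_weights=True)
diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(), fmodel=fmodel)

x, y = torch.randn(4, 10), torch.randint(0, 2, (4,))
for epoch in range(10):
    loss = nn.functional.cross_entropy(fmodel(x), y)
    diffopt.step(loss)  # differentiable update of fmodel's fast weights

    inner_scheduler.step()  # advances the LR on the plain optimizer only
    # Mirror the scheduled LR into the differentiable optimizer's param groups.
    for diff_group, group in zip(diffopt.param_groups, inner_opt.param_groups):
        diff_group['lr'] = group['lr']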