From 5dd0e6ad8240b5c06e4965b58bcbed4cc2743493 Mon Sep 17 00:00:00 2001
From: "Harle, Antoine (Contracteur)" <Antoine.Harle@Teledyne.com>
Date: Fri, 17 Jan 2020 11:08:59 -0500
Subject: [PATCH] Ameliorations mineures + clean up

---
 higher/dataug.py      | 41 +++++++++++-------------------
 higher/test_dataug.py |  5 ++--
 higher/train_utils.py | 58 ++++++++++++++-----------------------------
 3 files changed, 35 insertions(+), 69 deletions(-)

diff --git a/higher/dataug.py b/higher/dataug.py
index 466a8e5..c37ea2a 100755
--- a/higher/dataug.py
+++ b/higher/dataug.py
@@ -537,19 +537,30 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
         
         self._data_augmentation = True
 
+        #TF
         self._TF_dict = TF_dict
         self._TF= list(self._TF_dict.keys())
         self._nb_tf= len(self._TF)
-
         self._N_seqTF = N_TF
+
+        #Mag
         self._shared_mag = shared_mag
         self._fixed_mag = fixed_mag
 
+        #Distribution
+        self._fixed_prob=fixed_prob
+        self._samples = []
+
+        self._mix_dist = False
+        if mix_dist != 0.0: #Mix dist
+            self._mix_dist = True
+
         self._fixed_mix=True
         if mix_dist is None: #Learn Mix dist
             self._fixed_mix = False
             mix_dist=0.5
-
+        
+        #Params
         init_mag = float(TF.PARAMETER_MAX) if self._fixed_mag else float(TF.PARAMETER_MAX)/2
         self._params = nn.ParameterDict({
             "prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme
@@ -562,14 +573,6 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
             if tf in self._TF: self._params['mag'].data[self._TF.index(tf)]=float(TF.PARAMETER_MAX) #TF fixe a max parameter
         #for t in TF.TF_no_mag: self._params['mag'][self._TF.index(t)].data-=self._params['mag'][self._TF.index(t)].data #Mag inutile pour les TF ignore_mag
 
-        #Distribution
-        self._fixed_prob=fixed_prob
-        self._samples = []
-        self._mix_dist = False
-        if mix_dist != 0.0: #Mix dist
-            self._mix_dist = True
-            #self._mix_factor = max(min(mix_dist, 0.999), 0.0)
-
         #Mag regularisation
         if not self._fixed_mag:
             if  self._shared_mag :
@@ -595,7 +598,6 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
                 else:
                     prob = self._params["prob"].detach() if self._fixed_prob else self._params["prob"]
                     mix_dist = self._params["mix_dist"].detach() if self._fixed_mix else self._params["mix_dist"]
-                    #self._distrib = (self._mix_factor*prob+(1-self._mix_factor)*uniforme_dist)#.softmax(dim=1) #Mix distrib reel / uniforme avec mix_factor
                     self._distrib = (mix_dist*prob+(1-mix_dist)*uniforme_dist)#.softmax(dim=1) #Mix distrib reel / uniforme avec mix_factor
 
                 cat_distrib= Categorical(probs=torch.ones((batch_size, self._nb_tf), device=device)*self._distrib)
@@ -613,14 +615,13 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
         
         for tf_idx in range(self._nb_tf):
             mask = sampled_TF==tf_idx #Create selection mask
-            smp_x = x[mask] #torch.masked_select() ? (NEcessite d'expand le mask au meme dim)
+            smp_x = x[mask] #torch.masked_select() ? (Necessite d'expand le mask au meme dim)
 
             if smp_x.shape[0]!=0: #if there's data to TF
                 magnitude=self._params["mag"] if self._shared_mag else self._params["mag"][tf_idx]
                 if self._fixed_mag: magnitude=magnitude.detach() #Fmodel tente systematiquement de tracker les gradient de tout les param
 
                 tf=self._TF[tf_idx]
-                #print(magnitude)
 
                 #In place
                 #x[mask]=self._TF_dict[tf](x=smp_x, mag=magnitude)
@@ -638,13 +639,11 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
             if soft :
                 self._params['prob'].data=F.softmax(self._params['prob'].data, dim=0) #Trop 'soft', bloque en dist uniforme si lr trop faible
             else:
-                #self._params['prob'].data = F.relu(self._params['prob'].data)
                 self._params['prob'].data = self._params['prob'].data.clamp(min=1/(self._nb_tf*100),max=1.0)
                 self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Contrainte sum(p)=1
 
         if not self._fixed_mag:
             self._params['mag'].data = self._params['mag'].data.clamp(min=TF.PARAMETER_MIN, max=TF.PARAMETER_MAX)
-            #self._params['mag'].data = F.relu(self._params['mag'].data) - F.relu(self._params['mag'].data - TF.PARAMETER_MAX)
 
         if not self._fixed_mix:
             self._params['mix_dist'].data = self._params['mix_dist'].data.clamp(min=0.0, max=0.999)
@@ -653,12 +652,6 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
         if len(self._samples)==0 : return 1 #Pas d'echantillon = pas de ponderation
 
         prob = self._params["prob"].detach() if self._fixed_prob else self._params["prob"]
-        # 1 seule TF
-        #self._sample = self._samples[-1]
-        #w_loss = torch.zeros((self._sample.shape[0],self._nb_tf), device=self._sample.device)
-        #w_loss.scatter_(dim=1, index=self._sample.view(-1,1), value=1)
-        #w_loss = w_loss * self._params["prob"]/self._distrib #Ponderation par les proba (divisee par la distrib pour pas diminuer la loss)
-        #w_loss = torch.sum(w_loss,dim=1)
         
         #Plusieurs TF sequentielles (Attention ne prend pas en compte ordre !)
         w_loss = torch.zeros((self._samples[0].shape[0],self._nb_tf), device=self._samples[0].device)
@@ -672,7 +665,7 @@ class Data_augV5(nn.Module): #Optimisation jointe (mag, proba)
         return w_loss
 
     def reg_loss(self, reg_factor=0.005):
-        if self._fixed_mag: # or self._fixed_prob: #Pas de regularisation si trop peu de DOF
+        if self._fixed_mag:
             return torch.tensor(0)
         else:
             #return reg_factor * F.l1_loss(self._params['mag'][self._reg_mask], target=self._reg_tgt, reduction='mean') 
@@ -1109,9 +1102,6 @@ class Augmented_model(nn.Module):
 
         self.augment(mode=True)
 
-    #def initialize(self):
-    #    self._mods['model'].initialize()
-
     def forward(self, x):
         return self._mods['model'](self._mods['data_aug'](x))
     
@@ -1128,7 +1118,6 @@ class Augmented_model(nn.Module):
 
     def eval(self):
         return self.train(mode=False)
-        #super(Augmented_model, self).eval()
 
     def items(self):
         """Return an iterable of the ModuleDict key/value pairs.
diff --git a/higher/test_dataug.py b/higher/test_dataug.py
index 97a03d4..4b0e752 100755
--- a/higher/test_dataug.py
+++ b/higher/test_dataug.py
@@ -171,7 +171,7 @@ if __name__ == "__main__":
         t0 = time.process_time()
 
         tf_dict = {k: TF.TF_dict[k] for k in tf_names}
-        aug_model = Augmented_model(Data_augV5(TF_dict=tf_dict, N_TF=3, mix_dist=0.8, fixed_prob=False, fixed_mag=False, shared_mag=False), model).to(device)
+        aug_model = Augmented_model(Data_augV5(TF_dict=tf_dict, N_TF=3, mix_dist=1.0, fixed_prob=False, fixed_mag=False, shared_mag=False), model).to(device)
         #aug_model = Augmented_model(RandAug(TF_dict=tf_dict, N_TF=2), model).to(device)
 
         print("{} on {} for {} epochs - {} inner_it".format(str(aug_model), device_name, epochs, n_inner_iter))
@@ -182,8 +182,7 @@ if __name__ == "__main__":
              opt_param=optim_param,
              print_freq=1, 
              KLdiv=True, 
-             hp_opt=True,
-             loss_patience=None)
+             hp_opt=False)
 
         exec_time=time.process_time() - t0
         ####
diff --git a/higher/train_utils.py b/higher/train_utils.py
index fd9b9da..001f105 100755
--- a/higher/train_utils.py
+++ b/higher/train_utils.py
@@ -823,11 +823,9 @@ def run_dist_dataugV2(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
     #print("Copy ", countcopy)
     return log
 
-def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start=0, print_freq=1, KLdiv=False, hp_opt=False, loss_patience=None, save_sample=False):
+def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start=0, print_freq=1, KLdiv=False, hp_opt=False, save_sample=False):
     device = next(model.parameters()).device
     log = []
-    countcopy=0
-    val_loss=torch.tensor(0) #Necessaire si pas de metastep sur une epoch
     dl_val_it = iter(dl_val)
 
     high_grad_track = True
@@ -837,11 +835,6 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
         model.augment(mode=False)
         high_grad_track = False
 
-    val_loss_monitor= None
-    if loss_patience != None :
-        if dataug_epoch_start==-1: val_loss_monitor = loss_monitor(patience=loss_patience, end_train=2) #1st limit = dataug start
-        else: val_loss_monitor = loss_monitor(patience=loss_patience) #Val loss monitor (Not on val data : used by Dataug... => Test data)
-
     ## Optimizers ##
     #Inner Opt
     inner_opt = torch.optim.SGD(model['model']['original'].parameters(), lr=opt_param['Inner']['lr'], momentum=opt_param['Inner']['momentum']) #lr=1e-2 / momentum=0.9
@@ -859,17 +852,13 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
                 param_group[param]=torch.tensor(param_group[param]).to(device).requires_grad_()
                 hyper_param += [param_group[param]]
     meta_opt = torch.optim.Adam(hyper_param, lr=opt_param['Meta']['lr']) #lr=1e-2
-    #print(len(model['model']['functional']._fast_params))
 
     model.train()
     meta_opt.zero_grad()
 
     for epoch in range(1, epochs+1):
-        #print_torch_mem("Start epoch "+str(epoch))
-        #print(high_grad_track, fmodel._data_augmentation, len(fmodel._fast_params))
         t0 = time.process_time()
-        #with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, override=opt_param, track_higher_grads=high_grad_track) as (fmodel, diffopt):
-
+       
         for i, (xs, ys) in enumerate(dl_train):
             xs, ys = xs.to(device), ys.to(device)
             
@@ -900,24 +889,16 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
                     aug_loss=0
                     w_loss = model['data_aug'].loss_weight() #Weight loss
 
-                    #if epoch>50: #debut differe ?
                     #KL div w/ logits - Similarite predictions (distributions)
                     aug_loss = F.softmax(sup_logits, dim=1)*(log_sup-log_aug)
                     aug_loss = aug_loss.sum(dim=-1)
-                    #aug_loss = F.kl_div(aug_logits, sup_logits, reduction='none')
                     aug_loss = (w_loss * aug_loss).mean()
-
                     aug_loss += (F.cross_entropy(log_aug, ys , reduction='none') * w_loss).mean()
 
                     unsupp_coeff = 1
                     loss += aug_loss * unsupp_coeff
             
-            #to visualize computational graph
-            #print_graph(loss)
-
-            #loss.backward(retain_graph=True)
-            #print(fmodel['model']._params['b4'].grad)
-            #print('prob grad', fmodel['data_aug']['prob'].grad)
+            #print_graph(loss) #to visualize computational graph
 
             #t = time.process_time()
             diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
@@ -928,14 +909,14 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
                 #print("meta")
 
                 val_loss = compute_vaLoss(model=model, dl_it=dl_val_it, dl=dl_val) + model['data_aug'].reg_loss()
-                #print_graph(val_loss)
+                #print_graph(val_loss) #to visualize computational graph
 
                 val_loss.backward()
 
                 torch.nn.utils.clip_grad_norm_(model['data_aug'].parameters(), max_norm=10, norm_type=2) #Prevent exploding grad with RNN
 
                 meta_opt.step()
-                model['data_aug'].adjust_param(soft=False) #Contrainte sum(proba)=1
+                model['data_aug'].adjust_param(soft=True) #Contrainte sum(proba)=1
 
                 if hp_opt:
                     for param_group in diffopt.param_groups: 
@@ -949,11 +930,16 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
                 
         tf = time.process_time()
 
-        #viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch))
-        #viz_sample_data(imgs=model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch))
+        if save_sample: #Best-effort dump of the batch left over from the training loop, raw and augmented
+                try:
+                    viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch))
+                    viz_sample_data(imgs=model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch))
+                except Exception as err: #Keep training if a figure cannot be saved, but let KeyboardInterrupt/SystemExit through
+                    print("Couldn't save samples epoch"+str(epoch), err) #epoch is an int: str+int would raise TypeError inside this handler
+                    pass
 
 
-        if(not high_grad_track): 
+        if(not high_grad_track): #Without higher-grad tracking, recompute val_loss every epoch (never read val_loss before it is assigned)
             val_loss = compute_vaLoss(model=model, dl_it=dl_val_it, dl=dl_val)
 
 
@@ -961,7 +947,6 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
         model.train()
 
         #### Log ####
-        #print(type(model['data_aug']) is dataug.Data_augV5)
         param = [{'p': p.item(), 'm':model['data_aug']['mag'].item()} for p in model['data_aug']['prob']] if model['data_aug']._shared_mag else [{'p': p.item(), 'm': m.item()} for p, m in zip(model['data_aug']['prob'], model['data_aug']['mag'])]
         data={
             "epoch": epoch,
@@ -985,24 +970,18 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
             print('Train loss :',loss.item(), '/ val loss', val_loss.item())
             print('Accuracy :', max([x["acc"] for x in log]))
             print('Data Augmention : {} (Epoch {})'.format(model._data_augmentation, dataug_epoch_start))
-            print('TF Proba :', model['data_aug']['prob'].data)
+            if not model['data_aug']._fixed_prob: print('TF Proba :', model['data_aug']['prob'].data)
             #print('proba grad',model['data_aug']['prob'].grad)
-            print('TF Mag :', model['data_aug']['mag'].data)
+            if not model['data_aug']._fixed_mag: print('TF Mag :', model['data_aug']['mag'].data)
             #print('Mag grad',model['data_aug']['mag'].grad)
-            print('Mix:', model['data_aug']['mix_dist'].data)
+            if not model['data_aug']._fixed_mix: print('Mix:', model['data_aug']['mix_dist'].item())
             #print('Reg loss:', model['data_aug'].reg_loss().item())
-            #print('Aug loss', aug_loss.item())
+
             if hp_opt : 
                 for param_group in diffopt.param_groups:
                     print('Opt param - lr:', param_group['lr'].item(),'- momentum:', param_group['momentum'].item())
         #############
-        if val_loss_monitor : 
-            model.eval()
-            val_loss_monitor.register(test_loss)#val_loss.item())
-            if val_loss_monitor.end_training(): break #Stop training
-            model.train()
-
-        if not model.is_augmenting() and (epoch == dataug_epoch_start or (val_loss_monitor and val_loss_monitor.limit_reached()==1)):
+        if not model.is_augmenting() and (epoch == dataug_epoch_start):
             print('Starting Data Augmention...')
             dataug_epoch_start = epoch
             model.augment(mode=True)
@@ -1015,5 +994,4 @@ def run_dist_dataugV3(model, opt_param, epochs=1, inner_it=0, dataug_epoch_start
         print("Couldn't save finals samples")
         pass
 
-    #print("Copy ", countcopy)
     return log