Tidying up

Harle, Antoine (Contracteur) 2020-02-28 16:46:37 -05:00
parent ca3367d19f
commit 4166922c34
453 changed files with 9797 additions and 7 deletions

View file

@@ -0,0 +1,5 @@
venv/
__pycache__
data/
log/
.vscode/

View file

@@ -0,0 +1,33 @@
# Gradient Descent: The Ultimate Optimizer
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
| ⚠️ WARNING: THIS IS NOT MY WORK ⚠️ |
| --- |
This repository contains the paper and code for [Gradient Descent:
The Ultimate Optimizer](https://arxiv.org/abs/1909.13371).
I couldn't find the code (which appears in the appendix at the end of the
paper) anywhere on the web, so what I present here is the code from the paper
with instructions on how to set it up.
Getting the code into a runnable state required some fixes on my part, so the
code may differ slightly from what is presented in the paper.
## Set up
```sh
git clone https://github.com/Rainymood/Gradient-Descent-The-Ultimate-Optimizer
cd Gradient-Descent-The-Ultimate-Optimizer
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```
When you are done, you can exit the virtualenv with:
```sh
deactivate
```
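If `virtualenv` is not installed, Python's built-in `venv` module works as a drop-in replacement for that step (a minimal sketch of the same setup):
```sh
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```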

View file

@@ -0,0 +1,244 @@
from hyperopt import *
#from hyperopt_v2 import *
import torchvision.transforms.functional as TF
import torchvision.transforms as T
#from scipy import ndimage
import kornia
import random
class MNIST_FullyConnected_Augmented(Optimizable):
"""
A fully-connected NN for the MNIST task. This is Optimizable but not itself
an optimizer.
"""
def __init__(self, num_inp, num_hid, num_out, optimizer, device = torch.device('cuda')):
self.device = device
#print(self.device)
parameters = {
"w1": torch.zeros(num_inp, num_hid, device=self.device).t(),
"b1": torch.zeros(num_hid, device=self.device).t(),
"w2": torch.zeros(num_hid, num_out, device=self.device).t(),
"b2": torch.zeros(num_out, device=self.device).t(),
#Data augmentation
"prob": torch.tensor(0.5, device=self.device),
"mag": torch.tensor(180.0, device=self.device),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
self.optimizer.initialize()
#print(self.device)
def forward(self, x):
"""Compute a prediction."""
#print("Prob:",self.parameters["prob"].item())
if random.random() < self.parameters["prob"]:
#angle = 45
#x = TF.rotate(x, angle)
#print(self.device)
#x = F.linear(x, torch.ones(28*28, 28*28, device=self.device).t()*self.parameters["mag"], bias=None)
x = x + self.parameters["mag"]
x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
x = torch.tanh(x)
x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
x = torch.tanh(x)
x = F.log_softmax(x, dim=1)
return x
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist_FC_augmented / " + str(self.optimizer)
class LeNet(Optimizable, nn.Module):
def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')):
nn.Module.__init__(self)
self.device = device
parameters = {
"w1": torch.zeros(20, num_inp, 5, 5, device=self.device),
"b1": torch.zeros(20, device=self.device),
"w2": torch.zeros(50, 20, 5, 5, device=self.device),
"b2": torch.zeros(50, device=self.device),
"w3": torch.zeros(500,4*4*50, device=self.device),
"b3": torch.zeros(500, device=self.device),
"w4": torch.zeros(10, 500, device=self.device),
"b4": torch.zeros(10, device=self.device),
#Data augmentation
"prob": torch.tensor(1.0, device=self.device),
"mag": torch.tensor(180.0, device=self.device),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w3"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w4"], a=math.sqrt(5))
self.optimizer.initialize()
def forward(self, x):
if random.random() < self.parameters["prob"]:
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.parameters["mag"] # in degrees
angle = torch.ones(batch_size, device=self.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=self.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=self.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist_CNN_augmented / " + str(self.optimizer)
class LeNet_v2(Optimizable, nn.Module):
def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')):
nn.Module.__init__(self)
self.device = device
self.conv1 = nn.Conv2d(num_inp, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
#self.fc1 = nn.Linear(4*4*50, 500)
self.fc1 = nn.Linear(1250, 500)
self.fc2 = nn.Linear(500, 10)
#print(self.conv1.weight)
parameters = {
"w1": self.conv1.weight,
"b1": self.conv1.bias,
"w2": self.conv2.weight,
"b2": self.conv2.bias,
"w3": self.fc1.weight,
"b3": self.fc1.bias,
"w4": self.fc2.weight,
"b4": self.fc2.bias,
#Data augmentation
"prob": torch.tensor(0.5, device=self.device),
"mag": torch.tensor(1.0, device=self.device),
}
Optimizable.__init__(self, parameters, optimizer)
'''
def forward(self, x): #Saturates the memory ???
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
#x = x.view(-1, 4*4*50)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
'''
def forward(self, x):
if random.random() < self.parameters["prob"].item():
#print(self.parameters["prob"])
#x = [T.ToTensor()(
# TF.affine(img=T.ToPILImage()(im), angle=self.parameters["mag"], translate=(0,0), scale=1, shear=0, resample=0, fillcolor=None))
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
#x = [ndimage.rotate(im, self.parameters["mag"], reshape=False)
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
#x = [im + self.parameters["mag"]
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.parameters["mag"] * 180 # in degrees
angle = torch.ones(batch_size, device=self.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=self.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=self.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def initialize(self):
self.optimizer.initialize()
def adjust(self):
self.optimizer.adjust(self.parameters)
def adjust_val(self):
self.optimizer.adjust_val(self.parameters)
def eval(self):
self.parameters['prob']=torch.tensor(0.0, device=self.device)
def __str__(self):
return "mnist_CNN_augmented / " + str(self.optimizer)

View file

@@ -0,0 +1,52 @@
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF
class MNIST_aug(Dataset):
training_file = 'training.pt'
test_file = 'test.pt'
classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
'5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
def __init__(self):
self.images = [TF.to_pil_image(x) for x in torch.ByteTensor(10, 3, 48, 48)]
self.set_stage(0) # initial stage
def __getitem__(self, index):
image = self.images[index]
# Just apply your transformations here
image = self.crop(image)
x = TF.to_tensor(image)
return x
def set_stage(self, stage):
if stage == 0:
print('Using (32, 32) crops')
self.crop = transforms.RandomCrop((32, 32))
elif stage == 1:
print('Using (28, 28) crops')
self.crop = transforms.RandomCrop((28, 28))
def __len__(self):
return len(self.images)
dataset = MNIST_aug()
loader = DataLoader(dataset,
batch_size=2,
num_workers=2,
shuffle=True)
for batch_idx, data in enumerate(loader):
print('Batch idx {}, data shape {}'.format(
batch_idx, data.shape))
loader.dataset.set_stage(1)
for batch_idx, data in enumerate(loader):
print('Batch idx {}, data shape {}'.format(
batch_idx, data.shape))

View file

@@ -0,0 +1,150 @@
#from hyperopt import *
from hyperopt_v2 import *
import torchvision.transforms.functional as TF
import torchvision.transforms as T
#from scipy import ndimage
import kornia
import random
class LeNet_v3(nn.Module):
def __init__(self, num_inp, num_out):
super(LeNet_v3, self).__init__()
self.params = nn.ParameterDict({
'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)),
'b1': nn.Parameter(torch.zeros(20)),
'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)),
'b2': nn.Parameter(torch.zeros(50)),
'w3': nn.Parameter(torch.zeros(500,4*4*50)),
'b3': nn.Parameter(torch.zeros(500)),
'w4': nn.Parameter(torch.zeros(10, 500)),
'b4': nn.Parameter(torch.zeros(10))
})
def initialize(self):
nn.init.kaiming_uniform_(self.params["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w2"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w3"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w4"], a=math.sqrt(5))
def forward(self, x):
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.params["w1"], bias=self.params["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.params["w2"], bias=self.params["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.params["w3"], self.params["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.params["w4"], self.params["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def print_grad_fn(self):
for n, p in self.params.items():
print(n, p.grad_fn)
def __str__(self):
return "mnist_CNN_augmented / "
class Data_aug(nn.Module):
def __init__(self):
super(Data_aug, self).__init__()
self.data_augmentation = True
self.params = nn.ParameterDict({
"prob": nn.Parameter(torch.tensor(0.5)),
"mag": nn.Parameter(torch.tensor(180.0))
})
#self.params["mag"].register_hook(print)
def forward(self, x):
if self.data_augmentation and self.training and random.random() < self.params["prob"]:
#print('Aug')
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.params["mag"] # in degrees
angle = torch.ones(batch_size, device=x.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=x.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=x.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
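# Note: kornia's get_rotation_matrix2d and warp_affine are differentiable with
# respect to the angle, so the loss gradient can flow back into self.params['mag'];
# this is what makes the augmentation magnitude learnable by gradient descent.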
return x
def eval(self):
self.params['prob'].data.fill_(0.0) #disable augmentation in place (Data_aug has no self.device, and ParameterDict only accepts Parameters)
nn.Module.eval(self)
def data_augmentation(self, mode=True):
self.data_augmentation=mode
def print_grad_fn(self):
for n, p in self.params.items():
print(n, p.grad_fn)
def __str__(self):
return "Data_Augmenter / "
class Augmented_model(nn.Module):
def __init__(self, model, data_augmenter):
#self.model = model
#self.data_aug = data_augmenter
super(Augmented_model, self).__init__()#nn.Module.__init__(self)
#super().__init__()
self.mods = nn.ModuleDict({
'data_aug': data_augmenter,
'model': model
})
#for name, param in self.mods.named_parameters():
# print(name, type(param.data), param.size())
#params = self.mods.named_parameters() #self.parameters()
#parameters = [param for param in self.model.parameters()] + [param for param in self.data_aug.parameters()]
#Optimizable.__init__(self, params, optimizer)
def initialize(self):
self.mods['model'].initialize()
def forward(self, x):
return self.mods['model'](self.mods['data_aug'](x))
#def adjust(self):
# self.optimizer.adjust(self) #Parametres des dict
def data_augmentation(self, mode=True):
self.mods['data_aug'].data_augmentation=mode
def begin(self):
for param in self.parameters():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
def print_grad_fn(self):
for n, m in self.mods.items():
m.print_grad_fn()
def __str__(self):
return str(self.mods['data_aug'])+ str(self.mods['model'])# + str(self.optimizer)

View file

@@ -0,0 +1,5 @@
digraph {
graph [size="12,12"]
node [align=left fontsize=12 height=0.2 ranksep=0.1 shape=box style=filled]
94296775052080 [label=NoneType fillcolor=darkolivegreen1]
}

View file

@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: %3 Pages: 1 -->
<svg width="75pt" height="30pt"
viewBox="0.00 0.00 74.65 30.40" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 26.4)">
<title>%3</title>
<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-26.4 70.6472,-26.4 70.6472,4 -4,4"/>
<!-- 94296775052080 -->
<g id="node1" class="node">
<title>94296775052080</title>
<polygon fill="#caff70" stroke="#000000" points="66.4717,-22.6036 .1755,-22.6036 .1755,.2036 66.4717,.2036 66.4717,-22.6036"/>
<text text-anchor="middle" x="33.3236" y="-7.6" font-family="Times,serif" font-size="12.00" fill="#000000">NoneType</text>
</g>
</g>
</svg>


View file

@@ -0,0 +1,345 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Optimizable():#nn.Module):
"""
This is the interface for anything that has parameters that need to be
optimized, somewhat like torch.nn.Module but with the right plumbing for
hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
interface, which does not give us enough control over the detachments.)
Nominal operation of an Optimizable at the lowest level is as follows:
o = MyOptimizable()
o.initialize()
loop {
o.begin()
o.zero_grad()
loss = compute loss function from parameters
loss.backward()
o.adjust()
}
Optimizables recursively handle updates to their optimiz*ers*.
"""
#def __init__(self):
# super(Optimizable, self).__init__()
# self.parameters = nn.Parameter(torch.zeros(()))
def __init__(self, parameters, optimizer):
#super(Optimizable, self).__init__()
self.parameters = parameters # a dict mapping names to tensors
self.optimizer = optimizer # which must itself be Optimizable!
self.all_params_with_gradients = []
#self.device = device
def initialize(self):
"""Initialize parameters, e.g. with a Kaiming initializer."""
pass
def begin(self):
"""Enable gradient tracking on current parameters."""
self.all_params_with_gradients = [] #Reset to avoid overloading the memory
for name, param in self.parameters.items():
#for param in self.parameters:
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
#param.to(self.device)
#if param.device == torch.device('cuda:0'):
# print(name, param.device)
self.all_params_with_gradients.append(param)
self.optimizer.begin()
def zero_grad(self):
""" Set all gradients to zero. """
for param in self.all_params_with_gradients:
#param = param.to(self.device)
param.grad = torch.zeros(param.shape, device=param.device)
self.optimizer.zero_grad()
""" Note: at this point you would probably call .backwards() on the loss
function. """
def adjust(self):
""" Update parameters """
pass
def print_grad_fn(self):
self.optimizer.print_grad_fn()
for n, p in self.parameters.items():
print(n," - ", p.grad_fn)
def param_grad(self):
return self.all_params_with_gradients
def param(self, param_name):
return self.parameters[param_name].item()
class MNIST_FullyConnected(Optimizable):
"""
A fully-connected NN for the MNIST task. This is Optimizable but not itself
an optimizer.
"""
def __init__(self, num_inp, num_hid, num_out, optimizer):
parameters = {
"w1": torch.zeros(num_inp, num_hid).t(),
"b1": torch.zeros(num_hid).t(),
"w2": torch.zeros(num_hid, num_out).t(),
"b2": torch.zeros(num_out).t(),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
self.optimizer.initialize()
def forward(self, x):
"""Compute a prediction."""
x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
x = torch.tanh(x)
x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
x = torch.tanh(x)
x = F.log_softmax(x, dim=1)
return x
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist / " + str(self.optimizer)
class NoOpOptimizer(Optimizable):#, nn.Module):
"""
NoOpOptimizer sits on top of a stack, and does not affect what lies below.
"""
def __init__(self):
#super(Optimizable, self).__init__()
pass
def initialize(self):
pass
def begin(self):
pass
def zero_grad(self):
pass
def adjust(self, params):
pass
def adjust_val(self, params):
pass
def print_grad_fn(self):
pass
def __str__(self):
return "static"
class Adam(Optimizable):
"""
A fully hyperoptimizable Adam optimizer
"""
def clamp(x):
return (x.tanh() + 1.0) / 2.0
def unclamp(y):
z = y * 2.0 - 1.0
return ((1.0 + z) / (1.0 - z)).log() / 2.0
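# clamp squashes an unconstrained real value into (0, 1) via tanh; unclamp is its exact
# inverse (an artanh), so Adam.clamp(Adam.unclamp(torch.tensor(0.9))) recovers 0.9 (up to
# floating point). beta1/beta2 are stored unclamped so unconstrained updates keep them valid.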
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
device = torch.device('cuda')
):
self.device = device
parameters = {
"alpha": torch.tensor(alpha, device=self.device),
"beta1": Adam.unclamp(torch.tensor(beta1, device=self.device)),
"beta2": Adam.unclamp(torch.tensor(beta2, device=self.device)),
"log_eps": torch.tensor(log_eps, device=self.device),
}
super().__init__(parameters, optimizer)
self.num_adjustments = 0
self.num_adjustments_val = 0
self.cache = {}
for name, param in parameters.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
#param.to(self.device)
#if param.device == torch.device('cuda:0'):
# print(name, param.device)
def adjust(self, params): #Update the learning parameters
self.num_adjustments += 1
self.optimizer.adjust(self.parameters)
#print('Adam update')
t = self.num_adjustments
beta1 = Adam.clamp(self.parameters["beta1"])
beta2 = Adam.clamp(self.parameters["beta2"])
for name, param in params.items():
if name == "mag": continue
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.parameters["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
params[name] = param.detach() - self.parameters["alpha"] * dparam
#print(name)
def adjust_val(self, params): #Update the transformation parameters
self.num_adjustments_val += 1
self.optimizer.adjust_val(self.parameters)
#print('Adam update')
t = self.num_adjustments_val
beta1 = Adam.clamp(self.parameters["beta1"])
beta2 = Adam.clamp(self.parameters["beta2"])
for name, param in params.items():
if name != "mag": continue
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.parameters["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
params[name] = param.detach() - self.parameters["alpha"] * dparam
#print(name)
def __str__(self):
return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
'''
class SGD(Optimizable):
"""
A hyperoptimizable SGD
"""
def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()):
parameters = {"alpha": torch.tensor(alpha)}
super().__init__(parameters, optimizer)
def adjust(self, params):
self.optimizer.adjust(self.parameters)
for name, param in params.items():
g = param.grad.detach()
params[name] = param.detach() - g * self.parameters["alpha"]
def __str__(self):
return "sgd(%f) / " % self.parameters["alpha"] + str(self.optimizer)
class SGDPerParam(Optimizable):
"""
Like above, but can be taught a separate step size for each parameter it
tunes.
"""
def __init__(self, alpha=0.01, params=[], optimizer=NoOpOptimizer()):
parameters = {name + "_alpha": torch.tensor(alpha) for name in params}
super().__init__(parameters, optimizer)
def adjust(self, params):
self.optimizer.adjust(self.parameters)
for name, param in params.items():
g = param.grad.detach()
params[name] = param.detach() - g * self.parameters[name + "_alpha"]
def __str__(self):
return "sgd(%s) / " % str(
{k: t.item() for k, t in self.parameters.items()}
) + str(self.optimizer)
'''
'''
class AdamBaydin(Optimizable):
""" Same as above, but only optimizes the learning rate, treating the
remaining hyperparameters as constants. """
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
):
parameters = {"alpha": torch.tensor(alpha)}
self.beta1 = beta1
self.beta2 = beta2
self.log_eps = log_eps
super().__init__(parameters, optimizer)
self.num_adjustments = 0
self.cache = {}
def adjust(self, params):
self.num_adjustments += 1
self.optimizer.adjust(self.parameters)
t = self.num_adjustments
beta1 = self.beta1
beta2 = self.beta2
for name, param in params.items():
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape),
"v": torch.zeros(param.shape) + 10.0 ** self.log_eps,
}
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.log_eps)
params[name] = param.detach() - self.parameters["alpha"] * dparam
def __str__(self):
return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
'''

View file

@@ -0,0 +1,296 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer
class Optimizable():
"""
This is the interface for anything that has parameters that need to be
optimized, somewhat like torch.nn.Module but with the right plumbing for
hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
interface, which does not give us enough control over the detachments.)
Nominal operation of an Optimizable at the lowest level is as follows:
o = MyOptimizable()
o.initialize()
loop {
o.begin()
o.zero_grad()
loss = compute loss function from parameters
loss.backward()
o.adjust()
}
Optimizables recursively handle updates to their optimiz*ers*.
"""
#def __init__(self):
# super(Optimizable, self).__init__()
# self.parameters = nn.Parameter(torch.zeros(()))
def __init__(self, parameters, optimizer):
self.params = parameters # a dict mapping names to tensors
self.optimizer = optimizer # which must itself be Optimizable!
self.all_params_with_gradients = []
#self.device = device
def initialize(self):
"""Initialize parameters, e.g. with a Kaiming initializer."""
pass
def begin(self):
"""Enable gradient tracking on current parameters."""
self.all_params_with_gradients = nn.ParameterList() #Reset to avoid overloading the memory
print("Opti param :", type(self.params))
#for name, param in self.params:
if isinstance(self.params,dict): #Dict
for name, param in self.params.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
self.all_params_with_gradients.append(param)
if isinstance(self.params,list): #List
for param in self.params:
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
self.all_params_with_gradients.append(param)
self.optimizer.begin()
def zero_grad(self):
""" Set all gradients to zero. """
for param in self.all_params_with_gradients:
param.grad = torch.zeros(param.shape, device=param.device)
self.optimizer.zero_grad()
""" Note: at this point you would probably call .backwards() on the loss
function. """
def adjust(self):
""" Update parameters """
pass
class NoOpOptimizer(Optimizable):#, nn.Module):
"""
NoOpOptimizer sits on top of a stack, and does not affect what lies below.
"""
def __init__(self):
#super(Optimizable, self).__init__()
pass
def initialize(self):
pass
def begin(self):
#print("NoOpt begin")
pass
def zero_grad(self):
pass
def adjust(self, params):
pass
def step(self):
pass
def print_grad_fn(self):
pass
def __str__(self):
return "static"
class SGD(Optimizer, nn.Module): #Avoid Optimizer
"""
A hyperoptimizable SGD
"""
def __init__(self, params, lr=0.01, height=0):
self.height=height
#params: the parameters to optimize
#the rest (defaults) are the optimizer's own hyperparameters
print('SGD - H', height)
nn.Module.__init__(self)
optim_keys = ('lr','') #Should this go in Optimizable? #'' avoids iterating over the characters of the string...
'''
self_params = {"lr": torch.tensor(lr),
"momentum": 0,
"dampening":0,
"weight_decay":0,
"nesterov": False}
'''
#self_params = dict(lr=torch.tensor(lr),
# momentum=0, dampening=0, weight_decay=0, nesterov=False)
self_params = nn.ParameterDict({
"lr": nn.Parameter(torch.tensor(lr)),
"momentum": nn.Parameter(torch.tensor(0.0)),
"dampening": nn.Parameter(torch.tensor(0.0)),
"weight_decay": nn.Parameter(torch.tensor(0.0)),
})
for k in self_params.keys() & optim_keys:
self_params[k].requires_grad_() # keep gradient information…
self_params[k].retain_grad() # even if not a leaf…
#self_params[k].register_hook(print)
if height==0:
optimizer = NoOpOptimizer()
else:
#def dict_generator(): yield {k: self_params[k] for k in self_params.keys() & optim_keys}
#(dict for dict in {k: self_params[k] for k in self_params.keys() & optim_keys}) #Should work
optimizer = SGD(params=(self_params[k]for k in self_params.keys() & optim_keys), lr=lr, height=height-1)
#optimizer.register_backward_hook(print)
self.optimizer = optimizer
#if(height==0):
# for n,p in params.items():
# print(n,p)
#Optimizable.__init__(self, self_params, optimizer)
#print(type(params))
#for p in params:
# print(type(p))
Optimizer.__init__(self, params, self_params)
for group in self.param_groups:
for p in group['params']:
print(type(p.data), p.size())
print('End SGD-H', height)
def begin(self):
for group in self.param_groups:
for p in group['params']:
#print(type(p.data), p.size())
p.requires_grad_() # keep gradient information…
p.retain_grad() # even if not a leaf…
#p.register_hook(lambda x: print(self.height, x.grad_fn))
self.optimizer.begin()
def print_grad_fn(self):
self.optimizer.print_grad_fn()
for group in self.param_groups:
for i, p in enumerate(group['params']):
print(self.height," - ", i, p.grad_fn)
#def adjust(self, params):
# self.optimizer.adjust(self.params)
# for name, param in params.items():
# g = param.grad.detach()
# params[name] = param.detach() - g * self.params["lr"]
def step(self):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
print('SGD start')
self.optimizer.step()
for group in self.param_groups:
for i, p in enumerate(group['params']):
if p.grad is None:
continue
#d_p = p.grad.data
d_p = p.grad.detach()
#print(group['lr'])
p.data.add_(-group['lr'].item(), d_p)
#group['params'][i] = p.detach() - d_p * group['lr']
p.data-= group['lr']*d_p #Do not use .data: gradient information is lost
for p in group['params']:
if p.grad is None:
print(p, p.grad)
continue
print("SGD end")
#return loss
def __str__(self):
return "sgd(%f) / " % self.params["lr"] + str(self.optimizer)
class Adam(Optimizable, nn.Module):
"""
A fully hyperoptimizable Adam optimizer
"""
def clamp(x):
return (x.tanh() + 1.0) / 2.0
def unclamp(y):
z = y * 2.0 - 1.0
return ((1.0 + z) / (1.0 - z)).log() / 2.0
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
device = torch.device('cuda')
):
#super(Adam, self).__init__()
nn.Module.__init__(self)
self.device = device
params = nn.ParameterDict({
"alpha": nn.Parameter(torch.tensor(alpha, device=self.device)),
"beta1": nn.Parameter(Adam.unclamp(torch.tensor(beta1, device=self.device))),
"beta2": nn.Parameter(Adam.unclamp(torch.tensor(beta2, device=self.device))),
"log_eps": nn.Parameter(torch.tensor(log_eps, device=self.device)),
})
Optimizable.__init__(self, params, optimizer)
self.num_adjustments = 0
self.cache = {}
for name, param in params.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
def adjust(self, params, pytorch_mod=False):
self.num_adjustments += 1
self.optimizer.adjust(self.params)
t = self.num_adjustments
beta1 = Adam.clamp(self.params["beta1"])
beta2 = Adam.clamp(self.params["beta2"])
updated_param = {}
if pytorch_mod:
params = params.named_parameters(prefix='') #Change the input's name...
for name, param in params:
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.params["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(nn.Parameter(m)) #Risk of overloading memory => would a dict be better?
self.all_params_with_gradients.append(nn.Parameter(v))
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.params["log_eps"])
updated_param[name] = param.detach() - self.params["alpha"] * dparam
if pytorch_mod: params.update(updated_param) #Change the input's name...
else: params = updated_param
def __str__(self):
return "adam(" + str(self.params) + ") / " + str(self.optimizer)

View file

@@ -0,0 +1,182 @@
import numpy as np
import json, math, time, os
from hyperopt import *
import gc
BATCH_SIZE = 300
mnist_train = torchvision.datasets.MNIST(
"./data", train=True, download=True, transform=torchvision.transforms.ToTensor()
)
mnist_test = torchvision.datasets.MNIST(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
dl_train = torch.utils.data.DataLoader(
mnist_train, batch_size=BATCH_SIZE, shuffle=False
)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False)
def test(model):
for i, (features_, labels_) in enumerate(dl_test):
features, labels = torch.reshape(features_, (10000, 28 * 28)), labels_
pred = model.forward(features)
return pred.argmax(dim=1).eq(labels).sum().item() / 10000 * 100
def train(model, epochs=3, height=1):
stats = []
for epoch in range(epochs):
for i, (features_, labels_) in enumerate(dl_train):
t0 = time.process_time()
model.begin()
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
pred = model.forward(
features
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
loss = F.nll_loss(pred, labels)
model.zero_grad()
loss.backward(create_graph=True)
model.adjust()
tf = time.process_time()
data = {
"time": tf - t0,
"iter": epoch * len(dl_train) + i,
"loss": loss.item(),
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
if "." not in k
},
}
stats.append(data)
return stats
def run(opt, name="out", usr={}, epochs=3, height=1):
torch.manual_seed(0x42)
model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
print("Running...", str(model))
model.initialize()
log = train(model, epochs, height)
acc = test(model)
out = {"acc": acc, "log": log, "usr": usr}
with open("log/%s.json" % name, "w+") as f:
json.dump(out, f, indent=True)
times = [x["time"] for x in log]
print("Times (ms):", np.mean(times), "+/-", np.std(times))
print("Final accuracy:", acc)
return out
def sgd_experiments():
run(SGD(0.01), "sgd", epochs=1)
out = run(SGD(0.01, optimizer=SGD(0.01)), "sgd+sgd", epochs=1)
alpha = out["log"][-1]["params"]["alpha"]
print(alpha)
run(SGD(alpha), "sgd-final", epochs=1)
def adam_experiments():
run(Adam(), "adam", epochs=1)
print()
mo = SGDPerParam(
0.001, ["alpha", "beta1", "beta2", "log_eps"], optimizer=SGD(0.0001)
)
out = run(Adam(optimizer=mo), "adam+sgd", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
log_eps = p["log_eps"]
print(alpha, beta1, beta2, log_eps)
print(mo)
run(
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
"adam+sgd-final",
epochs=1,
)
print()
out = run(Adam(optimizer=Adam()), "adam2", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
log_eps = p["log_eps"]
print(alpha, beta1, beta2, log_eps)
run(
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
"adam2-final",
epochs=1,
)
print()
mo = SGDPerParam(0.001, ["alpha"], optimizer=SGD(0.0001))
out = run(AdamBaydin(optimizer=mo), "adambaydin+sgd", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
print(alpha)
print(mo)
run(Adam(alpha=p["alpha"]), "adambaydin+sgd-final", epochs=1)
print()
out = run(AdamBaydin(optimizer=Adam()), "adambaydin2", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
print(alpha)
run(Adam(alpha=p["alpha"]), "adambaydin2-final", epochs=1)
def surface():
run(SGD(10 ** -3, optimizer=SGD(10 ** -1)), "tst", epochs=1)
for log_alpha in np.linspace(-3, 2, 10):
run(SGD(10 ** log_alpha), "sgd@1e%+.2f" % log_alpha, epochs=1)
def make_sgd_stack(height, top):
if height == 0:
return SGD(alpha=top)
return SGD(alpha=top, optimizer=make_sgd_stack(height - 1, top))
def make_adam_stack(height, top=0.0000001):
if height == 0:
return Adam(alpha=top)
return Adam(alpha=top, optimizer=make_adam_stack(height - 1))
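# For illustration: make_adam_stack(2, 0.001) returns an Adam(alpha=0.001) that updates the
# model weights, whose hyperparameters are updated by an Adam(alpha=1e-07), itself tuned by a
# third Adam(alpha=1e-07), i.e. height + 1 optimizers in total.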
def stack_test():
for top in np.linspace(-7, 3, 20):
for height in range(6):
print("height =", height, "to p=", top)
opt = make_sgd_stack(height, 10 ** top)
run(
opt,
"metasgd3-%d@%+.2f" % (height, top),
{"height": height, "top": top},
epochs=1,
height=height,
)
gc.collect()
def perf_test():
for h in range(51):
print("height:", h)
# opt = make_sgd_stack(h, 0.01)
opt = make_adam_stack(h)
run(opt, "adamperf-%d" % h, {"height": h}, epochs=1)
gc.collect()
if __name__ == "__main__":
try:
os.mkdir("log")
except:
print("log/ exists already")
surface()
sgd_experiments()
adam_experiments()
stack_test()
perf_test()

View file

@@ -0,0 +1,5 @@
numpy==1.17.2
Pillow==6.2.0
six==1.12.0
torch==1.2.0
torchvision==0.4.0

View file

@@ -0,0 +1,344 @@
import numpy as np
import json, math, time, os
from data_aug import *
#from data_aug_v2 import *
import gc
import matplotlib.pyplot as plt
from torchviz import make_dot, make_dot_from_trace
from torch.utils.data import SubsetRandomSampler
BATCH_SIZE = 300
#TEST_SIZE = 10000
TEST_SIZE = 300
DATA_LIMIT = 10
'''
data_train = torchvision.datasets.MNIST(
"./data", train=True, download=True,
transform=torchvision.transforms.Compose([
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
torchvision.transforms.ToTensor()
])
)
data_test = torchvision.datasets.MNIST(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
'''
data_train = torchvision.datasets.CIFAR10(
"./data", train=True, download=True,
transform=torchvision.transforms.Compose([
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
torchvision.transforms.ToTensor()
])
)
data_test = torchvision.datasets.CIFAR10(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
train_subset_indices=range(int(len(data_train)/2))
val_subset_indices=range(int(len(data_train)/2),len(data_train))
dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices))
dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices))
dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False)
def test(model, reshape_in=True, device = torch.device('cuda')):
for i, (features_, labels_) in enumerate(dl_test):
if reshape_in :
features, labels = torch.reshape(features_, (TEST_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
pred = model.forward(features)
return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100
def train_one_epoch(model, optimizer, epoch=0, reshape_in=True, device = torch.device('cuda'), train_data=True):
if train_data: dl = dl_train
else: dl = dl_val
for i, (features_, labels_) in enumerate(dl):
if i > DATA_LIMIT : break
#t0 = time.process_time()
if reshape_in :
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
#optimizer.begin()
#optimizer.zero_grad()
model.begin()
model.zero_grad()
pred = model.forward(features)
#loss = F.nll_loss(pred, labels)
loss = F.cross_entropy(pred,labels)
#model.print_grad_fn()
#optimizer.print_grad_fn()
#print('-'*50)
loss.backward(create_graph=True)
#optimizer.step()
if train_data: model.adjust()
else: model.adjust_val()
#tf = time.process_time()
#data = {
# "time": tf - t0,
# "iter": epoch * len(dl_train) + i,
# "loss": loss.item(),
# "params": {
# k: v.item()
# for k, v in model.optimizer.parameters.items()
# if "." not in k
# },
#}
#stats.append(data)
#print_torch_mem(i)
return loss.item()
def train_v2(model, optimizer, epochs=3, reshape_in=True, device = torch.device('cuda')):
log = []
for epoch in range(epochs):
#dl_train.dataset.transform=torchvision.transforms.Compose([
# torchvision.transforms.RandomAffine(degrees=model.param('mag'), translate=None, scale=None, shear=None, resample=False, fillcolor=0),
# torchvision.transforms.ToTensor()
#])
viz_data(fig_name='res/data_sample')
t0 = time.process_time()
loss = train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device)
train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device,train_data=False)
#acc = test(model=model, reshape_in=reshape_in, device=device)
acc = 0
tf = time.process_time()
data = {
"time": tf - t0,
"epoch": epoch,
"loss": loss,
"acc": acc,
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
#for k, v in model.mods.data_aug.params.named_parameters()
if "." not in k
},
}
log.append(data)
print("Epoch :",epoch+1, "/",epochs, "- Loss :",log[-1]["loss"])
param = [p for p in model.param_grad() if p.grad is not None]
if(len(param)!=0):
print(param[-2],' / ', param[-2].grad)
print(param[-1],' / ', param[-1].grad)
return log
def train(model, epochs=3, height=1, reshape_in=True, device = torch.device('cuda')):
stats = []
for epoch in range(epochs):
for i, (features_, labels_) in enumerate(dl_train):
t0 = time.process_time()
model.begin()
if reshape_in :
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
pred = model.forward(
features
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
#loss = F.nll_loss(pred, labels)
loss = F.cross_entropy(pred,labels)
#print('-'*50)
#param = [p for p in model.param_grad() if p.grad is not None]
#if(len(param)!=0):
# print(param[-2],' / ', param[-2].grad)
# print(param[-1],' / ', param[-1].grad)
model.zero_grad()
loss.backward(create_graph=True)
model.adjust()
tf = time.process_time()
data = {
"time": tf - t0,
"iter": epoch * len(dl_train) + i,
"loss": loss.item(),
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
if "." not in k
},
}
stats.append(data)
print('-'*50)
i=0
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)) and len(obj.size())>1:
print(i, type(obj), obj.size())
i+=1
except:
pass
print("Epoch :",epoch+1, "/",epochs, "- Loss :",stats[-1]["loss"])
param = [p for p in model.param_grad() if p.grad is not None]
if(len(param)!=0):
print(param[-2],' / ', param[-2].grad)
print(param[-1],' / ', param[-1].grad)
return stats
def run(opt, name="out", usr={}, epochs=10, height=1, cnn=True, device = torch.device('cuda')):
torch.manual_seed(0x42)
if not cnn:
reshape_in = True
#model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
model = MNIST_FullyConnected_Augmented(28 * 28, 128, 10, opt, device=device)
else:
reshape_in = False
#model = LeNet(1, 10,opt, device)
#model = LeNet_v2(1, 10,opt, device).to(device=device)
model = LeNet_v2(3, 10,opt, device).to(device=device)
optimizer=None
'''
m = LeNet_v3(1, 10)
a = Data_aug()
model = Augmented_model(model=m,
data_augmenter=a,
optimizer=opt).to(device) #the same optimizer twice?...
'''
'''
m = LeNet_v3(1, 10)
a = Data_aug()
model = Augmented_model(model=m, data_augmenter=a).to(device)
#optimizer = SGD(model.parameters())
optimizer = SGD(model.parameters(), lr=0.01, height=1)
'''
#for idx, m in enumerate(model.modules()):
# print(idx, '->', m)
print("Running...", str(model))
model.initialize()
#print_model(model)
#model.data_augmentation(False)
#model.eval()
log = train_v2(model=model, optimizer=optimizer, epochs=epochs, reshape_in=reshape_in, device=device)
model.eval()
acc = test(model, reshape_in, device=device)
#param = [p for p in model.param_grad() if p.grad is not None]
#if(len(param)!=0):
# print(param[-2],' / ', param[-2].grad)
# print(param[-1],' / ', param[-1].grad)
out = {"acc": acc, "log": log, "usr": usr}
with open("log/%s.json" % name, "w+") as f:
json.dump(out, f, indent=True)
times = [x["time"] for x in log]
print("Times (ms):", np.mean(times), "+/-", np.std(times))
print("Final accuracy:", acc)
#plot_res(log, fig_name='res/'+name)
return out
def make_adam_stack(height, top=0.0000001, device = torch.device('cuda')):
#print(height,device)
if height == 0:
return Adam(alpha=top, device=device)
return Adam(alpha=top, optimizer=make_adam_stack(height - 1, top, device=device), device=device)
def plot_res(log, fig_name='res'):
fig, ax = plt.subplots(ncols=3, figsize=(15, 3))
ax[0].set_title('Loss')
ax[0].plot([x["loss"] for x in log])
ax[1].set_title('Acc')
ax[1].plot([x["acc"] for x in log])
ax[2].set_title('mag')
ax[2].plot([x["data_aug"] for x in log])
plt.savefig(fig_name)
def print_torch_mem(add_info=''):
nb=0
max_size=0
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1:
#print(i, type(obj), obj.size())
size = np.sum(obj.size())
if(size>max_size): max_size=size
nb+=1
except:
pass
print(add_info, "-PyTorch tensor count:", nb, " / Max dim:", max_size)
def print_model(model, fig_name='graph/graph'): #Does not seem to work for models written in functional style
x = torch.randn(1,1,28,28, device=device)
dot=make_dot(model(x), params=dict(model.named_parameters()))
dot.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats
dot.render(fig_name)
print("Model graph generated !")
def viz_data(fig_name='data_sample'):
features_, labels_ = next(iter(dl_train))
plt.figure(figsize=(10,10))
#for i, (features_, labels_) in enumerate(dl_train):
for i in range(25):
if i==25: break
#print(features_.size(), labels_.size())
plt.subplot(5,5,i+1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
img = features_[i,0,:,:]
#print('im shape',img.shape)
plt.imshow(img, cmap=plt.cm.binary)
plt.xlabel(labels_[i].item())
plt.savefig(fig_name)
##########################################
if __name__ == "__main__":
try:
os.mkdir("log")
except:
print("log/ exists already")
device = torch.device('cuda')
run(make_adam_stack(height=1, top=0.001, device=device),
"Augmented_MNIST",
epochs=100,
cnn=True,
device = device)
print()