Initial Commit
.gitignore (vendored, new file, +46)
@@ -0,0 +1,46 @@
/higher/data
/Gradient-Descent-The-Ultimate-Optimizer/data
/FAR-HO/data
/__pycache__

*.pyo
*.pyc
*~

# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip

# Logs and databases #
######################
*.log
*.sql
*.sqlite

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
FAR-HO/augmentation_transforms.py (new executable file, +456)
@@ -0,0 +1,456 @@
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Transforms used in the Augmentation Policies."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import inspect
import random
import numpy as np
# pylint:disable=g-multiple-import
from PIL import ImageOps, ImageEnhance, ImageFilter, Image
# pylint:enable=g-multiple-import


IMAGE_SIZE = 28
# Dataset mean and std of the images in the training set
MEANS = [0.49139968, 0.48215841, 0.44653091]
STDS = [0.24703223, 0.24348513, 0.26158784]
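# NOTE: MEANS and STDS appear to be the standard CIFAR-10 per-channel statistics,
# while IMAGE_SIZE is set to 28 (MNIST-sized inputs).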
PARAMETER_MAX = 10  # Maximum 'level' a transform can be predicted to have


def random_flip(x):
  """Flip the input x horizontally with 50% probability."""
  if np.random.rand(1)[0] > 0.5:
    return np.fliplr(x)
  return x


def zero_pad_and_crop(img, amount=4):
  """Zero pad by `amount` zero pixels on each side then take a random crop.

  Args:
    img: numpy image that will be zero padded and cropped.
    amount: amount of zeros to pad `img` with horizontally and vertically.

  Returns:
    The cropped zero padded img. The returned numpy array will be of the same
    shape as `img`.
  """
  padded_img = np.zeros((img.shape[0] + amount * 2, img.shape[1] + amount * 2,
                         img.shape[2]))
  padded_img[amount:img.shape[0] + amount, amount:
             img.shape[1] + amount, :] = img
  top = np.random.randint(low=0, high=2 * amount)
  left = np.random.randint(low=0, high=2 * amount)
  new_img = padded_img[top:top + img.shape[0], left:left + img.shape[1], :]
  return new_img


def create_cutout_mask(img_height, img_width, num_channels, size):
  """Creates a zero mask used for cutout of shape `img_height` x `img_width`.

  Args:
    img_height: Height of image cutout mask will be applied to.
    img_width: Width of image cutout mask will be applied to.
    num_channels: Number of channels in the image.
    size: Size of the zeros mask.

  Returns:
    A mask of shape `img_height` x `img_width` with all ones except for a
    square of zeros of shape `size` x `size`. This mask is meant to be
    elementwise multiplied with the original image. Additionally returns
    the `upper_coord` and `lower_coord` which specify where the cutout mask
    will be applied.
  """
  assert img_height == img_width

  # Sample center where cutout mask will be applied
  height_loc = np.random.randint(low=0, high=img_height)
  width_loc = np.random.randint(low=0, high=img_width)

  # Determine upper left and lower right corners of patch
  upper_coord = (max(0, height_loc - size // 2), max(0, width_loc - size // 2))
  lower_coord = (min(img_height, height_loc + size // 2),
                 min(img_width, width_loc + size // 2))
  mask_height = lower_coord[0] - upper_coord[0]
  mask_width = lower_coord[1] - upper_coord[1]
  assert mask_height > 0
  assert mask_width > 0

  mask = np.ones((img_height, img_width, num_channels))
  zeros = np.zeros((mask_height, mask_width, num_channels))
  mask[upper_coord[0]:lower_coord[0], upper_coord[1]:lower_coord[1], :] = (
      zeros)
  return mask, upper_coord, lower_coord


def cutout_numpy(img, size=16):
  """Apply cutout with mask of shape `size` x `size` to `img`.

  The cutout operation is from the paper https://arxiv.org/abs/1708.04552.
  This operation applies a `size`x`size` mask of zeros to a random location
  within `img`.

  Args:
    img: Numpy image that cutout will be applied to.
    size: Height/width of the cutout mask that will be applied to `img`.

  Returns:
    A numpy tensor that is the result of applying the cutout mask to `img`.
  """
  img_height, img_width, num_channels = (img.shape[0], img.shape[1],
                                         img.shape[2])
  assert len(img.shape) == 3
  mask, _, _ = create_cutout_mask(img_height, img_width, num_channels, size)
  return img * mask


def float_parameter(level, maxval):
  """Helper function to scale `level` between 0 and `maxval`.

  Args:
    level: Level of the operation that will be between [0, `PARAMETER_MAX`].
    maxval: Maximum value that the operation can have. This will be scaled
      to level/PARAMETER_MAX.

  Returns:
    A float that results from scaling `maxval` according to `level`.
  """
  return float(level) * maxval / PARAMETER_MAX


def int_parameter(level, maxval):
  """Helper function to scale `level` between 0 and `maxval`.

  Args:
    level: Level of the operation that will be between [0, `PARAMETER_MAX`].
    maxval: Maximum value that the operation can have. This will be scaled
      to level/PARAMETER_MAX.

  Returns:
    An int that results from scaling `maxval` according to `level`.
  """
  return int(level * maxval / PARAMETER_MAX)


def pil_wrap(img):
  """Convert the `img` numpy tensor to a PIL Image."""
  return Image.fromarray(
      np.uint8((img * STDS + MEANS) * 255.0)).convert('RGBA')


def pil_unwrap(pil_img):
  """Converts the PIL img to a numpy array."""
  pic_array = (np.array(pil_img.getdata()).reshape((IMAGE_SIZE, IMAGE_SIZE, 4)) / 255.0)
  i1, i2 = np.where(pic_array[:, :, 3] == 0)
  pic_array = (pic_array[:, :, :3] - MEANS) / STDS
  pic_array[i1, i2] = [0, 0, 0]
  return pic_array


def apply_policy(policy, img):
  """Apply the `policy` to the numpy `img`.

  Args:
    policy: A list of tuples with the form (name, probability, level) where
      `name` is the name of the augmentation operation to apply, `probability`
      is the probability of applying the operation and `level` is the strength
      with which to apply the operation.
    img: Numpy image that will have `policy` applied to it.

  Returns:
    The result of applying `policy` to `img`.
  """
  #print('img shape :', img.shape)
  #print('Policy len :', len(policy))
  pil_img = pil_wrap(img)
  for xform in policy:
    #print('xform :', len(xform))
    assert len(xform) == 3
    name, probability, level = xform
    #xform_fn = NAME_TO_TRANSFORM[name].pil_transformer(probability, level)
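    # `probability` is expected to be a tensor-like hyperparameter here (e.g. a
    # FAR-HO hyperparameter); .eval() reads its current value from the default session.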
    xform_fn = NAME_TO_TRANSFORM[name].pil_transformer(probability.eval(), level)
    pil_img = xform_fn(pil_img)
  return pil_unwrap(pil_img)


class TransformFunction(object):
  """Wraps the Transform function for pretty printing options."""

  def __init__(self, func, name):
    self.f = func
    self.name = name

  def __repr__(self):
    return '<' + self.name + '>'

  def __call__(self, pil_img):
    return self.f(pil_img)


class TransformT(object):
  """Each instance of this class represents a specific transform."""

  def __init__(self, name, xform_fn):
    self.name = name
    self.xform = xform_fn

  def pil_transformer(self, probability, level):

    def return_function(im):
      if random.random() < probability:
        im = self.xform(im, level)
      return im

    name = self.name + '({:.1f},{})'.format(probability, level)
    return TransformFunction(return_function, name)

  def do_transform(self, image, level):
    f = self.pil_transformer(PARAMETER_MAX, level)
    return pil_unwrap(f(pil_wrap(image)))


################## Transform Functions ##################
identity = TransformT('identity', lambda pil_img, level: pil_img)
flip_lr = TransformT(
    'FlipLR',
    lambda pil_img, level: pil_img.transpose(Image.FLIP_LEFT_RIGHT))
flip_ud = TransformT(
    'FlipUD',
    lambda pil_img, level: pil_img.transpose(Image.FLIP_TOP_BOTTOM))
# pylint:disable=g-long-lambda
auto_contrast = TransformT(
    'AutoContrast',
    lambda pil_img, level: ImageOps.autocontrast(
        pil_img.convert('RGB')).convert('RGBA'))
equalize = TransformT(
    'Equalize',
    lambda pil_img, level: ImageOps.equalize(
        pil_img.convert('RGB')).convert('RGBA'))
invert = TransformT(
    'Invert',
    lambda pil_img, level: ImageOps.invert(
        pil_img.convert('RGB')).convert('RGBA'))
# pylint:enable=g-long-lambda
blur = TransformT(
    'Blur', lambda pil_img, level: pil_img.filter(ImageFilter.BLUR))
smooth = TransformT(
    'Smooth',
    lambda pil_img, level: pil_img.filter(ImageFilter.SMOOTH))


def _rotate_impl(pil_img, level):
  """Rotates `pil_img` from -30 to 30 degrees depending on `level`."""
  degrees = int_parameter(level, 30)
  if random.random() > 0.5:
    degrees = -degrees
  return pil_img.rotate(degrees)


rotate = TransformT('Rotate', _rotate_impl)


def _posterize_impl(pil_img, level):
  """Applies PIL Posterize to `pil_img`."""
  level = int_parameter(level, 4)
  return ImageOps.posterize(pil_img.convert('RGB'), 4 - level).convert('RGBA')


posterize = TransformT('Posterize', _posterize_impl)


def _shear_x_impl(pil_img, level):
  """Applies PIL ShearX to `pil_img`.

  The ShearX operation shears the image along the horizontal axis with `level`
  magnitude.

  Args:
    pil_img: Image in PIL object.
    level: Strength of the operation specified as an Integer from
      [0, `PARAMETER_MAX`].

  Returns:
    A PIL Image that has had ShearX applied to it.
  """
  level = float_parameter(level, 0.3)
  if random.random() > 0.5:
    level = -level
  return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, level, 0, 0, 1, 0))


shear_x = TransformT('ShearX', _shear_x_impl)


def _shear_y_impl(pil_img, level):
  """Applies PIL ShearY to `pil_img`.

  The ShearY operation shears the image along the vertical axis with `level`
  magnitude.

  Args:
    pil_img: Image in PIL object.
    level: Strength of the operation specified as an Integer from
      [0, `PARAMETER_MAX`].

  Returns:
    A PIL Image that has had ShearY applied to it.
  """
  level = float_parameter(level, 0.3)
  if random.random() > 0.5:
    level = -level
  return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, 0, level, 1, 0))


shear_y = TransformT('ShearY', _shear_y_impl)


def _translate_x_impl(pil_img, level):
  """Applies PIL TranslateX to `pil_img`.

  Translate the image in the horizontal direction by `level`
  number of pixels.

  Args:
    pil_img: Image in PIL object.
    level: Strength of the operation specified as an Integer from
      [0, `PARAMETER_MAX`].

  Returns:
    A PIL Image that has had TranslateX applied to it.
  """
  level = int_parameter(level, 10)
  if random.random() > 0.5:
    level = -level
  return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, level, 0, 1, 0))


translate_x = TransformT('TranslateX', _translate_x_impl)


def _translate_y_impl(pil_img, level):
  """Applies PIL TranslateY to `pil_img`.

  Translate the image in the vertical direction by `level`
  number of pixels.

  Args:
    pil_img: Image in PIL object.
    level: Strength of the operation specified as an Integer from
      [0, `PARAMETER_MAX`].

  Returns:
    A PIL Image that has had TranslateY applied to it.
  """
  level = int_parameter(level, 10)
  if random.random() > 0.5:
    level = -level
  return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, 0, 0, 1, level))


translate_y = TransformT('TranslateY', _translate_y_impl)


def _crop_impl(pil_img, level, interpolation=Image.BILINEAR):
  """Applies a crop to `pil_img` with the size depending on the `level`."""
  cropped = pil_img.crop((level, level, IMAGE_SIZE - level, IMAGE_SIZE - level))
  resized = cropped.resize((IMAGE_SIZE, IMAGE_SIZE), interpolation)
  return resized


crop_bilinear = TransformT('CropBilinear', _crop_impl)


def _solarize_impl(pil_img, level):
  """Applies PIL Solarize to `pil_img`.

  Inverts all pixel values above a threshold that decreases as `level`
  increases.

  Args:
    pil_img: Image in PIL object.
    level: Strength of the operation specified as an Integer from
      [0, `PARAMETER_MAX`].

  Returns:
    A PIL Image that has had Solarize applied to it.
  """
  level = int_parameter(level, 256)
  return ImageOps.solarize(pil_img.convert('RGB'), 256 - level).convert('RGBA')


solarize = TransformT('Solarize', _solarize_impl)


def _cutout_pil_impl(pil_img, level):
  """Apply cutout to pil_img at the specified level."""
  size = int_parameter(level, 20)
  if size <= 0:
    return pil_img
  img_height, img_width, num_channels = (IMAGE_SIZE, IMAGE_SIZE, 3)
  _, upper_coord, lower_coord = (
      create_cutout_mask(img_height, img_width, num_channels, size))
  pixels = pil_img.load()  # create the pixel map
  for i in range(upper_coord[0], lower_coord[0]):  # for every col:
    for j in range(upper_coord[1], lower_coord[1]):  # for every row
      pixels[i, j] = (125, 122, 113, 0)  # set the colour accordingly
  return pil_img

cutout = TransformT('Cutout', _cutout_pil_impl)


def _enhancer_impl(enhancer):
  """Sets level to be between 0.1 and 1.8 for ImageEnhance transforms of PIL."""
  def impl(pil_img, level):
    v = float_parameter(level, 1.8) + .1  # going to 0 just destroys it
    return enhancer(pil_img).enhance(v)
  return impl


color = TransformT('Color', _enhancer_impl(ImageEnhance.Color))
contrast = TransformT('Contrast', _enhancer_impl(ImageEnhance.Contrast))
brightness = TransformT('Brightness', _enhancer_impl(
    ImageEnhance.Brightness))
sharpness = TransformT('Sharpness', _enhancer_impl(ImageEnhance.Sharpness))

ALL_TRANSFORMS = [
    flip_lr,
    flip_ud,
    auto_contrast,
    equalize,
    invert,
    rotate,
    posterize,
    crop_bilinear,
    solarize,
    color,
    contrast,
    brightness,
    sharpness,
    shear_x,
    shear_y,
    translate_x,
    translate_y,
    cutout,
    blur,
    smooth
]

NAME_TO_TRANSFORM = {t.name: t for t in ALL_TRANSFORMS}
TRANSFORM_NAMES = NAME_TO_TRANSFORM.keys()
FAR-HO/blue_utils.py (new file, +131)
@@ -0,0 +1,131 @@
import matplotlib.pyplot as plt
from far_ho.examples.datasets import Datasets, Dataset

import os
import numpy as np
import tensorflow as tf

import augmentation_transforms as augmentation_transforms ##### WARNING: DUPLICATE FILE => SHOULD BE HANDLED MORE CLEANLY ####


def viz_data(dataset, fig_name='data_sample', aug_policy=None):

    plt.figure(figsize=(10,10))
    for i in range(25):
        plt.subplot(5,5,i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)

        img = dataset.data[i][:,:,0]
        if aug_policy:
            img = augment_img(img, aug_policy)
        #print('im shape', img.shape)
        plt.imshow(img, cmap=plt.cm.binary)
        plt.xlabel(np.nonzero(dataset.target[i])[0].item())

    plt.savefig(fig_name)


def augment_img(data, policy):

    #print('Im shape', data.shape)
    data = np.stack((data,)*3, axis=-1)  # quick hack just to force 3 channels
    #print('Im shape', data.shape)
    final_img = augmentation_transforms.apply_policy(policy, data)
    #final_img = augmentation_transforms.random_flip(augmentation_transforms.zero_pad_and_crop(final_img, 4))
    # Apply cutout
    #final_img = augmentation_transforms.cutout_numpy(final_img)

    im_rgb = np.array(final_img, np.float32)
    im_gray = np.dot(im_rgb[...,:3], [0.2989, 0.5870, 0.1140])  # just to go back to a single channel
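    # 0.2989 / 0.5870 / 0.1140 are the ITU-R BT.601 luma coefficients used to
    # collapse the RGB result back to one channel.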

    return im_gray


### https://www.kaggle.com/raoulma/mnist-image-class-tensorflow-cnn-99-51-test-acc#5.-Build-the-neural-network-with-tensorflow-
## build the neural network class
# weight initialization
def weight_variable(shape, name=None):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)

# bias initialization
def bias_variable(shape, name=None):
    initial = tf.constant(0.1, shape=shape)  # positive bias
    return tf.Variable(initial, name=name)

# 2D convolution
def conv2d(x, W, name=None):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', name=name)

# max pooling
def max_pool_2x2(x, name=None):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                          padding='SAME', name=name)

def cnn(x_data_tf, y_data_tf, name='model'):
    # tunable hyperparameters for nn architecture
    s_f_conv1 = 3   # filter size of first convolution layer (default = 3)
    n_f_conv1 = 36  # number of features of first convolution layer (default = 36)
    s_f_conv2 = 3   # filter size of second convolution layer (default = 3)
    n_f_conv2 = 36  # number of features of second convolution layer (default = 36)
    s_f_conv3 = 3   # filter size of third convolution layer (default = 3)
    n_f_conv3 = 36  # number of features of third convolution layer (default = 36)
    n_n_fc1 = 576   # number of neurons of first fully connected layer (default = 576)

    # 1st layer: convolution + max pooling
    W_conv1_tf = weight_variable([s_f_conv1, s_f_conv1, 1, n_f_conv1], name='W_conv1_tf')  # (5,5,1,32)
    b_conv1_tf = bias_variable([n_f_conv1], name='b_conv1_tf')  # (32)
    h_conv1_tf = tf.nn.relu(conv2d(x_data_tf,
                                   W_conv1_tf) + b_conv1_tf,
                            name='h_conv1_tf')  # (.,28,28,32)
    h_pool1_tf = max_pool_2x2(h_conv1_tf,
                              name='h_pool1_tf')  # (.,14,14,32)

    # 2nd layer: convolution + max pooling
    W_conv2_tf = weight_variable([s_f_conv2, s_f_conv2,
                                  n_f_conv1, n_f_conv2],
                                 name='W_conv2_tf')
    b_conv2_tf = bias_variable([n_f_conv2], name='b_conv2_tf')
    h_conv2_tf = tf.nn.relu(conv2d(h_pool1_tf,
                                   W_conv2_tf) + b_conv2_tf,
                            name='h_conv2_tf')  # (.,14,14,32)
    h_pool2_tf = max_pool_2x2(h_conv2_tf, name='h_pool2_tf')  # (.,7,7,32)

    # 3rd layer: convolution + max pooling
    W_conv3_tf = weight_variable([s_f_conv3, s_f_conv3,
                                  n_f_conv2, n_f_conv3],
                                 name='W_conv3_tf')
    b_conv3_tf = bias_variable([n_f_conv3], name='b_conv3_tf')
    h_conv3_tf = tf.nn.relu(conv2d(h_pool2_tf,
                                   W_conv3_tf) + b_conv3_tf,
                            name='h_conv3_tf')  # (.,7,7,32)
    h_pool3_tf = max_pool_2x2(h_conv3_tf,
                              name='h_pool3_tf')  # (.,4,4,32)

    # 4th layer: fully connected
    W_fc1_tf = weight_variable([4*4*n_f_conv3, n_n_fc1],
                               name='W_fc1_tf')  # (4*4*32, 1024)
    b_fc1_tf = bias_variable([n_n_fc1], name='b_fc1_tf')  # (1024)
    h_pool3_flat_tf = tf.reshape(h_pool3_tf, [-1, 4*4*n_f_conv3],
                                 name='h_pool3_flat_tf')  # (.,1024)
    h_fc1_tf = tf.nn.relu(tf.matmul(h_pool3_flat_tf,
                                    W_fc1_tf) + b_fc1_tf,
                          name='h_fc1_tf')  # (.,1024)

    # add dropout
    #keep_prob_tf = tf.placeholder(dtype=tf.float32, name = 'keep_prob_tf')
    #h_fc1_drop_tf = tf.nn.dropout(h_fc1_tf, keep_prob_tf, name = 'h_fc1_drop_tf')

    # 5th layer: fully connected
    W_fc2_tf = weight_variable([n_n_fc1, 10], name='W_fc2_tf')
    b_fc2_tf = bias_variable([10], name='b_fc2_tf')
    z_pred_tf = tf.add(tf.matmul(h_fc1_tf, W_fc2_tf),
                       b_fc2_tf, name='z_pred_tf')  # => (.,10)
    # predicted probabilities in one-hot encoding
    y_pred_proba_tf = tf.nn.softmax(z_pred_tf, name='y_pred_proba_tf')

    # tensor of correct predictions
    y_pred_correct_tf = tf.equal(tf.argmax(y_pred_proba_tf, 1),
                                 tf.argmax(y_data_tf, 1),
                                 name='y_pred_correct_tf')
    return y_pred_proba_tf
FAR-HO/far_pba_cifar.py (new file, +166)
@@ -0,0 +1,166 @@
#https://github.com/arcelien/pba/blob/master/autoaugment/train_cifar.py
from __future__ import absolute_import, print_function, division

import contextlib
import os
import numpy as np
import tensorflow as tf
#import tensorflow.contrib.layers as layers
import far_ho as far
import far_ho.examples as far_ex
#import pprint

import autoaugment.augmentation_transforms as augmentation_transforms
#import autoaugment.policies as found_policies
from autoaugment.wrn import build_wrn_model


def build_model(inputs, num_classes, is_training, hparams):
    """Constructs the vision model being trained/evaled.

    Args:
      inputs: input features/images being fed to the image model being built.
      num_classes: number of output classes being predicted.
      is_training: is the model training or not.
      hparams: additional hyperparameters associated with the image model.

    Returns:
      The logits of the image model.
    """
    scopes = setup_arg_scopes(is_training)
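    # NOTE: setup_arg_scopes, build_shake_drop_model and build_shake_shake_model
    # (like helper_utils and data_utils used further down) are not defined or
    # imported in this file; they presumably come from the upstream PBA repository
    # this script is adapted from. contextlib.nested also exists only in Python 2.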
    with contextlib.nested(*scopes):
        if hparams.model_name == 'pyramid_net':
            logits = build_shake_drop_model(
                inputs, num_classes, is_training)
        elif hparams.model_name == 'wrn':
            logits = build_wrn_model(
                inputs, num_classes, hparams.wrn_size)
        elif hparams.model_name == 'shake_shake':
            logits = build_shake_shake_model(
                inputs, num_classes, hparams, is_training)
    return logits


class CifarModel(object):
    """Builds an image model for Cifar10/Cifar100."""

    def __init__(self, hparams):
        self.hparams = hparams

    def build(self, mode):
        """Construct the cifar model."""
        assert mode in ['train', 'eval']
        self.mode = mode
        self._setup_misc(mode)
        self._setup_images_and_labels()
        self._build_graph(self.images, self.labels, mode)

        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())

    def _setup_misc(self, mode):
        """Sets up miscellaneous in the cifar model constructor."""
        self.lr_rate_ph = tf.Variable(0.0, name='lrn_rate', trainable=False)
        self.reuse = None if (mode == 'train') else True
        self.batch_size = self.hparams.batch_size
        if mode == 'eval':
            self.batch_size = 25

    def _setup_images_and_labels(self):
        """Sets up image and label placeholders for the cifar model."""
        if FLAGS.dataset == 'cifar10':
            self.num_classes = 10
        else:
            self.num_classes = 100
        self.images = tf.placeholder(tf.float32, [self.batch_size, 32, 32, 3])
        self.labels = tf.placeholder(tf.float32,
                                     [self.batch_size, self.num_classes])

    def assign_epoch(self, session, epoch_value):
        session.run(self._epoch_update, feed_dict={self._new_epoch: epoch_value})

    def _build_graph(self, images, labels, mode):
        """Constructs the TF graph for the cifar model.

        Args:
          images: A 4-D image Tensor.
          labels: A 2-D labels Tensor.
          mode: string indicating training mode (e.g., 'train', 'valid', 'test').
        """
        is_training = 'train' in mode
        if is_training:
            self.global_step = tf.train.get_or_create_global_step()

        logits = build_model(
            images,
            self.num_classes,
            is_training,
            self.hparams)
        self.predictions, self.cost = helper_utils.setup_loss(
            logits, labels)
        self.accuracy, self.eval_op = tf.metrics.accuracy(
            tf.argmax(labels, 1), tf.argmax(self.predictions, 1))
        self._calc_num_trainable_params()

        # Adds L2 weight decay to the cost
        self.cost = helper_utils.decay_weights(self.cost,
                                               self.hparams.weight_decay_rate)
        #### Warning: differs from the original implementation

        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())


########################################################

######## PBA ############

# Parallel Cifar model trainer
tf.flags.DEFINE_string('model_name', 'wrn',
                       'wrn, shake_shake_32, shake_shake_96, shake_shake_112, '
                       'pyramid_net')
tf.flags.DEFINE_string('checkpoint_dir', '/tmp/training', 'Training Directory.')
tf.flags.DEFINE_string('data_path', '/tmp/data',
                       'Directory where dataset is located.')
tf.flags.DEFINE_string('dataset', 'cifar10',
                       'Dataset to train with. Either cifar10 or cifar100')
tf.flags.DEFINE_integer('use_cpu', 1, '1 if use CPU, else GPU.')
## ???

FLAGS = tf.flags.FLAGS
FLAGS.dataset
FLAGS.data_path
FLAGS.model_name = 'wrn'

hparams = tf.contrib.training.HParams(
    train_size=50000,
    validation_size=0,
    eval_test=1,
    dataset=FLAGS.dataset,
    data_path=FLAGS.data_path,
    batch_size=128,
    gradient_clipping_by_global_norm=5.0)
if FLAGS.model_name == 'wrn':
    hparams.add_hparam('model_name', 'wrn')
    hparams.add_hparam('num_epochs', 200)
    hparams.add_hparam('wrn_size', 160)
    hparams.add_hparam('lr', 0.1)
    hparams.add_hparam('weight_decay_rate', 5e-4)

data_loader = data_utils.DataSet(hparams)
data_loader.reset()

with tf.Graph().as_default():  #, tf.device('/cpu:0' if FLAGS.use_cpu else '/gpu:0'):
    """Builds the image models for train and eval."""
    # Determine if we should build the train and eval model. When using
    # distributed training we only want to build one or the other and not both.
    with tf.variable_scope('model', use_resource=False):
        m = CifarModel(hparams)
        m.build('train')
        #self._num_trainable_params = m.num_trainable_params
        #self._saver = m.saver
    #with tf.variable_scope('model', reuse=True, use_resource=False):
    #    meval = CifarModel(hparams)
    #    meval.build('eval')


##### FAR-HO ####
for _ in range(n_hyper_iterations):
    pass  # TODO: hyper-iteration body missing in this file
FAR-HO/test.py (new file, +92)
@@ -0,0 +1,92 @@
import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import far_ho as far
import far_ho.examples as far_ex
import matplotlib.pyplot as plt

sess = tf.InteractiveSession()


def get_data():
    # load a small portion of mnist data
    datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=(.1, .1,))
    return datasets.train, datasets.validation


def g_logits(x, y):
    with tf.variable_scope('model'):
        h1 = layers.fully_connected(x, 300)
        logits = layers.fully_connected(h1, int(y.shape[1]))
    return logits


x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x')
y = tf.placeholder(tf.float32, shape=(None, 10), name='y')
logits = g_logits(x, y)
train_set, validation_set = get_data()

lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples))
lr = far.get_hyperparameter('lr', initializer=0.01)

ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
L = tf.reduce_mean(tf.sigmoid(lambdas)*ce)
E = tf.reduce_mean(ce)
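# Bilevel setup: L is the inner (training) objective, with each example's
# cross-entropy weighted by sigmoid(lambdas); E is the outer (validation)
# objective that drives the updates of the hyperparameters `lambdas` and `lr`.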

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32))

inner_optimizer = far.GradientDescentOptimizer(lr)
outer_optimizer = tf.train.AdamOptimizer()
rev_it = 10
hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it)
hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer)

T = 20  # Number of inner iterations
train_set_supplier = train_set.create_supplier(x, y)
validation_set_supplier = validation_set.create_supplier(x, y)
tf.global_variables_initializer().run()

print('inner:', L.eval(train_set_supplier()))
print('outer:', E.eval(validation_set_supplier()))
# print('-'*50)
n_hyper_iterations = 200
inner_losses = []
outer_losses = []
train_accs = []
val_accs = []

for _ in range(n_hyper_iterations):
    hyper_step(T,
               inner_objective_feed_dicts=train_set_supplier,
               outer_objective_feed_dicts=validation_set_supplier)

    inner_obj = L.eval(train_set_supplier())
    outer_obj = E.eval(validation_set_supplier())
    inner_losses.append(inner_obj)
    outer_losses.append(outer_obj)
    print('inner:', inner_obj)
    print('outer:', outer_obj)

    train_acc = accuracy.eval(train_set_supplier())
    val_acc = accuracy.eval(validation_set_supplier())
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    print('training accuracy', train_acc)
    print('validation accuracy', val_acc)

    print('learning rate', lr.eval())
    print('norm of examples weight', tf.norm(lambdas).eval())
    print('-'*50)

plt.subplot(211)
plt.plot(inner_losses, label='training loss')
plt.plot(outer_losses, label='validation loss')
plt.legend(loc=0, frameon=True)
#plt.xlim(0, 19)
plt.subplot(212)
plt.plot(train_accs, label='training accuracy')
plt.plot(val_accs, label='validation accuracy')
plt.legend(loc=0, frameon=True)

plt.savefig('H%d - I%d - R%d'%(n_hyper_iterations,T,rev_it))
FAR-HO/test_cnn.py (new file, +126)
@@ -0,0 +1,126 @@
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import far_ho as far
import far_ho.examples as far_ex

tf.logging.set_verbosity(tf.logging.ERROR)

import matplotlib.pyplot as plt
import blue_utils as butil

#Reset
try:
    sess.close()
except: pass
rnd = np.random.RandomState(1)
tf.reset_default_graph()
sess = tf.InteractiveSession()

def get_data(data_split):
    # load a small portion of mnist data
    datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=False)
    print("Data shape : ", datasets.train.dim_data, "/ Label shape : ", datasets.train.dim_target)
    [print("Nb samples : ", d.num_examples) for d in datasets]
    return datasets.train, datasets.validation, datasets.test

#Model
# FC : reshape = True
def g_logits(x, y, name='model'):
    with tf.variable_scope(name):
        h1 = layers.fully_connected(x, 300)
        logits = layers.fully_connected(h1, int(y.shape[1]))
    return logits

#### Hyper-parameters ####
n_hyper_iterations = 500
T = 20  # Number of inner iterations
rev_it = 10
hp_lr = 1.e-3
##########################

#MNIST
#x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x')
#y = tf.placeholder(tf.float32, shape=(None, 10), name='y')
#logits = g_logits(x, y)

#CNN : reshape = False
x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x')
y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y')

logits = butil.cnn(x, y)

train_set, validation_set, test_set = get_data(data_split=(.05, .05,))

butil.viz_data(train_set)
print('Data sampled !')

# lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples))
#lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, .1), 1.e-7))
#mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, .99), 1.e-5))
#rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.01), 0.))
lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, 1e-4), 1e-4))
mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, 0.9), 0.9))
rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.00001), 0.00001))
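# NOTE: these constraints clip each hyperparameter to a single value (min == max),
# effectively freezing lr, mu and rho at their initial values; the commented-out
# lines above are the unconstrained-range versions.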

ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
L = tf.reduce_mean(ce) + rho*tf.add_n([tf.reduce_sum(w**2) for w in tf.trainable_variables()])  # drop the second term of the loss when the HPs are unused
E = tf.reduce_mean(ce)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32))

inner_optimizer = far.MomentumOptimizer(lr, mu)
outer_optimizer = tf.train.AdamOptimizer(hp_lr)
hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it)
hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer)

train_set_supplier = train_set.create_supplier(x, y, batch_size=256)  # stochastic GD
validation_set_supplier = validation_set.create_supplier(x, y)

his_params = []

tf.global_variables_initializer().run()

for hyt in range(n_hyper_iterations):
    hyper_step(T,
               inner_objective_feed_dicts=train_set_supplier,
               outer_objective_feed_dicts=validation_set_supplier)
    res = sess.run(far.hyperparameters()) + [L.eval(train_set_supplier()),
                                             E.eval(validation_set_supplier()),
                                             accuracy.eval(train_set_supplier()),
                                             accuracy.eval(validation_set_supplier())]
    his_params.append(res)

    print('Hyper-it :', hyt, '/', n_hyper_iterations)
    print('inner:', L.eval(train_set_supplier()))
    print('outer:', E.eval(validation_set_supplier()))
    print('training accuracy:', res[5])
    print('validation accuracy:', res[6])
    #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval())
    print('-'*50)

test_set_supplier = test_set.create_supplier(x, y)
print('Test accuracy:', accuracy.eval(test_set_supplier()))

fig, ax = plt.subplots(ncols=4, figsize=(15, 3))
ax[0].set_title('Learning rate')
ax[0].plot([e[0] for e in his_params])

ax[1].set_title('Momentum factor')
ax[1].plot([e[1] for e in his_params])

#ax[2].set_title('L2 regulariz.')
#ax[2].plot([e[2] for e in his_params])
ax[2].set_title('Tr. and val. acc')
ax[2].plot([e[5] for e in his_params])
ax[2].plot([e[6] for e in his_params])

ax[3].set_title('Tr. and val. errors')
ax[3].plot([e[3] for e in his_params])
ax[3].plot([e[4] for e in his_params])

plt.savefig('res_cnn_H{}_I{}'.format(n_hyper_iterations,T))
FAR-HO/test_cnn_aug.py (new file, +141)
@@ -0,0 +1,141 @@
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import far_ho as far
import far_ho.examples as far_ex

tf.logging.set_verbosity(tf.logging.ERROR)

import matplotlib.pyplot as plt
import blue_utils as butil

#Reset
try:
    sess.close()
except: pass
rnd = np.random.RandomState(1)
tf.reset_default_graph()
sess = tf.InteractiveSession()

def get_data(data_split):
    # load a small portion of mnist data
    datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=False)
    print("Data shape : ", datasets.train.dim_data, "/ Label shape : ", datasets.train.dim_target)
    [print("Nb samples : ", d.num_examples) for d in datasets]
    return datasets.train, datasets.validation, datasets.test

#Model
# FC : reshape = True
def g_logits(x, y, name='model'):
    with tf.variable_scope(name):
        h1 = layers.fully_connected(x, 300)
        logits = layers.fully_connected(h1, int(y.shape[1]))
    return logits

#### Hyper-parameters ####
n_hyper_iterations = 10
T = 10  # Number of inner iterations
rev_it = 10
hp_lr = 0.02
##########################

#MNIST
#x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x')
#y = tf.placeholder(tf.float32, shape=(None, 10), name='y')
#logits = g_logits(x, y)

#CNN : reshape = False
x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x')
y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y')

logits = butil.cnn(x, y)

train_set, validation_set, test_set = get_data(data_split=(.1, .1,))

probX = far.get_hyperparameter('probX', initializer=0.1, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 0.9))
probY = far.get_hyperparameter('probY', initializer=0.1, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 0.9))

#lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, 1e-4), 1e-4))
#mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, 0.9), 0.9))

#probX, probY = 0.5, 0.5
#policy = [('TranslateX', probX, 8), ('TranslateY', probY, 8)]
policy = [('TranslateX', probX, 8), ('FlipUD', probY, 8)]
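# probX and probY are FAR-HO hyperparameter tensors used as the `probability`
# field of the policy tuples; augmentation_transforms.apply_policy() reads them
# with probability.eval(), which is presumably how the augmentation probabilities
# end up being tuned by the outer optimization.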
print('Hyp :', far.utils.hyperparameters(scope=None))

#butil.viz_data(train_set, aug_policy= policy)
#print('Data sampled !')

# Artificially add the transformations to the loss just so that they are taken into account in the dynamics of the graph
probX_loss = tf.sigmoid(probX)
probY_loss = tf.sigmoid(probY)

ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
L = tf.reduce_mean(probX_loss*probY_loss*ce)
E = tf.reduce_mean(ce)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32))

inner_optimizer = far.AdamOptimizer()
outer_optimizer = tf.train.AdamOptimizer(hp_lr)
hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it)
hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer)

train_set_supplier = train_set.create_supplier(x, y, batch_size=256, aug_policy=policy)  # stochastic GD
validation_set_supplier = validation_set.create_supplier(x, y)

#print(train_set.dim_data, validation_set.dim_data)

his_params = []

tf.global_variables_initializer().run()

butil.viz_data(train_set, fig_name='Start_sample', aug_policy=policy)
print('Data sampled !')

for hyt in range(n_hyper_iterations):
    hyper_step(T,
               inner_objective_feed_dicts=train_set_supplier,
               outer_objective_feed_dicts=validation_set_supplier,
               _skip_hyper_ts=True)
    res = sess.run(far.hyperparameters()) + [L.eval(train_set_supplier()),
                                             E.eval(validation_set_supplier()),
                                             accuracy.eval(train_set_supplier()),
                                             accuracy.eval(validation_set_supplier())]
    his_params.append(res)

    butil.viz_data(train_set, fig_name='Train_sample_{}'.format(hyt), aug_policy=policy)
    print('Data sampled !')

    print('Hyper-it :', hyt, '/', n_hyper_iterations)
    print('inner:', L.eval(train_set_supplier()))
    print('outer:', E.eval(validation_set_supplier()))
    print('training accuracy:', res[4])
    print('validation accuracy:', res[5])
    print('Transformation : ProbX -', res[0], '/ProbY -', res[1])
    #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval())
    print('-'*50)

test_set_supplier = test_set.create_supplier(x, y)
print('Test accuracy:', accuracy.eval(test_set_supplier()))

fig, ax = plt.subplots(ncols=4, figsize=(15, 3))
ax[0].set_title('ProbX')
ax[0].plot([e[0] for e in his_params])

ax[1].set_title('ProbY')
ax[1].plot([e[1] for e in his_params])

ax[2].set_title('Tr. and val. errors')
ax[2].plot([e[2] for e in his_params])
ax[2].plot([e[3] for e in his_params])

ax[3].set_title('Tr. and val. acc')
ax[3].plot([e[4] for e in his_params])
ax[3].plot([e[5] for e in his_params])

plt.savefig('res_cnn_aug_H{}_I{}'.format(n_hyper_iterations,T))
FAR-HO/test_fc.py (new file, +133)
@@ -0,0 +1,133 @@
#https://github.com/lucfra/FAR-HO/blob/master/far_ho/examples/autoMLDemos/Far-HO%20Demo%2C%20AutoML%202018%2C%20ICML%20workshop.ipynb
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import far_ho as far
import far_ho.examples as far_ex

tf.logging.set_verbosity(tf.logging.ERROR)

import matplotlib.pyplot as plt
#import blue_utils as butil

#Reset
try:
    sess.close()
except: pass
rnd = np.random.RandomState(1)
tf.reset_default_graph()
sess = tf.InteractiveSession()

def get_data(data_split):
    # load a small portion of mnist data
    datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=True)
    print("Data shape : ", datasets.train.dim_data, " / Label shape : ", datasets.train.dim_target)
    [print("Nb samples : ", d.num_examples) for d in datasets]
    return datasets.train, datasets.validation, datasets.test

#Model
# FC : reshape = True
def g_logits(x, y, name='model'):
    with tf.variable_scope(name):
        h1 = layers.fully_connected(x, 300)
        logits = layers.fully_connected(h1, int(y.shape[1]))
    return logits

#### Hyper-parameters ####
n_hyper_iterations = 90
T = 20  # Number of inner iterations
rev_it = 10
hp_lr = 0.1
epochs = 10
batch_size = 256
##########################

#MNIST
x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x')
y = tf.placeholder(tf.float32, shape=(None, 10), name='y')
logits = g_logits(x, y)

#CNN : reshape = False
#x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x')
#y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y')

#logits = butil.cnn(x,y)

train_set, validation_set, test_set = get_data(data_split=(.6, .3,))

#butil.viz_data(train_set)

# lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples))
lr = far.get_hyperparameter('lr', initializer=1e-2, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 1.e-7))
mu = far.get_hyperparameter('mu', initializer=0.95, constraint=lambda t: tf.maximum(tf.minimum(t, .99), 1.e-5))
#rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.01), 0.))


ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
L = tf.reduce_mean(ce)  #+ rho*tf.add_n([tf.reduce_sum(w**2) for w in tf.trainable_variables()])  # drop the second term of the loss when the HPs are unused
E = tf.reduce_mean(ce)

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32))

inner_optimizer = far.MomentumOptimizer(lr, mu)
#inner_optimizer = far.GradientDescentOptimizer(lr)
outer_optimizer = tf.train.AdamOptimizer(hp_lr)
hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it)
hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer)#, global_step=tf.train.get_or_create_step())

train_set_supplier = train_set.create_supplier(x, y, batch_size=batch_size)#, epochs=1) # stochastic GD
validation_set_supplier = validation_set.create_supplier(x, y)


print('Hyper iterations per epochs', int(train_set.num_examples/batch_size*epochs/T))

his_params = []

tf.global_variables_initializer().run()

for hyt in range(n_hyper_iterations):
    hyper_step(T,
               inner_objective_feed_dicts=train_set_supplier,
               outer_objective_feed_dicts=validation_set_supplier,
               _skip_hyper_ts=False)
    res = sess.run(far.hyperparameters()) + [0, L.eval(train_set_supplier()),
                                             E.eval(validation_set_supplier()),
                                             accuracy.eval(train_set_supplier()),
                                             accuracy.eval(validation_set_supplier())]

    his_params.append(res)

    print('Hyper-it :', hyt, '/', n_hyper_iterations)
    print('inner:', res[3])
    print('outer:', res[4])
    print('training accuracy:', res[5])
    print('validation accuracy:', res[6])
    #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval())
    print('-'*50)

test_set_supplier = test_set.create_supplier(x, y)
print('Test accuracy:', accuracy.eval(test_set_supplier()))

fig, ax = plt.subplots(ncols=4, figsize=(15, 3))
ax[0].set_title('Learning rate')
ax[0].plot([e[0] for e in his_params])

ax[1].set_title('Momentum factor')
ax[1].plot([e[1] for e in his_params])

#ax[2].set_title('L2 regulariz.')
#ax[2].plot([e[2] for e in his_params])
ax[2].set_title('Tr. and val. acc')
ax[2].plot([e[5] for e in his_params])
ax[2].plot([e[6] for e in his_params])

ax[3].set_title('Tr. and val. errors')
ax[3].plot([e[3] for e in his_params])
ax[3].plot([e[4] for e in his_params])

plt.savefig('resultats/res_fc_H{}_I{}'.format(n_hyper_iterations,T))
#plt.savefig('resultats/res_fc_H{}_I{}_noHyp'.format(n_hyper_iterations,T))
Gradient-Descent-The-Ultimate-Optimizer/.gitignore (vendored, new file, +5)
@@ -0,0 +1,5 @@
venv/
__pycache__
data/
log/
.vscode/
Gradient-Descent-The-Ultimate-Optimizer/20190929-paper.pdf (new binary file)
Gradient-Descent-The-Ultimate-Optimizer/README.md (new file, +33)
@@ -0,0 +1,33 @@
# Gradient Descent: The Ultimate Optimizer

[Code style: black](https://github.com/ambv/black)

| ⚠️ WARNING: THIS IS NOT MY WORK ⚠️ |
| --- |

This repository contains the paper and the code for the paper [Gradient Descent:
The Ultimate Optimizer](https://arxiv.org/abs/1909.13371).

I couldn't find the code (which appears in the appendix at the end of the
paper) anywhere on the web. What I present here is the code of the paper with
instructions on how to set it up.

Getting the code into a runnable state required some fixes on my part, so the
code may differ slightly from what is presented in the paper.

## Set up

```sh
git clone https://github.com/Rainymood/Gradient-Descent-The-Ultimate-Optimizer
cd Gradient-Descent-The-Ultimate-Optimizer
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```

When you are done you can exit the virtualenv with

```shell
deactivate
```
Gradient-Descent-The-Ultimate-Optimizer/data_aug.py (new file, +244)
@@ -0,0 +1,244 @@
from hyperopt import *
#from hyperopt_v2 import *

import torchvision.transforms.functional as TF
import torchvision.transforms as T

#from scipy import ndimage
import kornia

import random
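# NOTE: torch, nn, F and math are not imported explicitly here; they are
# expected to be re-exported by the star import from `hyperopt` above.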


class MNIST_FullyConnected_Augmented(Optimizable):
    """
    A fully-connected NN for the MNIST task. This is Optimizable but not itself
    an optimizer.
    """

    def __init__(self, num_inp, num_hid, num_out, optimizer, device=torch.device('cuda')):
        self.device = device
        #print(self.device)
        parameters = {
            "w1": torch.zeros(num_inp, num_hid, device=self.device).t(),
            "b1": torch.zeros(num_hid, device=self.device).t(),
            "w2": torch.zeros(num_hid, num_out, device=self.device).t(),
            "b2": torch.zeros(num_out, device=self.device).t(),

            #Data augmentation
            "prob": torch.tensor(0.5, device=self.device),
            "mag": torch.tensor(180.0, device=self.device),
        }
        super().__init__(parameters, optimizer)

    def initialize(self):
        nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
        self.optimizer.initialize()
        #print(self.device)

    def forward(self, x):
        """Compute a prediction."""
        #print("Prob:", self.parameters["prob"].item())
        if random.random() < self.parameters["prob"]:
            #angle = 45
            #x = TF.rotate(x, angle)
            #print(self.device)
            #x = F.linear(x, torch.ones(28*28, 28*28, device=self.device).t()*self.parameters["mag"], bias=None)
            x = x + self.parameters["mag"]
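            # This appears to stand in for a real augmentation: unlike the
            # commented-out TF.rotate call, an additive intensity shift by `mag`
            # stays differentiable w.r.t. the augmentation hyperparameter.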

        x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
        x = torch.tanh(x)
        x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)
        return x

    def adjust(self):
        self.optimizer.adjust(self.parameters)

    def __str__(self):
        return "mnist_FC_augmented / " + str(self.optimizer)


class LeNet(Optimizable, nn.Module):
    def __init__(self, num_inp, num_out, optimizer, device=torch.device('cuda')):
        nn.Module.__init__(self)
        self.device = device
        parameters = {
            "w1": torch.zeros(20, num_inp, 5, 5, device=self.device),
            "b1": torch.zeros(20, device=self.device),
            "w2": torch.zeros(50, 20, 5, 5, device=self.device),
            "b2": torch.zeros(50, device=self.device),
            "w3": torch.zeros(500, 4*4*50, device=self.device),
            "b3": torch.zeros(500, device=self.device),
            "w4": torch.zeros(10, 500, device=self.device),
            "b4": torch.zeros(10, device=self.device),

            #Data augmentation
            "prob": torch.tensor(1.0, device=self.device),
            "mag": torch.tensor(180.0, device=self.device),
        }
        super().__init__(parameters, optimizer)

    def initialize(self):
        nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w3"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w4"], a=math.sqrt(5))
        self.optimizer.initialize()

    def forward(self, x):

        if random.random() < self.parameters["prob"]:

            batch_size = x.shape[0]
            # create transformation (rotation)
            alpha = self.parameters["mag"]  # in degrees
            angle = torch.ones(batch_size, device=self.device) * alpha

            # define the rotation center
            center = torch.ones(batch_size, 2, device=self.device)
            center[..., 0] = x.shape[3] / 2  # x
            center[..., 1] = x.shape[2] / 2  # y

            #print(x.shape, center)
            # define the scale factor
            scale = torch.ones(batch_size, device=self.device)

            # compute the transformation matrix
            M = kornia.get_rotation_matrix2d(center, angle, scale)

            # apply the transformation to original image
            x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3]))  # dsize=(h, w)
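            # kornia's warp_affine is differentiable, so gradients can flow back
            # through the rotation to the `mag` hyperparameter.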
|
||||
|
||||
#print("Start Shape ", x.shape)
|
||||
out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.max_pool2d(out, 2)
|
||||
#print("Shape ", out.shape)
|
||||
out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.max_pool2d(out, 2)
|
||||
#print("Shape ", out.shape)
|
||||
out = out.view(out.size(0), -1)
|
||||
#print("Shape ", out.shape)
|
||||
out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
|
||||
#print("Shape ", out.shape)
|
||||
return F.log_softmax(out, dim=1)
|
||||
|
||||
def adjust(self):
|
||||
self.optimizer.adjust(self.parameters)
|
||||
|
||||
def __str__(self):
|
||||
return "mnist_CNN_augmented / " + str(self.optimizer)
|
||||
|
||||
class LeNet_v2(Optimizable, nn.Module):
    def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')):

        nn.Module.__init__(self)
        self.device = device
        self.conv1 = nn.Conv2d(num_inp, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        #self.fc1 = nn.Linear(4*4*50, 500)
        self.fc1 = nn.Linear(1250, 500)
        self.fc2 = nn.Linear(500, 10)

        #print(self.conv1.weight)
        parameters = {
            "w1": self.conv1.weight,
            "b1": self.conv1.bias,
            "w2": self.conv2.weight,
            "b2": self.conv2.bias,
            "w3": self.fc1.weight,
            "b3": self.fc1.bias,
            "w4": self.fc2.weight,
            "b4": self.fc2.bias,

            #Data augmentation
            "prob": torch.tensor(0.5, device=self.device),
            "mag": torch.tensor(1.0, device=self.device),
        }
        Optimizable.__init__(self, parameters, optimizer)

    '''
    def forward(self, x): #Saturates the memory ???
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        #x = x.view(-1, 4*4*50)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
    '''
    def forward(self, x):

        if random.random() < self.parameters["prob"].item():
            #print(self.parameters["prob"])
            #x = [T.ToTensor()(
            #    TF.affine(img=T.ToPILImage()(im), angle=self.parameters["mag"], translate=(0,0), scale=1, shear=0, resample=0, fillcolor=None))
            #    for im in torch.unbind(x,dim=0)]
            #x = torch.stack(x,dim=0)

            #x = [ndimage.rotate(im, self.parameters["mag"], reshape=False)
            #    for im in torch.unbind(x,dim=0)]
            #x = torch.stack(x,dim=0)

            #x = [im + self.parameters["mag"]
            #    for im in torch.unbind(x,dim=0)]
            #x = torch.stack(x,dim=0)

            batch_size = x.shape[0]
            # create transformation (rotation)
            alpha = self.parameters["mag"] * 180 # in degrees
            angle = torch.ones(batch_size, device=self.device) * alpha

            # define the rotation center
            center = torch.ones(batch_size, 2, device=self.device)
            center[..., 0] = x.shape[3] / 2 # x
            center[..., 1] = x.shape[2] / 2 # y

            #print(x.shape, center)
            # define the scale factor
            scale = torch.ones(batch_size, device=self.device)

            # compute the transformation matrix
            M = kornia.get_rotation_matrix2d(center, angle, scale)

            # apply the transformation to original image
            x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)

        #print("Start Shape ", x.shape)
        out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
        #print("Shape ", out.shape)
        out = F.max_pool2d(out, 2)
        #print("Shape ", out.shape)
        out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
        #print("Shape ", out.shape)
        out = F.max_pool2d(out, 2)
        #print("Shape ", out.shape)
        out = out.view(out.size(0), -1)
        #print("Shape ", out.shape)
        out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
        #print("Shape ", out.shape)
        out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
        #print("Shape ", out.shape)
        return F.log_softmax(out, dim=1)

    def initialize(self):
        self.optimizer.initialize()

    def adjust(self):
        self.optimizer.adjust(self.parameters)

    def adjust_val(self):
        self.optimizer.adjust_val(self.parameters)

    def eval(self):
        self.parameters['prob'] = torch.tensor(0.0, device=self.device)

    def __str__(self):
        return "mnist_CNN_augmented / " + str(self.optimizer)

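# Added illustrative sketch (not part of the original file): the kornia-based rotation
# used in the forward() methods above, shown in isolation. It assumes kornia is imported
# at the top of this module, as the calls above already require; `dummy_batch` and
# `degrees` are placeholder names introduced only for this sketch.
def _rotation_sketch(dummy_batch, degrees=30.0):
    batch_size = dummy_batch.shape[0]
    angle = torch.ones(batch_size, device=dummy_batch.device) * degrees  # one angle per image
    center = torch.ones(batch_size, 2, device=dummy_batch.device)        # rotate around the image center
    center[..., 0] = dummy_batch.shape[3] / 2  # x
    center[..., 1] = dummy_batch.shape[2] / 2  # y
    scale = torch.ones(batch_size, device=dummy_batch.device)            # keep the original scale
    M = kornia.get_rotation_matrix2d(center, angle, scale)               # batch of 2x3 affine matrices
    return kornia.warp_affine(dummy_batch, M, dsize=(dummy_batch.shape[2], dummy_batch.shape[3]))
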
52
Gradient-Descent-The-Ultimate-Optimizer/dataset_aug.py
Normal file
@ -0,0 +1,52 @@
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF

class MNIST_aug(Dataset):

    training_file = 'training.pt'
    test_file = 'test.pt'
    classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
               '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']

    def __init__(self):
        self.images = [TF.to_pil_image(x) for x in torch.ByteTensor(10, 3, 48, 48)]
        self.set_stage(0) # initial stage

    def __getitem__(self, index):
        image = self.images[index]

        # Just apply your transformations here
        image = self.crop(image)
        x = TF.to_tensor(image)
        return x

    def set_stage(self, stage):
        if stage == 0:
            print('Using (32, 32) crops')
            self.crop = transforms.RandomCrop((32, 32))
        elif stage == 1:
            print('Using (28, 28) crops')
            self.crop = transforms.RandomCrop((28, 28))

    def __len__(self):
        return len(self.images)


dataset = MNIST_aug() #was MyData(), which is not defined in this file
loader = DataLoader(dataset,
                    batch_size=2,
                    num_workers=2,
                    shuffle=True)

for batch_idx, data in enumerate(loader):
    print('Batch idx {}, data shape {}'.format(
        batch_idx, data.shape))

loader.dataset.set_stage(1)

for batch_idx, data in enumerate(loader):
    print('Batch idx {}, data shape {}'.format(
        batch_idx, data.shape))
150
Gradient-Descent-The-Ultimate-Optimizer/dataset_aug_v2.py
Normal file
@ -0,0 +1,150 @@
#from hyperopt import *
from hyperopt_v2 import *

import torchvision.transforms.functional as TF
import torchvision.transforms as T

#from scipy import ndimage
import kornia

import random


class LeNet_v3(nn.Module):
    def __init__(self, num_inp, num_out):
        super(LeNet_v3, self).__init__()
        self.params = nn.ParameterDict({
            'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)),
            'b1': nn.Parameter(torch.zeros(20)),
            'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)),
            'b2': nn.Parameter(torch.zeros(50)),
            'w3': nn.Parameter(torch.zeros(500, 4*4*50)),
            'b3': nn.Parameter(torch.zeros(500)),
            'w4': nn.Parameter(torch.zeros(10, 500)),
            'b4': nn.Parameter(torch.zeros(10))
        })


    def initialize(self):
        nn.init.kaiming_uniform_(self.params["w1"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.params["w2"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.params["w3"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.params["w4"], a=math.sqrt(5))

    def forward(self, x):
        #print("Start Shape ", x.shape)
        out = F.relu(F.conv2d(input=x, weight=self.params["w1"], bias=self.params["b1"]))
        #print("Shape ", out.shape)
        out = F.max_pool2d(out, 2)
        #print("Shape ", out.shape)
        out = F.relu(F.conv2d(input=out, weight=self.params["w2"], bias=self.params["b2"]))
        #print("Shape ", out.shape)
        out = F.max_pool2d(out, 2)
        #print("Shape ", out.shape)
        out = out.view(out.size(0), -1)
        #print("Shape ", out.shape)
        out = F.relu(F.linear(out, self.params["w3"], self.params["b3"]))
        #print("Shape ", out.shape)
        out = F.linear(out, self.params["w4"], self.params["b4"])
        #print("Shape ", out.shape)
        return F.log_softmax(out, dim=1)


    def print_grad_fn(self):
        for n, p in self.params.items():
            print(n, p.grad_fn)

    def __str__(self):
        return "mnist_CNN_augmented / "

class Data_aug(nn.Module):
    def __init__(self):
        super(Data_aug, self).__init__()
        self.data_augmentation = True
        self.params = nn.ParameterDict({
            "prob": nn.Parameter(torch.tensor(0.5)),
            "mag": nn.Parameter(torch.tensor(180.0))
        })

        #self.params["mag"].register_hook(print)

    def forward(self, x):

        if self.data_augmentation and self.training and random.random() < self.params["prob"]:
            #print('Aug')
            batch_size = x.shape[0]
            # create transformation (rotation)
            alpha = self.params["mag"] # in degrees
            angle = torch.ones(batch_size, device=x.device) * alpha

            # define the rotation center
            center = torch.ones(batch_size, 2, device=x.device)
            center[..., 0] = x.shape[3] / 2 # x
            center[..., 1] = x.shape[2] / 2 # y

            #print(x.shape, center)
            # define the scale factor
            scale = torch.ones(batch_size, device=x.device)

            # compute the transformation matrix
            M = kornia.get_rotation_matrix2d(center, angle, scale)

            # apply the transformation to original image
            x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)

        return x

    def eval(self):
        #was: self.params['prob'] = torch.tensor(0.0, device=self.device); ParameterDict entries
        #must stay Parameters and Data_aug never defines a device attribute
        self.params['prob'].data = torch.tensor(0.0, device=self.params['prob'].device)
        nn.Module.eval(self)

    def data_augmentation(self, mode=True): #NOTE: shadowed by the boolean attribute of the same name set in __init__
        self.data_augmentation = mode

    def print_grad_fn(self):
        for n, p in self.params.items():
            print(n, p.grad_fn)

    def __str__(self):
        return "Data_Augmenter / "

class Augmented_model(nn.Module):
    def __init__(self, model, data_augmenter):
        #self.model = model
        #self.data_aug = data_augmenter
        super(Augmented_model, self).__init__() #nn.Module.__init__(self)
        #super().__init__()
        self.mods = nn.ModuleDict({
            'data_aug': data_augmenter,
            'model': model
        })
        #for name, param in self.mods.named_parameters():
        #    print(name, type(param.data), param.size())

        #params = self.mods.named_parameters() #self.parameters()
        #parameters = [param for param in self.model.parameters()] + [param for param in self.data_aug.parameters()]
        #Optimizable.__init__(self, params, optimizer)

    def initialize(self):
        self.mods['model'].initialize()

    def forward(self, x):
        return self.mods['model'](self.mods['data_aug'](x))

    #def adjust(self):
    #    self.optimizer.adjust(self) #parameters of the dicts

    def data_augmentation(self, mode=True):
        self.mods['data_aug'].data_augmentation = mode

    def begin(self):
        for param in self.parameters():
            param.requires_grad_() # keep gradient information…
            param.retain_grad() # even if not a leaf…

    def print_grad_fn(self):
        for n, m in self.mods.items():
            m.print_grad_fn()

    def __str__(self):
        return str(self.mods['data_aug']) + str(self.mods['model']) # + str(self.optimizer)
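
# Added usage sketch (illustrative, not part of the original file): how the classes above
# compose, mirroring the construction sketched in tests.py. The dummy MNIST-shaped batch
# and the CPU tensors are assumptions made only for this sketch.
def _augmented_model_sketch():
    m = LeNet_v3(1, 10)
    a = Data_aug()
    model = Augmented_model(model=m, data_augmenter=a)
    model.initialize()               # Kaiming-init the LeNet_v3 weights
    model.begin()                    # keep gradients on all parameters, including prob/mag
    x = torch.zeros(4, 1, 28, 28)    # dummy batch: 4 grayscale 28x28 images
    out = model(x)                   # Data_aug (possibly) rotates, then LeNet_v3 classifies
    return out.shape                 # torch.Size([4, 10])
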
5
Gradient-Descent-The-Ultimate-Optimizer/graph/graph
Normal file
@ -0,0 +1,5 @@
digraph {
graph [size="12,12"]
node [align=left fontsize=12 height=0.2 ranksep=0.1 shape=box style=filled]
94296775052080 [label=NoneType fillcolor=darkolivegreen1]
}
19
Gradient-Descent-The-Ultimate-Optimizer/graph/graph.svg
Normal file
@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
 "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
 -->
<!-- Title: %3 Pages: 1 -->
<svg width="75pt" height="30pt"
 viewBox="0.00 0.00 74.65 30.40" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 26.4)">
<title>%3</title>
<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-26.4 70.6472,-26.4 70.6472,4 -4,4"/>
<!-- 94296775052080 -->
<g id="node1" class="node">
<title>94296775052080</title>
<polygon fill="#caff70" stroke="#000000" points="66.4717,-22.6036 .1755,-22.6036 .1755,.2036 66.4717,.2036 66.4717,-22.6036"/>
<text text-anchor="middle" x="33.3236" y="-7.6" font-family="Times,serif" font-size="12.00" fill="#000000">NoneType</text>
</g>
</g>
</svg>
345
Gradient-Descent-The-Ultimate-Optimizer/hyperopt.py
Normal file
@ -0,0 +1,345 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Optimizable():#nn.Module):
    """
    This is the interface for anything that has parameters that need to be
    optimized, somewhat like torch.nn.Module but with the right plumbing for
    hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
    interface which does not give us enough control over the detachments.)
    Nominal operation of an Optimizable at the lowest level is as follows:
        o = MyOptimizable(…)
        o.initialize()
        loop {
            o.begin()
            o.zero_grad()
            loss = –compute loss function from parameters–
            loss.backward()
            o.adjust()
        }
    Optimizables recursively handle updates to their optimiz*ers*.
    """
    #def __init__(self):
    #    super(Optimizable, self).__init__()
    #    self.parameters = nn.Parameter(torch.zeros(()))

    def __init__(self, parameters, optimizer):
        #super(Optimizable, self).__init__()
        self.parameters = parameters # a dict mapping names to tensors
        self.optimizer = optimizer # which must itself be Optimizable!
        self.all_params_with_gradients = []
        #self.device = device

    def initialize(self):
        """Initialize parameters, e.g. with a Kaiming initializer."""
        pass

    def begin(self):
        """Enable gradient tracking on current parameters."""
        self.all_params_with_gradients = [] #reinitialize to avoid overloading the memory
        for name, param in self.parameters.items():
        #for param in self.parameters:
            param.requires_grad_() # keep gradient information…
            param.retain_grad() # even if not a leaf…
            #param.to(self.device)
            #if param.device == torch.device('cuda:0'):
            #    print(name, param.device)
            self.all_params_with_gradients.append(param)
        self.optimizer.begin()

    def zero_grad(self):
        """ Set all gradients to zero. """
        for param in self.all_params_with_gradients:
            #param = param.to(self.device)
            param.grad = torch.zeros(param.shape, device=param.device)
        self.optimizer.zero_grad()

    """ Note: at this point you would probably call .backward() on the loss
    function. """

    def adjust(self):
        """ Update parameters """
        pass


    def print_grad_fn(self):
        self.optimizer.print_grad_fn()
        for n, p in self.parameters.items():
            print(n, " - ", p.grad_fn)

    def param_grad(self):
        return self.all_params_with_gradients

    def param(self, param_name):
        return self.parameters[param_name].item()

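# Added usage sketch (illustrative, not part of the original file): this just writes out
# the "nominal operation" loop from the Optimizable docstring above, using the
# MNIST_FullyConnected and Adam classes defined further down (names are resolved at call
# time). The data-loader argument and the CPU device are assumptions of this sketch;
# main.py contains the project's actual training loop.
def _nominal_loop_sketch(dl_train):
    cpu = torch.device('cpu')
    opt = Adam(device=cpu, optimizer=Adam(device=cpu)) # hyperoptimize Adam with another Adam
    model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
    model.initialize()
    for features_, labels_ in dl_train:
        features = torch.reshape(features_, (features_.shape[0], 28 * 28))
        model.begin()                        # re-enable gradient tracking on the current tensors
        model.zero_grad()
        loss = F.nll_loss(model.forward(features), labels_)
        loss.backward(create_graph=True)     # keep the graph so the hyperparameters also get gradients
        model.adjust()                       # weight step, then (recursively) hyperparameter step
    return model
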
class MNIST_FullyConnected(Optimizable):
    """
    A fully-connected NN for the MNIST task. This is Optimizable but not itself
    an optimizer.
    """

    def __init__(self, num_inp, num_hid, num_out, optimizer):
        parameters = {
            "w1": torch.zeros(num_inp, num_hid).t(),
            "b1": torch.zeros(num_hid).t(),
            "w2": torch.zeros(num_hid, num_out).t(),
            "b2": torch.zeros(num_out).t(),
        }
        super().__init__(parameters, optimizer)

    def initialize(self):
        nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
        self.optimizer.initialize()

    def forward(self, x):
        """Compute a prediction."""
        x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
        x = torch.tanh(x)
        x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)
        return x

    def adjust(self):
        self.optimizer.adjust(self.parameters)

    def __str__(self):
        return "mnist / " + str(self.optimizer)

class NoOpOptimizer(Optimizable):#, nn.Module):
    """
    NoOpOptimizer sits on top of a stack, and does not affect what lies below.
    """

    def __init__(self):
        #super(Optimizable, self).__init__()
        pass

    def initialize(self):
        pass

    def begin(self):
        pass

    def zero_grad(self):
        pass

    def adjust(self, params):
        pass

    def adjust_val(self, params):
        pass

    def print_grad_fn(self):
        pass

    def __str__(self):
        return "static"

class Adam(Optimizable):
    """
    A fully hyperoptimizable Adam optimizer
    """

    def clamp(x):
        return (x.tanh() + 1.0) / 2.0

    def unclamp(y):
        z = y * 2.0 - 1.0
        return ((1.0 + z) / (1.0 - z)).log() / 2.0

    def __init__(
        self,
        alpha=0.001,
        beta1=0.9,
        beta2=0.999,
        log_eps=-8.0,
        optimizer=NoOpOptimizer(),
        device = torch.device('cuda')
    ):
        self.device = device
        parameters = {
            "alpha": torch.tensor(alpha, device=self.device),
            "beta1": Adam.unclamp(torch.tensor(beta1, device=self.device)),
            "beta2": Adam.unclamp(torch.tensor(beta2, device=self.device)),
            "log_eps": torch.tensor(log_eps, device=self.device),
        }
        super().__init__(parameters, optimizer)
        self.num_adjustments = 0
        self.num_adjustments_val = 0
        self.cache = {}

        for name, param in parameters.items():
            param.requires_grad_() # keep gradient information…
            param.retain_grad() # even if not a leaf…
            #param.to(self.device)
            #if param.device == torch.device('cuda:0'):
            #    print(name, param.device)

    def adjust(self, params): #update the learning (model) parameters
        self.num_adjustments += 1
        self.optimizer.adjust(self.parameters)
        #print('Adam update')
        t = self.num_adjustments
        beta1 = Adam.clamp(self.parameters["beta1"])
        beta2 = Adam.clamp(self.parameters["beta2"])
        for name, param in params.items():
            if name == "mag": continue
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape, device=self.device),
                    "v": torch.zeros(param.shape, device=self.device)
                    + 10.0 ** self.parameters["log_eps"].data
                    # NOTE that we add a little 'fudge factor' here because sqrt is not
                    # differentiable at exactly zero
                }
            #print(name, param.device)
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
            params[name] = param.detach() - self.parameters["alpha"] * dparam
            #print(name)

    def adjust_val(self, params): #update the transformation (augmentation) parameters
        self.num_adjustments_val += 1
        self.optimizer.adjust_val(self.parameters)
        #print('Adam update')
        t = self.num_adjustments_val
        beta1 = Adam.clamp(self.parameters["beta1"])
        beta2 = Adam.clamp(self.parameters["beta2"])
        for name, param in params.items():
            if name != "mag": continue
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape, device=self.device),
                    "v": torch.zeros(param.shape, device=self.device)
                    + 10.0 ** self.parameters["log_eps"].data
                    # NOTE that we add a little 'fudge factor' here because sqrt is not
                    # differentiable at exactly zero
                }
            #print(name, param.device)
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
            params[name] = param.detach() - self.parameters["alpha"] * dparam
            #print(name)

    def __str__(self):
        return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)

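# Added note (illustrative, not part of the original file): clamp maps an unconstrained
# real onto (0, 1), since (tanh(x) + 1) / 2 == sigmoid(2x), and unclamp is its inverse,
# artanh(2y - 1). Adam therefore stores beta1/beta2 unclamped and re-clamps them inside
# adjust()/adjust_val(), so they can be optimized freely while always staying valid
# momentum factors in (0, 1).
def _clamp_round_trip_sketch(beta=0.9):
    """Minimal sanity sketch: unclamp followed by clamp recovers the original value."""
    b = torch.tensor(beta)
    return torch.allclose(Adam.clamp(Adam.unclamp(b)), b)
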
'''
|
||||
class SGD(Optimizable):
|
||||
"""
|
||||
A hyperoptimizable SGD
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()):
|
||||
parameters = {"alpha": torch.tensor(alpha)}
|
||||
super().__init__(parameters, optimizer)
|
||||
|
||||
def adjust(self, params):
|
||||
self.optimizer.adjust(self.parameters)
|
||||
for name, param in params.items():
|
||||
g = param.grad.detach()
|
||||
params[name] = param.detach() - g * self.parameters["alpha"]
|
||||
|
||||
def __str__(self):
|
||||
return "sgd(%f) / " % self.parameters["alpha"] + str(self.optimizer)
|
||||
|
||||
class SGDPerParam(Optimizable):
|
||||
"""
|
||||
Like above, but can be taught a separate step size for each parameter it
|
||||
tunes.
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=0.01, params=[], optimizer=NoOpOptimizer()):
|
||||
parameters = {name + "_alpha": torch.tensor(alpha) for name in params}
|
||||
super().__init__(parameters, optimizer)
|
||||
|
||||
def adjust(self, params):
|
||||
self.optimizer.adjust(self.parameters)
|
||||
for name, param in params.items():
|
||||
g = param.grad.detach()
|
||||
params[name] = param.detach() - g * self.parameters[name + "_alpha"]
|
||||
|
||||
def __str__(self):
|
||||
return "sgd(%s) / " % str(
|
||||
{k: t.item() for k, t in self.parameters.items()}
|
||||
) + str(self.optimizer)
|
||||
'''
|
||||
'''
|
||||
class AdamBaydin(Optimizable):
|
||||
""" Same as above, but only optimizes the learning rate, treating the
|
||||
remaining hyperparameters as constants. """
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
alpha=0.001,
|
||||
beta1=0.9,
|
||||
beta2=0.999,
|
||||
log_eps=-8.0,
|
||||
optimizer=NoOpOptimizer(),
|
||||
):
|
||||
parameters = {"alpha": torch.tensor(alpha)}
|
||||
self.beta1 = beta1
|
||||
self.beta2 = beta2
|
||||
self.log_eps = log_eps
|
||||
super().__init__(parameters, optimizer)
|
||||
self.num_adjustments = 0
|
||||
self.cache = {}
|
||||
|
||||
def adjust(self, params):
|
||||
self.num_adjustments += 1
|
||||
self.optimizer.adjust(self.parameters)
|
||||
t = self.num_adjustments
|
||||
beta1 = self.beta1
|
||||
beta2 = self.beta2
|
||||
for name, param in params.items():
|
||||
if name not in self.cache:
|
||||
self.cache[name] = {
|
||||
"m": torch.zeros(param.shape),
|
||||
"v": torch.zeros(param.shape) + 10.0 ** self.log_eps,
|
||||
}
|
||||
g = param.grad.detach()
|
||||
self.cache[name]["m"] = m = (
|
||||
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
|
||||
)
|
||||
self.cache[name]["v"] = v = (
|
||||
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
|
||||
)
|
||||
self.all_params_with_gradients.append(m)
|
||||
self.all_params_with_gradients.append(v)
|
||||
m_hat = m / (1.0 - beta1 ** float(t))
|
||||
v_hat = v / (1.0 - beta2 ** float(t))
|
||||
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.log_eps)
|
||||
params[name] = param.detach() - self.parameters["alpha"] * dparam
|
||||
|
||||
def __str__(self):
|
||||
return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
|
||||
'''
|
296
Gradient-Descent-The-Ultimate-Optimizer/hyperopt_v2.py
Normal file
|
@ -0,0 +1,296 @@
|
|||
import math
|
||||
import torch
|
||||
import torchvision
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.optim.optimizer import Optimizer
|
||||
|
||||
class Optimizable():
|
||||
"""
|
||||
This is the interface for anything that has parameters that need to be
|
||||
optimized, somewhat like torch.nn.Model but with the right plumbing for
|
||||
hyperoptimizability. (Specifically, torch.nn.Model uses the Parameter
|
||||
interface which does not give us enough control about the detachments.)
|
||||
Nominal operation of an Optimizable at the lowest level is as follows:
|
||||
o = MyOptimizable(…)
|
||||
o.initialize()
|
||||
loop {
|
||||
o.begin()
|
||||
o.zero_grad()
|
||||
loss = –compute loss function from parameters–
|
||||
loss.backward()
|
||||
o.adjust()
|
||||
}
|
||||
Optimizables recursively handle updates to their optimiz*ers*.
|
||||
"""
|
||||
#def __init__(self):
|
||||
# super(Optimizable, self).__init__()
|
||||
# self.parameters = nn.Parameter(torch.zeros(()))
|
||||
|
||||
def __init__(self, parameters, optimizer):
|
||||
self.params = parameters # a dict mapping names to tensors
|
||||
self.optimizer = optimizer # which must itself be Optimizable!
|
||||
self.all_params_with_gradients = []
|
||||
#self.device = device
|
||||
|
||||
def initialize(self):
|
||||
"""Initialize parameters, e.g. with a Kaiming initializer."""
|
||||
pass
|
||||
|
||||
def begin(self):
|
||||
"""Enable gradient tracking on current parameters."""
|
||||
self.all_params_with_gradients = nn.ParameterList() #reinitialize to avoid overloading the memory
|
||||
print("Opti param :", type(self.params))
|
||||
#for name, param in self.params:
|
||||
if isinstance(self.params,dict): #Dict
|
||||
for name, param in self.params.items(): #iterate over (name, param) pairs, not just the keys
|
||||
param.requires_grad_() # keep gradient information…
|
||||
param.retain_grad() # even if not a leaf…
|
||||
self.all_params_with_gradients.append(param)
|
||||
if isinstance(self.params,list): #List
|
||||
for param in self.params:
|
||||
param.requires_grad_() # keep gradient information…
|
||||
param.retain_grad() # even if not a leaf…
|
||||
self.all_params_with_gradients.append(param)
|
||||
self.optimizer.begin()
|
||||
|
||||
def zero_grad(self):
|
||||
""" Set all gradients to zero. """
|
||||
for param in self.all_params_with_gradients:
|
||||
param.grad = torch.zeros(param.shape, device=param.device)
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
""" Note: at this point you would probably call .backwards() on the loss
|
||||
function. """
|
||||
|
||||
def adjust(self):
|
||||
""" Update parameters """
|
||||
pass
|
||||
|
||||
|
||||
class NoOpOptimizer(Optimizable):#, nn.Module):
|
||||
"""
|
||||
NoOpOptimizer sits on top of a stack, and does not affect what lies below.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
#super(Optimizable, self).__init__()
|
||||
pass
|
||||
|
||||
def initialize(self):
|
||||
pass
|
||||
|
||||
def begin(self):
|
||||
#print("NoOpt begin")
|
||||
pass
|
||||
|
||||
def zero_grad(self):
|
||||
pass
|
||||
|
||||
def adjust(self, params):
|
||||
pass
|
||||
|
||||
def step(self):
|
||||
pass
|
||||
|
||||
def print_grad_fn(self):
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
return "static"
|
||||
|
||||
|
||||
class SGD(Optimizer, nn.Module): #Eviter Optimizer
|
||||
"""
|
||||
A hyperoptimizable SGD
|
||||
"""
|
||||
|
||||
def __init__(self, params, lr=0.01, height=0):
|
||||
self.height=height
|
||||
#params: the parameters to optimize
|
||||
#the remaining (default) values are the optimizer's own hyperparameters
|
||||
print('SGD - H', height)
|
||||
nn.Module.__init__(self)
|
||||
|
||||
optim_keys = ('lr','') #Move into Optimizable? #the '' entry avoids iterating over the characters of a lone string
|
||||
'''
|
||||
self_params = {"lr": torch.tensor(lr),
|
||||
"momentum": 0,
|
||||
"dampening":0,
|
||||
"weight_decay":0,
|
||||
"nesterov": False}
|
||||
'''
|
||||
#self_params = dict(lr=torch.tensor(lr),
|
||||
# momentum=0, dampening=0, weight_decay=0, nesterov=False)
|
||||
|
||||
self_params = nn.ParameterDict({
|
||||
"lr": nn.Parameter(torch.tensor(lr)),
|
||||
"momentum": nn.Parameter(torch.tensor(0.0)),
|
||||
"dampening": nn.Parameter(torch.tensor(0.0)),
|
||||
"weight_decay": nn.Parameter(torch.tensor(0.0)),
|
||||
})
|
||||
|
||||
for k in self_params.keys() & optim_keys:
|
||||
self_params[k].requires_grad_() # keep gradient information…
|
||||
self_params[k].retain_grad() # even if not a leaf…
|
||||
#self_params[k].register_hook(print)
|
||||
|
||||
if height==0:
|
||||
optimizer = NoOpOptimizer()
|
||||
else:
|
||||
#def dict_generator(): yield {k: self_params[k] for k in self_params.keys() & optim_keys}
|
||||
#(dict for dict in {k: self_params[k] for k in self_params.keys() & optim_keys}) #Devrait mar
|
||||
optimizer = SGD(params=(self_params[k]for k in self_params.keys() & optim_keys), lr=lr, height=height-1)
|
||||
#optimizer.register_backward_hook(print)
|
||||
|
||||
self.optimizer = optimizer
|
||||
#if(height==0):
|
||||
# for n,p in params.items():
|
||||
# print(n,p)
|
||||
|
||||
#Optimizable.__init__(self, self_params, optimizer)
|
||||
|
||||
#print(type(params))
|
||||
#for p in params:
|
||||
# print(type(p))
|
||||
Optimizer.__init__(self, params, self_params)
|
||||
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
print(type(p.data), p.size())
|
||||
print('End SGD-H', height)
|
||||
|
||||
def begin(self):
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
#print(type(p.data), p.size())
|
||||
p.requires_grad_() # keep gradient information…
|
||||
p.retain_grad() # even if not a leaf…
|
||||
#p.register_hook(lambda x: print(self.height, x.grad_fn))
|
||||
|
||||
self.optimizer.begin()
|
||||
|
||||
def print_grad_fn(self):
|
||||
self.optimizer.print_grad_fn()
|
||||
for group in self.param_groups:
|
||||
for i, p in enumerate(group['params']):
|
||||
print(self.height," - ", i, p.grad_fn)
|
||||
|
||||
#def adjust(self, params):
|
||||
# self.optimizer.adjust(self.params)
|
||||
# for name, param in params.items():
|
||||
# g = param.grad.detach()
|
||||
# params[name] = param.detach() - g * self.params["lr"]
|
||||
|
||||
def step(self):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
print('SGD start')
|
||||
self.optimizer.step()
|
||||
|
||||
for group in self.param_groups:
|
||||
for i, p in enumerate(group['params']):
|
||||
if p.grad is None:
|
||||
continue
|
||||
#d_p = p.grad.data
|
||||
d_p = p.grad.detach()
|
||||
|
||||
#print(group['lr'])
|
||||
p.data.add_(-group['lr'].item(), d_p)
|
||||
#group['params'][i] = p.detach() - d_p * group['lr']
|
||||
p.data -= group['lr']*d_p #avoid going through .data, it loses gradient info; NOTE: this repeats the add_ update above
|
||||
|
||||
for p in group['params']:
|
||||
if p.grad is None:
|
||||
print(p, p.grad)
|
||||
continue
|
||||
|
||||
print("SGD end")
|
||||
#return loss
|
||||
|
||||
def __str__(self):
|
||||
return "sgd(%f) / " % self.params["lr"] + str(self.optimizer)
|
||||
|
||||
|
||||
class Adam(Optimizable, nn.Module):
|
||||
"""
|
||||
A fully hyperoptimizable Adam optimizer
|
||||
"""
|
||||
|
||||
def clamp(x):
|
||||
return (x.tanh() + 1.0) / 2.0
|
||||
|
||||
def unclamp(y):
|
||||
z = y * 2.0 - 1.0
|
||||
return ((1.0 + z) / (1.0 - z)).log() / 2.0
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
alpha=0.001,
|
||||
beta1=0.9,
|
||||
beta2=0.999,
|
||||
log_eps=-8.0,
|
||||
optimizer=NoOpOptimizer(),
|
||||
device = torch.device('cuda')
|
||||
):
|
||||
#super(Adam, self).__init__()
|
||||
nn.Module.__init__(self)
|
||||
self.device = device
|
||||
params = nn.ParameterDict({
|
||||
"alpha": nn.Parameter(torch.tensor(alpha, device=self.device)),
|
||||
"beta1": nn.Parameter(Adam.unclamp(torch.tensor(beta1, device=self.device))),
|
||||
"beta2": nn.Parameter(Adam.unclamp(torch.tensor(beta2, device=self.device))),
|
||||
"log_eps": nn.Parameter(torch.tensor(log_eps, device=self.device)),
|
||||
})
|
||||
Optimizable.__init__(self, params, optimizer)
|
||||
self.num_adjustments = 0
|
||||
self.cache = {}
|
||||
|
||||
for name, param in params.items():
|
||||
param.requires_grad_() # keep gradient information…
|
||||
param.retain_grad() # even if not a leaf…
|
||||
|
||||
def adjust(self, params, pytorch_mod=False):
|
||||
self.num_adjustments += 1
|
||||
self.optimizer.adjust(self.params)
|
||||
t = self.num_adjustments
|
||||
beta1 = Adam.clamp(self.params["beta1"])
|
||||
beta2 = Adam.clamp(self.params["beta2"])
|
||||
|
||||
updated_param = {} #must be a dict: it is indexed by parameter name below
|
||||
if pytorch_mod:
|
||||
params = params.named_parameters(prefix='') #Changer nom d'input...
|
||||
|
||||
for name, param in params:
|
||||
if name not in self.cache:
|
||||
self.cache[name] = {
|
||||
"m": torch.zeros(param.shape, device=self.device),
|
||||
"v": torch.zeros(param.shape, device=self.device)
|
||||
+ 10.0 ** self.params["log_eps"].data
|
||||
# NOTE that we add a little ‘fudge factor' here because sqrt is not
|
||||
# differentiable at exactly zero
|
||||
}
|
||||
#print(name, param.device)
|
||||
g = param.grad.detach()
|
||||
self.cache[name]["m"] = m = (
|
||||
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
|
||||
)
|
||||
self.cache[name]["v"] = v = (
|
||||
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
|
||||
)
|
||||
self.all_params_with_gradients.append(nn.Parameter(m)) #Risque de surcharger la memoire => Dict mieux ?
|
||||
self.all_params_with_gradients.append(nn.Parameter(v))
|
||||
m_hat = m / (1.0 - beta1 ** float(t))
|
||||
v_hat = v / (1.0 - beta2 ** float(t))
|
||||
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.params["log_eps"])
|
||||
updated_param[name] = param.detach() - self.params["alpha"] * dparam
|
||||
|
||||
if pytorch_mod: params.update(updated_param) #Changer nom d'input...
|
||||
else: params = updated_param
|
||||
|
||||
def __str__(self):
|
||||
return "adam(" + str(self.params) + ") / " + str(self.optimizer)
|
182
Gradient-Descent-The-Ultimate-Optimizer/main.py
Normal file
|
@ -0,0 +1,182 @@
|
|||
import numpy as np
|
||||
import json, math, time, os
|
||||
from hyperopt import *
|
||||
import gc
|
||||
|
||||
BATCH_SIZE = 300
|
||||
|
||||
mnist_train = torchvision.datasets.MNIST(
|
||||
"./data", train=True, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
|
||||
mnist_test = torchvision.datasets.MNIST(
|
||||
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
|
||||
dl_train = torch.utils.data.DataLoader(
|
||||
mnist_train, batch_size=BATCH_SIZE, shuffle=False
|
||||
)
|
||||
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False)
|
||||
|
||||
|
||||
def test(model):
|
||||
for i, (features_, labels_) in enumerate(dl_test):
|
||||
features, labels = torch.reshape(features_, (10000, 28 * 28)), labels_
|
||||
pred = model.forward(features)
|
||||
return pred.argmax(dim=1).eq(labels).sum().item() / 10000 * 100
|
||||
|
||||
|
||||
def train(model, epochs=3, height=1):
|
||||
stats = []
|
||||
for epoch in range(epochs):
|
||||
for i, (features_, labels_) in enumerate(dl_train):
|
||||
t0 = time.process_time()
|
||||
model.begin()
|
||||
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
|
||||
pred = model.forward(
|
||||
features
|
||||
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
|
||||
loss = F.nll_loss(pred, labels)
|
||||
model.zero_grad()
|
||||
loss.backward(create_graph=True)
|
||||
model.adjust()
|
||||
tf = time.process_time()
|
||||
data = {
|
||||
"time": tf - t0,
|
||||
"iter": epoch * len(dl_train) + i,
|
||||
"loss": loss.item(),
|
||||
"params": {
|
||||
k: v.item()
|
||||
for k, v in model.optimizer.parameters.items()
|
||||
if "." not in k
|
||||
},
|
||||
}
|
||||
stats.append(data)
|
||||
return stats
|
||||
|
||||
|
||||
def run(opt, name="out", usr={}, epochs=3, height=1):
|
||||
torch.manual_seed(0x42)
|
||||
model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
|
||||
print("Running...", str(model))
|
||||
model.initialize()
|
||||
log = train(model, epochs, height)
|
||||
acc = test(model)
|
||||
out = {"acc": acc, "log": log, "usr": usr}
|
||||
with open("log/%s.json" % name, "w+") as f:
|
||||
json.dump(out, f, indent=True)
|
||||
times = [x["time"] for x in log]
|
||||
print("Times (ms):", np.mean(times), "+/-", np.std(times))
|
||||
print("Final accuracy:", acc)
|
||||
return out
|
||||
|
||||
|
||||
def sgd_experiments():
|
||||
run(SGD(0.01), "sgd", epochs=1)
|
||||
out = run(SGD(0.01, optimizer=SGD(0.01)), "sgd+sgd", epochs=1)
|
||||
alpha = out["log"][-1]["params"]["alpha"]
|
||||
print(alpha)
|
||||
run(SGD(alpha), "sgd-final", epochs=1)
|
||||
|
||||
|
||||
def adam_experiments():
|
||||
run(Adam(), "adam", epochs=1)
|
||||
print()
|
||||
mo = SGDPerParam(
|
||||
0.001, ["alpha", "beta1", "beta2", "log_eps"], optimizer=SGD(0.0001)
|
||||
)
|
||||
out = run(Adam(optimizer=mo), "adam+sgd", epochs=1)
|
||||
p = out["log"][-1]["params"]
|
||||
alpha = p["alpha"]
|
||||
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
|
||||
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
|
||||
log_eps = p["log_eps"]
|
||||
print(alpha, beta1, beta2, log_eps)
|
||||
print(mo)
|
||||
run(
|
||||
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
|
||||
"adam+sgd-final",
|
||||
epochs=1,
|
||||
)
|
||||
print()
|
||||
out = run(Adam(optimizer=Adam()), "adam2", epochs=1)
|
||||
p = out["log"][-1]["params"]
|
||||
alpha = p["alpha"]
|
||||
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
|
||||
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
|
||||
log_eps = p["log_eps"]
|
||||
print(alpha, beta1, beta2, log_eps)
|
||||
run(
|
||||
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
|
||||
"adam2-final",
|
||||
epochs=1,
|
||||
)
|
||||
print()
|
||||
mo = SGDPerParam(0.001, ["alpha"], optimizer=SGD(0.0001))
|
||||
out = run(AdamBaydin(optimizer=mo), "adambaydin+sgd", epochs=1)
|
||||
p = out["log"][-1]["params"]
|
||||
alpha = p["alpha"]
|
||||
print(alpha)
|
||||
print(mo)
|
||||
run(Adam(alpha=p["alpha"]), "adambaydin+sgd-final", epochs=1)
|
||||
print()
|
||||
out = run(AdamBaydin(optimizer=Adam()), "adambaydin2", epochs=1)
|
||||
p = out["log"][-1]["params"]
|
||||
alpha = p["alpha"]
|
||||
print(alpha)
|
||||
run(Adam(alpha=p["alpha"]), "adambaydin2-final", epochs=1)
|
||||
|
||||
|
||||
def surface():
|
||||
run(SGD(10 ** -3, optimizer=SGD(10 ** -1)), "tst", epochs=1)
|
||||
for log_alpha in np.linspace(-3, 2, 10):
|
||||
run(SGD(10 ** log_alpha), "sgd@1e%+.2f" % log_alpha, epochs=1)
|
||||
|
||||
|
||||
def make_sgd_stack(height, top):
|
||||
if height == 0:
|
||||
return SGD(alpha=top)
|
||||
return SGD(alpha=top, optimizer=make_sgd_stack(height - 1, top))
|
||||
|
||||
|
||||
def make_adam_stack(height, top=0.0000001):
|
||||
if height == 0:
|
||||
return Adam(alpha=top)
|
||||
return Adam(alpha=top, optimizer=make_adam_stack(height - 1))
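# Added note (illustrative): make_adam_stack(height, top) builds a tower of height+1 Adam
# hyperoptimizers; only the outermost level receives `top`, the inner calls fall back to the
# default top=0.0000001. For example, make_adam_stack(2, 0.001) is equivalent to
#   Adam(alpha=0.001, optimizer=Adam(alpha=0.0000001, optimizer=Adam(alpha=0.0000001)))
# with the innermost Adam sitting on the default NoOpOptimizer at the top of the stack.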
|
||||
|
||||
|
||||
def stack_test():
|
||||
for top in np.linspace(-7, 3, 20):
|
||||
for height in range(6):
|
||||
print("height =", height, "to p=", top)
|
||||
opt = make_sgd_stack(height, 10 ** top)
|
||||
run(
|
||||
opt,
|
||||
"metasgd3-%d@%+.2f" % (height, top),
|
||||
{"height": height, "top": top},
|
||||
epochs=1,
|
||||
height=height,
|
||||
)
|
||||
gc.collect()
|
||||
|
||||
|
||||
def perf_test():
|
||||
for h in range(51):
|
||||
print("height:", h)
|
||||
# opt = make_sgd_stack(h, 0.01)
|
||||
opt = make_adam_stack(h)
|
||||
run(opt, "adamperf-%d" % h, {"height": h}, epochs=1)
|
||||
gc.collect()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
os.mkdir("log")
|
||||
except:
|
||||
print("log/ exists already")
|
||||
|
||||
surface()
|
||||
sgd_experiments()
|
||||
adam_experiments()
|
||||
stack_test()
|
||||
perf_test()
|
5
Gradient-Descent-The-Ultimate-Optimizer/requirements.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
numpy==1.17.2
|
||||
Pillow==6.2.0
|
||||
six==1.12.0
|
||||
torch==1.2.0
|
||||
torchvision==0.4.0
|
344
Gradient-Descent-The-Ultimate-Optimizer/tests.py
Normal file
|
@ -0,0 +1,344 @@
|
|||
import numpy as np
|
||||
import json, math, time, os
|
||||
from data_aug import *
|
||||
#from data_aug_v2 import *
|
||||
import gc
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from torchviz import make_dot, make_dot_from_trace
|
||||
|
||||
from torch.utils.data import SubsetRandomSampler
|
||||
|
||||
BATCH_SIZE = 300
|
||||
#TEST_SIZE = 10000
|
||||
TEST_SIZE = 300
|
||||
DATA_LIMIT = 10
|
||||
|
||||
'''
|
||||
data_train = torchvision.datasets.MNIST(
|
||||
"./data", train=True, download=True,
|
||||
transform=torchvision.transforms.Compose([
|
||||
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
|
||||
torchvision.transforms.ToTensor()
|
||||
])
|
||||
)
|
||||
data_test = torchvision.datasets.MNIST(
|
||||
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
|
||||
'''
|
||||
data_train = torchvision.datasets.CIFAR10(
|
||||
"./data", train=True, download=True,
|
||||
transform=torchvision.transforms.Compose([
|
||||
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
|
||||
torchvision.transforms.ToTensor()
|
||||
])
|
||||
)
|
||||
|
||||
data_test = torchvision.datasets.CIFAR10(
|
||||
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
|
||||
train_subset_indices=range(int(len(data_train)/2))
|
||||
val_subset_indices=range(int(len(data_train)/2),len(data_train))
|
||||
|
||||
dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices))
|
||||
dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices))
|
||||
dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False)
|
||||
|
||||
def test(model, reshape_in=True, device = torch.device('cuda')):
|
||||
for i, (features_, labels_) in enumerate(dl_test):
|
||||
if reshape_in :
|
||||
features, labels = torch.reshape(features_, (TEST_SIZE, 28 * 28)), labels_
|
||||
else:
|
||||
features, labels =features_, labels_
|
||||
|
||||
features, labels = features.to(device), labels.to(device)
|
||||
|
||||
pred = model.forward(features)
|
||||
return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100
|
||||
|
||||
def train_one_epoch(model, optimizer, epoch=0, reshape_in=True, device = torch.device('cuda'), train_data=True):
|
||||
if train_data: dl = dl_train
|
||||
else: dl = dl_val
|
||||
for i, (features_, labels_) in enumerate(dl):
|
||||
if i > DATA_LIMIT : break
|
||||
#t0 = time.process_time()
|
||||
|
||||
if reshape_in :
|
||||
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
|
||||
else:
|
||||
features, labels =features_, labels_
|
||||
|
||||
features, labels = features.to(device), labels.to(device)
|
||||
|
||||
#optimizer.begin()
|
||||
#optimizer.zero_grad()
|
||||
model.begin()
|
||||
model.zero_grad()
|
||||
pred = model.forward(features)
|
||||
|
||||
#loss = F.nll_loss(pred, labels)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
|
||||
#model.print_grad_fn()
|
||||
#optimizer.print_grad_fn()
|
||||
#print('-'*50)
|
||||
|
||||
loss.backward(create_graph=True)
|
||||
|
||||
#optimizer.step()
|
||||
if train_data: model.adjust()
|
||||
else: model.adjust_val()
|
||||
|
||||
#tf = time.process_time()
|
||||
#data = {
|
||||
# "time": tf - t0,
|
||||
# "iter": epoch * len(dl_train) + i,
|
||||
# "loss": loss.item(),
|
||||
# "params": {
|
||||
# k: v.item()
|
||||
# for k, v in model.optimizer.parameters.items()
|
||||
# if "." not in k
|
||||
# },
|
||||
#}
|
||||
#stats.append(data)
|
||||
|
||||
#print_torch_mem(i)
|
||||
return loss.item()
|
||||
|
||||
def train_v2(model, optimizer, epochs=3, reshape_in=True, device = torch.device('cuda')):
|
||||
log = []
|
||||
for epoch in range(epochs):
|
||||
|
||||
#dl_train.dataset.transform=torchvision.transforms.Compose([
|
||||
# torchvision.transforms.RandomAffine(degrees=model.param('mag'), translate=None, scale=None, shear=None, resample=False, fillcolor=0),
|
||||
# torchvision.transforms.ToTensor()
|
||||
#])
|
||||
viz_data(fig_name='res/data_sample')
|
||||
t0 = time.process_time()
|
||||
loss = train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device)
|
||||
train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device,train_data=False)
|
||||
|
||||
#acc = test(model=model, reshape_in=reshape_in, device=device)
|
||||
acc = 0
|
||||
|
||||
|
||||
tf = time.process_time()
|
||||
data = {
|
||||
"time": tf - t0,
|
||||
"epoch": epoch,
|
||||
"loss": loss,
|
||||
"acc": acc,
|
||||
"params": {
|
||||
k: v.item()
|
||||
for k, v in model.optimizer.parameters.items()
|
||||
#for k, v in model.mods.data_aug.params.named_parameters()
|
||||
if "." not in k
|
||||
|
||||
},
|
||||
}
|
||||
log.append(data)
|
||||
|
||||
|
||||
print("Epoch :",epoch+1, "/",epochs, "- Loss :",log[-1]["loss"])
|
||||
param = [p for p in model.param_grad() if p.grad is not None]
|
||||
if(len(param)!=0):
|
||||
print(param[-2],' / ', param[-2].grad)
|
||||
print(param[-1],' / ', param[-1].grad)
|
||||
return log
|
||||
|
||||
def train(model, epochs=3, height=1, reshape_in=True, device = torch.device('cuda')):
|
||||
stats = []
|
||||
for epoch in range(epochs):
|
||||
for i, (features_, labels_) in enumerate(dl_train):
|
||||
t0 = time.process_time()
|
||||
model.begin()
|
||||
if reshape_in :
|
||||
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
|
||||
else:
|
||||
features, labels =features_, labels_
|
||||
|
||||
features, labels = features.to(device), labels.to(device)
|
||||
|
||||
pred = model.forward(
|
||||
features
|
||||
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
|
||||
#loss = F.nll_loss(pred, labels)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
|
||||
#print('-'*50)
|
||||
#param = [p for p in model.param_grad() if p.grad is not None]
|
||||
#if(len(param)!=0):
|
||||
# print(param[-2],' / ', param[-2].grad)
|
||||
# print(param[-1],' / ', param[-1].grad)
|
||||
|
||||
model.zero_grad()
|
||||
loss.backward(create_graph=True)
|
||||
model.adjust()
|
||||
tf = time.process_time()
|
||||
data = {
|
||||
"time": tf - t0,
|
||||
"iter": epoch * len(dl_train) + i,
|
||||
"loss": loss.item(),
|
||||
"params": {
|
||||
k: v.item()
|
||||
for k, v in model.optimizer.parameters.items()
|
||||
if "." not in k
|
||||
},
|
||||
}
|
||||
stats.append(data)
|
||||
|
||||
print('-'*50)
|
||||
i=0
|
||||
for obj in gc.get_objects():
|
||||
try:
|
||||
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)) and len(obj.size())>1:
|
||||
print(i, type(obj), obj.size())
|
||||
i+=1
|
||||
except:
|
||||
pass
|
||||
print("Epoch :",epoch+1, "/",epochs, "- Loss :",stats[-1]["loss"])
|
||||
param = [p for p in model.param_grad() if p.grad is not None]
|
||||
if(len(param)!=0):
|
||||
print(param[-2],' / ', param[-2].grad)
|
||||
print(param[-1],' / ', param[-1].grad)
|
||||
return stats
|
||||
|
||||
def run(opt, name="out", usr={}, epochs=10, height=1, cnn=True, device = torch.device('cuda')):
|
||||
torch.manual_seed(0x42)
|
||||
if not cnn:
|
||||
reshape_in = True
|
||||
#model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
|
||||
model = MNIST_FullyConnected_Augmented(28 * 28, 128, 10, opt, device=device)
|
||||
|
||||
else:
|
||||
reshape_in = False
|
||||
#model = LeNet(1, 10,opt, device)
|
||||
#model = LeNet_v2(1, 10,opt, device).to(device=device)
|
||||
model = LeNet_v2(3, 10,opt, device).to(device=device)
|
||||
optimizer=None
|
||||
'''
|
||||
m = LeNet_v3(1, 10)
|
||||
a = Data_aug()
|
||||
model = Augmented_model(model=m,
|
||||
data_augmenter=a,
|
||||
optimizer=opt).to(device) #deux fois le meme optimizer ?...
|
||||
'''
|
||||
'''
|
||||
m = LeNet_v3(1, 10)
|
||||
a = Data_aug()
|
||||
model = Augmented_model(model=m, data_augmenter=a).to(device)
|
||||
#optimizer = SGD(model.parameters())
|
||||
optimizer = SGD(model.parameters(), lr=0.01, height=1)
|
||||
'''
|
||||
|
||||
|
||||
#for idx, m in enumerate(model.modules()):
|
||||
# print(idx, '->', m)
|
||||
print("Running...", str(model))
|
||||
model.initialize()
|
||||
#print_model(model)
|
||||
#model.data_augmentation(False)
|
||||
#model.eval()
|
||||
|
||||
log = train_v2(model=model, optimizer=optimizer, epochs=epochs, reshape_in=reshape_in, device=device)
|
||||
model.eval()
|
||||
acc = test(model, reshape_in, device=device)
|
||||
|
||||
|
||||
#param = [p for p in model.param_grad() if p.grad is not None]
|
||||
#if(len(param)!=0):
|
||||
# print(param[-2],' / ', param[-2].grad)
|
||||
# print(param[-1],' / ', param[-1].grad)
|
||||
|
||||
out = {"acc": acc, "log": log, "usr": usr}
|
||||
with open("log/%s.json" % name, "w+") as f:
|
||||
json.dump(out, f, indent=True)
|
||||
times = [x["time"] for x in log]
|
||||
print("Times (ms):", np.mean(times), "+/-", np.std(times))
|
||||
print("Final accuracy:", acc)
|
||||
|
||||
#plot_res(log, fig_name='res/'+name)
|
||||
|
||||
return out
|
||||
|
||||
def make_adam_stack(height, top=0.0000001, device = torch.device('cuda')):
|
||||
#print(height,device)
|
||||
if height == 0:
|
||||
return Adam(alpha=top, device=device)
|
||||
return Adam(alpha=top, optimizer=make_adam_stack(height - 1, top, device=device), device=device)
|
||||
|
||||
def plot_res(log, fig_name='res'):
|
||||
|
||||
fig, ax = plt.subplots(ncols=3, figsize=(15, 3))
|
||||
ax[0].set_title('Loss')
|
||||
ax[0].plot([x["loss"] for x in log])
|
||||
|
||||
ax[1].set_title('Acc')
|
||||
ax[1].plot([x["acc"] for x in log])
|
||||
|
||||
ax[2].set_title('mag')
|
||||
ax[2].plot([x["data_aug"] for x in log])
|
||||
|
||||
plt.savefig(fig_name)
|
||||
|
||||
def print_torch_mem(add_info=''):
|
||||
|
||||
nb=0
|
||||
max_size=0
|
||||
for obj in gc.get_objects():
|
||||
try:
|
||||
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1:
|
||||
#print(i, type(obj), obj.size())
|
||||
size = np.sum(obj.size())
|
||||
if(size>max_size): max_size=size
|
||||
nb+=1
|
||||
except:
|
||||
pass
|
||||
print(add_info, "-Pytroch tensor nb:",nb," / Max dim:", max_size)
|
||||
|
||||
def print_model(model, fig_name='graph/graph'): #Does not seem to work for models written in functional style
|
||||
x = torch.randn(1,1,28,28, device=device)
|
||||
dot=make_dot(model(x), params=dict(model.named_parameters()))
|
||||
dot.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats
|
||||
dot.render(fig_name)
|
||||
print("Model graph generated !")
|
||||
|
||||
def viz_data(fig_name='data_sample'):
|
||||
|
||||
features_, labels_ = next(iter(dl_train))
|
||||
plt.figure(figsize=(10,10))
|
||||
#for i, (features_, labels_) in enumerate(dl_train):
|
||||
for i in range(25):
|
||||
if i==25: break
|
||||
#print(features_.size(), labels_.size())
|
||||
|
||||
plt.subplot(5,5,i+1)
|
||||
plt.xticks([])
|
||||
plt.yticks([])
|
||||
plt.grid(False)
|
||||
|
||||
img = features_[i,0,:,:]
|
||||
|
||||
#print('im shape',img.shape)
|
||||
plt.imshow(img, cmap=plt.cm.binary)
|
||||
plt.xlabel(labels_[i].item())
|
||||
|
||||
plt.savefig(fig_name)
|
||||
|
||||
##########################################
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
os.mkdir("log")
|
||||
except:
|
||||
print("log/ exists already")
|
||||
|
||||
device = torch.device('cuda')
|
||||
|
||||
run(make_adam_stack(height=1, top=0.001, device=device),
|
||||
"Augmented_MNIST",
|
||||
epochs=100,
|
||||
cnn=True,
|
||||
device = device)
|
||||
print()
|
583
higher/dataug.py
Normal file
|
@ -0,0 +1,583 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.distributions import *
|
||||
|
||||
#import kornia #NOTE: the Data_aug / Data_augV2 / Data_augV3 classes below call kornia, random and np directly; re-enable these imports if those classes are used
|
||||
#import random
|
||||
#import numpy as np
|
||||
import copy
|
||||
|
||||
import transformations as TF
|
||||
|
||||
class Data_aug(nn.Module): #Parameterized rotation
|
||||
def __init__(self):
|
||||
super(Data_aug, self).__init__()
|
||||
self._data_augmentation = True
|
||||
self._params = nn.ParameterDict({
|
||||
"prob": nn.Parameter(torch.tensor(0.5)),
|
||||
"mag": nn.Parameter(torch.tensor(1.0))
|
||||
})
|
||||
|
||||
#self.params["mag"].register_hook(print)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
if self._data_augmentation and random.random() < self._params["prob"]:
|
||||
#print('Aug')
|
||||
batch_size = x.shape[0]
|
||||
# create transformation (rotation)
|
||||
alpha = self._params["mag"]*180 # in degrees
|
||||
angle = torch.ones(batch_size, device=x.device) * alpha
|
||||
|
||||
# define the rotation center
|
||||
center = torch.ones(batch_size, 2, device=x.device)
|
||||
center[..., 0] = x.shape[3] / 2 # x
|
||||
center[..., 1] = x.shape[2] / 2 # y
|
||||
|
||||
#print(x.shape, center)
|
||||
# define the scale factor
|
||||
scale = torch.ones(batch_size, device=x.device)
|
||||
|
||||
# compute the transformation matrix
|
||||
M = kornia.get_rotation_matrix2d(center, angle, scale)
|
||||
|
||||
# apply the transformation to original image
|
||||
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
|
||||
|
||||
return x
|
||||
|
||||
def eval(self):
|
||||
self.augment(mode=False)
|
||||
nn.Module.eval(self)
|
||||
|
||||
def augment(self, mode=True):
|
||||
self._data_augmentation=mode
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._params[key]
|
||||
|
||||
def __str__(self):
|
||||
return "Data_aug(Mag-1 TF)"
|
||||
|
||||
class Data_augV2(nn.Module): #Exact method
|
||||
def __init__(self):
|
||||
super(Data_augV2, self).__init__()
|
||||
self._data_augmentation = True
|
||||
|
||||
self._fixed_transf=[0.0, 45.0, 180.0] #Degree rotation
|
||||
#self._fixed_transf=[0.0]
|
||||
self._nb_tf= len(self._fixed_transf)
|
||||
|
||||
self._params = nn.ParameterDict({
|
||||
"prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme
|
||||
#"prob2": nn.Parameter(torch.ones(len(self._fixed_transf)).softmax(dim=0))
|
||||
})
|
||||
|
||||
#print(self._params["prob"], self._params["prob2"])
|
||||
|
||||
self.transf_idx=0
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
if self._data_augmentation:
|
||||
#print('Aug',self._fixed_transf[self.transf_idx])
|
||||
device = x.device
|
||||
batch_size = x.shape[0]
|
||||
|
||||
# create transformation (rotation)
|
||||
#alpha = 180 # in degrees
|
||||
alpha = self._fixed_transf[self.transf_idx]
|
||||
angle = torch.ones(batch_size, device=device) * alpha
|
||||
|
||||
x = self.rotate(x,angle)
|
||||
|
||||
return x
|
||||
|
||||
def rotate(self, x, angle):
|
||||
|
||||
device = x.device
|
||||
batch_size = x.shape[0]
|
||||
# define the rotation center
|
||||
center = torch.ones(batch_size, 2, device=device)
|
||||
center[..., 0] = x.shape[3] / 2 # x
|
||||
center[..., 1] = x.shape[2] / 2 # y
|
||||
|
||||
#print(x.shape, center)
|
||||
# define the scale factor
|
||||
scale = torch.ones(batch_size, device=device)
|
||||
|
||||
# compute the transformation matrix
|
||||
M = kornia.get_rotation_matrix2d(center, angle, scale)
|
||||
|
||||
# apply the transformation to original image
|
||||
return kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
|
||||
|
||||
|
||||
def adjust_prob(self): #Detach from gradient ?
|
||||
self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0)
|
||||
#print('proba',self._params['prob'])
|
||||
self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Constraint: sum(p)=1
|
||||
#print('Sum p', sum(self._params['prob']))
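        # Added worked example (illustrative): with raw values [1.2, -0.1, 0.5] the clamp
        # above gives [1.0, 0.0, 0.5] and the renormalisation gives [2/3, 0.0, 1/3],
        # so 'prob' stays a valid distribution over the fixed transforms.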
|
||||
|
||||
def eval(self):
|
||||
self.augment(mode=False)
|
||||
nn.Module.eval(self)
|
||||
|
||||
def augment(self, mode=True):
|
||||
self._data_augmentation=mode
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._params[key]
|
||||
|
||||
def __str__(self):
|
||||
return "Data_augV2(Exact-%d TF)" % self._nb_tf
|
||||
|
||||
class Data_augV3(nn.Module): #Uniform / mixed sampling
|
||||
def __init__(self, mix_dist=0.0):
|
||||
super(Data_augV3, self).__init__()
|
||||
self._data_augmentation = True
|
||||
|
||||
#self._fixed_transf=[0.0, 45.0, 180.0] #Degree rotation
|
||||
self._fixed_transf=[0.0, 1.0, -1.0] #Flips (Identity,Horizontal,Vertical)
|
||||
#self._fixed_transf=[0.0]
|
||||
self._nb_tf= len(self._fixed_transf)
|
||||
|
||||
self._params = nn.ParameterDict({
|
||||
"prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme
|
||||
#"prob2": nn.Parameter(torch.ones(len(self._fixed_transf)).softmax(dim=0))
|
||||
})
|
||||
|
||||
#print(self._params["prob"], self._params["prob2"])
|
||||
self._sample = []
|
||||
|
||||
self._mix_dist = False
|
||||
if mix_dist != 0.0:
|
||||
self._mix_dist = True
|
||||
self._mix_factor = max(min(mix_dist, 1.0), 0.0)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
if self._data_augmentation:
|
||||
device = x.device
|
||||
batch_size = x.shape[0]
|
||||
|
||||
|
||||
#good_distrib = Uniform(low=torch.zeros(batch_size,1, device=device),high=torch.new_full((batch_size,1),self._params["prob"], device=device))
|
||||
#bad_distrib = Uniform(low=torch.zeros(batch_size,1, device=device),high=torch.new_full((batch_size,1), 1-self._params["prob"], device=device))
|
||||
|
||||
#transform_dist = Categorical(probs=torch.tensor([self._params["prob"], 1-self._params["prob"]], device=device))
|
||||
#self._sample = transform_dist._sample(sample_shape=torch.Size([batch_size,1]))
|
||||
|
||||
uniforme_dist = torch.ones(1,self._nb_tf,device=device).softmax(dim=1) #Softmax over the transform axis (dim=1), as in Data_augV4 below
|
||||
|
||||
if not self._mix_dist:
|
||||
distrib = uniforme_dist
|
||||
else:
|
||||
distrib = (self._mix_factor*self._params["prob"]+(1-self._mix_factor)*uniforme_dist).softmax(dim=1) #Mix of the learned and uniform distributions, controlled by mix_factor
|
||||
|
||||
cat_distrib= Categorical(probs=torch.ones((batch_size, self._nb_tf), device=device)*distrib)
|
||||
self._sample = cat_distrib.sample()
|
||||
|
||||
TF_param = torch.tensor([self._fixed_transf[x] for x in self._sample], device=device) #Marco's approach might be faster
|
||||
|
||||
#x = self.rotate(x,angle=TF_param)
|
||||
x = self.flip(x,flip_mat=TF_param)
|
||||
|
||||
return x
|
||||
|
||||
def rotate(self, x, angle):
|
||||
|
||||
device = x.device
|
||||
batch_size = x.shape[0]
|
||||
# define the rotation center
|
||||
center = torch.ones(batch_size, 2, device=device)
|
||||
center[..., 0] = x.shape[3] / 2 # x
|
||||
center[..., 1] = x.shape[2] / 2 # y
|
||||
|
||||
#print(x.shape, center)
|
||||
# define the scale factor
|
||||
scale = torch.ones(batch_size, device=device)
|
||||
|
||||
# compute the transformation matrix
|
||||
M = kornia.get_rotation_matrix2d(center, angle, scale)
|
||||
|
||||
# apply the transformation to original image
|
||||
return kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
|
||||
|
||||
def flip(self, x, flip_mat):
|
||||
|
||||
#print(flip_mat)
|
||||
device = x.device
|
||||
batch_size = x.shape[0]
|
||||
|
||||
h, w = x.shape[2], x.shape[3] # destination size
|
||||
#points_src = torch.ones(batch_size, 4, 2, device=device)
|
||||
#points_dst = torch.ones(batch_size, 4, 2, device=device)
|
||||
|
||||
#Identity
|
||||
iM=torch.tensor(np.eye(3))
|
||||
|
||||
#Horizontal flip
|
||||
# the source points are the region to crop corners
|
||||
#points_src = torch.FloatTensor([[
|
||||
# [w - 1, 0], [0, 0], [0, h - 1], [w - 1, h - 1],
|
||||
#]])
|
||||
# the destination points are the image vertexes
|
||||
#points_dst = torch.FloatTensor([[
|
||||
# [0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1],
|
||||
#]])
|
||||
# compute perspective transform
|
||||
#hM = kornia.get_perspective_transform(points_src, points_dst)
|
||||
hM =torch.tensor( [[[-1., 0., w-1],
|
||||
[ 0., 1., 0.],
|
||||
[ 0., 0., 1.]]])
|
||||
|
||||
#Vertical flip
|
||||
# the source points are the region to crop corners
|
||||
#points_src = torch.FloatTensor([[
|
||||
# [0, h - 1], [w - 1, h - 1], [w - 1, 0], [0, 0],
|
||||
#]])
|
||||
# the destination points are the image vertexes
|
||||
#points_dst = torch.FloatTensor([[
|
||||
# [0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1],
|
||||
#]])
|
||||
# compute perspective transform
|
||||
#vM = kornia.get_perspective_transform(points_src, points_dst)
|
||||
vM =torch.tensor( [[[ 1., 0., 0.],
|
||||
[ 0., -1., h-1],
|
||||
[ 0., 0., 1.]]])
|
||||
#print(vM)
|
||||
|
||||
M=torch.ones(batch_size, 3, 3, device=device)
|
||||
|
||||
for i in range(batch_size): # To optimize: could be vectorized, see the sketch after this method
|
||||
if flip_mat[i]==0.0:
|
||||
M[i,]=iM
|
||||
elif flip_mat[i]==1.0:
|
||||
M[i,]=hM
|
||||
elif flip_mat[i]==-1.0:
|
||||
M[i,]=vM
|
||||
|
||||
# warp the original image by the found transform
|
||||
return kornia.warp_perspective(x, M, dsize=(h, w))
|
||||
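# Hedged sketch (added for illustration, not part of the original commit): the
# per-sample loop above could be replaced by stacking the three homographies and
# indexing them, assuming flip_mat only contains values in {0., 1., -1.}:
#   mats = torch.stack([m.reshape(3, 3).float() for m in (iM, hM, vM)]).to(device)
#   idx = torch.zeros_like(flip_mat, dtype=torch.long)
#   idx[flip_mat == 1.0] = 1
#   idx[flip_mat == -1.0] = 2
#   M = mats[idx]   # (batch_size, 3, 3)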
|
||||
def adjust_prob(self, soft=False): #Detach from gradient ?
|
||||
|
||||
if soft :
|
||||
self._params['prob'].data=F.softmax(self._params['prob'].data, dim=0) #Too 'soft': gets stuck near the uniform distribution if the lr is too small
|
||||
else:
|
||||
#self._params['prob'].clamp(min=0.0,max=1.0)
|
||||
self._params['prob'].data = F.relu(self._params['prob'].data)
|
||||
#self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0)
|
||||
#print('proba',self._params['prob'])
|
||||
self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Constraint: sum(p)=1
|
||||
#print('Sum p', sum(self._params['prob']))
|
||||
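# Hedged note (added for clarity): adjust_prob is a crude projection back onto
# the probability simplex: negative entries are zeroed (ReLU or clamp) and the
# vector is renormalized so that sum(prob) = 1, keeping 'prob' a valid
# categorical distribution between meta-updates.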
|
||||
def loss_weight(self):
|
||||
#w_loss = [self._params["prob"][x] for x in self._sample]
|
||||
#print(self._sample.view(-1,1).shape)
|
||||
#print(self._sample[:10])
|
||||
|
||||
w_loss = torch.zeros((self._sample.shape[0],self._nb_tf), device=self._sample.device)
|
||||
w_loss.scatter_(1, self._sample.view(-1,1), 1)
|
||||
#print(w_loss.shape)
|
||||
#print(w_loss[:10,:])
|
||||
w_loss = w_loss * self._params["prob"]
|
||||
#print(w_loss.shape)
|
||||
#print(w_loss[:10,:])
|
||||
w_loss = torch.sum(w_loss,dim=1)
|
||||
#print(w_loss.shape)
|
||||
#print(w_loss[:10])
|
||||
return w_loss
|
||||
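# Hedged note (added for clarity): the scatter above builds a one-hot matrix
# over the transforms and reduces it, which is equivalent to indexing the
# probability vector directly, e.g.
#   w_loss = self._params["prob"][self._sample]
# i.e. each sample is weighted by the probability of the transform it received.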
|
||||
def train(self, mode=None):
|
||||
if mode is None :
|
||||
mode=self._data_augmentation
|
||||
self.augment(mode=mode) #Redundant if mode was None
|
||||
super(Data_augV3, self).train(mode)
|
||||
|
||||
def eval(self):
|
||||
self.train(mode=False)
|
||||
#super(Augmented_model, self).eval()
|
||||
|
||||
def augment(self, mode=True):
|
||||
self._data_augmentation=mode
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._params[key]
|
||||
|
||||
def __str__(self):
|
||||
if not self._mix_dist:
|
||||
return "Data_augV3(Uniform-%d TF)" % self._nb_tf
|
||||
else:
|
||||
return "Data_augV3(Mix %.1f-%d TF)" % (self._mix_factor, self._nb_tf)
|
||||
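def _data_augV3_usage_sketch():
    # Hedged usage sketch (added for illustration, not part of the original commit):
    # sample one flip per image and weight the per-sample loss by the probability
    # of the transform each sample received, as done in test_dataug.py below.
    # Shapes and the stand-in logits are assumptions.
    aug = Data_augV3(mix_dist=0.5)
    x = torch.rand(16, 3, 32, 32)                     # images in [0, 1]
    y = torch.randint(0, 10, (16,))
    x_aug = aug(x)                                    # also stores aug._sample
    logits = torch.randn(16, 10)                      # stand-in for model(x_aug)
    loss = F.cross_entropy(logits, y, reduction='none')
    loss = (loss * aug.loss_weight()).mean()          # REINFORCE-style weighting
    return loss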
|
||||
class Data_augV4(nn.Module): #Transformations with mask
|
||||
def __init__(self, TF_dict=TF.TF_dict, N_TF=1, mix_dist=0.0):
|
||||
super(Data_augV4, self).__init__()
|
||||
self._data_augmentation = True
|
||||
|
||||
#self._TF_matrix={}
|
||||
#self._input_info={'h':0, 'w':0, 'device':None} #Input associe a TF_matrix
|
||||
'''
|
||||
self._mag_fct={ #f(mag_normalise)=mag_reelle
|
||||
## Geometric TF ##
|
||||
'Identity' : (lambda mag: None),
|
||||
'FlipUD' : (lambda mag: None),
|
||||
'FlipLR' : (lambda mag: None),
|
||||
'Rotate': (lambda mag: random.randint(-int_parameter(mag, maxval=30), int_parameter(mag, maxval=30))),
|
||||
'TranslateX': (lambda mag: [random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20)), 0]),
|
||||
'TranslateY': (lambda mag: [0, random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20))]),
|
||||
'ShearX': (lambda mag: [random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3)), 0]),
|
||||
'ShearY': (lambda mag: [0, random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3))]),
|
||||
|
||||
## Color TF (Expect image in the range of [0, 1]) ##
|
||||
'Contrast': (lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Color':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Brightness':(lambda mag: random.uniform(1., float_parameter(mag, maxval=1.9))),
|
||||
'Sharpness':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Posterize': (lambda mag: random.randint(4, int_parameter(mag, maxval=8))),
|
||||
'Solarize': (lambda mag: random.randint(1, int_parameter(mag, maxval=256))/256.), #=>Image entre [0,1] #Pas opti pour des batch
|
||||
|
||||
#Non fonctionnel
|
||||
'Auto_Contrast': (lambda mag: None), #Pas opti pour des batch (Super lent)
|
||||
#'Equalize': (lambda mag: None),
|
||||
}
|
||||
'''
|
||||
self._mag_fct = TF_dict
|
||||
self._TF=list(self._mag_fct.keys())
|
||||
self._nb_tf= len(self._TF)
|
||||
|
||||
self._fixed_mag=5 #[0, PARAMETER_MAX]
|
||||
self._params = nn.ParameterDict({
|
||||
"prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme
|
||||
})
|
||||
|
||||
self._sample = []
|
||||
|
||||
self._mix_dist = False
|
||||
if mix_dist != 0.0:
|
||||
self._mix_dist = True
|
||||
self._mix_factor = max(min(mix_dist, 1.0), 0.0)
|
||||
|
||||
def forward(self, x):
|
||||
if self._data_augmentation:
|
||||
device = x.device
|
||||
batch_size, h, w = x.shape[0], x.shape[2], x.shape[3]
|
||||
|
||||
|
||||
## Sampling ##
|
||||
uniforme_dist = torch.ones(1,self._nb_tf,device=device).softmax(dim=1)
|
||||
|
||||
if not self._mix_dist:
|
||||
self._distrib = uniforme_dist
|
||||
else:
|
||||
self._distrib = (self._mix_factor*self._params["prob"]+(1-self._mix_factor)*uniforme_dist).softmax(dim=1) #Mix of the learned and uniform distributions, controlled by mix_factor
|
||||
#print(self._distrib.shape)
|
||||
|
||||
cat_distrib= Categorical(probs=torch.ones((batch_size, self._nb_tf), device=device)*self._distrib)
|
||||
self._sample = cat_distrib.sample()
|
||||
|
||||
## Transformations ##
|
||||
#'''
|
||||
x = copy.deepcopy(x) #Avoids modifying the samples by reference (problematic for parallel use)
|
||||
smps_x=[]
|
||||
masks=[]
|
||||
for tf_idx in range(self._nb_tf):
|
||||
mask = self._sample==tf_idx #Create selection mask
|
||||
smp_x = x[mask] #torch.masked_select() ?
|
||||
|
||||
if smp_x.shape[0]!=0: #if there's data to TF
|
||||
magnitude=self._fixed_mag
|
||||
tf=self._TF[tf_idx]
|
||||
|
||||
## Geometric TF ##
|
||||
if tf=='Identity':
|
||||
pass
|
||||
elif tf=='FlipLR':
|
||||
smp_x = TF.flipLR(smp_x)
|
||||
elif tf=='FlipUD':
|
||||
smp_x = TF.flipUD(smp_x)
|
||||
elif tf=='Rotate':
|
||||
smp_x = TF.rotate(smp_x, angle=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='TranslateX' or tf=='TranslateY':
|
||||
smp_x = TF.translate(smp_x, translation=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='ShearX' or tf=='ShearY' :
|
||||
smp_x = TF.shear(smp_x, shear=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
|
||||
## Color TF (Expect image in the range of [0, 1]) ##
|
||||
elif tf=='Contrast':
|
||||
smp_x = TF.contrast(smp_x, contrast_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='Color':
|
||||
smp_x = TF.color(smp_x, color_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='Brightness':
|
||||
smp_x = TF.brightness(smp_x, brightness_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='Sharpness':
|
||||
smp_x = TF.sharpeness(smp_x, sharpness_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='Posterize':
|
||||
smp_x = TF.posterize(smp_x, bits=torch.tensor([1 for _ in smp_x], device=device))
|
||||
elif tf=='Solarize':
|
||||
smp_x = TF.solarize(smp_x, thresholds=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device))
|
||||
elif tf=='Equalize':
|
||||
smp_x = TF.equalize(smp_x)
|
||||
elif tf=='Auto_Contrast':
|
||||
smp_x = TF.auto_contrast(smp_x)
|
||||
else:
|
||||
raise Exception("Invalid TF requested : ", tf)
|
||||
|
||||
x[mask]=smp_x # Merge back (the x[mask] assignment is in place)
|
||||
|
||||
#idx= mask.nonzero()
|
||||
#print('-'*8)
|
||||
#print(idx[0], tf_idx)
|
||||
#print(smp_x[0,])
|
||||
#x=x.view(-1,3*32*32)
|
||||
#x=x.scatter(dim=0, index=idx, src=smp_x.view(-1,3*32*32)) #Changement des Tensor mais pas visible sur la visualisation...
|
||||
#x=x.view(-1,3,32,32)
|
||||
#print(x[0,])
|
||||
|
||||
'''
|
||||
if len(self._TF_matrix)==0 or self._input_info['h']!=h or self._input_info['w']!=w or self._input_info['device']!=device: #Device different:Pas necessaire de tout recalculer
|
||||
self.compute_TF_matrix(sample_info={'h': x.shape[2],
|
||||
'w': x.shape[3],
|
||||
'device': x.device})
|
||||
|
||||
TF_matrix = torch.zeros(batch_size, 3, 3, device=device) #All geom TF
|
||||
|
||||
for tf_idx in range(self._nb_tf):
|
||||
mask = self._sample==tf_idx #Create selection mask
|
||||
TF_matrix[mask,]=self._TF_matrix[self._TF[tf_idx]]
|
||||
|
||||
x=kornia.warp_perspective(x, TF_matrix, dsize=(h, w))
|
||||
'''
|
||||
return x
|
||||
'''
|
||||
def compute_TF_matrix(self, magnitude=None, sample_info= None):
|
||||
print('Computing TF_matrix...')
|
||||
if not magnitude :
|
||||
magnitude=self._fixed_mag
|
||||
|
||||
if sample_info:
|
||||
self._input_info['h']= sample_info['h']
|
||||
self._input_info['w']= sample_info['w']
|
||||
self._input_info['device'] = sample_info['device']
|
||||
h, w, device= self._input_info['h'], self._input_info['w'], self._input_info['device']
|
||||
|
||||
self._TF_matrix={}
|
||||
for tf in self._TF :
|
||||
if tf=='Id':
|
||||
self._TF_matrix[tf]=torch.tensor([[[ 1., 0., 0.],
|
||||
[ 0., 1., 0.],
|
||||
[ 0., 0., 1.]]], device=device)
|
||||
elif tf=='Rot':
|
||||
center = torch.ones(1, 2, device=device)
|
||||
center[0, 0] = w / 2 # x
|
||||
center[0, 1] = h / 2 # y
|
||||
scale = torch.ones(1, device=device)
|
||||
angle = self._mag_fct[tf](magnitude) * torch.ones(1, device=device)
|
||||
R = kornia.get_rotation_matrix2d(center, angle, scale) #Rotation matrix (1,2,3)
|
||||
self._TF_matrix[tf]=torch.cat((R,torch.tensor([[[ 0., 0., 1.]]], device=device)), dim=1) #TF matrix (1,3,3)
|
||||
elif tf=='FlipLR':
|
||||
self._TF_matrix[tf]=torch.tensor([[[-1., 0., w-1],
|
||||
[ 0., 1., 0.],
|
||||
[ 0., 0., 1.]]], device=device)
|
||||
elif tf=='FlipUD':
|
||||
self._TF_matrix[tf]=torch.tensor([[[ 1., 0., 0.],
|
||||
[ 0., -1., h-1],
|
||||
[ 0., 0., 1.]]], device=device)
|
||||
else:
|
||||
raise Exception("Invalid TF requested")
|
||||
'''
|
||||
def adjust_prob(self, soft=False): #Detach from gradient ?
|
||||
|
||||
if soft :
|
||||
self._params['prob'].data=F.softmax(self._params['prob'].data, dim=0) #Too 'soft': gets stuck near the uniform distribution if the lr is too small
|
||||
else:
|
||||
#self._params['prob'].clamp(min=0.0,max=1.0)
|
||||
self._params['prob'].data = F.relu(self._params['prob'].data)
|
||||
#self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0)
|
||||
|
||||
self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Constraint: sum(p)=1
|
||||
|
||||
def loss_weight(self):
|
||||
w_loss = torch.zeros((self._sample.shape[0],self._nb_tf), device=self._sample.device)
|
||||
w_loss.scatter_(1, self._sample.view(-1,1), 1)
|
||||
w_loss = w_loss * self._params["prob"]/self._distrib #Weight by the probabilities (divided by the sampling distribution so the loss is not shrunk)
|
||||
w_loss = torch.sum(w_loss,dim=1)
|
||||
return w_loss
|
||||
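# Hedged note (added for clarity): this is an importance weight per sample, e.g.
#   w_loss = self._params["prob"][self._sample] / self._distrib[0, self._sample]
# the division by the sampling distribution keeps the loss scale unchanged when
# sampling from the uniform/mixed distribution instead of the learned one.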
|
||||
def train(self, mode=None):
|
||||
if mode is None :
|
||||
mode=self._data_augmentation
|
||||
self.augment(mode=mode) #Redundant if mode was None
|
||||
super(Data_augV4, self).train(mode)
|
||||
|
||||
def eval(self):
|
||||
self.train(mode=False)
|
||||
|
||||
def augment(self, mode=True):
|
||||
self._data_augmentation=mode
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._params[key]
|
||||
|
||||
def __str__(self):
|
||||
if not self._mix_dist:
|
||||
return "Data_augV4(Uniform-%d TF)" % self._nb_tf
|
||||
else:
|
||||
return "Data_augV4(Mix %.1f-%d TF)" % (self._mix_factor, self._nb_tf)
|
||||
|
||||
class Augmented_model(nn.Module):
|
||||
def __init__(self, data_augmenter, model):
|
||||
super(Augmented_model, self).__init__()
|
||||
|
||||
self._mods = nn.ModuleDict({
|
||||
'data_aug': data_augmenter,
|
||||
'model': model
|
||||
})
|
||||
|
||||
self.augment(mode=True)
|
||||
|
||||
def initialize(self):
|
||||
self._mods['model'].initialize()
|
||||
|
||||
def forward(self, x):
|
||||
return self._mods['model'](self._mods['data_aug'](x))
|
||||
|
||||
def augment(self, mode=True):
|
||||
self._data_augmentation=mode
|
||||
self._mods['data_aug'].augment(mode)
|
||||
|
||||
def train(self, mode=None):
|
||||
if mode is None :
|
||||
mode=self._data_augmentation
|
||||
self._mods['data_aug'].augment(mode)
|
||||
super(Augmented_model, self).train(mode)
|
||||
|
||||
def eval(self):
|
||||
self.train(mode=False)
|
||||
#super(Augmented_model, self).eval()
|
||||
|
||||
def items(self):
|
||||
"""Return an iterable of the ModuleDict key/value pairs.
|
||||
"""
|
||||
return self._mods.items()
|
||||
|
||||
def update(self, modules):
|
||||
self._mods.update(modules)
|
||||
|
||||
def is_augmenting(self):
|
||||
return self._data_augmentation
|
||||
|
||||
def TF_names(self):
|
||||
try:
|
||||
return self._mods['data_aug']._TF
|
||||
except:
|
||||
return None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._mods[key]
|
||||
|
||||
def __str__(self):
|
||||
return "Aug_mod("+str(self._mods['data_aug'])+"-"+str(self._mods['model'])+")"
|
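def _augmented_model_usage_sketch():
    # Hedged usage sketch (added for illustration, not part of the original commit):
    # wrap an augmentation module and a task model, then toggle augmentation off
    # for evaluation. Assumes LeNet (model.py) and TF (transformations.py) are
    # importable here, as they are in test_dataug.py below.
    aug_model = Augmented_model(Data_augV4(TF_dict=TF.TF_dict), LeNet(3, 10))
    x = torch.rand(4, 3, 32, 32)          # CIFAR-like batch in [0, 1]
    logits = aug_model(x)                 # augment, then classify
    aug_model.eval()                      # also disables augmentation
    logits_eval = aug_model(x)
    return logits, logits_eval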
51
higher/model.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
class LeNet(nn.Module):
|
||||
def __init__(self, num_inp, num_out):
|
||||
super(LeNet, self).__init__()
|
||||
self._params = nn.ParameterDict({
|
||||
'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)),
|
||||
'b1': nn.Parameter(torch.zeros(20)),
|
||||
'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)),
|
||||
'b2': nn.Parameter(torch.zeros(50)),
|
||||
#'w3': nn.Parameter(torch.zeros(500,4*4*50)), #num_imp=1
|
||||
'w3': nn.Parameter(torch.zeros(500,5*5*50)), #num_inp=3
|
||||
'b3': nn.Parameter(torch.zeros(500)),
|
||||
'w4': nn.Parameter(torch.zeros(num_out, 500)),
|
||||
'b4': nn.Parameter(torch.zeros(num_out))
|
||||
})
|
||||
self.initialize()
|
||||
|
||||
|
||||
def initialize(self):
|
||||
nn.init.kaiming_uniform_(self._params["w1"], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self._params["w2"], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self._params["w3"], a=math.sqrt(5))
|
||||
nn.init.kaiming_uniform_(self._params["w4"], a=math.sqrt(5))
|
||||
|
||||
def forward(self, x):
|
||||
#print("Start Shape ", x.shape)
|
||||
out = F.relu(F.conv2d(input=x, weight=self._params["w1"], bias=self._params["b1"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.max_pool2d(out, 2)
|
||||
#print("Shape ", out.shape)
|
||||
out = F.relu(F.conv2d(input=out, weight=self._params["w2"], bias=self._params["b2"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.max_pool2d(out, 2)
|
||||
#print("Shape ", out.shape)
|
||||
out = out.view(out.size(0), -1)
|
||||
#print("Shape ", out.shape)
|
||||
out = F.relu(F.linear(out, self._params["w3"], self._params["b3"]))
|
||||
#print("Shape ", out.shape)
|
||||
out = F.linear(out, self._params["w4"], self._params["b4"])
|
||||
#print("Shape ", out.shape)
|
||||
return F.log_softmax(out, dim=1)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._params[key]
|
||||
|
||||
def __str__(self):
|
||||
return "LeNet"
|
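if __name__ == "__main__":
    # Hedged sanity check (added for illustration, not part of the original
    # commit): with num_inp=3 the 5*5*50 flatten size above implies 32x32 inputs.
    net = LeNet(num_inp=3, num_out=10)
    x = torch.rand(2, 3, 32, 32)
    print(net(x).shape)   # expected: torch.Size([2, 10])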
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 118 KiB |
After Width: | Height: | Size: 45 KiB |
After Width: | Height: | Size: 55 KiB |
After Width: | Height: | Size: 65 KiB |
BIN
higher/res/LeNet-100 epochs.png
Normal file
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 47 KiB |
After Width: | Height: | Size: 40 KiB |
After Width: | Height: | Size: 41 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 48 KiB |
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 46 KiB |
After Width: | Height: | Size: 53 KiB |
BIN
higher/res/MNIST/LeNet-10 epochs.png
Normal file
After Width: | Height: | Size: 42 KiB |
764
higher/test_dataug.py
Normal file
|
@ -0,0 +1,764 @@
|
|||
from torch.utils.data import SubsetRandomSampler
|
||||
import torch.optim as optim
|
||||
import torchvision
|
||||
import higher
|
||||
|
||||
from model import *
|
||||
from dataug import *
|
||||
from utils import *
|
||||
|
||||
BATCH_SIZE = 300
|
||||
#TEST_SIZE = 300
|
||||
TEST_SIZE = 10000
|
||||
|
||||
#WARNING : Dataug (Kornia) expects images in the range [0, 1]
|
||||
transform = torchvision.transforms.Compose([
|
||||
torchvision.transforms.ToTensor(),
|
||||
#torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), #CIFAR10
|
||||
])
|
||||
'''
|
||||
data_train = torchvision.datasets.MNIST(
|
||||
"./data", train=True, download=True,
|
||||
transform=torchvision.transforms.Compose([
|
||||
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
|
||||
torchvision.transforms.ToTensor()
|
||||
])
|
||||
)
|
||||
data_test = torchvision.datasets.MNIST(
|
||||
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
'''
|
||||
data_train = torchvision.datasets.CIFAR10(
|
||||
"./data", train=True, download=True, transform=transform
|
||||
)
|
||||
data_test = torchvision.datasets.CIFAR10(
|
||||
"./data", train=False, download=True, transform=transform
|
||||
)
|
||||
#'''
|
||||
train_subset_indices=range(int(len(data_train)/2))
|
||||
#train_subset_indices=range(BATCH_SIZE*10)
|
||||
val_subset_indices=range(int(len(data_train)/2),len(data_train))
|
||||
|
||||
dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices))
|
||||
dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices))
|
||||
dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False)
|
||||
|
||||
device = torch.device('cuda')
|
||||
|
||||
if device == torch.device('cpu'):
|
||||
device_name = 'CPU'
|
||||
else:
|
||||
device_name = torch.cuda.get_device_name(device)
|
||||
|
||||
|
||||
def test(model):
|
||||
model.eval()
|
||||
for i, (features, labels) in enumerate(dl_test):
|
||||
features,labels = features.to(device), labels.to(device)
|
||||
|
||||
pred = model.forward(features)
|
||||
return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100 #One pass only: dl_test uses batch_size=TEST_SIZE, so the first batch covers the whole test set
|
||||
|
||||
def compute_vaLoss(model, dl_val_it):
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
try:
|
||||
model.augment(mode=False) #Validation without transformations!
|
||||
except:
|
||||
pass
|
||||
return F.cross_entropy(model(xs_val), ys_val)
|
||||
|
||||
def train_classic(model, epochs=1):
|
||||
#opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
||||
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
model.train()
|
||||
dl_val_it = iter(dl_val)
|
||||
log = []
|
||||
for epoch in range(epochs):
|
||||
print_torch_mem("Start epoch")
|
||||
t0 = time.process_time()
|
||||
for i, (features, labels) in enumerate(dl_train):
|
||||
#print_torch_mem("Start iter")
|
||||
features,labels = features.to(device), labels.to(device)
|
||||
|
||||
optim.zero_grad()
|
||||
pred = model.forward(features)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
loss.backward()
|
||||
optim.step()
|
||||
|
||||
#### Tests ####
|
||||
tf = time.process_time()
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
val_loss = F.cross_entropy(model(xs_val), ys_val)
|
||||
accuracy=test(model)
|
||||
model.train()
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": None,
|
||||
}
|
||||
log.append(data)
|
||||
|
||||
return log
|
||||
|
||||
def train_classic_higher(model, epochs=1):
|
||||
#opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
||||
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
model.train()
|
||||
dl_val_it = iter(dl_val)
|
||||
log = []
|
||||
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
diffopt = higher.optim.get_diff_optim(optim, model.parameters(),fmodel=fmodel,track_higher_grads=False)
|
||||
#with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=False) as (fmodel, diffopt):
|
||||
|
||||
for epoch in range(epochs):
|
||||
print_torch_mem("Start epoch "+str(epoch))
|
||||
print("Fast param ",len(fmodel._fast_params))
|
||||
t0 = time.process_time()
|
||||
for i, (features, labels) in enumerate(dl_train):
|
||||
#print_torch_mem("Start iter")
|
||||
features,labels = features.to(device), labels.to(device)
|
||||
|
||||
#optim.zero_grad()
|
||||
pred = fmodel.forward(features)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
#.backward()
|
||||
#optim.step()
|
||||
diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
|
||||
|
||||
model_copy(src=fmodel, dst=model, patch_copy=False)
|
||||
optim_copy(dopt=diffopt, opt=optim)
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
diffopt = higher.optim.get_diff_optim(optim, model.parameters(),fmodel=fmodel,track_higher_grads=False)
|
||||
|
||||
#### Tests ####
|
||||
tf = time.process_time()
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
val_loss = F.cross_entropy(model(xs_val), ys_val)
|
||||
accuracy=test(model)
|
||||
model.train()
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": None,
|
||||
}
|
||||
log.append(data)
|
||||
|
||||
return log
|
||||
|
||||
def train_classic_tests(model, epochs=1):
|
||||
#opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
||||
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
countcopy=0
|
||||
model.train()
|
||||
dl_val_it = iter(dl_val)
|
||||
log = []
|
||||
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
doptim = higher.optim.get_diff_optim(optim, model.parameters(), fmodel=fmodel, track_higher_grads=False)
|
||||
for epoch in range(epochs):
|
||||
print_torch_mem("Start epoch")
|
||||
print(len(fmodel._fast_params))
|
||||
t0 = time.process_time()
|
||||
#with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=True) as (fmodel, doptim):
|
||||
|
||||
#fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
#doptim = higher.optim.get_diff_optim(optim, model.parameters(), track_higher_grads=True)
|
||||
|
||||
for i, (features, labels) in enumerate(dl_train):
|
||||
features,labels = features.to(device), labels.to(device)
|
||||
|
||||
#with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=False) as (fmodel, doptim):
|
||||
|
||||
|
||||
#optim.zero_grad()
|
||||
pred = fmodel.forward(features)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
doptim.step(loss) #(opt.zero_grad, loss.backward, opt.step)
|
||||
#loss.backward()
|
||||
#new_params = doptim.step(loss, params=fmodel.parameters())
|
||||
#fmodel.update_params(new_params)
|
||||
|
||||
|
||||
#print('Fast param',len(fmodel._fast_params))
|
||||
#print('opt state', type(doptim.state[0][0]['momentum_buffer']), doptim.state[0][2]['momentum_buffer'].shape)
|
||||
|
||||
if False or (len(fmodel._fast_params)>1):
|
||||
print("fmodel fast param",len(fmodel._fast_params))
|
||||
'''
|
||||
#val_loss = F.cross_entropy(fmodel(features), labels)
|
||||
|
||||
#print_graph(val_loss)
|
||||
|
||||
#val_loss.backward()
|
||||
#print('bip')
|
||||
|
||||
tmp = fmodel.parameters()
|
||||
|
||||
#print(list(tmp)[1])
|
||||
tmp = [higher.utils._copy_tensor(t,safe_copy=True) if isinstance(t, torch.Tensor) else t for t in tmp]
|
||||
#print(len(tmp))
|
||||
|
||||
#fmodel._fast_params.clear()
|
||||
del fmodel._fast_params
|
||||
fmodel._fast_params=None
|
||||
|
||||
fmodel.fast_params=tmp # Surcharge la memoire
|
||||
#fmodel.update_params(tmp) #Meilleur perf / Surcharge la memoire avec trach higher grad
|
||||
|
||||
#optim._fmodel=fmodel
|
||||
'''
|
||||
|
||||
|
||||
countcopy+=1
|
||||
model_copy(src=fmodel, dst=model, patch_copy=False)
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
#doptim.detach_dyn()
|
||||
#tmp = doptim.state
|
||||
#tmp = doptim.state_dict()
|
||||
#for k, v in tmp['state'].items():
|
||||
# print('dict',k, type(v))
|
||||
|
||||
a = optim.param_groups[0]['params'][0]
|
||||
state = optim.state[a]
|
||||
#state['momentum_buffer'] = None
|
||||
#print('opt state', type(optim.state[a]), len(optim.state[a]))
|
||||
#optim.load_state_dict(tmp)
|
||||
|
||||
|
||||
for group_idx, group in enumerate(optim.param_groups):
|
||||
# print('gp idx',group_idx)
|
||||
for p_idx, p in enumerate(group['params']):
|
||||
optim.state[p]=doptim.state[group_idx][p_idx]
|
||||
|
||||
#print('opt state', type(optim.state[a]['momentum_buffer']), optim.state[a]['momentum_buffer'][0:10])
|
||||
#print('dopt state', type(doptim.state[0][0]['momentum_buffer']), doptim.state[0][0]['momentum_buffer'][0:10])
|
||||
'''
|
||||
for a in tmp:
|
||||
#print(type(a), len(a))
|
||||
for nb, b in a.items():
|
||||
#print(nb, type(b), len(b))
|
||||
for n, state in b.items():
|
||||
#print(n, type(states))
|
||||
#print(state.grad_fn)
|
||||
state = torch.tensor(state.data).requires_grad_()
|
||||
#print(state.grad_fn)
|
||||
'''
|
||||
|
||||
|
||||
doptim = higher.optim.get_diff_optim(optim, model.parameters(), track_higher_grads=True)
|
||||
#doptim.state = tmp
|
||||
|
||||
|
||||
countcopy+=1
|
||||
model_copy(src=fmodel, dst=model)
|
||||
optim_copy(dopt=doptim, opt=optim)
|
||||
|
||||
#### Tests ####
|
||||
tf = time.process_time()
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
val_loss = F.cross_entropy(model(xs_val), ys_val)
|
||||
accuracy=test(model)
|
||||
model.train()
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": None,
|
||||
}
|
||||
log.append(data)
|
||||
|
||||
#countcopy+=1
|
||||
#model_copy(src=fmodel, dst=model, patch_copy=False)
|
||||
#optim.load_state_dict(doptim.state_dict()) #Besoin sauver etat otpim ?
|
||||
|
||||
print("Copy ", countcopy)
|
||||
return log
|
||||
|
||||
def run_simple_dataug(inner_it, epochs=1):
|
||||
|
||||
dl_train_it = iter(dl_train)
|
||||
dl_val_it = iter(dl_val)
|
||||
|
||||
#aug_model = nn.Sequential(
|
||||
# Data_aug(),
|
||||
# LeNet(1,10),
|
||||
# )
|
||||
aug_model = Augmented_model(Data_aug(), LeNet(1,10)).to(device)
|
||||
print(str(aug_model))
|
||||
meta_opt = torch.optim.Adam(aug_model['data_aug'].parameters(), lr=1e-2)
|
||||
inner_opt = torch.optim.SGD(aug_model['model'].parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
log = []
|
||||
t0 = time.process_time()
|
||||
|
||||
epoch = 0
|
||||
while epoch < epochs:
|
||||
meta_opt.zero_grad()
|
||||
aug_model.train()
|
||||
with higher.innerloop_ctx(aug_model, inner_opt, copy_initial_weights=True, track_higher_grads=True) as (fmodel, diffopt): #effect of copy_initial_weights not clear...
|
||||
|
||||
for i in range(inner_it):
|
||||
try:
|
||||
xs, ys = next(dl_train_it)
|
||||
except StopIteration: #End of training epoch
|
||||
tf = time.process_time()
|
||||
epoch +=1
|
||||
dl_train_it = iter(dl_train)
|
||||
xs, ys = next(dl_train_it)
|
||||
|
||||
accuracy=test(aug_model)
|
||||
aug_model.train()
|
||||
|
||||
#### Print ####
|
||||
print('-'*9)
|
||||
print('Epoch %d/%d'%(epoch,epochs))
|
||||
print('train loss',loss.item(), '/ val loss', val_loss.item())
|
||||
print('acc', accuracy)
|
||||
print('mag', aug_model['data_aug']['mag'].item())
|
||||
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": aug_model['data_aug']['mag'].item(),
|
||||
}
|
||||
log.append(data)
|
||||
t0 = time.process_time()
|
||||
|
||||
xs, ys = xs.to(device), ys.to(device)
|
||||
|
||||
logits = fmodel(xs) # modified `params` can also be passed as a kwarg
|
||||
|
||||
loss = F.cross_entropy(logits, ys) # no need to call loss.backwards()
|
||||
#loss.backward(retain_graph=True)
|
||||
#print(fmodel['model']._params['b4'].grad)
|
||||
#print('mag', fmodel['data_aug']['mag'].grad)
|
||||
|
||||
diffopt.step(loss) # note that `step` must take `loss` as an argument!
|
||||
# The line above gets P[t+1] from P[t] and loss[t]. `step` also returns
|
||||
# these new parameters, as an alternative to getting them from
|
||||
# `fmodel.fast_params` or `fmodel.parameters()` after calling
|
||||
# `diffopt.step`.
|
||||
|
||||
# At this point, or at any point in the iteration, you can take the
|
||||
# gradient of `fmodel.parameters()` (or equivalently
|
||||
# `fmodel.fast_params`) w.r.t. `fmodel.parameters(time=0)` (equivalently
|
||||
# `fmodel.init_fast_params`). i.e. `fast_params` will always have
|
||||
# `grad_fn` as an attribute, and be part of the gradient tape.
|
||||
|
||||
# At the end of your inner loop you can obtain these e.g. ...
|
||||
#grad_of_grads = torch.autograd.grad(
|
||||
# meta_loss_fn(fmodel.parameters()), fmodel.parameters(time=0))
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
fmodel.augment(mode=False)
|
||||
val_logits = fmodel(xs_val) #Validation without transformations!
|
||||
val_loss = F.cross_entropy(val_logits, ys_val)
|
||||
#print('val_loss',val_loss.item())
|
||||
val_loss.backward()
|
||||
|
||||
#print('mag', fmodel['data_aug']['mag'], '/', fmodel['data_aug']['mag'].grad)
|
||||
|
||||
#model=copy.deepcopy(fmodel)
|
||||
aug_model.load_state_dict(fmodel.state_dict()) #Do not copy gradient !
|
||||
#Copy the gradients
|
||||
for paramName, paramValue, in fmodel.named_parameters():
|
||||
for netCopyName, netCopyValue, in aug_model.named_parameters():
|
||||
if paramName == netCopyName:
|
||||
netCopyValue.grad = paramValue.grad
|
||||
|
||||
#print('mag', aug_model['data_aug']['mag'], '/', aug_model['data_aug']['mag'].grad)
|
||||
meta_opt.step()
|
||||
|
||||
plot_res(log, fig_name="res/{}-{} epochs- {} in_it".format(str(aug_model),epochs,inner_it))
|
||||
print('-'*9)
|
||||
times = [x["time"] for x in log]
|
||||
print(str(aug_model),": acc", max([x["acc"] for x in log]), "in (ms):", np.mean(times), "+/-", np.std(times))
|
||||
|
||||
def run_dist_dataug(model, epochs=1, inner_it=1, dataug_epoch_start=0):
|
||||
|
||||
dl_train_it = iter(dl_train)
|
||||
dl_val_it = iter(dl_val)
|
||||
|
||||
meta_opt = torch.optim.Adam(model['data_aug'].parameters(), lr=1e-3)
|
||||
inner_opt = torch.optim.SGD(model['model'].parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
high_grad_track = True
|
||||
if dataug_epoch_start>0:
|
||||
model.augment(mode=False)
|
||||
high_grad_track = False
|
||||
|
||||
model.train()
|
||||
|
||||
log = []
|
||||
t0 = time.process_time()
|
||||
|
||||
countcopy=0
|
||||
val_loss=torch.tensor(0)
|
||||
opt_param=None
|
||||
|
||||
epoch = 0
|
||||
while epoch < epochs:
|
||||
meta_opt.zero_grad()
|
||||
with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, override=opt_param, track_higher_grads=high_grad_track) as (fmodel, diffopt): #effect of copy_initial_weights not clear...
|
||||
|
||||
for i in range(inner_it):
|
||||
try:
|
||||
xs, ys = next(dl_train_it)
|
||||
except StopIteration: #End of training epoch
|
||||
tf = time.process_time()
|
||||
epoch +=1
|
||||
dl_train_it = iter(dl_train)
|
||||
xs, ys = next(dl_train_it)
|
||||
|
||||
#viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch))
|
||||
#viz_sample_data(imgs=aug_model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch))
|
||||
|
||||
accuracy=test(model)
|
||||
model.train()
|
||||
|
||||
#### Print ####
|
||||
print('-'*9)
|
||||
print('Epoch : %d/%d'%(epoch,epochs))
|
||||
print('Train loss :',loss.item(), '/ val loss', val_loss.item())
|
||||
print('Accuracy :', accuracy)
|
||||
print('Data Augmentation : {} (Epoch {})'.format(model._data_augmentation, dataug_epoch_start))
|
||||
print('TF Proba :', model['data_aug']['prob'].data)
|
||||
#print('proba grad',aug_model['data_aug']['prob'].grad)
|
||||
#############
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": [p for p in model['data_aug']['prob']],
|
||||
}
|
||||
log.append(data)
|
||||
#############
|
||||
|
||||
if epoch == dataug_epoch_start:
|
||||
print('Starting Data Augmentation...')
|
||||
model.augment(mode=True)
|
||||
high_grad_track = True
|
||||
|
||||
t0 = time.process_time()
|
||||
|
||||
xs, ys = xs.to(device), ys.to(device)
|
||||
|
||||
'''
|
||||
#Methode exacte
|
||||
final_loss = 0
|
||||
for tf_idx in range(fmodel['data_aug']._nb_tf):
|
||||
fmodel['data_aug'].transf_idx=tf_idx
|
||||
logits = fmodel(xs)
|
||||
loss = F.cross_entropy(logits, ys)
|
||||
#loss.backward(retain_graph=True)
|
||||
#print('idx', tf_idx)
|
||||
#print(fmodel['data_aug']['prob'][tf_idx], fmodel['data_aug']['prob'][tf_idx].grad)
|
||||
final_loss += loss*fmodel['data_aug']['prob'][tf_idx] #Take it in the forward function ?
|
||||
|
||||
loss = final_loss
|
||||
'''
|
||||
#Uniform method
|
||||
logits = fmodel(xs) # modified `params` can also be passed as a kwarg
|
||||
loss = F.cross_entropy(logits, ys, reduction='none') # no need to call loss.backwards()
|
||||
if fmodel._data_augmentation: #Weight loss
|
||||
w_loss = fmodel['data_aug'].loss_weight().to(device)
|
||||
loss = loss * w_loss
|
||||
loss = loss.mean()
|
||||
#'''
|
||||
|
||||
#to visualize computational graph
|
||||
#print_graph(loss)
|
||||
|
||||
#loss.backward(retain_graph=True)
|
||||
#print(fmodel['model']._params['b4'].grad)
|
||||
#print('prob grad', fmodel['data_aug']['prob'].grad)
|
||||
|
||||
diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
|
||||
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
xs_val, ys_val = xs_val.to(device), ys_val.to(device)
|
||||
|
||||
fmodel.augment(mode=False) #Validation without transformations!
|
||||
val_loss = F.cross_entropy(fmodel(xs_val), ys_val)
|
||||
|
||||
#print_graph(val_loss)
|
||||
|
||||
val_loss.backward()
|
||||
|
||||
countcopy+=1
|
||||
model_copy(src=fmodel, dst=model)
|
||||
optim_copy(dopt=diffopt, opt=inner_opt)
|
||||
|
||||
meta_opt.step()
|
||||
model['data_aug'].adjust_prob() #Constraint: sum(proba)=1
|
||||
|
||||
print("Copy ", countcopy)
|
||||
return log
|
||||
|
||||
def run_dist_dataugV2(model, epochs=1, inner_it=0, dataug_epoch_start=0, print_freq=1, loss_patience=None):
|
||||
|
||||
log = []
|
||||
countcopy=0
|
||||
val_loss=torch.tensor(0) #Needed if no meta step happens during an epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
|
||||
meta_opt = torch.optim.Adam(model['data_aug'].parameters(), lr=1e-2)
|
||||
inner_opt = torch.optim.SGD(model['model'].parameters(), lr=1e-2, momentum=0.9)
|
||||
|
||||
high_grad_track = True
|
||||
if inner_it == 0:
|
||||
high_grad_track=False
|
||||
if dataug_epoch_start!=0:
|
||||
model.augment(mode=False)
|
||||
high_grad_track = False
|
||||
|
||||
val_loss_monitor= None
|
||||
if loss_patience != None :
|
||||
if dataug_epoch_start==-1: val_loss_monitor = loss_monitor(patience=loss_patience, end_train=2) #1st limit = dataug start
|
||||
else: val_loss_monitor = loss_monitor(patience=loss_patience) #Val loss monitor
|
||||
|
||||
model.train()
|
||||
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel,track_higher_grads=high_grad_track)
|
||||
|
||||
for epoch in range(1, epochs+1):
|
||||
#print_torch_mem("Start epoch "+str(epoch))
|
||||
#print(high_grad_track, fmodel._data_augmentation, len(fmodel._fast_params))
|
||||
t0 = time.process_time()
|
||||
#with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, override=opt_param, track_higher_grads=high_grad_track) as (fmodel, diffopt):
|
||||
|
||||
for i, (xs, ys) in enumerate(dl_train):
|
||||
xs, ys = xs.to(device), ys.to(device)
|
||||
'''
|
||||
#Methode exacte
|
||||
final_loss = 0
|
||||
for tf_idx in range(fmodel['data_aug']._nb_tf):
|
||||
fmodel['data_aug'].transf_idx=tf_idx
|
||||
logits = fmodel(xs)
|
||||
loss = F.cross_entropy(logits, ys)
|
||||
#loss.backward(retain_graph=True)
|
||||
#print('idx', tf_idx)
|
||||
#print(fmodel['data_aug']['prob'][tf_idx], fmodel['data_aug']['prob'][tf_idx].grad)
|
||||
final_loss += loss*fmodel['data_aug']['prob'][tf_idx] #Take it in the forward function ?
|
||||
|
||||
loss = final_loss
|
||||
'''
|
||||
#Uniform method
|
||||
|
||||
logits = fmodel(xs) # modified `params` can also be passed as a kwarg
|
||||
loss = F.cross_entropy(logits, ys, reduction='none') # no need to call loss.backwards()
|
||||
#LOSS NOT WEIGHTED FOR MIX DIST
|
||||
if fmodel._data_augmentation: # and not fmodel['data_aug']._mix_dist: #Weight loss
|
||||
w_loss = fmodel['data_aug'].loss_weight().to(device)
|
||||
loss = loss * w_loss
|
||||
loss = loss.mean()
|
||||
#'''
|
||||
|
||||
#to visualize computational graph
|
||||
#print_graph(loss)
|
||||
|
||||
#loss.backward(retain_graph=True)
|
||||
#print(fmodel['model']._params['b4'].grad)
|
||||
#print('prob grad', fmodel['data_aug']['prob'].grad)
|
||||
|
||||
diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step)
|
||||
|
||||
if(high_grad_track and i%inner_it==0): #Perform Meta step
|
||||
#print("meta")
|
||||
#Of little use if high_grad_track = False
|
||||
val_loss = compute_vaLoss(model=fmodel, dl_val_it=dl_val_it)
|
||||
|
||||
#print_graph(val_loss)
|
||||
|
||||
val_loss.backward()
|
||||
|
||||
countcopy+=1
|
||||
model_copy(src=fmodel, dst=model)
|
||||
optim_copy(dopt=diffopt, opt=inner_opt)
|
||||
|
||||
meta_opt.step()
|
||||
model['data_aug'].adjust_prob(soft=False) #Constraint: sum(proba)=1
|
||||
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel, track_higher_grads=high_grad_track)
|
||||
|
||||
tf = time.process_time()
|
||||
|
||||
#viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch))
|
||||
#viz_sample_data(imgs=aug_model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch))
|
||||
|
||||
if(not high_grad_track):
|
||||
countcopy+=1
|
||||
model_copy(src=fmodel, dst=model)
|
||||
optim_copy(dopt=diffopt, opt=inner_opt)
|
||||
val_loss = compute_vaLoss(model=fmodel, dl_val_it=dl_val_it)
|
||||
|
||||
#Needed to reset higher (fast_params accumulate even with track_higher_grads = False)
|
||||
fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
|
||||
diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel, track_higher_grads=high_grad_track)
|
||||
|
||||
accuracy=test(model)
|
||||
model.train()
|
||||
|
||||
#### Print ####
|
||||
if(print_freq and epoch%print_freq==0):
|
||||
print('-'*9)
|
||||
print('Epoch : %d/%d'%(epoch,epochs))
|
||||
print('Time : %.00f ms'%(tf - t0))
|
||||
print('Train loss :',loss.item(), '/ val loss', val_loss.item())
|
||||
print('Accuracy :', accuracy)
|
||||
print('Data Augmentation : {} (Epoch {})'.format(model._data_augmentation, dataug_epoch_start))
|
||||
print('TF Proba :', model['data_aug']['prob'].data)
|
||||
#print('proba grad',aug_model['data_aug']['prob'].grad)
|
||||
#############
|
||||
#### Log ####
|
||||
data={
|
||||
"epoch": epoch,
|
||||
"train_loss": loss.item(),
|
||||
"val_loss": val_loss.item(),
|
||||
"acc": accuracy,
|
||||
"time": tf - t0,
|
||||
|
||||
"param": [p.item() for p in model['data_aug']['prob']],
|
||||
}
|
||||
log.append(data)
|
||||
#############
|
||||
if val_loss_monitor :
|
||||
val_loss_monitor.register(val_loss.item())
|
||||
if val_loss_monitor.end_training(): break #Stop training
|
||||
|
||||
|
||||
if not model.is_augmenting() and (epoch == dataug_epoch_start or (val_loss_monitor and val_loss_monitor.limit_reached()==1)):
|
||||
print('Starting Data Augmentation...')
|
||||
dataug_epoch_start = epoch
|
||||
model.augment(mode=True)
|
||||
if inner_it != 0: high_grad_track = True
|
||||
|
||||
print("Copy ", countcopy)
|
||||
return log
|
||||
|
||||
##########################################
|
||||
if __name__ == "__main__":
|
||||
|
||||
n_inner_iter = 0
|
||||
epochs = 2
|
||||
dataug_epoch_start=0
|
||||
|
||||
#### Classic ####
|
||||
'''
|
||||
model = LeNet(3,10).to(device)
|
||||
#model = torchvision.models.resnet18()
|
||||
#model = Augmented_model(Data_augV3(mix_dist=0.0), LeNet(3,10)).to(device)
|
||||
#model.augment(mode=False)
|
||||
|
||||
print(str(model), 'on', device_name)
|
||||
log= train_classic_higher(model=model, epochs=epochs)
|
||||
|
||||
####
|
||||
plot_res(log, fig_name="res/{}-{} epochs".format(str(model),epochs))
|
||||
print('-'*9)
|
||||
times = [x["time"] for x in log]
|
||||
out = {"Accuracy": max([x["acc"] for x in log]), "Time": (np.mean(times),np.std(times)), "Device": device_name, "Log": log}
|
||||
print(str(model),": acc", out["Accuracy"], "in (ms):", out["Time"][0], "+/-", out["Time"][1])
|
||||
with open("res/log/%s.json" % "{}-{} epochs".format(str(model),epochs), "w+") as f:
|
||||
json.dump(out, f, indent=True)
|
||||
print('Log :\"',f.name, '\" saved !')
|
||||
print('-'*9)
|
||||
'''
|
||||
#### Augmented Model ####
|
||||
#'''
|
||||
aug_model = Augmented_model(Data_augV4(TF_dict=TF.TF_dict, mix_dist=0.0), LeNet(3,10)).to(device)
|
||||
print(str(aug_model), 'on', device_name)
|
||||
#run_simple_dataug(inner_it=n_inner_iter, epochs=epochs)
|
||||
log= run_dist_dataugV2(model=aug_model, epochs=epochs, inner_it=n_inner_iter, dataug_epoch_start=dataug_epoch_start, print_freq=10, loss_patience=10)
|
||||
|
||||
####
|
||||
plot_res(log, fig_name="res/{}-{} epochs (dataug:{})- {} in_it".format(str(aug_model),epochs,dataug_epoch_start,n_inner_iter))
|
||||
print('-'*9)
|
||||
times = [x["time"] for x in log]
|
||||
out = {"Accuracy": max([x["acc"] for x in log]), "Time": (np.mean(times),np.std(times)), "Device": device_name, "Param_names": aug_model.TF_names(), "Log": log}
|
||||
print(str(aug_model),": acc", out["Accuracy"], "in (ms):", out["Time"][0], "+/-", out["Time"][1])
|
||||
with open("res/log/%s.json" % "{}-{} epochs (dataug:{})- {} in_it".format(str(aug_model),epochs,dataug_epoch_start,n_inner_iter), "w+") as f:
|
||||
json.dump(out, f, indent=True)
|
||||
print('Log :\"',f.name, '\" saved !')
|
||||
print('-'*9)
|
||||
#'''
|
||||
|
||||
#### Comparison ####
|
||||
'''
|
||||
files=[
|
||||
#"res/log/LeNet-100 epochs.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Uniform-4 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Uniform-4 TF)-LeNet)-100 epochs (dataug:50)- 0 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Mix 0,5-3 TF)-LeNet)-100 epochs (dataug:0)- 1 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Mix 0.5-3 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json",
|
||||
#"res/log/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:0)- 10 in_it.json",
|
||||
"res/log/Aug_mod(Data_augV4(Uniform-10 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json",
|
||||
"res/log/Aug_mod(Data_augV4(Uniform-10 TF)-LeNet)-100 epochs (dataug:50)- 0 in_it.json",
|
||||
]
|
||||
plot_compare(filenames=files, fig_name="res/compare")
|
||||
'''
|
150
higher/test_lr.py
Normal file
|
@ -0,0 +1,150 @@
|
|||
import numpy as np
|
||||
import json, math, time, os
|
||||
|
||||
from torch.utils.data import SubsetRandomSampler
|
||||
import torch.optim as optim
|
||||
import higher
import torchvision
|
||||
from model import *
|
||||
|
||||
import copy
|
||||
|
||||
BATCH_SIZE = 300
|
||||
TEST_SIZE = 300
|
||||
|
||||
mnist_train = torchvision.datasets.MNIST(
|
||||
"./data", train=True, download=True,
|
||||
transform=torchvision.transforms.Compose([
|
||||
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
|
||||
torchvision.transforms.ToTensor()
|
||||
])
|
||||
)
|
||||
|
||||
mnist_test = torchvision.datasets.MNIST(
|
||||
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
|
||||
)
|
||||
|
||||
#train_subset_indices=range(int(len(mnist_train)/2))
|
||||
train_subset_indices=range(BATCH_SIZE)
|
||||
val_subset_indices=range(int(len(mnist_train)/2),len(mnist_train))
|
||||
|
||||
dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices))
|
||||
dl_val = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices))
|
||||
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=TEST_SIZE, shuffle=False)
|
||||
|
||||
|
||||
def test(model):
|
||||
model.eval()
|
||||
for i, (features, labels) in enumerate(dl_test):
|
||||
pred = model.forward(features)
|
||||
return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100
|
||||
|
||||
def train_classic(model, optim, epochs=1):
|
||||
model.train()
|
||||
log = []
|
||||
for epoch in range(epochs):
|
||||
t0 = time.process_time()
|
||||
for i, (features, labels) in enumerate(dl_train):
|
||||
|
||||
optim.zero_grad()
|
||||
pred = model.forward(features)
|
||||
loss = F.cross_entropy(pred,labels)
|
||||
loss.backward()
|
||||
optim.step()
|
||||
|
||||
#### Log ####
|
||||
tf = time.process_time()
|
||||
data={
|
||||
"time": tf - t0,
|
||||
}
|
||||
log.append(data)
|
||||
|
||||
times = [x["time"] for x in log]
|
||||
print("Vanilla : acc", test(model), "in (ms):", np.mean(times), "+/-", np.std(times))
|
||||
##########################################
|
||||
if __name__ == "__main__":
|
||||
|
||||
device = torch.device('cpu')
|
||||
|
||||
model = LeNet(1,10)
|
||||
opt_param = {
|
||||
"lr": torch.tensor(1e-2).requires_grad_(),
|
||||
"momentum": torch.tensor(0.9).requires_grad_()
|
||||
}
|
||||
n_inner_iter = 1
|
||||
dl_train_it = iter(dl_train)
|
||||
dl_val_it = iter(dl_val)
|
||||
epoch = 0
|
||||
epochs = 10
|
||||
|
||||
####
|
||||
train_classic(model=model, optim=torch.optim.Adam(model.parameters(), lr=0.001), epochs=epochs)
|
||||
model = LeNet(1,10)
|
||||
|
||||
meta_opt = torch.optim.Adam(opt_param.values(), lr=1e-2)
|
||||
inner_opt = torch.optim.SGD(model.parameters(), lr=opt_param['lr'], momentum=opt_param['momentum'])
|
||||
#for xs_val, ys_val in dl_val:
|
||||
while epoch < epochs:
|
||||
#print(data_aug.params["mag"], data_aug.params["mag"].grad)
|
||||
meta_opt.zero_grad()
|
||||
model.train()
|
||||
with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, track_higher_grads=True) as (fmodel, diffopt): #effect of copy_initial_weights not clear...
|
||||
|
||||
for param_group in diffopt.param_groups:
|
||||
param_group['lr'] = opt_param['lr']
|
||||
param_group['momentum'] = opt_param['momentum']
|
||||
|
||||
for i in range(n_inner_iter):
|
||||
try:
|
||||
xs, ys = next(dl_train_it)
|
||||
except StopIteration: #End of training epoch
|
||||
epoch +=1
|
||||
dl_train_it = iter(dl_train)
|
||||
xs, ys = next(dl_train_it)
|
||||
|
||||
print('Epoch', epoch)
|
||||
print('train loss',loss.item(), '/ val loss', val_loss.item())
|
||||
print('acc', test(model))
|
||||
print('opt : lr', opt_param['lr'].item(), 'momentum', opt_param['momentum'].item())
|
||||
print('-'*9)
|
||||
model.train()
|
||||
|
||||
|
||||
logits = fmodel(xs) # modified `params` can also be passed as a kwarg
|
||||
loss = F.cross_entropy(logits, ys) # no need to call loss.backwards()
|
||||
#print('loss',loss.item())
|
||||
diffopt.step(loss) # note that `step` must take `loss` as an argument!
|
||||
# The line above gets P[t+1] from P[t] and loss[t]. `step` also returns
|
||||
# these new parameters, as an alternative to getting them from
|
||||
# `fmodel.fast_params` or `fmodel.parameters()` after calling
|
||||
# `diffopt.step`.
|
||||
|
||||
# At this point, or at any point in the iteration, you can take the
|
||||
# gradient of `fmodel.parameters()` (or equivalently
|
||||
# `fmodel.fast_params`) w.r.t. `fmodel.parameters(time=0)` (equivalently
|
||||
# `fmodel.init_fast_params`). i.e. `fast_params` will always have
|
||||
# `grad_fn` as an attribute, and be part of the gradient tape.
|
||||
|
||||
# At the end of your inner loop you can obtain these e.g. ...
|
||||
#grad_of_grads = torch.autograd.grad(
|
||||
# meta_loss_fn(fmodel.parameters()), fmodel.parameters(time=0))
|
||||
try:
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
except StopIteration: #End of validation epoch
|
||||
dl_val_it = iter(dl_val)
|
||||
xs_val, ys_val = next(dl_val_it)
|
||||
|
||||
val_logits = fmodel(xs_val)
|
||||
val_loss = F.cross_entropy(val_logits, ys_val)
|
||||
#print('val_loss',val_loss.item())
|
||||
|
||||
val_loss.backward()
|
||||
#meta_grads = torch.autograd.grad(val_loss, opt_lr, allow_unused=True)
|
||||
#print(meta_grads)
|
||||
for param_group in diffopt.param_groups:
|
||||
print(param_group['lr'], '/',param_group['lr'].grad)
|
||||
print(param_group['momentum'], '/',param_group['momentum'].grad)
|
||||
|
||||
#model=copy.deepcopy(fmodel)
|
||||
model.load_state_dict(fmodel.state_dict())
|
||||
|
||||
meta_opt.step()
|
205
higher/transformations.py
Normal file
|
@ -0,0 +1,205 @@
|
|||
import torch
|
||||
import kornia
|
||||
import random
|
||||
|
||||
### Available TF for Dataug ###
|
||||
TF_dict={ #f(normalized_mag)=actual_mag
|
||||
## Geometric TF ##
|
||||
'Identity' : (lambda mag: None),
|
||||
'FlipUD' : (lambda mag: None),
|
||||
'FlipLR' : (lambda mag: None),
|
||||
'Rotate': (lambda mag: random.randint(-int_parameter(mag, maxval=30), int_parameter(mag, maxval=30))),
|
||||
'TranslateX': (lambda mag: [random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20)), 0]),
|
||||
'TranslateY': (lambda mag: [0, random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20))]),
|
||||
'ShearX': (lambda mag: [random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3)), 0]),
|
||||
'ShearY': (lambda mag: [0, random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3))]),
|
||||
|
||||
## Color TF (Expect image in the range of [0, 1]) ##
|
||||
'Contrast': (lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Color':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Brightness':(lambda mag: random.uniform(1., float_parameter(mag, maxval=1.9))),
|
||||
'Sharpness':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))),
|
||||
'Posterize': (lambda mag: random.randint(4, int_parameter(mag, maxval=8))),
|
||||
'Solarize': (lambda mag: random.randint(1, int_parameter(mag, maxval=256))/256.), #=>Image in [0,1] #Not optimized for batches
|
||||
|
||||
#Not functional
|
||||
#'Auto_Contrast': (lambda mag: None), #Not optimized for batches (very slow)
|
||||
#'Equalize': (lambda mag: None),
|
||||
}
|
||||
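def _tf_dict_usage_sketch():
    # Hedged usage sketch (added for illustration, not part of the original commit):
    # each TF_dict entry maps a normalized magnitude in [0, PARAMETER_MAX] to a
    # randomly drawn real parameter. Values below follow the helpers defined further
    # down in this file.
    mag = 5
    angle = TF_dict['Rotate'](mag)        # int in [-15, 15] since maxval=30
    shear = TF_dict['ShearX'](mag)        # [x_shear, 0] with |x_shear| <= 0.15
    contrast = TF_dict['Contrast'](mag)   # float in [0.1, 0.95]
    return angle, shear, contrast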
|
||||
|
||||
def int_image(float_image): #WARNING : slight loss of information (granularity : 1/256 = 0.0039)
|
||||
return (float_image*255.).type(torch.uint8)
|
||||
|
||||
def float_image(int_image):
|
||||
return int_image.type(torch.float)/255.
|
||||
|
||||
def rand_inverse(value):
|
||||
return value if random.random() < 0.5 else -value
|
||||
|
||||
#https://github.com/tensorflow/models/blob/fc2056bce6ab17eabdc139061fef8f4f2ee763ec/research/autoaugment/augmentation_transforms.py#L137
|
||||
PARAMETER_MAX = 10 # What is the max 'level' a transform could be predicted
|
||||
def float_parameter(level, maxval):
|
||||
"""Helper function to scale `val` between 0 and maxval .
|
||||
Args:
|
||||
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
|
||||
maxval: Maximum value that the operation can have. This will be scaled
|
||||
to level/PARAMETER_MAX.
|
||||
Returns:
|
||||
A float that results from scaling `maxval` according to `level`.
|
||||
"""
|
||||
return float(level) * maxval / PARAMETER_MAX
|
||||
|
||||
def int_parameter(level, maxval):
|
||||
"""Helper function to scale `val` between 0 and maxval .
|
||||
Args:
|
||||
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
|
||||
maxval: Maximum value that the operation can have. This will be scaled
|
||||
to level/PARAMETER_MAX.
|
||||
Returns:
|
||||
An int that results from scaling `maxval` according to `level`.
|
||||
"""
|
||||
return int(level * maxval / PARAMETER_MAX)
|
||||
|
||||
def flipLR(x):
    device = x.device
    (batch_size, channels, h, w) = x.shape

    M = torch.tensor([[[-1., 0., w-1],
                       [ 0., 1., 0.],
                       [ 0., 0., 1.]]], device=device).expand(batch_size,-1,-1)

    # warp the original image by the found transform
    return kornia.warp_perspective(x, M, dsize=(h, w))

def flipUD(x):
    device = x.device
    (batch_size, channels, h, w) = x.shape

    M = torch.tensor([[[ 1., 0., 0.],
                       [ 0., -1., h-1],
                       [ 0., 0., 1.]]], device=device).expand(batch_size,-1,-1)

    # warp the original image by the found transform
    return kornia.warp_perspective(x, M, dsize=(h, w))

def rotate(x, angle):
    return kornia.rotate(x, angle=angle.type(torch.float32)) # Kornia does not support int

def translate(x, translation):
    return kornia.translate(x, translation=translation.type(torch.float32)) # Kornia does not support int

def shear(x, shear):
    return kornia.shear(x, shear=shear)

def contrast(x, contrast_factor):
    return kornia.adjust_contrast(x, contrast_factor=contrast_factor) # Expect image in the range of [0, 1]

#https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageEnhance.py
def color(x, color_factor):
    (batch_size, channels, h, w) = x.shape

    gray_x = kornia.rgb_to_grayscale(x)
    gray_x = gray_x.repeat_interleave(channels, dim=1)
    return blend(gray_x, x, color_factor).clamp(min=0.0, max=1.0) # Expect image in the range of [0, 1]

def brightness(x, brightness_factor):
    device = x.device

    return blend(torch.zeros(x.size(), device=device), x, brightness_factor).clamp(min=0.0, max=1.0) # Expect image in the range of [0, 1]

def sharpeness(x, sharpness_factor):
    device = x.device
    (batch_size, channels, h, w) = x.shape

    k = torch.tensor([[[ 1., 1., 1.],
                       [ 1., 5., 1.],
                       [ 1., 1., 1.]]], device=device) # Smooth filter: https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageFilter.py
    smooth_x = kornia.filter2D(x, kernel=k, border_type='reflect', normalized=True) # The alpha channel may need to be handled separately

    return blend(smooth_x, x, sharpness_factor).clamp(min=0.0, max=1.0) # Expect image in the range of [0, 1]

#https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageOps.py
def posterize(x, bits):
    x = int_image(x) # Expect image in the range of [0, 1]

    mask = ~(2 ** (8 - bits) - 1).type(torch.uint8)

    (batch_size, channels, h, w) = x.shape
    mask = mask.unsqueeze(dim=1).expand(-1,channels).unsqueeze(dim=2).expand(-1,channels, h).unsqueeze(dim=3).expand(-1,channels, h, w) # There is surely a simpler way ... (see the broadcasting sketch after `blend` below)

    return float_image(x & mask)

def auto_contrast(x): # NOT OPTIMIZED FOR BATCHES # EXTREMELY SLOW
    # Possible optimization: efficient LUT application / per-batch/channel histogram computation
    print("Warning: not checked yet!")
    (batch_size, channels, h, w) = x.shape
    x = int_image(x) # Expect image in the range of [0, 1]
    #print('Start', x[0])
    for im_idx, img in enumerate(x.chunk(batch_size, dim=0)): # per-image operation
        #print(img.shape)
        for chan_idx, chan in enumerate(img.chunk(channels, dim=1)): # per-channel operation
            #print(chan.shape)
            hist = torch.histc(chan, bins=256, min=0, max=255) # NOT DIFFERENTIABLE

            # find lowest/highest samples after preprocessing
            for lo in range(256):
                if hist[lo]:
                    break
            for hi in range(255, -1, -1):
                if hist[hi]:
                    break
            if hi <= lo:
                # don't bother
                pass
            else:
                scale = 255.0 / (hi - lo)
                offset = -lo * scale
                for ix in range(256):
                    n_ix = int(ix * scale + offset)
                    if n_ix < 0: n_ix = 0
                    elif n_ix > 255: n_ix = 255

                    chan[chan==ix] = n_ix
            x[im_idx, chan_idx] = chan

    #print('End', x[0])
    return float_image(x)

def equalize(x): # NOT OPTIMIZED FOR BATCHES
    raise NotImplementedError("equalize is not implemented") # the original raised Exception(self, ...), but `self` does not exist at module level
    # Possible optimization: efficient LUT application / per-batch/channel histogram computation
    (batch_size, channels, h, w) = x.shape
    x = int_image(x) # Expect image in the range of [0, 1]
    #print('Start', x[0])
    for im_idx, img in enumerate(x.chunk(batch_size, dim=0)): # per-image operation
        #print(img.shape)
        for chan_idx, chan in enumerate(img.chunk(channels, dim=1)): # per-channel operation
            #print(chan.shape)
            hist = torch.histc(chan, bins=256, min=0, max=255) # NOT DIFFERENTIABLE

    return float_image(x)

def solarize(x, thresholds): # NOT OPTIMIZED FOR BATCHES
    # Possible optimization: apply a mask to all the data at once (mask = (B,C,H,W) > (B))
    for idx, t in enumerate(thresholds): # per-image operation
        mask = x[idx] > t.item()
        inv_x = 1 - x[idx][mask]
        x[idx][mask] = inv_x
    return x

#https://github.com/python-pillow/Pillow/blob/9c78c3f97291bd681bc8637922d6a2fa9415916c/src/PIL/Image.py#L2818
def blend(x, y, alpha): # out = image1 * (1.0 - alpha) + image2 * alpha
    #return kornia.add_weighted(src1=x, alpha=(1-alpha), src2=y, beta=alpha, gamma=0) # out = src1*alpha + src2*beta + gamma # does not work for batched alpha

    if not isinstance(x, torch.Tensor):
        raise TypeError("x should be a tensor. Got {}".format(type(x)))

    if not isinstance(y, torch.Tensor):
        raise TypeError("y should be a tensor. Got {}".format(type(y)))

    (batch_size, channels, h, w) = x.shape
    alpha = alpha.unsqueeze(dim=1).expand(-1,channels).unsqueeze(dim=2).expand(-1,channels, h).unsqueeze(dim=3).expand(-1,channels, h, w) # There is surely a simpler way ... (see the broadcasting sketch below)
    res = x*(1-alpha) + y*alpha

    return res
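# --- Hedged note (not part of the original file) -------------------------------
# The "surely a simpler way" expansions in `posterize` and `blend` can rely on
# broadcasting instead of chained unsqueeze/expand. Sketch, assuming `alpha` has
# shape (B,) as in the calls above:
def _blend_broadcast_sketch(x, y, alpha):
    alpha = alpha.view(-1, 1, 1, 1)   # (B,) -> (B, 1, 1, 1), broadcasts over C, H, W
    return x * (1 - alpha) + y * alpha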
184
higher/utils.py
Normal file
|
@ -0,0 +1,184 @@
|
|||
import numpy as np
import json, math, time, os
import matplotlib.pyplot as plt
import copy
import gc

from torchviz import make_dot

import torch
import torch.nn.functional as F


def print_graph(PyTorch_obj, fig_name='graph'):
    graph = make_dot(PyTorch_obj) # the loss gives the whole graph
    graph.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats
    graph.render(fig_name)

def plot_res(log, fig_name='res'):

    epochs = [x["epoch"] for x in log]

    fig, ax = plt.subplots(ncols=3, figsize=(15, 3))

    ax[0].set_title('Loss')
    ax[0].plot(epochs, [x["train_loss"] for x in log], label='Train')
    ax[0].plot(epochs, [x["val_loss"] for x in log], label='Val')
    ax[0].legend()

    ax[1].set_title('Acc')
    ax[1].plot(epochs, [x["acc"] for x in log])

    if log[0]["param"] != None:
        if isinstance(log[0]["param"], float):
            ax[2].set_title('Mag')
            ax[2].plot(epochs, [x["param"] for x in log], label='Mag')
            ax[2].legend()
        else:
            ax[2].set_title('Prob')
            for idx, _ in enumerate(log[0]["param"]):
                ax[2].plot(epochs, [x["param"][idx] for x in log], label='P'+str(idx))
            ax[2].legend()
            #ax[2].legend(('P-0', 'P-45', 'P-180'))

    fig_name = fig_name.replace('.', ',')
    plt.savefig(fig_name)

def plot_compare(filenames, fig_name='res'):

    all_data = []
    legend = ""
    for idx, file in enumerate(filenames):
        legend += str(idx)+'-'+file+'\n'
        with open(file) as json_file:
            data = json.load(json_file)
            all_data.append(data)

    fig, ax = plt.subplots(ncols=3, figsize=(30, 8))

    for data_idx, log in enumerate(all_data):
        log = log['Log']
        epochs = [x["epoch"] for x in log]

        ax[0].plot(epochs, [x["train_loss"] for x in log], label=str(data_idx)+'-Train')
        ax[0].plot(epochs, [x["val_loss"] for x in log], label=str(data_idx)+'-Val')

        ax[1].plot(epochs, [x["acc"] for x in log], label=str(data_idx))
        #ax[1].text(x=0.5, y=0, s=str(data_idx)+'-'+filenames[data_idx], transform=ax[1].transAxes)

        if log[0]["param"] != None:
            if isinstance(log[0]["param"], float):
                ax[2].plot(epochs, [x["param"] for x in log], label=str(data_idx)+'-Mag')

            else:
                for idx, _ in enumerate(log[0]["param"]):
                    ax[2].plot(epochs, [x["param"][idx] for x in log], label=str(data_idx)+'-P'+str(idx))

    fig.suptitle(legend)
    ax[0].set_title('Loss')
    ax[1].set_title('Acc')
    ax[2].set_title('Param')
    for a in ax: a.legend()
    fig_name = fig_name.replace('.', ',')

    plt.savefig(fig_name, bbox_inches='tight')

def viz_sample_data(imgs, labels, fig_name='data_sample'):

    sample = imgs[0:25,].permute(0, 2, 3, 1).squeeze().cpu()

    plt.figure(figsize=(10, 10))
    for i in range(25):
        plt.subplot(5, 5, i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(sample[i,], cmap=plt.cm.binary)
        plt.xlabel(labels[i].item())

    plt.savefig(fig_name)

def model_copy(src, dst, patch_copy=True, copy_grad=True):
    #model=copy.deepcopy(fmodel) # not appropriate: we only want the weights/grads, not the whole fmodel and its state

    dst.load_state_dict(src.state_dict()) # Does not copy gradients!

    if patch_copy:
        dst['model'].load_state_dict(src['model'].state_dict()) # copy otherwise missing data?
        dst['data_aug'].load_state_dict(src['data_aug'].state_dict())

    # Copy the gradients
    if copy_grad:
        for paramName, paramValue, in src.named_parameters():
            for netCopyName, netCopyValue, in dst.named_parameters():
                if paramName == netCopyName:
                    netCopyValue.grad = paramValue.grad
                    #netCopyValue=copy.deepcopy(paramValue)

    try: # Data_augV4
        dst['data_aug']._input_info = src['data_aug']._input_info
        dst['data_aug']._TF_matrix = src['data_aug']._TF_matrix
    except:
        pass

def optim_copy(dopt, opt):

    #inner_opt.load_state_dict(diffopt.state_dict()) # need to save the optimizer state (momentum, etc.) => does not copy the state...
    #opt_param=higher.optim.get_trainable_opt_params(diffopt)

    for group_idx, group in enumerate(opt.param_groups):
        # print('gp idx', group_idx)
        for p_idx, p in enumerate(group['params']):
            opt.state[p] = dopt.state[group_idx][p_idx]

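# --- Hedged usage sketch (not part of the original file) ------------------------
# Typical use after an unrolled inner loop: copy the patched (functional) model
# and the differentiable-optimizer state back into their regular counterparts.
# The argument names mirror the training code earlier in this commit (`fmodel`,
# `diffopt`, `model`, `inner_opt` are assumptions here, as is the dict-style
# patched model expected when patch_copy=True).
def _sync_after_inner_loop(fmodel, diffopt, model, inner_opt):
    model_copy(src=fmodel, dst=model, patch_copy=True, copy_grad=True)
    optim_copy(dopt=diffopt, opt=inner_opt)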
def print_torch_mem(add_info=''):

    nb = 0
    max_size = 0
    for obj in gc.get_objects():
        #print(type(obj))
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1:
                #print(i, type(obj), obj.size())
                size = np.sum(obj.size())
                if(size > max_size): max_size = size
                nb += 1
        except:
            pass
    print(add_info, "-PyTorch tensor nb:", nb, " / Max dim:", max_size)

    #print(add_info, "-Garbage size :", len(gc.garbage))

class loss_monitor(): # See https://github.com/pytorch/ignite
    def __init__(self, patience, end_train=1):
        self.patience = patience
        self.end_train = end_train
        self.counter = 0
        self.best_score = None
        self.reached_limit = 0

    def register(self, loss):
        if self.best_score is None:
            self.best_score = loss
        elif loss > self.best_score:
            self.counter += 1
            #if not self.reached_limit:
            print("loss no improve counter", self.counter, self.reached_limit)
        else:
            self.best_score = loss
            self.counter = 0

    def limit_reached(self):
        if self.counter >= self.patience:
            self.counter = 0
            self.reached_limit += 1
            self.best_score = None
        return self.reached_limit

    def end_training(self):
        if self.limit_reached() >= self.end_train:
            return True
        else:
            return False

    def reset(self):
        self.__init__(self.patience, self.end_train)
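# --- Hedged usage sketch (not part of the original file) ------------------------
# How loss_monitor is meant to be driven from a training loop: register the
# validation loss every epoch and stop once the patience limit has been reached
# `end_train` times. `compute_val_loss` and `max_epochs` are placeholders.
def _early_stopping_sketch(compute_val_loss, max_epochs=100):
    monitor = loss_monitor(patience=5, end_train=1)
    for epoch in range(max_epochs):
        monitor.register(compute_val_loss())
        if monitor.end_training():
            break
    return epoch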