commit 3ae3e02e590bb094d045f89bbd51fbc3b58d6f4f Author: Harle, Antoine (Contracteur) Date: Fri Nov 8 11:28:06 2019 -0500 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d38ecc --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +/higher/data +/Gradient-Descent-The-Ultimate-Optimizer/data +/FAR-HO/data +/__pycache__ + +*.pyo +*.pyc +*~ + +# Compiled source # +################### +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Packages # +############ +# it's better to unpack these files and commit the raw source +# git has its own built in compression methods +*.7z +*.dmg +*.gz +*.iso +*.jar +*.rar +*.tar +*.zip + +# Logs and databases # +###################### +*.log +*.sql +*.sqlite + +# OS generated files # +###################### +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db diff --git a/FAR-HO/augmentation_transforms.py b/FAR-HO/augmentation_transforms.py new file mode 100755 index 0000000..ef17188 --- /dev/null +++ b/FAR-HO/augmentation_transforms.py @@ -0,0 +1,456 @@ +# Copyright 2018 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Transforms used in the Augmentation Policies.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import inspect +import random +import numpy as np +# pylint:disable=g-multiple-import +from PIL import ImageOps, ImageEnhance, ImageFilter, Image +# pylint:enable=g-multiple-import + + +IMAGE_SIZE = 28 +# What is the dataset mean and std of the images on the training set +MEANS = [0.49139968, 0.48215841, 0.44653091] +STDS = [0.24703223, 0.24348513, 0.26158784] +PARAMETER_MAX = 10 # What is the max 'level' a transform could be predicted + + +def random_flip(x): + """Flip the input x horizontally with 50% probability.""" + if np.random.rand(1)[0] > 0.5: + return np.fliplr(x) + return x + + +def zero_pad_and_crop(img, amount=4): + """Zero pad by `amount` zero pixels on each side then take a random crop. + + Args: + img: numpy image that will be zero padded and cropped. + amount: amount of zeros to pad `img` with horizontally and verically. + + Returns: + The cropped zero padded img. The returned numpy array will be of the same + shape as `img`. + """ + padded_img = np.zeros((img.shape[0] + amount * 2, img.shape[1] + amount * 2, + img.shape[2])) + padded_img[amount:img.shape[0] + amount, amount: + img.shape[1] + amount, :] = img + top = np.random.randint(low=0, high=2 * amount) + left = np.random.randint(low=0, high=2 * amount) + new_img = padded_img[top:top + img.shape[0], left:left + img.shape[1], :] + return new_img + + +def create_cutout_mask(img_height, img_width, num_channels, size): + """Creates a zero mask used for cutout of shape `img_height` x `img_width`. + + Args: + img_height: Height of image cutout mask will be applied to. + img_width: Width of image cutout mask will be applied to. 
+ num_channels: Number of channels in the image. + size: Size of the zeros mask. + + Returns: + A mask of shape `img_height` x `img_width` with all ones except for a + square of zeros of shape `size` x `size`. This mask is meant to be + elementwise multiplied with the original image. Additionally returns + the `upper_coord` and `lower_coord` which specify where the cutout mask + will be applied. + """ + assert img_height == img_width + + # Sample center where cutout mask will be applied + height_loc = np.random.randint(low=0, high=img_height) + width_loc = np.random.randint(low=0, high=img_width) + + # Determine upper right and lower left corners of patch + upper_coord = (max(0, height_loc - size // 2), max(0, width_loc - size // 2)) + lower_coord = (min(img_height, height_loc + size // 2), + min(img_width, width_loc + size // 2)) + mask_height = lower_coord[0] - upper_coord[0] + mask_width = lower_coord[1] - upper_coord[1] + assert mask_height > 0 + assert mask_width > 0 + + mask = np.ones((img_height, img_width, num_channels)) + zeros = np.zeros((mask_height, mask_width, num_channels)) + mask[upper_coord[0]:lower_coord[0], upper_coord[1]:lower_coord[1], :] = ( + zeros) + return mask, upper_coord, lower_coord + + +def cutout_numpy(img, size=16): + """Apply cutout with mask of shape `size` x `size` to `img`. + + The cutout operation is from the paper https://arxiv.org/abs/1708.04552. + This operation applies a `size`x`size` mask of zeros to a random location + within `img`. + + Args: + img: Numpy image that cutout will be applied to. + size: Height/width of the cutout mask that will be + + Returns: + A numpy tensor that is the result of applying the cutout mask to `img`. + """ + img_height, img_width, num_channels = (img.shape[0], img.shape[1], + img.shape[2]) + assert len(img.shape) == 3 + mask, _, _ = create_cutout_mask(img_height, img_width, num_channels, size) + return img * mask + + +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled + to level/PARAMETER_MAX. + + Returns: + A float that results from scaling `maxval` according to `level`. + """ + return float(level) * maxval / PARAMETER_MAX + + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled + to level/PARAMETER_MAX. + + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / PARAMETER_MAX) + + +def pil_wrap(img): + """Convert the `img` numpy tensor to a PIL Image.""" + return Image.fromarray( + np.uint8((img * STDS + MEANS) * 255.0)).convert('RGBA') + + +def pil_unwrap(pil_img): + """Converts the PIL img to a numpy array.""" + pic_array = (np.array(pil_img.getdata()).reshape((IMAGE_SIZE, IMAGE_SIZE, 4)) / 255.0) + i1, i2 = np.where(pic_array[:, :, 3] == 0) + pic_array = (pic_array[:, :, :3] - MEANS) / STDS + pic_array[i1, i2] = [0, 0, 0] + return pic_array + + +def apply_policy(policy, img): + """Apply the `policy` to the numpy `img`. 
+ + Args: + policy: A list of tuples with the form (name, probability, level) where + `name` is the name of the augmentation operation to apply, `probability` + is the probability of applying the operation and `level` is what strength + the operation to apply. + img: Numpy image that will have `policy` applied to it. + + Returns: + The result of applying `policy` to `img`. + """ + #print('img shape :',img.shape) + #print('Policy len :',len(policy)) + pil_img = pil_wrap(img) + for xform in policy: + #print('xform :', len(xform)) + assert len(xform) == 3 + name, probability, level = xform + #xform_fn = NAME_TO_TRANSFORM[name].pil_transformer(probability, level) + xform_fn = NAME_TO_TRANSFORM[name].pil_transformer(probability.eval(), level) + pil_img = xform_fn(pil_img) + return pil_unwrap(pil_img) + + +class TransformFunction(object): + """Wraps the Transform function for pretty printing options.""" + + def __init__(self, func, name): + self.f = func + self.name = name + + def __repr__(self): + return '<' + self.name + '>' + + def __call__(self, pil_img): + return self.f(pil_img) + + +class TransformT(object): + """Each instance of this class represents a specific transform.""" + + def __init__(self, name, xform_fn): + self.name = name + self.xform = xform_fn + + def pil_transformer(self, probability, level): + + def return_function(im): + if random.random() < probability: + im = self.xform(im, level) + return im + + name = self.name + '({:.1f},{})'.format(probability, level) + return TransformFunction(return_function, name) + + def do_transform(self, image, level): + f = self.pil_transformer(PARAMETER_MAX, level) + return pil_unwrap(f(pil_wrap(image))) + + +################## Transform Functions ################## +identity = TransformT('identity', lambda pil_img, level: pil_img) +flip_lr = TransformT( + 'FlipLR', + lambda pil_img, level: pil_img.transpose(Image.FLIP_LEFT_RIGHT)) +flip_ud = TransformT( + 'FlipUD', + lambda pil_img, level: pil_img.transpose(Image.FLIP_TOP_BOTTOM)) +# pylint:disable=g-long-lambda +auto_contrast = TransformT( + 'AutoContrast', + lambda pil_img, level: ImageOps.autocontrast( + pil_img.convert('RGB')).convert('RGBA')) +equalize = TransformT( + 'Equalize', + lambda pil_img, level: ImageOps.equalize( + pil_img.convert('RGB')).convert('RGBA')) +invert = TransformT( + 'Invert', + lambda pil_img, level: ImageOps.invert( + pil_img.convert('RGB')).convert('RGBA')) +# pylint:enable=g-long-lambda +blur = TransformT( + 'Blur', lambda pil_img, level: pil_img.filter(ImageFilter.BLUR)) +smooth = TransformT( + 'Smooth', + lambda pil_img, level: pil_img.filter(ImageFilter.SMOOTH)) + + +def _rotate_impl(pil_img, level): + """Rotates `pil_img` from -30 to 30 degrees depending on `level`.""" + degrees = int_parameter(level, 30) + if random.random() > 0.5: + degrees = -degrees + return pil_img.rotate(degrees) + + +rotate = TransformT('Rotate', _rotate_impl) + + +def _posterize_impl(pil_img, level): + """Applies PIL Posterize to `pil_img`.""" + level = int_parameter(level, 4) + return ImageOps.posterize(pil_img.convert('RGB'), 4 - level).convert('RGBA') + + +posterize = TransformT('Posterize', _posterize_impl) + + +def _shear_x_impl(pil_img, level): + """Applies PIL ShearX to `pil_img`. + + The ShearX operation shears the image along the horizontal axis with `level` + magnitude. + + Args: + pil_img: Image in PIL object. + level: Strength of the operation specified as an Integer from + [0, `PARAMETER_MAX`]. + + Returns: + A PIL Image that has had ShearX applied to it. 
+ """ + level = float_parameter(level, 0.3) + if random.random() > 0.5: + level = -level + return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, level, 0, 0, 1, 0)) + + +shear_x = TransformT('ShearX', _shear_x_impl) + + +def _shear_y_impl(pil_img, level): + """Applies PIL ShearY to `pil_img`. + + The ShearY operation shears the image along the vertical axis with `level` + magnitude. + + Args: + pil_img: Image in PIL object. + level: Strength of the operation specified as an Integer from + [0, `PARAMETER_MAX`]. + + Returns: + A PIL Image that has had ShearX applied to it. + """ + level = float_parameter(level, 0.3) + if random.random() > 0.5: + level = -level + return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, 0, level, 1, 0)) + + +shear_y = TransformT('ShearY', _shear_y_impl) + + +def _translate_x_impl(pil_img, level): + """Applies PIL TranslateX to `pil_img`. + + Translate the image in the horizontal direction by `level` + number of pixels. + + Args: + pil_img: Image in PIL object. + level: Strength of the operation specified as an Integer from + [0, `PARAMETER_MAX`]. + + Returns: + A PIL Image that has had TranslateX applied to it. + """ + level = int_parameter(level, 10) + if random.random() > 0.5: + level = -level + return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, level, 0, 1, 0)) + + +translate_x = TransformT('TranslateX', _translate_x_impl) + + +def _translate_y_impl(pil_img, level): + """Applies PIL TranslateY to `pil_img`. + + Translate the image in the vertical direction by `level` + number of pixels. + + Args: + pil_img: Image in PIL object. + level: Strength of the operation specified as an Integer from + [0, `PARAMETER_MAX`]. + + Returns: + A PIL Image that has had TranslateY applied to it. + """ + level = int_parameter(level, 10) + if random.random() > 0.5: + level = -level + return pil_img.transform((IMAGE_SIZE, IMAGE_SIZE), Image.AFFINE, (1, 0, 0, 0, 1, level)) + + +translate_y = TransformT('TranslateY', _translate_y_impl) + + +def _crop_impl(pil_img, level, interpolation=Image.BILINEAR): + """Applies a crop to `pil_img` with the size depending on the `level`.""" + cropped = pil_img.crop((level, level, IMAGE_SIZE - level, IMAGE_SIZE - level)) + resized = cropped.resize((IMAGE_SIZE, IMAGE_SIZE), interpolation) + return resized + + +crop_bilinear = TransformT('CropBilinear', _crop_impl) + + +def _solarize_impl(pil_img, level): + """Applies PIL Solarize to `pil_img`. + + Translate the image in the vertical direction by `level` + number of pixels. + + Args: + pil_img: Image in PIL object. + level: Strength of the operation specified as an Integer from + [0, `PARAMETER_MAX`]. + + Returns: + A PIL Image that has had Solarize applied to it. 
+ """ + level = int_parameter(level, 256) + return ImageOps.solarize(pil_img.convert('RGB'), 256 - level).convert('RGBA') + + +solarize = TransformT('Solarize', _solarize_impl) + + +def _cutout_pil_impl(pil_img, level): + """Apply cutout to pil_img at the specified level.""" + size = int_parameter(level, 20) + if size <= 0: + return pil_img + img_height, img_width, num_channels = (IMAGE_SIZE, IMAGE_SIZE, 3) + _, upper_coord, lower_coord = ( + create_cutout_mask(img_height, img_width, num_channels, size)) + pixels = pil_img.load() # create the pixel map + for i in range(upper_coord[0], lower_coord[0]): # for every col: + for j in range(upper_coord[1], lower_coord[1]): # For every row + pixels[i, j] = (125, 122, 113, 0) # set the colour accordingly + return pil_img + +cutout = TransformT('Cutout', _cutout_pil_impl) + + +def _enhancer_impl(enhancer): + """Sets level to be between 0.1 and 1.8 for ImageEnhance transforms of PIL.""" + def impl(pil_img, level): + v = float_parameter(level, 1.8) + .1 # going to 0 just destroys it + return enhancer(pil_img).enhance(v) + return impl + + +color = TransformT('Color', _enhancer_impl(ImageEnhance.Color)) +contrast = TransformT('Contrast', _enhancer_impl(ImageEnhance.Contrast)) +brightness = TransformT('Brightness', _enhancer_impl( + ImageEnhance.Brightness)) +sharpness = TransformT('Sharpness', _enhancer_impl(ImageEnhance.Sharpness)) + +ALL_TRANSFORMS = [ + flip_lr, + flip_ud, + auto_contrast, + equalize, + invert, + rotate, + posterize, + crop_bilinear, + solarize, + color, + contrast, + brightness, + sharpness, + shear_x, + shear_y, + translate_x, + translate_y, + cutout, + blur, + smooth +] + +NAME_TO_TRANSFORM = {t.name: t for t in ALL_TRANSFORMS} +TRANSFORM_NAMES = NAME_TO_TRANSFORM.keys() diff --git a/FAR-HO/blue_utils.py b/FAR-HO/blue_utils.py new file mode 100644 index 0000000..59ee62e --- /dev/null +++ b/FAR-HO/blue_utils.py @@ -0,0 +1,131 @@ +import matplotlib.pyplot as plt +from far_ho.examples.datasets import Datasets, Dataset + +import os +import numpy as np +import tensorflow as tf + +import augmentation_transforms as augmentation_transforms ##### ATTENTION FICHIER EN DOUBLE => A REGLER MIEUX #### + +def viz_data(dataset, fig_name='data_sample',aug_policy=None): + + plt.figure(figsize=(10,10)) + for i in range(25): + plt.subplot(5,5,i+1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + + img = dataset.data[i][:,:,0] + if aug_policy : + img = augment_img(img,aug_policy) + #print('im shape',img.shape) + plt.imshow(img, cmap=plt.cm.binary) + plt.xlabel(np.nonzero(dataset.target[i])[0].item()) + + plt.savefig(fig_name) + +def augment_img(data, policy): + + #print('Im shape',data.shape) + data = np.stack((data,)*3, axis=-1) #BOF BOF juste pour forcer 3 channels + #print('Im shape',data.shape) + final_img = augmentation_transforms.apply_policy(policy, data) + #final_img = augmentation_transforms.random_flip(augmentation_transforms.zero_pad_and_crop(final_img, 4)) + # Apply cutout + #final_img = augmentation_transforms.cutout_numpy(final_img) + + im_rgb = np.array(final_img, np.float32) + im_gray = np.dot(im_rgb[...,:3], [0.2989, 0.5870, 0.1140]) #Just pour retourner a 1 channel + + return im_gray + + +### https://www.kaggle.com/raoulma/mnist-image-class-tensorflow-cnn-99-51-test-acc#5.-Build-the-neural-network-with-tensorflow- +## build the neural network class +# weight initialization +def weight_variable(shape, name = None): + initial = tf.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial, name = name) + +# bias 
initialization +def bias_variable(shape, name = None): + initial = tf.constant(0.1, shape=shape) # positive bias + return tf.Variable(initial, name = name) + +# 2D convolution +def conv2d(x, W, name = None): + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', name = name) + +# max pooling +def max_pool_2x2(x, name = None): + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], + padding='SAME', name = name) + +def cnn(x_data_tf,y_data_tf, name='model'): + # tunable hyperparameters for nn architecture + s_f_conv1 = 3; # filter size of first convolution layer (default = 3) + n_f_conv1 = 36; # number of features of first convolution layer (default = 36) + s_f_conv2 = 3; # filter size of second convolution layer (default = 3) + n_f_conv2 = 36; # number of features of second convolution layer (default = 36) + s_f_conv3 = 3; # filter size of third convolution layer (default = 3) + n_f_conv3 = 36; # number of features of third convolution layer (default = 36) + n_n_fc1 = 576; # number of neurons of first fully connected layer (default = 576) + + # 1.layer: convolution + max pooling + W_conv1_tf = weight_variable([s_f_conv1, s_f_conv1, 1, n_f_conv1], name = 'W_conv1_tf') # (5,5,1,32) + b_conv1_tf = bias_variable([n_f_conv1], name = 'b_conv1_tf') # (32) + h_conv1_tf = tf.nn.relu(conv2d(x_data_tf, + W_conv1_tf) + b_conv1_tf, + name = 'h_conv1_tf') # (.,28,28,32) + h_pool1_tf = max_pool_2x2(h_conv1_tf, + name = 'h_pool1_tf') # (.,14,14,32) + + # 2.layer: convolution + max pooling + W_conv2_tf = weight_variable([s_f_conv2, s_f_conv2, + n_f_conv1, n_f_conv2], + name = 'W_conv2_tf') + b_conv2_tf = bias_variable([n_f_conv2], name = 'b_conv2_tf') + h_conv2_tf = tf.nn.relu(conv2d(h_pool1_tf, + W_conv2_tf) + b_conv2_tf, + name ='h_conv2_tf') #(.,14,14,32) + h_pool2_tf = max_pool_2x2(h_conv2_tf, name = 'h_pool2_tf') #(.,7,7,32) + + # 3.layer: convolution + max pooling + W_conv3_tf = weight_variable([s_f_conv3, s_f_conv3, + n_f_conv2, n_f_conv3], + name = 'W_conv3_tf') + b_conv3_tf = bias_variable([n_f_conv3], name = 'b_conv3_tf') + h_conv3_tf = tf.nn.relu(conv2d(h_pool2_tf, + W_conv3_tf) + b_conv3_tf, + name = 'h_conv3_tf') #(.,7,7,32) + h_pool3_tf = max_pool_2x2(h_conv3_tf, + name = 'h_pool3_tf') # (.,4,4,32) + + # 4.layer: fully connected + W_fc1_tf = weight_variable([4*4*n_f_conv3,n_n_fc1], + name = 'W_fc1_tf') # (4*4*32, 1024) + b_fc1_tf = bias_variable([n_n_fc1], name = 'b_fc1_tf') # (1024) + h_pool3_flat_tf = tf.reshape(h_pool3_tf, [-1,4*4*n_f_conv3], + name = 'h_pool3_flat_tf') # (.,1024) + h_fc1_tf = tf.nn.relu(tf.matmul(h_pool3_flat_tf, + W_fc1_tf) + b_fc1_tf, + name = 'h_fc1_tf') # (.,1024) + + # add dropout + #keep_prob_tf = tf.placeholder(dtype=tf.float32, name = 'keep_prob_tf') + #h_fc1_drop_tf = tf.nn.dropout(h_fc1_tf, keep_prob_tf, name = 'h_fc1_drop_tf') + + # 5.layer: fully connected + W_fc2_tf = weight_variable([n_n_fc1, 10], name = 'W_fc2_tf') + b_fc2_tf = bias_variable([10], name = 'b_fc2_tf') + z_pred_tf = tf.add(tf.matmul(h_fc1_tf, W_fc2_tf), + b_fc2_tf, name = 'z_pred_tf')# => (.,10) + # predicted probabilities in one-hot encoding + y_pred_proba_tf = tf.nn.softmax(z_pred_tf, name='y_pred_proba_tf') + + # tensor of correct predictions + y_pred_correct_tf = tf.equal(tf.argmax(y_pred_proba_tf, 1), + tf.argmax(y_data_tf, 1), + name = 'y_pred_correct_tf') + return y_pred_proba_tf \ No newline at end of file diff --git a/FAR-HO/far_pba_cifar.py b/FAR-HO/far_pba_cifar.py new file mode 100644 index 0000000..60dc509 --- /dev/null +++ b/FAR-HO/far_pba_cifar.py @@ -0,0 
+1,166 @@ +#https://github.com/arcelien/pba/blob/master/autoaugment/train_cifar.py +from __future__ import absolute_import, print_function, division + +import os +import numpy as np +import tensorflow as tf +#import tensorflow.contrib.layers as layers +import far_ho as far +import far_ho.examples as far_ex +#import pprint + +import autoaugment.augmentation_transforms as augmentation_transforms +#import autoaugment.policies as found_policies +from autoaugment.wrn import build_wrn_model + + +def build_model(inputs, num_classes, is_training, hparams): + """Constructs the vision model being trained/evaled. + Args: + inputs: input features/images being fed to the image model build built. + num_classes: number of output classes being predicted. + is_training: is the model training or not. + hparams: additional hyperparameters associated with the image model. + Returns: + The logits of the image model. + """ + scopes = setup_arg_scopes(is_training) + with contextlib.nested(*scopes): + if hparams.model_name == 'pyramid_net': + logits = build_shake_drop_model( + inputs, num_classes, is_training) + elif hparams.model_name == 'wrn': + logits = build_wrn_model( + inputs, num_classes, hparams.wrn_size) + elif hparams.model_name == 'shake_shake': + logits = build_shake_shake_model( + inputs, num_classes, hparams, is_training) + return logits + + +class CifarModel(object): + """Builds an image model for Cifar10/Cifar100.""" + + def __init__(self, hparams): + self.hparams = hparams + + def build(self, mode): + """Construct the cifar model.""" + assert mode in ['train', 'eval'] + self.mode = mode + self._setup_misc(mode) + self._setup_images_and_labels() + self._build_graph(self.images, self.labels, mode) + + self.init = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + + def _setup_misc(self, mode): + """Sets up miscellaneous in the cifar model constructor.""" + self.lr_rate_ph = tf.Variable(0.0, name='lrn_rate', trainable=False) + self.reuse = None if (mode == 'train') else True + self.batch_size = self.hparams.batch_size + if mode == 'eval': + self.batch_size = 25 + + def _setup_images_and_labels(self): + """Sets up image and label placeholders for the cifar model.""" + if FLAGS.dataset == 'cifar10': + self.num_classes = 10 + else: + self.num_classes = 100 + self.images = tf.placeholder(tf.float32, [self.batch_size, 32, 32, 3]) + self.labels = tf.placeholder(tf.float32, + [self.batch_size, self.num_classes]) + + def assign_epoch(self, session, epoch_value): + session.run(self._epoch_update, feed_dict={self._new_epoch: epoch_value}) + + def _build_graph(self, images, labels, mode): + """Constructs the TF graph for the cifar model. + Args: + images: A 4-D image Tensor + labels: A 2-D labels Tensor. + mode: string indicating training mode ( e.g., 'train', 'valid', 'test'). 
+ """ + is_training = 'train' in mode + if is_training: + self.global_step = tf.train.get_or_create_global_step() + + logits = build_model( + images, + self.num_classes, + is_training, + self.hparams) + self.predictions, self.cost = helper_utils.setup_loss( + logits, labels) + self.accuracy, self.eval_op = tf.metrics.accuracy( + tf.argmax(labels, 1), tf.argmax(self.predictions, 1)) + self._calc_num_trainable_params() + + # Adds L2 weight decay to the cost + self.cost = helper_utils.decay_weights(self.cost, + self.hparams.weight_decay_rate) + #### Attention: differe implem originale + + self.init = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + + +######################################################## + +######## PBA ############ + +#Parallele Cifar model trainer +tf.flags.DEFINE_string('model_name', 'wrn', + 'wrn, shake_shake_32, shake_shake_96, shake_shake_112, ' + 'pyramid_net') +tf.flags.DEFINE_string('checkpoint_dir', '/tmp/training', 'Training Directory.') +tf.flags.DEFINE_string('data_path', '/tmp/data', + 'Directory where dataset is located.') +tf.flags.DEFINE_string('dataset', 'cifar10', + 'Dataset to train with. Either cifar10 or cifar100') +tf.flags.DEFINE_integer('use_cpu', 1, '1 if use CPU, else GPU.') +## ??? + +FLAGS = tf.flags.FLAGS +FLAGS.dataset +FLAGS.data_path +FLAGS.model_name = 'wrn' + +hparams = tf.contrib.training.HParams( + train_size=50000, + validation_size=0, + eval_test=1, + dataset=FLAGS.dataset, + data_path=FLAGS.data_path, + batch_size=128, + gradient_clipping_by_global_norm=5.0) + if FLAGS.model_name == 'wrn': + hparams.add_hparam('model_name', 'wrn') + hparams.add_hparam('num_epochs', 200) + hparams.add_hparam('wrn_size', 160) + hparams.add_hparam('lr', 0.1) + hparams.add_hparam('weight_decay_rate', 5e-4) + +data_loader = data_utils.DataSet(hparams) +data_loader.reset() + +with tf.Graph().as_default(): #, tf.device('/cpu:0' if FLAGS.use_cpu else '/gpu:0'): +"""Builds the image models for train and eval.""" + # Determine if we should build the train and eval model. When using + # distributed training we only want to build one or the other and not both. 
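# NOTE: this adaptation of the PBA/AutoAugment trainer appears unfinished as committed:
# the "Builds the image models..." docstring above is not indented inside the
# `with tf.Graph().as_default()` block, `CifarModel(self.hparams)` just below references
# `self` at module scope (presumably the module-level `hparams` was intended), and the
# FAR-HO hyper-iteration loop at the end of this file has an empty body, so the script
# will not run as-is.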
+ with tf.variable_scope('model', use_resource=False): + m = CifarModel(self.hparams) + m.build('train') + #self._num_trainable_params = m.num_trainable_params + #self._saver = m.saver + #with tf.variable_scope('model', reuse=True, use_resource=False): + # meval = CifarModel(self.hparams) + # meval.build('eval') + + +##### FAR-HO #### +for _ in range(n_hyper_iterations): + + diff --git a/FAR-HO/test.py b/FAR-HO/test.py new file mode 100644 index 0000000..3364c00 --- /dev/null +++ b/FAR-HO/test.py @@ -0,0 +1,92 @@ +import os +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers +import far_ho as far +import far_ho.examples as far_ex +import matplotlib.pyplot as plt + +sess = tf.InteractiveSession() + + +def get_data(): + # load a small portion of mnist data + datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=(.1, .1,)) + return datasets.train, datasets.validation + + +def g_logits(x,y): + with tf.variable_scope('model'): + h1 = layers.fully_connected(x, 300) + logits = layers.fully_connected(h1, int(y.shape[1])) + return logits + + +x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x') +y = tf.placeholder(tf.float32, shape=(None, 10), name='y') +logits = g_logits(x,y) +train_set, validation_set = get_data() + +lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples)) +lr = far.get_hyperparameter('lr', initializer=0.01) + +ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits) +L = tf.reduce_mean(tf.sigmoid(lambdas)*ce) +E = tf.reduce_mean(ce) + +accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32)) + +inner_optimizer = far.GradientDescentOptimizer(lr) +outer_optimizer = tf.train.AdamOptimizer() +rev_it =10 +hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it) +hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer) + +T = 20 # Number of inner iterations +train_set_supplier = train_set.create_supplier(x, y) +validation_set_supplier = validation_set.create_supplier(x, y) +tf.global_variables_initializer().run() + +print('inner:', L.eval(train_set_supplier())) +print('outer:', E.eval(validation_set_supplier())) +# print('-'*50) +n_hyper_iterations = 200 +inner_losses = [] +outer_losses = [] +train_accs = [] +val_accs = [] + +for _ in range(n_hyper_iterations): + hyper_step(T, + inner_objective_feed_dicts=train_set_supplier, + outer_objective_feed_dicts=validation_set_supplier) + + inner_obj = L.eval(train_set_supplier()) + outer_obj = E.eval(validation_set_supplier()) + inner_losses.append(inner_obj) + outer_losses.append(outer_obj) + print('inner:', inner_obj) + print('outer:', outer_obj) + + train_acc = accuracy.eval(train_set_supplier()) + val_acc = accuracy.eval(validation_set_supplier()) + train_accs.append(train_acc) + val_accs.append(val_acc) + print('training accuracy', train_acc) + print('validation accuracy', val_acc) + + print('learning rate', lr.eval()) + print('norm of examples weight', tf.norm(lambdas).eval()) + print('-'*50) + +plt.subplot(211) +plt.plot(inner_losses, label='training loss') +plt.plot(outer_losses, label='validation loss') +plt.legend(loc=0, frameon=True) +#plt.xlim(0, 19) +plt.subplot(212) +plt.plot(train_accs, label='training accuracy') +plt.plot(val_accs, label='validation accuracy') +plt.legend(loc=0, frameon=True) + +plt.savefig('H%d - I%d - R%d'%(n_hyper_iterations,T,rev_it)) diff --git a/FAR-HO/test_cnn.py b/FAR-HO/test_cnn.py new file mode 
100644 index 0000000..ffbcb8d --- /dev/null +++ b/FAR-HO/test_cnn.py @@ -0,0 +1,126 @@ +import warnings +warnings.filterwarnings("ignore") + +import os +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers +import far_ho as far +import far_ho.examples as far_ex + +tf.logging.set_verbosity(tf.logging.ERROR) + +import matplotlib.pyplot as plt +import blue_utils as butil + +#Reset +try: + sess.close() +except: pass +rnd = np.random.RandomState(1) +tf.reset_default_graph() +sess = tf.InteractiveSession() + +def get_data(data_split): + # load a small portion of mnist data + datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=False) + print("Data shape : ", datasets.train.dim_data, "/ Label shape : ", datasets.train.dim_target) + [print("Nb samples : ", d.num_examples) for d in datasets] + return datasets.train, datasets.validation, datasets.test + +#Model +# FC : reshape = True +def g_logits(x,y, name='model'): + with tf.variable_scope(name): + h1 = layers.fully_connected(x, 300) + logits = layers.fully_connected(h1, int(y.shape[1])) + return logits + +#### Hyper-parametres #### +n_hyper_iterations = 500 +T = 20 # Number of inner iterations +rev_it =10 +hp_lr = 1.e-3 +########################## + +#MNIST +#x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x') +#y = tf.placeholder(tf.float32, shape=(None, 10), name='y') +#logits = g_logits(x, y) + +#CNN : reshape = False +x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x') +y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y') + +logits = butil.cnn(x,y) + +train_set, validation_set, test_set = get_data(data_split=(.05, .05,)) + +butil.viz_data(train_set) +print('Data sampled !') + +# lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples)) +#lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, .1), 1.e-7)) +#mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, .99), 1.e-5)) +#rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.01), 0.)) +lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, 1e-4), 1e-4)) +mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, 0.9), 0.9)) +rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.00001), 0.00001)) + +ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits) +L = tf.reduce_mean(ce) + rho*tf.add_n([tf.reduce_sum(w**2) for w in tf.trainable_variables()]) #Retirer la seconde partie de la loss quand HP inutiles +E = tf.reduce_mean(ce) + +accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32)) + +inner_optimizer = far.MomentumOptimizer(lr, mu) +outer_optimizer = tf.train.AdamOptimizer(hp_lr) +hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it) +hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer) + +train_set_supplier = train_set.create_supplier(x, y, batch_size=256) # stochastic GD +validation_set_supplier = validation_set.create_supplier(x, y) + +his_params = [] + +tf.global_variables_initializer().run() + +for hyt in range(n_hyper_iterations): + hyper_step(T, + inner_objective_feed_dicts=train_set_supplier, + outer_objective_feed_dicts=validation_set_supplier) + res = 
sess.run(far.hyperparameters()) + [L.eval(train_set_supplier()), + E.eval(validation_set_supplier()), + accuracy.eval(train_set_supplier()), + accuracy.eval(validation_set_supplier())] + his_params.append(res) + + print('Hyper-it :',hyt,'/',n_hyper_iterations) + print('inner:', L.eval(train_set_supplier())) + print('outer:', E.eval(validation_set_supplier())) + print('training accuracy:', res[5]) + print('validation accuracy:', res[6]) + #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval()) + print('-'*50) + +test_set_supplier = test_set.create_supplier(x, y) +print('Test accuracy:',accuracy.eval(test_set_supplier())) + +fig, ax = plt.subplots(ncols=4, figsize=(15, 3)) +ax[0].set_title('Learning rate') +ax[0].plot([e[0] for e in his_params]) + +ax[1].set_title('Momentum factor') +ax[1].plot([e[1] for e in his_params]) + +#ax[2].set_title('L2 regulariz.') +#ax[2].plot([e[2] for e in his_params]) +ax[2].set_title('Tr. and val. acc') +ax[2].plot([e[5] for e in his_params]) +ax[2].plot([e[6] for e in his_params]) + +ax[3].set_title('Tr. and val. errors') +ax[3].plot([e[3] for e in his_params]) +ax[3].plot([e[4] for e in his_params]) + +plt.savefig('res_cnn_H{}_I{}'.format(n_hyper_iterations,T)) diff --git a/FAR-HO/test_cnn_aug.py b/FAR-HO/test_cnn_aug.py new file mode 100644 index 0000000..db48936 --- /dev/null +++ b/FAR-HO/test_cnn_aug.py @@ -0,0 +1,141 @@ +import warnings +warnings.filterwarnings("ignore") + +import os +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers +import far_ho as far +import far_ho.examples as far_ex + +tf.logging.set_verbosity(tf.logging.ERROR) + +import matplotlib.pyplot as plt +import blue_utils as butil + +#Reset +try: + sess.close() +except: pass +rnd = np.random.RandomState(1) +tf.reset_default_graph() +sess = tf.InteractiveSession() + +def get_data(data_split): + # load a small portion of mnist data + datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=False) + print("Data shape : ", datasets.train.dim_data, "/ Label shape : ", datasets.train.dim_target) + [print("Nb samples : ", d.num_examples) for d in datasets] + return datasets.train, datasets.validation, datasets.test + +#Model +# FC : reshape = True +def g_logits(x,y, name='model'): + with tf.variable_scope(name): + h1 = layers.fully_connected(x, 300) + logits = layers.fully_connected(h1, int(y.shape[1])) + return logits + +#### Hyper-parametres #### +n_hyper_iterations = 10 +T = 10 # Number of inner iterations +rev_it =10 +hp_lr = 0.02 +########################## + +#MNIST +#x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x') +#y = tf.placeholder(tf.float32, shape=(None, 10), name='y') +#logits = g_logits(x, y) + +#CNN : reshape = False +x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x') +y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y') + +logits = butil.cnn(x,y) + +train_set, validation_set, test_set = get_data(data_split=(.1, .1,)) + +probX = far.get_hyperparameter('probX', initializer=0.1, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 0.9)) +probY = far.get_hyperparameter('probY', initializer=0.1, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 0.9)) + +#lr = far.get_hyperparameter('lr', initializer=1e-4, constraint=lambda t: tf.maximum(tf.minimum(t, 1e-4), 1e-4)) +#mu = far.get_hyperparameter('mu', initializer=0.9, constraint=lambda t: tf.maximum(tf.minimum(t, 0.9), 0.9)) + +#probX, probY = 0.5, 0.5 +#policy = 
[('TranslateX', probX, 8), ('TranslateY', probY, 8)] +policy = [('TranslateX', probX, 8), ('FlipUD', probY, 8)] +print('Hyp :',far.utils.hyperparameters(scope=None)) + +#butil.viz_data(train_set, aug_policy= policy) +#print('Data sampled !') + +#Ajout artificiel des transfo a la loss juste pour qu il soit compter dans la dynamique du graph +probX_loss = tf.sigmoid(probX) +probY_loss = tf.sigmoid(probY) + +ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits) +L = tf.reduce_mean(probX_loss*probY_loss*ce) +E = tf.reduce_mean(ce) + +accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32)) + +inner_optimizer = far.AdamOptimizer() +outer_optimizer = tf.train.AdamOptimizer(hp_lr) +hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it) +hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer) + +train_set_supplier = train_set.create_supplier(x, y, batch_size=256, aug_policy=policy) # stochastic GD +validation_set_supplier = validation_set.create_supplier(x, y) + +#print(train_set.dim_data,validation_set.dim_data) + +his_params = [] + +tf.global_variables_initializer().run() + +butil.viz_data(train_set, fig_name= 'Start_sample',aug_policy= policy) +print('Data sampled !') + +for hyt in range(n_hyper_iterations): + hyper_step(T, + inner_objective_feed_dicts=train_set_supplier, + outer_objective_feed_dicts=validation_set_supplier, + _skip_hyper_ts=True) + res = sess.run(far.hyperparameters()) + [L.eval(train_set_supplier()), + E.eval(validation_set_supplier()), + accuracy.eval(train_set_supplier()), + accuracy.eval(validation_set_supplier())] + his_params.append(res) + + butil.viz_data(train_set, fig_name= 'Train_sample_{}'.format(hyt),aug_policy= policy) + print('Data sampled !') + + print('Hyper-it :',hyt,'/',n_hyper_iterations) + print('inner:', L.eval(train_set_supplier())) + print('outer:', E.eval(validation_set_supplier())) + print('training accuracy:', res[4]) + print('validation accuracy:', res[5]) + print('Transformation : ProbX -',res[0],'/ProbY -',res[1]) + #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval()) + print('-'*50) + +test_set_supplier = test_set.create_supplier(x, y) +print('Test accuracy:',accuracy.eval(test_set_supplier())) + +fig, ax = plt.subplots(ncols=4, figsize=(15, 3)) +ax[0].set_title('ProbX') +ax[0].plot([e[0] for e in his_params]) + +ax[1].set_title('ProbY') +ax[1].plot([e[1] for e in his_params]) + +ax[2].set_title('Tr. and val. errors') +ax[2].plot([e[2] for e in his_params]) +ax[2].plot([e[3] for e in his_params]) + +ax[3].set_title('Tr. and val. 
acc') +ax[3].plot([e[4] for e in his_params]) +ax[3].plot([e[5] for e in his_params]) + +plt.savefig('res_cnn_aug_H{}_I{}'.format(n_hyper_iterations,T)) diff --git a/FAR-HO/test_fc.py b/FAR-HO/test_fc.py new file mode 100644 index 0000000..24eb596 --- /dev/null +++ b/FAR-HO/test_fc.py @@ -0,0 +1,133 @@ +#https://github.com/lucfra/FAR-HO/blob/master/far_ho/examples/autoMLDemos/Far-HO%20Demo%2C%20AutoML%202018%2C%20ICML%20workshop.ipynb +import warnings +warnings.filterwarnings("ignore") + +import os +import numpy as np +import tensorflow as tf +import tensorflow.contrib.layers as layers +import far_ho as far +import far_ho.examples as far_ex + +tf.logging.set_verbosity(tf.logging.ERROR) + +import matplotlib.pyplot as plt +#import blue_utils as butil + +#Reset +try: + sess.close() +except: pass +rnd = np.random.RandomState(1) +tf.reset_default_graph() +sess = tf.InteractiveSession() + +def get_data(data_split): + # load a small portion of mnist data + datasets = far_ex.mnist(data_root_folder=os.path.join(os.getcwd(), 'MNIST_DATA'), partitions=data_split, reshape=True) + print("Data shape : ", datasets.train.dim_data, " / Label shape : ", datasets.train.dim_target) + [print("Nb samples : ", d.num_examples) for d in datasets] + return datasets.train, datasets.validation, datasets.test + +#Model +# FC : reshape = True +def g_logits(x,y, name='model'): + with tf.variable_scope(name): + h1 = layers.fully_connected(x, 300) + logits = layers.fully_connected(h1, int(y.shape[1])) + return logits + +#### Hyper-parametres #### +n_hyper_iterations = 90 +T = 20 # Number of inner iterations +rev_it =10 +hp_lr = 0.1 +epochs =10 +batch_size = 256 +########################## + +#MNIST +x = tf.placeholder(tf.float32, shape=(None, 28**2), name='x') +y = tf.placeholder(tf.float32, shape=(None, 10), name='y') +logits = g_logits(x, y) + +#CNN : reshape = False +#x = tf.placeholder(dtype=tf.float32, shape=[None,28,28,1], name='x') +#y = tf.placeholder(dtype=tf.float32, shape=[None,10], name='y') + +#logits = butil.cnn(x,y) + +train_set, validation_set, test_set = get_data(data_split=(.6, .3,)) + +#butil.viz_data(train_set) + +# lambdas = far.get_hyperparameter('lambdas', tf.zeros(train_set.num_examples)) +lr = far.get_hyperparameter('lr', initializer=1e-2, constraint=lambda t: tf.maximum(tf.minimum(t, 0.1), 1.e-7)) +mu = far.get_hyperparameter('mu', initializer=0.95, constraint=lambda t: tf.maximum(tf.minimum(t, .99), 1.e-5)) +#rho = far.get_hyperparameter('rho', initializer=0.00001, constraint=lambda t: tf.maximum(tf.minimum(t, 0.01), 0.)) + + +ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits) +L = tf.reduce_mean(ce) #+ rho*tf.add_n([tf.reduce_sum(w**2) for w in tf.trainable_variables()]) #Retirer la seconde partie de la loss quand HP inutiles +E = tf.reduce_mean(ce) + +accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(logits, 1)), tf.float32)) + +inner_optimizer = far.MomentumOptimizer(lr, mu) +#inner_optimizer = far.GradientDescentOptimizer(lr) +outer_optimizer = tf.train.AdamOptimizer(hp_lr) +hyper_method = far.ReverseHG().truncated(reverse_iterations=rev_it) +hyper_step = far.HyperOptimizer(hyper_method).minimize(E, outer_optimizer, L, inner_optimizer)#, global_step=tf.train.get_or_create_step()) + +train_set_supplier = train_set.create_supplier(x, y, batch_size=batch_size)#, epochs=1) # stochastic GD +validation_set_supplier = validation_set.create_supplier(x, y) + + +print('Hyper iterations par epochs',int(train_set.num_examples/batch_size*epochs/T)) + +his_params = 
[] + +tf.global_variables_initializer().run() + +for hyt in range(n_hyper_iterations): + hyper_step(T, + inner_objective_feed_dicts=train_set_supplier, + outer_objective_feed_dicts=validation_set_supplier, + _skip_hyper_ts=False) + res = sess.run(far.hyperparameters()) + [0, L.eval(train_set_supplier()), + E.eval(validation_set_supplier()), + accuracy.eval(train_set_supplier()), + accuracy.eval(validation_set_supplier())] + + his_params.append(res) + + print('Hyper-it :',hyt,'/',n_hyper_iterations) + print('inner:', res[3]) + print('outer:', res[4]) + print('training accuracy:', res[5]) + print('validation accuracy:', res[6]) + #print('learning rate', lr.eval(), 'momentum', mu.eval(), 'l2 coefficient', rho.eval()) + print('-'*50) + +test_set_supplier = test_set.create_supplier(x, y) +print('Test accuracy:',accuracy.eval(test_set_supplier())) + +fig, ax = plt.subplots(ncols=4, figsize=(15, 3)) +ax[0].set_title('Learning rate') +ax[0].plot([e[0] for e in his_params]) + +ax[1].set_title('Momentum factor') +ax[1].plot([e[1] for e in his_params]) + +#ax[2].set_title('L2 regulariz.') +#ax[2].plot([e[2] for e in his_params]) +ax[2].set_title('Tr. and val. acc') +ax[2].plot([e[5] for e in his_params]) +ax[2].plot([e[6] for e in his_params]) + +ax[3].set_title('Tr. and val. errors') +ax[3].plot([e[3] for e in his_params]) +ax[3].plot([e[4] for e in his_params]) + +plt.savefig('resultats/res_fc_H{}_I{}'.format(n_hyper_iterations,T)) +#plt.savefig('resultats/res_fc_H{}_I{}_noHyp'.format(n_hyper_iterations,T)) diff --git a/Gradient-Descent-The-Ultimate-Optimizer/.gitignore b/Gradient-Descent-The-Ultimate-Optimizer/.gitignore new file mode 100644 index 0000000..8c17325 --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/.gitignore @@ -0,0 +1,5 @@ +venv/ +__pycache__ +data/ +log/ +.vscode/ diff --git a/Gradient-Descent-The-Ultimate-Optimizer/20190929-paper.pdf b/Gradient-Descent-The-Ultimate-Optimizer/20190929-paper.pdf new file mode 100644 index 0000000..4f0b65a Binary files /dev/null and b/Gradient-Descent-The-Ultimate-Optimizer/20190929-paper.pdf differ diff --git a/Gradient-Descent-The-Ultimate-Optimizer/README.md b/Gradient-Descent-The-Ultimate-Optimizer/README.md new file mode 100644 index 0000000..cfa0e6f --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/README.md @@ -0,0 +1,33 @@ +# Gradient Descent: The Ultimate Optimizer + +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) + +| ⚠️ WARNING: THIS IS NOT MY WORK ⚠️ | +| --- | + +This repository contains the paper and code to the paper [Gradient Descent: +The Ultimate Optimizer](https://arxiv.org/abs/1909.13371). + +I couldn't find the code (which is found in the appendix at the end of the +paper) anywhere on the web. What I present here is the code of the paper with +instructions on how to set it up. + +Getting the code in a runnable state required some fixes on my part so the +code might be slightly different than that presented in the paper. 
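For readers who have not yet opened the paper, the core idea ("hyperoptimization") is to keep the parameter update itself inside the autograd graph, so that the training loss can also be differentiated with respect to the step size. The snippet below is a minimal, illustrative sketch of that mechanism in plain PyTorch; the variable names and the toy quadratic loss are illustrative only, not the repository's API:

```python
import torch

w = torch.randn(3, requires_grad=True)       # model parameters
lr = torch.tensor(0.01, requires_grad=True)  # learning rate, learned as well

def loss_fn(p):
    return (p ** 2).sum()                    # stand-in loss

loss = loss_fn(w)
(g_w,) = torch.autograd.grad(loss, w, create_graph=True)
w_next = w - lr * g_w                                # differentiable update step
(g_lr,) = torch.autograd.grad(loss_fn(w_next), lr)   # the post-step loss depends on lr
with torch.no_grad():
    lr -= 1e-4 * g_lr                                # hyper-step on the learning rate
```

The `hyperopt.py` file alongside this README wraps this kind of bookkeeping (detaching, re-enabling gradients, stacking optimizers on top of optimizers) behind its `Optimizable` interface.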
+ +## Set up + +```sh +git clone https://github.com/Rainymood/Gradient-Descent-The-Ultimate-Optimizer +cd Gradient-Descent-The-Ultimate-Optimizer +virtualenv -p python3 venv +source venv/bin/activate +pip install -r requirements.txt +python main.py +``` + +When you are done you can exit the virtualenv with + +```shell +deactivate +``` diff --git a/Gradient-Descent-The-Ultimate-Optimizer/data_aug.py b/Gradient-Descent-The-Ultimate-Optimizer/data_aug.py new file mode 100644 index 0000000..a18ddf0 --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/data_aug.py @@ -0,0 +1,244 @@ +from hyperopt import * +#from hyperopt_v2 import * + +import torchvision.transforms.functional as TF +import torchvision.transforms as T + +#from scipy import ndimage +import kornia + +import random + + +class MNIST_FullyConnected_Augmented(Optimizable): + """ + A fully-connected NN for the MNIST task. This is Optimizable but not itself + an optimizer. + """ + + def __init__(self, num_inp, num_hid, num_out, optimizer, device = torch.device('cuda')): + self.device = device + #print(self.device) + parameters = { + "w1": torch.zeros(num_inp, num_hid, device=self.device).t(), + "b1": torch.zeros(num_hid, device=self.device).t(), + "w2": torch.zeros(num_hid, num_out, device=self.device).t(), + "b2": torch.zeros(num_out, device=self.device).t(), + + #Data augmentation + "prob": torch.tensor(0.5, device=self.device), + "mag": torch.tensor(180.0, device=self.device), + } + super().__init__(parameters, optimizer) + + def initialize(self): + nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5)) + self.optimizer.initialize() + #print(self.device) + + def forward(self, x): + """Compute a prediction.""" + #print("Prob:",self.parameters["prob"].item()) + if random.random() < self.parameters["prob"]: + #angle = 45 + #x = TF.rotate(x, angle) + #print(self.device) + #x = F.linear(x, torch.ones(28*28, 28*28, device=self.device).t()*self.parameters["mag"], bias=None) + x = x + self.parameters["mag"] + + x = F.linear(x, self.parameters["w1"], self.parameters["b1"]) + x = torch.tanh(x) + x = F.linear(x, self.parameters["w2"], self.parameters["b2"]) + x = torch.tanh(x) + x = F.log_softmax(x, dim=1) + return x + + def adjust(self): + self.optimizer.adjust(self.parameters) + + def __str__(self): + return "mnist_FC_augmented / " + str(self.optimizer) + +class LeNet(Optimizable, nn.Module): + def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')): + nn.Module.__init__(self) + self.device = device + parameters = { + "w1": torch.zeros(20, num_inp, 5, 5, device=self.device), + "b1": torch.zeros(20, device=self.device), + "w2": torch.zeros(50, 20, 5, 5, device=self.device), + "b2": torch.zeros(50, device=self.device), + "w3": torch.zeros(500,4*4*50, device=self.device), + "b3": torch.zeros(500, device=self.device), + "w4": torch.zeros(10, 500, device=self.device), + "b4": torch.zeros(10, device=self.device), + + #Data augmentation + "prob": torch.tensor(1.0, device=self.device), + "mag": torch.tensor(180.0, device=self.device), + } + super().__init__(parameters, optimizer) + + def initialize(self): + nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.parameters["w3"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.parameters["w4"], a=math.sqrt(5)) + self.optimizer.initialize() + + def forward(self, x): + + if random.random() < 
self.parameters["prob"]: + + batch_size = x.shape[0] + # create transformation (rotation) + alpha = self.parameters["mag"] # in degrees + angle = torch.ones(batch_size, device=self.device) * alpha + + # define the rotation center + center = torch.ones(batch_size, 2, device=self.device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=self.device) + + # compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original image + x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + #print("Start Shape ", x.shape) + out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = out.view(out.size(0), -1) + #print("Shape ", out.shape) + out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"])) + #print("Shape ", out.shape) + out = F.linear(out, self.parameters["w4"], self.parameters["b4"]) + #print("Shape ", out.shape) + return F.log_softmax(out, dim=1) + + def adjust(self): + self.optimizer.adjust(self.parameters) + + def __str__(self): + return "mnist_CNN_augmented / " + str(self.optimizer) + +class LeNet_v2(Optimizable, nn.Module): + def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')): + + nn.Module.__init__(self) + self.device = device + self.conv1 = nn.Conv2d(num_inp, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + #self.fc1 = nn.Linear(4*4*50, 500) + self.fc1 = nn.Linear(1250, 500) + self.fc2 = nn.Linear(500, 10) + + #print(self.conv1.weight) + parameters = { + "w1": self.conv1.weight, + "b1": self.conv1.bias, + "w2": self.conv2.weight, + "b2": self.conv2.bias, + "w3": self.fc1.weight, + "b3": self.fc1.bias, + "w4": self.fc2.weight, + "b4": self.fc2.bias, + + #Data augmentation + "prob": torch.tensor(0.5, device=self.device), + "mag": torch.tensor(1.0, device=self.device), + } + Optimizable.__init__(self, parameters, optimizer) + + ''' + def forward(self, x): #Sature la memoire ??? 
+ x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + #x = x.view(-1, 4*4*50) + x = x.view(x.size(0), -1) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + ''' + def forward(self, x): + + if random.random() < self.parameters["prob"].item(): + #print(self.parameters["prob"]) + #x = [T.ToTensor()( + # TF.affine(img=T.ToPILImage()(im), angle=self.parameters["mag"], translate=(0,0), scale=1, shear=0, resample=0, fillcolor=None)) + # for im in torch.unbind(x,dim=0)] + #x = torch.stack(x,dim=0) + + #x = [ndimage.rotate(im, self.parameters["mag"], reshape=False) + # for im in torch.unbind(x,dim=0)] + #x = torch.stack(x,dim=0) + + #x = [im + self.parameters["mag"] + # for im in torch.unbind(x,dim=0)] + #x = torch.stack(x,dim=0) + + batch_size = x.shape[0] + # create transformation (rotation) + alpha = self.parameters["mag"] * 180 # in degrees + angle = torch.ones(batch_size, device=self.device) * alpha + + # define the rotation center + center = torch.ones(batch_size, 2, device=self.device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=self.device) + + # compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original image + x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + #print("Start Shape ", x.shape) + out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = out.view(out.size(0), -1) + #print("Shape ", out.shape) + out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"])) + #print("Shape ", out.shape) + out = F.linear(out, self.parameters["w4"], self.parameters["b4"]) + #print("Shape ", out.shape) + return F.log_softmax(out, dim=1) + + def initialize(self): + self.optimizer.initialize() + + def adjust(self): + self.optimizer.adjust(self.parameters) + + def adjust_val(self): + self.optimizer.adjust_val(self.parameters) + + def eval(self): + self.parameters['prob']=torch.tensor(0.0, device=self.device) + + def __str__(self): + return "mnist_CNN_augmented / " + str(self.optimizer) \ No newline at end of file diff --git a/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug.py b/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug.py new file mode 100644 index 0000000..160e97c --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug.py @@ -0,0 +1,52 @@ +import torch +from torch.utils.data import Dataset, DataLoader +from torchvision import transforms +import torchvision.transforms.functional as TF + +class MNIST_aug(Dataset): + + training_file = 'training.pt' + test_file = 'test.pt' + classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', + '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine'] + + def __init__(self): + self.images = [TF.to_pil_image(x) for x in torch.ByteTensor(10, 3, 48, 48)] + self.set_stage(0) # initial stage + + def __getitem__(self, index): + image = self.images[index] + + # Just apply your transformations here + image = self.crop(image) + x = TF.to_tensor(image) + return x + + def set_stage(self, stage): + if 
stage == 0: + print('Using (32, 32) crops') + self.crop = transforms.RandomCrop((32, 32)) + elif stage == 1: + print('Using (28, 28) crops') + self.crop = transforms.RandomCrop((28, 28)) + + def __len__(self): + return len(self.images) + + +dataset = MyData() +loader = DataLoader(dataset, + batch_size=2, + num_workers=2, + shuffle=True) + +for batch_idx, data in enumerate(loader): + print('Batch idx {}, data shape {}'.format( + batch_idx, data.shape)) + +loader.dataset.set_stage(1) + +for batch_idx, data in enumerate(loader): + print('Batch idx {}, data shape {}'.format( + batch_idx, data.shape)) + diff --git a/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug_v2.py b/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug_v2.py new file mode 100644 index 0000000..d2a992b --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/dataset_aug_v2.py @@ -0,0 +1,150 @@ +#from hyperopt import * +from hyperopt_v2 import * + +import torchvision.transforms.functional as TF +import torchvision.transforms as T + +#from scipy import ndimage +import kornia + +import random + + +class LeNet_v3(nn.Module): + def __init__(self, num_inp, num_out): + super(LeNet_v3, self).__init__() + self.params = nn.ParameterDict({ + 'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)), + 'b1': nn.Parameter(torch.zeros(20)), + 'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)), + 'b2': nn.Parameter(torch.zeros(50)), + 'w3': nn.Parameter(torch.zeros(500,4*4*50)), + 'b3': nn.Parameter(torch.zeros(500)), + 'w4': nn.Parameter(torch.zeros(10, 500)), + 'b4': nn.Parameter(torch.zeros(10)) + }) + + + def initialize(self): + nn.init.kaiming_uniform_(self.params["w1"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.params["w2"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.params["w3"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.params["w4"], a=math.sqrt(5)) + + def forward(self, x): + #print("Start Shape ", x.shape) + out = F.relu(F.conv2d(input=x, weight=self.params["w1"], bias=self.params["b1"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = F.relu(F.conv2d(input=out, weight=self.params["w2"], bias=self.params["b2"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = out.view(out.size(0), -1) + #print("Shape ", out.shape) + out = F.relu(F.linear(out, self.params["w3"], self.params["b3"])) + #print("Shape ", out.shape) + out = F.linear(out, self.params["w4"], self.params["b4"]) + #print("Shape ", out.shape) + return F.log_softmax(out, dim=1) + + + def print_grad_fn(self): + for n, p in self.params.items(): + print(n, p.grad_fn) + + def __str__(self): + return "mnist_CNN_augmented / " + +class Data_aug(nn.Module): + def __init__(self): + super(Data_aug, self).__init__() + self.data_augmentation = True + self.params = nn.ParameterDict({ + "prob": nn.Parameter(torch.tensor(0.5)), + "mag": nn.Parameter(torch.tensor(180.0)) + }) + + #self.params["mag"].register_hook(print) + + def forward(self, x): + + if self.data_augmentation and self.training and random.random() < self.params["prob"]: + #print('Aug') + batch_size = x.shape[0] + # create transformation (rotation) + alpha = self.params["mag"] # in degrees + angle = torch.ones(batch_size, device=x.device) * alpha + + # define the rotation center + center = torch.ones(batch_size, 2, device=x.device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=x.device) + + # 
compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original image + x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + return x + + def eval(self): + self.params['prob']=torch.tensor(0.0, device=self.device) + nn.Module.eval(self) + + def data_augmentation(self, mode=True): + self.data_augmentation=mode + + def print_grad_fn(self): + for n, p in self.params.items(): + print(n, p.grad_fn) + + def __str__(self): + return "Data_Augmenter / " + +class Augmented_model(nn.Module): + def __init__(self, model, data_augmenter): + #self.model = model + #self.data_aug = data_augmenter + super(Augmented_model, self).__init__()#nn.Module.__init__(self) + #super().__init__() + self.mods = nn.ModuleDict({ + 'data_aug': data_augmenter, + 'model': model + }) + #for name, param in self.mods.named_parameters(): + # print(name, type(param.data), param.size()) + + #params = self.mods.named_parameters() #self.parameters() + #parameters = [param for param in self.model.parameters()] + [param for param in self.data_aug.parameters()] + #Optimizable.__init__(self, params, optimizer) + + def initialize(self): + self.mods['model'].initialize() + + def forward(self, x): + return self.mods['model'](self.mods['data_aug'](x)) + + #def adjust(self): + # self.optimizer.adjust(self) #Parametres des dict + + def data_augmentation(self, mode=True): + self.mods['data_aug'].data_augmentation=mode + + def begin(self): + for param in self.parameters(): + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + + def print_grad_fn(self): + for n, m in self.mods.items(): + m.print_grad_fn() + + def __str__(self): + return str(self.mods['data_aug'])+ str(self.mods['model'])# + str(self.optimizer) \ No newline at end of file diff --git a/Gradient-Descent-The-Ultimate-Optimizer/graph/graph b/Gradient-Descent-The-Ultimate-Optimizer/graph/graph new file mode 100644 index 0000000..96389f9 --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/graph/graph @@ -0,0 +1,5 @@ +digraph { + graph [size="12,12"] + node [align=left fontsize=12 height=0.2 ranksep=0.1 shape=box style=filled] + 94296775052080 [label=NoneType fillcolor=darkolivegreen1] +} diff --git a/Gradient-Descent-The-Ultimate-Optimizer/graph/graph.svg b/Gradient-Descent-The-Ultimate-Optimizer/graph/graph.svg new file mode 100644 index 0000000..a682cbc --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/graph/graph.svg @@ -0,0 +1,19 @@ + + + + + + +%3 + + + +94296775052080 + +NoneType + + + diff --git a/Gradient-Descent-The-Ultimate-Optimizer/hyperopt.py b/Gradient-Descent-The-Ultimate-Optimizer/hyperopt.py new file mode 100644 index 0000000..1506f30 --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/hyperopt.py @@ -0,0 +1,345 @@ +import math +import torch +import torchvision +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +class Optimizable():#nn.Module): + """ + This is the interface for anything that has parameters that need to be + optimized, somewhat like torch.nn.Model but with the right plumbing for + hyperoptimizability. (Specifically, torch.nn.Model uses the Parameter + interface which does not give us enough control about the detachments.) 
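+ Each Optimizable owns a plain dict of parameter tensors plus a reference to the
+ optimizer that tunes them; that optimizer is itself an Optimizable, so when the
+ loss is backpropagated with create_graph=True its hyperparameters receive
+ gradients as well, which is what makes the whole stack hyperoptimizable.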
+ Nominal operation of an Optimizable at the lowest level is as follows: + o = MyOptimizable(…) + o.initialize() + loop { + o.begin() + o.zero_grad() + loss = –compute loss function from parameters– + loss.backward() + o.adjust() + } + Optimizables recursively handle updates to their optimiz*ers*. + """ + #def __init__(self): + # super(Optimizable, self).__init__() + # self.parameters = nn.Parameter(torch.zeros(())) + + def __init__(self, parameters, optimizer): + #super(Optimizable, self).__init__() + self.parameters = parameters # a dict mapping names to tensors + self.optimizer = optimizer # which must itself be Optimizable! + self.all_params_with_gradients = [] + #self.device = device + + def initialize(self): + """Initialize parameters, e.g. with a Kaiming initializer.""" + pass + + def begin(self): + """Enable gradient tracking on current parameters.""" + self.all_params_with_gradients = [] #Reintialisation pour eviter surcharge de la memoire + for name, param in self.parameters.items(): + #for param in self.parameters: + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + #param.to(self.device) + #if param.device == torch.device('cuda:0'): + # print(name, param.device) + self.all_params_with_gradients.append(param) + self.optimizer.begin() + + def zero_grad(self): + """ Set all gradients to zero. """ + for param in self.all_params_with_gradients: + #param = param.to(self.device) + param.grad = torch.zeros(param.shape, device=param.device) + self.optimizer.zero_grad() + + """ Note: at this point you would probably call .backwards() on the loss + function. """ + + def adjust(self): + """ Update parameters """ + pass + + + def print_grad_fn(self): + self.optimizer.print_grad_fn() + for n, p in self.parameters.items(): + print(n," - ", p.grad_fn) + + def param_grad(self): + return self.all_params_with_gradients + + def param(self, param_name): + return self.parameters[param_name].item() + + +class MNIST_FullyConnected(Optimizable): + """ + A fully-connected NN for the MNIST task. This is Optimizable but not itself + an optimizer. + """ + + def __init__(self, num_inp, num_hid, num_out, optimizer): + parameters = { + "w1": torch.zeros(num_inp, num_hid).t(), + "b1": torch.zeros(num_hid).t(), + "w2": torch.zeros(num_hid, num_out).t(), + "b2": torch.zeros(num_out).t(), + } + super().__init__(parameters, optimizer) + + def initialize(self): + nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5)) + self.optimizer.initialize() + + def forward(self, x): + """Compute a prediction.""" + x = F.linear(x, self.parameters["w1"], self.parameters["b1"]) + x = torch.tanh(x) + x = F.linear(x, self.parameters["w2"], self.parameters["b2"]) + x = torch.tanh(x) + x = F.log_softmax(x, dim=1) + return x + + def adjust(self): + self.optimizer.adjust(self.parameters) + + def __str__(self): + return "mnist / " + str(self.optimizer) + + +class NoOpOptimizer(Optimizable):#, nn.Module): + """ + NoOpOptimizer sits on top of a stack, and does not affect what lies below. 
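+ In practice it terminates an optimizer stack; for instance (sketch, using the
+ Adam class defined below):
+ opt = Adam(alpha=0.001, optimizer=Adam(alpha=1e-7)) # second Adam tunes the first one's hyperparameters
+ opt = Adam(alpha=0.001) # optimizer defaults to NoOpOptimizer(), so its hyperparameters stay fixed
+ Calls to begin(), zero_grad() and adjust() recurse upward until they reach this
+ class, whose methods all do nothing.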
+ """ + + def __init__(self): + #super(Optimizable, self).__init__() + pass + + def initialize(self): + pass + + def begin(self): + pass + + def zero_grad(self): + pass + + def adjust(self, params): + pass + + def adjust_val(self, params): + pass + + def print_grad_fn(self): + pass + + def __str__(self): + return "static" + +class Adam(Optimizable): + """ + A fully hyperoptimizable Adam optimizer + """ + + def clamp(x): + return (x.tanh() + 1.0) / 2.0 + + def unclamp(y): + z = y * 2.0 - 1.0 + return ((1.0 + z) / (1.0 - z)).log() / 2.0 + + def __init__( + self, + alpha=0.001, + beta1=0.9, + beta2=0.999, + log_eps=-8.0, + optimizer=NoOpOptimizer(), + device = torch.device('cuda') + ): + self.device = device + parameters = { + "alpha": torch.tensor(alpha, device=self.device), + "beta1": Adam.unclamp(torch.tensor(beta1, device=self.device)), + "beta2": Adam.unclamp(torch.tensor(beta2, device=self.device)), + "log_eps": torch.tensor(log_eps, device=self.device), + } + super().__init__(parameters, optimizer) + self.num_adjustments = 0 + self.num_adjustments_val = 0 + self.cache = {} + + for name, param in parameters.items(): + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + #param.to(self.device) + #if param.device == torch.device('cuda:0'): + # print(name, param.device) + + def adjust(self, params): #Update param d'apprentissage + self.num_adjustments += 1 + self.optimizer.adjust(self.parameters) + #print('Adam update') + t = self.num_adjustments + beta1 = Adam.clamp(self.parameters["beta1"]) + beta2 = Adam.clamp(self.parameters["beta2"]) + for name, param in params.items(): + if name == "mag": continue + if name not in self.cache: + self.cache[name] = { + "m": torch.zeros(param.shape, device=self.device), + "v": torch.zeros(param.shape, device=self.device) + + 10.0 ** self.parameters["log_eps"].data + # NOTE that we add a little ‘fudge factor' here because sqrt is not + # differentiable at exactly zero + } + #print(name, param.device) + g = param.grad.detach() + self.cache[name]["m"] = m = ( + beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g + ) + self.cache[name]["v"] = v = ( + beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g + ) + self.all_params_with_gradients.append(m) + self.all_params_with_gradients.append(v) + m_hat = m / (1.0 - beta1 ** float(t)) + v_hat = v / (1.0 - beta2 ** float(t)) + dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"]) + params[name] = param.detach() - self.parameters["alpha"] * dparam + #print(name) + + def adjust_val(self, params): #Update param Transformations + self.num_adjustments_val += 1 + self.optimizer.adjust_val(self.parameters) + #print('Adam update') + t = self.num_adjustments_val + beta1 = Adam.clamp(self.parameters["beta1"]) + beta2 = Adam.clamp(self.parameters["beta2"]) + for name, param in params.items(): + if name != "mag": continue + if name not in self.cache: + self.cache[name] = { + "m": torch.zeros(param.shape, device=self.device), + "v": torch.zeros(param.shape, device=self.device) + + 10.0 ** self.parameters["log_eps"].data + # NOTE that we add a little ‘fudge factor' here because sqrt is not + # differentiable at exactly zero + } + #print(name, param.device) + g = param.grad.detach() + self.cache[name]["m"] = m = ( + beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g + ) + self.cache[name]["v"] = v = ( + beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g + ) + self.all_params_with_gradients.append(m) + 
self.all_params_with_gradients.append(v) + m_hat = m / (1.0 - beta1 ** float(t)) + v_hat = v / (1.0 - beta2 ** float(t)) + dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"]) + params[name] = param.detach() - self.parameters["alpha"] * dparam + #print(name) + + def __str__(self): + return "adam(" + str(self.parameters) + ") / " + str(self.optimizer) +''' +class SGD(Optimizable): + """ + A hyperoptimizable SGD + """ + + def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()): + parameters = {"alpha": torch.tensor(alpha)} + super().__init__(parameters, optimizer) + + def adjust(self, params): + self.optimizer.adjust(self.parameters) + for name, param in params.items(): + g = param.grad.detach() + params[name] = param.detach() - g * self.parameters["alpha"] + + def __str__(self): + return "sgd(%f) / " % self.parameters["alpha"] + str(self.optimizer) + +class SGDPerParam(Optimizable): + """ + Like above, but can be taught a separate step size for each parameter it + tunes. + """ + + def __init__(self, alpha=0.01, params=[], optimizer=NoOpOptimizer()): + parameters = {name + "_alpha": torch.tensor(alpha) for name in params} + super().__init__(parameters, optimizer) + + def adjust(self, params): + self.optimizer.adjust(self.parameters) + for name, param in params.items(): + g = param.grad.detach() + params[name] = param.detach() - g * self.parameters[name + "_alpha"] + + def __str__(self): + return "sgd(%s) / " % str( + {k: t.item() for k, t in self.parameters.items()} + ) + str(self.optimizer) +''' +''' +class AdamBaydin(Optimizable): + """ Same as above, but only optimizes the learning rate, treating the + remaining hyperparameters as constants. """ + + def __init__( + self, + alpha=0.001, + beta1=0.9, + beta2=0.999, + log_eps=-8.0, + optimizer=NoOpOptimizer(), + ): + parameters = {"alpha": torch.tensor(alpha)} + self.beta1 = beta1 + self.beta2 = beta2 + self.log_eps = log_eps + super().__init__(parameters, optimizer) + self.num_adjustments = 0 + self.cache = {} + + def adjust(self, params): + self.num_adjustments += 1 + self.optimizer.adjust(self.parameters) + t = self.num_adjustments + beta1 = self.beta1 + beta2 = self.beta2 + for name, param in params.items(): + if name not in self.cache: + self.cache[name] = { + "m": torch.zeros(param.shape), + "v": torch.zeros(param.shape) + 10.0 ** self.log_eps, + } + g = param.grad.detach() + self.cache[name]["m"] = m = ( + beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g + ) + self.cache[name]["v"] = v = ( + beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g + ) + self.all_params_with_gradients.append(m) + self.all_params_with_gradients.append(v) + m_hat = m / (1.0 - beta1 ** float(t)) + v_hat = v / (1.0 - beta2 ** float(t)) + dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.log_eps) + params[name] = param.detach() - self.parameters["alpha"] * dparam + + def __str__(self): + return "adam(" + str(self.parameters) + ") / " + str(self.optimizer) +''' \ No newline at end of file diff --git a/Gradient-Descent-The-Ultimate-Optimizer/hyperopt_v2.py b/Gradient-Descent-The-Ultimate-Optimizer/hyperopt_v2.py new file mode 100644 index 0000000..c100085 --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/hyperopt_v2.py @@ -0,0 +1,296 @@ +import math +import torch +import torchvision +import torch.nn as nn +import torch.nn.functional as F +from torch.optim.optimizer import Optimizer + +class Optimizable(): + """ + This is the interface for anything that has parameters that need to be + optimized, somewhat like 
torch.nn.Model but with the right plumbing for + hyperoptimizability. (Specifically, torch.nn.Model uses the Parameter + interface which does not give us enough control about the detachments.) + Nominal operation of an Optimizable at the lowest level is as follows: + o = MyOptimizable(…) + o.initialize() + loop { + o.begin() + o.zero_grad() + loss = –compute loss function from parameters– + loss.backward() + o.adjust() + } + Optimizables recursively handle updates to their optimiz*ers*. + """ + #def __init__(self): + # super(Optimizable, self).__init__() + # self.parameters = nn.Parameter(torch.zeros(())) + + def __init__(self, parameters, optimizer): + self.params = parameters # a dict mapping names to tensors + self.optimizer = optimizer # which must itself be Optimizable! + self.all_params_with_gradients = [] + #self.device = device + + def initialize(self): + """Initialize parameters, e.g. with a Kaiming initializer.""" + pass + + def begin(self): + """Enable gradient tracking on current parameters.""" + self.all_params_with_gradients = nn.ParameterList() #Reintialisation pour eviter surcharge de la memoire + print("Opti param :", type(self.params)) + #for name, param in self.params: + if isinstance(self.params,dict): #Dict + for name, param in self.params: + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + self.all_params_with_gradients.append(param) + if isinstance(self.params,list): #List + for param in self.params: + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + self.all_params_with_gradients.append(param) + self.optimizer.begin() + + def zero_grad(self): + """ Set all gradients to zero. """ + for param in self.all_params_with_gradients: + param.grad = torch.zeros(param.shape, device=param.device) + self.optimizer.zero_grad() + + """ Note: at this point you would probably call .backwards() on the loss + function. """ + + def adjust(self): + """ Update parameters """ + pass + + +class NoOpOptimizer(Optimizable):#, nn.Module): + """ + NoOpOptimizer sits on top of a stack, and does not affect what lies below. + """ + + def __init__(self): + #super(Optimizable, self).__init__() + pass + + def initialize(self): + pass + + def begin(self): + #print("NoOpt begin") + pass + + def zero_grad(self): + pass + + def adjust(self, params): + pass + + def step(self): + pass + + def print_grad_fn(self): + pass + + def __str__(self): + return "static" + + +class SGD(Optimizer, nn.Module): #Eviter Optimizer + """ + A hyperoptimizable SGD + """ + + def __init__(self, params, lr=0.01, height=0): + self.height=height + #params : a optimiser + #reste (defaults) param de l'opti + print('SGD - H', height) + nn.Module.__init__(self) + + optim_keys = ('lr','') #A mettre dans Optimizable ? #'' pour eviter iteration dans la chaine de charactere... 
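+ # The hyperparameters below are stored as nn.Parameters; the ones named in
+ # optim_keys ('lr' here) are handed to another SGD one level up the stack
+ # (height-1), which learns this SGD's learning rate.
+ # Typical use (sketch; cf. tests.py):
+ # optimizer = SGD(model.parameters(), lr=0.01, height=1)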
+ ''' + self_params = {"lr": torch.tensor(lr), + "momentum": 0, + "dampening":0, + "weight_decay":0, + "nesterov": False} + ''' + #self_params = dict(lr=torch.tensor(lr), + # momentum=0, dampening=0, weight_decay=0, nesterov=False) + + self_params = nn.ParameterDict({ + "lr": nn.Parameter(torch.tensor(lr)), + "momentum": nn.Parameter(torch.tensor(0.0)), + "dampening": nn.Parameter(torch.tensor(0.0)), + "weight_decay": nn.Parameter(torch.tensor(0.0)), + }) + + for k in self_params.keys() & optim_keys: + self_params[k].requires_grad_() # keep gradient information… + self_params[k].retain_grad() # even if not a leaf… + #self_params[k].register_hook(print) + + if height==0: + optimizer = NoOpOptimizer() + else: + #def dict_generator(): yield {k: self_params[k] for k in self_params.keys() & optim_keys} + #(dict for dict in {k: self_params[k] for k in self_params.keys() & optim_keys}) #Devrait mar + optimizer = SGD(params=(self_params[k]for k in self_params.keys() & optim_keys), lr=lr, height=height-1) + #optimizer.register_backward_hook(print) + + self.optimizer = optimizer + #if(height==0): + # for n,p in params.items(): + # print(n,p) + + #Optimizable.__init__(self, self_params, optimizer) + + #print(type(params)) + #for p in params: + # print(type(p)) + Optimizer.__init__(self, params, self_params) + + for group in self.param_groups: + for p in group['params']: + print(type(p.data), p.size()) + print('End SGD-H', height) + + def begin(self): + for group in self.param_groups: + for p in group['params']: + #print(type(p.data), p.size()) + p.requires_grad_() # keep gradient information… + p.retain_grad() # even if not a leaf… + #p.register_hook(lambda x: print(self.height, x.grad_fn)) + + self.optimizer.begin() + + def print_grad_fn(self): + self.optimizer.print_grad_fn() + for group in self.param_groups: + for i, p in enumerate(group['params']): + print(self.height," - ", i, p.grad_fn) + + #def adjust(self, params): + # self.optimizer.adjust(self.params) + # for name, param in params.items(): + # g = param.grad.detach() + # params[name] = param.detach() - g * self.params["lr"] + + def step(self): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + print('SGD start') + self.optimizer.step() + + for group in self.param_groups: + for i, p in enumerate(group['params']): + if p.grad is None: + continue + #d_p = p.grad.data + d_p = p.grad.detach() + + #print(group['lr']) + p.data.add_(-group['lr'].item(), d_p) + #group['params'][i] = p.detach() - d_p * group['lr'] + p.data-= group['lr']*d_p #Data ne pas utiliser perte info + + for p in group['params']: + if p.grad is None: + print(p, p.grad) + continue + + print("SGD end") + #return loss + + def __str__(self): + return "sgd(%f) / " % self.params["lr"] + str(self.optimizer) + + +class Adam(Optimizable, nn.Module): + """ + A fully hyperoptimizable Adam optimizer + """ + + def clamp(x): + return (x.tanh() + 1.0) / 2.0 + + def unclamp(y): + z = y * 2.0 - 1.0 + return ((1.0 + z) / (1.0 - z)).log() / 2.0 + + def __init__( + self, + alpha=0.001, + beta1=0.9, + beta2=0.999, + log_eps=-8.0, + optimizer=NoOpOptimizer(), + device = torch.device('cuda') + ): + #super(Adam, self).__init__() + nn.Module.__init__(self) + self.device = device + params = nn.ParameterDict({ + "alpha": nn.Parameter(torch.tensor(alpha, device=self.device)), + "beta1": nn.Parameter(Adam.unclamp(torch.tensor(beta1, device=self.device))), + "beta2": nn.Parameter(Adam.unclamp(torch.tensor(beta2, device=self.device))), + "log_eps": nn.Parameter(torch.tensor(log_eps, device=self.device)), + }) + Optimizable.__init__(self, params, optimizer) + self.num_adjustments = 0 + self.cache = {} + + for name, param in params.items(): + param.requires_grad_() # keep gradient information… + param.retain_grad() # even if not a leaf… + + def adjust(self, params, pytorch_mod=False): + self.num_adjustments += 1 + self.optimizer.adjust(self.params) + t = self.num_adjustments + beta1 = Adam.clamp(self.params["beta1"]) + beta2 = Adam.clamp(self.params["beta2"]) + + updated_param = [] + if pytorch_mod: + params = params.named_parameters(prefix='') #Changer nom d'input... + + for name, param in params: + if name not in self.cache: + self.cache[name] = { + "m": torch.zeros(param.shape, device=self.device), + "v": torch.zeros(param.shape, device=self.device) + + 10.0 ** self.params["log_eps"].data + # NOTE that we add a little ‘fudge factor' here because sqrt is not + # differentiable at exactly zero + } + #print(name, param.device) + g = param.grad.detach() + self.cache[name]["m"] = m = ( + beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g + ) + self.cache[name]["v"] = v = ( + beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g + ) + self.all_params_with_gradients.append(nn.Parameter(m)) #Risque de surcharger la memoire => Dict mieux ? + self.all_params_with_gradients.append(nn.Parameter(v)) + m_hat = m / (1.0 - beta1 ** float(t)) + v_hat = v / (1.0 - beta2 ** float(t)) + dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.params["log_eps"]) + updated_param[name] = param.detach() - self.params["alpha"] * dparam + + if pytorch_mod: params.update(updated_param) #Changer nom d'input... 
+ else: params = updated_param + + def __str__(self): + return "adam(" + str(self.params) + ") / " + str(self.optimizer) diff --git a/Gradient-Descent-The-Ultimate-Optimizer/main.py b/Gradient-Descent-The-Ultimate-Optimizer/main.py new file mode 100644 index 0000000..6ed0f6f --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/main.py @@ -0,0 +1,182 @@ +import numpy as np +import json, math, time, os +from hyperopt import * +import gc + +BATCH_SIZE = 300 + +mnist_train = torchvision.datasets.MNIST( + "./data", train=True, download=True, transform=torchvision.transforms.ToTensor() +) + +mnist_test = torchvision.datasets.MNIST( + "./data", train=False, download=True, transform=torchvision.transforms.ToTensor() +) + +dl_train = torch.utils.data.DataLoader( + mnist_train, batch_size=BATCH_SIZE, shuffle=False +) +dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False) + + +def test(model): + for i, (features_, labels_) in enumerate(dl_test): + features, labels = torch.reshape(features_, (10000, 28 * 28)), labels_ + pred = model.forward(features) + return pred.argmax(dim=1).eq(labels).sum().item() / 10000 * 100 + + +def train(model, epochs=3, height=1): + stats = [] + for epoch in range(epochs): + for i, (features_, labels_) in enumerate(dl_train): + t0 = time.process_time() + model.begin() + features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_ + pred = model.forward( + features + ) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/ + loss = F.nll_loss(pred, labels) + model.zero_grad() + loss.backward(create_graph=True) + model.adjust() + tf = time.process_time() + data = { + "time": tf - t0, + "iter": epoch * len(dl_train) + i, + "loss": loss.item(), + "params": { + k: v.item() + for k, v in model.optimizer.parameters.items() + if "." 
not in k + }, + } + stats.append(data) + return stats + + +def run(opt, name="out", usr={}, epochs=3, height=1): + torch.manual_seed(0x42) + model = MNIST_FullyConnected(28 * 28, 128, 10, opt) + print("Running...", str(model)) + model.initialize() + log = train(model, epochs, height) + acc = test(model) + out = {"acc": acc, "log": log, "usr": usr} + with open("log/%s.json" % name, "w+") as f: + json.dump(out, f, indent=True) + times = [x["time"] for x in log] + print("Times (ms):", np.mean(times), "+/-", np.std(times)) + print("Final accuracy:", acc) + return out + + +def sgd_experiments(): + run(SGD(0.01), "sgd", epochs=1) + out = run(SGD(0.01, optimizer=SGD(0.01)), "sgd+sgd", epochs=1) + alpha = out["log"][-1]["params"]["alpha"] + print(alpha) + run(SGD(alpha), "sgd-final", epochs=1) + + +def adam_experiments(): + run(Adam(), "adam", epochs=1) + print() + mo = SGDPerParam( + 0.001, ["alpha", "beta1", "beta2", "log_eps"], optimizer=SGD(0.0001) + ) + out = run(Adam(optimizer=mo), "adam+sgd", epochs=1) + p = out["log"][-1]["params"] + alpha = p["alpha"] + beta1 = Adam.clamp(torch.tensor(p["beta1"])).item() + beta2 = Adam.clamp(torch.tensor(p["beta2"])).item() + log_eps = p["log_eps"] + print(alpha, beta1, beta2, log_eps) + print(mo) + run( + Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps), + "adam+sgd-final", + epochs=1, + ) + print() + out = run(Adam(optimizer=Adam()), "adam2", epochs=1) + p = out["log"][-1]["params"] + alpha = p["alpha"] + beta1 = Adam.clamp(torch.tensor(p["beta1"])).item() + beta2 = Adam.clamp(torch.tensor(p["beta2"])).item() + log_eps = p["log_eps"] + print(alpha, beta1, beta2, log_eps) + run( + Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps), + "adam2-final", + epochs=1, + ) + print() + mo = SGDPerParam(0.001, ["alpha"], optimizer=SGD(0.0001)) + out = run(AdamBaydin(optimizer=mo), "adambaydin+sgd", epochs=1) + p = out["log"][-1]["params"] + alpha = p["alpha"] + print(alpha) + print(mo) + run(Adam(alpha=p["alpha"]), "adambaydin+sgd-final", epochs=1) + print() + out = run(AdamBaydin(optimizer=Adam()), "adambaydin2", epochs=1) + p = out["log"][-1]["params"] + alpha = p["alpha"] + print(alpha) + run(Adam(alpha=p["alpha"]), "adambaydin2-final", epochs=1) + + +def surface(): + run(SGD(10 ** -3, optimizer=SGD(10 ** -1)), "tst", epochs=1) + for log_alpha in np.linspace(-3, 2, 10): + run(SGD(10 ** log_alpha), "sgd@1e%+.2f" % log_alpha, epochs=1) + + +def make_sgd_stack(height, top): + if height == 0: + return SGD(alpha=top) + return SGD(alpha=top, optimizer=make_sgd_stack(height - 1, top)) + + +def make_adam_stack(height, top=0.0000001): + if height == 0: + return Adam(alpha=top) + return Adam(alpha=top, optimizer=make_adam_stack(height - 1)) + + +def stack_test(): + for top in np.linspace(-7, 3, 20): + for height in range(6): + print("height =", height, "to p=", top) + opt = make_sgd_stack(height, 10 ** top) + run( + opt, + "metasgd3-%d@%+.2f" % (height, top), + {"height": height, "top": top}, + epochs=1, + height=height, + ) + gc.collect() + + +def perf_test(): + for h in range(51): + print("height:", h) + # opt = make_sgd_stack(h, 0.01) + opt = make_adam_stack(h) + run(opt, "adamperf-%d" % h, {"height": h}, epochs=1) + gc.collect() + + +if __name__ == "__main__": + try: + os.mkdir("log") + except: + print("log/ exists already") + + surface() + sgd_experiments() + adam_experiments() + stack_test() + perf_test() diff --git a/Gradient-Descent-The-Ultimate-Optimizer/requirements.txt b/Gradient-Descent-The-Ultimate-Optimizer/requirements.txt 
new file mode 100644 index 0000000..5aae77b --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/requirements.txt @@ -0,0 +1,5 @@ +numpy==1.17.2 +Pillow==6.2.0 +six==1.12.0 +torch==1.2.0 +torchvision==0.4.0 diff --git a/Gradient-Descent-The-Ultimate-Optimizer/tests.py b/Gradient-Descent-The-Ultimate-Optimizer/tests.py new file mode 100644 index 0000000..936894f --- /dev/null +++ b/Gradient-Descent-The-Ultimate-Optimizer/tests.py @@ -0,0 +1,344 @@ +import numpy as np +import json, math, time, os +from data_aug import * +#from data_aug_v2 import * +import gc + +import matplotlib.pyplot as plt +from torchviz import make_dot, make_dot_from_trace + +from torch.utils.data import SubsetRandomSampler + +BATCH_SIZE = 300 +#TEST_SIZE = 10000 +TEST_SIZE = 300 +DATA_LIMIT = 10 + +''' +data_train = torchvision.datasets.MNIST( + "./data", train=True, download=True, + transform=torchvision.transforms.Compose([ + #torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0), + torchvision.transforms.ToTensor() + ]) +) +data_test = torchvision.datasets.MNIST( + "./data", train=False, download=True, transform=torchvision.transforms.ToTensor() +) + +''' +data_train = torchvision.datasets.CIFAR10( + "./data", train=True, download=True, + transform=torchvision.transforms.Compose([ + #torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0), + torchvision.transforms.ToTensor() + ]) +) + +data_test = torchvision.datasets.CIFAR10( + "./data", train=False, download=True, transform=torchvision.transforms.ToTensor() +) + +train_subset_indices=range(int(len(data_train)/2)) +val_subset_indices=range(int(len(data_train)/2),len(data_train)) + +dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices)) +dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices)) +dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False) + +def test(model, reshape_in=True, device = torch.device('cuda')): + for i, (features_, labels_) in enumerate(dl_test): + if reshape_in : + features, labels = torch.reshape(features_, (TEST_SIZE, 28 * 28)), labels_ + else: + features, labels =features_, labels_ + + features, labels = features.to(device), labels.to(device) + + pred = model.forward(features) + return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100 + +def train_one_epoch(model, optimizer, epoch=0, reshape_in=True, device = torch.device('cuda'), train_data=True): + if train_data: dl = dl_train + else: dl = dl_val + for i, (features_, labels_) in enumerate(dl): + if i > DATA_LIMIT : break + #t0 = time.process_time() + + if reshape_in : + features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_ + else: + features, labels =features_, labels_ + + features, labels = features.to(device), labels.to(device) + + #optimizer.begin() + #optimizer.zero_grad() + model.begin() + model.zero_grad() + pred = model.forward(features) + + #loss = F.nll_loss(pred, labels) + loss = F.cross_entropy(pred,labels) + + #model.print_grad_fn() + #optimizer.print_grad_fn() + #print('-'*50) + + loss.backward(create_graph=True) + + #optimizer.step() + if train_data: model.adjust() + else: model.adjust_val() + + #tf = time.process_time() + #data = { + # "time": tf - t0, + # "iter": epoch * len(dl_train) + i, + # "loss": loss.item(), + # 
"params": { + # k: v.item() + # for k, v in model.optimizer.parameters.items() + # if "." not in k + # }, + #} + #stats.append(data) + + #print_torch_mem(i) + return loss.item() + +def train_v2(model, optimizer, epochs=3, reshape_in=True, device = torch.device('cuda')): + log = [] + for epoch in range(epochs): + + #dl_train.dataset.transform=torchvision.transforms.Compose([ + # torchvision.transforms.RandomAffine(degrees=model.param('mag'), translate=None, scale=None, shear=None, resample=False, fillcolor=0), + # torchvision.transforms.ToTensor() + #]) + viz_data(fig_name='res/data_sample') + t0 = time.process_time() + loss = train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device) + train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device,train_data=False) + + #acc = test(model=model, reshape_in=reshape_in, device=device) + acc = 0 + + + tf = time.process_time() + data = { + "time": tf - t0, + "epoch": epoch, + "loss": loss, + "acc": acc, + "params": { + k: v.item() + for k, v in model.optimizer.parameters.items() + #for k, v in model.mods.data_aug.params.named_parameters() + if "." not in k + + }, + } + log.append(data) + + + print("Epoch :",epoch+1, "/",epochs, "- Loss :",log[-1]["loss"]) + param = [p for p in model.param_grad() if p.grad is not None] + if(len(param)!=0): + print(param[-2],' / ', param[-2].grad) + print(param[-1],' / ', param[-1].grad) + return log + +def train(model, epochs=3, height=1, reshape_in=True, device = torch.device('cuda')): + stats = [] + for epoch in range(epochs): + for i, (features_, labels_) in enumerate(dl_train): + t0 = time.process_time() + model.begin() + if reshape_in : + features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_ + else: + features, labels =features_, labels_ + + features, labels = features.to(device), labels.to(device) + + pred = model.forward( + features + ) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/ + #loss = F.nll_loss(pred, labels) + loss = F.cross_entropy(pred,labels) + + #print('-'*50) + #param = [p for p in model.param_grad() if p.grad is not None] + #if(len(param)!=0): + # print(param[-2],' / ', param[-2].grad) + # print(param[-1],' / ', param[-1].grad) + + model.zero_grad() + loss.backward(create_graph=True) + model.adjust() + tf = time.process_time() + data = { + "time": tf - t0, + "iter": epoch * len(dl_train) + i, + "loss": loss.item(), + "params": { + k: v.item() + for k, v in model.optimizer.parameters.items() + if "." 
not in k + }, + } + stats.append(data) + + print('-'*50) + i=0 + for obj in gc.get_objects(): + try: + if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)) and len(obj.size())>1: + print(i, type(obj), obj.size()) + i+=1 + except: + pass + print("Epoch :",epoch+1, "/",epochs, "- Loss :",stats[-1]["loss"]) + param = [p for p in model.param_grad() if p.grad is not None] + if(len(param)!=0): + print(param[-2],' / ', param[-2].grad) + print(param[-1],' / ', param[-1].grad) + return stats + +def run(opt, name="out", usr={}, epochs=10, height=1, cnn=True, device = torch.device('cuda')): + torch.manual_seed(0x42) + if not cnn: + reshape_in = True + #model = MNIST_FullyConnected(28 * 28, 128, 10, opt) + model = MNIST_FullyConnected_Augmented(28 * 28, 128, 10, opt, device=device) + + else: + reshape_in = False + #model = LeNet(1, 10,opt, device) + #model = LeNet_v2(1, 10,opt, device).to(device=device) + model = LeNet_v2(3, 10,opt, device).to(device=device) + optimizer=None + ''' + m = LeNet_v3(1, 10) + a = Data_aug() + model = Augmented_model(model=m, + data_augmenter=a, + optimizer=opt).to(device) #deux fois le meme optimizer ?... + ''' + ''' + m = LeNet_v3(1, 10) + a = Data_aug() + model = Augmented_model(model=m, data_augmenter=a).to(device) + #optimizer = SGD(model.parameters()) + optimizer = SGD(model.parameters(), lr=0.01, height=1) + ''' + + + #for idx, m in enumerate(model.modules()): + # print(idx, '->', m) + print("Running...", str(model)) + model.initialize() + #print_model(model) + #model.data_augmentation(False) + #model.eval() + + log = train_v2(model=model, optimizer=optimizer, epochs=epochs, reshape_in=reshape_in, device=device) + model.eval() + acc = test(model, reshape_in, device=device) + + + #param = [p for p in model.param_grad() if p.grad is not None] + #if(len(param)!=0): + # print(param[-2],' / ', param[-2].grad) + # print(param[-1],' / ', param[-1].grad) + + out = {"acc": acc, "log": log, "usr": usr} + with open("log/%s.json" % name, "w+") as f: + json.dump(out, f, indent=True) + times = [x["time"] for x in log] + print("Times (ms):", np.mean(times), "+/-", np.std(times)) + print("Final accuracy:", acc) + + #plot_res(log, fig_name='res/'+name) + + return out + +def make_adam_stack(height, top=0.0000001, device = torch.device('cuda')): + #print(height,device) + if height == 0: + return Adam(alpha=top, device=device) + return Adam(alpha=top, optimizer=make_adam_stack(height - 1, top, device=device), device=device) + +def plot_res(log, fig_name='res'): + + fig, ax = plt.subplots(ncols=3, figsize=(15, 3)) + ax[0].set_title('Loss') + ax[0].plot([x["loss"] for x in log]) + + ax[1].set_title('Acc') + ax[1].plot([x["acc"] for x in log]) + + ax[2].set_title('mag') + ax[2].plot([x["data_aug"] for x in log]) + + plt.savefig(fig_name) + +def print_torch_mem(add_info=''): + + nb=0 + max_size=0 + for obj in gc.get_objects(): + try: + if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1: + #print(i, type(obj), obj.size()) + size = np.sum(obj.size()) + if(size>max_size): max_size=size + nb+=1 + except: + pass + print(add_info, "-Pytroch tensor nb:",nb," / Max dim:", max_size) + +def print_model(model, fig_name='graph/graph'): #Semble ne pas marcher pour les models en fonctionnel + x = torch.randn(1,1,28,28, device=device) + dot=make_dot(model(x), params=dict(model.named_parameters())) + dot.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats + dot.render(fig_name) + print("Model graph 
generated !") + +def viz_data(fig_name='data_sample'): + + features_, labels_ = next(iter(dl_train)) + plt.figure(figsize=(10,10)) + #for i, (features_, labels_) in enumerate(dl_train): + for i in range(25): + if i==25: break + #print(features_.size(), labels_.size()) + + plt.subplot(5,5,i+1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + + img = features_[i,0,:,:] + + #print('im shape',img.shape) + plt.imshow(img, cmap=plt.cm.binary) + plt.xlabel(labels_[i].item()) + + plt.savefig(fig_name) + +########################################## +if __name__ == "__main__": + try: + os.mkdir("log") + except: + print("log/ exists already") + + device = torch.device('cuda') + + run(make_adam_stack(height=1, top=0.001, device=device), + "Augmented_MNIST", + epochs=100, + cnn=True, + device = device) + print() \ No newline at end of file diff --git a/higher/dataug.py b/higher/dataug.py new file mode 100644 index 0000000..03609f2 --- /dev/null +++ b/higher/dataug.py @@ -0,0 +1,583 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import * + +#import kornia +#import random +#import numpy as np +import copy + +import transformations as TF + +class Data_aug(nn.Module): #Rotation parametree + def __init__(self): + super(Data_aug, self).__init__() + self._data_augmentation = True + self._params = nn.ParameterDict({ + "prob": nn.Parameter(torch.tensor(0.5)), + "mag": nn.Parameter(torch.tensor(1.0)) + }) + + #self.params["mag"].register_hook(print) + + def forward(self, x): + + if self._data_augmentation and random.random() < self._params["prob"]: + #print('Aug') + batch_size = x.shape[0] + # create transformation (rotation) + alpha = self._params["mag"]*180 # in degrees + angle = torch.ones(batch_size, device=x.device) * alpha + + # define the rotation center + center = torch.ones(batch_size, 2, device=x.device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=x.device) + + # compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original image + x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + return x + + def eval(self): + self.augment(mode=False) + nn.Module.eval(self) + + def augment(self, mode=True): + self._data_augmentation=mode + + def __getitem__(self, key): + return self._params[key] + + def __str__(self): + return "Data_aug(Mag-1 TF)" + +class Data_augV2(nn.Module): #Methode exacte + def __init__(self): + super(Data_augV2, self).__init__() + self._data_augmentation = True + + self._fixed_transf=[0.0, 45.0, 180.0] #Degree rotation + #self._fixed_transf=[0.0] + self._nb_tf= len(self._fixed_transf) + + self._params = nn.ParameterDict({ + "prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme + #"prob2": nn.Parameter(torch.ones(len(self._fixed_transf)).softmax(dim=0)) + }) + + #print(self._params["prob"], self._params["prob2"]) + + self.transf_idx=0 + + def forward(self, x): + + if self._data_augmentation: + #print('Aug',self._fixed_transf[self.transf_idx]) + device = x.device + batch_size = x.shape[0] + + # create transformation (rotation) + #alpha = 180 # in degrees + alpha = self._fixed_transf[self.transf_idx] + angle = torch.ones(batch_size, device=device) * alpha + + x = self.rotate(x,angle) + + return x + + def rotate(self, x, angle): + + device = x.device + batch_size = x.shape[0] + # define 
the rotation center + center = torch.ones(batch_size, 2, device=device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=device) + + # compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original image + return kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + + def adjust_prob(self): #Detach from gradient ? + self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0) + #print('proba',self._params['prob']) + self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Contrainte sum(p)=1 + #print('Sum p', sum(self._params['prob'])) + + def eval(self): + self.augment(mode=False) + nn.Module.eval(self) + + def augment(self, mode=True): + self._data_augmentation=mode + + def __getitem__(self, key): + return self._params[key] + + def __str__(self): + return "Data_augV2(Exact-%d TF)" % self._nb_tf + +class Data_augV3(nn.Module): #Echantillonage uniforme/Mixte + def __init__(self, mix_dist=0.0): + super(Data_augV3, self).__init__() + self._data_augmentation = True + + #self._fixed_transf=[0.0, 45.0, 180.0] #Degree rotation + self._fixed_transf=[0.0, 1.0, -1.0] #Flips (Identity,Horizontal,Vertical) + #self._fixed_transf=[0.0] + self._nb_tf= len(self._fixed_transf) + + self._params = nn.ParameterDict({ + "prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme + #"prob2": nn.Parameter(torch.ones(len(self._fixed_transf)).softmax(dim=0)) + }) + + #print(self._params["prob"], self._params["prob2"]) + self._sample = [] + + self._mix_dist = False + if mix_dist != 0.0: + self._mix_dist = True + self._mix_factor = max(min(mix_dist, 1.0), 0.0) + + def forward(self, x): + + if self._data_augmentation: + device = x.device + batch_size = x.shape[0] + + + #good_distrib = Uniform(low=torch.zeros(batch_size,1, device=device),high=torch.new_full((batch_size,1),self._params["prob"], device=device)) + #bad_distrib = Uniform(low=torch.zeros(batch_size,1, device=device),high=torch.new_full((batch_size,1), 1-self._params["prob"], device=device)) + + #transform_dist = Categorical(probs=torch.tensor([self._params["prob"], 1-self._params["prob"]], device=device)) + #self._sample = transform_dist._sample(sample_shape=torch.Size([batch_size,1])) + + uniforme_dist = torch.ones(1,self._nb_tf,device=device).softmax(dim=0) + + if not self._mix_dist: + distrib = uniforme_dist + else: + distrib = (self._mix_factor*self._params["prob"]+(1-self._mix_factor)*uniforme_dist).softmax(dim=0) #Mix distrib reel / uniforme avec mix_factor + + cat_distrib= Categorical(probs=torch.ones((batch_size, self._nb_tf), device=device)*distrib) + self._sample = cat_distrib.sample() + + TF_param = torch.tensor([self._fixed_transf[x] for x in self._sample], device=device) #Approche de marco peut-etre plus rapide + + #x = self.rotate(x,angle=TF_param) + x = self.flip(x,flip_mat=TF_param) + + return x + + def rotate(self, x, angle): + + device = x.device + batch_size = x.shape[0] + # define the rotation center + center = torch.ones(batch_size, 2, device=device) + center[..., 0] = x.shape[3] / 2 # x + center[..., 1] = x.shape[2] / 2 # y + + #print(x.shape, center) + # define the scale factor + scale = torch.ones(batch_size, device=device) + + # compute the transformation matrix + M = kornia.get_rotation_matrix2d(center, angle, scale) + + # apply the transformation to original 
image + return kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w) + + def flip(self, x, flip_mat): + + #print(flip_mat) + device = x.device + batch_size = x.shape[0] + + h, w = x.shape[2], x.shape[3] # destination size + #points_src = torch.ones(batch_size, 4, 2, device=device) + #points_dst = torch.ones(batch_size, 4, 2, device=device) + + #Identity + iM=torch.tensor(np.eye(3)) + + #Horizontal flip + # the source points are the region to crop corners + #points_src = torch.FloatTensor([[ + # [w - 1, 0], [0, 0], [0, h - 1], [w - 1, h - 1], + #]]) + # the destination points are the image vertexes + #points_dst = torch.FloatTensor([[ + # [0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1], + #]]) + # compute perspective transform + #hM = kornia.get_perspective_transform(points_src, points_dst) + hM =torch.tensor( [[[-1., 0., w-1], + [ 0., 1., 0.], + [ 0., 0., 1.]]]) + + #Vertical flip + # the source points are the region to crop corners + #points_src = torch.FloatTensor([[ + # [0, h - 1], [w - 1, h - 1], [w - 1, 0], [0, 0], + #]]) + # the destination points are the image vertexes + #points_dst = torch.FloatTensor([[ + # [0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1], + #]]) + # compute perspective transform + #vM = kornia.get_perspective_transform(points_src, points_dst) + vM =torch.tensor( [[[ 1., 0., 0.], + [ 0., -1., h-1], + [ 0., 0., 1.]]]) + #print(vM) + + M=torch.ones(batch_size, 3, 3, device=device) + + for i in range(batch_size): # A optimiser + if flip_mat[i]==0.0: + M[i,]=iM + elif flip_mat[i]==1.0: + M[i,]=hM + elif flip_mat[i]==-1.0: + M[i,]=vM + + # warp the original image by the found transform + return kornia.warp_perspective(x, M, dsize=(h, w)) + + def adjust_prob(self, soft=False): #Detach from gradient ? + + if soft : + self._params['prob'].data=F.softmax(self._params['prob'].data, dim=0) #Trop 'soft', bloque en dist uniforme si lr trop faible + else: + #self._params['prob'].clamp(min=0.0,max=1.0) + self._params['prob'].data = F.relu(self._params['prob'].data) + #self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0) + #print('proba',self._params['prob']) + self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Contrainte sum(p)=1 + #print('Sum p', sum(self._params['prob'])) + + def loss_weight(self): + #w_loss = [self._params["prob"][x] for x in self._sample] + #print(self._sample.view(-1,1).shape) + #print(self._sample[:10]) + + w_loss = torch.zeros((self._sample.shape[0],self._nb_tf), device=self._sample.device) + w_loss.scatter_(1, self._sample.view(-1,1), 1) + #print(w_loss.shape) + #print(w_loss[:10,:]) + w_loss = w_loss * self._params["prob"] + #print(w_loss.shape) + #print(w_loss[:10,:]) + w_loss = torch.sum(w_loss,dim=1) + #print(w_loss.shape) + #print(w_loss[:10]) + return w_loss + + def train(self, mode=None): + if mode is None : + mode=self._data_augmentation + self.augment(mode=mode) #Inutile si mode=None + super(Data_augV3, self).train(mode) + + def eval(self): + self.train(mode=False) + #super(Augmented_model, self).eval() + + def augment(self, mode=True): + self._data_augmentation=mode + + def __getitem__(self, key): + return self._params[key] + + def __str__(self): + if not self._mix_dist: + return "Data_augV3(Uniform-%d TF)" % self._nb_tf + else: + return "Data_augV3(Mix %.1f-%d TF)" % (self._mix_factor, self._nb_tf) + +class Data_augV4(nn.Module): #Transformations avec mask + def __init__(self, TF_dict=TF.TF_dict, N_TF=1, mix_dist=0.0): + super(Data_augV4, self).__init__() + self._data_augmentation = 
True + + #self._TF_matrix={} + #self._input_info={'h':0, 'w':0, 'device':None} #Input associe a TF_matrix + ''' + self._mag_fct={ #f(mag_normalise)=mag_reelle + ## Geometric TF ## + 'Identity' : (lambda mag: None), + 'FlipUD' : (lambda mag: None), + 'FlipLR' : (lambda mag: None), + 'Rotate': (lambda mag: random.randint(-int_parameter(mag, maxval=30), int_parameter(mag, maxval=30))), + 'TranslateX': (lambda mag: [random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20)), 0]), + 'TranslateY': (lambda mag: [0, random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20))]), + 'ShearX': (lambda mag: [random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3)), 0]), + 'ShearY': (lambda mag: [0, random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3))]), + + ## Color TF (Expect image in the range of [0, 1]) ## + 'Contrast': (lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Color':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Brightness':(lambda mag: random.uniform(1., float_parameter(mag, maxval=1.9))), + 'Sharpness':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Posterize': (lambda mag: random.randint(4, int_parameter(mag, maxval=8))), + 'Solarize': (lambda mag: random.randint(1, int_parameter(mag, maxval=256))/256.), #=>Image entre [0,1] #Pas opti pour des batch + + #Non fonctionnel + 'Auto_Contrast': (lambda mag: None), #Pas opti pour des batch (Super lent) + #'Equalize': (lambda mag: None), + } + ''' + self._mag_fct = TF_dict + self._TF=list(self._mag_fct.keys()) + self._nb_tf= len(self._TF) + + self._fixed_mag=5 #[0, PARAMETER_MAX] + self._params = nn.ParameterDict({ + "prob": nn.Parameter(torch.ones(self._nb_tf)/self._nb_tf), #Distribution prob uniforme + }) + + self._sample = [] + + self._mix_dist = False + if mix_dist != 0.0: + self._mix_dist = True + self._mix_factor = max(min(mix_dist, 1.0), 0.0) + + def forward(self, x): + if self._data_augmentation: + device = x.device + batch_size, h, w = x.shape[0], x.shape[2], x.shape[3] + + + ## Echantillonage ## + uniforme_dist = torch.ones(1,self._nb_tf,device=device).softmax(dim=1) + + if not self._mix_dist: + self._distrib = uniforme_dist + else: + self._distrib = (self._mix_factor*self._params["prob"]+(1-self._mix_factor)*uniforme_dist).softmax(dim=1) #Mix distrib reel / uniforme avec mix_factor + print(self.distrib.shape) + + cat_distrib= Categorical(probs=torch.ones((batch_size, self._nb_tf), device=device)*self._distrib) + self._sample = cat_distrib.sample() + + ## Transformations ## + #''' + x = copy.deepcopy(x) #Evite de modifier les echantillons par reference (Problematique pour des utilisations paralleles) + smps_x=[] + masks=[] + for tf_idx in range(self._nb_tf): + mask = self._sample==tf_idx #Create selection mask + smp_x = x[mask] #torch.masked_select() ? 
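+ # _sample holds one TF index per image; the boolean mask gathers every image
+ # assigned to the current transform so each (differentiable) operation is
+ # applied once to a sub-batch rather than image by image.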
+ + if smp_x.shape[0]!=0: #if there's data to TF + magnitude=self._fixed_mag + tf=self._TF[tf_idx] + + ## Geometric TF ## + if tf=='Identity': + pass + elif tf=='FlipLR': + smp_x = TF.flipLR(smp_x) + elif tf=='FlipUD': + smp_x = TF.flipUD(smp_x) + elif tf=='Rotate': + smp_x = TF.rotate(smp_x, angle=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='TranslateX' or tf=='TranslateY': + smp_x = TF.translate(smp_x, translation=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='ShearX' or tf=='ShearY' : + smp_x = TF.shear(smp_x, shear=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + + ## Color TF (Expect image in the range of [0, 1]) ## + elif tf=='Contrast': + smp_x = TF.contrast(smp_x, contrast_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='Color': + smp_x = TF.color(smp_x, color_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='Brightness': + smp_x = TF.brightness(smp_x, brightness_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='Sharpness': + smp_x = TF.sharpeness(smp_x, sharpness_factor=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='Posterize': + smp_x = TF.posterize(smp_x, bits=torch.tensor([1 for _ in smp_x], device=device)) + elif tf=='Solarize': + smp_x = TF.solarize(smp_x, thresholds=torch.tensor([self._mag_fct[tf](magnitude) for _ in smp_x], device=device)) + elif tf=='Equalize': + smp_x = TF.equalize(smp_x) + elif tf=='Auto_Contrast': + smp_x = TF.auto_contrast(smp_x) + else: + raise Exception("Invalid TF requested : ", tf) + + x[mask]=smp_x # Refusionner eviter x[mask] : in place + + #idx= mask.nonzero() + #print('-'*8) + #print(idx[0], tf_idx) + #print(smp_x[0,]) + #x=x.view(-1,3*32*32) + #x=x.scatter(dim=0, index=idx, src=smp_x.view(-1,3*32*32)) #Changement des Tensor mais pas visible sur la visualisation... 
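+ # Writing the transformed sub-batch back with x[mask]=smp_x preserves batch
+ # order; the commented-out scatter experiment around this point was an attempt
+ # to avoid the in-place masked assignment and is kept only as reference.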
+ #x=x.view(-1,3,32,32) + #print(x[0,]) + + ''' + if len(self._TF_matrix)==0 or self._input_info['h']!=h or self._input_info['w']!=w or self._input_info['device']!=device: #Device different:Pas necessaire de tout recalculer + self.compute_TF_matrix(sample_info={'h': x.shape[2], + 'w': x.shape[3], + 'device': x.device}) + + TF_matrix = torch.zeros(batch_size, 3, 3, device=device) #All geom TF + + for tf_idx in range(self._nb_tf): + mask = self._sample==tf_idx #Create selection mask + TF_matrix[mask,]=self._TF_matrix[self._TF[tf_idx]] + + x=kornia.warp_perspective(x, TF_matrix, dsize=(h, w)) + ''' + return x + ''' + def compute_TF_matrix(self, magnitude=None, sample_info= None): + print('Computing TF_matrix...') + if not magnitude : + magnitude=self._fixed_mag + + if sample_info: + self._input_info['h']= sample_info['h'] + self._input_info['w']= sample_info['w'] + self._input_info['device'] = sample_info['device'] + h, w, device= self._input_info['h'], self._input_info['w'], self._input_info['device'] + + self._TF_matrix={} + for tf in self._TF : + if tf=='Id': + self._TF_matrix[tf]=torch.tensor([[[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]], device=device) + elif tf=='Rot': + center = torch.ones(1, 2, device=device) + center[0, 0] = w / 2 # x + center[0, 1] = h / 2 # y + scale = torch.ones(1, device=device) + angle = self._mag_fct[tf](magnitude) * torch.ones(1, device=device) + R = kornia.get_rotation_matrix2d(center, angle, scale) #Rotation matrix (1,2,3) + self._TF_matrix[tf]=torch.cat((R,torch.tensor([[[ 0., 0., 1.]]], device=device)), dim=1) #TF matrix (1,3,3) + elif tf=='FlipLR': + self._TF_matrix[tf]=torch.tensor([[[-1., 0., w-1], + [ 0., 1., 0.], + [ 0., 0., 1.]]], device=device) + elif tf=='FlipUD': + self._TF_matrix[tf]=torch.tensor([[[ 1., 0., 0.], + [ 0., -1., h-1], + [ 0., 0., 1.]]], device=device) + else: + raise Exception("Invalid TF requested") + ''' + def adjust_prob(self, soft=False): #Detach from gradient ? 
+ + if soft : + self._params['prob'].data=F.softmax(self._params['prob'].data, dim=0) #Trop 'soft', bloque en dist uniforme si lr trop faible + else: + #self._params['prob'].clamp(min=0.0,max=1.0) + self._params['prob'].data = F.relu(self._params['prob'].data) + #self._params['prob'].data = self._params['prob'].clamp(min=0.0,max=1.0) + + self._params['prob'].data = self._params['prob']/sum(self._params['prob']) #Contrainte sum(p)=1 + + def loss_weight(self): + w_loss = torch.zeros((self._sample.shape[0],self._nb_tf), device=self._sample.device) + w_loss.scatter_(1, self._sample.view(-1,1), 1) + w_loss = w_loss * self._params["prob"]/self._distrib #Ponderation par les proba (divisee par la distrib pour pas diminuer la loss) + w_loss = torch.sum(w_loss,dim=1) + return w_loss + + def train(self, mode=None): + if mode is None : + mode=self._data_augmentation + self.augment(mode=mode) #Inutile si mode=None + super(Data_augV4, self).train(mode) + + def eval(self): + self.train(mode=False) + + def augment(self, mode=True): + self._data_augmentation=mode + + def __getitem__(self, key): + return self._params[key] + + def __str__(self): + if not self._mix_dist: + return "Data_augV4(Uniform-%d TF)" % self._nb_tf + else: + return "Data_augV4(Mix %.1f-%d TF)" % (self._mix_factor, self._nb_tf) + +class Augmented_model(nn.Module): + def __init__(self, data_augmenter, model): + super(Augmented_model, self).__init__() + + self._mods = nn.ModuleDict({ + 'data_aug': data_augmenter, + 'model': model + }) + + self.augment(mode=True) + + def initialize(self): + self._mods['model'].initialize() + + def forward(self, x): + return self._mods['model'](self._mods['data_aug'](x)) + + def augment(self, mode=True): + self._data_augmentation=mode + self._mods['data_aug'].augment(mode) + + def train(self, mode=None): + if mode is None : + mode=self._data_augmentation + self._mods['data_aug'].augment(mode) + super(Augmented_model, self).train(mode) + + def eval(self): + self.train(mode=False) + #super(Augmented_model, self).eval() + + def items(self): + """Return an iterable of the ModuleDict key/value pairs. 
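+ For an Augmented_model this yields ('data_aug', <data augmenter>) and
+ ('model', <wrapped model>).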
+ """ + return self._mods.items() + + def update(self, modules): + self._mods.update(modules) + + def is_augmenting(self): + return self._data_augmentation + + def TF_names(self): + try: + return self._mods['data_aug']._TF + except: + return None + + def __getitem__(self, key): + return self._mods[key] + + def __str__(self): + return "Aug_mod("+str(self._mods['data_aug'])+"-"+str(self._mods['model'])+")" \ No newline at end of file diff --git a/higher/model.py b/higher/model.py new file mode 100644 index 0000000..9835ed2 --- /dev/null +++ b/higher/model.py @@ -0,0 +1,51 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +class LeNet(nn.Module): + def __init__(self, num_inp, num_out): + super(LeNet, self).__init__() + self._params = nn.ParameterDict({ + 'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)), + 'b1': nn.Parameter(torch.zeros(20)), + 'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)), + 'b2': nn.Parameter(torch.zeros(50)), + #'w3': nn.Parameter(torch.zeros(500,4*4*50)), #num_imp=1 + 'w3': nn.Parameter(torch.zeros(500,5*5*50)), #num_imp=3 + 'b3': nn.Parameter(torch.zeros(500)), + 'w4': nn.Parameter(torch.zeros(num_out, 500)), + 'b4': nn.Parameter(torch.zeros(num_out)) + }) + self.initialize() + + + def initialize(self): + nn.init.kaiming_uniform_(self._params["w1"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self._params["w2"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self._params["w3"], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self._params["w4"], a=math.sqrt(5)) + + def forward(self, x): + #print("Start Shape ", x.shape) + out = F.relu(F.conv2d(input=x, weight=self._params["w1"], bias=self._params["b1"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = F.relu(F.conv2d(input=out, weight=self._params["w2"], bias=self._params["b2"])) + #print("Shape ", out.shape) + out = F.max_pool2d(out, 2) + #print("Shape ", out.shape) + out = out.view(out.size(0), -1) + #print("Shape ", out.shape) + out = F.relu(F.linear(out, self._params["w3"], self._params["b3"])) + #print("Shape ", out.shape) + out = F.linear(out, self._params["w4"], self._params["b4"]) + #print("Shape ", out.shape) + return F.log_softmax(out, dim=1) + + def __getitem__(self, key): + return self._params[key] + + def __str__(self): + return "LeNet" \ No newline at end of file diff --git a/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.png b/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.png new file mode 100644 index 0000000..62cf98d Binary files /dev/null and b/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.png differ diff --git a/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 10 in_it.png b/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 10 in_it.png new file mode 100644 index 0000000..5885842 Binary files /dev/null and b/higher/res/Aug_mod(Data_augV4(Uniform-11 TF)-LeNet)-100 epochs (dataug:0)- 10 in_it.png differ diff --git a/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 0 in_it.png b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 0 in_it.png new file mode 100644 index 0000000..d68e185 Binary files /dev/null and b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 0 in_it.png differ diff --git a/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 10 in_it.png 
b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 10 in_it.png new file mode 100644 index 0000000..8967121 Binary files /dev/null and b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:-1)- 10 in_it.png differ diff --git a/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-200 epochs (dataug:0)- 10 in_it.png b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-200 epochs (dataug:0)- 10 in_it.png new file mode 100644 index 0000000..2a59b02 Binary files /dev/null and b/higher/res/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-200 epochs (dataug:0)- 10 in_it.png differ diff --git a/higher/res/LeNet-100 epochs.png b/higher/res/LeNet-100 epochs.png new file mode 100644 index 0000000..88a1a6a Binary files /dev/null and b/higher/res/LeNet-100 epochs.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 1 in_it.png b/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 1 in_it.png new file mode 100644 index 0000000..1a84a55 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 1 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 10 in_it.png b/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 10 in_it.png new file mode 100644 index 0000000..64e92f8 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_aug(Mag-1 TF)-LeNet)-10 epochs- 10 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 1 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 1 in_it.png new file mode 100644 index 0000000..7717df8 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 1 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 10 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 10 in_it.png new file mode 100644 index 0000000..11de8a6 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV2(Exact-3 TF)-LeNet)-10 epochs- 10 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 1 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 1 in_it.png new file mode 100644 index 0000000..d440282 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 1 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 10 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 10 in_it.png new file mode 100644 index 0000000..024181b Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 0,5-3 TF)-LeNet)-10 epochs- 10 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 1 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 1 in_it.png new file mode 100644 index 0000000..d37bff7 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 1 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 10 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 10 in_it.png new file mode 100644 index 0000000..291325c Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Mix 1,0-3 TF)-LeNet)-10 epochs- 10 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 1 in_it.png 
b/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 1 in_it.png new file mode 100644 index 0000000..5375d27 Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 1 in_it.png differ diff --git a/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 10 in_it.png b/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 10 in_it.png new file mode 100644 index 0000000..4efe0cd Binary files /dev/null and b/higher/res/MNIST/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-10 epochs- 10 in_it.png differ diff --git a/higher/res/MNIST/LeNet-10 epochs.png b/higher/res/MNIST/LeNet-10 epochs.png new file mode 100644 index 0000000..b6f6588 Binary files /dev/null and b/higher/res/MNIST/LeNet-10 epochs.png differ diff --git a/higher/test_dataug.py b/higher/test_dataug.py new file mode 100644 index 0000000..061a2dc --- /dev/null +++ b/higher/test_dataug.py @@ -0,0 +1,764 @@ +from torch.utils.data import SubsetRandomSampler +import torch.optim as optim +import torchvision +import higher + +from model import * +from dataug import * +from utils import * + +BATCH_SIZE = 300 +#TEST_SIZE = 300 +TEST_SIZE = 10000 + +#ATTENTION : Dataug (Kornia) Expect image in the range of [0, 1] +transform = torchvision.transforms.Compose([ + torchvision.transforms.ToTensor(), + #torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), #CIFAR10 +]) +''' +data_train = torchvision.datasets.MNIST( + "./data", train=True, download=True, + transform=torchvision.transforms.Compose([ + #torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0), + torchvision.transforms.ToTensor() + ]) +) +data_test = torchvision.datasets.MNIST( + "./data", train=False, download=True, transform=torchvision.transforms.ToTensor() +) +''' +data_train = torchvision.datasets.CIFAR10( + "./data", train=True, download=True, transform=transform +) +data_test = torchvision.datasets.CIFAR10( + "./data", train=False, download=True, transform=transform +) +#''' +train_subset_indices=range(int(len(data_train)/2)) +#train_subset_indices=range(BATCH_SIZE*10) +val_subset_indices=range(int(len(data_train)/2),len(data_train)) + +dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices)) +dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices)) +dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False) + +device = torch.device('cuda') + +if device == torch.device('cpu'): + device_name = 'CPU' +else: + device_name = torch.cuda.get_device_name(device) + + +def test(model): + model.eval() + for i, (features, labels) in enumerate(dl_test): + features,labels = features.to(device), labels.to(device) + + pred = model.forward(features) + return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100 + +def compute_vaLoss(model, dl_val_it): + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + try: + model.augment(mode=False) #Validation sans transfornations ! 
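# A plain model (e.g. a bare LeNet) has no augment() method; the bare except below
# simply skips the toggle in that case, so the same helper works for both wrapped
# (Augmented_model) and unwrapped models.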
+ except: + pass + return F.cross_entropy(model(xs_val), ys_val) + +def train_classic(model, epochs=1): + #opt = torch.optim.Adam(model.parameters(), lr=1e-3) + optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) + + model.train() + dl_val_it = iter(dl_val) + log = [] + for epoch in range(epochs): + print_torch_mem("Start epoch") + t0 = time.process_time() + for i, (features, labels) in enumerate(dl_train): + #print_torch_mem("Start iter") + features,labels = features.to(device), labels.to(device) + + optim.zero_grad() + pred = model.forward(features) + loss = F.cross_entropy(pred,labels) + loss.backward() + optim.step() + + #### Tests #### + tf = time.process_time() + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + val_loss = F.cross_entropy(model(xs_val), ys_val) + accuracy=test(model) + model.train() + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": None, + } + log.append(data) + + return log + +def train_classic_higher(model, epochs=1): + #opt = torch.optim.Adam(model.parameters(), lr=1e-3) + optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) + + model.train() + dl_val_it = iter(dl_val) + log = [] + + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + diffopt = higher.optim.get_diff_optim(optim, model.parameters(),fmodel=fmodel,track_higher_grads=False) + #with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=False) as (fmodel, diffopt): + + for epoch in range(epochs): + print_torch_mem("Start epoch "+str(epoch)) + print("Fast param ",len(fmodel._fast_params)) + t0 = time.process_time() + for i, (features, labels) in enumerate(dl_train): + #print_torch_mem("Start iter") + features,labels = features.to(device), labels.to(device) + + #optim.zero_grad() + pred = fmodel.forward(features) + loss = F.cross_entropy(pred,labels) + #.backward() + #optim.step() + diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step) + + model_copy(src=fmodel, dst=model, patch_copy=False) + optim_copy(dopt=diffopt, opt=optim) + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + diffopt = higher.optim.get_diff_optim(optim, model.parameters(),fmodel=fmodel,track_higher_grads=False) + + #### Tests #### + tf = time.process_time() + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + val_loss = F.cross_entropy(model(xs_val), ys_val) + accuracy=test(model) + model.train() + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": None, + } + log.append(data) + + return log + +def train_classic_tests(model, epochs=1): + #opt = torch.optim.Adam(model.parameters(), lr=1e-3) + optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) + + countcopy=0 + model.train() + dl_val_it = iter(dl_val) + log = [] + + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + doptim = higher.optim.get_diff_optim(optim, model.parameters(), fmodel=fmodel, track_higher_grads=False) + for epoch in range(epochs): + print_torch_mem("Start epoch") + print(len(fmodel._fast_params)) + t0 
= time.process_time() + #with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=True) as (fmodel, doptim): + + #fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + #doptim = higher.optim.get_diff_optim(optim, model.parameters(), track_higher_grads=True) + + for i, (features, labels) in enumerate(dl_train): + features,labels = features.to(device), labels.to(device) + + #with higher.innerloop_ctx(model, optim, copy_initial_weights=True, track_higher_grads=False) as (fmodel, doptim): + + + #optim.zero_grad() + pred = fmodel.forward(features) + loss = F.cross_entropy(pred,labels) + doptim.step(loss) #(opt.zero_grad, loss.backward, opt.step) + #loss.backward() + #new_params = doptim.step(loss, params=fmodel.parameters()) + #fmodel.update_params(new_params) + + + #print('Fast param',len(fmodel._fast_params)) + #print('opt state', type(doptim.state[0][0]['momentum_buffer']), doptim.state[0][2]['momentum_buffer'].shape) + + if False or (len(fmodel._fast_params)>1): + print("fmodel fast param",len(fmodel._fast_params)) + ''' + #val_loss = F.cross_entropy(fmodel(features), labels) + + #print_graph(val_loss) + + #val_loss.backward() + #print('bip') + + tmp = fmodel.parameters() + + #print(list(tmp)[1]) + tmp = [higher.utils._copy_tensor(t,safe_copy=True) if isinstance(t, torch.Tensor) else t for t in tmp] + #print(len(tmp)) + + #fmodel._fast_params.clear() + del fmodel._fast_params + fmodel._fast_params=None + + fmodel.fast_params=tmp # Surcharge la memoire + #fmodel.update_params(tmp) #Meilleur perf / Surcharge la memoire avec trach higher grad + + #optim._fmodel=fmodel + ''' + + + countcopy+=1 + model_copy(src=fmodel, dst=model, patch_copy=False) + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + #doptim.detach_dyn() + #tmp = doptim.state + #tmp = doptim.state_dict() + #for k, v in tmp['state'].items(): + # print('dict',k, type(v)) + + a = optim.param_groups[0]['params'][0] + state = optim.state[a] + #state['momentum_buffer'] = None + #print('opt state', type(optim.state[a]), len(optim.state[a])) + #optim.load_state_dict(tmp) + + + for group_idx, group in enumerate(optim.param_groups): + # print('gp idx',group_idx) + for p_idx, p in enumerate(group['params']): + optim.state[p]=doptim.state[group_idx][p_idx] + + #print('opt state', type(optim.state[a]['momentum_buffer']), optim.state[a]['momentum_buffer'][0:10]) + #print('dopt state', type(doptim.state[0][0]['momentum_buffer']), doptim.state[0][0]['momentum_buffer'][0:10]) + ''' + for a in tmp: + #print(type(a), len(a)) + for nb, b in a.items(): + #print(nb, type(b), len(b)) + for n, state in b.items(): + #print(n, type(states)) + #print(state.grad_fn) + state = torch.tensor(state.data).requires_grad_() + #print(state.grad_fn) + ''' + + + doptim = higher.optim.get_diff_optim(optim, model.parameters(), track_higher_grads=True) + #doptim.state = tmp + + + countcopy+=1 + model_copy(src=fmodel, dst=model) + optim_copy(dopt=diffopt, opt=inner_opt) + + #### Tests #### + tf = time.process_time() + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + val_loss = F.cross_entropy(model(xs_val), ys_val) + accuracy=test(model) + model.train() + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": None, + } + log.append(data) 
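# The loop above interleaves several abandoned experiments. The core pattern being
# probed is: patch the model with higher, take differentiable steps, then write the
# fast weights back and re-patch once per epoch so fast_params do not keep accumulating.
# A minimal sketch of that pattern (hypothetical helper name; it reuses the imports at
# the top of this file, and the higher calls are the same ones used above):
def higher_epoch_sketch(model, opt, loader, device):
    fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True)
    diffopt = higher.optim.get_diff_optim(opt, model.parameters(),
                                          fmodel=fmodel, track_higher_grads=False)
    for xs, ys in loader:
        xs, ys = xs.to(device), ys.to(device)
        loss = F.cross_entropy(fmodel(xs), ys)
        diffopt.step(loss)  # zero_grad + backward + step on the functional copy
    with torch.no_grad():   # write the fast weights back into the plain model
        for p, fp in zip(model.parameters(), fmodel.parameters()):
            p.copy_(fp)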
+ + #countcopy+=1 + #model_copy(src=fmodel, dst=model, patch_copy=False) + #optim.load_state_dict(doptim.state_dict()) #Besoin sauver etat otpim ? + + print("Copy ", countcopy) + return log + +def run_simple_dataug(inner_it, epochs=1): + + dl_train_it = iter(dl_train) + dl_val_it = iter(dl_val) + + #aug_model = nn.Sequential( + # Data_aug(), + # LeNet(1,10), + # ) + aug_model = Augmented_model(Data_aug(), LeNet(1,10)).to(device) + print(str(aug_model)) + meta_opt = torch.optim.Adam(aug_model['data_aug'].parameters(), lr=1e-2) + inner_opt = torch.optim.SGD(aug_model['model'].parameters(), lr=1e-2, momentum=0.9) + + log = [] + t0 = time.process_time() + + epoch = 0 + while epoch < epochs: + meta_opt.zero_grad() + aug_model.train() + with higher.innerloop_ctx(aug_model, inner_opt, copy_initial_weights=True, track_higher_grads=True) as (fmodel, diffopt): #effet copy_initial_weight pas clair... + + for i in range(n_inner_iter): + try: + xs, ys = next(dl_train_it) + except StopIteration: #Fin epoch train + tf = time.process_time() + epoch +=1 + dl_train_it = iter(dl_train) + xs, ys = next(dl_train_it) + + accuracy=test(aug_model) + aug_model.train() + + #### Print #### + print('-'*9) + print('Epoch %d/%d'%(epoch,epochs)) + print('train loss',loss.item(), '/ val loss', val_loss.item()) + print('acc', accuracy) + print('mag', aug_model['data_aug']['mag'].item()) + + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": aug_model['data_aug']['mag'].item(), + } + log.append(data) + t0 = time.process_time() + + xs, ys = xs.to(device), ys.to(device) + + logits = fmodel(xs) # modified `params` can also be passed as a kwarg + + loss = F.cross_entropy(logits, ys) # no need to call loss.backwards() + #loss.backward(retain_graph=True) + #print(fmodel['model']._params['b4'].grad) + #print('mag', fmodel['data_aug']['mag'].grad) + + diffopt.step(loss) # note that `step` must take `loss` as an argument! + # The line above gets P[t+1] from P[t] and loss[t]. `step` also returns + # these new parameters, as an alternative to getting them from + # `fmodel.fast_params` or `fmodel.parameters()` after calling + # `diffopt.step`. + + # At this point, or at any point in the iteration, you can take the + # gradient of `fmodel.parameters()` (or equivalently + # `fmodel.fast_params`) w.r.t. `fmodel.parameters(time=0)` (equivalently + # `fmodel.init_fast_params`). i.e. `fast_params` will always have + # `grad_fn` as an attribute, and be part of the gradient tape. + + # At the end of your inner loop you can obtain these e.g. ... + #grad_of_grads = torch.autograd.grad( + # meta_loss_fn(fmodel.parameters()), fmodel.parameters(time=0)) + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + fmodel.augment(mode=False) + val_logits = fmodel(xs_val) #Validation sans transfornations ! + val_loss = F.cross_entropy(val_logits, ys_val) + #print('val_loss',val_loss.item()) + val_loss.backward() + + #print('mag', fmodel['data_aug']['mag'], '/', fmodel['data_aug']['mag'].grad) + + #model=copy.deepcopy(fmodel) + aug_model.load_state_dict(fmodel.state_dict()) #Do not copy gradient ! 
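# load_state_dict() above restores parameter values only; the .grad fields computed on
# the patched copy (fmodel) by val_loss.backward() are not part of the state dict, so
# they are copied over by name below before meta_opt.step(). A dict-based equivalent
# (hypothetical helper, same behaviour without the nested scan):
def copy_grads_by_name(src, dst):
    src_params = dict(src.named_parameters())
    for name, p in dst.named_parameters():
        if name in src_params:
            p.grad = src_params[name].grad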
+ #Copie des gradients + for paramName, paramValue, in fmodel.named_parameters(): + for netCopyName, netCopyValue, in aug_model.named_parameters(): + if paramName == netCopyName: + netCopyValue.grad = paramValue.grad + + #print('mag', aug_model['data_aug']['mag'], '/', aug_model['data_aug']['mag'].grad) + meta_opt.step() + + plot_res(log, fig_name="res/{}-{} epochs- {} in_it".format(str(aug_model),epochs,inner_it)) + print('-'*9) + times = [x["time"] for x in log] + print(str(aug_model),": acc", max([x["acc"] for x in log]), "in (ms):", np.mean(times), "+/-", np.std(times)) + +def run_dist_dataug(model, epochs=1, inner_it=1, dataug_epoch_start=0): + + dl_train_it = iter(dl_train) + dl_val_it = iter(dl_val) + + meta_opt = torch.optim.Adam(model['data_aug'].parameters(), lr=1e-3) + inner_opt = torch.optim.SGD(model['model'].parameters(), lr=1e-2, momentum=0.9) + + high_grad_track = True + if dataug_epoch_start>0: + model.augment(mode=False) + high_grad_track = False + + model.train() + + log = [] + t0 = time.process_time() + + countcopy=0 + val_loss=torch.tensor(0) + opt_param=None + + epoch = 0 + while epoch < epochs: + meta_opt.zero_grad() + with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, override=opt_param, track_higher_grads=high_grad_track) as (fmodel, diffopt): #effet copy_initial_weight pas clair... + + for i in range(n_inner_iter): + try: + xs, ys = next(dl_train_it) + except StopIteration: #Fin epoch train + tf = time.process_time() + epoch +=1 + dl_train_it = iter(dl_train) + xs, ys = next(dl_train_it) + + #viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch)) + #viz_sample_data(imgs=aug_model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch)) + + accuracy=test(model) + model.train() + + #### Print #### + print('-'*9) + print('Epoch : %d/%d'%(epoch,epochs)) + print('Train loss :',loss.item(), '/ val loss', val_loss.item()) + print('Accuracy :', accuracy) + print('Data Augmention : {} (Epoch {})'.format(model._data_augmentation, dataug_epoch_start)) + print('TF Proba :', model['data_aug']['prob'].data) + #print('proba grad',aug_model['data_aug']['prob'].grad) + ############# + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": [p for p in model['data_aug']['prob']], + } + log.append(data) + ############# + + if epoch == dataug_epoch_start: + print('Starting Data Augmention...') + model.augment(mode=True) + high_grad_track = True + + t0 = time.process_time() + + xs, ys = xs.to(device), ys.to(device) + + ''' + #Methode exacte + final_loss = 0 + for tf_idx in range(fmodel['data_aug']._nb_tf): + fmodel['data_aug'].transf_idx=tf_idx + logits = fmodel(xs) + loss = F.cross_entropy(logits, ys) + #loss.backward(retain_graph=True) + #print('idx', tf_idx) + #print(fmodel['data_aug']['prob'][tf_idx], fmodel['data_aug']['prob'][tf_idx].grad) + final_loss += loss*fmodel['data_aug']['prob'][tf_idx] #Take it in the forward function ? 
+ + loss = final_loss + ''' + #Methode uniforme + logits = fmodel(xs) # modified `params` can also be passed as a kwarg + loss = F.cross_entropy(logits, ys, reduction='none') # no need to call loss.backwards() + if fmodel._data_augmentation: #Weight loss + w_loss = fmodel['data_aug'].loss_weight().to(device) + loss = loss * w_loss + loss = loss.mean() + #''' + + #to visualize computational graph + #print_graph(loss) + + #loss.backward(retain_graph=True) + #print(fmodel['model']._params['b4'].grad) + #print('prob grad', fmodel['data_aug']['prob'].grad) + + diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step) + + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val) + xs_val, ys_val = next(dl_val_it) + xs_val, ys_val = xs_val.to(device), ys_val.to(device) + + fmodel.augment(mode=False) #Validation sans transfornations ! + val_loss = F.cross_entropy(fmodel(xs_val), ys_val) + + #print_graph(val_loss) + + val_loss.backward() + + countcopy+=1 + model_copy(src=fmodel, dst=model) + optim_copy(dopt=diffopt, opt=inner_opt) + + meta_opt.step() + model['data_aug'].adjust_prob() #Contrainte sum(proba)=1 + + print("Copy ", countcopy) + return log + +def run_dist_dataugV2(model, epochs=1, inner_it=0, dataug_epoch_start=0, print_freq=1, loss_patience=None): + + log = [] + countcopy=0 + val_loss=torch.tensor(0) #Necessaire si pas de metastep sur une epoch + dl_val_it = iter(dl_val) + + meta_opt = torch.optim.Adam(model['data_aug'].parameters(), lr=1e-2) + inner_opt = torch.optim.SGD(model['model'].parameters(), lr=1e-2, momentum=0.9) + + high_grad_track = True + if inner_it == 0: + high_grad_track=False + if dataug_epoch_start!=0: + model.augment(mode=False) + high_grad_track = False + + val_loss_monitor= None + if loss_patience != None : + if dataug_epoch_start==-1: val_loss_monitor = loss_monitor(patience=loss_patience, end_train=2) #1st limit = dataug start + else: val_loss_monitor = loss_monitor(patience=loss_patience) #Val loss monitor + + model.train() + + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel,track_higher_grads=high_grad_track) + + for epoch in range(1, epochs+1): + #print_torch_mem("Start epoch "+str(epoch)) + #print(high_grad_track, fmodel._data_augmentation, len(fmodel._fast_params)) + t0 = time.process_time() + #with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, override=opt_param, track_higher_grads=high_grad_track) as (fmodel, diffopt): + + for i, (xs, ys) in enumerate(dl_train): + xs, ys = xs.to(device), ys.to(device) + ''' + #Methode exacte + final_loss = 0 + for tf_idx in range(fmodel['data_aug']._nb_tf): + fmodel['data_aug'].transf_idx=tf_idx + logits = fmodel(xs) + loss = F.cross_entropy(logits, ys) + #loss.backward(retain_graph=True) + #print('idx', tf_idx) + #print(fmodel['data_aug']['prob'][tf_idx], fmodel['data_aug']['prob'][tf_idx].grad) + final_loss += loss*fmodel['data_aug']['prob'][tf_idx] #Take it in the forward function ? 
+ + loss = final_loss + ''' + #Methode uniforme + + logits = fmodel(xs) # modified `params` can also be passed as a kwarg + loss = F.cross_entropy(logits, ys, reduction='none') # no need to call loss.backwards() + #PAS PONDERE LOSS POUR DIST MIX + if fmodel._data_augmentation: # and not fmodel['data_aug']._mix_dist: #Weight loss + w_loss = fmodel['data_aug'].loss_weight().to(device) + loss = loss * w_loss + loss = loss.mean() + #''' + + #to visualize computational graph + #print_graph(loss) + + #loss.backward(retain_graph=True) + #print(fmodel['model']._params['b4'].grad) + #print('prob grad', fmodel['data_aug']['prob'].grad) + + diffopt.step(loss) #(opt.zero_grad, loss.backward, opt.step) + + if(high_grad_track and i%inner_it==0): #Perform Meta step + #print("meta") + #Peu utile si high_grad_track = False + val_loss = compute_vaLoss(model=fmodel, dl_val_it=dl_val_it) + + #print_graph(val_loss) + + val_loss.backward() + + countcopy+=1 + model_copy(src=fmodel, dst=model) + optim_copy(dopt=diffopt, opt=inner_opt) + + meta_opt.step() + model['data_aug'].adjust_prob(soft=False) #Contrainte sum(proba)=1 + + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel, track_higher_grads=high_grad_track) + + tf = time.process_time() + + #viz_sample_data(imgs=xs, labels=ys, fig_name='samples/data_sample_epoch{}_noTF'.format(epoch)) + #viz_sample_data(imgs=aug_model['data_aug'](xs), labels=ys, fig_name='samples/data_sample_epoch{}'.format(epoch)) + + if(not high_grad_track): + countcopy+=1 + model_copy(src=fmodel, dst=model) + optim_copy(dopt=diffopt, opt=inner_opt) + val_loss = compute_vaLoss(model=fmodel, dl_val_it=dl_val_it) + + #Necessaire pour reset higher (Accumule les fast_param meme avec track_higher_grads = False) + fmodel = higher.patch.monkeypatch(model, device=None, copy_initial_weights=True) + diffopt = higher.optim.get_diff_optim(inner_opt, model.parameters(),fmodel=fmodel, track_higher_grads=high_grad_track) + + accuracy=test(model) + model.train() + + #### Print #### + if(print_freq and epoch%print_freq==0): + print('-'*9) + print('Epoch : %d/%d'%(epoch,epochs)) + print('Time : %.00f ms'%(tf - t0)) + print('Train loss :',loss.item(), '/ val loss', val_loss.item()) + print('Accuracy :', accuracy) + print('Data Augmention : {} (Epoch {})'.format(model._data_augmentation, dataug_epoch_start)) + print('TF Proba :', model['data_aug']['prob'].data) + #print('proba grad',aug_model['data_aug']['prob'].grad) + ############# + #### Log #### + data={ + "epoch": epoch, + "train_loss": loss.item(), + "val_loss": val_loss.item(), + "acc": accuracy, + "time": tf - t0, + + "param": [p.item() for p in model['data_aug']['prob']], + } + log.append(data) + ############# + if val_loss_monitor : + val_loss_monitor.register(val_loss.item()) + if val_loss_monitor.end_training(): break #Stop training + + + if not model.is_augmenting() and (epoch == dataug_epoch_start or (val_loss_monitor and val_loss_monitor.limit_reached()==1)): + print('Starting Data Augmention...') + dataug_epoch_start = epoch + model.augment(mode=True) + if inner_it != 0: high_grad_track = True + + print("Copy ", countcopy) + return log + +########################################## +if __name__ == "__main__": + + n_inner_iter = 0 + epochs = 2 + dataug_epoch_start=0 + + #### Classic #### + ''' + model = LeNet(3,10).to(device) + #model = torchvision.models.resnet18() + #model = Augmented_model(Data_augV3(mix_dist=0.0), 
LeNet(3,10)).to(device) + #model.augment(mode=False) + + print(str(model), 'on', device_name) + log= train_classic_higher(model=model, epochs=epochs) + + #### + plot_res(log, fig_name="res/{}-{} epochs".format(str(model),epochs)) + print('-'*9) + times = [x["time"] for x in log] + out = {"Accuracy": max([x["acc"] for x in log]), "Time": (np.mean(times),np.std(times)), "Device": device_name, "Log": log} + print(str(model),": acc", out["Accuracy"], "in (ms):", out["Time"][0], "+/-", out["Time"][1]) + with open("res/log/%s.json" % "{}-{} epochs".format(str(model),epochs), "w+") as f: + json.dump(out, f, indent=True) + print('Log :\"',f.name, '\" saved !') + print('-'*9) + ''' + #### Augmented Model #### + #''' + aug_model = Augmented_model(Data_augV4(TF_dict=TF.TF_dict, mix_dist=0.0), LeNet(3,10)).to(device) + print(str(aug_model), 'on', device_name) + #run_simple_dataug(inner_it=n_inner_iter, epochs=epochs) + log= run_dist_dataugV2(model=aug_model, epochs=epochs, inner_it=n_inner_iter, dataug_epoch_start=dataug_epoch_start, print_freq=10, loss_patience=10) + + #### + plot_res(log, fig_name="res/{}-{} epochs (dataug:{})- {} in_it".format(str(aug_model),epochs,dataug_epoch_start,n_inner_iter)) + print('-'*9) + times = [x["time"] for x in log] + out = {"Accuracy": max([x["acc"] for x in log]), "Time": (np.mean(times),np.std(times)), "Device": device_name, "Param_names": aug_model.TF_names(), "Log": log} + print(str(aug_model),": acc", out["Accuracy"], "in (ms):", out["Time"][0], "+/-", out["Time"][1]) + with open("res/log/%s.json" % "{}-{} epochs (dataug:{})- {} in_it".format(str(aug_model),epochs,dataug_epoch_start,n_inner_iter), "w+") as f: + json.dump(out, f, indent=True) + print('Log :\"',f.name, '\" saved !') + print('-'*9) + #''' + + #### Comparison #### + ''' + files=[ + #"res/log/LeNet-100 epochs.json", + #"res/log/Aug_mod(Data_augV4(Uniform-4 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.json", + #"res/log/Aug_mod(Data_augV4(Uniform-4 TF)-LeNet)-100 epochs (dataug:50)- 0 in_it.json", + #"res/log/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:0)- 0 in_it.json", + #"res/log/Aug_mod(Data_augV3(Uniform-3 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json", + #"res/log/Aug_mod(Data_augV4(Mix 0,5-3 TF)-LeNet)-100 epochs (dataug:0)- 1 in_it.json", + #"res/log/Aug_mod(Data_augV4(Mix 0.5-3 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json", + #"res/log/Aug_mod(Data_augV4(Uniform-3 TF)-LeNet)-100 epochs (dataug:0)- 10 in_it.json", + "res/log/Aug_mod(Data_augV4(Uniform-10 TF)-LeNet)-100 epochs (dataug:50)- 10 in_it.json", + "res/log/Aug_mod(Data_augV4(Uniform-10 TF)-LeNet)-100 epochs (dataug:50)- 0 in_it.json", + ] + plot_compare(filenames=files, fig_name="res/compare") + ''' \ No newline at end of file diff --git a/higher/test_lr.py b/higher/test_lr.py new file mode 100644 index 0000000..c70ad95 --- /dev/null +++ b/higher/test_lr.py @@ -0,0 +1,150 @@ +import numpy as np +import json, math, time, os + +from torch.utils.data import SubsetRandomSampler +import torch.optim as optim +import higher +from model import * + +import copy + +BATCH_SIZE = 300 +TEST_SIZE = 300 + +mnist_train = torchvision.datasets.MNIST( + "./data", train=True, download=True, + transform=torchvision.transforms.Compose([ + #torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0), + torchvision.transforms.ToTensor() + ]) +) + +mnist_test = torchvision.datasets.MNIST( + "./data", train=False, download=True, transform=torchvision.transforms.ToTensor() +) + 
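# What this script demonstrates: the SGD hyper-parameters (lr, momentum) are kept as
# tensors with requires_grad=True and injected into higher's differentiable optimizer,
# so the validation loss can be back-propagated through the inner SGD step down to them
# and they can be meta-optimised with Adam. Condensed sketch of one meta step
# (hypothetical function name; the higher calls mirror the main loop below):
def meta_step_sketch(model, opt_param, meta_opt, xs, ys, xs_val, ys_val):
    inner_opt = torch.optim.SGD(model.parameters(),
                                lr=opt_param['lr'].item(),
                                momentum=opt_param['momentum'].item())
    meta_opt.zero_grad()
    with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True,
                              track_higher_grads=True) as (fmodel, diffopt):
        for g in diffopt.param_groups:  # make the hyper-parameters part of the graph
            g['lr'], g['momentum'] = opt_param['lr'], opt_param['momentum']
        diffopt.step(F.cross_entropy(fmodel(xs), ys))        # inner (train) step
        F.cross_entropy(fmodel(xs_val), ys_val).backward()   # grads flow to lr/momentum
    meta_opt.step()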
+#train_subset_indices=range(int(len(mnist_train)/2)) +train_subset_indices=range(BATCH_SIZE) +val_subset_indices=range(int(len(mnist_train)/2),len(mnist_train)) + +dl_train = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices)) +dl_val = torch.utils.data.DataLoader(mnist_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices)) +dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=TEST_SIZE, shuffle=False) + + +def test(model): + model.eval() + for i, (features, labels) in enumerate(dl_test): + pred = model.forward(features) + return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100 + +def train_classic(model, optim, epochs=1): + model.train() + log = [] + for epoch in range(epochs): + t0 = time.process_time() + for i, (features, labels) in enumerate(dl_train): + + optim.zero_grad() + pred = model.forward(features) + loss = F.cross_entropy(pred,labels) + loss.backward() + optim.step() + + #### Log #### + tf = time.process_time() + data={ + "time": tf - t0, + } + log.append(data) + + times = [x["time"] for x in log] + print("Vanilla : acc", test(model), "in (ms):", np.mean(times), "+/-", np.std(times)) +########################################## +if __name__ == "__main__": + + device = torch.device('cpu') + + model = LeNet(1,10) + opt_param = { + "lr": torch.tensor(1e-2).requires_grad_(), + "momentum": torch.tensor(0.9).requires_grad_() + } + n_inner_iter = 1 + dl_train_it = iter(dl_train) + dl_val_it = iter(dl_val) + epoch = 0 + epochs = 10 + + #### + train_classic(model=model, optim=torch.optim.Adam(model.parameters(), lr=0.001), epochs=epochs) + model = LeNet(1,10) + + meta_opt = torch.optim.Adam(opt_param.values(), lr=1e-2) + inner_opt = torch.optim.SGD(model.parameters(), lr=opt_param['lr'], momentum=opt_param['momentum']) + #for xs_val, ys_val in dl_val: + while epoch < epochs: + #print(data_aug.params["mag"], data_aug.params["mag"].grad) + meta_opt.zero_grad() + model.train() + with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=True, track_higher_grads=True) as (fmodel, diffopt): #effet copy_initial_weight pas clair... + + for param_group in diffopt.param_groups: + param_group['lr'] = opt_param['lr'] + param_group['momentum'] = opt_param['momentum'] + + for i in range(n_inner_iter): + try: + xs, ys = next(dl_train_it) + except StopIteration: #Fin epoch train + epoch +=1 + dl_train_it = iter(dl_train) + xs, ys = next(dl_train_it) + + print('Epoch', epoch) + print('train loss',loss.item(), '/ val loss', val_loss.item()) + print('acc', test(model)) + print('opt : lr', opt_param['lr'].item(), 'momentum', opt_param['momentum'].item()) + print('-'*9) + model.train() + + + logits = fmodel(xs) # modified `params` can also be passed as a kwarg + loss = F.cross_entropy(logits, ys) # no need to call loss.backwards() + #print('loss',loss.item()) + diffopt.step(loss) # note that `step` must take `loss` as an argument! + # The line above gets P[t+1] from P[t] and loss[t]. `step` also returns + # these new parameters, as an alternative to getting them from + # `fmodel.fast_params` or `fmodel.parameters()` after calling + # `diffopt.step`. + + # At this point, or at any point in the iteration, you can take the + # gradient of `fmodel.parameters()` (or equivalently + # `fmodel.fast_params`) w.r.t. `fmodel.parameters(time=0)` (equivalently + # `fmodel.init_fast_params`). i.e. 
`fast_params` will always have + # `grad_fn` as an attribute, and be part of the gradient tape. + + # At the end of your inner loop you can obtain these e.g. ... + #grad_of_grads = torch.autograd.grad( + # meta_loss_fn(fmodel.parameters()), fmodel.parameters(time=0)) + try: + xs_val, ys_val = next(dl_val_it) + except StopIteration: #Fin epoch val + dl_val_it = iter(dl_val_it) + xs_val, ys_val = next(dl_val_it) + + val_logits = fmodel(xs_val) + val_loss = F.cross_entropy(val_logits, ys_val) + #print('val_loss',val_loss.item()) + + val_loss.backward() + #meta_grads = torch.autograd.grad(val_loss, opt_lr, allow_unused=True) + #print(meta_grads) + for param_group in diffopt.param_groups: + print(param_group['lr'], '/',param_group['lr'].grad) + print(param_group['momentum'], '/',param_group['momentum'].grad) + + #model=copy.deepcopy(fmodel) + model.load_state_dict(fmodel.state_dict()) + + meta_opt.step() diff --git a/higher/transformations.py b/higher/transformations.py new file mode 100644 index 0000000..ec6e29b --- /dev/null +++ b/higher/transformations.py @@ -0,0 +1,205 @@ +import torch +import kornia +import random + +### Available TF for Dataug ### +TF_dict={ #f(mag_normalise)=mag_reelle + ## Geometric TF ## + 'Identity' : (lambda mag: None), + 'FlipUD' : (lambda mag: None), + 'FlipLR' : (lambda mag: None), + 'Rotate': (lambda mag: random.randint(-int_parameter(mag, maxval=30), int_parameter(mag, maxval=30))), + 'TranslateX': (lambda mag: [random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20)), 0]), + 'TranslateY': (lambda mag: [0, random.randint(-int_parameter(mag, maxval=20), int_parameter(mag, maxval=20))]), + 'ShearX': (lambda mag: [random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3)), 0]), + 'ShearY': (lambda mag: [0, random.uniform(-float_parameter(mag, maxval=0.3), float_parameter(mag, maxval=0.3))]), + + ## Color TF (Expect image in the range of [0, 1]) ## + 'Contrast': (lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Color':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Brightness':(lambda mag: random.uniform(1., float_parameter(mag, maxval=1.9))), + 'Sharpness':(lambda mag: random.uniform(0.1, float_parameter(mag, maxval=1.9))), + 'Posterize': (lambda mag: random.randint(4, int_parameter(mag, maxval=8))), + 'Solarize': (lambda mag: random.randint(1, int_parameter(mag, maxval=256))/256.), #=>Image entre [0,1] #Pas opti pour des batch + + #Non fonctionnel + #'Auto_Contrast': (lambda mag: None), #Pas opti pour des batch (Super lent) + #'Equalize': (lambda mag: None), +} + + +def int_image(float_image): #ATTENTION : legere perte d'info (granularite : 1/256 = 0.0039) + return (float_image*255.).type(torch.uint8) + +def float_image(int_image): + return int_image.type(torch.float)/255. + +def rand_inverse(value): + return value if random.random() < 0.5 else -value + +#https://github.com/tensorflow/models/blob/fc2056bce6ab17eabdc139061fef8f4f2ee763ec/research/autoaugment/augmentation_transforms.py#L137 +PARAMETER_MAX = 10 # What is the max 'level' a transform could be predicted +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled + to level/PARAMETER_MAX. + Returns: + A float that results from scaling `maxval` according to `level`. 
+ """ + return float(level) * maxval / PARAMETER_MAX + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled + to level/PARAMETER_MAX. + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / PARAMETER_MAX) + +def flipLR(x): + device = x.device + (batch_size, channels, h, w) = x.shape + + M =torch.tensor( [[[-1., 0., w-1], + [ 0., 1., 0.], + [ 0., 0., 1.]]], device=device).expand(batch_size,-1,-1) + + # warp the original image by the found transform + return kornia.warp_perspective(x, M, dsize=(h, w)) + +def flipUD(x): + device = x.device + (batch_size, channels, h, w) = x.shape + + M =torch.tensor( [[[ 1., 0., 0.], + [ 0., -1., h-1], + [ 0., 0., 1.]]], device=device).expand(batch_size,-1,-1) + + # warp the original image by the found transform + return kornia.warp_perspective(x, M, dsize=(h, w)) + +def rotate(x, angle): + return kornia.rotate(x, angle=angle.type(torch.float32)) #Kornia ne supporte pas les int + +def translate(x, translation): + return kornia.translate(x, translation=translation.type(torch.float32)) #Kornia ne supporte pas les int + +def shear(x, shear): + return kornia.shear(x, shear=shear) + +def contrast(x, contrast_factor): + return kornia.adjust_contrast(x, contrast_factor=contrast_factor) #Expect image in the range of [0, 1] + +#https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageEnhance.py +def color(x, color_factor): + (batch_size, channels, h, w) = x.shape + + gray_x = kornia.rgb_to_grayscale(x) + gray_x = gray_x.repeat_interleave(channels, dim=1) + return blend(gray_x, x, color_factor).clamp(min=0.0,max=1.0) #Expect image in the range of [0, 1] + +def brightness(x, brightness_factor): + device = x.device + + return blend(torch.zeros(x.size(), device=device), x, brightness_factor).clamp(min=0.0,max=1.0) #Expect image in the range of [0, 1] + +def sharpeness(x, sharpness_factor): + device = x.device + (batch_size, channels, h, w) = x.shape + + k = torch.tensor([[[ 1., 1., 1.], + [ 1., 5., 1.], + [ 1., 1., 1.]]], device=device) #Smooth Filter : https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageFilter.py + smooth_x = kornia.filter2D(x, kernel=k, border_type='reflect', normalized=True) #Peut etre necessaire de s'occuper du channel Alhpa differement + + return blend(smooth_x, x, sharpness_factor).clamp(min=0.0,max=1.0) #Expect image in the range of [0, 1] + +#https://github.com/python-pillow/Pillow/blob/master/src/PIL/ImageOps.py +def posterize(x, bits): + x = int_image(x) #Expect image in the range of [0, 1] + + mask = ~(2 ** (8 - bits) - 1).type(torch.uint8) + + (batch_size, channels, h, w) = x.shape + mask = mask.unsqueeze(dim=1).expand(-1,channels).unsqueeze(dim=2).expand(-1,channels, h).unsqueeze(dim=3).expand(-1,channels, h, w) #Il y a forcement plus simple ... 
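# Note: the per-image mask could also be broadcast directly, e.g. x & mask.view(-1, 1, 1, 1),
# instead of the explicit expand chain above.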
+ + return float_image(x & mask) + +def auto_contrast(x): #PAS OPTIMISE POUR DES BATCH #EXTRA LENT + # Optimisation : Application de LUT efficace / Calcul d'histogramme par batch/channel + print("Warning : Pas encore check !") + (batch_size, channels, h, w) = x.shape + x = int_image(x) #Expect image in the range of [0, 1] + #print('Start',x[0]) + for im_idx, img in enumerate(x.chunk(batch_size, dim=0)): #Operation par image + #print(img.shape) + for chan_idx, chan in enumerate(img.chunk(channels, dim=1)): # Operation par channel + #print(chan.shape) + hist = torch.histc(chan, bins=256, min=0, max=255) #PAS DIFFERENTIABLE + + # find lowest/highest samples after preprocessing + for lo in range(256): + if hist[lo]: + break + for hi in range(255, -1, -1): + if hist[hi]: + break + if hi <= lo: + # don't bother + pass + else: + scale = 255.0 / (hi - lo) + offset = -lo * scale + for ix in range(256): + n_ix = int(ix * scale + offset) + if n_ix < 0: n_ix = 0 + elif n_ix > 255: n_ix = 255 + + chan[chan==ix]=n_ix + x[im_idx, chan_idx]=chan + + #print('End',x[0]) + return float_image(x) + +def equalize(x): #PAS OPTIMISE POUR DES BATCH + raise Exception(self, "not implemented") + # Optimisation : Application de LUT efficace / Calcul d'histogramme par batch/channel + (batch_size, channels, h, w) = x.shape + x = int_image(x) #Expect image in the range of [0, 1] + #print('Start',x[0]) + for im_idx, img in enumerate(x.chunk(batch_size, dim=0)): #Operation par image + #print(img.shape) + for chan_idx, chan in enumerate(img.chunk(channels, dim=1)): # Operation par channel + #print(chan.shape) + hist = torch.histc(chan, bins=256, min=0, max=255) #PAS DIFFERENTIABLE + + return float_image(x) + +def solarize(x, thresholds): #PAS OPTIMISE POUR DES BATCH + # Optimisation : Mask direct sur toute les donnees (Mask = (B,C,H,W)> (B)) + for idx, t in enumerate(thresholds): #Operation par image + mask = x[idx] > t.item() + inv_x = 1-x[idx][mask] + x[idx][mask]=inv_x + return x + +#https://github.com/python-pillow/Pillow/blob/9c78c3f97291bd681bc8637922d6a2fa9415916c/src/PIL/Image.py#L2818 +def blend(x,y,alpha): #out = image1 * (1.0 - alpha) + image2 * alpha + #return kornia.add_weighted(src1=x, alpha=(1-alpha), src2=y, beta=alpha, gamma=0) #out=src1∗alpha+src2∗beta+gamma #Ne fonctionne pas pour des batch de alpha + + if not isinstance(x, torch.Tensor): + raise TypeError("x should be a tensor. Got {}".format(type(x))) + + if not isinstance(y, torch.Tensor): + raise TypeError("y should be a tensor. Got {}".format(type(y))) + + (batch_size, channels, h, w) = x.shape + alpha = alpha.unsqueeze(dim=1).expand(-1,channels).unsqueeze(dim=2).expand(-1,channels, h).unsqueeze(dim=3).expand(-1,channels, h, w) #Il y a forcement plus simple ... 
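# Note: alpha.view(-1, 1, 1, 1) would broadcast against (B, C, H, W) in the expression
# below, with the same result as the explicit expand chain above.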
+ res = x*(1-alpha) + y*alpha + + return res diff --git a/higher/utils.py b/higher/utils.py new file mode 100644 index 0000000..b9826bb --- /dev/null +++ b/higher/utils.py @@ -0,0 +1,184 @@ +import numpy as np +import json, math, time, os +import matplotlib.pyplot as plt +import copy +import gc + +from torchviz import make_dot + +import torch +import torch.nn.functional as F + + +def print_graph(PyTorch_obj, fig_name='graph'): + graph=make_dot(PyTorch_obj) #Loss give the whole graph + graph.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats + graph.render(fig_name) + +def plot_res(log, fig_name='res'): + + epochs = [x["epoch"] for x in log] + + fig, ax = plt.subplots(ncols=3, figsize=(15, 3)) + + ax[0].set_title('Loss') + ax[0].plot(epochs,[x["train_loss"] for x in log], label='Train') + ax[0].plot(epochs,[x["val_loss"] for x in log], label='Val') + ax[0].legend() + + ax[1].set_title('Acc') + ax[1].plot(epochs,[x["acc"] for x in log]) + + if log[0]["param"]!= None: + if isinstance(log[0]["param"],float): + ax[2].set_title('Mag') + ax[2].plot(epochs,[x["param"] for x in log], label='Mag') + ax[2].legend() + else : + ax[2].set_title('Prob') + for idx, _ in enumerate(log[0]["param"]): + ax[2].plot(epochs,[x["param"][idx] for x in log], label='P'+str(idx)) + ax[2].legend() + #ax[2].legend(('P-0', 'P-45', 'P-180')) + + fig_name = fig_name.replace('.',',') + plt.savefig(fig_name) + +def plot_compare(filenames, fig_name='res'): + + all_data=[] + legend="" + for idx, file in enumerate(filenames): + legend+=str(idx)+'-'+file+'\n' + with open(file) as json_file: + data = json.load(json_file) + all_data.append(data) + + fig, ax = plt.subplots(ncols=3, figsize=(30, 8)) + + for data_idx, log in enumerate(all_data): + log=log['Log'] + epochs = [x["epoch"] for x in log] + + ax[0].plot(epochs,[x["train_loss"] for x in log], label=str(data_idx)+'-Train') + ax[0].plot(epochs,[x["val_loss"] for x in log], label=str(data_idx)+'-Val') + + ax[1].plot(epochs,[x["acc"] for x in log], label=str(data_idx)) + #ax[1].text(x=0.5,y=0,s=str(data_idx)+'-'+filenames[data_idx], transform=ax[1].transAxes) + + if log[0]["param"]!= None: + if isinstance(log[0]["param"],float): + ax[2].plot(epochs,[x["param"] for x in log], label=str(data_idx)+'-Mag') + + else : + for idx, _ in enumerate(log[0]["param"]): + ax[2].plot(epochs,[x["param"][idx] for x in log], label=str(data_idx)+'-P'+str(idx)) + + fig.suptitle(legend) + ax[0].set_title('Loss') + ax[1].set_title('Acc') + ax[2].set_title('Param') + for a in ax: a.legend() + fig_name = fig_name.replace('.',',') + + plt.savefig(fig_name, bbox_inches='tight') + +def viz_sample_data(imgs, labels, fig_name='data_sample'): + + sample = imgs[0:25,].permute(0, 2, 3, 1).squeeze().cpu() + + plt.figure(figsize=(10,10)) + for i in range(25): + plt.subplot(5,5,i+1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + plt.imshow(sample[i,], cmap=plt.cm.binary) + plt.xlabel(labels[i].item()) + + plt.savefig(fig_name) + +def model_copy(src,dst, patch_copy=True, copy_grad=True): + #model=copy.deepcopy(fmodel) #Pas approprie, on ne souhaite que les poids/grad (pas tout fmodel et ses etats) + + dst.load_state_dict(src.state_dict()) #Do not copy gradient ! + + if patch_copy: + dst['model'].load_state_dict(src['model'].state_dict()) #Copie donnee manquante ? 
+ dst['data_aug'].load_state_dict(src['data_aug'].state_dict()) + + #Copie des gradients + if copy_grad: + for paramName, paramValue, in src.named_parameters(): + for netCopyName, netCopyValue, in dst.named_parameters(): + if paramName == netCopyName: + netCopyValue.grad = paramValue.grad + #netCopyValue=copy.deepcopy(paramValue) + + try: #Data_augV4 + dst['data_aug']._input_info = src['data_aug']._input_info + dst['data_aug']._TF_matrix = src['data_aug']._TF_matrix + except: + pass + +def optim_copy(dopt, opt): + + #inner_opt.load_state_dict(diffopt.state_dict()) #Besoin sauver etat otpim (momentum, etc.) => Ne copie pas le state... + #opt_param=higher.optim.get_trainable_opt_params(diffopt) + + for group_idx, group in enumerate(opt.param_groups): + # print('gp idx',group_idx) + for p_idx, p in enumerate(group['params']): + opt.state[p]=dopt.state[group_idx][p_idx] + +def print_torch_mem(add_info=''): + + nb=0 + max_size=0 + for obj in gc.get_objects(): + #print(type(obj)) + try: + if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1: + #print(i, type(obj), obj.size()) + size = np.sum(obj.size()) + if(size>max_size): max_size=size + nb+=1 + except: + pass + print(add_info, "-Pytroch tensor nb:",nb," / Max dim:", max_size) + + #print(add_info, "-Garbage size :",len(gc.garbage)) + +class loss_monitor(): #Voir https://github.com/pytorch/ignite + def __init__(self, patience, end_train=1): + self.patience = patience + self.end_train = end_train + self.counter = 0 + self.best_score = None + self.reached_limit = 0 + + def register(self, loss): + if self.best_score is None: + self.best_score = loss + elif loss > self.best_score: + self.counter += 1 + #if not self.reached_limit: + print("loss no improve counter", self.counter, self.reached_limit) + else: + self.best_score = loss + self.counter = 0 + def limit_reached(self): + if self.counter >= self.patience: + self.counter = 0 + self.reached_limit +=1 + self.best_score = None + return self.reached_limit + + def end_training(self): + if self.limit_reached() >= self.end_train: + return True + else: + return False + + def reset(self): + self.__init__(self.patience, self.end_train) \ No newline at end of file
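# Usage sketch for loss_monitor (made-up loss values): register() counts epochs whose
# validation loss is worse than the best seen so far; once `patience` is hit,
# limit_reached() increments, and end_training() becomes True after `end_train` such
# limits. run_dist_dataugV2 uses the first limit to switch data augmentation on (when
# dataug_epoch_start is -1) and a later one to stop training.
monitor = loss_monitor(patience=3, end_train=2)
for epoch_val_loss in [1.00, 0.90, 0.92, 0.95, 0.94, 0.96, 0.97, 0.98]:
    monitor.register(epoch_val_loss)
    if monitor.end_training():
        break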