Tidying up

Harle, Antoine (Contracteur) 2020-02-28 16:46:37 -05:00
parent ca3367d19f
commit 4166922c34
453 changed files with 9797 additions and 7 deletions

View file

@@ -0,0 +1,5 @@
venv/
__pycache__
data/
log/
.vscode/

View file

@@ -0,0 +1,33 @@
# Gradient Descent: The Ultimate Optimizer
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
| ⚠️ WARNING: THIS IS NOT MY WORK ⚠️ |
| --- |
This repository contains the paper and code for [Gradient Descent:
The Ultimate Optimizer](https://arxiv.org/abs/1909.13371).
I couldn't find the code (which appears in the appendix at the end of the
paper) anywhere on the web, so what I present here is the code from the paper
with instructions on how to set it up.
Getting the code into a runnable state required some fixes on my part, so the
code may differ slightly from what is presented in the paper.
## Set up
```sh
git clone https://github.com/Rainymood/Gradient-Descent-The-Ultimate-Optimizer
cd Gradient-Descent-The-Ultimate-Optimizer
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```
When you are done, you can exit the virtualenv with:
```sh
deactivate
```
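If `virtualenv` is not installed, Python's built-in `venv` module works as a drop-in replacement for that step (a minimal sketch of the same setup):
```sh
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```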

View file

@@ -0,0 +1,244 @@
from hyperopt import *
#from hyperopt_v2 import *
import torchvision.transforms.functional as TF
import torchvision.transforms as T
#from scipy import ndimage
import kornia
import random
class MNIST_FullyConnected_Augmented(Optimizable):
"""
A fully-connected NN for the MNIST task. This is Optimizable but not itself
an optimizer.
"""
def __init__(self, num_inp, num_hid, num_out, optimizer, device = torch.device('cuda')):
self.device = device
#print(self.device)
parameters = {
"w1": torch.zeros(num_inp, num_hid, device=self.device).t(),
"b1": torch.zeros(num_hid, device=self.device).t(),
"w2": torch.zeros(num_hid, num_out, device=self.device).t(),
"b2": torch.zeros(num_out, device=self.device).t(),
#Data augmentation
"prob": torch.tensor(0.5, device=self.device),
"mag": torch.tensor(180.0, device=self.device),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
self.optimizer.initialize()
#print(self.device)
def forward(self, x):
"""Compute a prediction."""
#print("Prob:",self.parameters["prob"].item())
if random.random() < self.parameters["prob"]:
#angle = 45
#x = TF.rotate(x, angle)
#print(self.device)
#x = F.linear(x, torch.ones(28*28, 28*28, device=self.device).t()*self.parameters["mag"], bias=None)
x = x + self.parameters["mag"]
x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
x = torch.tanh(x)
x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
x = torch.tanh(x)
x = F.log_softmax(x, dim=1)
return x
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist_FC_augmented / " + str(self.optimizer)
class LeNet(Optimizable, nn.Module):
def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')):
nn.Module.__init__(self)
self.device = device
parameters = {
"w1": torch.zeros(20, num_inp, 5, 5, device=self.device),
"b1": torch.zeros(20, device=self.device),
"w2": torch.zeros(50, 20, 5, 5, device=self.device),
"b2": torch.zeros(50, device=self.device),
"w3": torch.zeros(500,4*4*50, device=self.device),
"b3": torch.zeros(500, device=self.device),
"w4": torch.zeros(10, 500, device=self.device),
"b4": torch.zeros(10, device=self.device),
#Data augmentation
"prob": torch.tensor(1.0, device=self.device),
"mag": torch.tensor(180.0, device=self.device),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w3"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w4"], a=math.sqrt(5))
self.optimizer.initialize()
def forward(self, x):
if random.random() < self.parameters["prob"]:
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.parameters["mag"] # in degrees
angle = torch.ones(batch_size, device=self.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=self.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=self.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist_CNN_augmented / " + str(self.optimizer)
class LeNet_v2(Optimizable, nn.Module):
def __init__(self, num_inp, num_out, optimizer, device = torch.device('cuda')):
nn.Module.__init__(self)
self.device = device
self.conv1 = nn.Conv2d(num_inp, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
#self.fc1 = nn.Linear(4*4*50, 500)
self.fc1 = nn.Linear(1250, 500)
self.fc2 = nn.Linear(500, 10)
#print(self.conv1.weight)
parameters = {
"w1": self.conv1.weight,
"b1": self.conv1.bias,
"w2": self.conv2.weight,
"b2": self.conv2.bias,
"w3": self.fc1.weight,
"b3": self.fc1.bias,
"w4": self.fc2.weight,
"b4": self.fc2.bias,
#Data augmentation
"prob": torch.tensor(0.5, device=self.device),
"mag": torch.tensor(1.0, device=self.device),
}
Optimizable.__init__(self, parameters, optimizer)
'''
def forward(self, x): #Saturates the memory ???
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
#x = x.view(-1, 4*4*50)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
'''
def forward(self, x):
if random.random() < self.parameters["prob"].item():
#print(self.parameters["prob"])
#x = [T.ToTensor()(
# TF.affine(img=T.ToPILImage()(im), angle=self.parameters["mag"], translate=(0,0), scale=1, shear=0, resample=0, fillcolor=None))
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
#x = [ndimage.rotate(im, self.parameters["mag"], reshape=False)
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
#x = [im + self.parameters["mag"]
# for im in torch.unbind(x,dim=0)]
#x = torch.stack(x,dim=0)
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.parameters["mag"] * 180 # in degrees
angle = torch.ones(batch_size, device=self.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=self.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=self.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.parameters["w1"], bias=self.parameters["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.parameters["w2"], bias=self.parameters["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.parameters["w3"], self.parameters["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.parameters["w4"], self.parameters["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def initialize(self):
self.optimizer.initialize()
def adjust(self):
self.optimizer.adjust(self.parameters)
def adjust_val(self):
self.optimizer.adjust_val(self.parameters)
def eval(self):
self.parameters['prob']=torch.tensor(0.0, device=self.device)
def __str__(self):
return "mnist_CNN_augmented / " + str(self.optimizer)

View file

@@ -0,0 +1,52 @@
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF
class MNIST_aug(Dataset):
training_file = 'training.pt'
test_file = 'test.pt'
classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
'5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
def __init__(self):
self.images = [TF.to_pil_image(x) for x in torch.ByteTensor(10, 3, 48, 48)]
self.set_stage(0) # initial stage
def __getitem__(self, index):
image = self.images[index]
# Just apply your transformations here
image = self.crop(image)
x = TF.to_tensor(image)
return x
def set_stage(self, stage):
if stage == 0:
print('Using (32, 32) crops')
self.crop = transforms.RandomCrop((32, 32))
elif stage == 1:
print('Using (28, 28) crops')
self.crop = transforms.RandomCrop((28, 28))
def __len__(self):
return len(self.images)
dataset = MNIST_aug()
loader = DataLoader(dataset,
batch_size=2,
num_workers=2,
shuffle=True)
for batch_idx, data in enumerate(loader):
print('Batch idx {}, data shape {}'.format(
batch_idx, data.shape))
loader.dataset.set_stage(1)
for batch_idx, data in enumerate(loader):
print('Batch idx {}, data shape {}'.format(
batch_idx, data.shape))

View file

@@ -0,0 +1,150 @@
#from hyperopt import *
from hyperopt_v2 import *
import torchvision.transforms.functional as TF
import torchvision.transforms as T
#from scipy import ndimage
import kornia
import random
class LeNet_v3(nn.Module):
def __init__(self, num_inp, num_out):
super(LeNet_v3, self).__init__()
self.params = nn.ParameterDict({
'w1': nn.Parameter(torch.zeros(20, num_inp, 5, 5)),
'b1': nn.Parameter(torch.zeros(20)),
'w2': nn.Parameter(torch.zeros(50, 20, 5, 5)),
'b2': nn.Parameter(torch.zeros(50)),
'w3': nn.Parameter(torch.zeros(500,4*4*50)),
'b3': nn.Parameter(torch.zeros(500)),
'w4': nn.Parameter(torch.zeros(10, 500)),
'b4': nn.Parameter(torch.zeros(10))
})
def initialize(self):
nn.init.kaiming_uniform_(self.params["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w2"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w3"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.params["w4"], a=math.sqrt(5))
def forward(self, x):
#print("Start Shape ", x.shape)
out = F.relu(F.conv2d(input=x, weight=self.params["w1"], bias=self.params["b1"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = F.relu(F.conv2d(input=out, weight=self.params["w2"], bias=self.params["b2"]))
#print("Shape ", out.shape)
out = F.max_pool2d(out, 2)
#print("Shape ", out.shape)
out = out.view(out.size(0), -1)
#print("Shape ", out.shape)
out = F.relu(F.linear(out, self.params["w3"], self.params["b3"]))
#print("Shape ", out.shape)
out = F.linear(out, self.params["w4"], self.params["b4"])
#print("Shape ", out.shape)
return F.log_softmax(out, dim=1)
def print_grad_fn(self):
for n, p in self.params.items():
print(n, p.grad_fn)
def __str__(self):
return "mnist_CNN_augmented / "
class Data_aug(nn.Module):
def __init__(self):
super(Data_aug, self).__init__()
self.data_augmentation = True
self.params = nn.ParameterDict({
"prob": nn.Parameter(torch.tensor(0.5)),
"mag": nn.Parameter(torch.tensor(180.0))
})
#self.params["mag"].register_hook(print)
def forward(self, x):
if self.data_augmentation and self.training and random.random() < self.params["prob"]:
#print('Aug')
batch_size = x.shape[0]
# create transformation (rotation)
alpha = self.params["mag"] # in degrees
angle = torch.ones(batch_size, device=x.device) * alpha
# define the rotation center
center = torch.ones(batch_size, 2, device=x.device)
center[..., 0] = x.shape[3] / 2 # x
center[..., 1] = x.shape[2] / 2 # y
#print(x.shape, center)
# define the scale factor
scale = torch.ones(batch_size, device=x.device)
# compute the transformation matrix
M = kornia.get_rotation_matrix2d(center, angle, scale)
# apply the transformation to original image
x = kornia.warp_affine(x, M, dsize=(x.shape[2], x.shape[3])) #dsize=(h, w)
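# Note: kornia's get_rotation_matrix2d and warp_affine are differentiable with
# respect to the angle, so the loss gradient can flow back into self.params['mag'];
# this is what makes the augmentation magnitude learnable by gradient descent.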
return x
def eval(self):
self.params['prob'].data.fill_(0.0) #disable augmentation in place (Data_aug has no self.device, and ParameterDict only accepts Parameters)
nn.Module.eval(self)
def data_augmentation(self, mode=True):
self.data_augmentation=mode
def print_grad_fn(self):
for n, p in self.params.items():
print(n, p.grad_fn)
def __str__(self):
return "Data_Augmenter / "
class Augmented_model(nn.Module):
def __init__(self, model, data_augmenter):
#self.model = model
#self.data_aug = data_augmenter
super(Augmented_model, self).__init__()#nn.Module.__init__(self)
#super().__init__()
self.mods = nn.ModuleDict({
'data_aug': data_augmenter,
'model': model
})
#for name, param in self.mods.named_parameters():
# print(name, type(param.data), param.size())
#params = self.mods.named_parameters() #self.parameters()
#parameters = [param for param in self.model.parameters()] + [param for param in self.data_aug.parameters()]
#Optimizable.__init__(self, params, optimizer)
def initialize(self):
self.mods['model'].initialize()
def forward(self, x):
return self.mods['model'](self.mods['data_aug'](x))
#def adjust(self):
# self.optimizer.adjust(self) #Parametres des dict
def data_augmentation(self, mode=True):
self.mods['data_aug'].data_augmentation=mode
def begin(self):
for param in self.parameters():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
def print_grad_fn(self):
for n, m in self.mods.items():
m.print_grad_fn()
def __str__(self):
return str(self.mods['data_aug'])+ str(self.mods['model'])# + str(self.optimizer)

View file

@@ -0,0 +1,5 @@
digraph {
graph [size="12,12"]
node [align=left fontsize=12 height=0.2 ranksep=0.1 shape=box style=filled]
94296775052080 [label=NoneType fillcolor=darkolivegreen1]
}

View file

@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: %3 Pages: 1 -->
<svg width="75pt" height="30pt"
viewBox="0.00 0.00 74.65 30.40" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 26.4)">
<title>%3</title>
<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-26.4 70.6472,-26.4 70.6472,4 -4,4"/>
<!-- 94296775052080 -->
<g id="node1" class="node">
<title>94296775052080</title>
<polygon fill="#caff70" stroke="#000000" points="66.4717,-22.6036 .1755,-22.6036 .1755,.2036 66.4717,.2036 66.4717,-22.6036"/>
<text text-anchor="middle" x="33.3236" y="-7.6" font-family="Times,serif" font-size="12.00" fill="#000000">NoneType</text>
</g>
</g>
</svg>


View file

@@ -0,0 +1,345 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Optimizable():#nn.Module):
"""
This is the interface for anything that has parameters that need to be
optimized, somewhat like torch.nn.Module but with the right plumbing for
hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
interface, which does not give us enough control over the detachments.)
Nominal operation of an Optimizable at the lowest level is as follows:
o = MyOptimizable()
o.initialize()
loop {
o.begin()
o.zero_grad()
loss = compute loss function from parameters
loss.backward()
o.adjust()
}
Optimizables recursively handle updates to their optimiz*ers*.
"""
#def __init__(self):
# super(Optimizable, self).__init__()
# self.parameters = nn.Parameter(torch.zeros(()))
def __init__(self, parameters, optimizer):
#super(Optimizable, self).__init__()
self.parameters = parameters # a dict mapping names to tensors
self.optimizer = optimizer # which must itself be Optimizable!
self.all_params_with_gradients = []
#self.device = device
def initialize(self):
"""Initialize parameters, e.g. with a Kaiming initializer."""
pass
def begin(self):
"""Enable gradient tracking on current parameters."""
self.all_params_with_gradients = [] #Reset to avoid overloading the memory
for name, param in self.parameters.items():
#for param in self.parameters:
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
#param.to(self.device)
#if param.device == torch.device('cuda:0'):
# print(name, param.device)
self.all_params_with_gradients.append(param)
self.optimizer.begin()
def zero_grad(self):
""" Set all gradients to zero. """
for param in self.all_params_with_gradients:
#param = param.to(self.device)
param.grad = torch.zeros(param.shape, device=param.device)
self.optimizer.zero_grad()
""" Note: at this point you would probably call .backwards() on the loss
function. """
def adjust(self):
""" Update parameters """
pass
def print_grad_fn(self):
self.optimizer.print_grad_fn()
for n, p in self.parameters.items():
print(n," - ", p.grad_fn)
def param_grad(self):
return self.all_params_with_gradients
def param(self, param_name):
return self.parameters[param_name].item()
class MNIST_FullyConnected(Optimizable):
"""
A fully-connected NN for the MNIST task. This is Optimizable but not itself
an optimizer.
"""
def __init__(self, num_inp, num_hid, num_out, optimizer):
parameters = {
"w1": torch.zeros(num_inp, num_hid).t(),
"b1": torch.zeros(num_hid).t(),
"w2": torch.zeros(num_hid, num_out).t(),
"b2": torch.zeros(num_out).t(),
}
super().__init__(parameters, optimizer)
def initialize(self):
nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
self.optimizer.initialize()
def forward(self, x):
"""Compute a prediction."""
x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
x = torch.tanh(x)
x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
x = torch.tanh(x)
x = F.log_softmax(x, dim=1)
return x
def adjust(self):
self.optimizer.adjust(self.parameters)
def __str__(self):
return "mnist / " + str(self.optimizer)
class NoOpOptimizer(Optimizable):#, nn.Module):
"""
NoOpOptimizer sits on top of a stack, and does not affect what lies below.
"""
def __init__(self):
#super(Optimizable, self).__init__()
pass
def initialize(self):
pass
def begin(self):
pass
def zero_grad(self):
pass
def adjust(self, params):
pass
def adjust_val(self, params):
pass
def print_grad_fn(self):
pass
def __str__(self):
return "static"
class Adam(Optimizable):
"""
A fully hyperoptimizable Adam optimizer
"""
def clamp(x):
return (x.tanh() + 1.0) / 2.0
def unclamp(y):
z = y * 2.0 - 1.0
return ((1.0 + z) / (1.0 - z)).log() / 2.0
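# clamp squashes an unconstrained real value into (0, 1) via tanh; unclamp is its exact
# inverse (an artanh), so Adam.clamp(Adam.unclamp(torch.tensor(0.9))) recovers 0.9 (up to
# floating point). beta1/beta2 are stored unclamped so unconstrained updates keep them valid.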
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
device = torch.device('cuda')
):
self.device = device
parameters = {
"alpha": torch.tensor(alpha, device=self.device),
"beta1": Adam.unclamp(torch.tensor(beta1, device=self.device)),
"beta2": Adam.unclamp(torch.tensor(beta2, device=self.device)),
"log_eps": torch.tensor(log_eps, device=self.device),
}
super().__init__(parameters, optimizer)
self.num_adjustments = 0
self.num_adjustments_val = 0
self.cache = {}
for name, param in parameters.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
#param.to(self.device)
#if param.device == torch.device('cuda:0'):
# print(name, param.device)
def adjust(self, params): #Update the learning parameters
self.num_adjustments += 1
self.optimizer.adjust(self.parameters)
#print('Adam update')
t = self.num_adjustments
beta1 = Adam.clamp(self.parameters["beta1"])
beta2 = Adam.clamp(self.parameters["beta2"])
for name, param in params.items():
if name == "mag": continue
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.parameters["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
params[name] = param.detach() - self.parameters["alpha"] * dparam
#print(name)
def adjust_val(self, params): #Update the transformation parameters
self.num_adjustments_val += 1
self.optimizer.adjust_val(self.parameters)
#print('Adam update')
t = self.num_adjustments_val
beta1 = Adam.clamp(self.parameters["beta1"])
beta2 = Adam.clamp(self.parameters["beta2"])
for name, param in params.items():
if name != "mag": continue
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.parameters["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
params[name] = param.detach() - self.parameters["alpha"] * dparam
#print(name)
def __str__(self):
return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
'''
class SGD(Optimizable):
"""
A hyperoptimizable SGD
"""
def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()):
parameters = {"alpha": torch.tensor(alpha)}
super().__init__(parameters, optimizer)
def adjust(self, params):
self.optimizer.adjust(self.parameters)
for name, param in params.items():
g = param.grad.detach()
params[name] = param.detach() - g * self.parameters["alpha"]
def __str__(self):
return "sgd(%f) / " % self.parameters["alpha"] + str(self.optimizer)
class SGDPerParam(Optimizable):
"""
Like above, but can be taught a separate step size for each parameter it
tunes.
"""
def __init__(self, alpha=0.01, params=[], optimizer=NoOpOptimizer()):
parameters = {name + "_alpha": torch.tensor(alpha) for name in params}
super().__init__(parameters, optimizer)
def adjust(self, params):
self.optimizer.adjust(self.parameters)
for name, param in params.items():
g = param.grad.detach()
params[name] = param.detach() - g * self.parameters[name + "_alpha"]
def __str__(self):
return "sgd(%s) / " % str(
{k: t.item() for k, t in self.parameters.items()}
) + str(self.optimizer)
'''
'''
class AdamBaydin(Optimizable):
""" Same as above, but only optimizes the learning rate, treating the
remaining hyperparameters as constants. """
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
):
parameters = {"alpha": torch.tensor(alpha)}
self.beta1 = beta1
self.beta2 = beta2
self.log_eps = log_eps
super().__init__(parameters, optimizer)
self.num_adjustments = 0
self.cache = {}
def adjust(self, params):
self.num_adjustments += 1
self.optimizer.adjust(self.parameters)
t = self.num_adjustments
beta1 = self.beta1
beta2 = self.beta2
for name, param in params.items():
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape),
"v": torch.zeros(param.shape) + 10.0 ** self.log_eps,
}
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(m)
self.all_params_with_gradients.append(v)
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.log_eps)
params[name] = param.detach() - self.parameters["alpha"] * dparam
def __str__(self):
return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
'''

View file

@@ -0,0 +1,296 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer
class Optimizable():
"""
This is the interface for anything that has parameters that need to be
optimized, somewhat like torch.nn.Module but with the right plumbing for
hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
interface, which does not give us enough control over the detachments.)
Nominal operation of an Optimizable at the lowest level is as follows:
o = MyOptimizable()
o.initialize()
loop {
o.begin()
o.zero_grad()
loss = compute loss function from parameters
loss.backward()
o.adjust()
}
Optimizables recursively handle updates to their optimiz*ers*.
"""
#def __init__(self):
# super(Optimizable, self).__init__()
# self.parameters = nn.Parameter(torch.zeros(()))
def __init__(self, parameters, optimizer):
self.params = parameters # a dict mapping names to tensors
self.optimizer = optimizer # which must itself be Optimizable!
self.all_params_with_gradients = []
#self.device = device
def initialize(self):
"""Initialize parameters, e.g. with a Kaiming initializer."""
pass
def begin(self):
"""Enable gradient tracking on current parameters."""
self.all_params_with_gradients = nn.ParameterList() #Reset to avoid overloading the memory
print("Opti param :", type(self.params))
#for name, param in self.params:
if isinstance(self.params,dict): #Dict
for name, param in self.params.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
self.all_params_with_gradients.append(param)
if isinstance(self.params,list): #List
for param in self.params:
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
self.all_params_with_gradients.append(param)
self.optimizer.begin()
def zero_grad(self):
""" Set all gradients to zero. """
for param in self.all_params_with_gradients:
param.grad = torch.zeros(param.shape, device=param.device)
self.optimizer.zero_grad()
""" Note: at this point you would probably call .backwards() on the loss
function. """
def adjust(self):
""" Update parameters """
pass
class NoOpOptimizer(Optimizable):#, nn.Module):
"""
NoOpOptimizer sits on top of a stack, and does not affect what lies below.
"""
def __init__(self):
#super(Optimizable, self).__init__()
pass
def initialize(self):
pass
def begin(self):
#print("NoOpt begin")
pass
def zero_grad(self):
pass
def adjust(self, params):
pass
def step(self):
pass
def print_grad_fn(self):
pass
def __str__(self):
return "static"
class SGD(Optimizer, nn.Module): #Avoid Optimizer
"""
A hyperoptimizable SGD
"""
def __init__(self, params, lr=0.01, height=0):
self.height=height
#params: the parameters to optimize
#the rest (defaults) are the optimizer's own hyperparameters
print('SGD - H', height)
nn.Module.__init__(self)
optim_keys = ('lr','') #Should this go in Optimizable? #'' avoids iterating over the characters of the string...
'''
self_params = {"lr": torch.tensor(lr),
"momentum": 0,
"dampening":0,
"weight_decay":0,
"nesterov": False}
'''
#self_params = dict(lr=torch.tensor(lr),
# momentum=0, dampening=0, weight_decay=0, nesterov=False)
self_params = nn.ParameterDict({
"lr": nn.Parameter(torch.tensor(lr)),
"momentum": nn.Parameter(torch.tensor(0.0)),
"dampening": nn.Parameter(torch.tensor(0.0)),
"weight_decay": nn.Parameter(torch.tensor(0.0)),
})
for k in self_params.keys() & optim_keys:
self_params[k].requires_grad_() # keep gradient information…
self_params[k].retain_grad() # even if not a leaf…
#self_params[k].register_hook(print)
if height==0:
optimizer = NoOpOptimizer()
else:
#def dict_generator(): yield {k: self_params[k] for k in self_params.keys() & optim_keys}
#(dict for dict in {k: self_params[k] for k in self_params.keys() & optim_keys}) #Should work
optimizer = SGD(params=(self_params[k]for k in self_params.keys() & optim_keys), lr=lr, height=height-1)
#optimizer.register_backward_hook(print)
self.optimizer = optimizer
#if(height==0):
# for n,p in params.items():
# print(n,p)
#Optimizable.__init__(self, self_params, optimizer)
#print(type(params))
#for p in params:
# print(type(p))
Optimizer.__init__(self, params, self_params)
for group in self.param_groups:
for p in group['params']:
print(type(p.data), p.size())
print('End SGD-H', height)
def begin(self):
for group in self.param_groups:
for p in group['params']:
#print(type(p.data), p.size())
p.requires_grad_() # keep gradient information…
p.retain_grad() # even if not a leaf…
#p.register_hook(lambda x: print(self.height, x.grad_fn))
self.optimizer.begin()
def print_grad_fn(self):
self.optimizer.print_grad_fn()
for group in self.param_groups:
for i, p in enumerate(group['params']):
print(self.height," - ", i, p.grad_fn)
#def adjust(self, params):
# self.optimizer.adjust(self.params)
# for name, param in params.items():
# g = param.grad.detach()
# params[name] = param.detach() - g * self.params["lr"]
def step(self):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
print('SGD start')
self.optimizer.step()
for group in self.param_groups:
for i, p in enumerate(group['params']):
if p.grad is None:
continue
#d_p = p.grad.data
d_p = p.grad.detach()
#print(group['lr'])
p.data.add_(-group['lr'].item(), d_p)
#group['params'][i] = p.detach() - d_p * group['lr']
p.data-= group['lr']*d_p #Do not use .data: gradient information is lost
for p in group['params']:
if p.grad is None:
print(p, p.grad)
continue
print("SGD end")
#return loss
def __str__(self):
return "sgd(%f) / " % self.params["lr"] + str(self.optimizer)
class Adam(Optimizable, nn.Module):
"""
A fully hyperoptimizable Adam optimizer
"""
def clamp(x):
return (x.tanh() + 1.0) / 2.0
def unclamp(y):
z = y * 2.0 - 1.0
return ((1.0 + z) / (1.0 - z)).log() / 2.0
def __init__(
self,
alpha=0.001,
beta1=0.9,
beta2=0.999,
log_eps=-8.0,
optimizer=NoOpOptimizer(),
device = torch.device('cuda')
):
#super(Adam, self).__init__()
nn.Module.__init__(self)
self.device = device
params = nn.ParameterDict({
"alpha": nn.Parameter(torch.tensor(alpha, device=self.device)),
"beta1": nn.Parameter(Adam.unclamp(torch.tensor(beta1, device=self.device))),
"beta2": nn.Parameter(Adam.unclamp(torch.tensor(beta2, device=self.device))),
"log_eps": nn.Parameter(torch.tensor(log_eps, device=self.device)),
})
Optimizable.__init__(self, params, optimizer)
self.num_adjustments = 0
self.cache = {}
for name, param in params.items():
param.requires_grad_() # keep gradient information…
param.retain_grad() # even if not a leaf…
def adjust(self, params, pytorch_mod=False):
self.num_adjustments += 1
self.optimizer.adjust(self.params)
t = self.num_adjustments
beta1 = Adam.clamp(self.params["beta1"])
beta2 = Adam.clamp(self.params["beta2"])
updated_param = {}
if pytorch_mod:
params = params.named_parameters(prefix='') #Change the input's name...
for name, param in params:
if name not in self.cache:
self.cache[name] = {
"m": torch.zeros(param.shape, device=self.device),
"v": torch.zeros(param.shape, device=self.device)
+ 10.0 ** self.params["log_eps"].data
# NOTE that we add a little 'fudge factor' here because sqrt is not
# differentiable at exactly zero
}
#print(name, param.device)
g = param.grad.detach()
self.cache[name]["m"] = m = (
beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
)
self.cache[name]["v"] = v = (
beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
)
self.all_params_with_gradients.append(nn.Parameter(m)) #Risk of overloading memory => would a dict be better?
self.all_params_with_gradients.append(nn.Parameter(v))
m_hat = m / (1.0 - beta1 ** float(t))
v_hat = v / (1.0 - beta2 ** float(t))
dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.params["log_eps"])
updated_param[name] = param.detach() - self.params["alpha"] * dparam
if pytorch_mod: params.update(updated_param) #Change the input's name...
else: params = updated_param
def __str__(self):
return "adam(" + str(self.params) + ") / " + str(self.optimizer)

View file

@@ -0,0 +1,182 @@
import numpy as np
import json, math, time, os
from hyperopt import *
import gc
BATCH_SIZE = 300
mnist_train = torchvision.datasets.MNIST(
"./data", train=True, download=True, transform=torchvision.transforms.ToTensor()
)
mnist_test = torchvision.datasets.MNIST(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
dl_train = torch.utils.data.DataLoader(
mnist_train, batch_size=BATCH_SIZE, shuffle=False
)
dl_test = torch.utils.data.DataLoader(mnist_test, batch_size=10000, shuffle=False)
def test(model):
for i, (features_, labels_) in enumerate(dl_test):
features, labels = torch.reshape(features_, (10000, 28 * 28)), labels_
pred = model.forward(features)
return pred.argmax(dim=1).eq(labels).sum().item() / 10000 * 100
def train(model, epochs=3, height=1):
stats = []
for epoch in range(epochs):
for i, (features_, labels_) in enumerate(dl_train):
t0 = time.process_time()
model.begin()
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
pred = model.forward(
features
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
loss = F.nll_loss(pred, labels)
model.zero_grad()
loss.backward(create_graph=True)
model.adjust()
tf = time.process_time()
data = {
"time": tf - t0,
"iter": epoch * len(dl_train) + i,
"loss": loss.item(),
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
if "." not in k
},
}
stats.append(data)
return stats
def run(opt, name="out", usr={}, epochs=3, height=1):
torch.manual_seed(0x42)
model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
print("Running...", str(model))
model.initialize()
log = train(model, epochs, height)
acc = test(model)
out = {"acc": acc, "log": log, "usr": usr}
with open("log/%s.json" % name, "w+") as f:
json.dump(out, f, indent=True)
times = [x["time"] for x in log]
print("Times (ms):", np.mean(times), "+/-", np.std(times))
print("Final accuracy:", acc)
return out
def sgd_experiments():
run(SGD(0.01), "sgd", epochs=1)
out = run(SGD(0.01, optimizer=SGD(0.01)), "sgd+sgd", epochs=1)
alpha = out["log"][-1]["params"]["alpha"]
print(alpha)
run(SGD(alpha), "sgd-final", epochs=1)
def adam_experiments():
run(Adam(), "adam", epochs=1)
print()
mo = SGDPerParam(
0.001, ["alpha", "beta1", "beta2", "log_eps"], optimizer=SGD(0.0001)
)
out = run(Adam(optimizer=mo), "adam+sgd", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
log_eps = p["log_eps"]
print(alpha, beta1, beta2, log_eps)
print(mo)
run(
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
"adam+sgd-final",
epochs=1,
)
print()
out = run(Adam(optimizer=Adam()), "adam2", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
beta1 = Adam.clamp(torch.tensor(p["beta1"])).item()
beta2 = Adam.clamp(torch.tensor(p["beta2"])).item()
log_eps = p["log_eps"]
print(alpha, beta1, beta2, log_eps)
run(
Adam(alpha=p["alpha"], beta1=beta1, beta2=beta2, log_eps=log_eps),
"adam2-final",
epochs=1,
)
print()
mo = SGDPerParam(0.001, ["alpha"], optimizer=SGD(0.0001))
out = run(AdamBaydin(optimizer=mo), "adambaydin+sgd", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
print(alpha)
print(mo)
run(Adam(alpha=p["alpha"]), "adambaydin+sgd-final", epochs=1)
print()
out = run(AdamBaydin(optimizer=Adam()), "adambaydin2", epochs=1)
p = out["log"][-1]["params"]
alpha = p["alpha"]
print(alpha)
run(Adam(alpha=p["alpha"]), "adambaydin2-final", epochs=1)
def surface():
run(SGD(10 ** -3, optimizer=SGD(10 ** -1)), "tst", epochs=1)
for log_alpha in np.linspace(-3, 2, 10):
run(SGD(10 ** log_alpha), "sgd@1e%+.2f" % log_alpha, epochs=1)
def make_sgd_stack(height, top):
if height == 0:
return SGD(alpha=top)
return SGD(alpha=top, optimizer=make_sgd_stack(height - 1, top))
def make_adam_stack(height, top=0.0000001):
if height == 0:
return Adam(alpha=top)
return Adam(alpha=top, optimizer=make_adam_stack(height - 1))
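# For illustration: make_adam_stack(2, 0.001) returns an Adam(alpha=0.001) that updates the
# model weights, whose hyperparameters are updated by an Adam(alpha=1e-07), itself tuned by a
# third Adam(alpha=1e-07), i.e. height + 1 optimizers in total.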
def stack_test():
for top in np.linspace(-7, 3, 20):
for height in range(6):
print("height =", height, "to p=", top)
opt = make_sgd_stack(height, 10 ** top)
run(
opt,
"metasgd3-%d@%+.2f" % (height, top),
{"height": height, "top": top},
epochs=1,
height=height,
)
gc.collect()
def perf_test():
for h in range(51):
print("height:", h)
# opt = make_sgd_stack(h, 0.01)
opt = make_adam_stack(h)
run(opt, "adamperf-%d" % h, {"height": h}, epochs=1)
gc.collect()
if __name__ == "__main__":
try:
os.mkdir("log")
except:
print("log/ exists already")
surface()
sgd_experiments()
adam_experiments()
stack_test()
perf_test()

View file

@@ -0,0 +1,5 @@
numpy==1.17.2
Pillow==6.2.0
six==1.12.0
torch==1.2.0
torchvision==0.4.0

View file

@@ -0,0 +1,344 @@
import numpy as np
import json, math, time, os
from data_aug import *
#from data_aug_v2 import *
import gc
import matplotlib.pyplot as plt
from torchviz import make_dot, make_dot_from_trace
from torch.utils.data import SubsetRandomSampler
BATCH_SIZE = 300
#TEST_SIZE = 10000
TEST_SIZE = 300
DATA_LIMIT = 10
'''
data_train = torchvision.datasets.MNIST(
"./data", train=True, download=True,
transform=torchvision.transforms.Compose([
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
torchvision.transforms.ToTensor()
])
)
data_test = torchvision.datasets.MNIST(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
'''
data_train = torchvision.datasets.CIFAR10(
"./data", train=True, download=True,
transform=torchvision.transforms.Compose([
#torchvision.transforms.RandomAffine(degrees=180, translate=None, scale=None, shear=None, resample=False, fillcolor=0),
torchvision.transforms.ToTensor()
])
)
data_test = torchvision.datasets.CIFAR10(
"./data", train=False, download=True, transform=torchvision.transforms.ToTensor()
)
train_subset_indices=range(int(len(data_train)/2))
val_subset_indices=range(int(len(data_train)/2),len(data_train))
dl_train = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(train_subset_indices))
dl_val = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, sampler=SubsetRandomSampler(val_subset_indices))
dl_test = torch.utils.data.DataLoader(data_test, batch_size=TEST_SIZE, shuffle=False)
def test(model, reshape_in=True, device = torch.device('cuda')):
for i, (features_, labels_) in enumerate(dl_test):
if reshape_in :
features, labels = torch.reshape(features_, (TEST_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
pred = model.forward(features)
return pred.argmax(dim=1).eq(labels).sum().item() / TEST_SIZE * 100
def train_one_epoch(model, optimizer, epoch=0, reshape_in=True, device = torch.device('cuda'), train_data=True):
if train_data: dl = dl_train
else: dl = dl_val
for i, (features_, labels_) in enumerate(dl):
if i > DATA_LIMIT : break
#t0 = time.process_time()
if reshape_in :
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
#optimizer.begin()
#optimizer.zero_grad()
model.begin()
model.zero_grad()
pred = model.forward(features)
#loss = F.nll_loss(pred, labels)
loss = F.cross_entropy(pred,labels)
#model.print_grad_fn()
#optimizer.print_grad_fn()
#print('-'*50)
loss.backward(create_graph=True)
#optimizer.step()
if train_data: model.adjust()
else: model.adjust_val()
#tf = time.process_time()
#data = {
# "time": tf - t0,
# "iter": epoch * len(dl_train) + i,
# "loss": loss.item(),
# "params": {
# k: v.item()
# for k, v in model.optimizer.parameters.items()
# if "." not in k
# },
#}
#stats.append(data)
#print_torch_mem(i)
return loss.item()
def train_v2(model, optimizer, epochs=3, reshape_in=True, device = torch.device('cuda')):
log = []
for epoch in range(epochs):
#dl_train.dataset.transform=torchvision.transforms.Compose([
# torchvision.transforms.RandomAffine(degrees=model.param('mag'), translate=None, scale=None, shear=None, resample=False, fillcolor=0),
# torchvision.transforms.ToTensor()
#])
viz_data(fig_name='res/data_sample')
t0 = time.process_time()
loss = train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device)
train_one_epoch(model=model, optimizer=optimizer, epoch=epoch, reshape_in=reshape_in, device=device,train_data=False)
#acc = test(model=model, reshape_in=reshape_in, device=device)
acc = 0
tf = time.process_time()
data = {
"time": tf - t0,
"epoch": epoch,
"loss": loss,
"acc": acc,
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
#for k, v in model.mods.data_aug.params.named_parameters()
if "." not in k
},
}
log.append(data)
print("Epoch :",epoch+1, "/",epochs, "- Loss :",log[-1]["loss"])
param = [p for p in model.param_grad() if p.grad is not None]
if(len(param)!=0):
print(param[-2],' / ', param[-2].grad)
print(param[-1],' / ', param[-1].grad)
return log
def train(model, epochs=3, height=1, reshape_in=True, device = torch.device('cuda')):
stats = []
for epoch in range(epochs):
for i, (features_, labels_) in enumerate(dl_train):
t0 = time.process_time()
model.begin()
if reshape_in :
features, labels = torch.reshape(features_, (BATCH_SIZE, 28 * 28)), labels_
else:
features, labels =features_, labels_
features, labels = features.to(device), labels.to(device)
pred = model.forward(
features
) # typo in https://www.groundai.com/project/gradient-descent-the-ultimate-optimizer/
#loss = F.nll_loss(pred, labels)
loss = F.cross_entropy(pred,labels)
#print('-'*50)
#param = [p for p in model.param_grad() if p.grad is not None]
#if(len(param)!=0):
# print(param[-2],' / ', param[-2].grad)
# print(param[-1],' / ', param[-1].grad)
model.zero_grad()
loss.backward(create_graph=True)
model.adjust()
tf = time.process_time()
data = {
"time": tf - t0,
"iter": epoch * len(dl_train) + i,
"loss": loss.item(),
"params": {
k: v.item()
for k, v in model.optimizer.parameters.items()
if "." not in k
},
}
stats.append(data)
print('-'*50)
i=0
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)) and len(obj.size())>1:
print(i, type(obj), obj.size())
i+=1
except:
pass
print("Epoch :",epoch+1, "/",epochs, "- Loss :",stats[-1]["loss"])
param = [p for p in model.param_grad() if p.grad is not None]
if(len(param)!=0):
print(param[-2],' / ', param[-2].grad)
print(param[-1],' / ', param[-1].grad)
return stats
def run(opt, name="out", usr={}, epochs=10, height=1, cnn=True, device = torch.device('cuda')):
torch.manual_seed(0x42)
if not cnn:
reshape_in = True
#model = MNIST_FullyConnected(28 * 28, 128, 10, opt)
model = MNIST_FullyConnected_Augmented(28 * 28, 128, 10, opt, device=device)
else:
reshape_in = False
#model = LeNet(1, 10,opt, device)
#model = LeNet_v2(1, 10,opt, device).to(device=device)
model = LeNet_v2(3, 10,opt, device).to(device=device)
optimizer=None
'''
m = LeNet_v3(1, 10)
a = Data_aug()
model = Augmented_model(model=m,
data_augmenter=a,
optimizer=opt).to(device) #the same optimizer twice?...
'''
'''
m = LeNet_v3(1, 10)
a = Data_aug()
model = Augmented_model(model=m, data_augmenter=a).to(device)
#optimizer = SGD(model.parameters())
optimizer = SGD(model.parameters(), lr=0.01, height=1)
'''
#for idx, m in enumerate(model.modules()):
# print(idx, '->', m)
print("Running...", str(model))
model.initialize()
#print_model(model)
#model.data_augmentation(False)
#model.eval()
log = train_v2(model=model, optimizer=optimizer, epochs=epochs, reshape_in=reshape_in, device=device)
model.eval()
acc = test(model, reshape_in, device=device)
#param = [p for p in model.param_grad() if p.grad is not None]
#if(len(param)!=0):
# print(param[-2],' / ', param[-2].grad)
# print(param[-1],' / ', param[-1].grad)
out = {"acc": acc, "log": log, "usr": usr}
with open("log/%s.json" % name, "w+") as f:
json.dump(out, f, indent=True)
times = [x["time"] for x in log]
print("Times (ms):", np.mean(times), "+/-", np.std(times))
print("Final accuracy:", acc)
#plot_res(log, fig_name='res/'+name)
return out
def make_adam_stack(height, top=0.0000001, device = torch.device('cuda')):
#print(height,device)
if height == 0:
return Adam(alpha=top, device=device)
return Adam(alpha=top, optimizer=make_adam_stack(height - 1, top, device=device), device=device)
def plot_res(log, fig_name='res'):
fig, ax = plt.subplots(ncols=3, figsize=(15, 3))
ax[0].set_title('Loss')
ax[0].plot([x["loss"] for x in log])
ax[1].set_title('Acc')
ax[1].plot([x["acc"] for x in log])
ax[2].set_title('mag')
ax[2].plot([x["data_aug"] for x in log])
plt.savefig(fig_name)
def print_torch_mem(add_info=''):
nb=0
max_size=0
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # and len(obj.size())>1:
#print(i, type(obj), obj.size())
size = np.sum(obj.size())
if(size>max_size): max_size=size
nb+=1
except:
pass
print(add_info, "-PyTorch tensor count:", nb, " / Max dim:", max_size)
def print_model(model, fig_name='graph/graph'): #Does not seem to work for models written in functional style
x = torch.randn(1,1,28,28, device=device)
dot=make_dot(model(x), params=dict(model.named_parameters()))
dot.format = 'svg' #https://graphviz.readthedocs.io/en/stable/manual.html#formats
dot.render(fig_name)
print("Model graph generated !")
def viz_data(fig_name='data_sample'):
features_, labels_ = next(iter(dl_train))
plt.figure(figsize=(10,10))
#for i, (features_, labels_) in enumerate(dl_train):
for i in range(25):
if i==25: break
#print(features_.size(), labels_.size())
plt.subplot(5,5,i+1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
img = features_[i,0,:,:]
#print('im shape',img.shape)
plt.imshow(img, cmap=plt.cm.binary)
plt.xlabel(labels_[i].item())
plt.savefig(fig_name)
##########################################
if __name__ == "__main__":
try:
os.mkdir("log")
except:
print("log/ exists already")
device = torch.device('cuda')
run(make_adam_stack(height=1, top=0.001, device=device),
"Augmented_MNIST",
epochs=100,
cnn=True,
device = device)
print()