# smart_augmentation/Gradient-Descent-The-Ultimate-Optimizer/hyperopt.py
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Optimizable:
    """
    This is the interface for anything that has parameters that need to be
    optimized, somewhat like torch.nn.Module but with the right plumbing for
    hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
    interface, which does not give us enough control over the detachments.)
    Nominal operation of an Optimizable at the lowest level is as follows:

        o = MyOptimizable(...)
        o.initialize()
        loop {
            o.begin()
            o.zero_grad()
            loss = compute loss function from parameters
            loss.backward()
            o.adjust()
        }

    Optimizables recursively handle updates to their optimiz*ers*.
    """

    def __init__(self, parameters, optimizer):
        self.parameters = parameters  # a dict mapping names to tensors
        self.optimizer = optimizer  # which must itself be Optimizable!
        self.all_params_with_gradients = []

    def initialize(self):
        """Initialize parameters, e.g. with a Kaiming initializer."""
        pass

    def begin(self):
        """Enable gradient tracking on current parameters."""
        self.all_params_with_gradients = []  # reset each step to avoid holding stale tensors in memory
        for name, param in self.parameters.items():
            param.requires_grad_()  # keep gradient information...
            param.retain_grad()  # ...even if not a leaf
            self.all_params_with_gradients.append(param)
        self.optimizer.begin()

    def zero_grad(self):
        """Set all gradients to zero."""
        for param in self.all_params_with_gradients:
            param.grad = torch.zeros_like(param)
        self.optimizer.zero_grad()

    # Note: at this point you would typically call .backward() on the loss
    # function.

    def adjust(self):
        """Update parameters."""
        pass

    def print_grad_fn(self):
        self.optimizer.print_grad_fn()
        for n, p in self.parameters.items():
            print(n, " - ", p.grad_fn)

    def param_grad(self):
        return self.all_params_with_gradients

    def param(self, param_name):
        return self.parameters[param_name].item()
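

# A minimal sketch of how the Optimizable protocol is implemented and driven.
# QuadraticBowl and run_example_loop are illustrative names only, not part of
# the training pipeline; they just make the docstring's loop concrete.
class QuadraticBowl(Optimizable):
    """Minimizes ||theta||^2, purely to illustrate the protocol."""

    def __init__(self, dim, optimizer):
        super().__init__({"theta": torch.randn(dim)}, optimizer)

    def loss(self):
        return (self.parameters["theta"] ** 2).sum()

    def adjust(self):
        # Delegate the actual update to the optimizer below us in the stack.
        self.optimizer.adjust(self.parameters)


def run_example_loop(o, steps=100):
    """Drive any Optimizable that exposes a .loss() method, e.g.
    run_example_loop(QuadraticBowl(10, Adam(device=torch.device("cpu"))))."""
    o.initialize()
    for _ in range(steps):
        o.begin()
        o.zero_grad()
        loss = o.loss()
        loss.backward()
        o.adjust()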


class MNIST_FullyConnected(Optimizable):
    """
    A fully-connected NN for the MNIST task. This is Optimizable but not itself
    an optimizer.
    """

    def __init__(self, num_inp, num_hid, num_out, optimizer):
        # F.linear expects weights of shape (out_features, in_features),
        # hence the transposes; .t() on the 1-D biases is a no-op.
        parameters = {
            "w1": torch.zeros(num_inp, num_hid).t(),
            "b1": torch.zeros(num_hid).t(),
            "w2": torch.zeros(num_hid, num_out).t(),
            "b2": torch.zeros(num_out).t(),
        }
        super().__init__(parameters, optimizer)

    def initialize(self):
        nn.init.kaiming_uniform_(self.parameters["w1"], a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.parameters["w2"], a=math.sqrt(5))
        self.optimizer.initialize()

    def forward(self, x):
        """Compute a prediction."""
        x = F.linear(x, self.parameters["w1"], self.parameters["b1"])
        x = torch.tanh(x)
        x = F.linear(x, self.parameters["w2"], self.parameters["b2"])
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)
        return x

    def adjust(self):
        self.optimizer.adjust(self.parameters)

    def __str__(self):
        return "mnist / " + str(self.optimizer)


class NoOpOptimizer(Optimizable):
    """
    NoOpOptimizer sits at the top of a stack and does not affect what lies
    below.
    """

    def __init__(self):
        pass

    def initialize(self):
        pass

    def begin(self):
        pass

    def zero_grad(self):
        pass

    def adjust(self, params):
        pass

    def adjust_val(self, params):
        pass

    def print_grad_fn(self):
        pass

    def __str__(self):
        return "static"


class Adam(Optimizable):
    """
    A fully hyperoptimizable Adam optimizer.
    """

    @staticmethod
    def clamp(x):
        return (x.tanh() + 1.0) / 2.0

    @staticmethod
    def unclamp(y):
        z = y * 2.0 - 1.0
        return ((1.0 + z) / (1.0 - z)).log() / 2.0
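
    # clamp maps an unconstrained real x into (0, 1) via (tanh(x) + 1) / 2;
    # unclamp is its exact inverse, artanh(2y - 1) = log((1 + z) / (1 - z)) / 2
    # with z = 2y - 1. Storing beta1/beta2 unclamped lets gradient descent move
    # them freely while the values used by the update rule stay strictly
    # between 0 and 1.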

    def __init__(
        self,
        alpha=0.001,
        beta1=0.9,
        beta2=0.999,
        log_eps=-8.0,
        optimizer=NoOpOptimizer(),
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        self.device = device
        parameters = {
            "alpha": torch.tensor(alpha, device=self.device),
            "beta1": Adam.unclamp(torch.tensor(beta1, device=self.device)),
            "beta2": Adam.unclamp(torch.tensor(beta2, device=self.device)),
            "log_eps": torch.tensor(log_eps, device=self.device),
        }
        super().__init__(parameters, optimizer)
        self.num_adjustments = 0
        self.num_adjustments_val = 0
        self.cache = {}
        for name, param in parameters.items():
            param.requires_grad_()  # keep gradient information...
            param.retain_grad()  # ...even if not a leaf
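
    # Both adjust() and adjust_val() implement the standard Adam step:
    #     m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
    #     v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
    #     m_hat = m_t / (1 - beta1^t),  v_hat = v_t / (1 - beta2^t)
    #     theta_t = theta_{t-1} - alpha * m_hat / (sqrt(v_hat) + eps),  eps = 10^log_eps
    # The step stays differentiable with respect to alpha, beta1, beta2 and
    # log_eps, which is what lets hypergradients flow to them. The only
    # difference between the two methods is which entries they touch: adjust()
    # skips the augmentation magnitude "mag"; adjust_val() updates only it.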

    def adjust(self, params):
        """Update the learning parameters (every entry except "mag")."""
        self.num_adjustments += 1
        self.optimizer.adjust(self.parameters)
        t = self.num_adjustments
        beta1 = Adam.clamp(self.parameters["beta1"])
        beta2 = Adam.clamp(self.parameters["beta2"])
        for name, param in params.items():
            if name == "mag":
                continue
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape, device=self.device),
                    # NOTE: we add a little fudge factor here because sqrt is
                    # not differentiable at exactly zero
                    "v": torch.zeros(param.shape, device=self.device)
                    + 10.0 ** self.parameters["log_eps"].data,
                }
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
            params[name] = param.detach() - self.parameters["alpha"] * dparam

    def adjust_val(self, params):
        """Update the transformation (data-augmentation) parameters (only "mag")."""
        self.num_adjustments_val += 1
        self.optimizer.adjust_val(self.parameters)
        t = self.num_adjustments_val
        beta1 = Adam.clamp(self.parameters["beta1"])
        beta2 = Adam.clamp(self.parameters["beta2"])
        for name, param in params.items():
            if name != "mag":
                continue
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape, device=self.device),
                    # NOTE: we add a little fudge factor here because sqrt is
                    # not differentiable at exactly zero
                    "v": torch.zeros(param.shape, device=self.device)
                    + 10.0 ** self.parameters["log_eps"].data,
                }
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.parameters["log_eps"])
            params[name] = param.detach() - self.parameters["alpha"] * dparam

    def __str__(self):
        return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)

'''
class SGD(Optimizable):
    """
    A hyperoptimizable SGD.
    """

    def __init__(self, alpha=0.01, optimizer=NoOpOptimizer()):
        parameters = {"alpha": torch.tensor(alpha)}
        super().__init__(parameters, optimizer)

    def adjust(self, params):
        self.optimizer.adjust(self.parameters)
        for name, param in params.items():
            g = param.grad.detach()
            params[name] = param.detach() - g * self.parameters["alpha"]

    def __str__(self):
        return "sgd(%f) / " % self.parameters["alpha"] + str(self.optimizer)


class SGDPerParam(Optimizable):
    """
    Like SGD above, but can be taught a separate step size for each parameter
    it tunes.
    """

    def __init__(self, alpha=0.01, params=[], optimizer=NoOpOptimizer()):
        parameters = {name + "_alpha": torch.tensor(alpha) for name in params}
        super().__init__(parameters, optimizer)

    def adjust(self, params):
        self.optimizer.adjust(self.parameters)
        for name, param in params.items():
            g = param.grad.detach()
            params[name] = param.detach() - g * self.parameters[name + "_alpha"]

    def __str__(self):
        return "sgd(%s) / " % str(
            {k: t.item() for k, t in self.parameters.items()}
        ) + str(self.optimizer)
'''
'''
class AdamBaydin(Optimizable):
    """
    Same as Adam above, but only optimizes the learning rate, treating the
    remaining hyperparameters as constants.
    """

    def __init__(
        self,
        alpha=0.001,
        beta1=0.9,
        beta2=0.999,
        log_eps=-8.0,
        optimizer=NoOpOptimizer(),
    ):
        parameters = {"alpha": torch.tensor(alpha)}
        self.beta1 = beta1
        self.beta2 = beta2
        self.log_eps = log_eps
        super().__init__(parameters, optimizer)
        self.num_adjustments = 0
        self.cache = {}

    def adjust(self, params):
        self.num_adjustments += 1
        self.optimizer.adjust(self.parameters)
        t = self.num_adjustments
        beta1 = self.beta1
        beta2 = self.beta2
        for name, param in params.items():
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape),
                    "v": torch.zeros(param.shape) + 10.0 ** self.log_eps,
                }
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(m)
            self.all_params_with_gradients.append(v)
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.log_eps)
            params[name] = param.detach() - self.parameters["alpha"] * dparam

    def __str__(self):
        return "adam(" + str(self.parameters) + ") / " + str(self.optimizer)
'''