Mirror of https://github.com/AntoineHX/smart_augmentation.git, synced 2025-05-04 12:10:45 +02:00
Initial Commit
Commit 3ae3e02e59: 44 changed files with 4908 additions and 0 deletions
Gradient-Descent-The-Ultimate-Optimizer/hyperopt_v2.py (new file, 296 lines)

@@ -0,0 +1,296 @@
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer

class Optimizable():
    """
    This is the interface for anything that has parameters that need to be
    optimized, somewhat like torch.nn.Module but with the right plumbing for
    hyperoptimizability. (Specifically, torch.nn.Module uses the Parameter
    interface, which does not give us enough control over the detachments.)
    Nominal operation of an Optimizable at the lowest level is as follows:
        o = MyOptimizable(…)
        o.initialize()
        loop {
            o.begin()
            o.zero_grad()
            loss = –compute loss function from parameters–
            loss.backward()
            o.adjust()
        }
    Optimizables recursively handle updates to their optimiz*ers*.
    """
    #def __init__(self):
    #    super(Optimizable, self).__init__()
    #    self.parameters = nn.Parameter(torch.zeros(()))

    def __init__(self, parameters, optimizer):
        self.params = parameters # a dict mapping names to tensors
        self.optimizer = optimizer # which must itself be Optimizable!
        self.all_params_with_gradients = []
        #self.device = device

    def initialize(self):
        """Initialize parameters, e.g. with a Kaiming initializer."""
        pass

    def begin(self):
        """Enable gradient tracking on current parameters."""
        self.all_params_with_gradients = nn.ParameterList() # re-initialized to avoid memory build-up
        print("Opti param :", type(self.params))
        #for name, param in self.params:
        if isinstance(self.params, (dict, nn.ParameterDict)): # dict (also handles nn.ParameterDict, e.g. Adam below)
            for name, param in self.params.items():
                param.requires_grad_() # keep gradient information…
                param.retain_grad()    # even if not a leaf…
                self.all_params_with_gradients.append(param)
        if isinstance(self.params, list): # list
            for param in self.params:
                param.requires_grad_() # keep gradient information…
                param.retain_grad()    # even if not a leaf…
                self.all_params_with_gradients.append(param)
        self.optimizer.begin()

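    # (added note) requires_grad_() marks a tensor for gradient tracking, while
    # retain_grad() keeps its .grad populated even when it is no longer a leaf,
    # which happens once a parameter has itself been produced by a previous
    # differentiable update. Both are needed for the hyperoptimization chain.
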
    def zero_grad(self):
        """ Set all gradients to zero. """
        for param in self.all_params_with_gradients:
            param.grad = torch.zeros(param.shape, device=param.device)
        self.optimizer.zero_grad()

    """ Note: at this point you would probably call .backward() on the loss
    function. """

    def adjust(self):
        """ Update parameters """
        pass


class NoOpOptimizer(Optimizable):#, nn.Module):
    """
    NoOpOptimizer sits on top of a stack, and does not affect what lies below.
    """

    def __init__(self):
        #super(Optimizable, self).__init__()
        pass

    def initialize(self):
        pass

    def begin(self):
        #print("NoOpt begin")
        pass

    def zero_grad(self):
        pass

    def adjust(self, params):
        pass

    def step(self):
        pass

    def print_grad_fn(self):
        pass

    def __str__(self):
        return "static"


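# --- Usage sketch (illustrative; MyOptimizable and its toy loss are hypothetical,
# only Optimizable/NoOpOptimizer above are real) ------------------------------
# The nominal loop from the Optimizable docstring, spelled out:
#
#   class MyOptimizable(Optimizable):
#       def __init__(self, optimizer):
#           super().__init__({"w": torch.ones(3)}, optimizer)
#       def adjust(self):
#           self.optimizer.adjust(self.params)
#
#   o = MyOptimizable(NoOpOptimizer())
#   o.initialize()
#   for _ in range(10):
#       o.begin()                          # (re)enable gradient tracking
#       o.zero_grad()
#       loss = (o.params["w"] ** 2).sum()  # compute loss from the parameters
#       loss.backward()
#       o.adjust()                         # delegate the update to the optimizer stack
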
class SGD(Optimizer, nn.Module): # Avoid inheriting from Optimizer?
    """
    A hyperoptimizable SGD.
    """

    def __init__(self, params, lr=0.01, height=0):
        self.height = height
        # params: the parameters to optimize
        # the rest (defaults) are the optimizer's own (hyper)parameters
        print('SGD - H', height)
        nn.Module.__init__(self)

        optim_keys = ('lr','') # Move into Optimizable? The '' avoids iterating over the characters of the string...
        '''
        self_params = {"lr": torch.tensor(lr),
                       "momentum": 0,
                       "dampening": 0,
                       "weight_decay": 0,
                       "nesterov": False}
        '''
        #self_params = dict(lr=torch.tensor(lr),
        #    momentum=0, dampening=0, weight_decay=0, nesterov=False)

        self_params = nn.ParameterDict({
            "lr": nn.Parameter(torch.tensor(lr)),
            "momentum": nn.Parameter(torch.tensor(0.0)),
            "dampening": nn.Parameter(torch.tensor(0.0)),
            "weight_decay": nn.Parameter(torch.tensor(0.0)),
        })

        for k in self_params.keys() & optim_keys:
            self_params[k].requires_grad_() # keep gradient information…
            self_params[k].retain_grad()    # even if not a leaf…
            #self_params[k].register_hook(print)

        if height == 0:
            optimizer = NoOpOptimizer()
        else:
            #def dict_generator(): yield {k: self_params[k] for k in self_params.keys() & optim_keys}
            #(dict for dict in {k: self_params[k] for k in self_params.keys() & optim_keys}) # Should work
            optimizer = SGD(params=(self_params[k] for k in self_params.keys() & optim_keys), lr=lr, height=height-1)
            #optimizer.register_backward_hook(print)

        self.optimizer = optimizer
        #if(height==0):
        #    for n,p in params.items():
        #        print(n,p)

        #Optimizable.__init__(self, self_params, optimizer)

        #print(type(params))
        #for p in params:
        #    print(type(p))
        Optimizer.__init__(self, params, self_params)

        for group in self.param_groups:
            for p in group['params']:
                print(type(p.data), p.size())
        print('End SGD-H', height)

    def begin(self):
        for group in self.param_groups:
            for p in group['params']:
                #print(type(p.data), p.size())
                p.requires_grad_() # keep gradient information…
                p.retain_grad()    # even if not a leaf…
                #p.register_hook(lambda x: print(self.height, x.grad_fn))

        self.optimizer.begin()

    def print_grad_fn(self):
        self.optimizer.print_grad_fn()
        for group in self.param_groups:
            for i, p in enumerate(group['params']):
                print(self.height, " - ", i, p.grad_fn)

    #def adjust(self, params):
    #    self.optimizer.adjust(self.params)
    #    for name, param in params.items():
    #        g = param.grad.detach()
    #        params[name] = param.detach() - g * self.params["lr"]

    def step(self):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        print('SGD start')
        self.optimizer.step()

        for group in self.param_groups:
            for i, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                #d_p = p.grad.data
                d_p = p.grad.detach()

                #print(group['lr'])
                #p.data.add_(-group['lr'].item(), d_p) # deprecated overload; together with the line below it would apply the update twice
                #group['params'][i] = p.detach() - d_p * group['lr']
                p.data -= group['lr']*d_p # better not to use .data: gradient information is lost

            for p in group['params']:
                if p.grad is None:
                    print(p, p.grad)
                    continue

        print("SGD end")
        #return loss

    def __str__(self):
        return "sgd(%f) / " % self.defaults["lr"] + str(self.optimizer) # self.defaults holds self_params (see Optimizer.__init__ above)


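# --- Usage sketch (illustrative) ---------------------------------------------
# With height > 0, SGD.__init__ recursively builds another SGD one level below
# that receives this level's 'lr' parameter as its own parameters to optimize,
# bottoming out in a NoOpOptimizer:
#
#   model = torchvision.models.resnet18()
#   optim = SGD(model.parameters(), lr=0.01, height=2)
#   print(optim)   # something like: sgd(0.010000) / sgd(0.010000) / sgd(0.010000) / static
#
# step() first calls self.optimizer.step(), letting the level below move this
# level's learning rate along its gradient, then applies the (possibly updated)
# learning rate to the actual model parameters.
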
class Adam(Optimizable, nn.Module):
    """
    A fully hyperoptimizable Adam optimizer
    """

    def clamp(x):
        return (x.tanh() + 1.0) / 2.0

    def unclamp(y):
        z = y * 2.0 - 1.0
        return ((1.0 + z) / (1.0 - z)).log() / 2.0

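    # (added note) clamp maps any real number into (0, 1) via (tanh(x) + 1) / 2,
    # and unclamp is its inverse, atanh(2*y - 1). beta1/beta2 are stored
    # "unclamped" so gradient descent can move them freely over the reals while
    # the values actually used remain valid coefficients in (0, 1), e.g.
    #   Adam.clamp(Adam.unclamp(torch.tensor(0.9)))   # ~ tensor(0.9000)
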
    def __init__(
        self,
        alpha=0.001,
        beta1=0.9,
        beta2=0.999,
        log_eps=-8.0,
        optimizer=NoOpOptimizer(),
        device=torch.device('cuda')
    ):
        #super(Adam, self).__init__()
        nn.Module.__init__(self)
        self.device = device
        params = nn.ParameterDict({
            "alpha": nn.Parameter(torch.tensor(alpha, device=self.device)),
            "beta1": nn.Parameter(Adam.unclamp(torch.tensor(beta1, device=self.device))),
            "beta2": nn.Parameter(Adam.unclamp(torch.tensor(beta2, device=self.device))),
            "log_eps": nn.Parameter(torch.tensor(log_eps, device=self.device)),
        })
        Optimizable.__init__(self, params, optimizer)
        self.num_adjustments = 0
        self.cache = {}

        for name, param in params.items():
            param.requires_grad_() # keep gradient information…
            param.retain_grad()    # even if not a leaf…

    def adjust(self, params, pytorch_mod=False):
        self.num_adjustments += 1
        self.optimizer.adjust(self.params)
        t = self.num_adjustments
        beta1 = Adam.clamp(self.params["beta1"])
        beta2 = Adam.clamp(self.params["beta2"])

        updated_param = {} # a dict, since it is indexed by parameter name below
        if pytorch_mod:
            module = params # keep a handle on the module so the update can be written back
            params = module.named_parameters(prefix='') # rename the input argument...

        for name, param in params:
            if name not in self.cache:
                self.cache[name] = {
                    "m": torch.zeros(param.shape, device=self.device),
                    "v": torch.zeros(param.shape, device=self.device)
                    + 10.0 ** self.params["log_eps"].data
                    # NOTE that we add a little 'fudge factor' here because sqrt is not
                    # differentiable at exactly zero
                }
            #print(name, param.device)
            g = param.grad.detach()
            self.cache[name]["m"] = m = (
                beta1 * self.cache[name]["m"].detach() + (1.0 - beta1) * g
            )
            self.cache[name]["v"] = v = (
                beta2 * self.cache[name]["v"].detach() + (1.0 - beta2) * g * g
            )
            self.all_params_with_gradients.append(nn.Parameter(m)) # Risk of memory build-up => would a dict be better?
            self.all_params_with_gradients.append(nn.Parameter(v))
            m_hat = m / (1.0 - beta1 ** float(t))
            v_hat = v / (1.0 - beta2 ** float(t))
            dparam = m_hat / (v_hat ** 0.5 + 10.0 ** self.params["log_eps"])
            updated_param[name] = param.detach() - self.params["alpha"] * dparam

        if pytorch_mod: module.update(updated_param) # rename the input argument... (assumes the module provides an update(dict) method)
        else: params = updated_param # NOTE: this only rebinds the local name

    def __str__(self):
        return "adam(" + str(self.params) + ") / " + str(self.optimizer)
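
# --- Usage sketch (illustrative; SomeModuleWithUpdate and loader are hypothetical) ---
# Driving the hyperoptimizable Adam over a model, as adjust() expects when
# pytorch_mod=True. This assumes the wrapped module exposes an update(dict)
# method that writes the new tensors back into its parameters.
#
#   model = SomeModuleWithUpdate()                  # hypothetical wrapper module
#   hyper = Adam(alpha=1e-3, device=torch.device('cpu'))
#   hyper.initialize()
#   for x, y in loader:
#       hyper.begin()
#       hyper.zero_grad()
#       loss = F.cross_entropy(model(x), y)
#       loss.backward()
#       hyper.adjust(model, pytorch_mod=True)       # hyper.optimizer can in turn adjust
#                                                   # Adam's own alpha/beta1/beta2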