Title
from nbdev import export2html
This is from the tutoring sessions that I run; it is a preview for study groups.
from fastai2.vision.all import *
# Download (if needed) and unpack the tiny MNIST subset; `path` is its root dir.
path=untar_data(URLs.MNIST_TINY)
# NOTE(review): hard-coded to the author's machine — just lists the fastai data dir.
Path('/home/fast/.fastai/data').ls()
# DataBlock: images in, category labels out; split by the train/valid grandparent
# folder, label from the parent folder name. No flipping: a flipped digit would
# change its meaning.
db=DataBlock((ImageBlock, CategoryBlock), get_items=get_image_files, splitter=GrandparentSplitter(),
get_y=parent_label,batch_tfms=aug_transforms(do_flip=False))
dls=db.dataloaders(path,bs=16)
dls.show_batch()
This is basic setup, we will do this every time we change the code to reset all the involved variables.
# Grab one batch and move it to the CPU so our hand-written optimizer and
# fastai's both operate on the same device.
x,labels = dls.one_batch()
x,labels=x.cpu(),labels.cpu()
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3200,2)) #very basic pytorch model, just linear operation, no activation, so not a deep model
l = nn.CrossEntropyLoss()
lr=0.1
opt=SGD(m.parameters(),lr)
pred=m(x)
loss=l(pred,labels)
loss.backward()  # populate p.grad on every parameter before stepping
class OurSGD:
    """Minimal SGD: one step computes `p - lr * p.grad` for each parameter.

    Unlike real implementations, `step` returns the updated parameters as a
    new list instead of modifying them in place.
    """
    def __init__(self, params, lr):
        # Materialize `params` so `step` can be called more than once even when
        # a generator (e.g. `m.parameters()`) is passed in; the original kept
        # the generator, which would be exhausted after one pass.
        self.params, self.lr = list(params), lr

    def step(self):
        updated_params = []
        for p in self.params:
            updated_params.append(p.add(-self.lr * p.grad))  # important part!!!
        return updated_params
The important part above is the p.add(-self.lr * p.grad) part. This is the essence of SGD. Notice that we are returning the updated parameters in a list. This is not done in the actual implementation, and instead everything is updated in place. Otherwise this is effectively the same as the Fastai source code.
our_sgd=OurSGD(m.parameters(),lr)
our_parameters=our_sgd.step()  # returns updated params; m itself is untouched
Q1. Please try to update the array on the left, in order to get them to be equal to our_parameters. Pay attention to how sgd is implemented.
# Exercise placeholder (Q1): edit the left-hand list so it equals our_parameters.
parameters_equal([p for p in m.parameters()],our_parameters)
Remember, running Fastai's optimizer will update the weights in place. So you will have to rerun the cells above before reattempting the problem.
opt.step()  # fastai's SGD updates m's weights in place
Here we do a comparison to Fastai's implementation, just to make sure we are getting the same values. We use allclose here, because later on there will start to be slight variations as more math (and therefore error!) is introduced.
def parameters_equal(mps, ops):
    """Print True/False per parameter pair, using allclose to tolerate float error."""
    # (idiom fix: dropped the redundant parentheses around `mps` in zip)
    for mp, op in zip(mps, ops):
        print(mp.allclose(op))
parameters_equal(m.parameters(),our_parameters)  # both updates should now agree
# Reset: fresh batch, fresh model, fresh optimizer (now with momentum).
x,labels = dls.one_batch()
x,labels=x.cpu(),labels.cpu()
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3200,2))
l = nn.CrossEntropyLoss()
lr=0.1
mom=0.9
opt=SGD(m.parameters(),lr,mom)
pred=m(x)
# All-zero targets keep the loss (and therefore the grads) easy to reproduce.
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
class OurSGDwithMomentum:
    """SGD where the raw gradient is replaced by a momentum-weighted running average."""
    def __init__(self, params, lr, mom):
        self.params, self.lr = list(params), lr
        self.mom = mom  ##added
        # The running average starts at zero for every parameter.
        self.avg_grad = [torch.zeros_like(p) for p in self.params]

    def step(self):
        # New-tensor update (not in place): p - lr * momentum-averaged grad.
        return [p.add(-self.lr * self.mom_grad(idx, p.grad))
                for idx, p in enumerate(self.params)]

    def mom_grad(self, i, grad):
        # avg_grad is a *weighted* average using momentum, not a mean.
        self.avg_grad[i] = self.mom * self.avg_grad[i] + grad
        return self.avg_grad[i]
Above we add momentum; it is important to realize that momentum is a weighted average and not the "mean." This weighted average is used quite a bit in machine learning, so it is good to get this concept down now. Momentum is one of the more important hyperparameters after learning rate and weight decay (coming up next).
# First step with momentum; the avg_grad state is created inside the optimizer.
our_sgd=OurSGDwithMomentum(m.parameters(),lr,mom)
our_parameters=our_sgd.step()
Q2, make this one work as in Q1. Pay attention to how momentum works in the code above.
# Exercise placeholder (Q2): fix the momentum term on the left so this prints True.
parameters_equal([p-lr*(mom+p.grad) for p in m.parameters()],our_parameters)
opt.step()  # fastai applies the same momentum update in place
parameters_equal(m.parameters(),our_parameters)
Now for step #2! We do a second step, as there is "state" within momentum, and we need to make sure that this state carries over to the second step.
Q3. Because of state changes, update the code here to get the correct answer below.
# Q3: the avg_grad state stored during step one must be folded into the
# expected step-two update.
our_answer=[p-lr*(our_sgd.avg_grad[i]+p.grad) for i,p in enumerate(m.parameters())] #loops through avg_grad now
our_parameters=our_sgd.step()
parameters_equal(our_answer,our_parameters) #this is testing your solution.
opt.step()
parameters_equal(m.parameters(),our_parameters)
For SGD Weight Decay and l2_Regularization are effectively the same. One on the weights, one on the gradients. This is not the same for more complicated optimizers.
# Reset again: fresh batch/model, optimizer now with momentum AND weight decay.
x,labels = dls.one_batch()
x,labels=x.cpu(),labels.cpu()
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3200,2))
l = nn.CrossEntropyLoss()
lr=0.1
mom=0.9
wd=0.01
opt=SGD(m.parameters(),lr,mom,wd)
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
class Momentum:
    """Optimizer callback: keeps a momentum-weighted running average of gradients."""
    def __init__(self, params, lr, mom):
        self.mom = mom
        # Materialize so params can be iterated on every call, not just once
        # (a generator would be exhausted after the first __call__).
        self.params = list(params)
        self.avg_grads = [torch.zeros_like(p) for p in self.params]  # avg_grad is weighted average using momentum
    def __call__(self, **kwargs):
        self.avg_grads = [self.mom * avg_grad + p.grad
                          for p, avg_grad in zip(self.params, self.avg_grads)]
        return {'avg_grads': self.avg_grads, **kwargs}
class Weight_Decay:
    """Optimizer callback: decoupled weight decay — shrink the weights directly."""
    def __init__(self, params, lr, wd):
        self.lr = lr
        self.wd = wd
        # Materialize so __call__ can iterate params more than once.
        self.params = list(params)
    def __call__(self, **kwargs):
        return {**kwargs, 'params': [p*(1-self.lr*self.wd) for p in self.params]}  #same as params-lr*wd*params, important part!!!!
class OurSGD:
    # SGD split into callbacks: Weight_Decay shrinks the weights, Momentum
    # averages the gradients, and step combines the two results.
    def __init__(self,params,lr,mom,wd):
        self.params,self.lr=list(params),lr
        self.mom=Momentum(self.params,self.lr,mom)
        self.wd=Weight_Decay(self.params,self.lr,wd)
    def step(self):
        updated_params=[]
        # NOTE(review): self.params is rebound to the decayed copies below, but
        # the Momentum callback still reads grads from the original parameter
        # list captured in __init__ — the two appear to stay in sync only
        # because fastai's optimizer updates those same tensors in place.
        self.params=self.wd()['params']
        avg_grads=self.mom()['avg_grads']
        for i,p in enumerate(self.params):
            updated_params.append(p.add(-self.lr*avg_grads[i]))
        return updated_params
Okay, things have gotten more complicated. We now split Momentum and Weight Decay out into two separate functions. These are optimizer callbacks in fastai. `[p*(1-self.lr*self.wd) for p in self.params]` is the important bit, as well as understanding the order in which the math is applied.
our_sgd=OurSGD(m.parameters(),lr,mom,wd)
# Reference implementation of the momentum math, used to build the expected answer.
def momentum(mom,avg_grad,p):
    return mom*avg_grad+p.grad
# Exercise stub (Q4): edit this so our_answers matches our_parameters below.
def weight_decay(wd):
    return wd
our_answers=[p-lr*momentum(mom,our_sgd.mom.avg_grads[i],p)-lr*weight_decay(wd) for i,p in enumerate(m.parameters())]
our_parameters=our_sgd.step()
Q4, make this one true by editing the weight_decay function above.
parameters_equal(our_answers,our_parameters)
opt.step()  # fastai's SGD with momentum + weight decay, in place
parameters_equal(m.parameters(),our_parameters)
Step two...
# Step two: zero the grads, recompute the loss, and check the stateful
# callbacks still agree with fastai.
opt.zero_grad()
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
our_parameters=our_sgd.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
l2 reg and weight decay have very similar effects, so no reason to use both.
# Fresh random input (no dataloader needed now); the Linear layer grows to
# 3872 inputs because the input images are 32x32 here.
x = torch.randn([10,3,32,32])
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3872,2))
l = nn.CrossEntropyLoss()
lr=0.1
mom=0.9
wd=0.01
# decouple_wd=False makes fastai use l2 regularization instead of weight decay.
opt=SGD(m.parameters(),lr,mom,wd,decouple_wd= False)
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
We are doing a bit of refactoring here to remove the momentum- and weight-decay-specific logic. We also split off the SGD-specific step while we are at it.
class Momentum:
    """Optimizer callback: momentum-weighted running average of the gradients."""
    def __init__(self, params=None, lr=0.0001, mom=0.9, **kwargs):
        self.mom = mom
        # Materialize so repeated calls can iterate params again (a generator
        # would be exhausted after building avg_grads).
        self.params = list(params)
        self.avg_grads = [torch.zeros_like(p) for p in self.params]  # avg_grad is weighted average using momentum
    def __call__(self, params=None, **kwargs):
        # Fall back to the params captured at construction time.
        params = self.params if params is None else params
        self.avg_grads = [self.mom * avg_grad + p.grad
                          for p, avg_grad in zip(params, self.avg_grads)]
        return {**kwargs, 'params': params, 'avg_grads': self.avg_grads}
class Weight_Decay:
    """Optimizer callback: weight decay (decoupled) or l2 regularization.

    decouple=True shrinks the weights directly; decouple=False adds wd*param
    to the gradients instead. For plain SGD the two are equivalent.
    """
    def __init__(self, params=None, lr=0.0001, wd=0.01, decouple=True, **kwargs):
        self.lr = lr
        self.wd = wd
        # Materialize: the param list is iterated on every call.
        self.params = list(params)
        self.decouple = decouple
    def __call__(self, **kwargs):
        params = self._do_wd() if self.decouple else self._do_l2_reg()
        return {**kwargs, 'params': params}
    def _do_wd(self, **kwargs):
        params = [p*(1-self.lr*self.wd) for p in self.params]  # same as params-lr*wd*params
        for p, mp in zip(params, self.params):
            p.grad = mp.grad  # carry the original grads over to the new tensors
        return params
    # this one is pretty ugly
    def _do_l2_reg(self, **kwargs):
        params = [deepcopy(p) for p in self.params]
        for p, mp in zip(params, self.params):
            p.grad = mp.grad + self.wd * mp  # l2 reg: penalty goes into the grads
        return params
class OurSGD:
    """SGD step callback: apply `p - lr * avg_grad` to every parameter.

    `hypers` lists the callbacks the optimizer runs before this step.
    """
    hypers = [Weight_Decay, Momentum]
    def __init__(self, params, lr, **kwargs):
        self.lr = lr
        self.params = params
    def __call__(self, params=None, avg_grads=None, **kwargs):
        stepped = [p.add(-self.lr * avg) for p, avg in zip(params, avg_grads)]
        return {**kwargs, 'params': stepped}
class OurOptimizer:
    """Chain the opt's hyper callbacks, then the step callback itself,
    threading one state dict through the whole pipeline."""
    def __init__(self, params, lr, opt, **kwargs):
        self.state = {'params': list(params), 'lr': lr}
        self.cbs = [cls(**self.state, **kwargs) for cls in [*opt.hypers, opt]]
    def step(self):
        current = self.state
        for cb in self.cbs:
            current = cb(**current)
        return current['params']
# decouple=False so our Weight_Decay does l2 reg, matching the fastai opt above.
our_opt=OurOptimizer(m.parameters(),lr,OurSGD,decouple=False)
our_parameters=our_opt.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
# Step two: the state dict still points at m's parameters (which fastai just
# updated in place), so the callbacks see the new weights and grads.
opt.zero_grad()
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
our_parameters=our_opt.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
Done with my refactoring. If you notice, there is an issue of lots of for...loops going over the same data. In fastai each function momentum/weight_decay/sgd works on a single parameter at a time, and that is encapsulated in a single for...loop, instead of my approach of passing all the parameters to a function that does the looping itself. I just got tired of refactoring at this point and decided to keep what I had... lots of refactoring happened outside this notebook.
# RMSProp setup: new random batch and model, plus fastai's RMSProp to compare against.
x = torch.randn([10,3,32,32])
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3872,2))
l = nn.CrossEntropyLoss()
lr=0.1
mom=0.9
wd=0.01
sqr_mom=0.95
opt=RMSProp(m.parameters(),lr,sqr_mom,mom,wd)
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
class Momentum:
    """Optimizer callback: momentum-weighted running average of the gradients."""
    def __init__(self, params=None, lr=0.0001, mom=0.9, **kwargs):
        self.mom = mom
        # Materialize so repeated calls can iterate params again (a generator
        # would be exhausted after building avg_grads).
        self.params = list(params)
        self.avg_grads = [torch.zeros_like(p) for p in self.params]  # avg_grad is weighted average using momentum
    def __call__(self, params=None, **kwargs):
        # Fall back to the params captured at construction time.
        params = self.params if params is None else params
        self.avg_grads = [self.mom * avg_grad + p.grad
                          for p, avg_grad in zip(params, self.avg_grads)]
        return {**kwargs, 'params': params, 'avg_grads': self.avg_grads}
class Weight_Decay:
    """Optimizer callback: weight decay (decoupled) or l2 regularization.

    decouple=True shrinks the weights directly; decouple=False adds wd*param
    to the gradients instead. For plain SGD the two are equivalent.
    """
    def __init__(self, params=None, lr=0.0001, wd=0.01, decouple=True, **kwargs):
        self.lr = lr
        self.wd = wd
        # Materialize: the param list is iterated on every call.
        self.params = list(params)
        self.decouple = decouple
    def __call__(self, **kwargs):
        params = self._do_wd() if self.decouple else self._do_l2_reg()
        return {**kwargs, 'params': params}
    def _do_wd(self, **kwargs):
        params = [p*(1-self.lr*self.wd) for p in self.params]  # same as params-lr*wd*params
        for p, mp in zip(params, self.params):
            p.grad = mp.grad  # carry the original grads over to the new tensors
        return params
    # this one is pretty ugly
    def _do_l2_reg(self, **kwargs):
        params = [deepcopy(p) for p in self.params]
        for p, mp in zip(params, self.params):
            p.grad = mp.grad + self.wd * mp  # l2 reg: penalty goes into the grads
        return params
class OurSGD:
    """SGD step callback: apply `p - lr * avg_grad` to every parameter.

    `hypers` lists the callbacks the optimizer runs before this step.
    """
    hypers = [Weight_Decay, Momentum]
    def __init__(self, params, lr, **kwargs):
        self.lr = lr
        self.params = params
    def __call__(self, params=None, avg_grads=None, **kwargs):
        stepped = [p.add(-self.lr * avg) for p, avg in zip(params, avg_grads)]
        return {**kwargs, 'params': stepped}
class OurOptimizer:
    """Chain the opt's hyper callbacks, then the step callback itself,
    threading one state dict through the whole pipeline."""
    def __init__(self, params, lr, opt, **kwargs):
        self.state = {'params': list(params), 'lr': lr}
        self.cbs = [cls(**self.state, **kwargs) for cls in [*opt.hypers, opt]]
    def step(self):
        current = self.state
        for cb in self.cbs:
            current = cb(**current)
        return current['params']
class Learning_Rate_Decay:
    """Optimizer callback: running average of squared gradients (RMSProp-style)."""
    def __init__(self, params=None, sqr_mom=0.99, **kwargs):
        self.sqr_mom = sqr_mom
        self.sqr_avgs = [torch.zeros_like(p) for p in params]
    def __call__(self, params=None, dampening=True, **kwargs):
        # BUG FIX: the damping factor must use self.sqr_mom. The original read
        # the notebook *global* `sqr_mom`, which only worked because a global
        # with the same value happened to exist.
        damp = 1 - self.sqr_mom if dampening else 1.
        self.sqr_avgs = [sqr_avg * self.sqr_mom + damp * p.grad.data ** 2
                         for p, sqr_avg in zip(params, self.sqr_avgs)]
        return {**kwargs, 'params': params, 'sqr_avgs': self.sqr_avgs}
class OurRMSProp:
    """RMSProp step: scale the (momentum-averaged) grad by the root of the
    squared-gradient running average before applying the learning rate."""
    hypers = [Weight_Decay, Momentum, Learning_Rate_Decay]
    def __init__(self, lr, params, **kwargs):
        self.lr = lr
        self.params = params
    def __call__(self, params=None, avg_grads=None, eps=1e-08, sqr_avgs=None, **kwargs):
        stepped = [p.add(-self.lr * avg / (sqr_avg ** 0.5 + eps))
                   for p, avg, sqr_avg in zip(params, avg_grads, sqr_avgs)]
        return {**kwargs, 'params': stepped}
our_opt=OurOptimizer(m.parameters(),lr,OurRMSProp,sqr_mom=0.95)
our_parameters=our_opt.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
# Step two: check the sqr_avgs/avg_grads state carries over correctly.
opt.zero_grad()
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
our_parameters=our_opt.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
# Adam setup: same drill, now comparing against fastai's Adam.
x = torch.randn([10,3,32,32])
m = nn.Sequential(nn.Conv2d(3,32,7,3,3),nn.Flatten(),nn.Linear(3872,2))
l = nn.CrossEntropyLoss()
lr=0.1
mom=0.9
wd=0.01
eps=1e-05
sqr_mom=0.95
opt=Adam(m.parameters(),lr,mom,sqr_mom,eps,wd)
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
class Weight_Decay:
    """Optimizer callback: weight decay (decoupled) or l2 regularization.

    decouple=True shrinks the weights directly; decouple=False adds wd*param
    to the gradients instead. For plain SGD the two are equivalent.
    """
    def __init__(self, params=None, lr=0.0001, wd=0.01, decouple=True, **kwargs):
        self.lr = lr
        self.wd = wd
        # Materialize: the param list is iterated on every call.
        self.params = list(params)
        self.decouple = decouple
    def __call__(self, **kwargs):
        params = self._do_wd() if self.decouple else self._do_l2_reg()
        return {**kwargs, 'params': params}
    def _do_wd(self, **kwargs):
        params = [p*(1-self.lr*self.wd) for p in self.params]  # same as params-lr*wd*params
        for p, mp in zip(params, self.params):
            p.grad = mp.grad  # carry the original grads over to the new tensors
        return params
    # this one is pretty ugly
    def _do_l2_reg(self, **kwargs):
        params = [deepcopy(p) for p in self.params]
        for p, mp in zip(params, self.params):
            p.grad = mp.grad + self.wd * mp  # l2 reg: penalty goes into the grads
        return params
class OurSGD:
    """SGD step callback: apply `p - lr * avg_grad` to every parameter.

    `hypers` lists the callbacks the optimizer runs before this step.
    """
    hypers = [Weight_Decay, Momentum]
    def __init__(self, params, lr, **kwargs):
        self.lr = lr
        self.params = params
    def __call__(self, params=None, avg_grads=None, **kwargs):
        stepped = [p.add(-self.lr * avg) for p, avg in zip(params, avg_grads)]
        return {**kwargs, 'params': stepped}
class OurOptimizer:
    """Chain the opt's hyper callbacks, then the step callback itself,
    threading one state dict through the whole pipeline."""
    def __init__(self, params, lr, opt, **kwargs):
        self.state = {'params': list(params), 'lr': lr}
        self.cbs = [cls(**self.state, **kwargs) for cls in [*opt.hypers, opt]]
    def step(self):
        current = self.state
        for cb in self.cbs:
            current = cb(**current)
        return current['params']
class Learning_Rate_Decay:
    """Optimizer callback: running average of squared gradients (RMSProp-style)."""
    def __init__(self, params=None, sqr_mom=0.99, **kwargs):
        self.sqr_mom = sqr_mom
        self.sqr_avgs = [torch.zeros_like(p) for p in params]
    def __call__(self, params=None, dampening=True, **kwargs):
        # BUG FIX: the damping factor must use self.sqr_mom. The original read
        # the notebook *global* `sqr_mom`, which only worked because a global
        # with the same value happened to exist.
        damp = 1 - self.sqr_mom if dampening else 1.
        self.sqr_avgs = [sqr_avg * self.sqr_mom + damp * p.grad.data ** 2
                         for p, sqr_avg in zip(params, self.sqr_avgs)]
        return {**kwargs, 'params': params, 'sqr_avgs': self.sqr_avgs}
class OurRMSProp:
    """RMSProp step: scale the (momentum-averaged) grad by the root of the
    squared-gradient running average before applying the learning rate."""
    hypers = [Weight_Decay, Momentum, Learning_Rate_Decay]
    def __init__(self, lr, params, **kwargs):
        self.lr = lr
        self.params = params
    def __call__(self, params=None, avg_grads=None, eps=1e-08, sqr_avgs=None, **kwargs):
        stepped = [p.add(-self.lr * avg / (sqr_avg ** 0.5 + eps))
                   for p, avg, sqr_avg in zip(params, avg_grads, sqr_avgs)]
        return {**kwargs, 'params': stepped}
class Step:
    """Optimizer callback that counts how many steps have been taken (Adam
    needs the step number for its bias correction)."""
    def __init__(self, **kwargs):
        self.step = 0
    def __call__(self, **kwargs):
        self.step = self.step + 1
        return {'step': self.step, **kwargs}
class Momentum:
    """Optimizer callback: *dampened* momentum average, as Adam uses it:
    avg = mom*avg + (1-mom)*grad."""
    def __init__(self, params=None, lr=0.0001, mom=0.9, **kwargs):
        self.mom = mom
        # Materialize so repeated calls can iterate params again (a generator
        # would be exhausted after building avg_grads).
        self.params = list(params)
        self.avg_grads = [torch.zeros_like(p) for p in self.params]  # avg_grad is weighted average using momentum
    def __call__(self, params=None, **kwargs):
        params = self.params if params is None else params
        self.avg_grads = [self.mom * avg_grad + (1 - self.mom) * p.grad
                          for p, avg_grad in zip(params, self.avg_grads)]
        return {**kwargs, 'params': params, 'avg_grads': self.avg_grads}
class OurAdam:
    """Adam step: debias both running averages by their step-dependent factor,
    then apply p - lr * avg_grad / (sqrt(sqr_avg) + eps)."""
    hypers = [Weight_Decay, Momentum, Learning_Rate_Decay, Step]
    def __init__(self, lr, params, mom=0.9, sqr_mom=0.99, eps=1e-08, **kwargs):
        self.lr = lr
        self.params = params
        self.mom = mom
        self.sqr_mom = sqr_mom
        self.eps = eps
    def __call__(self, step=1, params=None, avg_grads=None, sqr_avgs=None, **kwargs):  # eps=1e-08
        # BUG FIX: the original read the notebook globals `sqr_mom`, `mom` and
        # `lr` here instead of the values stored on self; it only worked
        # because those globals happened to hold the same numbers.
        sqr_avgs = [sqr_avg / (1 - self.sqr_mom ** step) for sqr_avg in sqr_avgs]
        avg_grads = [avg_grad / (1 - self.mom ** step) for avg_grad in avg_grads]
        # addcdiv with the `value=` keyword (the deprecated positional form
        # `p.addcdiv(-lr, t1, t2)` breaks on newer torch versions).
        return {**kwargs, 'params': [p.addcdiv(grad_avg, sqr_avg.sqrt() + self.eps, value=-self.lr)
                                     for p, grad_avg, sqr_avg in zip(params, avg_grads, sqr_avgs)]}
our_opt=OurOptimizer(m.parameters(),lr,OurAdam,eps=eps,sqr_mom=0.95)
our_parameters=our_opt.step()
opt.step()  # fastai's Adam, in place
#shows parameters close to not being equal
def parameters_equal_show(mps, ops):
    # Show, for the first parameter pair only, the elements that are NOT
    # within 1e-08 of each other.
    for mp, op in zip(mps, ops):
        mask = (mp - op).abs() > 1e-08
        print(mp.masked_select(mask), op.masked_select(mask))
        break
parameters_equal_show(m.parameters(),our_parameters)
parameters_equal(m.parameters(),our_parameters)  # allclose still passes overall
# Step two for Adam: the momentum averages, squared averages, and step count
# must all carry over correctly.
opt.zero_grad()
pred=m(x)
loss=l(pred,torch.zeros([pred.size()[0]],dtype=torch.long))
loss.backward()
our_parameters=our_opt.step()
opt.step()
parameters_equal(m.parameters(),our_parameters)
opt.hypers  # peek at fastai's own hyper-parameter state for comparison
# Export this notebook to HTML for the blog (fastpages template).
export2html.notebook2html(fname='2020-07-15-Optimizers.ipynb', dest='html/', template_file='fastpages.tpl',n_workers=1)