from nbdev import export2html
from fastai2.basics import *
from fastai2.vision.all import *
path=untar_data(URLs.IMAGENETTE)
db=DataBlock((ImageBlock, CategoryBlock), get_items=get_image_files, splitter=GrandparentSplitter(valid_name='val'),
                   get_y=parent_label,item_tfms=Resize(420),batch_tfms=aug_transforms(size=240))
dls=db.dataloaders(path)

FP32 Training

We start with a normal fp32 trained model as a baseline.

learner=cnn_learner(dls,resnet50,pretrained=False)
learner.lr_find()
(0.0015848932787775993, 7.585775847473997e-07)
learner.fit_one_cycle(30,lr_max=0.00275)
epoch train_loss valid_loss time
0 2.651336 2.056535 01:32
1 2.355040 2.005668 01:33
2 1.932189 1.904302 01:33
3 1.597234 11.361675 01:33
4 1.417671 1.946748 01:33
5 1.845148 14.619100 01:33
6 1.722217 3.752728 01:33
7 1.463190 1.973663 01:34
8 1.256870 1.894250 01:33
9 1.120424 2.125210 01:34
10 0.985170 1.701216 01:34
11 0.905693 1.357060 01:34
12 0.827493 0.917599 01:34
13 0.760628 0.794275 01:34
14 0.706049 0.966785 01:34
15 0.672275 0.784713 01:34
16 0.624744 1.014771 01:34
17 0.555164 0.583922 01:34
18 0.504771 0.766427 01:34
19 0.486427 0.609142 01:34
20 0.437780 0.533482 01:34
21 0.411537 0.425947 01:34
22 0.363749 0.550511 01:34
23 0.354133 0.402780 01:34
24 0.296108 0.394826 01:35
25 0.288457 0.379434 01:34
26 0.280953 0.379418 01:34
27 0.264525 0.366601 01:34
28 0.252890 0.366681 01:34
29 0.244159 0.364107 01:34
learner.recorder.plot_loss(skip_start=20,with_valid=False)
del learner

FP16

This is a model trained purely in fp16; this isn't really done in practice, but it gives us a point of comparison.

class MixedPrecision(Callback):
    "Run training in mixed precision"
    run_before = Recorder

    def __init__(self):
        assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."

    def begin_batch(self): self.learn.xb = to_half(self.xb)
    def after_batch(self): self.learn.loss = to_float(self.learn.loss)
class ModelToHalf(Callback):
    "Use with MixedPrecision callback (but it needs to run at the very beginning)"
    run_before=TrainEvalCallback
    def begin_fit(self): self.learn.model = self.model.half() 
    def after_fit(self): self.learn.model = self.model.float() #convert back to float, for saving and such
learner=cnn_learner(dls,resnet50,pretrained=False)
learner.add_cbs((ModelToHalf(),MixedPrecision()))
learner.fit_one_cycle(30,lr_max=0.00275)
learner.recorder.plot_loss(skip_start=20)
epoch train_loss valid_loss time
0 2.771560 2.216975 01:08
1 2.476073 2.158066 01:05
2 2.018307 1.633934 01:05
3 1.684251 1.469444 01:05
4 1.457907 nan 01:05
5 1.428203 1.248842 01:05
6 1.536437 3.168239 01:05
7 1.225266 1.096525 01:05
8 1.103817 1.346189 01:05
9 0.969087 1.327698 01:05
10 0.868427 1.027902 01:05
11 0.785972 1.644904 01:04
12 0.724989 1.882721 01:05
13 0.677303 0.913675 01:05
14 0.621354 0.907284 01:04
15 0.583794 0.668342 01:05
16 0.539824 1.777023 01:04
17 0.486352 0.597984 01:04
18 0.448464 1.210160 01:04
19 0.410327 0.628322 01:04
20 0.390925 0.589199 01:04
21 0.358350 0.448581 01:05
22 0.329511 0.430788 01:04
23 0.292921 0.467403 01:05
24 0.274056 0.421481 01:05
25 0.237699 0.403993 01:05
26 0.226854 0.395099 01:05
27 0.231199 0.392145 01:05
28 0.222327 0.391661 01:05
29 0.220277 0.392155 01:05
del learner

FP16 with FP32 BatchNorm

We now use ModelToHalf from fastai2, whose convert_network function casts the model to a specific data type while leaving the batchnorm layers in fp32.
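For reference, the core idea behind convert_network can be sketched in a few lines of plain PyTorch. This is a simplified illustration, not fastai2's actual implementation, and the helper name half_except_batchnorm is made up here:

import torch.nn as nn

def half_except_batchnorm(model: nn.Module) -> nn.Module:
    "Simplified sketch: cast the whole model to fp16, then put BatchNorm layers back in fp32."
    model.half()
    for module in model.modules():
        if isinstance(module, nn.modules.batchnorm._BatchNorm):
            module.float()  # keep BatchNorm weights and running stats in fp32 for stability
    return model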

del ModelToHalf
#reimporting ModelToHalf from fastai2
from fastai2.vision.all import *
class MixedPrecision(Callback):
    "Run training in mixed precision"
    run_before = Recorder

    def __init__(self):
        assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."

    def begin_batch(self): self.learn.xb = to_half(self.xb)
    def after_batch(self): self.learn.loss = to_float(self.learn.loss)
learner=cnn_learner(dls,resnet50,pretrained=False)
learner.add_cbs((ModelToHalf(),MixedPrecision()))
learner.fit_one_cycle(30,lr_max=0.00275)
learner.recorder.plot_loss(skip_start=20)
epoch train_loss valid_loss time
0 2.676094 2.062241 01:02
1 2.394882 2.239057 01:02
2 2.002832 1.627104 01:03
3 1.757027 1.800131 01:02
4 1.429366 3.083183 01:02
5 1.479008 4.834050 01:02
6 1.459492 1.715717 01:02
7 1.335028 1.120237 01:02
8 1.188040 1.130354 01:02
9 0.991492 1.022801 01:02
10 0.864592 1.437108 01:02
11 0.796745 0.886712 01:02
12 0.723405 0.980490 01:02
13 0.682777 0.801665 01:02
14 0.618106 0.826624 01:02
15 0.580326 0.619600 01:02
16 0.558647 0.959586 01:02
17 0.504328 3.640037 01:02
18 0.460288 0.511587 01:02
19 0.419463 0.492790 01:03
20 0.401901 0.803859 01:02
21 0.355560 0.455122 01:02
22 0.318891 0.428618 01:02
23 0.290560 0.404963 01:02
24 0.269655 0.456370 01:02
25 0.242815 0.396282 01:03
26 0.236340 0.376630 01:02
27 0.233574 0.381232 01:02
28 0.220733 0.376414 01:02
29 0.223098 0.378601 01:02
del learner

FP16 with loss in fp32

class MixedPrecision(Callback):
    "Run training in mixed precision"
    toward_end = True

    def __init__(self):
        assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."

    def begin_batch(self): self.learn.xb = to_half(self.xb)
    def after_pred(self): self.learn.pred = to_float(self.pred)
learner=cnn_learner(dls,resnet50,pretrained=False)
learner.add_cbs((ModelToHalf(),MixedPrecision()))
learner.fit_one_cycle(30,lr_max=0.00275)
learner.recorder.plot_loss(skip_start=20)
epoch train_loss valid_loss time
0 2.656821 2.037143 01:02
1 2.375797 1.916572 01:03
2 2.028292 1.720190 01:03
3 1.701117 1.807122 01:03
4 1.499982 2.164554 01:03
5 1.584262 1.594690 01:02
6 1.310206 1.201177 01:02
7 1.285866 6.326769 01:03
8 1.195278 2.182448 01:02
9 1.027358 1.015578 01:02
10 0.937293 0.922560 01:02
11 0.843483 1.100431 01:02
12 0.765530 1.015288 01:02
13 0.717502 0.840533 01:03
14 0.686295 1.225734 01:02
15 0.615597 0.854381 01:02
16 0.556095 0.558067 01:02
17 0.528304 1.195737 01:02
18 0.495503 0.668504 01:02
19 0.457330 0.541667 01:02
20 0.418028 1.149197 01:02
21 0.369291 0.504387 01:03
22 0.338492 0.522376 01:02
23 0.316736 0.404864 01:02
24 0.277326 0.405616 01:02
25 0.273004 0.402565 01:03
26 0.244644 0.393408 01:03
27 0.229975 0.391603 01:02
28 0.224649 0.389079 01:02
29 0.237047 0.389761 01:02
del learner
del MixedPrecision

FP16 with loss in fp32, with loss scale but without fp32 accumulation

class MixedPrecision(Callback):
    "Run training in mixed precision"
    toward_end=True

    def __init__(self, loss_scale=512, flat_master=False, dynamic=True, max_loss_scale=2.**24,
                 div_factor=2., scale_wait=500, clip=None):
        assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."
        self.flat_master,self.dynamic,self.max_loss_scale = flat_master,dynamic,max_loss_scale
        self.div_factor,self.scale_wait,self.clip = div_factor,scale_wait,clip
        self.loss_scale = max_loss_scale if dynamic else loss_scale

    def begin_fit(self):
        if self.learn.opt is None: self.learn.create_opt()
        self.model_pgs,_ = get_master(self.opt, self.flat_master)
        self.old_pgs = self.opt.param_groups
        #Unlike fastai2's version, we don't swap fp32 master params into the optimizer (no fp32 accumulation here).
        if self.dynamic: self.count = 0

    def begin_batch(self): self.learn.xb = to_half(self.xb)
    def after_pred(self): self.learn.pred = to_float(self.pred)
    def after_loss(self):
        if self.training: self.learn.loss *= self.loss_scale

    def after_backward(self):
        self.learn.loss /= self.loss_scale #To record the real loss
        #First, check for an overflow
        if self.dynamic and grad_overflow(self.model_pgs):
            self.loss_scale /= self.div_factor
            self.model.zero_grad()
            raise CancelBatchException() #skip step and zero_grad

        for params in self.model_pgs:
            for param in params:
                if param.grad is not None: param.grad.div_(self.loss_scale)
        if self.clip is not None:
            for group in self.model_pgs: nn.utils.clip_grad_norm_(group, self.clip)
        #Check if it's been long enough without overflow to increase the loss scale
        if self.dynamic:
            self.count += 1
            if self.count == self.scale_wait:
                self.count = 0
                self.loss_scale *= self.div_factor

    def after_step(self):
        self.model.zero_grad() #Zero the gradients of the model manually (optimizer disconnected)

    def after_fit(self):
        self.learn.opt.param_groups  = self.old_pgs
        delattr(self, "model_pgs")
        delattr(self, "old_pgs")
learner=cnn_learner(dls,resnet50,pretrained=False)
learner.add_cbs((ModelToHalf(),MixedPrecision()))
learner.fit_one_cycle(30,lr_max=0.00275)
learner.recorder.plot_loss(skip_start=20)
epoch train_loss valid_loss time
0 2.708762 2.261719 01:08
1 2.401522 1.939942 01:10
2 1.998625 2.204348 01:10
3 1.600047 1.737028 01:10
4 1.380643 3.546641 01:10
5 1.532201 3.255549 01:10
6 1.318069 1.603604 01:10
7 1.233026 1.262678 01:10
8 1.016523 1.241237 01:10
9 0.907380 2.354774 01:11
10 0.842147 0.953867 01:10
11 0.756705 1.135527 01:10
12 0.729484 0.750674 01:11
13 0.700462 1.289216 01:10
14 0.617139 0.583479 01:11
15 0.570849 0.880843 01:10
16 0.551235 0.582794 01:10
17 0.480748 0.767345 01:10
18 0.460275 0.978343 01:10
19 0.414565 0.457135 01:10
20 0.378115 0.449951 01:10
21 0.337399 0.486601 01:10
22 0.307479 0.414679 01:10
23 0.298396 0.416613 01:10
24 0.269521 0.396884 01:10
25 0.233148 0.387787 01:10
26 0.243302 0.372072 01:10
27 0.218694 0.371554 01:10
28 0.218254 0.371993 01:10
29 0.218056 0.371956 01:10
del learner
del MixedPrecision

FP16 with loss scale and fp32 accumulation (learner.to_fp16)

from fastai2.vision.all import *
learner=cnn_learner(dls,resnet50,pretrained=False)
learner.to_fp16()
learner.fit_one_cycle(30,lr_max=0.00275)
learner.recorder.plot_loss(skip_start=20)
epoch train_loss valid_loss time
0 2.757428 2.081527 01:12
1 2.353672 2.234412 01:14
2 2.012591 1.837813 01:13
3 1.635887 3.204813 01:14
4 1.531814 1.518298 01:13
5 1.446371 nan 01:14
6 1.498608 2.455172 01:13
7 1.350126 1.248328 01:13
8 1.250748 6.087388 01:13
9 1.085607 5.156849 01:13
10 1.042411 1.595870 01:13
11 0.934609 1.178143 01:13
12 0.805411 0.984007 01:14
13 0.760592 2.332953 01:13
14 0.708662 1.113873 01:13
15 0.641280 0.628776 01:14
16 0.588315 2.150546 01:13
17 0.537062 1.012078 01:13
18 0.507948 0.634026 01:14
19 0.488397 0.671309 01:13
20 0.448709 0.809253 01:14
21 0.393053 0.442282 01:13
22 0.365035 0.425639 01:14
23 0.327412 0.451561 01:13
24 0.328089 0.423867 01:13
25 0.279865 0.396011 01:14
26 0.261059 0.378210 01:13
27 0.260404 0.366170 01:14
28 0.247177 0.362599 01:14
29 0.262104 0.363247 01:14

Reduced range in fp16⁴ (image to be inserted)
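The limits can be checked directly in PyTorch. This small snippet (not part of the original notebook) prints the representable range of fp16 next to fp32:

import torch

fp16, fp32 = torch.finfo(torch.float16), torch.finfo(torch.float32)
print(fp16.max, fp16.tiny, fp16.eps)  # 65504.0, ~6.10e-05, ~9.77e-04
print(fp32.max, fp32.tiny, fp32.eps)  # ~3.40e+38, ~1.18e-38, ~1.19e-07
print(torch.tensor(70000., dtype=torch.float16))  # overflows to inf
print(torch.tensor(1e-8, dtype=torch.float16))    # underflows to 0.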

As seen above, fp16 has a limited range. Overflow is when your gradients are too big to fit within fp16; underflow is when fp16 gradients are so close to 0 that they are simply rounded to 0. Overflow and underflow can happen in fp32 as well, but they aren't nearly as big a problem because fp32 is able to "hold" much more information. Loss scaling means multiplying our loss by as large a value as possible, which effectively multiplies our gradients by the same factor and keeps them within fp16's representable range. This avoids underflow by increasing the gradients, but we need to make sure the scale is not so big that it causes an overflow.

One technicality of loss scaling is that the loss itself is calculated in fp32, because calculating a loss involves division, and low-precision division is very imprecise, leading to unstable gradients and training. Another technicality is that the gradients then have to be divided by the loss scale to keep their magnitudes consistent with an fp32 model; otherwise this would be very similar to simply scaling the learning rate up by the loss scale! But wait a minute… if we divide by the loss scale, won't we just run into the underflow issue from before? You're right: we also have to keep an fp32 copy of the weights to avoid this problem. We copy the scaled gradients over to the fp32 copy, divide by the loss scale there, and then carry out the optimization step in fp32.
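To make those steps concrete, here is a minimal sketch of one training step with a static loss scale and an fp32 master copy of the weights, written in plain PyTorch rather than as a fastai callback (model_fp16, master_params, opt and loss_fn are placeholder names, not fastai API):

import torch

def scaled_step(model_fp16, master_params, opt, loss_fn, xb, yb, loss_scale=512.):
    "One mixed-precision step: fp16 forward, fp32 loss, scaled backward, fp32 update."
    pred = model_fp16(xb.half())            # forward pass in fp16
    loss = loss_fn(pred.float(), yb)        # compute the loss in fp32
    (loss * loss_scale).backward()          # scale the loss so small fp16 gradients don't underflow

    # Copy the fp16 gradients to the fp32 master weights and unscale them there
    for mp, p in zip(master_params, model_fp16.parameters()):
        if p.grad is not None: mp.grad = p.grad.detach().float() / loss_scale

    opt.step()                              # optimizer step on the fp32 master weights
    opt.zero_grad()
    model_fp16.zero_grad()

    # Copy the updated fp32 weights back into the fp16 model
    with torch.no_grad():
        for mp, p in zip(master_params, model_fp16.parameters()):
            p.copy_(mp)
    return loss.detach()

Here master_params would be created once at the start of training, e.g. master_params = [p.detach().clone().float() for p in model_fp16.parameters()], and opt is built over those fp32 copies instead of the fp16 parameters.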

Batch Size

Batch size is important in fp16 training. I found that the very first convolution layer was very unstable in my training. Increasing the batch size tends to smooth out this first layer's gradients, since there are more inputs contributing to the layer.
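For example, with the DataBlock defined at the top of this post, a larger batch size can simply be passed to dataloaders (assuming it fits in GPU memory; the fastai default is bs=64):

dls = db.dataloaders(path, bs=128)  # bigger batches gave the first conv layer smoother gradients in my runs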

Limiting Maximum Loss and Differential Loss Functions

I found that it was very important to effectively cap the maximum and minimum loss. This becomes a problem if your model is expected to have very large loss values, such as values greater than a magnitude of 100. Generally the loss is positive and relatively small, though in the case of Wasserstein loss (https://arxiv.org/abs/1506.05439) it can take a negative value. For large maximal values I found that differential losses can be very large, especially if your loss function looks something like 1000 x loss1 + loss2. In the case of differential loss functions, I was able to get satisfactory results simply by changing the function to 1000 x tanh(loss1) + loss2, as sketched below. I expect gradient clipping would also be very helpful, but I have not yet experimented with that myself.
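As a concrete example, the capped version of such a composite loss could look like this (loss1 and loss2 are placeholders for whatever loss functions you are combining):

import torch

def capped_loss(pred, targ, loss1, loss2, weight=1000.):
    # tanh squashes loss1 into (-1, 1), so the weighted term is bounded by +-weight
    # and the combined loss can no longer blow up past fp16's representable range
    return weight * torch.tanh(loss1(pred, targ)) + loss2(pred, targ)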

Normalization

In my experience, normalization was very important for making sure an architecture performed well in fp16 training. With architectures that had very little normalization, I had constant issues with overflowing or underflowing gradients. Introducing normalization layers similar to those already present in the model gave me results similar to those of the fp32 models.
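As a simple illustration of the kind of change I mean, here is a normalization layer added to an otherwise un-normalized convolution block (a generic PyTorch sketch, not a specific architecture from this post):

import torch.nn as nn

# Block with no normalization: the kind of layer that can overflow or underflow in fp16
block = nn.Sequential(nn.Conv2d(64, 64, 3, padding=1), nn.ReLU())

# Same block with a BatchNorm layer added, matching the normalization used in the rest of the model
block_norm = nn.Sequential(nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU())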

Contact Me

I am looking for a job.

LinkedIn profile: https://www.linkedin.com/in/molly-beavers-651025118/
Source code (WIP): https://github.com/marii-moe/selfie2anime

export2html.notebook2html(fname='2020-05-11_FP16.ipynb', dest='html/', template_file='fastpages.tpl',n_workers=1)
converting: 2020-05-11_FP16.ipynb