YOLOv5 6.1 code explained line by line: train.py

Since I use YOLOv5 a lot, I spent some time digging into the code and learned quite a bit, so I'm posting my notes here: partly so I can look things up later, and partly in the hope that they help others who need them.

The comments are fairly detailed and aimed mainly at beginners who have just picked up Python and the basics of deep learning. If you spot a mistake or something is unclear, feel free to contact me (WeChat: Y1685637070) so we can learn from each other.

A previous article annotated detect.py; after a long gap I have finally finished annotating train.py (the functions that build the dataset and the other interfaces it calls are still being annotated). yolo.py and common.py are next. Let's keep at it!

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset.

Models and datasets download automatically from the latest YOLOv5 release.
Models: https://github.com/ultralytics/yolov5/tree/master/models
Datasets: https://github.com/ultralytics/yolov5/tree/master/data
Tutorial: https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data

Usage:
    $ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640  # from pretrained (RECOMMENDED)
    $ python path/to/train.py --data coco128.yaml --weights '' --cfg yolov5s.yaml --img 640  # from scratch
"""

import argparse
import math
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD, Adam, AdamW, lr_scheduler
from tqdm import tqdm
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
FILE = Path(__file__).resolve()  # absolute path of this file
ROOT = FILE.parents[0]  # YOLOv5 root directory (this file's parent directory)
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to sys.path so the project's packages can be imported
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, check_dataset, check_file, check_git_status, check_img_size, check_requirements,
                           check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds,
                           intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle,
                           print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # index of this process on its node in distributed training; -1 if not set
# https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))

# RANK is the index of this process across the whole distributed job. Every process has a rank and a
# local_rank: rank is global, running from 0 up to the total number of GPUs across all nodes
# (like range(0, total_gpus)), not per node, while local_rank is the index of the process/GPU
# within its own node. rank == 0 marks the master process.

WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))  # total number of processes (GPUs) in the job
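# Illustration (not part of the original train.py), assuming a launch such as
# `python -m torch.distributed.run --nproc_per_node 2 --nnodes 2 train.py ...`:
# the launcher starts 4 processes in total and sets, for example,
#   node 0: LOCAL_RANK=0 RANK=0 WORLD_SIZE=4   |   LOCAL_RANK=1 RANK=1 WORLD_SIZE=4
#   node 1: LOCAL_RANK=0 RANK=2 WORLD_SIZE=4   |   LOCAL_RANK=1 RANK=3 WORLD_SIZE=4
# so RANK == 0 identifies the master process and LOCAL_RANK selects the GPU on each node.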


def train(hyp,  # path/to/hyp.yaml or hyp dictionary
          opt,
          device,
          callbacks
          ):
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze  # unpack the options into local variables

    # Directories
    w = save_dir / 'weights'  # weights dir (where checkpoints are saved)
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir if it does not exist
    last, best = w / 'last.pt', w / 'best.pt'  # paths of the two checkpoint files

    # Hyperparameters
    if isinstance(hyp, str):  # hyp may be a path string or an already-loaded dict
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict (read the hyperparameters from the yaml file)
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))  # log the hyperparameters

    # Save run settings
    if not evolve:
        with open(save_dir / 'hyp.yaml', 'w') as f:
            yaml.safe_dump(hyp, f, sort_keys=False)  # write the hyperparameters to hyp.yaml
        with open(save_dir / 'opt.yaml', 'w') as f:
            yaml.safe_dump(vars(opt), f, sort_keys=False)  # write the run options to opt.yaml

    # Loggers
    data_dict = None
    if RANK in [-1, 0]:  # master process (or single-GPU) only
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:  # log the run with Weights & Biases
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))
            # register each of the loggers' methods as a callback

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'  # whether we are training on GPU
    init_seeds(1 + RANK)  # initialize random seeds
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check the dataset (download it if needed)
    train_path, val_path = data_dict['train'], data_dict['val']  # training-set and validation-set paths
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names ('item' for single-class training)
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check that the number of names matches nc
    is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt')  # check whether this is the COCO dataset

    # Model
    check_suffix(weights, '.pt')  # check that the weights file ends with .pt
    pretrained = weights.endswith('.pt')  # True if a .pt pretrained checkpoint was given
    if pretrained:  # start from a pretrained model
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location='cpu')  # load checkpoint to CPU to avoid CUDA memory leak
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create the model (cfg takes priority over the checkpoint's yaml)
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys: anchors come from cfg or hyp (cfg first), so to change anchors just edit the yaml
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32 (the pretrained model's parameters)
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect: keep only the keys shared with the new model
        model.load_state_dict(csd, strict=False)  # load the matching weights
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create the model from the yaml config only

    # Freeze
    freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
    # freeze selected layers for transfer learning
    for k, v in model.named_parameters():  # iterate over all parameters
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):  # freeze the layer if its name is in the freeze list
            LOGGER.info(f'freezing {k}')  # log it
            v.requires_grad = False  # do not compute gradients for this layer
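    # Worked example (illustration only, not original code): with --freeze 10 the list above becomes
    # ['model.0.', 'model.1.', ..., 'model.9.'], so every parameter whose name starts with one of those
    # prefixes (the backbone in yolov5s.yaml) gets requires_grad = False, while the remaining layers keep training.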

    # Image size
    gs = max(int(model.stride.max()), 32)  # grid size (max stride across the detection layers)
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  # verify imgsz is a multiple of the grid size

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only: estimate the best batch size automatically
        batch_size = check_train_batch_size(model, imgsz)  # compute a batch size that fits in GPU memory
        loggers.on_params_update({"batch_size": batch_size})

    # Optimizer
    nbs = 64  # nominal batch size: weights are updated once per 64 images for more stable updates
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss over this many batches before optimizing (to reach nbs images)
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay to the effective batch size
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    g0, g1, g2 = [], [], []  # optimizer parameter groups: BN weights, other weights, biases
    for v in model.modules():  # iterate over all modules
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.optimizer == 'Adam':  # choose the optimizer
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    elif opt.optimizer == 'AdamW':
        optimizer = AdamW(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay (helps against overfitting)
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
    del g0, g1, g2

    # Scheduler: two learning-rate decay schedules
    if opt.cos_lr:  # cosine decay
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    else:
        lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear decay
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
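    # Worked example (illustration only, not original code): with hyp['lrf']=0.01 and epochs=100,
    # the linear lambda gives lf(0)=1.0, lf(50)=0.505, lf(99)=0.0199, and the scheduler multiplies each
    # group's initial lr by lf(epoch) once per epoch; one_cycle() gives the same 1 -> 0.01 decay,
    # but along a cosine curve instead of a straight line.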


    # EMA: exponential moving average of the model weights, which makes the weights evolve more smoothly
    ema = ModelEMA(model) if RANK in [-1, 0] else None
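    # Minimal sketch of the idea (illustration only; ModelEMA in utils/torch_utils.py also ramps the decay
    # with the number of updates): for every parameter,
    #   ema_param = d * ema_param + (1 - d) * model_param    # e.g. d close to 0.9999
    # so the EMA copy is a smoothed version of the weights; it is the EMA copy that is validated and saved later.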

    # Resume
    start_epoch, best_fitness = 0, 0.0  # starting epoch and best fitness value so far
    if pretrained:  # restore optimizer, EMA and epoch state from the checkpoint (used when resuming)
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1  # continue from the epoch after the last saved one
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # DP mode (DataParallel); generally not recommended any more
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
                       'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:  # multi-GPU training: synchronize BatchNorm statistics across GPUs
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader (training set)
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
                                              hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache,
                                              rect=opt.rect, rank=LOCAL_RANK, workers=workers,
                                              image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '), shuffle=True) #
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class index found in the dataset
    nb = len(train_loader)  # number of batches (dataset size / batch size)
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'
    # the class indices found in the labels must not exceed nc from the data yaml
    # Process 0 (master only)
    if RANK in [-1, 0]:  # create the validation dataloader
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache,
                                       rect=True, rank=-1, workers=workers * 2, pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)  # concatenate all labels (class + box) into one numpy array
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:  # plot the label distribution
                plot_labels(labels, names, save_dir)

            # Anchors
            if not opt.noautoanchor:  # unless disabled, run AutoAnchor
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)  # check whether the anchors fit the dataset and recompute them if not
            model.half().float()  # pre-reduce anchor precision (cast to FP16 and back so the anchors are rounded to half precision)

        callbacks.run('on_pretrain_routine_end')

    # DDP mode
    if cuda and RANK != -1:  # wrap the model with DistributedDataParallel for multi-GPU training
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model attributes
    nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp['box'] *= 3 / nl  # scale to layers (the default hyps assume 3 detection layers)
    hyp['cls'] *= nc / 80 * 3 / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl  # scale objectness gain to image size and layers
    hyp['label_smoothing'] = opt.label_smoothing
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights (balances the classes)
    model.names = names  # attach class names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations (batches), max(warmup_epochs * nb, 1000)
    # nw = min(nw, (epochs - start_epoch) /  2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)  # mixed-precision gradient scaler
    stopper = EarlyStopping(patience=opt.patience)  # stop early if fitness has not improved for `patience` epochs
    compute_loss = ComputeLoss(model)  # init loss class
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
                f"Logging results to {colorstr('bold', save_dir)}\n"
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch loop ------------------------------------------------------------------
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:  # see the --image-weights option in parse_opt
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # update class weights from each class's mAP
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # derive image weights from the class weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx: sample images for the next epoch according to these weights

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)  # progress bar
        LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in [-1, 0]:
            pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')  # progress bar (tqdm)
        optimizer.zero_grad()  # zero the gradients
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number of integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0 (normalize by 255)

            # Warmup
            if ni <= nw:  # warmup phase
                xi = [0, nw]  # x interpolation range for the warmup
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())  # ramp accumulate from 1 up to nbs / batch_size, so the optimizer also steps on the very first batches
                for j, x in enumerate(optimizer.param_groups):  # iterate over the parameter groups
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])  # linearly interpolate each group's learning rate during warmup (see YOLOv5's warmup strategy)
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])  # same interpolation for momentum
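                # Worked example (illustration only, not original code): with nb=500 batches per epoch and
                # hyp['warmup_epochs']=3, nw = max(round(3 * 500), 1000) = 1500, so over the first 1500 batches
                # accumulate, the per-group learning rates and the momentum are all linearly interpolated
                # between their warmup values and their normal values as ni goes from 0 to nw.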

            # Multi-scale
            if opt.multi_scale:  # multi-scale training
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # randomly pick a size between 0.5x and 1.5x imgsz, rounded to a multiple of gs
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:  # resize only if the scale factor is not 1
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):  # mixed-precision forward pass
                pred = model(imgs)  # forward pass, returns the predictions
                loss, loss_items = compute_loss(pred, targets.to(device))  # total loss plus the individual box/obj/cls losses; loss is scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni - last_opt_step >= accumulate:  # step the optimizer once `accumulate` batches have passed since the last step
                scaler.step(optimizer)  # optimizer.step (with gradient unscaling)
                scaler.update()
                optimizer.zero_grad()  # zero the gradients
                if ema:
                    ema.update(model)  # update the EMA weights from the current model
                last_opt_step = ni  # record the batch index of this optimizer step

            # Log
            if RANK in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB) GPU memory reserved
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))  # update the progress-bar line
                callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn)
                if callbacks.stop_training:
                    return
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers (current learning rates)
        scheduler.step()

        if RANK in [-1, 0]:
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop  # last epoch, or early stopping is about to trigger
            if not noval or final_epoch:  # Calculate mAP (run validation to get results and per-class maps)
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95] used to rank epochs
            if fi > best_fitness:  # keep the best fitness seen so far
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr  # values to log
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)  # log end-of-epoch values
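            # Note (illustration, based on fitness() in utils/metrics.py): fitness is a weighted sum of the
            # four metrics, roughly 0.1 * mAP@.5 + 0.9 * mAP@.5-.95 with P and R weighted 0, so the "best"
            # checkpoint is essentially the one with the highest mAP@.5-.95.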

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # save the model after every epoch unless --nosave
                # bundle everything needed to resume training into the checkpoint
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None,
                        'date': datetime.now().isoformat()}

                # Save last, best and delete
                torch.save(ckpt, last)  # always save to last.pt
                if best_fitness == fi:  # if this epoch is the best so far, also save to best.pt
                    torch.save(ckpt, best)
                if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt  # free the checkpoint dict after saving
                callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)

            # Stop Single-GPU
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):  # early-stopping check
                break

            # Stop DDP TODO: known issues https://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DDP
        # with torch_distributed_zero_first(RANK):
        # if stop:
        #    break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    # training finished
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')  # log elapsed time
        for f in last, best:  # iterate over last.pt and best.pt
            if f.exists():
                strip_optimizer(f)  # strip optimizers (drop the optimizer state etc. to shrink the checkpoint)
                if f is best:  # validate best.pt one more time
                    LOGGER.info(f'\nValidating {f}...')
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(f, device).half(),
                                            iou_thres=0.65 if is_coco else 0.60,  # best pycocotools results at 0.65
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=is_coco,
                                            verbose=True,
                                            plots=True,
                                            callbacks=callbacks,
                                            compute_loss=compute_loss)  # val best model with plots
                    if is_coco:
                        callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)
        callbacks.run('on_train_end', last, best, plots, epoch, results)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results  # return the final results


def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / '', help='initial weights path')
    # initial/pretrained weights
    parser.add_argument('--cfg', type=str, default='models/my_test.yaml', help='model.yaml path')
    # model structure yaml; edit this file to modify the network
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
    # dataset yaml: paths to your data and the class names
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch-low.yaml', help='hyperparameters path')
    # hyperparameter yaml
    parser.add_argument('--epochs', type=int, default=3)
    # number of training epochs
    parser.add_argument('--batch-size', type=int, default=-1, help='total batch size for all GPUs, -1 for autobatch')
    # images per batch; -1 selects the batch size automatically (autobatch)
    parser.add_argument('--imgsz', '--img', '--img-size', default=640, type=int, help='train, val image size (pixels)')
    # train/val image size in pixels, 640x640 by default; adjust it when using the larger weights such as yolov5l to get good results
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    # rectangular training: batches keep the images' aspect ratio instead of padding them to squares, which removes redundant padding and speeds up training/inference
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    # resume the most recent run if training was interrupted
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    # only save the final checkpoint; by default both best.pt and last.pt are kept
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    # only validate on the final epoch; normally mAP is computed every epoch, so enabling this is not recommended
    parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor')
    # disable AutoAnchor; the automatic anchor check is on by default and simplifies training
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    # hyperparameter evolution: automatically mutates the hyperparameters based on the results; it greatly increases training time and is usually left off
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    # gsutil / Google Cloud Storage bucket; rarely used
    parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    # cache images in RAM or on disk ahead of time to speed up training; off by default
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    # weighted image selection, off by default; helps with sample imbalance by giving images that did poorly in the previous epoch more weight in the next one
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    # multi-scale training, off by default; the input size is varied randomly every few iterations, which makes the trained model more robust
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    # treat the dataset as single-class; default False (multi-class)
    parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer')
    # optimizer choice: SGD (default), Adam or AdamW
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    # synchronize BatchNorm across GPUs; only effective in DDP (distributed) mode
    parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)')
    # max dataloader workers; a frequent source of problems, try 0 if the dataloader misbehaves
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
    # where to save the trained model; defaults to runs/train
    parser.add_argument('--name', default='expCTR3', help='save to project/name')
    # name of the run directory
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    # reuse the existing project/name directory instead of incrementing to a new one for each run
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    # quad dataloader: tends to help when training at sizes larger than the default 640,
    # but may hurt slightly when training at 640
    parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler')
    # cosine learning-rate schedule: smoother learning-rate decay, which can help training
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    # label-smoothing epsilon; disabled (0.0) by default
    """在训练样本中,我们并不能保证所有sample都标注正确,
    如果某个样本标注错误,就可能产生负面印象,
    如果我们有办法“告诉”模型,样本的标签不一定正确,
    那么训练出来的模型对于少量的样本错误就会有“免疫力”采用随机化的标签作为训练数据时,
    损失函数有1-ε的概率与上面的式子相同,比如说告诉模型只有0.95概率是那个标签。"""
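    # Sketch of how this is applied (based on smooth_BCE in utils/loss.py, shown only for illustration):
    #   cp, cn = 1.0 - 0.5 * eps, 0.5 * eps   # positive and negative BCE targets
    # i.e. with --label-smoothing 0.1 the positive targets become 0.95 and the negative ones 0.05.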
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    # early stopping: stop training if there is no improvement for this many epochs
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')
    # layers to freeze; see yolov5s.yaml for the number of backbone layers
    """冻结训练是迁移学习常用的方法,当我们在使用数据量不足的情况下,
    通常我们会选择公共数据集提供权重作为预训练权重,
    我们知道网络的backbone主要是用来提取特征用的,
    一般大型数据集训练好的权重主干特征提取能力是比较强的,
    这个时候我们只需要冻结主干网络,fine - tune后面层就可以了,
    不需要从头开始训练,大大减少了实践而且还提高了性能。"""
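    # Usage sketch (illustration only): freeze the 10-layer backbone of yolov5s and fine-tune only the head:
    #   python train.py --weights yolov5s.pt --data coco128.yaml --freeze 10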
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    # save a checkpoint every x epochs
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    # DistributedDataParallel rank for multi-GPU training; no need to set it on a single GPU
    # Weights & Biases arguments
    parser.add_argument('--entity', default=None, help='W&B: Entity')
    # W&B entity; an online experiment-tracking tool similar to TensorBoard
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option')
    # upload the dataset to W&B as an interactive dsviz table that can be browsed, queried and filtered in the browser; default False
    parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    # set the bounding-box image logging interval for W&B; default -1
    parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')
    # not implemented yet
    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt


def main(opt, callbacks=Callbacks()):
    # Checks
    if RANK in [-1, 0]:
        print_args(FILE.stem, opt)  # print the parsed arguments
        check_git_status()  # check whether the repo is up to date
        check_requirements(exclude=['thop'])  # check the installed requirements

    # Resume
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:  # resume an interrupted run
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # use the given checkpoint path, or otherwise the most recent run
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'  # error out if the checkpoint does not exist
        with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace opt with the options saved by the previous run
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
        LOGGER.info(f'Resuming training from {ckpt}')
    else:
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
            check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project)  # check the files
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        # at least one of --cfg or --weights is required to build the model
        if opt.evolve:  # hyperparameter evolution
            if opt.project == str(ROOT / 'runs/train'):  # if default project name, rename to runs/evolve
                opt.project = str(ROOT / 'runs/evolve')
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # run save directory (incremented if it already exists)

    # DDP mode
    device = select_device(opt.device, batch_size=opt.batch_size)  # select the device; picked automatically if --device is empty
    if LOCAL_RANK != -1:  # DDP (multi-GPU distributed) training
        msg = 'is not compatible with YOLOv5 Multi-GPU DDP training'
        assert not opt.image_weights, f'--image-weights {msg}'
        assert not opt.evolve, f'--evolve {msg}'
        assert opt.batch_size != -1, f'AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size'
        assert opt.batch_size % WORLD_SIZE == 0, f'--batch-size {opt.batch_size} must be multiple of WORLD_SIZE'
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
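        # Usage sketch (illustration only): a typical DDP launch on one machine with 2 GPUs would look like
        #   python -m torch.distributed.run --nproc_per_node 2 train.py --batch-size 64 --data coco128.yaml --weights yolov5s.pt --device 0,1
        # where the launcher sets LOCAL_RANK, RANK and WORLD_SIZE for each process before this code runs.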

    # Train
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)
        if WORLD_SIZE > 1 and RANK == 0:
            LOGGER.info('Destroying process group... ')
            dist.destroy_process_group()

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

        with open(opt.hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {evolve_csv}')  # download evolve.csv if exists

        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([meta[k][0] for k in hyp.keys()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device, callbacks)
            callbacks = Callbacks()
            # Write mutation results
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        LOGGER.info(f'Hyperparameter evolution finished {opt.evolve} generations\n'
                    f"Results saved to {colorstr('bold', save_dir)}\n"
                    f'Usage example: $ python train.py --hyp {evolve_yaml}')


def run(**kwargs):
    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    opt = parse_opt(True)
    for k, v in kwargs.items():
        setattr(opt, k, v)
    main(opt)
    return opt


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)
