承接上一篇博客:https://blog.csdn.net/qq_38964360/article/details/128728145?spm=1001.2014.3001.5501
今天记录一下yolov8模型训练的调试过程。
在工程里创建训练脚本python_example.py
代码如下:
   
# filename: python_example.py
# dir: yolov8/python_example.py
#
# Training entry script: builds a YOLO model from the yolov8m.yaml model
# definition and trains it with the wider_face.yaml dataset config for 100 epochs.
from ultralytics import YOLO

# build a new model from scratch
# (a ".yaml" path makes YOLO.__init__ dispatch to YOLO._new)
model = YOLO("yolov8/ultralytics/models/v8/yolov8m.yaml")
# train the model
# (keyword arguments are forwarded to the trainer as configuration overrides)
results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)
以上示例首先利用yolov8m.yaml文件初始化了YOLO类:
`model = YOLO("yolov8/ultralytics/models/v8/yolov8m.yaml")`
接下来可以看看YOLO的`__init__`:
   
# dir: yolov8/ultralytics/yolo/engine/model.py
class YOLO:

    def __init__(self, model='yolov8n.yaml', type="v8") -> None:
        """
        Initializes the YOLO object.

        All attributes start out empty; the final statement then dispatches on
        the suffix of `model` to either load a checkpoint or build a new model.

        Args:
            model (str, Path): model to load or create
            type (str): Type/version of models to use. Defaults to "v8".

        Raises:
            KeyError: if the suffix of `model` is neither '.pt' nor '.yaml'
                (raised by the dispatch-dict lookup below).
        """
        self.type = type
        self.ModelClass = None  # model class
        self.TrainerClass = None  # trainer class
        self.ValidatorClass = None  # validator class
        self.PredictorClass = None  # predictor class
        self.model = None  # model object
        self.trainer = None  # trainer object
        self.task = None  # task type
        self.ckpt = None  # if loaded from *.pt
        self.cfg = None  # if loaded from *.yaml
        self.ckpt_path = None
        self.overrides = {}  # overrides for trainer object

        # Load or create new YOLO model
        # '.pt' -> self._load(model), '.yaml' -> self._new(model)
        {'.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)
这部分的重点是最后一句代码:
`{'.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)`
根据我们输入的model参数(yolov8m.yaml)的后缀`.yaml`,代码将跳转到`self._new`中,根据yaml文件定义模型:
   
def _new(self, cfg: str, verbose=True):
    """
    Builds a new model from a YAML model definition.

    Passes `cfg` through check_yaml, loads it into a dict, derives the task
    from the last head entry, selects the model/trainer/validator/predictor
    classes for that task, and instantiates the model class.

    Args:
        cfg (str): path to the model YAML file
        verbose (bool): forwarded to the model class constructor
    """
    cfg = check_yaml(cfg)  # cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'
    cfg_dict = yaml_load(cfg, append_filename=True)  # model dict
    '''
    cfg_dict=
    {'nc': 80, 'depth_multiple': 0.33, 'width_multiple': 0.25, 'backbone': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
    'head': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
    'yaml_file': 'yolov8/ultralytics/models/v8/yolov8m.yaml',
    'ch': 3}
    '''
    # the task is guessed from the second-to-last field of the last head entry
    self.task = guess_task_from_head(cfg_dict["head"][-1][-2])  # self.task='detect'
    self.ModelClass, self.TrainerClass, self.ValidatorClass, self.PredictorClass = \
        self._guess_ops_from_task(self.task)  # pick ModelClass, TrainerClass, etc. according to the task; self.TrainerClass=DetectionTrainer()
    self.model = self.ModelClass(cfg_dict, verbose=verbose)  # self.model=DetectionModel(); class DetectionModel is defined in yolov8/ultralytics/nn/tasks.py
    self.cfg = cfg  # self.cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'

def _guess_ops_from_task(self, task):  # task='detect'
    """
    Looks up the model/trainer/validator/predictor classes for a task.

    MODEL_MAP[task] yields the model class plus three strings; in each string
    the placeholder "TYPE" is replaced by self.type and the result is
    evaluated to obtain the class.

    Args:
        task: key into MODEL_MAP (e.g. 'detect')

    Returns:
        tuple: (model_class, trainer_class, validator_class, predictor_class)
    """
    model_class, train_lit, val_lit, pred_lit = MODEL_MAP[task]  # model_class="<class 'ultralytics.nn.tasks.DetectionModel'>"
    # warning: eval is unsafe. Use with caution
    trainer_class = eval(train_lit.replace("TYPE", f"{self.type}"))  # trainer_class="<class 'ultralytics.yolo.v8.detect.train.DetectionTrainer'>"
    validator_class = eval(val_lit.replace("TYPE", f"{self.type}"))  # validator_class="<class 'ultralytics.yolo.v8.detect.val.DetectionValidator'>"
    predictor_class = eval(pred_lit.replace("TYPE", f"{self.type}"))  # predictor_class="<class 'ultralytics.yolo.v8.detect.predict.DetectionPredictor'>"

    return model_class, trainer_class, validator_class, predictor_class
实例化完模型(YOLO)后,就可以开始训练模型了:
`results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)`
这一句会跳转到class YOLO中的`train`:
   
def train(self, **kwargs):
    """
    Trains the model on a given dataset.

    Builds the trainer overrides from self.overrides and `kwargs`, creates the
    trainer, runs training, then reloads the best checkpoint into self.model.

    Args:
        **kwargs (Any): Any number of arguments representing the training configuration. List of all args can be found in 'config' section.
                        You can pass all arguments as a yaml file in `cfg`. Other args are ignored if `cfg` file is passed

    Raises:
        AttributeError: if no `data` entry is present in the resulting overrides.
    """
    overrides = self.overrides.copy()  # overrides={}
    overrides.update(kwargs)  # overrides={'data': 'wider_face.yaml', 'epochs': 100}
    if kwargs.get("cfg"):
        LOGGER.info(f"cfg file passed. Overriding default params with {kwargs['cfg']}.")
        # the cfg file replaces the overrides built above entirely
        overrides = yaml_load(check_yaml(kwargs["cfg"]), append_filename=True)
    overrides["task"] = self.task
    overrides["mode"] = "train"  # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
    if not overrides.get("data"):
        raise AttributeError("dataset not provided! Please define `data` in config.yaml or pass as an argument.")
    if overrides.get("resume"):
        # a truthy resume flag is replaced by the stored checkpoint path
        overrides["resume"] = self.ckpt_path

    self.trainer = self.TrainerClass(overrides=overrides)
    if not overrides.get("resume"):  # manually set model only if not resuming
        self.trainer.model = self.trainer.get_model(weights=self.model if self.ckpt else None, cfg=self.model.yaml)  # if a ckpt exists, load it directly; otherwise build a new model from yolov8n.yaml
        self.model = self.trainer.model
    self.trainer.train()
    # update model and configs after training
    self.model, _ = attempt_load_one_weight(str(self.trainer.best))
    self.overrides = self.model.args
上述代码的重点是`self.trainer.train()`。self.trainer是`ultralytics.yolo.v8.detect.train.DetectionTrainer`的实例,而DetectionTrainer继承自BaseTrainer(定义在yolov8/ultralytics/yolo/engine/trainer.py)。在调用train()之前,先利用overrides初始化trainer:
`self.trainer = self.TrainerClass(overrides=overrides)`
BaseTrainer及其`__init__`如下:
   
class BaseTrainer:
    """
    BaseTrainer

    A base class for creating trainers.

    Attributes:
        args (OmegaConf): Configuration for the trainer.
        check_resume (method): Method to check if training should be resumed from a saved checkpoint.
        console (logging.Logger): Logger instance.
        validator (BaseValidator): Validator instance.
        model (nn.Module): Model instance.
        callbacks (defaultdict): Dictionary of callbacks.
        save_dir (Path): Directory to save results.
        wdir (Path): Directory to save weights.
        last (Path): Path to last checkpoint.
        best (Path): Path to best checkpoint.
        batch_size (int): Batch size for training.
        epochs (int): Number of epochs to train for.
        start_epoch (int): Starting epoch for training.
        device (torch.device): Device to use for training.
        amp (bool): Flag to enable AMP (Automatic Mixed Precision).
        scaler (amp.GradScaler): Gradient scaler for AMP.
        data (str): Path to data.
        trainset (torch.utils.data.Dataset): Training dataset.
        testset (torch.utils.data.Dataset): Testing dataset.
        ema (nn.Module): EMA (Exponential Moving Average) of the model.
        lf (nn.Module): Loss function.
        scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
        best_fitness (float): The best fitness value achieved.
        fitness (float): Current fitness value.
        loss (float): Current loss value.
        tloss (float): Total loss value.
        loss_names (list): List of loss names.
        csv (Path): Path to results CSV file.
    """

    def __init__(self, config=DEFAULT_CONFIG, overrides=None):  # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
        """
        Initializes the trainer state from a base config plus overrides.

        Args:
            config: base configuration passed to get_config. Defaults to DEFAULT_CONFIG.
            overrides (dict, optional): values passed to get_config together with
                `config`. None is treated as an empty dict.
        """
        if overrides is None:
            overrides = {}
        self.args = get_config(config, overrides)  # config='yolov8/ultralytics/yolo/configs/default.yaml'
        '''
        self.args=
        {'task': 'detect', 'mode': 'train', 'model': None, 'data': 'wider_face.yaml', 'epochs': 100, 'patience': 50, 'batch': 16,
        'imgsz': 640, 'save': True, 'cache': False, 'device': None, 'workers': 8, 'project': None, 'name': None,
        'exist_ok': False, 'pretrained': False, 'optimizer': 'SGD', 'verbose': False, 'seed': 0, 'deterministic': True,
        'single_cls': False, 'image_weights': False, 'rect': False, 'cos_lr': False, 'close_mosaic': 10, 'resume': False,
        'overlap_mask': True, 'mask_ratio': 4, 'dropout': 0.0, 'val': True, 'save_json': False, 'save_hybrid': False,
        'conf': None, 'iou': 0.7, 'max_det': 300, 'half': False, 'dnn': False, 'plots': True, 'source': None, 'show': False,
        'save_txt': False, 'save_conf': False, 'save_crop': False, 'hide_labels': False, 'hide_conf': False, 'vid_stride': 1,
        'line_thickness': 3, 'visualize': False, 'augment': False, 'agnostic_nms': False, 'retina_masks': False,
        'format': 'torchscript', 'keras': False, 'optimize': False, 'int8': False, 'dynamic': False, 'simplify': False,
        'opset': 17, 'workspace': 4, 'nms': False, 'lr0': 0.01, 'lrf': 0.01, 'momentum': 0.937, 'weight_decay': 0.0005,
        'warmup_epochs': 3.0, 'warmup_momentum': 0.8, 'warmup_bias_lr': 0.1, 'box': 7.5, 'cls': 0.5, 'dfl': 1.5,
        'fl_gamma': 0.0, 'label_smoothing': 0.0, 'nbs': 64, 'hsv_h': 0.015, 'hsv_s': 0.7, 'hsv_v': 0.4, 'degrees': 0.0,
        'translate': 0.1, 'scale': 0.5, 'shear': 0.0, 'perspective': 0.0, 'flipud': 0.0, 'fliplr': 0.5, 'mosaic': 1.0,
        'mixup': 0.0, 'copy_paste': 0.0, 'cfg': None, 'hydra': {'output_subdir': None, 'run': {'dir': '.'}}, 'v5loader': False}
        '''
        self.device = utils.torch_utils.select_device(self.args.device, self.args.batch)  # self.device=device(type='cuda', index=0)
        self.check_resume()
        self.console = LOGGER
        self.validator = None
        self.model = None
        # NOTE(review): self.callbacks is assigned again in the "Callbacks" section
        # at the end of __init__, which replaces this empty defaultdict.
        self.callbacks = defaultdict(list)
        init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)

        # Dirs
        project = self.args.project or Path(SETTINGS['runs_dir']) / self.args.task  # project='yolov8/runs/detect'
        name = self.args.name or f"{self.args.mode}"  # name='train'
        # an explicit args.save_dir wins; otherwise increment_path(project / name) is used
        self.save_dir = Path(
            self.args.get(
                "save_dir",
                increment_path(Path(project) / name, exist_ok=self.args.exist_ok if RANK in {-1, 0} else True)))  # self.save_dir='yolov8/runs/detect/train'
        self.wdir = self.save_dir / 'weights'  # self.wdir='yolov8/runs/detect/train/weights'
        # only RANK -1 or 0 creates the directories and writes args.yaml
        if RANK in {-1, 0}:
            self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
            with open_dict(self.args):
                self.args.save_dir = str(self.save_dir)
            yaml_save(self.save_dir / 'args.yaml', OmegaConf.to_container(self.args, resolve=True))  # save run args
        self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt'  # checkpoint paths

        self.batch_size = self.args.batch
        self.epochs = self.args.epochs  # self.epochs=100
        self.start_epoch = 0
        if RANK == -1:
            print_args(dict(self.args))

        # Device
        self.amp = self.device.type != 'cpu'  # AMP is enabled on every non-CPU device
        self.scaler = amp.GradScaler(enabled=self.amp)
        if self.device.type == 'cpu':
            self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading

        # Model and Dataloaders.
        self.model = self.args.model  # self.model=None
        self.data = self.args.data  # self.data='wider_face.yaml'
        if self.data.endswith(".yaml"):
            self.data = check_dataset_yaml(self.data)
            '''
            self.data=
            {'path': PosixPath('yolov8/datasets/wider_face'),
            'train': 'yolov8/datasets/wider_face/images/train',
            'val': 'yolov8/datasets/wider_face/images/val',
            'test': None,
            'names': {0: 'face'},
            'download': ,
            'yaml_file': 'yolov8/ultralytics/yolo/data/datasets/wider_face.yaml',
            'nc': 1}
            '''
        else:
            self.data = check_dataset(self.data)
        self.trainset, self.testset = self.get_dataset(self.data)  # self.trainset='yolov8/datasets/wider_face/images/train', self.testset='yolov8/datasets/wider_face/images/val'
        self.ema = None

        # Optimization utils init
        self.lf = None
        self.scheduler = None

        # Epoch level metrics
        self.best_fitness = None
        self.fitness = None
        self.loss = None
        self.tloss = None
        self.loss_names = ['Loss']
        self.csv = self.save_dir / 'results.csv'  # self.csv='yolov8/runs/detect/train/results.csv'
        self.plot_idx = [0, 1, 2]

        # Callbacks
        # each default callback is wrapped in a one-element list keyed by its event name
        self.callbacks = defaultdict(list, {k: [v] for k, v in callbacks.default_callbacks.items()})  # add callbacks
        if RANK in {0, -1}:
            callbacks.add_integration_callbacks(self)
初始化完self.trainer后,便开始训练:
`self.trainer.train()`
同样,跳转到BaseTrainer的`train()`中:
   
    - 
     
      
     
     
      
       def 
       train(
       self):
      
     
- 
     
      
     
     
          
       # Allow device='', device=None on Multi-GPU systems to default to device=0
      
     
- 
     
      
     
     
          
       if 
       isinstance(self.args.device, 
       int) 
       or self.args.device:  
       # i.e. device=0 or device=[0,1,2,3]
      
     
- 
     
      
     
     
      
               world_size = torch.cuda.device_count()
      
     
- 
     
      
     
     
          
       elif torch.cuda.is_available():  
       # i.e. device=None or device=''
      
     
- 
     
      
     
     
      
               world_size = 
       1  
       # default to device 0
      
     
- 
     
      
     
     
          
       else:  
       # i.e. device='cpu' or 'mps'
      
     
- 
     
      
     
     
      
               world_size = 
       0
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
          
       # Run subprocess if DDP training, else train normally
      
     
- 
     
      
     
     
          
       if world_size > 
       1 
       and 
       "LOCAL_RANK" 
       not 
       in os.environ:
      
     
- 
     
      
     
     
      
              command = generate_ddp_command(world_size, self)
      
     
- 
     
      
     
     
          
       try:
      
     
- 
     
      
     
     
      
              subprocess.run(command)
      
     
- 
     
      
     
     
          
       except Exception 
       as e:
      
     
- 
     
      
     
     
      
              self.console(e)
      
     
- 
     
      
     
     
          
       finally:
      
     
- 
     
      
     
     
      
              ddp_cleanup(command, self)
      
     
- 
     
      
     
     
          
       else:
      
     
- 
     
      
     
     
      
              self._do_train(
       int(os.getenv(
       "RANK", -
       1)), world_size)  
       # world_size=1
      
     
  因为world_size=1,所以直接进入到self._do_train中,
   
    - 
     
      
     
     
      
       def 
       _do_train(
       self, rank=-1, world_size=1):  
       # rank=-1, world_size=1
      
     
- 
     
      
     
     
              
       if world_size > 
       1:
      
     
- 
     
      
     
     
      
                   self._setup_ddp(rank, world_size)
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
               self._setup_train(rank, world_size)  
       # 设置与训练相关的参数, 如: optimizer、scheduler、train_loader、test_loader、validator、metrics等
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
               self.epoch_time = 
       None
      
     
- 
     
      
     
     
      
               self.epoch_time_start = time.time()
      
     
- 
     
      
     
     
      
               self.train_time_start = time.time()
      
     
- 
     
      
     
     
      
               nb = 
       len(self.train_loader)  
       # number of batches
      
     
- 
     
      
     
     
      
               nw = 
       max(
       round(self.args.warmup_epochs * nb), 
       100)  
       # number of warmup iterations
      
     
- 
     
      
     
     
      
               last_opt_step = -
       1
      
     
- 
     
      
     
     
      
               self.run_callbacks(
       "on_train_start")
      
     
- 
     
      
     
     
      
               self.log(
       f"Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n"
      
     
- 
     
      
     
     
                       
       f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
      
     
- 
     
      
     
     
                       
       f"Logging results to {colorstr('bold', self.save_dir)}\n"
      
     
- 
     
      
     
     
                       
       f"Starting training for {self.epochs} epochs...")
      
     
- 
     
      
     
     
              
       if self.args.close_mosaic:
      
     
- 
     
      
     
     
      
                   base_idx = (self.epochs - self.args.close_mosaic) * nb
      
     
- 
     
      
     
     
      
                   self.plot_idx.extend([base_idx, base_idx + 
       1, base_idx + 
       2])
      
     
- 
     
      
     
     
              
       for epoch 
       in 
       range(self.start_epoch, self.epochs):
      
     
- 
     
      
     
     
      
                   self.epoch = epoch
      
     
- 
     
      
     
     
      
                   self.run_callbacks(
       "on_train_epoch_start")
      
     
- 
     
      
     
     
      
                   self.model.train()
      
     
- 
     
      
     
     
                  
       if rank != -
       1:
      
     
- 
     
      
     
     
      
                       self.train_loader.sampler.set_epoch(epoch)
      
     
- 
     
      
     
     
      
                   pbar = 
       enumerate(self.train_loader)
      
     
- 
     
      
     
     
                  
       # Update dataloader attributes (optional)
      
     
- 
     
      
     
     
                  
       if epoch == (self.epochs - self.args.close_mosaic):
      
     
- 
     
      
     
     
      
                       self.console.info(
       "Closing dataloader mosaic")
      
     
- 
     
      
     
     
                      
       if 
       hasattr(self.train_loader.dataset, 
       'mosaic'):
      
     
- 
     
      
     
     
      
                           self.train_loader.dataset.mosaic = 
       False
      
     
- 
     
      
     
     
                      
       if 
       hasattr(self.train_loader.dataset, 
       'close_mosaic'):
      
     
- 
     
      
     
     
      
                           self.train_loader.dataset.close_mosaic(hyp=self.args)
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                  
       if rank 
       in {-
       1, 
       0}:
      
     
- 
     
      
     
     
      
                       self.console.info(self.progress_string())
      
     
- 
     
      
     
     
      
                       pbar = tqdm(
       enumerate(self.train_loader), total=nb, bar_format=TQDM_BAR_FORMAT)
      
     
- 
     
      
     
     
      
                   self.tloss = 
       None
      
     
- 
     
      
     
     
      
                   self.optimizer.zero_grad()
      
     
- 
     
      
     
     
                  
       for i, batch 
       in pbar:
      
     
- 
     
      
     
     
      
                       self.run_callbacks(
       "on_train_batch_start")
      
     
- 
     
      
     
     
                      
       # Warmup
      
     
- 
     
      
     
     
      
                       ni = i + nb * epoch
      
     
- 
     
      
     
     
                      
       if ni <= nw:
      
     
- 
     
      
     
     
      
                           xi = [
       0, nw]  
       # x interp
      
     
- 
     
      
     
     
      
                           self.accumulate = 
       max(
       1, np.interp(ni, xi, [
       1, self.args.nbs / self.batch_size]).
       round())
      
     
- 
     
      
     
     
                          
       for j, x 
       in 
       enumerate(self.optimizer.param_groups):
      
     
- 
     
      
     
     
                              
       # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
      
     
- 
     
      
     
     
      
                               x[
       'lr'] = np.interp(
      
     
- 
     
      
     
     
      
                                   ni, xi, [self.args.warmup_bias_lr 
       if j == 
       0 
       else 
       0.0, x[
       'initial_lr'] * self.lf(epoch)])
      
     
- 
     
      
     
     
                              
       if 
       'momentum' 
       in x:
      
     
- 
     
      
     
     
      
                                   x[
       'momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Forward
      
     
- 
     
      
     
     
                      
       with torch.cuda.amp.autocast(self.amp):
      
     
- 
     
      
     
     
      
                           batch = self.preprocess_batch(batch)
      
     
- 
     
      
     
     
      
                           preds = self.model(batch[
       "img"])
      
     
- 
     
      
     
     
      
                           self.loss, self.loss_items = self.criterion(preds, batch)
      
     
- 
     
      
     
     
                          
       if rank != -
       1:
      
     
- 
     
      
     
     
      
                               self.loss *= world_size
      
     
- 
     
      
     
     
      
                           self.tloss = (self.tloss * i + self.loss_items) / (i + 
       1) 
       if self.tloss 
       is 
       not 
       None \
      
     
- 
     
      
     
     
                              
       else self.loss_items
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Backward
      
     
- 
     
      
     
     
      
                       self.scaler.scale(self.loss).backward()
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
      
     
- 
     
      
     
     
                      
       if ni - last_opt_step >= self.accumulate:
      
     
- 
     
      
     
     
      
                           self.optimizer_step()
      
     
- 
     
      
     
     
      
                           last_opt_step = ni
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Log
      
     
- 
     
      
     
     
      
                       mem = 
       f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  
       # (GB)
      
     
- 
     
      
     
     
      
                       loss_len = self.tloss.shape[
       0] 
       if 
       len(self.tloss.size()) 
       else 
       1
      
     
- 
     
      
     
     
      
                       losses = self.tloss 
       if loss_len > 
       1 
       else torch.unsqueeze(self.tloss, 
       0)
      
     
- 
     
      
     
     
                      
       if rank 
       in {-
       1, 
       0}:
      
     
- 
     
      
     
     
      
                           pbar.set_description(
      
     
- 
     
      
     
     
      
                               (
       '%11s' * 
       2 + 
       '%11.4g' * (
       2 + loss_len)) %
      
     
- 
     
      
     
     
      
                               (
       f'{epoch + 1}/{self.epochs}', mem, *losses, batch[
       "cls"].shape[
       0], batch[
       "img"].shape[-
       1]))
      
     
- 
     
      
     
     
      
                           self.run_callbacks(
       'on_batch_end')
      
     
- 
     
      
     
     
                          
       if self.args.plots 
       and ni 
       in self.plot_idx:
      
     
- 
     
      
     
     
      
                               self.plot_training_samples(batch, ni)
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
                       self.run_callbacks(
       "on_train_batch_end")
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
                   self.lr = {
       f"lr/pg{ir}": x[
       'lr'] 
       for ir, x 
       in 
       enumerate(self.optimizer.param_groups)}  
       # for loggers
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
                   self.scheduler.step()
      
     
- 
     
      
     
     
      
                   self.run_callbacks(
       "on_train_epoch_end")
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                  
       if rank 
       in {-
       1, 
       0}:
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Validation
      
     
- 
     
      
     
     
      
                       self.ema.update_attr(self.model, include=[
       'yaml', 
       'nc', 
       'args', 
       'names', 
       'stride', 
       'class_weights'])
      
     
- 
     
      
     
     
      
                       final_epoch = (epoch + 
       1 == self.epochs)
      
     
- 
     
      
     
     
                      
       if self.args.val 
       or final_epoch:
      
     
- 
     
      
     
     
      
                           self.metrics, self.fitness = self.validate()
      
     
- 
     
      
     
     
      
                       self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
                      
       # Save model
      
     
- 
     
      
     
     
                      
       if self.args.save 
       or (epoch + 
       1 == self.epochs):
      
     
- 
     
      
     
     
      
                           self.save_model()
      
     
- 
     
      
     
     
      
                           self.run_callbacks(
       'on_model_save')
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
      
                   tnow = time.time()
      
     
- 
     
      
     
     
      
                   self.epoch_time = tnow - self.epoch_time_start
      
     
- 
     
      
     
     
      
                   self.epoch_time_start = tnow
      
     
- 
     
      
     
     
      
                   self.run_callbacks(
       "on_fit_epoch_end")
      
     
- 
     
      
     
     
                  
       # TODO: termination condition
      
     
- 
     
      
     
     
       
      
     
- 
     
      
     
     
              
       if rank 
       in {-
       1, 
       0}:
      
     
- 
     
      
     
     
                  
       # Do final val with best.pt
      
     
- 
     
      
     
     
      
                   self.log(
       f'\n{epoch - self.start_epoch + 1} epochs completed in '
      
     
- 
     
      
     
     
                           
       f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
      
     
- 
     
      
     
     
      
                   self.final_eval()
      
     
- 
     
      
     
     
                  
       if self.args.plots:
      
     
- 
     
      
     
     
      
                       self.plot_metrics()
      
     
- 
     
      
     
     
      
                   self.log(
       f"Results saved to {colorstr('bold', self.save_dir)}")
      
     
- 
     
      
     
     
      
                   self.run_callbacks(
       'on_train_end')
      
     
- 
     
      
     
     
      
               torch.cuda.empty_cache()
      
     
- 
     
      
     
     
      
               self.run_callbacks(
       'teardown')
      
     
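  以上便是 YOLOv8 的训练过程:每个 epoch 开始时触发 on_train_epoch_start 回调并将模型切换到训练模式,当 epoch 等于 epochs - close_mosaic 时关闭 mosaic 数据增强;随后逐 batch 执行 warmup(用 np.interp 对学习率和动量做线性插值)、在 autocast 下前向计算损失、反向传播,并在累积满 accumulate 个 batch 后执行一次优化器更新;每个 epoch 结束时调用 scheduler.step(),并在主进程(rank 为 -1 或 0)上按配置进行验证、记录指标和保存模型;全部 epoch 结束后执行最终评估,并依次触发 on_train_end 与 teardown 回调。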
转载:https://blog.csdn.net/qq_38964360/article/details/128739669
查看评论
					 
					