飞道的博客

YOLOV8训练学习记录

1166人阅读  评论(0)

承接上一篇博客:https://blog.csdn.net/qq_38964360/article/details/128728145?spm=1001.2014.3001.5501

今天记录一下yolov8模型训练的调试过程。

在工程目录(yolov8/)下创建训练脚本python_example.py。

代码如下:


   
  1. # filename: python_example.py
  2. # dir: yolov8/python_example.py
  3. from ultralytics import YOLO
  4. # build a new model from scratch
  5. model = YOLO( "yolov8/ultralytics/models/v8/yolov8m.yaml")
  6. # train the model
  7. results = model.train(data= "yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs= 100)

以上示例首先利用yolov8m.yaml文件实例化了YOLO类:

model = YOLO("yolov8/ultralytics/models/v8/yolov8m.yaml")

接下来看看YOLO类的__init__方法:


   
  1. # dir: yolov8/ultralytics/yolo/engine/model.py
  2. class YOLO:
  3. def __init__( self, model='yolov8n.yaml', type="v8") -> None:
  4. """
  5. Initializes the YOLO object.
  6. Args:
  7. model (str, Path): model to load or create
  8. type (str): Type/version of models to use. Defaults to "v8".
  9. """
  10. self. type = type
  11. self.ModelClass = None # model class
  12. self.TrainerClass = None # trainer class
  13. self.ValidatorClass = None # validator class
  14. self.PredictorClass = None # predictor class
  15. self.model = None # model object
  16. self.trainer = None # trainer object
  17. self.task = None # task type
  18. self.ckpt = None # if loaded from *.pt
  19. self.cfg = None # if loaded from *.yaml
  20. self.ckpt_path = None
  21. self.overrides = {} # overrides for trainer object
  22. # Load or create new YOLO model
  23. { '.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)

这部分的重点是最后一句代码:

{'.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)

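这句代码先用Path(model).suffix取出model参数的文件后缀,再以该后缀为键在字典中查到对应的方法,并立即以model为参数调用它(如果后缀既不是.pt也不是.yaml,字典查找会抛出KeyError)。我们输入的model参数是yolov8m.yaml,后缀为.yaml,所以代码将跳到self._new中,根据yaml文件定义模型: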


   
  1. def _new( self, cfg: str, verbose=True):
  2. cfg = check_yaml(cfg) # cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'
  3. cfg_dict = yaml_load(cfg, append_filename= True) # model dict
  4. '''
  5. cfg_dict=
  6. {'nc': 80, 'depth_multiple': 0.33, 'width_multiple': 0.25, 'backbone': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
  7. 'head': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
  8. 'yaml_file': 'yolov8/ultralytics/models/v8/yolov8m.yaml',
  9. 'ch': 3}
  10. '''
  11. self.task = guess_task_from_head(cfg_dict[ "head"][- 1][- 2]) # self.task='detect'
  12. self.ModelClass, self.TrainerClass, self.ValidatorClass, self.PredictorClass = \
  13. self._guess_ops_from_task(self.task) # 根据task定义ModelClass、TrainerClass等, self.TrainerClass=DetectionTrainer()
  14. self.model = self.ModelClass(cfg_dict, verbose=verbose) # self.model=DetectionModel(), class DetectionModel的定义在yolov8/ultralytics/nn/tasks.py
  15. self.cfg = cfg # self.cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'
  16. def _guess_ops_from_task( self, task): # task='detect'
  17. model_class, train_lit, val_lit, pred_lit = MODEL_MAP[task] # model_class="<class 'ultralytics.nn.tasks.DetectionModel'>"
  18. # warning: eval is unsafe. Use with caution
  19. trainer_class = eval(train_lit.replace( "TYPE", f"{self.type}")) # trainer_class="<class 'ultralytics.yolo.v8.detect.train.DetectionTrainer'>"
  20. validator_class = eval(val_lit.replace( "TYPE", f"{self.type}")) # validator_class="<class 'ultralytics.yolo.v8.detect.val.DetectionValidator'>"
  21. predictor_class = eval(pred_lit.replace( "TYPE", f"{self.type}")) # predictor_class="<class 'ultralytics.yolo.v8.detect.predict.DetectionPredictor'>"
  22. return model_class, trainer_class, validator_class, predictor_class

实例化完模型(YOLO)后,就可以开始训练模型了,

results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)

跳转到class YOLO中的train,


   
  1. def train( self, **kwargs):
  2. """
  3. Trains the model on a given dataset.
  4. Args:
  5. **kwargs (Any): Any number of arguments representing the training configuration. List of all args can be found in 'config' section.
  6. You can pass all arguments as a yaml file in `cfg`. Other args are ignored if `cfg` file is passed
  7. """
  8. overrides = self.overrides.copy() # overrides={}
  9. overrides.update(kwargs) # overrides={'data': 'wider_face.yaml', 'epochs': 100}
  10. if kwargs.get( "cfg"):
  11. LOGGER.info( f"cfg file passed. Overriding default params with {kwargs['cfg']}.")
  12. overrides = yaml_load(check_yaml(kwargs[ "cfg"]), append_filename= True)
  13. overrides[ "task"] = self.task
  14. overrides[ "mode"] = "train" # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
  15. if not overrides.get( "data"):
  16. raise AttributeError( "dataset not provided! Please define `data` in config.yaml or pass as an argument.")
  17. if overrides.get( "resume"):
  18. overrides[ "resume"] = self.ckpt_path
  19. self.trainer = self.TrainerClass(overrides=overrides)
  20. if not overrides.get( "resume"): # manually set model only if not resuming
  21. self.trainer.model = self.trainer.get_model(weights=self.model if self.ckpt else None, cfg=self.model.yaml) # 如果有ckpt, 则直接加载; 没有则根据yolov8n.yaml新建一个模型
  22. self.model = self.trainer.model
  23. self.trainer.train()
  24. # update model and configs after training
  25. self.model, _ = attempt_load_one_weight( str(self.trainer.best))
  26. self.overrides = self.model.args

上述代码的重点是self.trainer.train()。self.trainer是<class 'ultralytics.yolo.v8.detect.train.DetectionTrainer'>的实例,而DetectionTrainer继承自BaseTrainer(定义在'yolov8/ultralytics/yolo/engine/trainer.py')。在调用train()之前,先利用overrides来初始化BaseTrainer:

self.trainer = self.TrainerClass(overrides=overrides)

   
  1. class BaseTrainer:
  2. """
  3. BaseTrainer
  4. A base class for creating trainers.
  5. Attributes:
  6. args (OmegaConf): Configuration for the trainer.
  7. check_resume (method): Method to check if training should be resumed from a saved checkpoint.
  8. console (logging.Logger): Logger instance.
  9. validator (BaseValidator): Validator instance.
  10. model (nn.Module): Model instance.
  11. callbacks (defaultdict): Dictionary of callbacks.
  12. save_dir (Path): Directory to save results.
  13. wdir (Path): Directory to save weights.
  14. last (Path): Path to last checkpoint.
  15. best (Path): Path to best checkpoint.
  16. batch_size (int): Batch size for training.
  17. epochs (int): Number of epochs to train for.
  18. start_epoch (int): Starting epoch for training.
  19. device (torch.device): Device to use for training.
  20. amp (bool): Flag to enable AMP (Automatic Mixed Precision).
  21. scaler (amp.GradScaler): Gradient scaler for AMP.
  22. data (str): Path to data.
  23. trainset (torch.utils.data.Dataset): Training dataset.
  24. testset (torch.utils.data.Dataset): Testing dataset.
  25. ema (nn.Module): EMA (Exponential Moving Average) of the model.
  26. lf (nn.Module): Loss function.
  27. scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
  28. best_fitness (float): The best fitness value achieved.
  29. fitness (float): Current fitness value.
  30. loss (float): Current loss value.
  31. tloss (float): Total loss value.
  32. loss_names (list): List of loss names.
  33. csv (Path): Path to results CSV file.
  34. """
  35. def __init__( self, config=DEFAULT_CONFIG, overrides=None): # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
  36. if overrides is None:
  37. overrides = {}
  38. self.args = get_config(config, overrides) # config='yolov8/ultralytics/yolo/configs/default.yaml'
  39. '''
  40. self.args=
  41. {'task': 'detect', 'mode': 'train', 'model': None, 'data': 'wider_face.yaml', 'epochs': 100, 'patience': 50, 'batch': 16,
  42. 'imgsz': 640, 'save': True, 'cache': False, 'device': None, 'workers': 8, 'project': None, 'name': None,
  43. 'exist_ok': False, 'pretrained': False, 'optimizer': 'SGD', 'verbose': False, 'seed': 0, 'deterministic': True,
  44. 'single_cls': False, 'image_weights': False, 'rect': False, 'cos_lr': False, 'close_mosaic': 10, 'resume': False,
  45. 'overlap_mask': True, 'mask_ratio': 4, 'dropout': 0.0, 'val': True, 'save_json': False, 'save_hybrid': False,
  46. 'conf': None, 'iou': 0.7, 'max_det': 300, 'half': False, 'dnn': False, 'plots': True, 'source': None, 'show': False,
  47. 'save_txt': False, 'save_conf': False, 'save_crop': False, 'hide_labels': False, 'hide_conf': False, 'vid_stride': 1,
  48. 'line_thickness': 3, 'visualize': False, 'augment': False, 'agnostic_nms': False, 'retina_masks': False,
  49. 'format': 'torchscript', 'keras': False, 'optimize': False, 'int8': False, 'dynamic': False, 'simplify': False,
  50. 'opset': 17, 'workspace': 4, 'nms': False, 'lr0': 0.01, 'lrf': 0.01, 'momentum': 0.937, 'weight_decay': 0.0005,
  51. 'warmup_epochs': 3.0, 'warmup_momentum': 0.8, 'warmup_bias_lr': 0.1, 'box': 7.5, 'cls': 0.5, 'dfl': 1.5,
  52. 'fl_gamma': 0.0, 'label_smoothing': 0.0, 'nbs': 64, 'hsv_h': 0.015, 'hsv_s': 0.7, 'hsv_v': 0.4, 'degrees': 0.0,
  53. 'translate': 0.1, 'scale': 0.5, 'shear': 0.0, 'perspective': 0.0, 'flipud': 0.0, 'fliplr': 0.5, 'mosaic': 1.0,
  54. 'mixup': 0.0, 'copy_paste': 0.0, 'cfg': None, 'hydra': {'output_subdir': None, 'run': {'dir': '.'}}, 'v5loader': False}
  55. '''
  56. self.device = utils.torch_utils.select_device(self.args.device, self.args.batch) # self.device=device(type='cuda', index=0)
  57. self.check_resume()
  58. self.console = LOGGER
  59. self.validator = None
  60. self.model = None
  61. self.callbacks = defaultdict( list)
  62. init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)
  63. # Dirs
  64. project = self.args.project or Path(SETTINGS[ 'runs_dir']) / self.args.task # project='yolov8/runs/detect'
  65. name = self.args.name or f"{self.args.mode}" # name='train'
  66. self.save_dir = Path(
  67. self.args.get(
  68. "save_dir",
  69. increment_path(Path(project) / name, exist_ok=self.args.exist_ok if RANK in {- 1, 0} else True))) # self.save_dir='yolov8/runs/detect/train'
  70. self.wdir = self.save_dir / 'weights' # self.wdir='yolov8/runs/detect/train/weights'
  71. if RANK in {- 1, 0}:
  72. self.wdir.mkdir(parents= True, exist_ok= True) # make dir
  73. with open_dict(self.args):
  74. self.args.save_dir = str(self.save_dir)
  75. yaml_save(self.save_dir / 'args.yaml', OmegaConf.to_container(self.args, resolve= True)) # save run args
  76. self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt' # checkpoint paths
  77. self.batch_size = self.args.batch
  78. self.epochs = self.args.epochs # self.epochs=100
  79. self.start_epoch = 0
  80. if RANK == - 1:
  81. print_args( dict(self.args))
  82. # Device
  83. self.amp = self.device. type != 'cpu'
  84. self.scaler = amp.GradScaler(enabled=self.amp)
  85. if self.device. type == 'cpu':
  86. self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading
  87. # Model and Dataloaders.
  88. self.model = self.args.model # self.model=None
  89. self.data = self.args.data # self.data='wider_face.yaml'
  90. if self.data.endswith( ".yaml"):
  91. self.data = check_dataset_yaml(self.data)
  92. '''
  93. self.data=
  94. {'path': PosixPath('yolov8/datasets/wider_face'),
  95. 'train': 'yolov8/datasets/wider_face/images/train',
  96. 'val': 'yolov8/datasets/wider_face/images/val',
  97. 'test': None,
  98.     'names': {0: 'face'},
  99. 'download': ,
  100. 'yaml_file': 'yolov8/ultralytics/yolo/data/datasets/wider_face.yaml',
  101. 'nc': 1}
  102. '''
  103. else:
  104. self.data = check_dataset(self.data)
  105. self.trainset, self.testset = self.get_dataset(self.data) # self.trainset='yolov8/datasets/wider_face/images/train', self.testset='yolov8/datasets/wider_face/images/val'
  106. self.ema = None
  107. # Optimization utils init
  108. self.lf = None
  109. self.scheduler = None
  110. # Epoch level metrics
  111. self.best_fitness = None
  112. self.fitness = None
  113. self.loss = None
  114. self.tloss = None
  115. self.loss_names = [ 'Loss']
  116. self.csv = self.save_dir / 'results.csv' # self.csv='yolov8/runs/detect/train/results.csv'
  117. self.plot_idx = [ 0, 1, 2]
  118. # Callbacks
  119. self.callbacks = defaultdict( list, {k: [v] for k, v in callbacks.default_callbacks.items()}) # add callbacks
  120. if RANK in { 0, - 1}:
  121. callbacks.add_integration_callbacks(self)

初始化完self.trainer后,便开始训练,

self.trainer.train()

同样,跳转到BaseTrainer的train()方法:


   
  1. def train( self):
  2. # Allow device='', device=None on Multi-GPU systems to default to device=0
  3. if isinstance(self.args.device, int) or self.args.device: # i.e. device=0 or device=[0,1,2,3]
  4. world_size = torch.cuda.device_count()
  5. elif torch.cuda.is_available(): # i.e. device=None or device=''
  6. world_size = 1 # default to device 0
  7. else: # i.e. device='cpu' or 'mps'
  8. world_size = 0
  9. # Run subprocess if DDP training, else train normally
  10. if world_size > 1 and "LOCAL_RANK" not in os.environ:
  11. command = generate_ddp_command(world_size, self)
  12. try:
  13. subprocess.run(command)
  14. except Exception as e:
  15. self.console(e)
  16. finally:
  17. ddp_cleanup(command, self)
  18. else:
  19. self._do_train( int(os.getenv( "RANK", - 1)), world_size) # world_size=1

本例中没有指定device(self.args.device=None),且CUDA可用,因此world_size=1,不会启动DDP子进程,而是直接进入self._do_train:


   
  1. def _do_train( self, rank=-1, world_size=1): # rank=-1, world_size=1
  2. if world_size > 1:
  3. self._setup_ddp(rank, world_size)
  4. self._setup_train(rank, world_size) # 设置与训练相关的参数, 如: optimizer、scheduler、train_loader、test_loader、validator、metrics等
  5. self.epoch_time = None
  6. self.epoch_time_start = time.time()
  7. self.train_time_start = time.time()
  8. nb = len(self.train_loader) # number of batches
  9. nw = max( round(self.args.warmup_epochs * nb), 100) # number of warmup iterations
  10. last_opt_step = - 1
  11. self.run_callbacks( "on_train_start")
  12. self.log( f"Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n"
  13. f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
  14. f"Logging results to {colorstr('bold', self.save_dir)}\n"
  15. f"Starting training for {self.epochs} epochs...")
  16. if self.args.close_mosaic:
  17. base_idx = (self.epochs - self.args.close_mosaic) * nb
  18. self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
  19. for epoch in range(self.start_epoch, self.epochs):
  20. self.epoch = epoch
  21. self.run_callbacks( "on_train_epoch_start")
  22. self.model.train()
  23. if rank != - 1:
  24. self.train_loader.sampler.set_epoch(epoch)
  25. pbar = enumerate(self.train_loader)
  26. # Update dataloader attributes (optional)
  27. if epoch == (self.epochs - self.args.close_mosaic):
  28. self.console.info( "Closing dataloader mosaic")
  29. if hasattr(self.train_loader.dataset, 'mosaic'):
  30. self.train_loader.dataset.mosaic = False
  31. if hasattr(self.train_loader.dataset, 'close_mosaic'):
  32. self.train_loader.dataset.close_mosaic(hyp=self.args)
  33. if rank in {- 1, 0}:
  34. self.console.info(self.progress_string())
  35. pbar = tqdm( enumerate(self.train_loader), total=nb, bar_format=TQDM_BAR_FORMAT)
  36. self.tloss = None
  37. self.optimizer.zero_grad()
  38. for i, batch in pbar:
  39. self.run_callbacks( "on_train_batch_start")
  40. # Warmup
  41. ni = i + nb * epoch
  42. if ni <= nw:
  43. xi = [ 0, nw] # x interp
  44. self.accumulate = max( 1, np.interp(ni, xi, [ 1, self.args.nbs / self.batch_size]). round())
  45. for j, x in enumerate(self.optimizer.param_groups):
  46. # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
  47. x[ 'lr'] = np.interp(
  48. ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x[ 'initial_lr'] * self.lf(epoch)])
  49. if 'momentum' in x:
  50. x[ 'momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
  51. # Forward
  52. with torch.cuda.amp.autocast(self.amp):
  53. batch = self.preprocess_batch(batch)
  54. preds = self.model(batch[ "img"])
  55. self.loss, self.loss_items = self.criterion(preds, batch)
  56. if rank != - 1:
  57. self.loss *= world_size
  58. self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
  59. else self.loss_items
  60. # Backward
  61. self.scaler.scale(self.loss).backward()
  62. # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
  63. if ni - last_opt_step >= self.accumulate:
  64. self.optimizer_step()
  65. last_opt_step = ni
  66. # Log
  67. mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
  68. loss_len = self.tloss.shape[ 0] if len(self.tloss.size()) else 1
  69. losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
  70. if rank in {- 1, 0}:
  71. pbar.set_description(
  72. ( '%11s' * 2 + '%11.4g' * ( 2 + loss_len)) %
  73. ( f'{epoch + 1}/{self.epochs}', mem, *losses, batch[ "cls"].shape[ 0], batch[ "img"].shape[- 1]))
  74. self.run_callbacks( 'on_batch_end')
  75. if self.args.plots and ni in self.plot_idx:
  76. self.plot_training_samples(batch, ni)
  77. self.run_callbacks( "on_train_batch_end")
  78. self.lr = { f"lr/pg{ir}": x[ 'lr'] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
  79. self.scheduler.step()
  80. self.run_callbacks( "on_train_epoch_end")
  81. if rank in {- 1, 0}:
  82. # Validation
  83. self.ema.update_attr(self.model, include=[ 'yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
  84. final_epoch = (epoch + 1 == self.epochs)
  85. if self.args.val or final_epoch:
  86. self.metrics, self.fitness = self.validate()
  87. self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
  88. # Save model
  89. if self.args.save or (epoch + 1 == self.epochs):
  90. self.save_model()
  91. self.run_callbacks( 'on_model_save')
  92. tnow = time.time()
  93. self.epoch_time = tnow - self.epoch_time_start
  94. self.epoch_time_start = tnow
  95. self.run_callbacks( "on_fit_epoch_end")
  96. # TODO: termination condition
  97. if rank in {- 1, 0}:
  98. # Do final val with best.pt
  99. self.log( f'\n{epoch - self.start_epoch + 1} epochs completed in '
  100. f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
  101. self.final_eval()
  102. if self.args.plots:
  103. self.plot_metrics()
  104. self.log( f"Results saved to {colorstr('bold', self.save_dir)}")
  105. self.run_callbacks( 'on_train_end')
  106. torch.cuda.empty_cache()
  107. self.run_callbacks( 'teardown')

以上便是yolov8的训练过程。


转载:https://blog.csdn.net/qq_38964360/article/details/128739669
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场