This post follows on from the previous one: https://blog.csdn.net/qq_38964360/article/details/128728145?spm=1001.2014.3001.5501
Today I step through the YOLOv8 model training process in the debugger.
Create a training script python_example.py in the project. The code is as follows:
# filename: python_example.py
# dir: yolov8/python_example.py
from ultralytics import YOLO

# build a new model from scratch
model = YOLO("yolov8/ultralytics/models/v8/yolov8m.yaml")

# train the model
results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)
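Incidentally, the same entry script also works when starting from pretrained weights instead of a bare yaml; as the __init__ walkthrough below shows, the file suffix decides which branch runs ('.pt' goes to _load, '.yaml' goes to _new). A minimal sketch, assuming a local yolov8m.pt checkpoint is available:

# sketch: start from a pretrained checkpoint instead of a model yaml
# (assumes a yolov8m.pt file exists locally; the dataset path is the same as above)
from ultralytics import YOLO

model = YOLO("yolov8m.pt")  # '.pt' suffix -> YOLO._load is called instead of YOLO._new
results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)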
The example above first initializes the YOLO class from the yolov8m.yaml file:

model = YOLO("yolov8/ultralytics/models/v8/yolov8m.yaml")

Next, let's look at YOLO's __init__:
# dir: yolov8/ultralytics/yolo/engine/model.py
class YOLO:

    def __init__(self, model='yolov8n.yaml', type="v8") -> None:
        """
        Initializes the YOLO object.

        Args:
            model (str, Path): model to load or create
            type (str): Type/version of models to use. Defaults to "v8".
        """
        self.type = type
        self.ModelClass = None  # model class
        self.TrainerClass = None  # trainer class
        self.ValidatorClass = None  # validator class
        self.PredictorClass = None  # predictor class
        self.model = None  # model object
        self.trainer = None  # trainer object
        self.task = None  # task type
        self.ckpt = None  # if loaded from *.pt
        self.cfg = None  # if loaded from *.yaml
        self.ckpt_path = None
        self.overrides = {}  # overrides for trainer object

        # Load or create new YOLO model
        {'.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)
The key part of this block is the last line:
{'.pt': self._load, '.yaml': self._new}[Path(model).suffix](model)
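This is a plain dict-based dispatch: the file suffix is the key, and the selected bound method is immediately called with the path. A standalone sketch of the pattern with hypothetical handlers (not the library's code):

from pathlib import Path

def _load(path):  # hypothetical stand-in for YOLO._load
    print(f"loading weights from {path}")

def _new(path):   # hypothetical stand-in for YOLO._new
    print(f"building a new model from {path}")

for model in ("yolov8m.pt", "yolov8m.yaml"):
    {'.pt': _load, '.yaml': _new}[Path(model).suffix](model)
# a suffix not in the dict (e.g. '.onnx') would raise KeyError here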
Because the model argument we passed in (yolov8m.yaml) ends in '.yaml', the code dispatches to self._new, which builds the model from the yaml file:
def _new(self, cfg: str, verbose=True):
    cfg = check_yaml(cfg)  # cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'
    cfg_dict = yaml_load(cfg, append_filename=True)  # model dict
    '''
    cfg_dict =
    {'nc': 80, 'depth_multiple': 0.33, 'width_multiple': 0.25,
     'backbone': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
     'head': [[...], [...], [...], [...], [...], [...], [...], [...], [...], ...],
     'yaml_file': 'yolov8/ultralytics/models/v8/yolov8m.yaml',
     'ch': 3}
    '''
    self.task = guess_task_from_head(cfg_dict["head"][-1][-2])  # self.task='detect'
    self.ModelClass, self.TrainerClass, self.ValidatorClass, self.PredictorClass = \
        self._guess_ops_from_task(self.task)  # pick ModelClass, TrainerClass, etc. by task; self.TrainerClass=DetectionTrainer
    self.model = self.ModelClass(cfg_dict, verbose=verbose)  # self.model=DetectionModel(); DetectionModel is defined in yolov8/ultralytics/nn/tasks.py
    self.cfg = cfg  # self.cfg='yolov8/ultralytics/models/v8/yolov8m.yaml'


def _guess_ops_from_task(self, task):  # task='detect'
    model_class, train_lit, val_lit, pred_lit = MODEL_MAP[task]
    # model_class="<class 'ultralytics.nn.tasks.DetectionModel'>"
    # warning: eval is unsafe. Use with caution
    trainer_class = eval(train_lit.replace("TYPE", f"{self.type}"))
    # trainer_class="<class 'ultralytics.yolo.v8.detect.train.DetectionTrainer'>"
    validator_class = eval(val_lit.replace("TYPE", f"{self.type}"))
    # validator_class="<class 'ultralytics.yolo.v8.detect.val.DetectionValidator'>"
    predictor_class = eval(pred_lit.replace("TYPE", f"{self.type}"))
    # predictor_class="<class 'ultralytics.yolo.v8.detect.predict.DetectionPredictor'>"

    return model_class, trainer_class, validator_class, predictor_class
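The comment in _guess_ops_from_task already warns that eval is unsafe. As a side note, the same "template string to class" lookup can be done without eval by splitting the dotted path and importing it; a minimal sketch (the dotted path in the usage comment is the one printed above, and the helper name resolve_class is my own):

import importlib

def resolve_class(dotted: str):
    """Resolve 'package.module.ClassName' to the class object without eval()."""
    module_path, _, cls_name = dotted.rpartition(".")
    return getattr(importlib.import_module(module_path), cls_name)

# e.g. resolve_class("ultralytics.yolo.v8.detect.train.DetectionTrainer")
# returns the DetectionTrainer class, assuming the package is importable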
Once the model (YOLO) has been instantiated, training can start:

results = model.train(data="yolov8/ultralytics/yolo/data/datasets/wider_face.yaml", epochs=100)

Jump to train() in class YOLO:
def train(self, **kwargs):
    """
    Trains the model on a given dataset.

    Args:
        **kwargs (Any): Any number of arguments representing the training configuration. List of all args can be found in 'config' section.
            You can pass all arguments as a yaml file in `cfg`. Other args are ignored if `cfg` file is passed.
    """
    overrides = self.overrides.copy()  # overrides={}
    overrides.update(kwargs)  # overrides={'data': 'wider_face.yaml', 'epochs': 100}
    if kwargs.get("cfg"):
        LOGGER.info(f"cfg file passed. Overriding default params with {kwargs['cfg']}.")
        overrides = yaml_load(check_yaml(kwargs["cfg"]), append_filename=True)
    overrides["task"] = self.task
    overrides["mode"] = "train"
    # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
    if not overrides.get("data"):
        raise AttributeError("dataset not provided! Please define `data` in config.yaml or pass as an argument.")
    if overrides.get("resume"):
        overrides["resume"] = self.ckpt_path

    self.trainer = self.TrainerClass(overrides=overrides)
    if not overrides.get("resume"):  # manually set model only if not resuming
        self.trainer.model = self.trainer.get_model(weights=self.model if self.ckpt else None, cfg=self.model.yaml)
        # if a checkpoint was loaded, reuse its weights; otherwise build a new model from the yaml
        self.model = self.trainer.model
    self.trainer.train()
    # update model and configs after training
    self.model, _ = attempt_load_one_weight(str(self.trainer.best))
    self.overrides = self.model.args
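The override handling at the top of train() is just dictionary merging with a fixed precedence: the stored overrides first, then the keyword arguments, then the forced task/mode keys, plus a guard that data must be present. A standalone sketch with plain dicts (values taken from the walkthrough above; build_overrides is my own helper name):

def build_overrides(model_overrides, **kwargs):
    overrides = model_overrides.copy()   # start from the YOLO object's stored overrides
    overrides.update(kwargs)             # keyword args passed to train() win
    overrides["task"] = "detect"         # train() always forces task and mode
    overrides["mode"] = "train"
    if not overrides.get("data"):
        raise AttributeError("dataset not provided!")
    return overrides

print(build_overrides({}, data="wider_face.yaml", epochs=100))
# {'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}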
The key line above is self.trainer.train(). Here self.trainer is <class 'ultralytics.yolo.v8.detect.train.DetectionTrainer'>, and DetectionTrainer inherits from class BaseTrainer (dir: 'yolov8/ultralytics/yolo/engine/trainer.py'). BaseTrainer is initialized with the overrides dict:

self.trainer = self.TrainerClass(overrides=overrides)
class BaseTrainer:
    """
    BaseTrainer

    A base class for creating trainers.

    Attributes:
        args (OmegaConf): Configuration for the trainer.
        check_resume (method): Method to check if training should be resumed from a saved checkpoint.
        console (logging.Logger): Logger instance.
        validator (BaseValidator): Validator instance.
        model (nn.Module): Model instance.
        callbacks (defaultdict): Dictionary of callbacks.
        save_dir (Path): Directory to save results.
        wdir (Path): Directory to save weights.
        last (Path): Path to last checkpoint.
        best (Path): Path to best checkpoint.
        batch_size (int): Batch size for training.
        epochs (int): Number of epochs to train for.
        start_epoch (int): Starting epoch for training.
        device (torch.device): Device to use for training.
        amp (bool): Flag to enable AMP (Automatic Mixed Precision).
        scaler (amp.GradScaler): Gradient scaler for AMP.
        data (str): Path to data.
        trainset (torch.utils.data.Dataset): Training dataset.
        testset (torch.utils.data.Dataset): Testing dataset.
        ema (nn.Module): EMA (Exponential Moving Average) of the model.
        lf (nn.Module): Loss function.
        scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
        best_fitness (float): The best fitness value achieved.
        fitness (float): Current fitness value.
        loss (float): Current loss value.
        tloss (float): Total loss value.
        loss_names (list): List of loss names.
        csv (Path): Path to results CSV file.
    """

    def __init__(self, config=DEFAULT_CONFIG, overrides=None):
        # overrides={'data': 'wider_face.yaml', 'epochs': 100, 'task': 'detect', 'mode': 'train'}
        if overrides is None:
            overrides = {}
        self.args = get_config(config, overrides)  # config='yolov8/ultralytics/yolo/configs/default.yaml'
        '''
        self.args =
        {'task': 'detect', 'mode': 'train', 'model': None, 'data': 'wider_face.yaml', 'epochs': 100, 'patience': 50, 'batch': 16,
         'imgsz': 640, 'save': True, 'cache': False, 'device': None, 'workers': 8, 'project': None, 'name': None,
         'exist_ok': False, 'pretrained': False, 'optimizer': 'SGD', 'verbose': False, 'seed': 0, 'deterministic': True,
         'single_cls': False, 'image_weights': False, 'rect': False, 'cos_lr': False, 'close_mosaic': 10, 'resume': False,
         'overlap_mask': True, 'mask_ratio': 4, 'dropout': 0.0, 'val': True, 'save_json': False, 'save_hybrid': False,
         'conf': None, 'iou': 0.7, 'max_det': 300, 'half': False, 'dnn': False, 'plots': True, 'source': None, 'show': False,
         'save_txt': False, 'save_conf': False, 'save_crop': False, 'hide_labels': False, 'hide_conf': False, 'vid_stride': 1,
         'line_thickness': 3, 'visualize': False, 'augment': False, 'agnostic_nms': False, 'retina_masks': False,
         'format': 'torchscript', 'keras': False, 'optimize': False, 'int8': False, 'dynamic': False, 'simplify': False,
         'opset': 17, 'workspace': 4, 'nms': False, 'lr0': 0.01, 'lrf': 0.01, 'momentum': 0.937, 'weight_decay': 0.0005,
         'warmup_epochs': 3.0, 'warmup_momentum': 0.8, 'warmup_bias_lr': 0.1, 'box': 7.5, 'cls': 0.5, 'dfl': 1.5,
         'fl_gamma': 0.0, 'label_smoothing': 0.0, 'nbs': 64, 'hsv_h': 0.015, 'hsv_s': 0.7, 'hsv_v': 0.4, 'degrees': 0.0,
         'translate': 0.1, 'scale': 0.5, 'shear': 0.0, 'perspective': 0.0, 'flipud': 0.0, 'fliplr': 0.5, 'mosaic': 1.0,
         'mixup': 0.0, 'copy_paste': 0.0, 'cfg': None, 'hydra': {'output_subdir': None, 'run': {'dir': '.'}}, 'v5loader': False}
        '''
        self.device = utils.torch_utils.select_device(self.args.device, self.args.batch)  # self.device=device(type='cuda', index=0)
        self.check_resume()
        self.console = LOGGER
        self.validator = None
        self.model = None
        self.callbacks = defaultdict(list)
        init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)

        # Dirs
        project = self.args.project or Path(SETTINGS['runs_dir']) / self.args.task  # project='yolov8/runs/detect'
        name = self.args.name or f"{self.args.mode}"  # name='train'
        self.save_dir = Path(self.args.get(
            "save_dir",
            increment_path(Path(project) / name, exist_ok=self.args.exist_ok if RANK in {-1, 0} else True)))
        # self.save_dir='yolov8/runs/detect/train'
        self.wdir = self.save_dir / 'weights'  # self.wdir='yolov8/runs/detect/train/weights'
        if RANK in {-1, 0}:
            self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
            with open_dict(self.args):
                self.args.save_dir = str(self.save_dir)
            yaml_save(self.save_dir / 'args.yaml', OmegaConf.to_container(self.args, resolve=True))  # save run args
        self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt'  # checkpoint paths

        self.batch_size = self.args.batch
        self.epochs = self.args.epochs  # self.epochs=100
        self.start_epoch = 0
        if RANK == -1:
            print_args(dict(self.args))

        # Device
        self.amp = self.device.type != 'cpu'
        self.scaler = amp.GradScaler(enabled=self.amp)
        if self.device.type == 'cpu':
            self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading

        # Model and Dataloaders.
        self.model = self.args.model  # self.model=None
        self.data = self.args.data  # self.data='wider_face.yaml'
        if self.data.endswith(".yaml"):
            self.data = check_dataset_yaml(self.data)
            '''
            self.data =
            {'path': PosixPath('yolov8/datasets/wider_face'),
             'train': 'yolov8/datasets/wider_face/images/train',
             'val': 'yolov8/datasets/wider_face/images/val',
             'test': None,
             'names': {0: 'face'},
             'download': ,
             'yaml_file': 'yolov8/ultralytics/yolo/data/datasets/wider_face.yaml',
             'nc': 1}
            '''
        else:
            self.data = check_dataset(self.data)
        self.trainset, self.testset = self.get_dataset(self.data)
        # self.trainset='yolov8/datasets/wider_face/images/train', self.testset='yolov8/datasets/wider_face/images/val'
        self.ema = None

        # Optimization utils init
        self.lf = None
        self.scheduler = None

        # Epoch level metrics
        self.best_fitness = None
        self.fitness = None
        self.loss = None
        self.tloss = None
        self.loss_names = ['Loss']
        self.csv = self.save_dir / 'results.csv'  # self.csv='yolov8/runs/detect/train/results.csv'
        self.plot_idx = [0, 1, 2]

        # Callbacks
        self.callbacks = defaultdict(list, {k: [v] for k, v in callbacks.default_callbacks.items()})  # add callbacks
        if RANK in {0, -1}:
            callbacks.add_integration_callbacks(self)
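The save_dir logic is worth calling out: project defaults to runs/detect, name defaults to the mode ('train'), and increment_path bumps the name when the directory already exists, which is why repeated runs land in train, train2, train3, and so on. A rough standalone sketch of that numbering behaviour (not the library's actual increment_path implementation):

from pathlib import Path

def increment_path_sketch(path: Path, exist_ok: bool = False) -> Path:
    """Return path unchanged if it is free, otherwise append 2, 3, ... until a free name is found."""
    if exist_ok or not path.exists():
        return path
    n = 2
    while (candidate := path.with_name(f"{path.name}{n}")).exists():
        n += 1
    return candidate

# with runs/detect/train already on disk, the next run would be saved to runs/detect/train2
print(increment_path_sketch(Path("runs/detect/train")))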
Once self.trainer has been initialized, training starts:

self.trainer.train()

Again, jump to train(), this time in BaseTrainer:
def train(self):
    # Allow device='', device=None on Multi-GPU systems to default to device=0
    if isinstance(self.args.device, int) or self.args.device:  # i.e. device=0 or device=[0,1,2,3]
        world_size = torch.cuda.device_count()
    elif torch.cuda.is_available():  # i.e. device=None or device=''
        world_size = 1  # default to device 0
    else:  # i.e. device='cpu' or 'mps'
        world_size = 0

    # Run subprocess if DDP training, else train normally
    if world_size > 1 and "LOCAL_RANK" not in os.environ:
        command = generate_ddp_command(world_size, self)
        try:
            subprocess.run(command)
        except Exception as e:
            self.console(e)
        finally:
            ddp_cleanup(command, self)
    else:
        self._do_train(int(os.getenv("RANK", -1)), world_size)  # world_size=1
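The branch taken here depends entirely on what world_size resolves to. That decision can be summarised as a small standalone function (pick_world_size is my own name; the device values mirror the comments in the code above):

import torch

def pick_world_size(device):
    if isinstance(device, int) or device:  # device=0 or device=[0,1,2,3]
        return torch.cuda.device_count()
    if torch.cuda.is_available():          # device=None or device=''
        return 1                           # default to a single GPU
    return 0                               # device='cpu' or 'mps'

print(pick_world_size(None))  # 1 on a CUDA machine, 0 on a CPU-only machine

With world_size of 0 or 1, or when LOCAL_RANK is already set by a DDP launcher, training runs in-process via _do_train; otherwise a DDP subprocess command is generated and run.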
Since world_size is 1, execution goes straight into self._do_train:
def _do_train(self, rank=-1, world_size=1):  # rank=-1, world_size=1
    if world_size > 1:
        self._setup_ddp(rank, world_size)

    self._setup_train(rank, world_size)
    # sets up everything training needs: optimizer, scheduler, train_loader, test_loader, validator, metrics, etc.

    self.epoch_time = None
    self.epoch_time_start = time.time()
    self.train_time_start = time.time()
    nb = len(self.train_loader)  # number of batches
    nw = max(round(self.args.warmup_epochs * nb), 100)  # number of warmup iterations
    last_opt_step = -1
    self.run_callbacks("on_train_start")
    self.log(f"Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n"
             f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
             f"Logging results to {colorstr('bold', self.save_dir)}\n"
             f"Starting training for {self.epochs} epochs...")
    if self.args.close_mosaic:
        base_idx = (self.epochs - self.args.close_mosaic) * nb
        self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
    for epoch in range(self.start_epoch, self.epochs):
        self.epoch = epoch
        self.run_callbacks("on_train_epoch_start")
        self.model.train()
        if rank != -1:
            self.train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(self.train_loader)
        # Update dataloader attributes (optional)
        if epoch == (self.epochs - self.args.close_mosaic):
            self.console.info("Closing dataloader mosaic")
            if hasattr(self.train_loader.dataset, 'mosaic'):
                self.train_loader.dataset.mosaic = False
            if hasattr(self.train_loader.dataset, 'close_mosaic'):
                self.train_loader.dataset.close_mosaic(hyp=self.args)

        if rank in {-1, 0}:
            self.console.info(self.progress_string())
            pbar = tqdm(enumerate(self.train_loader), total=nb, bar_format=TQDM_BAR_FORMAT)
        self.tloss = None
        self.optimizer.zero_grad()
        for i, batch in pbar:
            self.run_callbacks("on_train_batch_start")
            # Warmup
            ni = i + nb * epoch
            if ni <= nw:
                xi = [0, nw]  # x interp
                self.accumulate = max(1, np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round())
                for j, x in enumerate(self.optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x['initial_lr'] * self.lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

            # Forward
            with torch.cuda.amp.autocast(self.amp):
                batch = self.preprocess_batch(batch)
                preds = self.model(batch["img"])
                self.loss, self.loss_items = self.criterion(preds, batch)
                if rank != -1:
                    self.loss *= world_size
                self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
                    else self.loss_items

            # Backward
            self.scaler.scale(self.loss).backward()

            # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
            if ni - last_opt_step >= self.accumulate:
                self.optimizer_step()
                last_opt_step = ni

            # Log
            mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
            loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
            losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
            if rank in {-1, 0}:
                pbar.set_description(
                    ('%11s' * 2 + '%11.4g' * (2 + loss_len)) %
                    (f'{epoch + 1}/{self.epochs}', mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1]))
                self.run_callbacks('on_batch_end')
                if self.args.plots and ni in self.plot_idx:
                    self.plot_training_samples(batch, ni)

            self.run_callbacks("on_train_batch_end")

        self.lr = {f"lr/pg{ir}": x['lr'] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers

        self.scheduler.step()
        self.run_callbacks("on_train_epoch_end")

        if rank in {-1, 0}:

            # Validation
            self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
            final_epoch = (epoch + 1 == self.epochs)
            if self.args.val or final_epoch:
                self.metrics, self.fitness = self.validate()
            self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})

            # Save model
            if self.args.save or (epoch + 1 == self.epochs):
                self.save_model()
                self.run_callbacks('on_model_save')

        tnow = time.time()
        self.epoch_time = tnow - self.epoch_time_start
        self.epoch_time_start = tnow
        self.run_callbacks("on_fit_epoch_end")
        # TODO: termination condition

    if rank in {-1, 0}:
        # Do final val with best.pt
        self.log(f'\n{epoch - self.start_epoch + 1} epochs completed in '
                 f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
        self.final_eval()
        if self.args.plots:
            self.plot_metrics()
        self.log(f"Results saved to {colorstr('bold', self.save_dir)}")
        self.run_callbacks('on_train_end')
    torch.cuda.empty_cache()
    self.run_callbacks('teardown')
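Two details in the warmup block are easy to miss: accumulate ramps from 1 up to nbs/batch (64/16 = 4 with the defaults above), so early optimizer steps happen less often, and the bias group's lr falls from warmup_bias_lr while the other groups rise from 0. A standalone sketch of those ramps using np.interp, with the default hyperparameters from the args dump, an illustrative nb, and the lf(epoch) decay factor ignored for simplicity:

import numpy as np

nb, warmup_epochs = 1000, 3.0             # nb (batches per epoch) is illustrative
nw = max(round(warmup_epochs * nb), 100)  # number of warmup iterations
xi = [0, nw]
lr0, warmup_bias_lr, nbs, batch = 0.01, 0.1, 64, 16

for ni in (0, nw // 2, nw):               # ni = i + nb * epoch, as in the loop above
    accumulate = max(1, np.interp(ni, xi, [1, nbs / batch]).round())  # ramps 1 -> 4
    bias_lr = np.interp(ni, xi, [warmup_bias_lr, lr0])   # falls 0.1 -> 0.01
    other_lr = np.interp(ni, xi, [0.0, lr0])             # rises 0.0 -> 0.01
    print(f"ni={ni:4d}  accumulate={accumulate:.0f}  bias_lr={bias_lr:.4f}  other_lr={other_lr:.4f}")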
That covers the YOLOv8 training flow.
Reposted from: https://blog.csdn.net/qq_38964360/article/details/128739669