40 行
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			40 行
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
#
# Walks the current working directory for `last.pt` checkpoints, and for every
# checkpoint that still carries optimizer state launches `train.py --resume`
# as a background shell job (one job per run, output discarded).

import os
import shlex
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH

port = 0  # --master_port, incremented so each DDP job gets its own port
path = Path('').resolve()
for last in path.rglob('*/**/last.pt'):
    # Only the 'optimizer' entry is inspected, so keep the tensors on the CPU
    # instead of restoring them onto the device they were saved from.
    # NOTE(review): recent PyTorch releases changed the torch.load default for
    # `weights_only`; confirm these checkpoints still load with the pinned version.
    ckpt = torch.load(last, map_location='cpu')
    if ckpt['optimizer'] is None:
        continue  # no optimizer state saved: not treated as an interrupted run

    # Load opt.yaml (expected two levels above last.pt)
    opt_file = last.parent.parent / 'opt.yaml'
    if not opt_file.is_file():
        # One incomplete run directory should not stop the remaining runs from resuming
        print(f'Skipping {last}: {opt_file} not found')
        continue
    with open(opt_file, errors='ignore') as f:
        opt = yaml.safe_load(f)

    # Get device count
    # str() guards against YAML parsing an unquoted value such as `device: 0` as an int.
    d = str(opt['device']).split(',')  # devices
    nd = len(d)  # number of devices; always >= 1 because ''.split(',') == ['']
    # The former `nd == 0 and torch.cuda.device_count() > 1` clause could never be
    # true (see above) and would have produced `--nproc_per_node 0`, so it is dropped.
    ddp = nd > 1  # distributed data parallel

    target = shlex.quote(str(last))  # survive spaces / shell metacharacters in the path
    if ddp:  # multi-GPU
        port += 1
        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {target}'
    else:  # single-GPU
        cmd = f'python train.py --resume {target}'

    cmd += ' > /dev/null 2>&1 &'  # discard output and run as a background shell job
    print(cmd)
    os.system(cmd)