2021SC@SDUSC
preface
This article analyzes torch in yolov5 code_ Utils.py file is mainly some encapsulated tool classes.
torch_distributed_zero_frist function
@contextmanager def torch_distributed_zero_first(local_rank: int): """ Decorator to make all processes in distributed training wait for each local_master to do something. """ if local_rank not in [-1, 0]: dist.barrier(device_ids=[local_rank]) yield if local_rank == 0: dist.barrier(device_ids=[0])
This function is used to deal with the synchronization problem during model distributed training. If the current process is not the main process, set a blocking fence to keep the process in a waiting state and wait for all processes to reach the fence; Release the fence when all processes reach the fence.
date_modified function
def date_modified(path=__file__): # return human-readable file modification date, i.e. '2021-3-26' t = datetime.datetime.fromtimestamp(Path(path).stat().st_mtime) return f'{t.year}-{t.month}-{t.day}'
Change the system time to a human readable time format
select_device function
def select_device(device='', batch_size=None): # device = 'cpu' or '0' or '0,1,2,3' s = f'YOLOv5 🚀 {git_describe() or date_modified()} torch {torch.__version__} ' # string device = str(device).strip().lower().replace('cuda:', '') # to string, 'cuda:0' to '0' cpu = device == 'cpu' if cpu: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # force torch.cuda.is_available() = False elif device: # non-cpu device requested os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable assert torch.cuda.is_available(), f'CUDA unavailable, invalid device {device} requested' # check availability cuda = not cpu and torch.cuda.is_available() if cuda: devices = device.split(',') if device else '0' # range(torch.cuda.device_count()) # i.e. 0,1,6,7 n = len(devices) # device count if n > 1 and batch_size: # check batch_size is divisible by device_count assert batch_size % n == 0, f'batch-size {batch_size} not multiple of GPU count {n}' space = ' ' * (len(s) + 1) for i, d in enumerate(devices): p = torch.cuda.get_device_properties(i) s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / 1024 ** 2}MB)\n" # bytes to MB else: s += 'CPU\n' LOGGER.info(s.encode().decode('ascii', 'ignore') if platform.system() == 'Windows' else s) # emoji-safe return torch.device('cuda:0' if cuda else 'cpu')
This function selects the training device. When the parameter device is not empty, select the specified device, that is, cpu or gou; When the parameter device is empty, the device is automatically selected according to the machine situation, and gpu is preferred as the training device.
time_sync function
def time_sync(): # pytorch-accurate time if torch.cuda.is_available(): torch.cuda.synchronize() return time.time()
This function is used to accurately obtain the current time during distributed operation.
profile function
def profile(input, ops, n=10, device=None): # YOLOv5 speed/memory/FLOPs profiler # # Usage: # input = torch.randn(16, 3, 640, 640) # m1 = lambda x: x * torch.sigmoid(x) # m2 = nn.SiLU() # profile(input, [m1, m2], n=100) # profile over 100 iterations results = [] logging.basicConfig(format="%(message)s", level=logging.INFO) device = device or select_device() print(f"{'Params':>12s}{'GFLOPs':>12s}{'GPU_mem (GB)':>14s}{'forward (ms)':>14s}{'backward (ms)':>14s}" f"{'input':>24s}{'output':>24s}") for x in input if isinstance(input, list) else [input]: x = x.to(device) x.requires_grad = True for m in ops if isinstance(ops, list) else [ops]: m = m.to(device) if hasattr(m, 'to') else m # device m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m tf, tb, t = 0., 0., [0., 0., 0.] # dt forward, backward try: flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 # GFLOPs except: flops = 0 try: for _ in range(n): t[0] = time_sync() y = m(x) t[1] = time_sync() try: _ = (sum([yi.sum() for yi in y]) if isinstance(y, list) else y).sum().backward() t[2] = time_sync() except Exception as e: # no backward method print(e) t[2] = float('nan') tf += (t[1] - t[0]) * 1000 / n # ms per op forward tb += (t[2] - t[1]) * 1000 / n # ms per op backward mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0 # (GB) s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else 'list' s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else 'list' p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0 # parameters print(f'{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}') results.append([p, flops, mem, tf, tb, s_in, s_out]) except Exception as e: print(e) results.append(None) torch.cuda.empty_cache() return results
This function is used to output some relevant information in the training process, such as forward propagation time, back propagation time, shape of input variable, shape of output variable, etc.
initialize_weights function
def initialize_weights(model): for m in model.modules(): t = type(m) if t is nn.Conv2d: pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif t is nn.BatchNorm2d: m.eps = 1e-3 m.momentum = 0.03 elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]: m.inplace = True
This function initializes the training parameters. The eps of BN is set to 1e-3, the momentum is set to 0.03, and the activation function inplace is True
find_modules function
def find_modules(model, mclass=nn.Conv2d): # Finds layer indices matching module class 'mclass' return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]
This function returns the index value of a module in the whole model
fuse_conv_and_bn function
def fuse_conv_and_bn(conv, bn): # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/ fusedconv = nn.Conv2d(conv.in_channels, conv.out_channels, kernel_size=conv.kernel_size, stride=conv.stride, padding=conv.padding, groups=conv.groups, bias=True).requires_grad_(False).to(conv.weight.device) # prepare filters w_conv = conv.weight.clone().view(conv.out_channels, -1) w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) # prepare spatial bias b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) return fusedconv
In the reasoning test stage, the function fuses the convolution layer and BN layer to accelerate the reasoning.
model_info function
def model_info(model, verbose=False, img_size=640): # Model information. img_size may be int or list, i.e. img_size=640 or img_size=[640, 320] n_p = sum(x.numel() for x in model.parameters()) # number parameters n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients if verbose: print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) for i, (name, p) in enumerate(model.named_parameters()): name = name.replace('module_list.', '') print('%5g %40s %9s %12g %20s %10.3g %10.3g' % (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) try: # FLOPs from thop import profile stride = max(int(model.stride.max()), 32) if hasattr(model, 'stride') else 32 img = torch.zeros((1, model.yaml.get('ch', 3), stride, stride), device=next(model.parameters()).device) # input flops = profile(deepcopy(model), inputs=(img,), verbose=False)[0] / 1E9 * 2 # stride GFLOPs img_size = img_size if isinstance(img_size, list) else [img_size, img_size] # expand if int/float fs = ', %.1f GFLOPs' % (flops * img_size[0] / stride * img_size[1] / stride) # 640x640 GFLOPs except (ImportError, Exception): fs = '' LOGGER.info(f"Model Summary: {len(list(model.modules()))} layers, {n_p} parameters, {n_g} gradients{fs}")
This function is used to output all the information of the model, including the number of all layers, the total parameters of the model, the total parameters of the gradient, img_size, size, etc.
load_classifier function
def load_classifier(name='resnet101', n=2): # Loads a pretrained model reshaped to n-class output model = torchvision.models.__dict__[name](pretrained=True) # ResNet model properties # input_size = [3, 224, 224] # input_space = 'RGB' # input_range = [0, 1] # mean = [0.485, 0.456, 0.406] # std = [0.229, 0.224, 0.225] # Reshape output to n classes filters = model.fc.weight.shape[1] model.fc.bias = nn.Parameter(torch.zeros(n), requires_grad=True) model.fc.weight = nn.Parameter(torch.zeros(n, filters), requires_grad=True) model.fc.out_features = n return model
This function is used to perform secondary classification after detection, and change the output of fc layer of resnet to n, that is, the number of categories to be classified.
scale_img function
def scale_img(img, ratio=1.0, same_shape=False, gs=32): # img(16,3,256,416) # scales img(bs,3,y,x) by ratio constrained to gs-multiple if ratio == 1.0: return img else: h, w = img.shape[2:] s = (int(h * ratio), int(w * ratio)) # new size img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize if not same_shape: # pad/crop img h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)] return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447) # value = imagenet mean
This function is used to scale the picture. ratio is the scale to be scaled. When the same_ When shape is true, the scaled image size is required to be a multiple of gs.
EarlyStopping class
class EarlyStopping: # YOLOv5 simple early stopper def __init__(self, patience=30): self.best_fitness = 0.0 # i.e. mAP self.best_epoch = 0 self.patience = patience or float('inf') # epochs to wait after fitness stops improving to stop self.possible_stop = False # possible stop may occur next epoch def __call__(self, epoch, fitness): if fitness >= self.best_fitness: # >= 0 to allow for early zero-fitness stage of training self.best_epoch = epoch self.best_fitness = fitness delta = epoch - self.best_epoch # epochs without improvement self.possible_stop = delta >= (self.patience - 1) # possible stop may occur next epoch stop = delta >= self.patience # stop training if patience exceeded if stop: LOGGER.info(f'EarlyStopping patience {self.patience} exceeded, stopping training.') return stop
This class is used for the early stop strategy during training, that is, stop training if the current evaluation index is worse than the values of the previous epochs. patience is the tolerance, that is, the maximum value of the difference between the best epoch and the current epoch. When the difference between the current epoch and the best epoch is greater than this value and the current epoch index is not the best, stop training for a long time. Early stop strategy is a common method to prevent training over fitting.
summary
This article analyzes some pytorch tools and methods used, including some training and testing strategies, which are related to pytorch.