TODO 1.0.0 - GitHub Issues

[x] Metrics @gabeorlanski
[x] Tensorboard #55
[x] Logging
[x] DynamicScaling @ryanccarelli #52
[x] Gradient Accumulation @AkashGanesan #56
[x] Checkpointing @ryanccarelli #55
[ ] Improved config (.ipynb) @AkashGanesan
[x] Modules from pretrained
[ ] BYOL @ryanccarelli
- [ ] Implementation
- [x] Separate preprocess and postprocess pipelines
- [ ] Fix RandomCrop augmentation
- [ ] Benchmarks

[ ] DINO

[ ] Implementation

[ ] Centering Layer

def update_center(self, teacher_output):
    """
    Update center used for teacher output.

    The sum of ``teacher_output`` over dim 0 is reduced across all
    distributed workers, divided by the total number of rows seen by all
    workers, and blended into ``self.center`` with an exponential moving
    average weighted by ``self.center_momentum``.

    Args:
        teacher_output: tensor of teacher outputs; it is reduced over dim 0
            (presumably shaped (batch, out_dim) -- confirm against caller).

    Returns:
        None. ``self.center`` is rebound to the updated value.
    """
    batch_center = torch.sum(teacher_output, dim=0, keepdim=True)

    # Without an initialised process group this process is the only worker,
    # so there is nothing to reduce and the world size is 1.
    world_size = 1
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(batch_center)
        world_size = dist.get_world_size()
    batch_center = batch_center / (len(teacher_output) * world_size)

    # ema update -- applied exactly once per call
    self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)

[ ] DINO/SWAV MLP ("The projection head consists of a 3-layer multi-layer perceptron (MLP) with hidden dimension 2048 followed by ℓ2 normalization and a weight normalized fully connected layer [61] with K dimensions, which is similar to the design from SwAV") https://github.com/facebookresearch/dino

class DINOHead(nn.Module):
    """Projection head: an MLP, L2 normalisation, then a weight-normalised linear layer.

    Args:
        in_dim: input feature size of the first linear layer.
        out_dim: output size of the final weight-normalised layer.
        use_bn: when true, insert ``BatchNorm1d`` after each hidden linear layer.
        norm_last_layer: when true, freeze the weight-norm gain of the last
            layer (it stays at its initial value of 1).
        nlayers: number of linear layers in the MLP; values below 1 are
            treated as 1, which yields a single ``in_dim -> bottleneck_dim``
            linear layer.
        hidden_dim: width of the hidden linear layers.
        bottleneck_dim: output size of the MLP and input size of the last layer.
    """

    def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048, bottleneck_dim=256):
        super().__init__()
        nlayers = max(nlayers, 1)
        if nlayers == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            # nlayers - 1 hidden stages (Linear [+ BatchNorm] + GELU); only
            # the first one reads in_dim, the rest are hidden_dim wide.
            layers = []
            fan_in = in_dim
            for _ in range(nlayers - 1):
                layers.append(nn.Linear(fan_in, hidden_dim))
                if use_bn:
                    layers.append(nn.BatchNorm1d(hidden_dim))
                layers.append(nn.GELU())
                fan_in = hidden_dim
            layers.append(nn.Linear(hidden_dim, bottleneck_dim))
            self.mlp = nn.Sequential(*layers)

        # Runs before last_layer exists, so only the MLP layers are visited
        # by _init_weights; last_layer keeps nn.Linear's default init.
        self.apply(self._init_weights)

        self.last_layer = nn.utils.weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        """Initialise linear layers: truncated-normal weights (std 0.02), zero bias."""
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Apply the MLP, L2-normalise along the last dim, then the last layer."""
        x = self.mlp(x)
        x = nn.functional.normalize(x, dim=-1, p=2)
        return self.last_layer(x)

[ ] Benchmarks

[ ] SWAV
- [ ] Implementation
- [ ] Benchmarks
[ ] Documentation
- [ ] core
- [ ] introduction/overview
- [ ] model descriptions
[ ] Tests
[x] Load pretrained model for individual component (only ViT for example)

ryanccarelli / ssljax

TODO 1.0.0 #51