moskomule / anatome

Ἀνατομή is a PyTorch library to analyze representations of neural networks
MIT License

CCA is very small between a random net vs a pretrained one, bug? #18

Closed brando90 closed 2 years ago

brando90 commented 2 years ago

I am getting this issue:

import torch
from torchvision.models import resnet18

import anatome
print(anatome)
# from anatome import CCAHook
from anatome import SimilarityHook

model = resnet18(pretrained=True)
random_model = resnet18()
# random_model = resnet18().cuda()

# hook1 = CCAHook(model, "layer1.0.conv1")
# hook2 = CCAHook(random_model, "layer1.0.conv1")
cxa_dist_type = 'pwcca'
layer_name = "layer1.0.conv1"
hook1 = SimilarityHook(model, layer_name, cxa_dist_type)
hook2 = SimilarityHook(random_model, layer_name, cxa_dist_type)

with torch.no_grad():
    model(data[0])  # `data` is a batch from a DataLoader; full script below
    random_model(data[0])
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')
distance_btw_nets = hook1.distance(hook2, size=None)
print(f'{distance_btw_nets=}')

output:

<module 'anatome' from '/Users/brando/anaconda3/envs/metalearning/lib/python3.9/site-packages/anatome/__init__.py'>
distance_btw_nets=0.3089657425880432
distance_btw_nets=-2.468004822731018e-08

The second call is supposed to use the full features, yet the distance is much smaller, when I expected it to increase by a lot, since without downsampling we are using more information.

Is this a bug?
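
As a baseline for interpreting these numbers, here is a minimal self-consistency sketch (my own, assuming only the SimilarityHook API used above and the same `data` batch): the same net compared with itself should be at distance ~0 for any size.

# Hypothetical baseline check, reusing the SimilarityHook API from the snippet above:
# a network compared against itself should give a distance of ~0 regardless of `size`.
hook_a = SimilarityHook(model, layer_name, cxa_dist_type)
hook_b = SimilarityHook(model, layer_name, cxa_dist_type)
with torch.no_grad():
    model(data[0])  # one forward pass populates both hooks on the same module
print(hook_a.distance(hook_b, size=8))     # expect ~0.0
print(hook_a.distance(hook_b, size=None))  # expect ~0.0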

brando90 commented 2 years ago

reproduce here: https://colab.research.google.com/drive/1GrhWrWFPmlc6kmxc0TJY0Nb6qOBBgjzX?usp=sharing

brando90 commented 2 years ago
"""
attempt at a colab: https://colab.research.google.com/drive/1GrhWrWFPmlc6kmxc0TJY0Nb6qOBBgjzX#scrollTo=KhUWNu3J_6i4
"""
#%%

import torch
import torchvision
from torch.nn import functional as F
from torchvision.models import resnet18
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

batch_size = 128

# imagenet = ImageFolder('~/.torch/data/imagenet/val',
#                        transforms.Compose([transforms.CenterCrop(224), transforms.ToTensor(),
#                        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]))
# data = next(iter(DataLoader(imagenet, batch_size=batch_size, num_workers=8)))

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

num_workers = 0

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=num_workers)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=num_workers)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

data = next(iter(trainloader))

#%%

import anatome
print(anatome)
# from anatome import CCAHook
from anatome import SimilarityHook

model = resnet18(pretrained=True)
random_model = resnet18()
# random_model = resnet18().cuda()

# hook1 = CCAHook(model, "layer1.0.conv1")
# hook2 = CCAHook(random_model, "layer1.0.conv1")

cxa_dist_type = 'pwcca'
layer_name = "layer1.0.conv1"

hook1 = SimilarityHook(model, layer_name, cxa_dist_type)
hook2 = SimilarityHook(random_model, layer_name, cxa_dist_type)

with torch.no_grad():
    model(data[0])
    random_model(data[0])
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')
distance_btw_nets = hook1.distance(hook2, size=None)
print(f'{distance_btw_nets=}')

#%%

from meta_learning.base_models.learner_from_opt_as_few_shot_paper import Learner

from argparse import Namespace

args = Namespace()
# args.k_eval = 150
args.image_size = 84
args.bn_eps = 1e-3
args.bn_momentum = 0.95
args.n_classes = 5
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1 = Learner(image_size=args.image_size, bn_eps=args.bn_eps, bn_momentum=args.bn_momentum, n_classes=args.n_classes).to(args.device)
model2 = Learner(image_size=args.image_size, bn_eps=args.bn_eps, bn_momentum=args.bn_momentum, n_classes=args.n_classes).to(args.device)

cxa_dist_type = 'pwcca'
layer_name = "model.features.conv1"

hook1 = SimilarityHook(model1, layer_name, cxa_dist_type)
hook2 = SimilarityHook(model2, layer_name, cxa_dist_type)

with torch.no_grad():
    batch_x = data[0]
    print(f'{batch_x.size()=}')
    model1(batch_x)
    model2(batch_x)
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')

output:

distance_btw_nets=0.3089657425880432
distance_btw_nets=-2.468004822731018e-08

but the distance should increase when the dimensionality of the activations increases, especially when one net is pretrained and the other is random.
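
(For reference, the sanity checks further down this thread suggest an explanation: with size=None the flattened activation dimension D can exceed the batch size B, and once B <= D, CCA can align even unrelated representations almost perfectly, driving the distance to ~0. A minimal numpy sketch of that effect, my own and independent of anatome:)

import numpy as np

rng = np.random.default_rng(0)
B, D = 8, 64  # fewer samples than features, as with un-downsampled activations
X = rng.standard_normal((B, D))
Y = rng.standard_normal((B, D))  # completely unrelated to X

# With B <= D the system X w = y is underdetermined, so least squares solves it
# exactly: X's columns can reproduce any B-dimensional target vector.
w, *_ = np.linalg.lstsq(X, Y[:, 0], rcond=None)
corr = np.corrcoef(X @ w, Y[:, 0])[0, 1]
print(corr)  # ~1.0: a perfect canonical correlation despite independent data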

brando90 commented 2 years ago

Is this correct @moskomule? Shouldn't the distance increase, not decrease, as size=8 goes to size=None (the second call uses the whole activations, according to the code)?

brando90 commented 2 years ago

@moskomule sorry for the ping again...but just wanted to check if this was a bug or not.

Thanks for your time and patience, it's appreciated!

moskomule commented 2 years ago

I'm not sure. I think several methods have been proposed to fix unexpected behaviors of previous works, so it could happen.

brando90 commented 2 years ago

> I'm not sure. I think several methods have been proposed to fix unexpected behaviors of previous works, so it could happen.

did you test your CCA code against, for example, scipy's results (to make sure anatome's implementation is correct)?

moskomule commented 2 years ago

No

brando90 commented 2 years ago

> No

how are you testing it?
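
For reference, a minimal sketch of such a cross-check (my own, not anatome code): scipy itself has no CCA routine, but scikit-learn's CCA can serve as a reference implementation whose canonical correlations can be compared against anatome's on the same data.

import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
B, D = 500, 10
X = rng.standard_normal((B, D))
Y = X @ rng.standard_normal((D, D)) + 0.1 * rng.standard_normal((B, D))  # two correlated views

cca = CCA(n_components=3)
Xc, Yc = cca.fit_transform(X, Y)
# per-component canonical correlations: the reference numbers to match
corrs = [np.corrcoef(Xc[:, i], Yc[:, i])[0, 1] for i in range(3)]
print(corrs)  # compare against anatome's CCA values on the same X, Y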

brando90 commented 2 years ago

@moskomule I ran a sanity check from the original Google tutorial and your code seems to work! Thought you'd be happy to know:

[plot: ndata_vs_svcca_sim]

https://github.com/google/svcca/blob/master/tutorials/001_Introduction.ipynb

As n -> large, svcca converges to the true svcca value; at first the similarity is very high due to the small number of data points.

brando90 commented 2 years ago

actually this stopped working :( need to fix a bug; the library might not be working at all without this sanity check.

brando90 commented 2 years ago

ok it works now. Use this git hash of anatome: c4c069183aca8aad6f73a4b7ab86f7f7e4ca3d04 (short: c4c0691)

moskomule commented 2 years ago

Thanks for reporting. Happy to know that!

brando90 commented 2 years ago

> Thanks for reporting. Happy to know that!

No worries! Will share the code, seems I forgot to earlier:

#%%
"""
The similarity of the same network with itself should always be 1.0 on the same input.
"""
import torch
import torch.nn as nn

import uutils.torch_uu
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_identity_one_layer_linear_model

print('--- Sanity check: sCCA = 1.0 when using same net twice with same input. --')

Din: int = 10
Dout: int = Din
B: int = 2000
mdl1: nn.Module = get_named_identity_one_layer_linear_model(D=Din)
mdl2: nn.Module = mdl1
layer_name = 'fc0'
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'

# - ends up comparing two matrices of size [B, Dout], on same data, on same model
X: torch.Tensor = torch.distributions.Normal(loc=0.0, scale=1.0).sample((B, Din))
sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)

print(f'Should be very very close to 1.0: {sim=}')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')
assert(approx_equal(sim, 1.0))

#%%
"""
Reproducing: How many data points: https://github.com/google/svcca/blob/master/tutorials/001_Introduction.ipynb

As n increases, the cca sim should decrease until it converges to the true max linear correlation in the data.
This is because when n is small (fewer data points B than features D) it's easy to correlate Xw, Yw, since there
are fewer equations (B data points) than unknowns (D features). Conversely, the similarity decreases because the
more data there is, the more variation has to be captured and thus the less correlation there will be.
This is correct because 1/4*E[||Xw - Yw||^2]^2 is proportional to Pearson's correlation (assuming Xw, Yw are standardized).

"""
from pathlib import Path
from matplotlib import pyplot as plt

import torch
import torch.nn as nn

import uutils
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_one_layer_random_linear_model

import uutils.plot as uuplot

print('\n--- Sanity check: when the number of data points B is smaller than D, it should be trivial to make the similarity 1.0 '
      '(even if the nets/matrices are different)')
B: int = 10
Dout: int = 300
mdl1: nn.Module = get_named_one_layer_random_linear_model(B, Dout)
mdl2: nn.Module = get_named_one_layer_random_linear_model(B, Dout)
layer_name = 'fc0'
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'

# - get sim for B << D e.g. [B=10, D=300] easy to "fit", to many degrees of freedom
X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
# mdl1(X) : [B, Dout] = [B, B] [B, Dout]
sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
print(f'Should be very very close to 1.0: {sim=} (since we have many features to match the two Xw1, Yw2).')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')
# assert(approx_equal(sim, 1.0))

print('\n-- Sanity check: just makes sure that when little data is present the sim is high, and afterwards (as n->infty) the sim (CCA) '
      'converges to the "true" cca value (eventually)')
# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000]
data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000]
# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000, 50_000, 100_000]
# data_sizes: list[int] = [10, 25, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000]
sims: list[float] = []
for b in data_sizes:
    X: torch.Tensor = uutils.torch_uu.get_identity_data(b)
    mdl1: nn.Module = get_named_one_layer_random_linear_model(b, Dout)
    mdl2: nn.Module = get_named_one_layer_random_linear_model(b, Dout)
    # print(f'{b=}')
    sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
    # print(f'{sim=}')
    sims.append(sim)

print(f'{sims=}')
uuplot.plot(x=data_sizes, y=sims, xlabel='number of data points (n)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='ndata_vs_svcca_sim', title='Num data points (n) vs Sim (SVCCA)', x_hline=Dout, x_hline_label=f'B=D={Dout}')

#%%

from pathlib import Path
from matplotlib import pyplot as plt

import torch
import torch.nn as nn

import uutils
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_one_layer_random_linear_model

from uutils.plot import plot, save_to_desktop
import uutils.plot as uuplot

B: int = 10  # [101, 200, 500, 1000, 2000, 5000, 10000]
Din: int = B
Dout: int = 300
mdl1: nn.Module = get_named_one_layer_random_linear_model(Din, Dout)
mdl2: nn.Module = get_named_one_layer_random_linear_model(Din, Dout)
layer_name = 'fc0'
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'

X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)

print(f'Should be very very close to 1.0: {sim=}')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')

# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000, 50_000]
B: int = 300
D_feature_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000]
sims: list[float] = []
for d in D_feature_sizes:
    X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
    mdl1: nn.Module = get_named_one_layer_random_linear_model(B, d)
    mdl2: nn.Module = get_named_one_layer_random_linear_model(B, d)
    sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
    # print(f'{d=}, {sim=}')
    sims.append(sim)

print(f'{sims=}')
uuplot.plot(x=D_feature_sizes, y=sims, xlabel='number of features/size of dimension (D)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='D_vs_sim_svcca', title='Features (D) vs Sim (SVCCA)', x_hline=B, x_hline_label=f'B=D={B}')
# uuplot.plot(x=D_feature_sizes, y=sims, xlabel='number of features/size of dimension (D)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='D_vs_sim', title='Features (D) vs Sim (SVCCA)')

This should produce the plots above. If the plots reproduce, we are good ;) :)