jwzimmer-zz / tv-tropening

1 stars 0 forks source link

Most salient code pieces/ artifacts: loading the original data and running SVD #22

Open jwzimmer-zz opened 2 years ago

jwzimmer-zz commented 2 years ago

The script to run SVD on the original data:

import pandas as pd
from pandas_ods_reader import read_ods
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import linalg
from numpy.linalg import svd
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
import random
import scipy as sp

# === === Helper functions === ===
def write_json(data, filename):
    """Serialize ``data`` as JSON into ``filename``.

    The file is opened in write mode, so an existing file is overwritten.
    Always returns ``None``.
    """
    with open(filename, mode="w") as out_file:
        json.dump(data, out_file)
def get_json(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded explicitly as UTF-8 (the standard JSON encoding)
    instead of the platform's locale-dependent default, so files containing
    non-ASCII text load the same way on every machine.

    Raises whatever ``open`` raises for an unreadable path and
    ``json.JSONDecodeError`` for malformed content.
    """
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

def add_character_name(row,character_map):
    """Look up the display name for the character referenced by ``row``.

    ``row['unnamed.1']`` is compared against the ``'ID'`` column of
    ``character_map``; the first matching entry of the
    ``'Character display name'`` column is returned.

    Raises ``IndexError`` when no entry of ``character_map`` has that ID.
    """
    wanted_id = row['unnamed.1']
    id_matches = character_map['ID'] == wanted_id
    display_names = character_map[id_matches]['Character display name']
    # Materialize the matches and take the first one; an empty result
    # deliberately surfaces as IndexError.
    return list(display_names)[0]

def add_work_name(row,character_map):
    """Look up the fictional work for the character referenced by ``row``.

    ``row['unnamed.1']`` is compared against the ``'ID'`` column of
    ``character_map``; the first matching entry of the ``'Fictional work'``
    column is returned.

    Raises ``IndexError`` when no entry of ``character_map`` has that ID.
    """
    wanted_id = row['unnamed.1']
    id_matches = character_map['ID'] == wanted_id
    works = character_map[id_matches]['Fictional work']
    # Materialize the matches and take the first one; an empty result
    # deliberately surfaces as IndexError.
    return list(works)[0]

#=================================
# Load the input data. Every path is relative, so the files must be reachable
# from the current working directory when the script runs.
# pd.read_html returns a list of DataFrames; unpacking into two names means
# codebook.html must yield exactly two tables, otherwise this line raises ValueError.
character_map, bap_map = pd.read_html("codebook.html")
df_bap = pd.read_json("July2021_df_bap.json")
# df_traits is the table that is passed to runSVD further down.
df_traits = pd.read_json("July2021_df_traits.json")
df_std = pd.read_json("June2021_df_std_original.json")
df_n = pd.read_json("June2021_df_n_original.json")

# Loaded through the get_json helper (plain json.load), so this is an ordinary
# Python object rather than a DataFrame -- presumably a dict, judging by the name.
clean_column_dict = get_json("July2021_cleaned_column_dict.json")

def runSVD(df1,dropcols=('unnamed.1','name','work'),n=None):
    """Run a full singular value decomposition on the values of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input table. After the ``dropcols`` columns are removed, every
        remaining column is passed to ``np.linalg.svd``, so the remaining
        columns must be numeric.
    dropcols : iterable of column labels, optional
        Columns to remove before decomposing. Labels that are not present in
        ``df1`` are ignored. Pass an empty sequence (or ``None``) to keep
        every column.
    n : optional
        Currently unused; accepted only so existing callers keep working.

    Returns
    -------
    tuple
        ``(df1, U, D, V, Sig, X, remakeX)`` where

        * ``df1`` is the input with ``dropcols`` removed (the very same
          object if nothing was dropped),
        * ``X`` is ``df1.to_numpy()`` with shape ``(M, N)``,
        * ``U`` is ``(M, M)``, ``D`` holds the singular values and ``V`` is
          ``(N, N)``. Note that ``V`` is the matrix returned by
          ``np.linalg.svd``, i.e. the *transposed* right singular vectors,
          so ``X == U @ Sig @ V`` without any further transpose,
        * ``Sig`` is the ``(M, N)`` matrix with ``D`` on its diagonal,
        * ``remakeX`` is the reconstruction ``U @ Sig @ V``.

    Raises
    ------
    AssertionError
        If the reconstruction does not reproduce ``X`` element-wise.
    """
    # Function-scope import: ``import scipy as sp`` alone does not guarantee
    # that the ``linalg`` subpackage is loaded on every SciPy version.
    from scipy.linalg import diagsvd

    if dropcols is not None:
        for col in dropcols:
            if col in df1.columns:
                df1 = df1.drop(col,axis=1)
    X = df1.to_numpy()
    # Decompose. The third return value of np.linalg.svd is already V transposed.
    U, D, V = np.linalg.svd(X)
    M, N = X.shape
    # Embed the singular values in an (M, N) matrix so the shapes line up
    # with the full-size U and V.
    Sig = diagsvd(D, M, N)
    # Rebuild X from its factors as a sanity check.
    remakeX = np.dot(U, np.dot(Sig, V))
    # Compare element-wise: a signed sum of the differences can cancel out
    # (or be very negative) and hide a bad reconstruction. Raise explicitly
    # so the check is not stripped when Python runs with -O.
    if not np.allclose(remakeX, X, rtol=1e-5, atol=1e-5):
        raise AssertionError("SVD reconstruction does not match the input matrix")
    return df1, U, D, V, Sig, X, remakeX

# Output from SVD without removing means.
# runSVD is called with its default dropcols, so df1 is df_traits minus the
# 'unnamed.1', 'name' and 'work' columns (whichever of them are present).
df1, U, D, V, Sig, X, remakeX = runSVD(df_traits)

# Center the data by subtracting the constant 50 from every value.
# NOTE(review): this step was originally described as "Remove the average of
# each trait", but the code subtracts a fixed 50, not a computed mean --
# presumably 50 is the midpoint of the rating scale; confirm.
# The commented-out alternative below would subtract a single grand mean
# (the mean of the column means) instead.
#df1_means = df1.mean().mean()
df1_means = 50
df1_normed = df1 - df1_means

# Output from SVD WITH the constant removed (dropcols=[] because the
# non-numeric columns were already dropped by the first call).
df2, U2, D2, V2, Sig2, X2, remakeX2 = runSVD(df1_normed,dropcols=[])
# X2 is rebuilt as U2 . (Sig2 . V2); SigV2 is that inner product, i.e. the
# rows of V2 scaled by their singular values. Its columns line up with the
# columns of df2.
SigV2 = np.dot(Sig2,V2)
# the traits in order of columns
col2 = df2.columns

Note that these objects are loaded by the script:

# pd.read_html returns a list of DataFrames; codebook.html must yield exactly two tables.
character_map, bap_map = pd.read_html("codebook.html")
df_bap = pd.read_json("July2021_df_bap.json")
# df_traits is the table that the script passes to runSVD.
df_traits = pd.read_json("July2021_df_traits.json")
df_std = pd.read_json("June2021_df_std_original.json")
df_n = pd.read_json("June2021_df_n_original.json")

# Loaded with the script's get_json helper (plain json.load), not pandas.
clean_column_dict = get_json("July2021_cleaned_column_dict.json")

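So those files should be downloaded from the repo and placed where the script can find them; the paths are relative, so the simplest option is the directory the script is run from. (Alternatively, delete the lines that load objects you don't need: the SVD code above only uses `df_traits`, and `character_map` is only needed by the `add_character_name` / `add_work_name` helpers.)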