rebeccajohnson88 / qss20_s21_proj

Repo for DOL Summer Data Challenge on equity in H-2A oversight
Creative Commons Zero v1.0 Universal
2 stars 2 forks source link

ACS percentages code to borrow from #12

Closed rebeccajohnson88 closed 3 years ago

rebeccajohnson88 commented 3 years ago
## melt to long format
df_acs_long = pd.melt(df_acs, id_vars= ['county', 'state', 'tract']).sort_values(by = 
                                    ['tract', 'county'])
## create prefix and suffix columns
df_acs_long['variable_prefix'], df_acs_long['variable_suffix'] = df_acs_long['variable'].str.split('_', 1).str

## manually indicate which prefix columns don't follow the pattern
prefixes_perc_notrelevant = ['B01002','B25031','B25107']
df_acs_long['perc_NA'] = np.where(df_acs_long.variable_prefix.isin(prefixes_perc_notrelevant), 
                                    1, 0)

prefixes_perc_extrahier = ['B25026', 'B25123']
df_acs_long['perc_extrahier'] = np.where(df_acs_long.variable_prefix.isin(prefixes_perc_extrahier),
                                        1, 0)

## group by county, tract, and variable prefix to generate 
## percentages (and later variable names)
df_acs_long_toiterate = df_acs_long[df_acs_long.perc_NA == 0].copy()
group_co_tract_varg = df_acs_long_toiterate.groupby(['county', 'tract', 'variable_prefix'])

################################## Generate percentages: auto-calc #################################

df_acs_long_percentage = []
#flag = 0
for group,data in group_co_tract_varg:
    #if flag > 100: # flag to test code
     #   break
#     print(data)
    tract = data.tract.iloc[0]
    county = data.county.iloc[0]
    prefix = data.variable_prefix.iloc[0]
    row_list_group = []
    for i in range(1,data.shape[0]):
        numerator = data.value.iloc[i]
        denominator = data.value.iloc[0].astype(float)
        percentage= numerator/denominator
        row = [county,tract,prefix]
        row = row + [data.variable_suffix.iloc[i],percentage]
        row_list_group.append(row)
#         print(row)
#         print(row_list_group)
#         print('___________________________')

#         break
    df_acs_long_percentage.append(pd.DataFrame(row_list_group))
    #flag = flag + 1

percentages_all_groups = pd.concat(df_acs_long_percentage)
percentages_all_groups.columns = ['county', 'tract', 'variable_prefix', 
                                  'variable_suffix', 'percentage']

percentages_all_groups['variable_prefix_suffix'] = percentages_all_groups.variable_prefix + "_" + percentages_all_groups.variable_suffix
percentages_all_groups.drop(columns = ['variable_prefix', 'variable_suffix'], inplace= True)

####################################### Reshape percentages to wide ##################################

## subset to a few counties/tract (so even row numbers for wide reshape) (remove after testing)

### commented out testing code
## test_forwide = percentages_all_groups[(percentages_all_groups.county == 5) & 
   ##                                   (percentages_all_groups.tract.isin([100, 200, 400]))].copy()

percentages_all_groups['county_tract'] = percentages_all_groups.county.astype(str) + "_" + percentages_all_groups.tract.astype(str)

percentages_all_groups.drop(columns = ['county', 'tract'], inplace= True)

## try the pivot -- before pivoting, creating county-tract 
percentages_wide_pivot = percentages_all_groups.pivot(index = 'county_tract',
                                 columns = 'variable_prefix_suffix',
                                 values = 'percentage')

percentages_wide_pivot_reset = percentages_wide_pivot.reset_index()

## other cols to drop
percentages_wide_pivot_reset.drop(columns = ['b01001_002e', 
                                      'b01001_026e'], inplace = True)
rebeccajohnson88 commented 3 years ago

Some var renaming code but it may not be the most useful --- basic idea is to use the acs_predictors csv which has both the formal varname and the description to rename the variables using a dictionary-- since some descriptions just say "total" it might be good to paste the following two columns together:

def df_column_uniquify(df):
    df_columns = map(str.lower, df.columns)
    new_columns = []
    for item in df_columns:
        counter = 0
        newitem = item
        while newitem in new_columns:
            counter += 1
            newitem = "{}_{}".format(item, counter)
            print(newitem)
        new_columns.append(newitem)
    df.columns = new_columns
    return df
census_varnames_dict_percent = census_varnames.set_index('variable_topull_sql').to_dict()['varnames_percent']

## rename data
percentages_rename = percentages_wide_pivot_reset.rename(columns = census_varnames_dict_percent)