Closed rht closed 3 years ago
Before:
Total time: 76.7946 s
File: /home/rht/code/venv/lib/python3.8/site-packages/firststreet/api/csv_format.py
Function: format_aal_summary_property at line 1165
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1165 @profile
1166 def format_aal_summary_property(data):
1167 """Reformat the list of data to Location Summary format for property
1168
1169 Args:
1170 data (list): A list of FSF object
1171 Returns:
1172 A pandas formatted DataFrame
1173 """
1174 1 1251288.0 1251288.0 1.6 df = pd.json_normalize([vars(o) for o in data]).explode('annual_loss')
1175 1 35616.0 35616.0 0.0 df = df.explode('depth_loss')
1176
1177 1 8017.0 8017.0 0.0 if not df[['annual_loss', 'depth_loss']].isna().values.all():
1178 1 29314467.0 29314467.0 38.2 df = pd.concat([df.drop(['annual_loss'], axis=1), df['annual_loss'].apply(pd.Series)], axis=1)
1179 1 42.0 42.0 0.0 if 'data' in df.columns:
1180 1 25866379.0 25866379.0 33.7 df = pd.concat([df.drop(['data'], axis=1), df['data'].apply(pd.Series)], axis=1)
1181 else:
1182 df['year'] = pd.NA
1183 df['low'] = pd.NA
1184 df['mid'] = pd.NA
1185 df['high'] = pd.NA
1186
1187 1 20231266.0 20231266.0 26.3 df = pd.concat([df.drop(['depth_loss'], axis=1), df['depth_loss'].apply(pd.Series)], axis=1)
1188 else:
1189 df['fsid'] = df['fsid'].apply(str)
1190 df.drop(['annual_loss'], axis=1, inplace=True)
1191 df.drop(['depth_loss'], axis=1, inplace=True)
1192 df['depth'] = pd.NA
1193 df['data'] = pd.NA
1194 df['year'] = pd.NA
1195 df['low'] = pd.NA
1196 df['mid'] = pd.NA
1197 df['high'] = pd.NA
1198
1199 1 1330.0 1330.0 0.0 df.rename(columns={'data': 'damage'}, inplace=True)
1200 1 8176.0 8176.0 0.0 df['fsid'] = df['fsid'].apply(str)
1201 1 11816.0 11816.0 0.0 df['year'] = df['year'].astype('Int64').apply(str)
1202 1 11069.0 11069.0 0.0 df['low'] = df['low'].astype('Int64').apply(str)
1203 1 11370.0 11370.0 0.0 df['mid'] = df['mid'].astype('Int64').apply(str)
1204 1 11786.0 11786.0 0.0 df['high'] = df['high'].astype('Int64').apply(str)
1205 1 12198.0 12198.0 0.0 df['depth'] = df['depth'].astype('Int64').apply(str)
1206 1 12605.0 12605.0 0.0 df['damage'] = df['damage'].astype('Int64').apply(str)
1207
1208 1 7142.0 7142.0 0.0 return df[['fsid', 'valid_id', 'depth', 'damage', 'year', 'low', 'mid', 'high', 'error']]
After:
Total time: 2.12946 s
File: /home/rht/code/venv/lib/python3.8/site-packages/firststreet/api/csv_format.py
Function: format_aal_summary_property at line 1165
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1165 @profile
1166 def format_aal_summary_property(data):
1167 """Reformat the list of data to Location Summary format for property
1168
1169 Args:
1170 data (list): A list of FSF object
1171 Returns:
1172 A pandas formatted DataFrame
1173 """
1174 1 1357269.0 1357269.0 63.7 df = pd.json_normalize([vars(o) for o in data]).explode('annual_loss')
1175 1 38295.0 38295.0 1.8 df = df.explode('depth_loss')
1176
1177 1 7560.0 7560.0 0.4 if not df[['annual_loss', 'depth_loss']].isna().values.all():
1178 1 3.0 3.0 0.0 def expand_al(al):
1179 if pd.isnull(al):
1180 return pd.NA, pd.NA, pd.NA, pd.NA
1181 return al['year'], al['data']['low'], al['data']['mid'], al['data']['high']
1182 1 280243.0 280243.0 13.2 df['year'], df['low'], df['mid'], df['high'] = zip(*df['annual_loss'].apply(expand_al))
1183 1 9943.0 9943.0 0.5 df.drop(['annual_loss'], axis=1)
1184
1185 1 3.0 3.0 0.0 def expand_dl(dl):
1186 if pd.isnull(dl):
1187 return pd.NA, pd.NA
1188 return dl['depth'], dl['data']
1189 1 309376.0 309376.0 14.5 df['depth'], df['data'] = zip(*df['depth_loss'].apply(expand_dl))
1190 1 10783.0 10783.0 0.5 df.drop(['depth_loss'], axis=1)
1191 else:
1192 df['fsid'] = df['fsid'].apply(str)
1193 df.drop(['annual_loss'], axis=1, inplace=True)
1194 df.drop(['depth_loss'], axis=1, inplace=True)
1195 df['depth'] = pd.NA
1196 df['data'] = pd.NA
1197 df['year'] = pd.NA
1198 df['low'] = pd.NA
1199 df['mid'] = pd.NA
1200 df['high'] = pd.NA
1201
1202 1 1521.0 1521.0 0.1 df.rename(columns={'data': 'damage'}, inplace=True)
1203 1 6735.0 6735.0 0.3 df['fsid'] = df['fsid'].apply(str)
1204 1 19518.0 19518.0 0.9 df['year'] = df['year'].astype('Int64').apply(str)
1205 1 20060.0 20060.0 0.9 df['low'] = df['low'].astype('Int64').apply(str)
1206 1 16941.0 16941.0 0.8 df['mid'] = df['mid'].astype('Int64').apply(str)
1207 1 15927.0 15927.0 0.7 df['high'] = df['high'].astype('Int64').apply(str)
1208 1 15038.0 15038.0 0.7 df['depth'] = df['depth'].astype('Int64').apply(str)
1209 1 16043.0 16043.0 0.8 df['damage'] = df['damage'].astype('Int64').apply(str)
1210
1211 1 4202.0 4202.0 0.2 return df[['fsid', 'valid_id', 'depth', 'damage', 'year', 'low', 'mid', 'high', 'error']]
Note: I have downloaded 5000 FSIDs using each method and have already compared the diff. The files are identical. That said, an independent reviewer should still confirm that the two outputs are indeed identical.
For details of the benchmark, see the answer https://stackoverflow.com/a/61039138.
I have observed that the most time-consuming part of my bulk download of AAL data is the conversion to CSV (1 min to download 5000 entries, 5 min to convert, on an Amazon AWS t2.micro instance). This refactor speeds up the conversion step by ~36x (76.79 s → 2.13 s)!