Closed Marigold closed 1 week ago
Quick links (staging server): Site | Admin | Wizard |
---|
Login: ssh owid@staging-site-fix-surface-temperature-memory
Edited: 2024-06-18 09:42:34 UTC Execution time: 14.89 seconds
Pasting output from profiling (before downgrading rioxarray and before Veronika's optimizations), could be interesting
CPU
Total time: 924.607 s
File: /home/owid/etl/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py
Function: run at line 43
Line # Hits Time Per Hit % Time Line Contents
==============================================================
43 def run(dest_dir: str) -> None:
44 # Activates the usage of the global context. Using this option can enhance the performance
45 # of initializing objects in single-threaded applications.
46 1 26680.0 26680.0 0.0 pyproj.set_use_global_context(True) # type: ignore
47
48 #
49 # Load inputs.
50 #
51 # Retrieve snapshot.
52 1 877332488.0 9e+08 0.1 snap = paths.load_snapshot("surface_temperature.gz")
53
54 # Read surface temperature data from snapshot and convert temperature from Kelvin to Celsius.
55 1 5e+11 5e+11 50.0 da = _load_data_array(snap) - 273.15
56
57                                                           # Read the shapefile to extract country information
58 1 23432949.0 2e+07 0.0 snap_geo = paths.load_snapshot("world_bank.zip")
59 1 300.0 300.0 0.0 shapefile_name = "WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp"
60
61 # Check if the shapefile exists in the ZIP archive
62 2 4534163.0 2e+06 0.0 with zipfile.ZipFile(snap_geo.path, "r"):
63 # Construct the correct path for Geopandas
64 1 30077.0 30077.0 0.0 file_path = f"zip://{snap_geo.path}!/{shapefile_name}"
65
66 # Read the shapefile directly from the ZIP archive
67 1 2425032598.0 2e+09 0.3 shapefile = _load_shapefile(file_path)
68
69 #
70 # Process data.
71 #
72
73 # Initialize an empty dictionary to store the country-wise average temperature.
74 1 250.0 250.0 0.0 temp_country = {}
75
76 # Initialize a list to keep track of small countries where temperature data extraction fails.
77 1 241.0 241.0 0.0 small_countries = []
78
79 # Set the coordinate reference system for the temperature data to EPSG 4326.
80 1 5e+10 5e+10 5.7 da = da.rio.write_crs("epsg:4326")
81
82 # Iterate over each row in the shapefile data.
83 252 28781186.0 114211.1 0.0 for i in tqdm(range(shapefile.shape[0])):
84 # Extract the data for the current row.
85 251 57757155.0 230108.2 0.0 geometry = shapefile.iloc[i]["geometry"]
86 251 31993573.0 127464.4 0.0 country_name = shapefile.iloc[i]["WB_NAME"]
87
88 251 49663.0 197.9 0.0 try:
89 # Clip to the bounding box for the country's shape to significantly improve performance.
90 251 16590071.0 66095.9 0.0 xmin, ymin, xmax, ymax = geometry.bounds
91 251 8e+10 3e+08 8.3 clip = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
92
93 # Clip data to the country's shape.
94 # NOTE: if memory is an issue, we could use `from_disk=True` arg
95 221 2e+11 9e+08 20.7 clip = clip.rio.clip([mapping(geometry)], shapefile.crs)
96
97 # Calculate weights based on latitude to account for area distortion in latitude-longitude grids.
98 195 91104228.0 467201.2 0.0 weights = np.cos(np.deg2rad(clip.latitude))
99 195 407945.0 2092.0 0.0 weights.name = "weights"
100
101 # Apply the weights to the clipped temperature data.
102 195 10432867.0 53501.9 0.0 clim_month_weighted = clip.weighted(weights)
103
104 # Calculate the weighted mean temperature for the country.
105 195 7e+10 4e+08 7.8 country_weighted_mean = clim_month_weighted.mean(dim=["longitude", "latitude"]).values
106
107 # Store the calculated mean temperature in the dictionary with the country's name as the key.
108 195 304805.0 1563.1 0.0 temp_country[country_name] = country_weighted_mean
109
110 56 79008.0 1410.9 0.0 except (NoDataInBounds, OneDimensionalRaster):
111 112 7737107.0 69081.3 0.0 log.info(
112 56 42187.0 753.3 0.0 f"No data was found in the specified bounds for {country_name}."
113                                                          ) # If an error occurs (usually due to small size of the country), add the country's name to the small_countries list.
114 56 16386649.0 292618.7 0.0 small_countries.append(shapefile.iloc[i]["WB_NAME"])
115
116 # Log information about countries for which temperature data could not be extracted.
117 2 124717.0 62358.5 0.0 log.info(
118 1 1372.0 1372.0 0.0 f"It wasn't possible to extract temperature data for {len(small_countries)} small countries as they are too small for the resolution of the Copernicus data."
119 )
120
121 # Add Global mean temperature
122 1 499075.0 499075.0 0.0 weights = np.cos(np.deg2rad(da.latitude))
123 1 2655.0 2655.0 0.0 weights.name = "weights"
124 1 51828.0 51828.0 0.0 clim_month_weighted = da.weighted(weights)
125 1 6e+10 6e+10 7.0 global_mean = clim_month_weighted.mean(["longitude", "latitude"])
126 1 2304.0 2304.0 0.0 temp_country["World"] = global_mean
127
128 # Define the start and end dates
129 1 25183204.0 3e+07 0.0 start_time = da["time"].min().dt.date.astype(str).item()
130 1 1145689.0 1e+06 0.0 end_time = da["time"].max().dt.date.astype(str).item()
131
132 # Generate a date range from start_time to end_time with monthly frequency
133 1 50118627.0 5e+07 0.0 month_middles = pd.date_range(start=start_time, end=end_time, freq="MS") + pd.offsets.Day(14)
134
135                                                          # month_middles is a DatetimeIndex object; you can convert it to a list if needed
136 1 167952685.0 2e+08 0.0 month_starts_list = month_middles.tolist()
137
138 # df of temperatures for each country
139 1 8265938.0 8e+06 0.0 df_temp = pd.DataFrame(temp_country)
140 1 28570766.0 3e+07 0.0 df_temp["time"] = month_starts_list
141
142 1 34984998.0 3e+07 0.0 melted_df = df_temp.melt(id_vars=["time"], var_name="country", value_name="temperature_2m")
143
144 # Create a new table and ensure all columns are snake-case and add relevant metadata.
145 1 8419509.0 8e+06 0.0 tb = Table(melted_df, short_name=paths.short_name, underscore=True)
146 1 35664204.0 4e+07 0.0 tb = tb.set_index(["time", "country"], verify_integrity=True)
147
148 1 238592.0 238592.0 0.0 tb["temperature_2m"].metadata.origins = [snap.metadata.origin]
149 #
150 # Save outputs.
151 #
152 # Create a new meadow dataset with the same metadata as the snapshot.
153 1 466858602.0 5e+08 0.1 ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
154
155 # Save changes in the new garden dataset.
156 1 41507617.0 4e+07 0.0 ds_meadow.save()
This is the memory profile. Note that the numbers in the for loop don't make much sense (the incremental ones don't at all). It probably needs more love to be useful.
Line # Mem usage Increment Occurrences Line Contents
=============================================================
43 190.2 MiB 190.2 MiB 1 def run(dest_dir: str) -> None:
44 # Activates the usage of the global context. Using this option can enhance the performance
45 # of initializing objects in single-threaded applications.
46 190.2 MiB 0.0 MiB 1 pyproj.set_use_global_context(True) # type: ignore
47
48 #
49 # Load inputs.
50 #
51 # Retrieve snapshot.
52 191.4 MiB 1.2 MiB 1 snap = paths.load_snapshot("surface_temperature.gz")
53
54 # Read surface temperature data from snapshot and convert temperature from Kelvin to Celsius.
55 8096.2 MiB 7904.7 MiB 1 da = _load_data_array(snap) - 273.15
56
57                                             # Read the shapefile to extract country information
58 8098.7 MiB 2.6 MiB 1 snap_geo = paths.load_snapshot("world_bank.zip")
59 8098.7 MiB 0.0 MiB 1 shapefile_name = "WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp"
60
61 # Check if the shapefile exists in the ZIP archive
62 8224.0 MiB 0.3 MiB 2 with zipfile.ZipFile(snap_geo.path, "r"):
63 # Construct the correct path for Geopandas
64 8099.0 MiB 0.0 MiB 1 file_path = f"zip://{snap_geo.path}!/{shapefile_name}"
65
66 # Read the shapefile directly from the ZIP archive
67 8224.0 MiB 125.0 MiB 1 shapefile = _load_shapefile(file_path)
68
69 #
70 # Process data.
71 #
72
73 # Initialize an empty dictionary to store the country-wise average temperature.
74 8224.0 MiB 0.0 MiB 1 temp_country = {}
75
76 # Initialize a list to keep track of small countries where temperature data extraction fails.
77 8224.0 MiB 0.0 MiB 1 small_countries = []
78
79 # Set the coordinate reference system for the temperature data to EPSG 4326.
80 16248.2 MiB 8024.1 MiB 1 da = da.rio.write_crs("epsg:4326")
81
82 # Iterate over each row in the shapefile data.
83 30395.5 MiB -729350.6 MiB 252 for i in tqdm(range(shapefile.shape[0])):
84 # Extract the data for the current row.
85 29628.8 MiB -839039.4 MiB 251 geometry = shapefile.iloc[i]["geometry"]
86 29628.8 MiB -839039.4 MiB 251 country_name = shapefile.iloc[i]["WB_NAME"]
87
88 29628.8 MiB -839039.4 MiB 251 try:
89 # Clip to the bounding box for the country's shape to significantly improve performance.
90 29628.8 MiB -839039.4 MiB 251 xmin, ymin, xmax, ymax = geometry.bounds
91 30083.9 MiB -829997.6 MiB 251 clip = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
92
93 # Clip data to the country's shape.
94 # NOTE: if memory is an issue, we could use `from_disk=True` arg
95 30395.4 MiB -762635.8 MiB 221 clip = clip.rio.clip([mapping(geometry)], shapefile.crs)
96
97 # Calculate weights based on latitude to account for area distortion in latitude-longitude grids.
98 30395.4 MiB -666024.7 MiB 195 weights = np.cos(np.deg2rad(clip.latitude))
99 30395.4 MiB -666024.7 MiB 195 weights.name = "weights"
100
101 # Apply the weights to the clipped temperature data.
102 30395.4 MiB -666024.7 MiB 195 clim_month_weighted = clip.weighted(weights)
103
104 # Calculate the weighted mean temperature for the country.
105 30395.5 MiB -666965.0 MiB 195 country_weighted_mean = clim_month_weighted.mean(dim=["longitude", "latitude"]).values
106
107 # Store the calculated mean temperature in the dictionary with the country's name as the key.
108 30395.5 MiB -667013.0 MiB 195 temp_country[country_name] = country_weighted_mean
109
110 28193.7 MiB -182504.4 MiB 56 except (NoDataInBounds, OneDimensionalRaster):
111 28193.7 MiB -124675.4 MiB 112 log.info(
112 28193.7 MiB -62337.7 MiB 56 f"No data was found in the specified bounds for {country_name}."
113                                            ) # If an error occurs (usually due to small size of the country), add the country's name to the small_countries list.
114 28193.7 MiB -62337.7 MiB 56 small_countries.append(shapefile.iloc[i]["WB_NAME"])
115
116 # Log information about countries for which temperature data could not be extracted.
117 30395.5 MiB 0.0 MiB 2 log.info(
118 30395.5 MiB 0.0 MiB 1 f"It wasn't possible to extract temperature data for {len(small_countries)} small countries as they are too small for the resolution of the Copernicus data."
119 )
120
121 # Add Global mean temperature
122 30395.5 MiB 0.0 MiB 1 weights = np.cos(np.deg2rad(da.latitude))
123 30395.5 MiB 0.0 MiB 1 weights.name = "weights"
124 30395.5 MiB 0.0 MiB 1 clim_month_weighted = da.weighted(weights)
125 30386.0 MiB -9.4 MiB 1 global_mean = clim_month_weighted.mean(["longitude", "latitude"])
126 30386.0 MiB 0.0 MiB 1 temp_country["World"] = global_mean
127
128 # Define the start and end dates
129 30386.4 MiB 0.4 MiB 1 start_time = da["time"].min().dt.date.astype(str).item()
130 30386.4 MiB 0.0 MiB 1 end_time = da["time"].max().dt.date.astype(str).item()
131
132 # Generate a date range from start_time to end_time with monthly frequency
133 30386.8 MiB 0.4 MiB 1 month_middles = pd.date_range(start=start_time, end=end_time, freq="MS") + pd.offsets.Day(14)
134
135                                            # month_middles is a DatetimeIndex object; you can convert it to a list if needed
136 27510.3 MiB -2876.5 MiB 1 month_starts_list = month_middles.tolist()
137
138 # df of temperatures for each country
139 27510.3 MiB 0.0 MiB 1 df_temp = pd.DataFrame(temp_country)
140 27510.6 MiB 0.3 MiB 1 df_temp["time"] = month_starts_list
141
142 27510.8 MiB 0.3 MiB 1 melted_df = df_temp.melt(id_vars=["time"], var_name="country", value_name="temperature_2m")
143
144 # Create a new table and ensure all columns are snake-case and add relevant metadata.
145 27511.0 MiB 0.2 MiB 1 tb = Table(melted_df, short_name=paths.short_name, underscore=True)
146 27511.3 MiB 0.3 MiB 1 tb = tb.set_index(["time", "country"], verify_integrity=True)
147
148 27511.3 MiB 0.0 MiB 1 tb["temperature_2m"].metadata.origins = [snap.metadata.origin]
149 #
150 # Save outputs.
151 #
152 # Create a new meadow dataset with the same metadata as the snapshot.
153 27515.9 MiB 4.6 MiB 1 ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
154
155 # Save changes in the new garden dataset.
156 27522.0 MiB 6.1 MiB 1 ds_meadow.save()
Possible fix for https://github.com/owid/etl/issues/2826. Needs to be verified on the staging server.
Downgrading didn't work. It still consumes 40 GB of memory.