Closed Jrmy-rbr closed 2 years ago
Thanks for the feedback @Jrmy-rbr!
PRs are always welcome if you'd like to share your optimized version.
With this code, I get a speed-up of a factor of ~2.25. The bottleneck is the call to _get_sections(),
which cannot be addressed within the tortuosity function itself.
M_TO_FT = 1/0.3048
FT_TO_M = 0.3048
def dist(pos1: Union[List[float], np.ndarray], pos2: Union[List[float], np.ndarray])->float:
vec = [p1i - p2i for p1i, p2i in zip(pos1, pos2)]
return math.sqrt(sum([v1 * v2 for v1, v2 in zip(vec, vec)]))
def cumsum(iterable: Union[List[float], np.ndarray])->List[float]:
result = iterable.copy()
for i,el in enumerate(iterable):
if i != 0:
result[i] += result[i-1]
return result
def tortuosity_py(survey:Survey)->List[float]:
sections = survey._get_sections()
sections_upper, sections_lower = sections[1:-1], sections[2:]
n_sections = len(sections_upper)
l_c = (sections[-1].md - sections[1].md) * M_TO_FT
factor = (n_sections / (n_sections + 1)) / l_c * 1e7
return cumsum([factor*((l.md - u.md) / dist(l.location, u.location)-1) for u,l in zip(sections_upper, sections_lower)])
Thanks @Jrmy-rbr
I'm not sure where you're getting your benchmarks from and they seem to be decreasing by orders of magnitude each time? :-)
I'm coding up a modified TI method so figured I'd take a look at your suggestion. See below some performance comparisons for your code, the original (poor) numpy implementation, and an updated numpy implementation.
When I run the Volve wells (about 100 actual/drilled wellpaths) through these functions I get the following:
Importing the data...
Extracting survey data...
Processing wells...
modified_tortuosity_index = 0.2917945384979248 seconds
tortuosity_py = 0.3112528324127197 seconds
tortuosity_index = 0.39501476287841797 seconds
So it looks like your native code is about 20% faster, and the refactored numpy code is 7% faster again.
The test code is below if you want to try it out yourself... I expect the numpy version to be faster the more sections there are per well, with your native version probably being quicker if there are only a few sections, since the overhead should be much smaller.
I'll add the refactored numpy implementation in the next version.
import math
import time
import xml.etree.ElementTree as ET
from itertools import accumulate

import numpy as np
import pandas as pd
import ray
import welleng as we
from tqdm import tqdm
# for ease I accessed the data file locally and gave it a
# shorter name. You'll need to change this to reflect the
# local location of the data file.
filename = 'data/Volve.xml'
M_TO_FT = 1/0.3048
FT_TO_M = 0.3048
ray.init()
@ray.remote
def generate_survey(df, well):
sh = we.survey.SurveyHeader(
name=well,
azi_reference="grid"
)
w = df.loc[
df['def_survey_header_id'] == well
].sort_values(by=['md'])
cov_nev = we.survey.make_long_cov(np.array([
w['covariance_yy'],
w['covariance_xy'],
w['covariance_yz'],
w['covariance_xx'],
w['covariance_xz'],
w['covariance_zz']
]).T).astype(float)
# radius data is sometimes missing or zero and looks to be in inches
# default these to 15" radius and convert to meters
radius = np.array(w['casing_radius']).astype(float)
radius = np.where((np.isnan(radius) | (radius == 0)), 15, radius)
radius *= 0.0254
s = we.survey.Survey(
md=np.array(w['md']).astype(float) / 3.281,
inc=np.array(w['inclination']).astype(float),
azi=np.array(w['azimuth']).astype(float),
n=np.array(w['offset_north']).astype(float),
e=np.array(w['offset_east']).astype(float),
tvd=np.array(w['tvd']).astype(float) / 3.281, # appears that TVD data is in feet?
header=sh,
cov_nev=cov_nev,
radius=radius
)
return s
def tortuosity_index(survey, data=False):
sections = survey._get_sections()
sections_upper, sections_lower = sections[1:-1], sections[2:]
n_sections = len(sections_upper)
# assume that Survey object is in meters
l_c = (sections[-1].md - sections[1].md) / 0.3048
l_cs, l_xs = np.vstack([
[
l.md - u.md,
np.linalg.norm(np.array(l.location) - np.array(u.location))
]
for u, l in zip(sections_upper, sections_lower)
]).T
ti = np.hstack((
np.array([0.0]),
(
(n_sections / (n_sections + 1)) * (1 / l_c) * (
np.cumsum(
(l_cs / l_xs) - 1
)
)
) * 1e7
))
return ti[-1]
def modified_tortuosity_index(self):
sections = self._get_sections()
sections_upper, sections_lower = sections[1:-1], sections[2:]
n_sections = len(sections_upper)
l_c = (sections[-1].md - sections[1].md) / 0.3048
l_cs, u_loc, l_loc = [], [], []
for u, l in zip(sections_upper, sections_lower):
l_cs.append(l.md - u.md)
u_loc.append(u.location)
l_loc.append(l.location)
l_cs = np.array(l_cs)
l_xs = np.linalg.norm(np.array(l_loc) - np.array(u_loc), axis=1)
ti = np.hstack((
np.array([0.0]),
(
(n_sections / (n_sections + 1)) * (1 / l_c) * (
np.cumsum(
(l_cs / l_xs) - 1
)
)
) * 1e7
))
return ti
def dist(pos1, pos2):
vec = [p1i - p2i for p1i, p2i in zip(pos1, pos2)]
return math.sqrt(sum([v1 * v2 for v1, v2 in zip(vec, vec)]))
def cumsum(iterable):
result = iterable.copy()
for i,el in enumerate(iterable):
if i != 0:
result[i] += result[i-1]
return result
def tortuosity_py(survey):
sections = survey._get_sections()
sections_upper, sections_lower = sections[1:-1], sections[2:]
n_sections = len(sections_upper)
l_c = (sections[-1].md - sections[1].md) * M_TO_FT
factor = (n_sections / (n_sections + 1)) / l_c * 1e7
return cumsum([
factor*((l.md - u.md) / dist(l.location, u.location)-1)
for u,l in zip(sections_upper, sections_lower)
])
def main():
# read the WITSML data
print("Importing the data...")
try:
tree = ET.parse(filename)
root = tree.getroot()
except:
print("Please download the volve data and point filename to its location")
# extract the survey data and create a dataframe
print("Extracting survey data...")
survey_data = [
c.attrib for c in root
if c.tag == "CD_DEFINITIVE_SURVEY_STATION"
]
df = pd.DataFrame(survey_data)
df['md'] = df['md'].astype(float)
wells = df['def_survey_header_id'].unique()
print("Processing wells...")
# this is a bit slow... multithread this if you want to do it faster
data = [
generate_survey.remote(df, well)
for well in tqdm(wells)
]
surveys = ray.get(data)
start = time.time()
for survey in surveys:
ti_0 = modified_tortuosity_index(survey)
finish = time.time()
print(f"modified_tortuosity_index = {finish - start} seconds")
start = time.time()
for survey in surveys:
ti_1 = tortuosity_py(survey)
finish = time.time()
print(f"tortuosity_py = {finish - start} seconds")
start = time.time()
for survey in surveys:
ti_2 = tortuosity_index(survey)
finish = time.time()
print(f"tortuosity_index = {finish - start} seconds")
print("Done")
if __name__ == "__main__":
main()
https://github.com/jonnymaserati/welleng/blob/0afc2ddd62f6b4605dc9ea3ea20d021b7315e974/welleng/survey.py#L1647-L1670