"""
--- Synopsis ---
This script is used to read Hydra-NL .html output files
--- Version ---
Created on Fri Mar 1 11:27:07 2019
@author: VERA7
Witteveen+Bos Consulting Engineers
Van Twickelostraat 2
P.O. box 233
7400 AE Deventer
The Netherlands
"""
#import modules
import codecs
import os
import pandas
import re
#import own script
from list_files_folders import list_files
def search_html_file(full_file_path, string_for_identification, strip_strings=None):
    """
    this function opens the file at full_file_path and looks for string_for_identification in
    each individual line. Only the first line that contains this string is used. On that line
    several modifications are done
    1 remove string_for_identification from the line
    2 remove every match of each pattern in strip_strings from the line
    3 try to convert what is left to float (float() ignores surrounding whitespace)
    args:
    full_file_path - full path to html file that should be read
    string_for_identification - string that we are looking for in the lines of html file
    kwargs:
    strip_strings - list with regular expression patterns (they are passed to re.sub) whose
                    matches are removed from the found line. Default is no patterns
    returns:
    found_float - float that is found in html file, or None when no line contains
                  string_for_identification
    raises:
    Exception - when the remainder of the first matching line can not be converted to float
    """
    #avoid a shared mutable default, None means nothing extra has to be stripped
    patterns = [] if strip_strings is None else strip_strings
    #no encoding is passed, so the file is decoded with the platform default encoding
    with open(full_file_path, 'r') as html_file:
        #loop over all lines
        for line in html_file:
            #skip lines without the string_for_identification
            if string_for_identification not in line:
                continue
            #1 remove string_for_identification from line
            #str.replace removes the substring itself, str.strip would treat the argument as a
            #set of characters and could also remove digits of the value
            modified_line = line.replace(string_for_identification, '')
            #2 remove all patterns in strip_strings from line
            for pattern in patterns:
                modified_line = re.sub(pattern, '', modified_line)
            #3 try to convert to float
            try:
                return float(modified_line)
            except ValueError as error:
                raise Exception("{} can not be converted to float. Please update strip_strings to adhere only float. Strip_strings currently is {}".format(modified_line, patterns)) from error
    #string_for_identification is not present in any line
    return None
#only execute code below when this script is executed
if __name__ == '__main__':
    import sys

    #define search directory: first command line argument when given, otherwise the
    #current working directory
    directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    #find all uitvoer.html files
    #NOTE(review): list_files is a project helper, presumably it returns the full paths of
    #all files below path that match file_incl - confirm in list_files_folders
    html_files = list_files(file_incl='uitvoer.html', path = directory)
    #define which strings we want to find and what to strip from found rows
    #first is for wind speed, second for wind direction
    strings_to_find = ['potentiële windsnelheid u [m/s]',
                       'windrichting r (bijdrage aan ov.freq)',
                       'significante golfhoogte Hm0 [m]',
                       'spectrale golfperiode Tm-1,0 [s]',
                       'golfrichting t.o.v. Noord [graden]',
                       'Uitwendige dijknormaal',
                       'lokale waterstand h [m+NAP]']
    #define regex patterns that should be stripped from rows to end up with float
    #one list of patterns per entry in strings_to_find, in the same order
    strip_strings = [['\\|'],
                     ['\\|', r" ?\([^)]+\)"],
                     ['\\|'],
                     ['\\|'],
                     ['\\|'],
                     ['=', '\\(°\\)'],
                     ['\\|']]
    #define names the variables will have in output, same order as strings_to_find
    variable_keys = ['windsnelheid [m/s]',
                     'windrichting [deg noord]',
                     'Hs [m]',
                     'Tm-1,0 [s]',
                     'wave direction',
                     'dijk normaal',
                     'water level [m NAP]']
    #make a dictionary with one empty list per output column where we store the results
    uitvoer_dict = {'sectie' :[], 'overslagdebiet [m3/s]' : []}
    for variable_key in variable_keys:
        uitvoer_dict[variable_key] = []
    #loop over all uitvoer.html files we found and store the obtained output
    for path_file in html_files:
        #folder that directly contains the html file and the folder above it
        folder_of_file = os.path.dirname(path_file)
        #define raai (section) as the name of the folder two levels above the file
        raai = os.path.basename(os.path.dirname(folder_of_file))
        uitvoer_dict['sectie'].append(raai)
        #get overslagdebiet from the part after the last '_' in the name of the folder
        #that contains the file, with a decimal comma replaced by a decimal point
        q = os.path.basename(folder_of_file).split('_')[-1]
        q = q.replace(',', '.')
        #add to output, still as string, it is converted to float in the dataframe below
        uitvoer_dict['overslagdebiet [m3/s]'].append(q)
        #loop over all variables we want to read from html file
        for string_to_find, strip_string, variable_key in zip(strings_to_find, strip_strings, variable_keys):
            #get value from the html file
            value = search_html_file(path_file, string_to_find, strip_string)
            #append this value to the output
            uitvoer_dict[variable_key].append(value)
    #now convert dict to pandas dataframe
    df = pandas.DataFrame(uitvoer_dict)
    #set overslagdebiet as float type
    df['overslagdebiet [m3/s]']= df['overslagdebiet [m3/s]'].astype(float)
    #now sort on section and overslagdebiet
    df = df.sort_values(by = ['sectie', 'overslagdebiet [m3/s]'])
    #write to csv in the folder above this script, trick is to use sep = ';' which enables
    #reading csv in excel
    #you can write to excel if you want
    savename = os.path.join(os.path.dirname(__file__), '..', 'overslaguitvoer_zonder_onzekerheid.csv')
    df.to_csv(savename, sep = ';', index = False)