witteveenbos / KPZSS

KPZSS code
3 stars 0 forks source link

General | read HydraNL output #7

Open casvbem opened 2 years ago

casvbem commented 2 years ago
"""
--- Synopsis --- 
This script is used to read Hydra-NL .html output files

--- Version --- 
Created on Fri Mar  1 11:27:07 2019
@author: VERA7

Witteveen+Bos Consulting Engineers 
Van Twickelostraat 2 
P.O. box 233 
7400 AE Deventer 
The Netherlands 
"""

#import modules
import codecs
import os
import pandas 
import re

#import own script
from list_files_folders import list_files

def search_html_file(full_file_path, string_for_identification, strip_strings = None):
    """
    this function opens the file at full_file_path and looks for string_for_identification in 
    each individual line. The first line that contains this string is modified as follows
    1 remove string_for_identification from the line
    2 remove everything that matches one of the patterns in strip_strings from the line
    3 convert what is left to float (surrounding whitespace is ignored by float())

    args:
        full_file_path              -   full path to html file that should be read
        string_for_identification   -   string that we are looking for in the lines of html file
    kwargs:    
        strip_strings               -   list with regular expression patterns (passed to re.sub) 
                                        that should be removed from the found line. None (default) 
                                        means that nothing extra is removed
    returns:
        found_float                 -   float that is found in html file, or None when no line 
                                        contains string_for_identification
    raises:
        ValueError                  -   when the first matching line can not be converted to float 
                                        after all removals
    """ 

    #None instead of a list as default, a default list would be shared between calls
    if strip_strings is None:
        strip_strings = []

    #NOTE(review): the file is opened with the platform default encoding, confirm this matches 
    #the encoding of the html files when labels contain non-ASCII characters
    with codecs.open(full_file_path, 'r') as html_file:

        #loop over all lines, we stop at the first line that contains the string
        for line in html_file:

            #skip lines without the string_for_identification
            if string_for_identification not in line:
                continue

            #1 remove string_for_identification from line. str.replace removes the exact 
            #substring; str.strip would treat the argument as a set of characters and could 
            #also eat characters of the value itself
            modified_line = line.replace(string_for_identification, '')

            #2 remove all patterns in list strip_strings from line
            for pattern in strip_strings:
                modified_line = re.sub(pattern, '', modified_line)

            #3 try to convert to float
            try:
                return float(modified_line)
            except ValueError as error:
                raise ValueError("{} can not be converted to float. Please update strip_strings to adhere only float. Strip_strings currently is {}".format(modified_line, strip_strings)) from error

    #no line contained string_for_identification
    return None

#only execute code below when this script is executed
if __name__ == '__main__':

    #define search directory
    #TODO: fill in the path to the folder that holds the Hydra-NL output
    directory = ''

    #stop early with a clear message instead of searching an unintended location
    if not directory:
        raise SystemExit("Please set 'directory' to the folder that contains the Hydra-NL output")

    #find all uitvoer.html files
    html_files = list_files(file_incl='uitvoer.html', path = directory)

    #define which strings we want to find and what to strip from found rows
    #first is for wind speed, second for wind direction
    strings_to_find = ['potentiële windsnelheid u [m/s]', 
                       'windrichting r (bijdrage aan ov.freq)',
                       'significante golfhoogte Hm0 [m]',
                       'spectrale golfperiode Tm-1,0 [s]',
                       'golfrichting t.o.v. Noord [graden]',
                       'Uitwendige dijknormaal',
                       'lokale waterstand h [m+NAP]']

    #define patterns that should be stripped from rows to end up with float
    #one list per entry in strings_to_find, in the same order
    #the second pattern of the wind direction is a raw string so the backslashes reach re 
    #unchanged, it removes a part between parentheses
    strip_strings = [['\\|'], 
                     ['\\|', r" ?\([^)]+\)"], 
                     ['\\|'], 
                     ['\\|'], 
                     ['\\|'], 
                     ['=', '\\(°\\)'],
                     ['\\|']]

    #define names the variables will have in output, same order as strings_to_find
    variable_keys = ['windsnelheid [m/s]', 
                     'windrichting [deg noord]', 
                     'Hs [m]', 
                     'Tm-1,0 [s]', 
                     'wave direction', 
                     'dijk normaal',
                     'water level [m NAP]']

    #make an empty dictionary where we store the results, one list per output column
    uitvoer_dict = {'sectie' :[], 'overslagdebiet [m3/s]' : []}
    for variable_key in variable_keys:
        uitvoer_dict[variable_key] = []

    #loop over all uitvoer.html files we found and store the obtained output
    for path_file in html_files:

        #define raai as the name of the folder two levels above the file
        raai = os.path.basename(os.path.dirname(os.path.dirname(path_file)))
        uitvoer_dict['sectie'].append(raai)

        #get overslagdebiet from the name of the folder that holds the file: the part after 
        #the last underscore, with a decimal comma replaced by a decimal point
        q = os.path.basename(os.path.dirname(path_file)).split('_')[-1]
        q = q.replace(',', '.')

        #add to output (still a string here, converted to float further below)
        uitvoer_dict['overslagdebiet [m3/s]'].append(q)

        #loop over all variables we want to read from html file
        for string_to_find, strip_string, variable_key in zip(strings_to_find, strip_strings, variable_keys):
            #get value from the html file
            value = search_html_file(path_file, string_to_find, strip_string)

            #append this value to the output
            uitvoer_dict[variable_key].append(value)

    #now convert dict to pandas dataframe
    df = pandas.DataFrame(uitvoer_dict)

    #set overslagdebiet as float type
    df['overslagdebiet [m3/s]']= df['overslagdebiet [m3/s]'].astype(float)

    #now sort on section and overslagdebiet
    df = df.sort_values(by = ['sectie', 'overslagdebiet [m3/s]'])

    #write to csv one folder above this script, trick is to use sep = ';' which enables 
    #reading csv in excel
    #you can write to excel if you want
    savename = os.path.join(os.path.dirname(__file__), '..', 'overslaguitvoer_zonder_onzekerheid.csv')
    df.to_csv(savename, sep = ';', index = False)
casvbem commented 2 years ago

@BastiaanKuijper could you also share a code snippet to read HydraNL HTML output?

casvbem commented 2 years ago

@daanbader could you make a branch and give the code above a try?

casvbem commented 2 years ago

code received from Bastiaan attached as a zip file

reader.zip