MBKEngineers / collect

MBK Python scripts for scraping water data from the web
MIT License
3 stars 1 forks source link

New Data Feed in wcds.py #88

Open cmora1es opened 1 year ago

cmora1es commented 1 year ago

Client Code: 2496.14

Data Source

Pull Daily Operation Monthly Reports from US Army Corps of Engineers Sacramento District Website - Water Control Data System: URL to be used = https://www.spk-wc.usace.army.mil/fcgi-bin/monthly.py?month=feb&year=2023&project=frm Where "month=", "year=" and "project=" are inputs to a function.

Sample Code:

def download_table(month, year, project):
    url = f"https://www.spk-wc.usace.army.mil/fcgi-bin/monthly.py?month={month}&year={year}&project={project}&textonly=true"
    response = requests.get(url, verify=False)
    lines = response.text.splitlines()
    rows = [line.split() for line in lines if line.strip()]
    return rows

def save_to_csv(data, file, month_num, year):
    for j, sublist in enumerate(data):
        if sublist[0] == 'Totals':
            break

    for i, sublist in enumerate(data[2:]):
        if sublist[0] == 'Date':
            name_list = sublist 
            break

    units=data[i+3]
    column_name = []
    for i, item in enumerate(name_list):
        if i > 0 and item.startswith('#'):
            column_name[-1] = ' '.join([column_name[-1], item])
        else:
            column_name.append(item)
    in_index = column_name.index('(in)')
    units.insert(0, '')
    units.insert(in_index, '(in)')

    if len(column_name)==len(units):
        column_list = [f'{column_name[i]}\n{units[i]}' for i in range(len(column_name))]
    else: 
        column_list = data[8]

    df = pd.DataFrame(data[9:j])
    df['datetime']= [f'{year}-{month_num}-{i}' for i in df[0]]
    df.set_index('datetime', inplace=True)
    df.columns = column_list

    table = df
    print(table)
    table.to_csv(file, index = True, header=column_list, mode='a')

def download_and_save(month_num, months, years, project, file_prefix):
    for year in years:
        for i in range(len(month_num)):
            data = download_table(months[i], year, project)
            file = f"{year}_{month_num[i]}_{file_prefix}.csv"
            save_to_csv(data, file, month_num[i], year)

def append_to_csv(input_file, output_file):
    with open(input_file, 'r') as in_file, open(output_file, 'a') as out_file:
        reader = csv.reader(in_file)
        writer = csv.writer(out_file)
        header = next(reader)
        if not os.path.isfile(output_file):
            writer.writerow(header)
        for row in reader:
            writer.writerow(row)

months = ['jan', 'feb', 'mar','apr','may','dec']
month_num = ['01','02','03','12']
years = range(2000,2004)
project = 'eng'
file_prefix = 'eng'
download_and_save(month_num, months, years, project, file_prefix)

output_file = 'eng_output_1990-2022.csv'

for year in years:
    for i in range(len(month_num)):
        input_file = f"{year}_{month_num[i]}_{file_prefix}.csv"
        append_to_csv(input_file, output_file)

Use Case

This URL will be an alternative source to pull data starting 1990 as opposed to starting 1995 when collecting from Corps and Section 7 Projects in California - Hourly/Daily Data and Plots

This feature will be added in ".../collect/collect/usace/wcds.py"

Breaking Changes

Unknown

Priority

Low

narlesky commented 1 year ago

@cmora1es here's my pseudo-code notes from our meeting:

def get_monthly_report_data_for_reservoir(reservoir, year, month):
    """
    description (omit the totals)

    Arguments:
        reservoir (str):
        year (int):
        month (??): as a number or as a string?
    Returns:
    """

    # use helper function to lookup the columns as long as it's reliable for the reservoir
    # for the whole period of record

    return {'data': df, # date-indexed dataframe
            'info': {'url': ,
                    'date_published':
                    'date_retrieved': dt.datetime.now(),
                    'raw': the plain text}}

def get_por_monthly_report_data_for_reservoir(reservoir, start, end):

    frames = []
    # list of months between start and end
    for month in list:
        frames.append(get_monthly_report_data_for_reservoir(reservoir, year, month)['data'])
    df = pd.concat(frames, axis=0)

    return {'data': df, 
            'info': {
                    'urls': [],
                    'dates_published': [],
                    'date_retrieved': dt.datetime.now(),
                    }}