write tests - Githubissues

We have a new set of testing that have been performed to validate the package against and older version of downscaled data holdings. These could be wrangled into a nice little test suite that will run a test downscaling across a smaller subset of years in a NetCDF4 file, making sure to dump out anomalies in the output resolution/extent/crs, then rebuild the inputs (with some tolerance for float precision issues and rounding) from these output files.

This is the strongest test that the package is doing what it is supposed to be doing during the procedure.

Modifying something like this may do the trick:

# # # # # #
# compare new and old tas versions with a diff
# # # # # #

def sort_files( files, split_on='_', elem_month=-2, elem_year=-1 ):
    '''
    sort a list of files properly using the month and year parsed
    from the filename.  This is useful with SNAP data since the standard
    is to name files like '<prefix>_MM_YYYY.tif'.  If sorted using base
    Pythons sort/sorted functions, things will be sorted by the first char
    of the month, which makes thing go 1, 11, ... which sucks for timeseries
    this sorts it properly following SNAP standards as the default settings.
    ARGUMENTS:
    ----------
    files = [list] list of `str` pathnames to be sorted by month and year. usually from glob.glob.
    split_on = [str] `str` character to split the filename on.  default:'_', SNAP standard.
    elem_month = [int] slice element from resultant split filename list.  Follows Python slicing syntax.
        default:-2. For SNAP standard.
    elem_year = [int] slice element from resultant split filename list.  Follows Python slicing syntax.
        default:-1. For SNAP standard.
    RETURNS:
    --------
    sorted `list` by month and year ascending. 
    '''
    import pandas as pd
    months = [ int(fn.split('.')[0].split( split_on )[elem_month]) for fn in files ]
    years = [ int(fn.split('.')[0].split( split_on )[elem_year]) for fn in files ]
    df = pd.DataFrame( {'fn':files, 'month':months, 'year':years} )
    df_sorted = df.sort_values( ['year', 'month' ] )
    return df_sorted.fn.tolist()

def only_years( files, begin=1901, end=2100, split_on='_', elem_year=-1 ):
    '''
    return new list of filenames where they are truncated to begin:end
    ARGUMENTS:
    ----------
    files = [list] list of `str` pathnames to be sorted by month and year. usually from glob.glob.
    begin = [int] four digit integer year of the begin time default:1901
    end = [int] four digit integer year of the end time default:2100
    split_on = [str] `str` character to split the filename on.  default:'_', SNAP standard.
    elem_year = [int] slice element from resultant split filename list.  Follows Python slicing syntax.
        default:-1. For SNAP standard.
    RETURNS:
    --------
    sliced `list` to begin and end year.
    '''
    import pandas as pd
    years = [ int(fn.split('.')[0].split( split_on )[elem_year]) for fn in files ]
    df = pd.DataFrame( { 'fn':files, 'year':years } )
    df_slice = df[ (df.year >= begin ) & (df.year <= end ) ]
    return df_slice.fn.tolist()

def list_files( path, split_on='_', elem_month=-2, elem_year=-1 ):
    files = glob.glob( os.path.join( path, '*.tif' ) )
    return sort_files( files, split_on, elem_month, elem_year )

def diff( old_fn, new_fn, band=1 ):
    old = rasterio.open( old_fn ).read( band, masked=True )
    new = rasterio.open( new_fn ).read( band, masked=True )
    return old - new

def wrap( x ):
    return diff( x[0], x[1] )

def remove_anom_prism( ds_fn, anom_fn, prism_fn ):
    ds = rasterio.open( ds_fn ).read( 1, masked=True )
    anom = rasterio.open( anom_fn ).read( 1, masked=True )
    prism = rasterio.open( prism_fn ).read( 1, masked=True )
    return (ds - anom) - prism

def wrap2( x ):
    ds_fn, anom_fn, prism_fn = x
    return remove_anom_prism( ds_fn, anom_fn, prism_fn )

if __name__ == '__main__':
    import os, glob
    import rasterio
    import numpy as np
    from pathos.mp_map import mp_map

    # setup
    model = 'GFDL-CM3'
    scenario = 'rcp60'
    variable = 'tas'
    old_base = '/Data/Base_Data/Climate/AK_CAN_2km/projected/AR5_CMIP5_models'
    new_base = '/workspace/Shared/Tech_Projects/EPSCoR_Southcentral/project_data/downscaled_GFDL_TEST'
    prism_base = '/workspace/Shared/Tech_Projects/EPSCoR_Southcentral/project_data/prism'

    old_dir = os.path.join( old_base, scenario, model, variable )
    new_dir = os.path.join( new_base, model, scenario, variable )
    anom_dir = os.path.join( new_base, model, scenario, variable, 'anom' )
    prism_dir = os.path.join( prism_base, variable )

    # # EXAMPLE 1: list the data and get an idea of how diff they are in a given model/scenario
    old = list_files( old_dir )
    new = list_files( new_dir )
    args = zip( old, new )

    diff_arr = np.array( mp_map( wrap, args, nproc=32 ) )
    # mask it
    diff_masked = np.ma.masked_where( diff_arr == diff_arr.min(), diff_arr )
    print( 'min: {}  max: {}'.format( diff_masked.min(), diff_masked.max() ) )

    diff_uniques = [ zip(*np.unique( i, return_counts=True )) for i in diff_masked ]

    # # EXAMPLE 2: list the data and remove anoms and prism as a test
    ds = list_files( new_dir )
    anom = list_files( anom_dir, elem_month=-3, elem_year=-2 )
    prism = sorted( glob.glob( os.path.join( prism_dir, '*.tif' ) ) ) * ( len(ds) / 12 )
    args = zip( ds, anom, prism )

    diff_arr_prism = np.array( mp_map( wrap2, args, nproc=32 ) )
    mi = diff_arr_prism[ diff_arr_prism > -3.39999995e+38 ].min()
    ma = diff_arr_prism[ diff_arr_prism > -3.39999995e+38 ].max()

ua-snap / downscale

write tests #8