Bug in do_end_to_end bool argument. Should be action='store_false'

The issue is due to how the argparse module handles boolean command line arguments. When you pass a string to a boolean argument in argparse (like you are doing in the command line), it doesn't convert 'False' string to a boolean False. In Python, the bool function treats any non-empty string as True.

For boolean arguments, instead of using type=bool in add_argument, a better practice would be to use action='store_true' or action='store_false'.

These will store True if the argument is present and False otherwise (for store_true), or vice versa for store_false. You then use the flag on the command line if you want the value to be True.

For example:

parser.add_argument("--do_end_to_end", action='store_false')

And in the command line, if you want do_end_to_end to be False, you would include the flag:

python your_script.py --do_end_to_end

If you want the default to be False and to set it to True from the command line, use action='store_true' and include the flag when you want it to be True.

This way of handling boolean arguments is generally more intuitive for command-line users.

finall stores false:

brando9@ampere1:~/data/evaporate$ python ~/evaporate/evaporate/run_profiler.py \
>     --data_lake small_debug_lin_alg_textbook \
>     --do_end_to_end ''\
>     --num_attr_to_cascade 15 \
>     --num_top_k_scripts 5 \
>     --train_size 10 \
>     --combiner_mode mv \
>     --use_dynamic_backoff True \
>     --KEYS keys
Running main: main=<function main at 0x7f40486b4ca0>
os.getenv('CONDA_DEFAULT_ENV')='maf'
sys.argv=['/lfs/ampere1/0/brando9/evaporate/evaporate/run_profiler.py', '--data_lake', 'small_debug_lin_alg_textbook', '--do_end_to_end', '', '--num_attr_to_cascade', '15', '--num_top_k_scripts', '5', '--train_size', '10', '--combiner_mode', 'mv', '--use_dynamic_backoff', 'True', '--KEYS', 'keys']
experiment_args=Namespace(data_lake='small_debug_lin_alg_textbook', do_end_to_end=False, num_attr_to_cascade=15, num_top_k_scripts=5, train_size=10, combiner_mode='mv', use_dynamic_backoff=True, KEYS=['keys'])
experiment_args.do_end_to_end=False

its using the string False since strings are true:

brando9@ampere1:~/data/evaporate$ python ~/evaporate/evaporate/run_profiler.py \
>     --data_lake small_debug_lin_alg_textbook \
>     --do_end_to_end False\
>     --num_attr_to_cascade 15 \
>     --num_top_k_scripts 5 \
>     --train_size 10 \
>     --combiner_mode mv \
>     --use_dynamic_backoff True \
>     --KEYS keys

Running main: main=<function main at 0x7ff7f94b8ca0>
os.getenv('CONDA_DEFAULT_ENV')='maf'
sys.argv=['/lfs/ampere1/0/brando9/evaporate/evaporate/run_profiler.py', '--data_lake', 'small_debug_lin_alg_textbook', '--do_end_to_end', 'False', '--num_attr_to_cascade', '15', '--num_top_k_scripts', '5', '--train_size', '10', '--combiner_mode', 'mv', '--use_dynamic_backoff', 'True', '--KEYS', 'keys']
experiment_args=Namespace(data_lake='small_debug_lin_alg_textbook', do_end_to_end=True, num_attr_to_cascade=15, num_top_k_scripts=5, train_size=10, combiner_mode='mv', use_dynamic_backoff=True, KEYS=['keys'])
experiment_args.do_end_to_end=True

def get_experiment_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_lake", 
        type=str,
        help="Name of the data lake to operate over. Must be in configs.py"
    )

    parser.add_argument(
        "--do_end_to_end", 
        action='store_true',
        default=False,  # not end to end so do ClosedIE
        help="True for OpenIE, False for ClosedIE. Default is False so it does ClosedIE."
    )

    parser.add_argument(
        "--num_attr_to_cascade", 
        type=int,
        default=35,
        help="Number of attributes to generate functions for"
    )

    parser.add_argument(
        "--num_top_k_scripts",
        type=int,
        default=10,
        help="Number of generated functions to combine over for each attribute"
    )

    parser.add_argument(
        "--train_size",
        type=int,
        default=10,
        help="Number of files to prompt on"
    )

    parser.add_argument(
        "--combiner_mode",
        type=str,
        default='ws',
        help="Combiner mode for combining the outputs of the generated functions",
        choices=['ws', 'mv', 'top_k']
    )

    parser.add_argument(
        "--use_dynamic_backoff",
        action='store_true',  # flag --> gen functions
        default=False,  # default no flag --> evaporate direct
        help="True (flag set) for using generate functions, False (default) for using evaporate direct."
    )

    parser.add_argument(
        "--KEYS",
        type=str,
        default=[],
        help="List of keys to use the model api",
        nargs='*'
    )

    print(f'{sys.argv=}')
    experiment_args = parser.parse_args(args=sys.argv[1:])
    print(f'{experiment_args=}')
    return experiment_args

if you really want to do it with the original method do

def get_experiment_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data_lake", 
        type=str,
        help="Name of the data lake to operate over. Must be in configs.py"
    )

    parser.add_argument(
        "--do_end_to_end",  
        type=bool,
        default="True", 
        help="True for generating schema from data/OpenIE, False for ClosedIE/given schema. Default is True genererate schema/OpenIE.",
    )

    parser.add_argument(
        "--num_attr_to_cascade", 
        type=int,
        default=35,
        help="Number of attributes to generate functions for. "
    )

    parser.add_argument(
        "--num_top_k_scripts",
        type=int,
        default=10,
        help="Number of generated functions to combine over for each attribute"
    )

    parser.add_argument(
        "--train_size",
        type=int,
        default=10,
        help="Number of files to prompt on"
    )

    parser.add_argument(
        "--combiner_mode",
        type=str,
        default='ws',
        help="Combiner mode for combining the outputs of the generated functions",
        choices=['ws', 'mv', 'top_k']
    )

    parser.add_argument(
        "--use_dynamic_backoff",
        type=bool,
        default="True",  
        help="True (default) uses generated functions for extraction. Else, False uses evaporate direct/LLM cfor extraction."
    )

    parser.add_argument(
        "--KEYS",
        type=str,
        default=[],
        help="List of keys to use the model api",
        nargs='*'
    )

    print(f'{sys.argv=}')
    experiment_args = parser.parse_args(args=sys.argv[1:])
    experiment_args.do_end_to_end = True if experiment_args.do_end_to_end.lower() == 'true' else False
    experiment_args.use_dynamic_backoff = True if experiment_args.use_dynamic_backoff.lower() == 'true' else False
    print(f'{experiment_args=}')
    return experiment_args

HazyResearch / evaporate

Bug in do_end_to_end bool argument. Should be action='store_false' #27