Open Sypek opened 1 year ago
@Sypek Are you using any Metaflow extensions? Would it be possible for you to share the output of `python flow.py step-functions create --only-json`?
I have the exact same problem with Metaflow 2.12.8.
The JSON output is:
MLflow Experiment name: JobSectioningModelTraining profile-data-ml-production
2024-07-19 14:50:56.261 Bootstrapping virtual environment(s) ...
2024-07-19 14:50:56.649 Virtual environment(s) bootstrapped!
{
"StartAt": "start",
"States": {
"start": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--start--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "start",
"metaflow.version": "2.12.8",
"step_name": "start",
"metaflow.production_token": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd",
"metaflow.run_id.$": "$$.Execution.Name"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/start/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' && export PATH=$PATH:$(pwd)/micromamba && if ! linux-64/26f0fca042bd88a/bin/python -s train.py dump --max-value-size=0 sfn-${METAFLOW_RUN_ID}/_parameters/${AWS_BATCH_JOB_ID}-params >/dev/null 2>/dev/null; then python -m metaflow.plugins.aws.step_functions.set_batch_environment parameters mggwzlflzp && . 
`pwd`/mggwzlflzp && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal init --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID}-params; fi && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step start --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/_parameters/${AWS_BATCH_JOB_ID}-params) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_PARAMETERS",
"Value.$": "$.Parameters"
},
{
"Name": "METAFLOW_DEFAULT_PARAMETERS",
"Value": "{\"build-env\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/7696d24d15dbf75cf74eb2b213708f0a72737b50\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(/var/folders/5_/dqkkbk7n591djv58p8zn29w40000gp/T/tmp51zdutgs)\\\", \\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 220}\", \"git_diff\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/a69f494e767a4b3431e6d097f3b5f68cd7e3de67\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(/var/folders/5_/dqkkbk7n591djv58p8zn29w40000gp/T/tmp3irotdfd)\\\", \\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 476055}\", \"git-env\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/b35fc735fbfffcbfcb4dd45ff84d623cfff1a983\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(/var/folders/5_/dqkkbk7n591djv58p8zn29w40000gp/T/tmp2s958a2z)\\\", \\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 193}\", \"nw-config\": \"{\\\"NW_CONDA_CHANNEL\\\": \\\"https://bucket.vpce-039746235a06cd3ee-lzqoegpw.s3.eu-central-1.vpce.amazonaws.com/nw-ml-production-conda-channel\\\", \\\"MLFLOW_TRACKING_URI\\\": \\\"https://ml-platform.xing.io\\\", \\\"ARTIFACT_AUTO_MERGE_PREFIXES\\\": [\\\"_mlflow_rid_\\\"], \\\"MLFLOW_ENABLE_PARAMETER_AUTOLOG\\\": true}\", \"spacy_de_conf_file\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/9fa1fb8f8c481151704aee83fcfdd12a7804b1ce\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(config/base_textcat_de.cfg)\\\", 
\\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 1335}\", \"spacy_en_conf_file\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/8816ae7cf4742c84fe165b6cc317cf8ba9a8304c\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(config/base_textcat_en.cfg)\\\", \\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 1369}\", \"train_data\": \"{\\\"type\\\": \\\"uploader-v2\\\", \\\"url\\\": \\\"s3://profile-data-ml-production-ml-artifacts/data/JobSectioningModelTraining/079d6098add2afe88bae29764c51d85c787659ea\\\", \\\"is_text\\\": true, \\\"encoding\\\": \\\"utf-8\\\", \\\"note\\\": \\\"Internal representation of IncludeFile(data/sections_to_train.json)\\\", \\\"sub-type\\\": \\\"uploaded\\\", \\\"size\\\": 10511203}\"}"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "start"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$$.Execution.Name"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
},
{
"Name": "METAFLOW_SFN_DYNAMO_DB_TABLE",
"Value": "profile-data-ml-productionstep_functions_state"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"Next": "#prepare_data"
},
"#prepare_data": {
"Type": "Task",
"Resource": "arn:aws:states:::dynamodb:getItem",
"Parameters": {
"TableName": "profile-data-ml-productionstep_functions_state",
"Key": {
"pathspec": {
"S.$": "$.JobId"
}
},
"ConsistentRead": true,
"ProjectionExpression": "for_each_cardinality"
},
"ResultPath": "$.Result",
"Next": "*prepare_data"
},
"*prepare_data": {
"Type": "Map",
"MaxConcurrency": 100,
"ItemsPath": "$.Result.Item.for_each_cardinality.NS",
"Parameters": {
"JobId.$": "$.JobId",
"SplitParentTaskId.$": "$.JobId",
"Parameters.$": "$.Parameters",
"Index.$": "$$.Map.Item.Value"
},
"Next": "find_best_model",
"Iterator": {
"StartAt": "prepare_data",
"ProcessorConfig": {
"Mode": "INLINE"
},
"States": {
"prepare_data": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--prepare_data--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "prepare_data",
"metaflow.version": "2.12.8",
"step_name": "prepare_data",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']",
"split_parent_task_id_start.$": "$.SplitParentTaskId"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/prepare_data/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' 
&& export PATH=$PATH:$(pwd)/micromamba && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step prepare_data --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/start/${METAFLOW_PARENT_TASK_ID} --split-index $METAFLOW_SPLIT_INDEX) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_PARENT_TASK_ID",
"Value.$": "$.JobId"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/start/${METAFLOW_PARENT_TASK_ID}"
},
{
"Name": "METAFLOW_SPLIT_INDEX",
"Value.$": "$.Index"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "prepare_data"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"Next": "start_sklearn"
},
"start_sklearn": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--start_sklearn--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "start_sklearn",
"metaflow.version": "2.12.8",
"step_name": "start_sklearn",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']",
"split_parent_task_id_start.$": "$.Parameters.split_parent_task_id_start"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/start_sklearn/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' 
&& export PATH=$PATH:$(pwd)/micromamba && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step start_sklearn --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/prepare_data/${METAFLOW_PARENT_TASK_ID}) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_PARENT_TASK_ID",
"Value.$": "$.JobId"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/prepare_data/${METAFLOW_PARENT_TASK_ID}"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "start_sklearn"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
},
{
"Name": "METAFLOW_SFN_DYNAMO_DB_TABLE",
"Value": "profile-data-ml-productionstep_functions_state"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"Next": "#train_sklearn"
},
"#train_sklearn": {
"Type": "Task",
"Resource": "arn:aws:states:::dynamodb:getItem",
"Parameters": {
"TableName": "profile-data-ml-productionstep_functions_state",
"Key": {
"pathspec": {
"S.$": "$.JobId"
}
},
"ConsistentRead": true,
"ProjectionExpression": "for_each_cardinality"
},
"ResultPath": "$.Result",
"Next": "*train_sklearn"
},
"*train_sklearn": {
"Type": "Map",
"MaxConcurrency": 100,
"ItemsPath": "$.Result.Item.for_each_cardinality.NS",
"Parameters": {
"JobId.$": "$.JobId",
"SplitParentTaskId.$": "$.JobId",
"Parameters.$": "$.Parameters",
"Index.$": "$$.Map.Item.Value"
},
"Next": "merge_sklearn",
"Iterator": {
"StartAt": "train_sklearn",
"ProcessorConfig": {
"Mode": "INLINE"
},
"States": {
"train_sklearn": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--train_sklearn--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "train_sklearn",
"metaflow.version": "2.12.8",
"step_name": "train_sklearn",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']",
"split_parent_task_id_start_sklearn.$": "$.SplitParentTaskId",
"split_parent_task_id_start.$": "$.Parameters.split_parent_task_id_start"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/train_sklearn/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' 
&& export PATH=$PATH:$(pwd)/micromamba && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step train_sklearn --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/start_sklearn/${METAFLOW_PARENT_TASK_ID} --split-index $METAFLOW_SPLIT_INDEX) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_PARENT_TASK_ID",
"Value.$": "$.JobId"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/start_sklearn/${METAFLOW_PARENT_TASK_ID}"
},
{
"Name": "METAFLOW_SPLIT_PARENT_TASK_ID_FOR_FOREACH_JOIN",
"Value.$": "$.SplitParentTaskId"
},
{
"Name": "METAFLOW_SPLIT_INDEX",
"Value.$": "$.Index"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "train_sklearn"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
},
{
"Name": "METAFLOW_SFN_DYNAMO_DB_TABLE",
"Value": "profile-data-ml-productionstep_functions_state"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"End": true
}
}
},
"OutputPath": "$.[0]"
},
"merge_sklearn": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--merge_sklearn--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "merge_sklearn",
"metaflow.version": "2.12.8",
"step_name": "merge_sklearn",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']",
"split_parent_task_id_start_sklearn.$": "$.Parameters.split_parent_task_id_start_sklearn",
"split_parent_task_id_start.$": "$.Parameters.split_parent_task_id_start"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/merge_sklearn/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' && export PATH=$PATH:$(pwd)/micromamba && python -m metaflow.plugins.aws.step_functions.set_batch_environment parent_tasks lvokkenhly && . `pwd`/lvokkenhly && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step merge_sklearn --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/train_sklearn/:${METAFLOW_PARENT_TASK_IDS}) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_SPLIT_PARENT_TASK_ID",
"Value.$": "$.Parameters.split_parent_task_id_start_sklearn"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/train_sklearn/:${METAFLOW_PARENT_TASK_IDS}"
},
{
"Name": "METAFLOW_SPLIT_PARENT_TASK_ID_FOR_FOREACH_JOIN",
"Value.$": "$.Parameters.split_parent_task_id_start"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "merge_sklearn"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
},
{
"Name": "METAFLOW_SFN_DYNAMO_DB_TABLE",
"Value": "profile-data-ml-productionstep_functions_state"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"End": true
}
}
},
"OutputPath": "$.[0]"
},
"find_best_model": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--find_best_model--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "find_best_model",
"metaflow.version": "2.12.8",
"step_name": "find_best_model",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/find_best_model/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' && export PATH=$PATH:$(pwd)/micromamba && python -m metaflow.plugins.aws.step_functions.set_batch_environment parent_tasks yvqqqgnqho && . `pwd`/yvqqqgnqho && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step find_best_model --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/merge_sklearn/:${METAFLOW_PARENT_TASK_IDS}) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_SPLIT_PARENT_TASK_ID",
"Value.$": "$.Parameters.split_parent_task_id_start"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/merge_sklearn/:${METAFLOW_PARENT_TASK_IDS}"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "find_best_model"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
},
{
"Name": "METAFLOW_SFN_DYNAMO_DB_TABLE",
"Value": "profile-data-ml-productionstep_functions_state"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"Next": "end"
},
"end": {
"Type": "Task",
"Resource": "arn:aws:states:::batch:submitJob.sync",
"Parameters": {
"JobDefinition": "arn:aws:batch:eu-central-1:504971495248:job-definition/metaflow_f1534b33ce3799c5b40fe0c360424871edb372ddfee0ca7b0218f758:1",
"JobName": "SFN-JobSectioningModelTraining--end--",
"JobQueue": "arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production",
"Parameters": {
"metaflow.user": "SFN",
"metaflow.owner": "michael_aydinbas",
"metaflow.flow_name": "JobSectioningModelTraining",
"metaflow.step_name": "end",
"metaflow.version": "2.12.8",
"step_name": "end",
"metaflow.run_id.$": "$.Parameters.['metaflow.run_id']"
},
"ContainerOverrides": {
"Command": [
"bash",
"-c",
"true && mkdir -p $PWD/.logs && export PYTHONUNBUFFERED=x MF_PATHSPEC=JobSectioningModelTraining/sfn-$METAFLOW_RUN_ID/end/$AWS_BATCH_JOB_ID MF_DATASTORE=s3 MF_ATTEMPT=$((AWS_BATCH_JOB_ATTEMPT-1)) MFLOG_STDOUT=$PWD/.logs/mflog_stdout MFLOG_STDERR=$PWD/.logs/mflog_stderr && mflog(){ T=$(date -u -Ins|tr , .); echo \"[MFLOG|0|${T:0:26}Z|task|$T]$1\" >> $MFLOG_STDOUT; echo $1; } && mflog 'Setting up task environment.' && python -m pip install requests -qqq && python -m pip install awscli boto3 -qqq && mkdir metaflow && cd metaflow && mkdir .metaflow && i=0; while [ $i -le 5 ]; do mflog 'Downloading code package...'; python -m awscli ${METAFLOW_S3_ENDPOINT_URL:+--endpoint-url=\"${METAFLOW_S3_ENDPOINT_URL}\"} s3 cp s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 job.tar >/dev/null && mflog 'Code package downloaded.' && break; sleep 10; i=$((i+1)); done && if [ $i -gt 5 ]; then mflog 'Failed to download code package from s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860 after 6 tries. Exiting...' && exit 1; fi && TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar && mflog 'Task is starting.' && (echo 'Bootstrapping virtual environment...' && DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap JobSectioningModelTraining 26f0fca042bd88a s3 linux-64 && echo 'Environment bootstrapped.' && export PATH=$PATH:$(pwd)/micromamba && linux-64/26f0fca042bd88a/bin/python -s train.py --with batch:cpu=1,gpu=0,memory=4096,image=public.ecr.aws/docker/library/python:3.10,queue=arn:aws:batch:eu-central-1:504971495248:job-queue/profile-data-ml-production,iam_role=arn:aws:iam::504971495248:role/nw-ml-batch-ecs-policy,use_tmpfs=False,tmpfs_tempdir=True,tmpfs_path=/metaflow_temp --quiet --metadata=service --environment=conda --datastore=s3 --datastore-root=s3://profile-data-ml-production-ml-artifacts/metaflow --event-logger=nullSidecarLogger --monitor=nullSidecarMonitor --no-pylint --with=step_functions_internal step end --run-id sfn-$METAFLOW_RUN_ID --task-id ${AWS_BATCH_JOB_ID} --retry-count $((AWS_BATCH_JOB_ATTEMPT-1)) --max-user-code-retries 0 --input-paths sfn-${METAFLOW_RUN_ID}/find_best_model/${METAFLOW_PARENT_TASK_ID}) 1>> >(python -m metaflow.mflog.tee task $MFLOG_STDOUT) 2>> >(python -m metaflow.mflog.tee task $MFLOG_STDERR >&2); c=$?; python -m metaflow.mflog.save_logs; exit $c"
],
"ResourceRequirements": [
{
"Value": "1",
"Type": "VCPU"
},
{
"Value": "4096",
"Type": "MEMORY"
}
],
"Environment": [
{
"Name": "AWS_DEFAULT_REGION",
"Value": "eu-central-1"
},
{
"Name": "METAFLOW_CODE_SHA",
"Value": "f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_CODE_DS",
"Value": "s3"
},
{
"Name": "METAFLOW_USER",
"Value": "SFN"
},
{
"Name": "METAFLOW_SERVICE_URL",
"Value": "https://ml-platform.xing.io:8080/"
},
{
"Name": "METAFLOW_SERVICE_HEADERS",
"Value": "{}"
},
{
"Name": "METAFLOW_DATASTORE_SYSROOT_S3",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow"
},
{
"Name": "METAFLOW_DATATOOLS_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/data"
},
{
"Name": "METAFLOW_DEFAULT_DATASTORE",
"Value": "s3"
},
{
"Name": "METAFLOW_DEFAULT_METADATA",
"Value": "service"
},
{
"Name": "METAFLOW_CARD_S3ROOT",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/mf.cards"
},
{
"Name": "METAFLOW_RUNTIME_ENVIRONMENT",
"Value": "aws-batch"
},
{
"Name": "METAFLOW_PARENT_TASK_ID",
"Value.$": "$.JobId"
},
{
"Name": "METAFLOW_INPUT_PATHS",
"Value": "sfn-${METAFLOW_RUN_ID}/find_best_model/${METAFLOW_PARENT_TASK_ID}"
},
{
"Name": "METAFLOW_CODE_URL",
"Value": "s3://profile-data-ml-production-ml-artifacts/metaflow/JobSectioningModelTraining/data/f2/f28bb46c02af58001e638d12e03e0d0fed73e860"
},
{
"Name": "METAFLOW_FLOW_NAME",
"Value": "JobSectioningModelTraining"
},
{
"Name": "METAFLOW_STEP_NAME",
"Value": "end"
},
{
"Name": "METAFLOW_RUN_ID",
"Value.$": "$.Parameters.['metaflow.run_id']"
},
{
"Name": "METAFLOW_PRODUCTION_TOKEN",
"Value": "profile-data-ml-production_jobsectioningmodeltraining-0-ybhd"
},
{
"Name": "SFN_STATE_MACHINE",
"Value": "profile-data-ml-production_JobSectioningModelTraining"
},
{
"Name": "METAFLOW_OWNER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_RUNTIME_NAME",
"Value": "step-functions"
},
{
"Name": "USER",
"Value": "michael_aydinbas"
},
{
"Name": "METAFLOW_VERSION",
"Value": "{\"platform\": \"Darwin\", \"username\": \"michael_aydinbas\", \"production_token\": \"profile-data-ml-production_jobsectioningmodeltraining-0-ybhd\", \"runtime\": \"dev\", \"app\": null, \"environment_type\": \"conda\", \"use_r\": false, \"python_version\": \"3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]\", \"python_version_code\": \"3.10.14\", \"metaflow_version\": \"2.12.8\", \"script\": \"train.py\", \"flow_name\": \"JobSectioningModelTraining\"}"
}
]
},
"RetryStrategy": {
"Attempts": 1
},
"Timeout": {
"AttemptDurationSeconds": 432000
}
},
"Retry": [
{
"ErrorEquals": [
"Batch.AWSBatchException"
],
"BackoffRate": 2,
"IntervalSeconds": 2,
"MaxDelaySeconds": 60,
"MaxAttempts": 10,
"JitterStrategy": "FULL"
}
],
"OutputPath": "$.['JobId', 'Parameters', 'Index', 'SplitParentTaskId']",
"End": true
}
}
}
Hi,
I'm deploying my flow on AWS Step Functions and unfortunately I'm getting this error:
I saw two issues describing a similar problem:
I've already stopped using Metaflow parameters to reduce the number of variables passed.
I'm using:
Do you have any tips on how to avoid this error? Is there anything on the Metaflow side that can be done to prevent it? Thanks in advance.