aws / sagemaker-training-toolkit

Train machine learning models within a 🐳 Docker container using 🧠 Amazon SageMaker.
Apache License 2.0

Model and output files do not get saved to S3 when training own model #82


fiocam commented 4 years ago

I am trying to train my own model on AWS SageMaker in order to eventually create an endpoint, basically following this tutorial: https://github.com/aws/amazon-sagemaker-examples/tree/master/advanced_functionality/r_bring_your_own, but using Python code for the model. When I execute the create_training_job call, no error message comes up and a training job is created in SageMaker. However, no files are stored in the S3 model or output directory. When I click the link that should lead to the model.tar.gz file in the training job directory, that folder is also empty. I have included my Dockerfile, algorithm.py, and an extract of my .ipynb file below.

Any help is greatly appreciated!

Dockerfile

# Select image to use
FROM python:3.6

# Add the requirements file to the working directory
COPY docker/requirements.txt .

# Install all packages defined in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the training code into the container
COPY algorithm.py /opt/ml/algorithm.py

# Define entrypoint
ENTRYPOINT ["python3.6", "/opt/ml/algorithm.py"]

algorithm.py

#!/usr/bin/env python
# coding: utf-8

# # Sample Algorithm

# Import packages
import boto3
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import json
import os
import signal
import sys
import time
import pickle

# Create KMS client
kms = boto3.client('kms', region_name='eu-west-1')

# Setup parameters
# Container directories
input_dir = '/opt/ml/input'
model_dir = '/opt/ml/model'
output_dir = '/opt/ml/output'

#channel name for training
channel_name = 'train'
training_path = os.path.join(input_dir, channel_name)
failure_path = output_dir + '/failure'

def arima_algo():

    try:

        # Read the training data (a leading '/' in the second argument would make
        # os.path.join discard training_path, so use a relative file name here)
        train_data = pd.read_csv(os.path.join(training_path, 'train_data.csv'))

        # Fit a SARIMAX model on the target column of the training data
        model = SARIMAX(train_data['y'], order=(2, 1, 2), enforce_invertibility=False)
        model_fit = model.fit()

        prediction = pd.DataFrame({'ds': pd.date_range(start='2019-01-01', periods=4, freq='MS'),
                                   'yhat': model_fit.predict(47, 50)}).reset_index(drop=True)

        # Save the fitted model under /opt/ml/model so SageMaker packs it into model.tar.gz
        # (SARIMAX has no to_json()/save_weights(); pickle the fitted results instead)
        with open(os.path.join(model_dir, 'sarimax_model.pkl'), 'wb') as model_file:
            pickle.dump(model_fit, model_file)
        print("Saved model to disk")

        #save predictions
        prediction.to_csv(os.path.join(output_dir,r'prediction.csv'))

    except Exception:
        print('Failed to train: %s' % (sys.exc_info()[0]))
        # Write the failure reason to /opt/ml/output/failure so SageMaker can report it
        with open(failure_path, 'w') as failure_file:
            failure_file.write('Failed to train: %s' % (sys.exc_info()[0]))
        raise

if __name__ == '__main__':
    arima_algo()

Extract of .ipynb file

Publish container

%%sh

# Set the AWS variables for later purpose
ecr_repo='python3repo'
aws_region='eu-west-1'

# Check if AWS ECR repository already exists
aws ecr describe-repositories --repository-names ${ecr_repo} --region ${aws_region} > /dev/null 2>&1

# Create the repository on AWS ECR (if not already existent)
if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name ${ecr_repo} --region ${aws_region}
fi

# Check whether an image with tag 'latestfiona' already exists in the repository
aws ecr describe-images --repository-name ${ecr_repo} --image-ids imageTag=latestfiona > /dev/null 2>&1

# If it exists, delete the 'latestfiona' image so it can be replaced by the new build
if [ $? -eq 0 ]
then
    aws ecr batch-delete-image --repository-name ${ecr_repo} --image-ids imageTag=latestfiona
fi

# Get the repositoryUri
repo_uri=$(aws ecr describe-repositories --repository-names ${ecr_repo} --region ${aws_region} --query repositories[0].repositoryUri --output text)':latestfiona'

# Login to the repositoryUri
aws ecr get-login-password --region ${aws_region} | docker login --username AWS --password-stdin ${repo_uri}

sudo docker build -f docker/Dockerfile -t python3container:latestfiona .

# Tag the python3container image with the repositoryUri
docker tag python3container:latestfiona ${repo_uri}    

# Push the image to Amazon ECR with the repositoryUri value from the earlier step
docker push ${repo_uri}

Train

# Create ECR client
ecr = boto3.client('ecr', region_name='eu-west-1')
kms = boto3.client('kms', region_name='eu-west-1')

# Read the image name to use for training
img_name = ecr.describe_repositories(repositoryNames=['python3repo'])['repositories'][0]['repositoryUri'] + ':latestfiona'

smxdatalab_job = 'DEMO-training-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", smxdatalab_job)

training_params = {
    "RoleArn": role,
    "TrainingJobName": smxdatalab_job,
    "AlgorithmSpecification": {
        "TrainingImage": img_name,
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/output/".format(bucket)
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

%%time

region = boto3.Session().region_name
sm = boto3.client('sagemaker')

sm.create_training_job(**training_params)

status = sm.describe_training_job(TrainingJobName=smxdatalab_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=smxdatalab_job)

# Re-read the status after the waiter returns; the value fetched before waiting would still be 'InProgress'
status = sm.describe_training_job(TrainingJobName=smxdatalab_job)['TrainingJobStatus']
print("Training job ended with status: " + status)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=smxdatalab_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')
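
Once the waiter returns, the artifact location that SageMaker reports for the job can be checked directly. A minimal sketch, assuming the sm client and the smxdatalab_job name from the cells above:

# Inspect where SageMaker uploaded (or would upload) the packed model artifacts
desc = sm.describe_training_job(TrainingJobName=smxdatalab_job)
print('Final status:   ', desc['TrainingJobStatus'])
print('Model artifacts:', desc['ModelArtifacts']['S3ModelArtifacts'])
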
chuyang-deng commented 4 years ago

Hi @fiocam, are you running your training job with your own container? (img_name = ecr.describe_repositories(repositoryNames=['python3repo'])['repositories'][0]['repositoryUri'] + ':latestfiona' )

Could you share your training logs as well as your container dockerfile?
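
For reference, the training job's logs land in CloudWatch under the /aws/sagemaker/TrainingJobs log group. A minimal sketch for pulling them with boto3, assuming the smxdatalab_job name from the snippets above:

import boto3

logs = boto3.client('logs', region_name='eu-west-1')
log_group = '/aws/sagemaker/TrainingJobs'

# Log stream names are prefixed with the training job name
streams = logs.describe_log_streams(logGroupName=log_group,
                                    logStreamNamePrefix=smxdatalab_job)['logStreams']
for stream in streams:
    events = logs.get_log_events(logGroupName=log_group,
                                 logStreamName=stream['logStreamName'],
                                 startFromHead=True)['events']
    for event in events:
        print(event['message'])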

fiocam commented 4 years ago

Hi Chuyang Deng,

Thanks for your answer! Yes, I'm using my own container. The training script and Dockerfile are the ones already included in the original post above.

zwarshavsky commented 3 years ago

Same issue. Would love an update.

steelersd commented 3 years ago

@fiocam I know this doesn't answer your question, but maybe take a look at the Python script-mode example. I've been using something similar to that example successfully for a few months now. There is more info in the SageMaker Training Toolkit documentation.
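
For example, a minimal script-mode sketch with the SageMaker Python SDK's SKLearn estimator; the role ARN, S3 URI, and framework_version below are illustrative assumptions, not values from this thread:

from sagemaker.sklearn.estimator import SKLearn

role = 'arn:aws:iam::111122223333:role/SageMakerExecutionRole'  # placeholder role ARN

# Script mode: the entry point runs inside a prebuilt container, and everything
# written to SM_MODEL_DIR (/opt/ml/model) is uploaded to S3 as model.tar.gz
estimator = SKLearn(
    entry_point='algorithm.py',
    framework_version='0.23-1',   # assumed; use a scikit-learn version available in your region
    py_version='py3',
    instance_type='ml.m4.xlarge',
    instance_count=1,
    role=role,
)

estimator.fit({'train': 's3://my-bucket/my-prefix/train/'})  # placeholder S3 URI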

lucky6qi commented 2 years ago

Basically, only the contents of /opt/ml/model are packed into model.tar.gz and saved to the output_path you specified, so make sure the files you need are written to that directory. /opt/ml/output is just the folder used to indicate whether your training succeeded or failed.
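
A minimal sketch of that advice, assuming the fitted results object model_fit from algorithm.py above; when the training toolkit is installed in the container, the same directory is also exposed through the SM_MODEL_DIR environment variable:

import os
import pickle

# Only the contents of this directory are packed into model.tar.gz and uploaded to the S3 output path
model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')

with open(os.path.join(model_dir, 'sarimax_model.pkl'), 'wb') as f:
    pickle.dump(model_fit, f)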