rebuy-de / aws-nuke

Nuke a whole AWS account and delete all its resources.

Adding support for AWS Glue blueprints, ml transforms, sessions, and workflows #1218

Closed swhite-oreilly closed 2 weeks ago

swhite-oreilly commented 4 months ago

This PR adds support for Glue blueprints, ML transforms, sessions, and workflows.

Testing

Glue resources were created using the setup code below, and then aws-nuke was used to clean them up, targeting the new resource types: "GlueBlueprint", "GlueMLTransform", "GlueSession", and "GlueWorkflow".
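For reference, a minimal aws-nuke config along these lines can be used to limit a run to just these resource types. This is a sketch, not part of the PR; the account IDs, profile name, and file name are placeholders:

# Write a minimal aws-nuke config targeting only the new Glue resource types
# (placeholder account IDs; adjust before use)
cat << EOF > nuke-config.yaml
regions:
  - us-east-1
account-blocklist:
  - "999999999999"
accounts:
  "000000000000": {}
resource-types:
  targets:
    - GlueBlueprint
    - GlueMLTransform
    - GlueSession
    - GlueWorkflow
EOF

# aws-nuke runs in dry-run mode by default; add --no-dry-run to actually delete
aws-nuke -c nuke-config.yaml --profile <your-profile>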

Setup code

#!/bin/bash

# Generate a random string to use as a bucket name suffix
RANDOM_STRING=$(openssl rand -hex 20)
# Generate a random string for shorter names
SHORT_RANDOM_STRING=$(openssl rand -hex 10)

# Set your preferred bucket names
INPUT_BUCKET="input-bucket-$RANDOM_STRING"

# Get AWS account ID
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
echo "AWS Account ID: $AWS_ACCOUNT_ID"

# Create input bucket
aws s3api create-bucket --bucket $INPUT_BUCKET
echo "Input bucket created: s3://$INPUT_BUCKET"

# Create Glue_DefaultRole if it doesn't exist
aws iam create-role --role-name Glue_DefaultRole --assume-role-policy-document '{
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {
            "Service": "glue.amazonaws.com"
        },
        "Action": "sts:AssumeRole"
    }]
}'
echo "Glue_DefaultRole created"

# Attach policy to Glue_DefaultRole
aws iam attach-role-policy --role-name Glue_DefaultRole --policy-arn arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
echo "Policy attached to Glue_DefaultRole"

# Call the get-role command and save the ARN to a variable
ROLE_ARN=$(aws iam get-role --role-name Glue_DefaultRole --query 'Role.Arn' --output text)

# Print the ARN to verify
echo "IAM Role ARN: $ROLE_ARN"

# Create Python script
cat << EOF > glue_script.py
import numpy as np
print("Hello world")

a = np.array([20,30,40,50])
print(a)

b = np.arange(4)

print(b)

c = a - b

print(c)

d = b**2

print(d)
EOF
echo "Python script created"

# Upload Python script to the input bucket
aws s3 cp glue_script.py s3://$INPUT_BUCKET/
echo "Python script uploaded to s3://$INPUT_BUCKET/glue_script.py"

# Create Glue job using the uploaded Python script
aws glue create-job --name python-job-cli --role Glue_DefaultRole \
    --command '{"Name" :  "pythonshell", "PythonVersion": "3.9", "ScriptLocation" : "s3://'$INPUT_BUCKET'/glue_script.py"}'
echo "Glue job created"

# Create notebook file
cat << EOF > notebook.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Hello from Notebook!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
EOF

# Upload notebook file to the input bucket
aws s3 cp notebook.ipynb s3://$INPUT_BUCKET/
echo "Notebook uploaded to s3://$INPUT_BUCKET/notebook.ipynb"

# Create Glue notebook job
aws glue create-job --name notebook-job-cli-$SHORT_RANDOM_STRING --role Glue_DefaultRole \
    --command '{"Name" :  "glueetl", "ScriptLocation" : "s3://'$INPUT_BUCKET'/notebook.ipynb"}'
echo "Notebook job created"

# Create a KMS key
KMS_KEY_ARN=$(aws kms create-key --query 'KeyMetadata.Arn' --output text)
echo "KMS key created: $KMS_KEY_ARN"

# Extract the key ID (the last path component) from the ARN
KMS_KEY_ID=$(basename "$KMS_KEY_ARN")
echo "KMS key ID: $KMS_KEY_ID"

# Set encryption settings for the Data Catalog
aws glue put-data-catalog-encryption-settings \
    --data-catalog-encryption-settings '{
        "EncryptionAtRest": {
            "CatalogEncryptionMode": "SSE-KMS",
            "SseAwsKmsKeyId": "'"$KMS_KEY_ID"'"
        }
    }'
echo "Data Catalog encryption settings set"

# Create the database if it doesn't exist
aws glue create-database --database-input '{"Name": "my_database"}'
echo "Database created"

# Create a data catalog table
aws glue create-table --database-name my_database --table-input '{
    "Name": "my_table",
    "Description": "My Glue Data Catalog table",
    "PartitionKeys": [
        {
            "Name": "partition_column",
            "Type": "string"
        }
    ],
    "StorageDescriptor": {
        "Columns": [
            {
                "Name": "column1",
                "Type": "string"
            },
            {
                "Name": "column2",
                "Type": "int"
            }
        ],
        "Location": "s3://'$INPUT_BUCKET'/data/",
        "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "Compressed": false,
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "Parameters": {
                "field.delim": ","
            }
        }
    }
}'
echo "Data catalog table created"

# Set the partition values and location
PARTITION_VALUES="2024-03-15"
LOCATION="s3://$INPUT_BUCKET/data/$PARTITION_VALUES/"

# Create sample partition data
echo "Column1,Column2" > sample_partition_data.csv
echo "Value1,123" >> sample_partition_data.csv
echo "Value2,456" >> sample_partition_data.csv

# Upload sample partition data to S3
aws s3 cp sample_partition_data.csv "$LOCATION"

# Create the partition
aws glue create-partition \
    --database-name "my_database" \
    --table-name "my_table" \
    --partition-input '{
        "Values": ["'$PARTITION_VALUES'"],
        "StorageDescriptor": {
            "Location": "'$LOCATION'",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Compressed": false,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                "Parameters": {
                    "field.delim": ","
                }
            }
        }
    }'
echo "Partition created"

aws glue create-partition-index \
    --database-name my_database \
    --table-name my_table \
    --partition-index '{
        "Keys": ["partition_column"],
        "IndexName": "test_partition_index"
    }'
echo "Partition index created"

aws glue create-registry \
    --registry-name "test-registry" \
    --description "My Glue registry"
echo "Glue registry created"

echo "Getting a subnet from the default VPC"
DEFAULT_VPC_ID=$(aws ec2 describe-vpcs --filters "Name=isDefault,Values=true" --query 'Vpcs[0].VpcId' --output text)

SUBNET_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$DEFAULT_VPC_ID" "Name=availability-zone,Values=us-east-1*" --query 'Subnets[0].SubnetId' --output text)
echo "Using subnet $SUBNET_ID"

# Describe security groups and get the ID of the default group
SEC_GROUP_ID=$(aws ec2 describe-security-groups --group-names default --output text --query 'SecurityGroups[0].GroupId')
echo "Using security group id: $SEC_GROUP_ID"

# Create a Glue connection for Kafka
aws glue create-connection \
    --connection-input '{
        "Name":"my_kafka_connection",
        "Description":"Kafka connection to my Kafka broker",
        "ConnectionType":"KAFKA",
        "ConnectionProperties":{
            "KAFKA_BOOTSTRAP_SERVERS":"<Kafka-broker-server-url>:<Kafka-port>",
            "KAFKA_SSL_ENABLED":"true",
            "KAFKA_CUSTOM_CERT": "s3://bucket/prefix/cert-file.pem"
        },
        "PhysicalConnectionRequirements":{
            "SubnetId":"$SUBNET_ID",
            "SecurityGroupIdList":["$SEC_GROUP_ID"],
            "AvailabilityZone":"us-east-1a"
        }
    }'
echo "Glue connection for Kafka created"

# Create a schema
aws glue create-schema \
    --schema-name 'testSchemaJson' \
    --compatibility NONE \
    --data-format JSON \
    --schema-definition '{"$schema": "http://json-schema.org/draft-07/schema#","type":"object","properties":{"f1":{"type":"string"}}}'
echo "Glue schema created"

# Create a JSON classifier
aws glue create-classifier \
    --json-classifier '{
        "Name": "my_json_classifier",
        "JsonPath": "$[*].my_field"
    }'
echo "Glue JSON classifier created"

# Create a crawler
aws glue create-crawler \
    --name "my_crawler" \
    --role "Glue_DefaultRole" \
    --database-name "my_database" \
    --description "My Glue Crawler" \
    --targets '{
        "S3Targets": [
            {
                "Path": "s3://'$INPUT_BUCKET'/data/"
            }
        ]
    }'
echo "Glue crawler created"

# Create a Glue session
aws glue create-session --id test-glue-session --role "$ROLE_ARN" --command Name=glueetl,PythonVersion=3
echo "Glue session created"

# Create a Glue workflow
aws glue create-workflow --name test-workflow --description "My Glue workflow"

# Create a sample file in the run directory
echo "Sample content" > sample_file.txt

# Zip the sample file along with other contents for the blueprint
zip -r blueprint.zip ./sample_file.txt

# Upload the Zip file to the S3 bucket
aws s3 cp blueprint.zip "s3://$INPUT_BUCKET/blueprint.zip"

# Call create-blueprint command using the uploaded Zip file
aws glue create-blueprint \
    --name "testBlueprint" \
    --description "Your blueprint description" \
    --blueprint-location "s3://$INPUT_BUCKET/blueprint.zip"

# Create a Glue trigger
aws glue create-trigger \
    --name "testTrigger" \
    --workflow-name "test-workflow" \
    --type SCHEDULED \
    --schedule "cron(0 12 * * ? *)" \
    --actions '[{"JobName": "python-job-cli", "Arguments": {"arg1": "value1", "arg2": "value2"}}]'
echo "Glue trigger created"   

# Create ML Transform
aws glue create-ml-transform \
    --name "test-ml-transform" \
    --input-record-tables '[
        {
            "DatabaseName": "my_database",
            "TableName": "my_table"
        }
    ]' \
    --parameters '{
        "TransformType": "FIND_MATCHES",
        "FindMatchesParameters": {
            "PrimaryKeyColumnName": "column1",
            "PrecisionRecallTradeoff": 0.5,
            "AccuracyCostTradeoff": 0.5,
            "EnforceProvidedLabels": false
        }
    }' \
    --role "Glue_DefaultRole"
echo "ML Transform created"

# Clean up
rm -rf glue_script.py notebook.ipynb sample_partition_data.csv blueprint.zip sample_file.txt
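As an optional sanity check (not part of the original setup code), the newly supported resource types can be listed before and after the nuke run with standard Glue CLI calls:

# Optional verification: list the resources this PR teaches aws-nuke to delete
aws glue list-blueprints
aws glue list-sessions
aws glue list-workflows
aws glue get-ml-transforms --query 'Transforms[].Name'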
ekristen commented 2 weeks ago

@swhite-oreilly this PR was cherry-picked, as explained in https://github.com/ekristen/aws-nuke/pull/176, and merged into the actively maintained fork of aws-nuke.

If you have a chance to try it out, that would be great. If you run into issues, please report them in an issue over there.


Please see below a copy of the notice from the README about the deprecation of this project. Sven was kind enough to grant me access to help triage and close issues and pull requests that have already been addressed in the actively maintained fork. Additional information is available in the welcome issue.

[!CAUTION] This repository for aws-nuke is no longer being actively maintained. We recommend that users switch to the actively maintained fork of this project at ekristen/aws-nuke. We appreciate all the support and contributions we've received throughout the life of this project. We believe that the fork will continue to provide the functionality and support that you have come to expect from aws-nuke. Please note that this deprecation means we will not be addressing issues, accepting pull requests, or making future releases from this repository. Thank you for your understanding and support.