sigdba / sig-shared-sceptre

Shared templates for Sceptre/CloudFormation
2 stars 0 forks source link

EcsCluster: awslogs deprecated #146

Open gillfimj opened 1 month ago

gillfimj commented 1 month ago

The package awslogs was removed from the AL2023 repository and is no longer supported. While this package was removed from the install package list in the user_data.txt content,

# Install awslogs and the jq JSON parser
dnf install -y jq wget aws-cfn-bootstrap aws-cli chrony python3-boto3

the rest of the script still references awslogs, which could cause failures when new instances are spun up.

# Inject the CloudWatch Logs configuration file contents
cat > /etc/awslogs/awslogs.conf <<- EOF
[general]
state_file = /var/lib/awslogs/agent-state

...

# Set the region to send CloudWatch Logs data to (the region where the container instance is located)
region=$(curl -s 169.254.169.254/latest/dynamic/instance-identity/document | jq -r .region)
sed -i -e "s/region = us-east-1/region = $region/g" /etc/awslogs/awscli.conf

...

  # Replace the cluster name and container instance ID placeholders with the actual values
  sed -i -e "s/{cluster}/$cluster/g" /etc/awslogs/awslogs.conf
  sed -i -e "s/{container_instance_id}/$container_instance_id/g" /etc/awslogs/awslogs.conf

  service awslogs start
  chkconfig awslogs on

Potential code updates involve installing the amazon-cloudwatch-agent package instead and creating a baseline JSON file that is used to configure and start the CloudWatch agent.

# Install the CloudWatch agent (replacing the removed awslogs package) and the jq JSON parser
dnf install -y jq wget aws-cfn-bootstrap aws-cli chrony python3-boto3 amazon-cloudwatch-agent

...

# Restart ecs after cluster name change.
# -f keeps the cleanup from reporting an error if agent.db does not exist yet.
rm -f /var/lib/ecs/data/agent.db
systemctl restart ecs

# Create config file
cat > /opt/aws/amazon-cloudwatch-agent/bin/config.json <<EOF
<see additional note for the full json example generated by running the CloudWatch agent config wizard>
EOF

...

# Configure and start the CloudWatch agent.
# fetch-config with -s loads the given config file and then (re)starts the
# agent, so a separate "-a start" beforehand (which would launch the agent
# before this config is applied) is not needed.
/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -s -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json
gillfimj commented 1 month ago

Sample JSON for the CloudWatch agent config. This was generated by running the following command after the CloudWatch agent was installed:

/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-config-wizard

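Resulting JSON (note: `{instance_id}` is resolved by the agent itself, but `{cluster}` is not a built-in agent placeholder and must still be substituted with the actual cluster name, e.g. via sed, before the agent loads the file):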

{
        "agent": {
                "metrics_collection_interval": 60,
                "run_as_user": "cwagent"
        },
        "logs": {
                "logs_collected": {
                        "files": {
                                "collect_list": [
                                        {
                                                "file_path": "/var/log/dmesg",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "dmesg",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        },
                                        {
                                                "file_path": "/var/log/messages",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "/var/log/messages",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        },
                                        {
                                                "file_path": "/var/log/docker",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "/var/log/docker",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        },
                                        {
                                                "file_path": "/var/log/ecs/ecs-init.log",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "/var/log/ecs/ecs-init.log",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        },
                                        {
                                                "file_path": "/var/log/ecs/ecs-agent.log.*",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "/var/log/ecs/ecs-agent.log",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        },
                                        {
                                                "file_path": "/var/log/ecs/audit.log.*",
                                                "log_group_class": "STANDARD",
                                                "log_group_name": "/var/log/ecs/audit.log",
                                                "log_stream_name": "{cluster}/{instance_id}",
                                                "retention_in_days": 30
                                        }
                                ]
                        }
                }
        },
        "metrics": {
                "aggregation_dimensions": [
                        [
                                "InstanceId"
                        ]
                ],
                "append_dimensions": {
                        "AutoScalingGroupName": "{aws:AutoScalingGroupName}",
                        "ImageId": "{aws:ImageId}",
                        "InstanceId": "{aws:InstanceId}",
                        "InstanceType": "{aws:InstanceType}"
                },
                "metrics_collected": {
                        "disk": {
                                "measurement": [
                                        "used_percent"
                                ],
                                "metrics_collection_interval": 60,
                                "resources": [
                                        "*"
                                ]
                        },
                        "mem": {
                                "measurement": [
                                        "mem_used_percent"
                                ],
                                "metrics_collection_interval": 60
                        },
                        "statsd": {
                                "metrics_aggregation_interval": 60,
                                "metrics_collection_interval": 60,
                                "service_address": ":8125"
                        }
                }
        }
}
gillfimj commented 1 week ago

In certain circumstances, `dnf upgrade` fails because it cannot find any valid repository. The following retry block should go in the "Apply security updates" section.

            # Retry "dnf upgrade" until it succeeds, giving up after max_attempts
            # failed tries. The pause grows by one second per failed attempt
            # (1s, 2s, 3s, ...) to give the repositories time to become reachable.
            attempt=0
            max_attempts=10
            until dnf upgrade -y; do
                attempt=$((attempt + 1))
                if [ "$attempt" -ge "$max_attempts" ]; then
                    # Out of retries: report on stderr and abort the script.
                    echo "Failed to upgrade after $max_attempts attempts. Exiting." >&2
                    exit 1
                fi
                echo "dnf upgrade failed (attempt $attempt/$max_attempts), retrying in $attempt seconds..." >&2
                sleep "$attempt"
            done
            echo "system upgraded successfully."