Open mbaijal opened 2 years ago
That's a good idea @mbaijal. After a lot of trial and error, here is a proof of concept I came up with that finally works (the code is not clean; I will split it into more functions and tidy it up), but maybe it can help someone.
I am able to use this to delete all the needed resources, then re-create a new EFS file system from the automated script, and it works.
The very tricky, non-obvious part for me was deleting the deployment and the daemon set; it took me a while to figure out that without these deletions, your new EFS file system simply won't work even though the install doesn't fail.
import argparse
import boto3
import subprocess
from utils import get_cloud_formation_stacks
from utils import delete_stacks
from utils import wait_for_stacks_to_be_gone
from time import sleep
EFS_FILE_SYSTEM_NAME = "KubeflowEfs"
EFS_SECURITY_GROUP_NAME = "KubeflowEfsSecurityGroup"
EFS_STORAGE_CLASS_NAME = "efs-sc"
def delete_efs_resources():
print("Deleting EFS resources...")
try:
file_system = get_file_system()
delete_mount_targets(file_system)
delete_file_system(file_system)
delete_storage_class()
subprocess.run([
"eksctl",
"delete",
"iamserviceaccount",
"--name",
"efs-csi-controller-sa",
"--namespace",
"kube-system",
"--cluster",
CLUSTER_NAME,
"--region",
CLUSTER_REGION
])
stacks = get_cloud_formation_stacks(CLUSTER_REGION)
stacks_to_delete = [stack for stack in stacks if CLUSTER_NAME in stack and "efs-csi-controller-sa" in stack]
delete_stacks(stacks_to_delete, CLUSTER_REGION)
wait_for_stacks_to_be_gone(stacks_to_delete, CLUSTER_REGION)
except:
print("No file system found!")
try:
delete_security_group()
except:
print("No file system security group found!")
delete_daemon_set()
delete_deployment()
def get_file_system():
file_systems = get_file_systems()
return next(file_system for file_system in file_systems if file_system["Name"]==EFS_FILE_SYSTEM_NAME)
def get_file_systems():
client = get_efs_client()
return client.describe_file_systems()["FileSystems"]
def get_efs_client():
return boto3.client(
'efs',
region_name=CLUSTER_REGION,
)
def delete_mount_targets(file_system):
client = get_efs_client()
for mount_target in get_mount_targets(file_system):
mount_target_id = mount_target["MountTargetId"]
print(f"Deleting mount target {mount_target_id}...")
client.delete_mount_target(
MountTargetId=mount_target_id
)
wait_for_mount_targets_deletion(file_system)
def get_mount_targets(file_system):
client = get_efs_client()
return client.describe_mount_targets(FileSystemId=file_system["FileSystemId"])["MountTargets"]
def wait_for_mount_targets_deletion(file_system):
while (len(get_mount_targets(file_system)) > 0):
print("Waiting for EFS mount targets deletion to complete...")
sleep(1)
print("EFS mount targets deletion completed!")
def delete_file_system(file_system):
client = get_efs_client()
file_system_name = file_system["Name"]
file_system_id = file_system["FileSystemId"]
print(f"Deleting EFS file system Name: {file_system_name}, ID : {file_system_id}")
client.delete_file_system(
FileSystemId=file_system_id
)
wait_for_file_system_deletion()
def wait_for_file_system_deletion():
while any(file_system["Name"] == EFS_FILE_SYSTEM_NAME for file_system in get_file_systems()):
print("Waiting for EFS file system deletion to complete...")
sleep(1)
print(f"EFS file system {EFS_FILE_SYSTEM_NAME} deletion completed!")
def delete_storage_class():
subprocess.run([
"kubectl",
"delete",
"storageclass",
EFS_STORAGE_CLASS_NAME
])
def delete_security_group():
client = get_ec2_client()
security_group_id = get_security_group_id()
print(f"Deleting security group {security_group_id}...")
client.delete_security_group(
GroupId=security_group_id
)
def get_ec2_client():
return boto3.client(
'ec2',
region_name=CLUSTER_REGION,
)
def get_security_group_id():
client = get_ec2_client()
security_groups = client.describe_security_groups()["SecurityGroups"]
return next(security_group for security_group in security_groups if security_group["GroupName"]==EFS_SECURITY_GROUP_NAME)["GroupId"]
def delete_daemon_set():
subprocess.run([
"kubectl",
"delete",
"ds",
"efs-csi-node",
"-n",
"kube-system"
])
def delete_deployment():
subprocess.run([
"kubectl",
"delete",
"deployment",
"efs-csi-controller",
"-n",
"kube-system"
])
parser = argparse.ArgumentParser()
parser.add_argument(
'--region',
type=str,
metavar="CLUSTER_REGION",
help='Your cluster region code (eg: us-east-2)',
required=True
)
parser.add_argument(
'--cluster',
type=str,
metavar="CLUSTER_NAME",
help='Your cluster name (eg: mycluster-1)',
required=True
)
if __name__ == "__main__":
args, _ = parser.parse_known_args()
CLUSTER_REGION = args.region
CLUSTER_NAME = args.cluster
delete_efs_resources()
utils.py
import boto3
from time import sleep
def get_ec2_client(cluster_region):
return boto3.client(
"ec2",
region_name=cluster_region
)
def get_load_balancer_client(cluster_region):
return boto3.client(
'elbv2',
region_name=cluster_region,
)
def wait_for_stacks_to_be_gone(stacks, cluster_region):
all_stacks = get_cloud_formation_stacks(cluster_region)
while stacks in all_stacks:
print("Waiting for stacks to be deleted...")
sleep(1)
print("Stacks deleted!")
def get_cloud_formation_stacks(cluster_region):
client = get_cloud_formation_client(cluster_region)
stacks = client.list_stacks()["StackSummaries"]
def get_name(stack):
return stack["StackName"]
return list(map(get_name, stacks))
def delete_stacks(stacks, cluster_region):
client = get_cloud_formation_client(cluster_region)
for stack in stacks:
print(f"Deleting {stack}...")
client.delete_stack(
StackName=stack
)
def get_cloud_formation_client(cluster_region):
return boto3.client(
"cloudformation",
region_name=cluster_region
)
Is your feature request related to a problem? Please describe. We need a way to delete all resources created by the automated or manual EFS/FSx tests or user runs. This would also help with providing easy commands in the docs.
Describe the solution you'd like Create a script for Filesystem cleanup (EFS and FSx) that can be used directly by the user
Describe alternatives you've considered A clear and concise description of any alternative solutions or features you've considered.
Additional context Add any other context or screenshots about the feature request here.