Closed: sy-bee closed this issue 5 months ago
Hi @sy-bee - thank you for opening this issue. I agree that this behavior seems very concerning. Would you be able to provide an example program which replicates this issue? It would be helpful to know what operations are being run to cause this issue. Thank you!
Thanks @kpitzen for taking a look at this. Below is a stripped-down version of the pulumi_program we are using. Please ignore the missing variables and functions, as I don't want them to be public, but the rest of the code should give a good idea of the resources we are creating.
def pulumi_program(self, input) -> None:
    vpc = aws_classic.ec2.Vpc(
        stack_name,
        tags=self.get_resource_tags(),
        cidr_block=str(input.network_ip_range_cidr_block),
        enable_dns_hostnames=True,
    )
    vpc_subnets = self.calculate_subnets()
    static_ip = aws_classic.ec2.Eip(
        stack_name,
        tags=self.get_resource_tags(),
        vpc=True,
    )
    region = get_region_by_name()
    ingress_cidr_blocks = get_ingress_cidr_blocks()
    security_group = aws_classic.ec2.SecurityGroup(
        stack_name,
        tags=self.get_resource_tags(),
        description="Security group",
        vpc_id=vpc.id,
        egress=[
            aws_classic.ec2.SecurityGroupEgressArgs(
                from_port=0, to_port=0, protocol="-1", cidr_blocks=["0.0.0.0/0"]
            )
        ],
        ingress=[
            aws_classic.ec2.SecurityGroupIngressArgs(
                from_port=input.port,
                to_port=input.port,
                protocol="tcp",
                cidr_blocks=ingress_cidr_blocks,
            )
        ],
    )
    debian_amd64 = aws_classic.ec2.get_ami(
        most_recent=True,
        filters=[
            aws_classic.ec2.GetAmiFilterArgs(
                name="name",
                values=["debian-amd64-*"],
            ),
            aws_classic.ec2.GetAmiFilterArgs(
                name="virtualization-type",
                values=["hvm"],
            ),
        ],
        owners=[
            "136693071363",  # Debian
            "self",
        ],
    )
    instance_assume_role_policy = aws_classic.iam.get_policy_document(
        statements=[
            aws_classic.iam.GetPolicyDocumentStatementArgs(
                actions=["sts:AssumeRole"],
                effect="Allow",
                principals=[
                    aws_classic.iam.GetPolicyDocumentStatementPrincipalArgs(
                        type="Service",
                        identifiers=["ec2.amazonaws.com"],
                    )
                ],
            )
        ]
    )
    iam_role = aws_classic.iam.Role(
        stack_name,
        name=self.rfc1035_resource_name,
        tags=self.get_resource_tags(),
        assume_role_policy=instance_assume_role_policy.json,
        description="SSM access",
    )
    policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    aws_classic.iam.RolePolicyAttachment(
        stack_name,
        role=iam_role.name,
        policy_arn=policy_arn,
    )
    vpc_internet_gateway = aws_classic.ec2.InternetGateway(
        stack_name,
        tags=self.get_resource_tags(),
        vpc_id=vpc.id,
    )
    instance_profile = aws_classic.iam.InstanceProfile(
        stack_name,
        tags=self.get_resource_tags(),
        role=iam_role.name,
    )
    vm = aws_classic.ec2.Instance(
        stack_name,
        tags=self.get_resource_tags(),
        ami=debian_amd64.id,
        availability_zone=az_names[0],
        iam_instance_profile=instance_profile.name,
        instance_type="c5n.large",
        source_dest_check=True,
        subnet_id=vm_subnet_id,
        user_data=config.rendered,
        user_data_replace_on_change=True,
        vpc_security_group_ids=[security_group.id],
    )
    route_destination_cidr_block = "0.0.0.0/0"
    aws_classic.ec2.Route(
        stack_name,
        route_table_id=vpc.main_route_table_id,
        destination_cidr_block=route_destination_cidr_block,
        gateway_id=vpc_internet_gateway.id,
    )
    aws_classic.ec2.EipAssociation(
        stack_name,
        instance_id=vm.id,
        allocation_id=static_ip.id,
    )
    aws_classic.cloudwatch.MetricAlarm(
        f"{stack_name}_autorecovery",
        tags=self.get_resource_tags(),
        alarm_actions=[
            f"arn:aws:automate:{region.cloud_region_name}:ec2:recover"
        ],
        alarm_description=f"Recover VPN tunnel instance {stack_name}",
        comparison_operator="GreaterThanThreshold",
        dimensions={
            "InstanceId": vm.id,
        },
        datapoints_to_alarm=2,
        evaluation_periods=2,
        metric_name="StatusCheckFailed_System",
        namespace="AWS/EC2",
        period=input.cloudwatch_evaluation_period_seconds,
        statistic="Minimum",
        threshold=0,
    )
    vpn_gateway = aws_classic.ec2.VpnGateway(
        stack_name,
        tags=self.get_resource_tags(),
        vpc_id=vpc.id,
        amazon_side_asn=amazon_side_asn,
    )
    vpn_customer_gateway = aws_classic.ec2.CustomerGateway(
        stack_name,
        tags=self.get_resource_tags(),
        bgp_asn=bgp_asn,
        ip_address=str(input.customer_network_public_address),
        type="ipsec.1",
    )
    tunnel1_args = (
        {
            "tunnel1_dpd_timeout_action": input.vpn_tunnel1.dpd_timeout_action,
            "tunnel1_dpd_timeout_seconds": input.vpn_tunnel1.dpd_timeout_seconds,
            "tunnel1_ike_versions": input.vpn_tunnel1.ike_versions,
            "tunnel1_inside_cidr": str(input.vpn_tunnel1.inside_cidr)
            if input.vpn_tunnel1.inside_cidr
            else None,
            "tunnel1_phase1_dh_group_numbers": input.vpn_tunnel1.phase1_dh_group_numbers,
            "tunnel1_phase1_encryption_algorithms": input.vpn_tunnel1.phase1_encryption_algorithms,
            "tunnel1_phase1_integrity_algorithms": input.vpn_tunnel1.phase1_integrity_algorithms,
            "tunnel1_phase1_lifetime_seconds": input.vpn_tunnel1.phase1_lifetime_seconds,
            "tunnel1_phase2_dh_group_numbers": input.vpn_tunnel1.phase2_dh_group_numbers,
            "tunnel1_phase2_encryption_algorithms": input.vpn_tunnel1.phase2_encryption_algorithms,
            "tunnel1_phase2_integrity_algorithms": input.vpn_tunnel1.phase2_integrity_algorithms,
            "tunnel1_phase2_lifetime_seconds": input.vpn_tunnel1.phase2_lifetime_seconds,
            "tunnel1_rekey_fuzz_percentage": input.vpn_tunnel1.rekey_fuzz_percentage,
            "tunnel1_rekey_margin_time_seconds": input.vpn_tunnel1.rekey_margin_time_seconds,
            "tunnel1_replay_window_size": input.vpn_tunnel1.replay_window_size,
            "tunnel1_startup_action": input.vpn_tunnel1.startup_action,
        }
        if input.vpn_tunnel1
        else {}
    )
    tunnel1_args = {k: v for k, v in tunnel1_args.items() if v}
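    # Note: `if v` drops every falsy value (None, "", 0, False), so a
    # legitimate 0 or False tunnel setting would also be removed here;
    # `if v is not None` would preserve those. Same for tunnel2_args below.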
    tunnel2_args = (
        {
            "tunnel2_dpd_timeout_action": input.vpn_tunnel2.dpd_timeout_action,
            "tunnel2_dpd_timeout_seconds": input.vpn_tunnel2.dpd_timeout_seconds,
            "tunnel2_ike_versions": input.vpn_tunnel2.ike_versions,
            "tunnel2_inside_cidr": str(input.vpn_tunnel2.inside_cidr)
            if input.vpn_tunnel2.inside_cidr
            else None,
            "tunnel2_phase1_dh_group_numbers": input.vpn_tunnel2.phase1_dh_group_numbers,
            "tunnel2_phase1_encryption_algorithms": input.vpn_tunnel2.phase1_encryption_algorithms,
            "tunnel2_phase1_integrity_algorithms": input.vpn_tunnel2.phase1_integrity_algorithms,
            "tunnel2_phase1_lifetime_seconds": input.vpn_tunnel2.phase1_lifetime_seconds,
            "tunnel2_phase2_dh_group_numbers": input.vpn_tunnel2.phase2_dh_group_numbers,
            "tunnel2_phase2_encryption_algorithms": input.vpn_tunnel2.phase2_encryption_algorithms,
            "tunnel2_phase2_integrity_algorithms": input.vpn_tunnel2.phase2_integrity_algorithms,
            "tunnel2_phase2_lifetime_seconds": input.vpn_tunnel2.phase2_lifetime_seconds,
            "tunnel2_rekey_fuzz_percentage": input.vpn_tunnel2.rekey_fuzz_percentage,
            "tunnel2_rekey_margin_time_seconds": input.vpn_tunnel2.rekey_margin_time_seconds,
            "tunnel2_replay_window_size": input.vpn_tunnel2.replay_window_size,
            "tunnel2_startup_action": input.vpn_tunnel2.startup_action,
        }
        if input.vpn_tunnel2
        else {}
    )
    tunnel2_args = {k: v for k, v in tunnel2_args.items() if v}
    # vpn_tunnel_logging_enabled is shared among tunnel1 and tunnel2
    vpn_connection = aws_classic.ec2.VpnConnection(
        stack_name,
        tags=self.get_resource_tags(),
        customer_gateway_id=vpn_customer_gateway.id,
        local_ipv4_network_cidr=str(
            input.vpn_tunnel_custom_cidr_ranges.local_ipv4_network_cidr
        )
        if input.vpn_tunnel_custom_cidr_ranges
        and input.vpn_tunnel_custom_cidr_ranges.local_ipv4_network_cidr
        else None,
        remote_ipv4_network_cidr=str(
            input.vpn_tunnel_custom_cidr_ranges.remote_ipv4_network_cidr
        )
        if input.vpn_tunnel_custom_cidr_ranges
        and input.vpn_tunnel_custom_cidr_ranges.remote_ipv4_network_cidr
        else None,
        type="ipsec.1",
        static_routes_only=not input.vpn_bgp_enabled,
        vpn_gateway_id=vpn_gateway.id,
        # Tunnel 1
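        # `or` binds tighter than the conditional below, so this parses as
        # (vpn_tunnel1.shared_key or vpn_tunnel_shared_key) if vpn_tunnel1
        # else vpn_tunnel_shared_key, i.e. the shared fallback key is used
        # whenever a per-tunnel key is absent.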
        tunnel1_preshared_key=input.vpn_tunnel1.shared_key or input.vpn_tunnel_shared_key
        if input.vpn_tunnel1
        else input.vpn_tunnel_shared_key,
        **tunnel1_args,
        # Tunnel 2
        tunnel2_preshared_key=input.vpn_tunnel2.shared_key or input.vpn_tunnel_shared_key
        if input.vpn_tunnel2
        else input.vpn_tunnel_shared_key,
        **tunnel2_args,
    )
    for index, subnet in enumerate(input.customer_network_internal_subnets):
        aws_classic.ec2.VpnConnectionRoute(
            f"{stack_name}-{index}",
            destination_cidr_block=str(subnet),
            vpn_connection_id=vpn_connection.id,
        )
    aws_classic.ec2.VpnGatewayRoutePropagation(
        stack_name,
        route_table_id=vpc.main_route_table_id,
        vpn_gateway_id=vpn_gateway.id,
    )
Hi @kpitzen, let me know if the above information is helpful. Thanks!
@kpitzen just FYI, having bugs like this in the repo with no comments since July is enough to put me off considering Pulumi.
Hi @mjaggard - understood. I am no longer part of the team responsible for this repository. @mikhailshilkov should be able to assist you further. Apologies for the inconvenience.
Our production stack was removed by Pulumi today with exactly the same log dump as provided by the original author, under the same conditions. We are unable to understand why. Here's our log dump:
2024-04-12T11:36:32.0658905Z [LOG] Deploying pulumi stack codedamn-cloudfront-urls-production
2024-04-12T11:36:34.2313758Z [LOG] Refreshing (production):
2024-04-12T11:36:34.2314600Z
2024-04-12T11:36:34.6675357Z [LOG]
2024-04-12T11:36:34.6676005Z
2024-04-12T11:36:35.6672905Z [LOG] @ refreshing....
2024-04-12T11:36:36.0176092Z [LOG]
2024-04-12T11:36:36.0178437Z ~ pulumi:pulumi:Stack codedamn-cloudfront-urls-production-production refreshing (0s)
2024-04-12T11:36:36.0180906Z pulumi:pulumi:Stack codedamn-cloudfront-urls-production-production running
2024-04-12T11:36:36.0183237Z ~ aws:s3:BucketV2 codedamn-cloudfront-urls-production-robots-txt refreshing (0s)
2024-04-12T11:36:36.0186119Z ~ aws:s3:BucketOwnershipControls codedamn-cloudfront-urls-production-robots-txt refreshing (0s)
2024-04-12T11:36:36.0187603Z ~ aws:s3:BucketObjectv2 robots.txt refreshing (0s)
2024-04-12T11:36:36.0189185Z ~ aws:s3:BucketPublicAccessBlock codedamn-cloudfront-urls-production-robots-txt refreshing (0s)
2024-04-12T11:36:36.0190101Z
2024-04-12T11:36:36.0191064Z [LOG] ~ aws:cloudfront:Distribution codedamn-cloudfront-urls-production refreshing (0s)
2024-04-12T11:36:36.0191820Z
2024-04-12T11:36:36.5608267Z [LOG] aws:s3:BucketPublicAccessBlock codedamn-cloudfront-urls-production-robots-txt
2024-04-12T11:36:36.5609409Z
2024-04-12T11:36:36.6329120Z [LOG] aws:s3:BucketOwnershipControls codedamn-cloudfront-urls-production-robots-txt
2024-04-12T11:36:36.6330124Z
2024-04-12T11:36:36.6676879Z [LOG] @ refreshing....
2024-04-12T11:36:36.7114612Z [LOG]
2024-04-12T11:36:36.7115744Z aws:s3:BucketObjectv2 robots.txt
2024-04-12T11:36:36.7116571Z
2024-04-12T11:36:36.9453470Z [LOG] aws:cloudfront:Distribution codedamn-cloudfront-urls-production
2024-04-12T11:36:36.9454557Z
2024-04-12T11:36:37.4700213Z [LOG] aws:s3:BucketV2 codedamn-cloudfront-urls-production-robots-txt
2024-04-12T11:36:37.4701423Z
2024-04-12T11:36:37.4938701Z [LOG] pulumi:pulumi:Stack codedamn-cloudfront-urls-production-production
2024-04-12T11:36:37.4940675Z Resources:
2024-04-12T11:36:37.4941787Z 6 unchanged
2024-04-12T11:36:37.4942470Z
2024-04-12T11:36:37.4943129Z Duration: 3s
2024-04-12T11:36:37.4943667Z
2024-04-12T11:36:37.4943787Z
2024-04-12T11:36:40.9313777Z [LOG] Updating (production):
2024-04-12T11:36:40.9314607Z
2024-04-12T11:36:41.3475864Z [LOG]
2024-04-12T11:36:41.3476872Z
2024-04-12T11:36:41.3622197Z [LOG] pulumi:pulumi:Stack codedamn-cloudfront-urls-production-production running
2024-04-12T11:36:41.3623464Z
2024-04-12T11:36:42.3454641Z [LOG] @ updating....
2024-04-12T11:36:42.6991472Z [LOG]
2024-04-12T11:36:42.6992356Z
2024-04-12T11:36:42.6999538Z [LOG] - aws:s3:BucketObjectv2 robots.txt deleting (0s)
2024-04-12T11:36:42.7000672Z
2024-04-12T11:36:42.7008643Z [LOG] - aws:s3:BucketPublicAccessBlock codedamn-cloudfront-urls-production-robots-txt deleting (0s)
2024-04-12T11:36:42.7010259Z
2024-04-12T11:36:42.7030735Z [LOG] - aws:s3:BucketOwnershipControls codedamn-cloudfront-urls-production-robots-txt deleting (0s)
2024-04-12T11:36:42.7032315Z
2024-04-12T11:36:42.7038687Z [LOG] - aws:cloudfront:Distribution codedamn-cloudfront-urls-production deleting (0s)
2024-04-12T11:36:42.7040064Z
2024-04-12T11:36:43.3455468Z [LOG] @ updating....
2024-04-12T11:36:43.5607329Z [LOG]
2024-04-12T11:36:43.5608929Z - aws:s3:BucketObjectv2 robots.txt deleted (0.86s)
2024-04-12T11:36:43.5609806Z
2024-04-12T11:36:44.3452132Z [LOG] @ updating....
2024-04-12T11:36:44.5458553Z [LOG]
2024-04-12T11:36:44.5460682Z - aws:s3:BucketPublicAccessBlock codedamn-cloudfront-urls-production-robots-txt deleted (1s)
2024-04-12T11:36:44.5461918Z
2024-04-12T11:36:45.1540773Z [LOG] - aws:s3:BucketOwnershipControls codedamn-cloudfront-urls-production-robots-txt deleted (2s)
2024-04-12T11:36:45.1542126Z
2024-04-12T11:36:45.3453516Z [LOG] @ updating....
2024-04-12T11:36:46.3456251Z [LOG] .
[[removed extra wait logs]]
2024-04-12T11:40:06.8050117Z [LOG]
2024-04-12T11:40:06.8052285Z - aws:cloudfront:Distribution codedamn-cloudfront-urls-production deleted (204s)
2024-04-12T11:40:06.8053544Z
2024-04-12T11:40:07.3462100Z [LOG] @ updating....
2024-04-12T11:40:07.5055304Z [LOG]
2024-04-12T11:40:07.5057226Z - aws:s3:BucketV2 codedamn-cloudfront-urls-production-robots-txt deleting (0s)
2024-04-12T11:40:07.5058481Z
2024-04-12T11:40:08.3082619Z [LOG] - aws:s3:BucketV2 codedamn-cloudfront-urls-production-robots-txt deleted (0.80s)
2024-04-12T11:40:08.3083743Z
2024-04-12T11:40:08.3459766Z [LOG] @ updating....
2024-04-12T11:40:09.3457945Z [LOG] .
2024-04-12T11:40:09.7767850Z [LOG]
2024-04-12T11:40:09.7770127Z pulumi:pulumi:Stack codedamn-cloudfront-urls-production-production
2024-04-12T11:40:09.7772020Z Resources:
2024-04-12T11:40:09.7773365Z - 5 deleted
2024-04-12T11:40:09.7774550Z 1 unchanged
2024-04-12T11:40:09.7775220Z
2024-04-12T11:40:09.7775748Z Duration: 3m29s
2024-04-12T11:40:09.7776534Z
After logging "6 unchanged", Pulumi deleted our CloudFront distribution for no apparent reason.
@mehulmpt can you provide an example program that we can run to attempt to reproduce this? When refresh is run, it syncs what exists in AWS to the state file, and when up is run, it syncs what exists in your program to AWS and the state file.
So, for example, suppose you had this setup with two buckets being created:
from pulumi import automation as auto
from pulumi_aws import s3

def pulumi_program() -> None:
    s3.BucketV2("my-bucket1")
    s3.BucketV2("my-bucket2")

stack_name = "dev"
project_name = "pulumi-python-app"
stack = auto.create_or_select_stack(
    stack_name=stack_name,
    project_name=project_name,
    program=pulumi_program,
)
stack.workspace.install_plugin("aws", "v6.29.0")
stack.set_config("aws:region", auto.ConfigValue(value="us-east-2"))
stack.refresh(on_output=print)
stack.up(on_output=print)
And then if I remove one of the buckets, refresh will show that there are no changes (since the bucket exists in AWS and in the state file), but up will destroy the bucket because it no longer exists in the config.
If your program was exactly the same between the previous run and the current run which deleted the resource, then there is definitely something that we need to try to reproduce.
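For anyone who wants a belt-and-braces guard while this is being investigated, here is a minimal sketch (not from this thread; safe_up is a hypothetical helper) that previews first and refuses to run an update whose plan includes deletions, using the Automation API's change_summary operation counts:

from pulumi import automation as auto

def safe_up(stack: auto.Stack) -> None:
    # Preview the planned update; change_summary maps operation names
    # ("create", "same", "update", "delete", ...) to counts.
    preview = stack.preview()
    planned_deletes = (preview.change_summary or {}).get("delete", 0)
    if planned_deletes:
        # Refuse to proceed so an unexpectedly empty program cannot
        # silently delete the stack; inspect preview.stdout before retrying.
        raise RuntimeError(
            f"refusing to run up: preview plans {planned_deletes} deletion(s)"
        )
    stack.up(on_output=print)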
@corymhall Here's a trimmed-down version of the program @mehulmpt and I were using: https://github.com/rishabhrao/pulumi-issue I've also included all the relevant logs in the repo itself.
Additional details:
Notice how we have a console.log() statement inside the program that logs the deployments variable. It is logged in the good run but not in the next run. Could this indicate that our program function was not invoked at all and instead, Pulumi assumed we hadn't declared any resources (thereby deleting them on AWS)?
A few more things to add on here:
We are unable to successfully reproduce this issue anymore, but we feel it is a critical bug. All we can figure out so far is that there appears to be no bug in the AWS stack, our code, or the deployment JSON file used by Pulumi at the time of nuking the distribution.
@rishabhrao / @mehulmpt thanks for the detailed repro!
Notice how we have a console.log() statement inside the program that logs the deployments variable. It is logged in the good run but not in the next run. Could this indicate that our program function was not invoked at all and instead, pulumi assumed we hadn't declared any resources (thereby deleting them on aws)?
I think you may be on to something here. I'm not super familiar with the automation api side of things, but it would seem to indicate that somehow the program was empty on the run where the resources were deleted. I'm going to talk with the team to see if anyone has any more ideas.
@rishabhrao / @mehulmpt after talking with the team, it looks like this issue is related to https://github.com/pulumi/pulumi/issues/15390, which should be fixed in @pulumi/pulumi@v3.104.0. Can you try upgrading to at least that version, and definitely let us know if this happens again.
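If you drive deployments from Python, a fail-fast version check is one option; a minimal sketch, assuming the Python pulumi package picked up the same fix in its matching 3.104.0 release (the floor below mirrors the Node version named above):

from importlib.metadata import version

# Hypothetical guard: refuse to deploy on a pulumi SDK older than the
# release assumed to carry the fix (mirroring the Node v3.104.0 floor).
major, minor = (int(p) for p in version("pulumi").split(".")[:2])
if (major, minor) < (3, 104):
    raise RuntimeError(f"pulumi SDK {major}.{minor}.x is too old; upgrade to >= 3.104.0")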
Thanks! We're on a 7-month-old version right now, but the error looks exactly the same as what happened to us. Giving it another shot.
Assuming this is fixed on latest, please reopen if still having difficulty! Thank you.
What happened?
We use the Pulumi Automation API to manage our infrastructure resources; however, during one of the updates to a stack of this type, some of the resources were deleted for no apparent reason. A refresh indicated that almost all of the resources matched the provisioned resources in AWS, yet a subsequent update deleted almost half of the resources, including VPN tunnels and EC2 instances. Refresh:
And a subsequent update less than a minute afterwards:
As you can see, 8 resources were deleted. This reminds me of https://github.com/pulumi/pulumi-aws/issues/1615; however, that bug was apparently solved.
Expected Behavior
The update was expected to replace the EC2 instance, but it should not have changed anything VPN-related. I have run other similar refresh/update cycles with our stacks and none experienced this behaviour.
Steps to reproduce
N/A
Output of pulumi about
Additional info: the output above shows the pulumi-aws version being 5.14. During the event above, pulumi-aws was version 5.31.
Additional context
No response