Closed kosta709 closed 4 years ago
@kosta709 can you please share your full apimodel (secrets redacted)? Are you using the private cluster feature?
No, it is not a private cluster. It happens on any simple model with "masterProfile": {"count": 3, ...}; see my comments above about the internal LB and the iptables redirect from 4443 to 443.
full api model:
{
"apiVersion": "vlabs",
"properties": {
"orchestratorProfile": {
"orchestratorType": "Kubernetes",
"orchestratorRelease": "1.10",
"orchestratorVersion": "1.10.8",
"kubernetesConfig": {
"kubernetesImageBase": "k8s.gcr.io/",
"clusterSubnet": "10.244.0.0/16",
"dnsServiceIP": "10.0.0.10",
"serviceCidr": "10.0.0.0/16",
"networkPolicy": "calico",
"networkPlugin": "kubenet",
"dockerBridgeSubnet": "172.17.0.1/16",
"useManagedIdentity": true,
"useInstanceMetadata": true,
"enableRbac": true,
"enableSecureKubelet": true,
"enableAggregatedAPIs": true,
"privateCluster": {
"enabled": false
},
"gchighthreshold": 85,
"gclowthreshold": 80,
"etcdVersion": "3.2.23",
"etcdDiskSizeGB": "64",
"addons": [
{
"name": "tiller",
"enabled": true,
"containers": [
{
"name": "tiller",
"cpuRequests": "50m",
"memoryRequests": "150Mi",
"cpuLimits": "50m",
"memoryLimits": "150Mi"
}
],
"config": {
"max-history": "0"
}
},
{
"name": "aci-connector",
"enabled": false,
"containers": [
{
"name": "aci-connector",
"cpuRequests": "50m",
"memoryRequests": "150Mi",
"cpuLimits": "50m",
"memoryLimits": "150Mi"
}
],
"config": {
"nodeName": "aci-connector",
"os": "Linux",
"region": "westus",
"taint": "azure.com/aci"
}
},
{
"name": "cluster-autoscaler",
"enabled": false,
"containers": [
{
"name": "cluster-autoscaler",
"cpuRequests": "100m",
"memoryRequests": "300Mi",
"cpuLimits": "100m",
"memoryLimits": "300Mi"
}
],
"config": {
"maxNodes": "5",
"minNodes": "1"
}
},
{
"name": "blobfuse-flexvolume",
"enabled": true,
"containers": [
{
"name": "blobfuse-flexvolume",
"cpuRequests": "50m",
"memoryRequests": "10Mi",
"cpuLimits": "50m",
"memoryLimits": "10Mi"
}
]
},
{
"name": "smb-flexvolume",
"enabled": true,
"containers": [
{
"name": "smb-flexvolume",
"cpuRequests": "50m",
"memoryRequests": "10Mi",
"cpuLimits": "50m",
"memoryLimits": "10Mi"
}
]
},
{
"name": "keyvault-flexvolume",
"enabled": true,
"containers": [
{
"name": "keyvault-flexvolume",
"cpuRequests": "50m",
"memoryRequests": "10Mi",
"cpuLimits": "50m",
"memoryLimits": "10Mi"
}
]
},
{
"name": "kubernetes-dashboard",
"enabled": true,
"containers": [
{
"name": "kubernetes-dashboard",
"cpuRequests": "300m",
"memoryRequests": "150Mi",
"cpuLimits": "300m",
"memoryLimits": "150Mi"
}
]
},
{
"name": "rescheduler",
"enabled": false,
"containers": [
{
"name": "rescheduler",
"cpuRequests": "10m",
"memoryRequests": "100Mi",
"cpuLimits": "10m",
"memoryLimits": "100Mi"
}
]
},
{
"name": "metrics-server",
"enabled": true,
"containers": [
{
"name": "metrics-server"
}
]
},
{
"name": "nvidia-device-plugin",
"enabled": false,
"containers": [
{
"name": "nvidia-device-plugin",
"cpuRequests": "50m",
"memoryRequests": "10Mi",
"cpuLimits": "50m",
"memoryLimits": "10Mi"
}
]
},
{
"name": "container-monitoring",
"enabled": false,
"containers": [
{
"name": "omsagent",
"image": "microsoft/oms:acsenginelogfixnew",
"cpuRequests": "50m",
"memoryRequests": "200Mi",
"cpuLimits": "150m",
"memoryLimits": "750Mi"
}
],
"config": {
"dockerProviderVersion": "2.0.0-3",
"omsAgentVersion": "1.6.0-42"
}
},
{
"name": "azure-cni-networkmonitor",
"enabled": false,
"containers": [
{
"name": "azure-cni-networkmonitor"
}
]
},
{
"name": "azure-npm-daemonset",
"enabled": false,
"containers": [
{
"name": "azure-npm-daemonset"
}
]
}
],
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "false",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
},
"controllerManagerConfig": {
"--allocate-node-cidrs": "true",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-cidr": "10.244.0.0/16",
"--cluster-name": "cluster-shared-2-dind-k8s",
"--cluster-signing-cert-file": "/etc/kubernetes/certs/ca.crt",
"--cluster-signing-key-file": "/etc/kubernetes/certs/ca.key",
"--configure-cloud-routes": "true",
"--feature-gates": "ServiceNodeExclusion=true",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--leader-elect": "true",
"--node-monitor-grace-period": "40s",
"--pod-eviction-timeout": "5m0s",
"--profiling": "false",
"--root-ca-file": "/etc/kubernetes/certs/ca.crt",
"--route-reconciliation-period": "10s",
"--service-account-private-key-file": "/etc/kubernetes/certs/apiserver.key",
"--terminated-pod-gc-threshold": "5000",
"--use-service-account-credentials": "true",
"--v": "2"
},
"cloudControllerManagerConfig": {
"--allocate-node-cidrs": "true",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-cidr": "10.244.0.0/16",
"--cluster-name": "cluster-shared-2-dind-k8s",
"--configure-cloud-routes": "true",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--leader-elect": "true",
"--route-reconciliation-period": "10s",
"--v": "2"
},
"apiServerConfig": {
"--advertise-address": "<kubernetesAPIServerIP>",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--audit-log-maxage": "30",
"--audit-log-maxbackup": "10",
"--audit-log-maxsize": "100",
"--audit-log-path": "/var/log/kubeaudit/audit.log",
"--audit-policy-file": "/etc/kubernetes/addons/audit-policy.yaml",
"--authorization-mode": "Node,RBAC",
"--bind-address": "0.0.0.0",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--enable-admission-plugins": "NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,DefaultTolerationSeconds,MutatingAdmissionWebhook,ValidatingAdmissionWebhook,ResourceQuota,ExtendedResourceToleration",
"--etcd-cafile": "/etc/kubernetes/certs/ca.crt",
"--etcd-certfile": "/etc/kubernetes/certs/etcdclient.crt",
"--etcd-keyfile": "/etc/kubernetes/certs/etcdclient.key",
"--etcd-servers": "https://127.0.0.1:2379",
"--insecure-port": "8080",
"--kubelet-client-certificate": "/etc/kubernetes/certs/client.crt",
"--kubelet-client-key": "/etc/kubernetes/certs/client.key",
"--profiling": "false",
"--proxy-client-cert-file": "/etc/kubernetes/certs/proxy.crt",
"--proxy-client-key-file": "/etc/kubernetes/certs/proxy.key",
"--repair-malformed-updates": "false",
"--requestheader-allowed-names": "",
"--requestheader-client-ca-file": "/etc/kubernetes/certs/proxy-ca.crt",
"--requestheader-extra-headers-prefix": "X-Remote-Extra-",
"--requestheader-group-headers": "X-Remote-Group",
"--requestheader-username-headers": "X-Remote-User",
"--secure-port": "443",
"--service-account-key-file": "/etc/kubernetes/certs/apiserver.key",
"--service-account-lookup": "true",
"--service-cluster-ip-range": "10.0.0.0/16",
"--storage-backend": "etcd3",
"--tls-cert-file": "/etc/kubernetes/certs/apiserver.crt",
"--tls-private-key-file": "/etc/kubernetes/certs/apiserver.key",
"--v": "4"
},
"schedulerConfig": {
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--leader-elect": "true",
"--profiling": "false",
"--v": "2"
},
"cloudProviderBackoff": true,
"cloudProviderBackoffRetries": 6,
"cloudProviderBackoffJitter": 1,
"cloudProviderBackoffDuration": 5,
"cloudProviderBackoffExponent": 1.5,
"cloudProviderRateLimit": true,
"cloudProviderRateLimitQPS": 3,
"cloudProviderRateLimitBucket": 10,
"loadBalancerSku": "Basic"
}
},
"masterProfile": {
"count": 3,
"dnsPrefix": "cluster-shared-2-dind-k8s",
"subjectAltNames": null,
"vmSize": "Standard_DS3_v2",
"vnetSubnetID": "/subscriptions/*****/resourceGroups/cluster-shared-2-vnet/providers/Microsoft.Network/virtualNetworks/vnet1/subnets/subnet1",
"vnetCidr": "10.0.0.0/8",
"firstConsecutiveStaticIP": "10.240.254.5",
"storageProfile": "ManagedDisks",
"oauthEnabled": false,
"preProvisionExtension": null,
"extensions": [],
"distro": "aks",
"kubernetesConfig": {
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "false",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
}
}
},
"agentPoolProfiles": [
{
"name": "internal",
"count": 1,
"vmSize": "Standard_DS3_v2",
"osType": "Linux",
"availabilityProfile": "VirtualMachineScaleSets",
"storageProfile": "ManagedDisks",
"vnetSubnetID": "/subscriptions/*****/resourceGroups/cluster-shared-2-vnet/providers/Microsoft.Network/virtualNetworks/vnet1/subnets/subnet1",
"distro": "aks",
"kubernetesConfig": {
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "true",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
}
},
"acceleratedNetworkingEnabled": false,
"fqdn": "",
"customNodeLabels": {
"node-type": "internal"
},
"preProvisionExtension": null,
"extensions": [],
"singlePlacementGroup": true
},
{
"name": "app",
"count": 1,
"vmSize": "Standard_DS2_v2",
"osType": "Linux",
"availabilityProfile": "VirtualMachineScaleSets",
"storageProfile": "ManagedDisks",
"vnetSubnetID": "/subscriptions/*****/resourceGroups/cluster-shared-2-vnet/providers/Microsoft.Network/virtualNetworks/vnet1/subnets/subnet1",
"distro": "aks",
"kubernetesConfig": {
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "false",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
}
},
"acceleratedNetworkingEnabled": false,
"fqdn": "",
"customNodeLabels": {
"node-type": "app"
},
"preProvisionExtension": null,
"extensions": [],
"singlePlacementGroup": true
},
{
"name": "dind1",
"count": 1,
"vmSize": "Standard_DS4_v2",
"osType": "Linux",
"availabilityProfile": "VirtualMachineScaleSets",
"storageProfile": "ManagedDisks",
"vnetSubnetID": "/subscriptions/*****/resourceGroups/cluster-shared-2-vnet/providers/Microsoft.Network/virtualNetworks/vnet1/subnets/subnet1",
"distro": "aks",
"kubernetesConfig": {
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "false",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
}
},
"acceleratedNetworkingEnabled": false,
"fqdn": "",
"customNodeLabels": {
"runtime-environment": "public",
"node-type": "dind"
},
"preProvisionExtension": null,
"extensions": [],
"singlePlacementGroup": true
},
{
"name": "dind2",
"count": 1,
"vmSize": "Standard_DS3_v2",
"osType": "Linux",
"availabilityProfile": "VirtualMachineScaleSets",
"storageProfile": "ManagedDisks",
"vnetSubnetID": "/subscriptions/*****/resourceGroups/cluster-shared-2-vnet/providers/Microsoft.Network/virtualNetworks/vnet1/subnets/subnet1",
"distro": "aks",
"kubernetesConfig": {
"kubeletConfig": {
"--address": "0.0.0.0",
"--allow-privileged": "true",
"--anonymous-auth": "false",
"--authorization-mode": "Webhook",
"--azure-container-registry-config": "/etc/kubernetes/azure.json",
"--cadvisor-port": "0",
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-config": "/etc/kubernetes/azure.json",
"--cloud-provider": "azure",
"--cluster-dns": "10.0.0.10",
"--cluster-domain": "cluster.local",
"--enable-controller-attach-detach": "false",
"--enforce-node-allocatable": "pods",
"--event-qps": "0",
"--eviction-hard": "memory.available<100Mi,nodefs.available<10%,nodefs.inodesFree<5%",
"--feature-gates": "PodPriority=true",
"--image-gc-high-threshold": "85",
"--image-gc-low-threshold": "80",
"--image-pull-progress-deadline": "30m",
"--keep-terminated-pod-volumes": "false",
"--kubeconfig": "/var/lib/kubelet/kubeconfig",
"--max-pods": "110",
"--network-plugin": "cni",
"--node-status-update-frequency": "10s",
"--non-masquerade-cidr": "10.244.0.0/16",
"--pod-infra-container-image": "k8s.gcr.io/pause-amd64:3.1",
"--pod-manifest-path": "/etc/kubernetes/manifests",
"--pod-max-pids": "100"
}
},
"acceleratedNetworkingEnabled": false,
"fqdn": "",
"customNodeLabels": {
"runtime-environment": "paying",
"node-type": "dind"
},
"preProvisionExtension": null,
"extensions": [],
"singlePlacementGroup": true
}
],
"linuxProfile": {
"adminUsername": "ubuntu",
"ssh": {
"publicKeys": [
{
"keyData": "*****"
}
]
}
},
"certificateProfile": {
}
}
}
Hi @kosta709, I've added your suggested validation to our E2E fleet here:
https://github.com/Azure/acs-engine/pull/3996
I'll run it against a calico cluster.
this behavior confirmed on multi-master clusters
Hi @jackfrancis, thanks. The liveness probe test you are adding will fail in ~1/3 of calls (in the case of 3 masters), because the request goes through the internal LB and fails only when the selected backend is the same node where kubectl is running.
Can you please provide a manual workaround (iptables commands for the 4443 to 443 redirection) before publishing a release?
Hi @kosta709, thanks for your patience. Sorry for the initial misdirection, but in fact this is a known limitation of the current Kubernetes + Azure implementation.
Really, the only way to unblock yourself here is to ensure that pods needing to contact kubernetes.default.svc
are scheduled on nodes (and not masters). Is that not feasible in your scenario?
@lachie83 @khenidak @feiskyer @andyzhangx, are you able to add any more workaround suggestions?
because it goes through the internal LB and will fail only if the selected backend is the same node where kubectl is running. Can you please provide a manual workaround (iptables commands for the 4443 to 443 redirection) before publishing a release?
Just noticed that kubernetes endpoints are set to ILB's IP address. Are there any reasons to do so?
So another workaround is replacing the endpoints with three master's IP, then they will be load balanced via kube-proxy.
@jackfrancis, we need kube API calls from the master because our app needs to call the Azure API and only the master has enough permissions through MSI. I don't want to pass additional Azure secrets or roles to the nodes.
@feiskyer, great idea, thanks, will try. Currently I just set retries.
@kosta709 thank you for enduring, please share your implementation of @feiskyer's suggestion if you find it to be a better solution
@jackfrancis do we correctly set the advertise address (`--advertise-address`) in the API server arguments? It should point to the internal LB IP.
@khenidak yes
👋 Thanks for opening your first issue here! If you're reporting a 🐞 bug, please make sure you include steps to reproduce it.
We may have a fix for this:
Create a Service with externalIPs that point to the internal IP addresses of the masters. Any workloads needing to talk to the masters through one endpoint would call the Service hostname, e.g.:
$ cat master-svc.yaml
kind: Service
apiVersion: v1
metadata:
  name: master-service
spec:
  selector:
    app: kube-apiserver-masters
  ports:
    - name: http
      protocol: TCP
      port: 443
      targetPort: 443
  externalIPs:
    - 10.0.0.10
    - 10.0.0.12
    - 10.0.0.13
@sylr I heard through the grapevine that you tried this approach and it worked!
@jackfrancis , I'm the one who tested :) I haven't tested from the master node within a pod. Let me try looping some curl using this new service.
Looking at @feiskyer solution which is also very similar to the one where we create a new apiserver service.
So another workaround is replacing the endpoints with three master's IP, then they will be load balanced via kube-proxy.
Are we open to a missing health probe? If one master goes down, will the service automatically remove the faulty endpoints?
Wrong Sylvain :|
From within a pod running on the master, `curl https://kubernetes.default -k` will exhibit timeouts 33% of the time, while `curl https://master-service -k` will always pass.
I also tested when one apiserver is down: the Kubernetes service `master-service` will update its endpoints as long as the mirror pod associated with the static apiserver pod is removed from etcd.
See the comment of https://github.com/Azure/aks-engine/pull/243
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.
Please re-open this, this issue still occurs to some users.
Is this a request for help?: yes
Is this an ISSUE or FEATURE REQUEST? (choose one): ISSUE
What version of acs-engine?: v0.22.2
Orchestrator and version (e.g. Kubernetes, DC/OS, Swarm) Kubernetes
What happened: on a k8s multi-master installation ("masterProfile": {"count": 3, ...}), a pod scheduled to a master node gets sporadic errors when accessing kubernetes.default.svc (the internal endpoint for the k8s API) with kubectl or curl:
Unable to connect to the server: dial tcp 10.0.0.1:443: i/o timeout
What you expected to happen: access to kubernetes.default.svc from master nodes to be stable
How to reproduce it (as minimally and precisely as possible): Deploy a 3-master k8s cluster ("masterProfile": {"count": 3, ...} in the JSON). Ensure that all 3 masters are up:
Submit a pod like the one below:
Enter the pod's shell and try kubectl or curl against the master:
Anything else we need to know: in an acs-engine installation such requests go through the internal load balancer, with an endpoint to each master on port 4443. There is also an iptables nat PREROUTING rule to redirect 4443 to 443; see https://github.com/Azure/acs-engine/blob/master/parts/k8s/kubernetesmastercustomdata.yml
It looks like the PREROUTING chain is not applied when the request goes to the same host where the pod is running. In my case the pod was scheduled to k8s-master-17552040-1, so this node cannot be accessed, but the 2 others are OK, which is why we get the error in ~1/3 of occurrences. Accessing the local node:
accessing other master nodes:
I tried to change PREROUTING to OUTPUT on all masters:
iptables -t nat -A OUTPUT -p tcp --dport 4443 -j REDIRECT --to-port 443
It fixes curl, but does not fix kubectl.