Azure / batch-shipyard

Simplify HPC and Batch workloads on Azure
MIT License
277 stars 121 forks source link

Trying to create a pool in Azure Batch inside a vnet results in host not found error for - batch.core.windows.net #262

Closed tonynilan closed 5 years ago

tonynilan commented 5 years ago

Using Shipyard 3.4.0, running the pool add command with a virtual_network specified in the pool.json file fails with the error below. When creating the pool without specifying a vnet, it creates it just fine.

msrest.exceptions.ClientRequestError: Error occurred in request., ConnectionError: HTTPSConnectionPool(host='batch.core.windows.net', port=443): Max retries exceeded with url: /%20%20%20/subscriptions/{some subscription}/resourceGroups/{some resource group}/providers/Microsoft.Network/virtualNetworks/{some vnet}/subnets/defaultSubnet?api-version=2017-11-01 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object

alfpark commented 5 years ago

Please ensure that you have read and followed the Virtual Network Guide.

If you still have issues after following the guide, please post redacted credentials.yaml and pool.yaml configuration files.

Additionally, please post the full execution output (including the full error/stack trace) of the command executed.

tonynilan commented 5 years ago

I have followed the above guide. Below are the 3 files. Below that is the full exception.

config.json------------------------

{
  "batch_shipyard": {
    "storage_account_settings": "batchstorage"
  },
  "global_resources": {
    "docker_images": [
      "{REDACTED}"
    ]
  }
}

credentials.json------------------------

{
  "credentials": {
      "batch": {
        "account_service_url": "https://{REDACTED}.centralus.batch.azure.com",
          "aad": {
          "endpoint": "https://batch.core.windows.net/",
          "directory_id": "{REDACTED}",
          "application_id": "{REDACTED}",
          "auth_key": "{REDACTED}"
            },

  "resource_group": "{REDACTED}"
  },

  "management": {
            "subscription_id": "{REDACTED}",
            "aad": {
                "endpoint": "https://management.core.windows.net/",
                "directory_id": "{REDACTED}",
                "application_id": "{REDACTED}",
                "auth_key": "{REDACTED}"
    }
        },

    "docker_registry": {
    "{REDACTED}": {
    "username": "{REDACTED}",
    "password": "{REDACTED}"
            }
  },
  "storage": {
          "batchstorage": {
        "account": "{REDACTED}",
        "account_key": "{REDACTED}",
        "endpoint": "core.windows.net"
              }
    }   
  }   
}

pool.json------------------------

{
  "pool_specification": {
      "id": "{REDACTED}",
      "vm_configuration": {
      "platform_image": {
        "publisher": "Canonical",
        "offer": "UbuntuServer",
        "sku": "16.04-LTS"
      }
    },
    "vm_size": "STANDARD_F8",
    "vm_count": {
    "dedicated": 1,
    "low_priority": 0
    },

    "virtual_network": {
        "name": "{REDACTED}",
        "resource_group": "{REDACTED}",
        "address_space": "{REDACTED}",
        "subnet": {
        "name": "defaultSubnet",
        "address_prefix": "{REDACTED}"
           }
    },

    "max_tasks_per_node": 8,
    "ssh": {
      "username": "{REDACTED}",
       "expiry_days": 30
    },
    "autoscale": {
    "evaluation_interval": "00:05:00",
    "formula": "$TargetDedicatedNodes = min(max($PendingTasks.GetSample(1) / 8, 1), 30)"
    }
  }
}

Below is the exception:

Traceback (most recent call last):
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connection.py", line 144, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/util/connection.py", line 63, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/usr/local/lib/python3.5/socket.py", line 733, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -5] No address associated with hostname

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 346, in _make_request
    self._validate_conn(conn)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 850, in _validate_conn
    conn.connect()
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connection.py", line 287, in connect
    conn = self._new_conn()
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connection.py", line 153, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7fb41d9d1fd0>: Failed to establish a new connection: [Errno -5] No address associated with hostname

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/adapters.py", line 440, in send
    timeout=timeout
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 668, in urlopen
    **response_kw)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 668, in urlopen
    **response_kw)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 668, in urlopen
    **response_kw)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 668, in urlopen
    **response_kw)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/connectionpool.py", line 639, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/urllib3/util/retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='batch.core.windows.net', port=443): Max retries exceeded with url: /%20%20%20/subscriptions/{REDACTED}/resourceGroups/{REDACTED}/providers/Microsoft.Network/virtualNetworks/{REDACTED}/subnets/defaultSubnet?api-version=2017-11-01 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fb41d9d1fd0>: Failed to establish a new connection: [Errno -5] No address associated with hostname',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/msrest/service_client.py", line 205, in send
    **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests_oauthlib/oauth2_session.py", line 425, in request
    headers=headers, data=data, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/sessions.py", line 508, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/sessions.py", line 618, in send
    r = adapter.send(request, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/adapters.py", line 508, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='batch.core.windows.net', port=443): Max retries exceeded with url: /%20%20%20/subscriptions/{REDACTED}/resourceGroups/{REDACTED}/providers/Microsoft.Network/virtualNetworks/{REDACTED}/subnets/defaultSubnet?api-version=2017-11-01 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fb41d9d1fd0>: Failed to establish a new connection: [Errno -5] No address associated with hostname',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "../batch-shipyard/shipyard.py", line 2024, in <module>
    cli()
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 722, in __call__
    return self.main(*args, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 697, in main
    rv = self.invoke(ctx)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 1066, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 1066, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 895, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 535, in invoke
    return callback(*args, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/decorators.py", line 64, in new_func
    return ctx.invoke(f, obj, *args[1:], **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/click/core.py", line 535, in invoke
    return callback(*args, **kwargs)
  File "../batch-shipyard/shipyard.py", line 1232, in pool_add
    ctx.table_client, ctx.config)
  File "/usr/src/batch-shipyard/convoy/fleet.py", line 2823, in action_pool_add
    batch_client, blob_client, config
  File "/usr/src/batch-shipyard/convoy/fleet.py", line 1493, in _add_pool
    batch_client, blob_client, config)
  File "/usr/src/batch-shipyard/convoy/fleet.py", line 970, in _construct_pool_object
    resource_client, network_client, config, pool_settings, bc)
  File "/usr/src/batch-shipyard/convoy/fleet.py", line 839, in _pool_virtual_network_subnet_address_space_check
    subnet_components[1], subnet_components[3], subnet_components[4])
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/azure/mgmt/network/v2017_11_01/operations/subnets_operations.py", line 170, in get
    response = self._client.send(request, header_parameters, **operation_config)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/msrest/service_client.py", line 231, in send
    raise_with_traceback(ClientRequestError, msg, err)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/msrest/exceptions.py", line 45, in raise_with_traceback
    raise error.with_traceback(exc_traceback)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/msrest/service_client.py", line 205, in send
    **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests_oauthlib/oauth2_session.py", line 425, in request
    headers=headers, data=data, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/sessions.py", line 508, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/sessions.py", line 618, in send
    r = adapter.send(request, **kwargs)
  File "/usr/src/batch-shipyard/v/lib/python3.5/site-packages/requests/adapters.py", line 508, in send
    raise ConnectionError(e, request=request)
msrest.exceptions.ClientRequestError: Error occurred in request., ConnectionError: HTTPSConnectionPool(host='batch.core.windows.net', port=443): Max retries exceeded with url: /%20%20%20/subscriptions/{REDACTED}/resourceGroups/{REDACTED}/providers/Microsoft.Network/virtualNetworks/{REDACTED}/subnets/defaultSubnet?api-version=2017-11-01 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fb41d9d1fd0>: Failed to establish a new connection: [Errno -5] No address associated with hostname',))
alfpark commented 5 years ago

It appears that your aad endpoints for both batch and management are incorrect. Instead of manually specifying them, just remove those entries from your configuration files, as Batch Shipyard will default to the correct public Azure endpoints if not specified.

tonynilan commented 5 years ago

Thank-you! That did it.