Open sytianhe opened 21 hours ago
A few questions:
Generally, this issue belongs to https://github.com/googleapis/google-auth-library-java
Is this the full stack trace?
I didn't add the non-java stack trace. Added full stack trace below.
Are you running on EC2?
Yes, this is a Databricks cluster running on AWS EC2
Generally, this issue belongs to https://github.com/googleapis/google-auth-library-java
I tested identity federation using the same credential config.json using google python client. It works fine. My suspicion is that the environment variables (needed when reaching AWS url) are not passed to the java runtime. I also tried to set the environment variables in spark executorEnv or workerEnv but no luck.
Full stack trace:
Py4JJavaError Traceback (most recent call last)
File <command-1264826551924062>, line 15
10 df = spark.read.format("bigquery") \
11 .option("table", "some-table-name") \
12 .option("project", "project-name") \
13 .option("parentProject", "parent-project-name") \
14 .option("credentialsFile", credential_file_path) \
---> 15 .load()
17 display(df)
File /databricks/spark/python/pyspark/instrumentation_utils.py:48, in _wrap_function.<locals>.wrapper(*args, **kwargs)
46 start = time.perf_counter()
47 try:
---> 48 res = func(*args, **kwargs)
49 logger.log_success(
50 module_name, class_name, function_name, time.perf_counter() - start, signature
51 )
52 return res
File /databricks/spark/python/pyspark/sql/readwriter.py:314, in DataFrameReader.load(self, path, format, schema, **options)
312 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
313 else:
--> 314 return self._df(self._jreader.load())
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355, in JavaMember.__call__(self, *args)
1349 command = proto.CALL_COMMAND_NAME +\
1350 self.command_header +\
1351 args_command +\
1352 proto.END_COMMAND_PART
1354 answer = self.gateway_client.send_command(command)
-> 1355 return_value = get_return_value(
1356 answer, self.gateway_client, self.target_id, self.name)
1358 for temp_arg in temp_args:
1359 if hasattr(temp_arg, "_detach"):
File /databricks/spark/python/pyspark/errors/exceptions/captured.py:188, in capture_sql_exception.<locals>.deco(*a, **kw)
186 def deco(*a: Any, **kw: Any) -> Any:
187 try:
--> 188 return f(*a, **kw)
189 except Py4JJavaError as e:
190 converted = convert_exception(e.java_exception)
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o448.load.
: com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryException: Failed to retrieve AWS region.
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.translate(HttpBigQueryRpc.java:115)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.getTable(HttpBigQueryRpc.java:286)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$18.call(BigQueryImpl.java:746)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$18.call(BigQueryImpl.java:743)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.retrying.DirectRetryingExecutor.submit(DirectRetryingExecutor.java:105)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.RetryHelper.run(RetryHelper.java:76)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.RetryHelper.runWithRetries(RetryHelper.java:50)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl.getTable(BigQueryImpl.java:742)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.connector.common.BigQueryClient.getTable(BigQueryClient.java:89)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.connector.common.BigQueryClient.getReadTable(BigQueryClient.java:102)
at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelationInternal(BigQueryRelationProvider.scala:81)
at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:48)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:391)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:381)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:337)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:337)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:397)
at py4j.Gateway.invoke(Gateway.java:306)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:199)
at py4j.ClientServerConnection.run(ClientServerConnection.java:119)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: Failed to retrieve AWS region.
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.AwsCredentials.retrieveResource(AwsCredentials.java:217)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.AwsCredentials.getAwsRegion(AwsCredentials.java:264)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.AwsCredentials.retrieveSubjectToken(AwsCredentials.java:171)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.AwsCredentials.refreshAccessToken(AwsCredentials.java:155)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.OAuth2Credentials$1.call(OAuth2Credentials.java:243)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.OAuth2Credentials$1.call(OAuth2Credentials.java:240)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:30)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.OAuth2Credentials$AsyncRefreshResult.executeIfNew(OAuth2Credentials.java:567)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.OAuth2Credentials.asyncFetch(OAuth2Credentials.java:206)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.OAuth2Credentials.getRequestMetadata(OAuth2Credentials.java:156)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.ExternalAccountCredentials.getRequestMetadata(ExternalAccountCredentials.java:244)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.http.HttpCredentialsAdapter.initialize(HttpCredentialsAdapter.java:96)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.http.HttpTransportOptions$1.initialize(HttpTransportOptions.java:159)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpRequestFactory.buildRequest(HttpRequestFactory.java:91)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.buildHttpRequest(AbstractGoogleClientRequest.java:404)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:514)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:455)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:565)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.getTable(HttpBigQueryRpc.java:284)
... 28 more
Caused by: com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpResponseException: 401 Unauthorized
GET http://169.254.169.254/latest/meta-data/placement/availability-zone
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" target="_blank" rel="noopener noreferrer">http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd</a>">
<html xmlns="http://www.w3.org/1999/xhtml" target="_blank" rel="noopener noreferrer">http://www.w3.org/1999/xhtml</a>" xml:lang="en" lang="en">
<head>
<title>401 - Unauthorized</title>
</head>
<body>
<h1>401 - Unauthorized</h1>
</body>
</html>
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpResponseException$Builder.build(HttpResponseException.java:293)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1118)
at com.google.cloud.spark.bigquery.repackaged.com.google.auth.oauth2.AwsCredentials.retrieveResource(AwsCredentials.java:214)
... 47 more
Hi,
Spark 3.4.1 Scala 2.12
I got error when using GCP identity federation with AWS credentials. It seems that it has trouble to use the environment variables defined for AWS (AWS_REGION, AWS_ACCESS_KEY_ID and etc).
With following code
credential_file_path
is where I stored the config.json
followed this. This gives the error "Failed to retrieve AWS region." I believe it's due to not being able to use the env vars that are already defined.
Traceback: