googleapis / java-spanner-jdbc


JDBC Connectivity issue with Serverless Dataproc 1.2 #1751

Closed rajc242 closed 1 month ago

rajc242 commented 1 month ago

Environment details

See the documentation for details on the runtime 1.2 version.

Steps to reproduce

  1. Generate any parquet file and store it in a GCS bucket
  2. Read it using Spark
  3. Ingest the data into Spanner

Code example

SpannerDemo.java

package com.google.spanner.spannersparkintegration;

import com.google.spanner.spannersparkintegration.dialect.SpannerJDBCDialect;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions;
import org.apache.spark.sql.jdbc.JdbcDialects;

import java.util.logging.Logger;

public final class SpannerDemo {

    private static final Logger LOGGER = Logger.getLogger(SpannerDemo.class.getName());
    private static final String GCS_PATH = "gs://YOUR_GCS_PARQUET_PATH/";
    private static final String SPANNER_JDBC_DRIVER = "com.google.cloud.spanner.jdbc.JdbcDriver";
    private static final String SPANNER_INSTANCE_ID = "dataproc-test";
    private static final String SPANNER_GOOGLESQL_DATABASE = "googlesql";
    private static final String SPANNER_PGSQL_DATABASE = "pgsql";
    private static final String SPANNER_TABLE = "sparktest";
    private static final String GCP_PROJECT_ID = "YOUR_GCP_PROJECT_ID";

    public static void main(String[] args) {

        LOGGER.info("Start Spark Session");
        SparkSession spark = SparkSession.builder().appName("SpannerSparkIntegration").getOrCreate();
        JdbcDialects.registerDialect(new SpannerJDBCDialect());

        LOGGER.info(String.format("Read Dataset From %s", GCS_PATH));
        Dataset<Row> df = spark.read().parquet(GCS_PATH);

        LOGGER.info("Prepare Spanner JDBC URL");
        String spannerUrl =
                String.format(
                        "jdbc:cloudspanner:/projects/%s/instances/%s/databases/%s?lenient=true",
                        GCP_PROJECT_ID, SPANNER_INSTANCE_ID, SPANNER_GOOGLESQL_DATABASE);

        LOGGER.info(String.format("Write Dataset To Spanner %s", spannerUrl));
        df
                .write()
                .format("jdbc")
                .option(JDBCOptions.JDBC_URL(), spannerUrl)
                .option(JDBCOptions.JDBC_TABLE_NAME(), SPANNER_TABLE)
                .option(
                        JDBCOptions.JDBC_CREATE_TABLE_OPTIONS(),
                        String.format("PRIMARY KEY (%s)", "row_id"))
                .option(
                        JDBCOptions.JDBC_TXN_ISOLATION_LEVEL(),
                        "NONE") // Needed because transactions have a 20,000 mutation limit per commit.
                .option(JDBCOptions.JDBC_DRIVER_CLASS(), SPANNER_JDBC_DRIVER)
                .mode(SaveMode.Overwrite)
                .save();

        LOGGER.info("Stop Spark Session");
        spark.stop();

    }
}

SpannerJDBCDialect.java

package com.google.spanner.spannersparkintegration.dialect;

import static org.apache.spark.sql.types.DataTypes.BinaryType;
import static org.apache.spark.sql.types.DataTypes.BooleanType;
import static org.apache.spark.sql.types.DataTypes.ByteType;
import static org.apache.spark.sql.types.DataTypes.DateType;
import static org.apache.spark.sql.types.DataTypes.DoubleType;
import static org.apache.spark.sql.types.DataTypes.FloatType;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.LongType;
import static org.apache.spark.sql.types.DataTypes.ShortType;
import static org.apache.spark.sql.types.DataTypes.StringType;
import static org.apache.spark.sql.types.DataTypes.TimestampType;

import java.sql.Types;
import org.apache.spark.sql.jdbc.JdbcDialect;
import org.apache.spark.sql.jdbc.JdbcType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DecimalType;
import org.apache.spark.sql.types.MetadataBuilder;
import scala.Option;

public class SpannerJDBCDialect extends JdbcDialect {

    private static final long serialVersionUID = 1L;

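    // canHandle claims JDBC URLs with the Cloud Spanner prefix so Spark applies this dialect to them.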
    @Override
    public boolean canHandle(String url) {
        return url.toLowerCase().startsWith("jdbc:cloudspanner:");
    }

    /**
     * Spanner uses backticks to quote column identifiers
     *
     * @param column column name
     * @return column name wrapped in backticks
     */
    @Override
    public String quoteIdentifier(String column) {
        return "`" + column + "`";
    }

    /**
     * Maps Spark SQL data types to Spanner DDL type names; used when creating tables while
     * writing to Spanner.
     *
     * @param dt Spark SQL data type
     * @return Jdbc type representing the corresponding Spanner DDL type name
     */
    @Override
    public Option<JdbcType> getJDBCType(final DataType dt) {
        if (IntegerType.equals(dt)) {
            return Option.apply(new JdbcType("INT64", Types.INTEGER));
        } else if (LongType.equals(dt)) {
            return Option.apply(new JdbcType("INT64", Types.BIGINT));
        } else if (DoubleType.equals(dt)) {
            return Option.apply(new JdbcType("FLOAT64", Types.DOUBLE));
        } else if (FloatType.equals(dt)) {
            return Option.apply(new JdbcType("FLOAT64", Types.FLOAT));
        } else if (ShortType.equals(dt)) {
            return Option.apply(new JdbcType("INT64", Types.SMALLINT));
        } else if (ByteType.equals(dt)) {
            return Option.apply(new JdbcType("BYTES(1)", Types.TINYINT));
        } else if (BooleanType.equals(dt)) {
            return Option.apply(new JdbcType("BOOL", Types.BIT));
        } else if (StringType.equals(dt)) {
            return Option.apply(new JdbcType("STRING(MAX)", Types.CLOB));
        } else if (BinaryType.equals(dt)) {
            return Option.apply(new JdbcType("BYTES(MAX)", Types.BLOB));
        } else if (TimestampType.equals(dt)) {
            return Option.apply(new JdbcType("TIMESTAMP", Types.TIMESTAMP));
        } else if (DateType.equals(dt)) {
            return Option.apply(new JdbcType("DATE", Types.DATE));
        } else if (dt instanceof DecimalType) {
            return Option.apply(new JdbcType("NUMERIC", Types.DECIMAL));
        } else {
            return Option.empty();
        }
    }

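    /**
     * Maps Spanner NUMERIC columns back to a Spark DecimalType when reading; Spanner's NUMERIC
     * type has a fixed precision of 38 and scale of 9.
     */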
    @Override
    public Option<DataType> getCatalystType(
            int sqlType, String typeName, int size, MetadataBuilder md) {
        if (sqlType == Types.NUMERIC) return Option.apply(DecimalType.apply(38, 9));
        return Option.empty();
    }
}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.google.spanner</groupId>
    <artifactId>SpannerSparkIntegration</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit-platform.version>5.7.2</junit-platform.version>
        <spark.version>3.5.0</spark.version>
        <scala.version>2.12</scala.version>
    </properties>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.google.cloud</groupId>
                <artifactId>libraries-bom</artifactId>
                <version>26.45.0</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-spanner-jdbc</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>com.google.api.grpc</groupId>
                    <artifactId>proto-google-cloud-spanner-executor-v1</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>3.0.0-M5</version>
                <dependencies>
                    <dependency>
                        <groupId>org.junit.jupiter</groupId>
                        <artifactId>junit-jupiter-engine</artifactId>
                        <version>${junit-platform.version}</version>
                    </dependency>
                </dependencies>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.google.spanner.spannersparkintegration.SpannerDemo</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/maven/**</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <relocations>
                                <relocation>
                                    <pattern>com</pattern>
                                    <shadedPattern>repackaged.com.google</shadedPattern>
                                    <includes>
                                        <include>com.google.protobuf.**</include>
                                <!--                    <include>com.google.common.**</include>-->
                                        <!--                    <include>io.grpc.internal.**</include>-->
                                    </includes>
                                </relocation>
                            </relocations>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Stack trace

Exception in thread "main" java.lang.NoClassDefFoundError: com/google/protobuf/MessageOrBuilder
    at java.base/java.lang.ClassLoader.defineClass1(Native Method)
    at java.base/java.lang.ClassLoader.defineClass(ClassLoader.java:1017)
    at java.base/java.security.SecureClassLoader.defineClass(SecureClassLoader.java:150)
    at java.base/jdk.internal.loader.BuiltinClassLoader.defineClass(BuiltinClassLoader.java:862)
    at java.base/jdk.internal.loader.BuiltinClassLoader.findClassOnClassPathOrNull(BuiltinClassLoader.java:760)
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClassOrNull(BuiltinClassLoader.java:681)
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:639)
    at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:188)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
    at java.base/java.lang.ClassLoader.defineClass1(Native Method)
    at java.base/java.lang.ClassLoader.defineClass(ClassLoader.java:1017)
    at java.base/java.security.SecureClassLoader.defineClass(SecureClassLoader.java:150)
    at java.base/jdk.internal.loader.BuiltinClassLoader.defineClass(BuiltinClassLoader.java:862)
    at java.base/jdk.internal.loader.BuiltinClassLoader.findClassOnClassPathOrNull(BuiltinClassLoader.java:760)
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClassOrNull(BuiltinClassLoader.java:681)
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:639)
    at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:188)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:579)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
    at com.google.cloud.spanner.spi.v1.SpannerErrorInterceptor.<clinit>(SpannerErrorInterceptor.java:47)
    at com.google.cloud.spanner.spi.v1.SpannerInterceptorProvider.createDefault(SpannerInterceptorProvider.java:50)
    at com.google.cloud.spanner.spi.v1.GapicSpannerRpc.<init>(GapicSpannerRpc.java:352)
    at com.google.cloud.spanner.spi.v1.GapicSpannerRpc.<init>(GapicSpannerRpc.java:283)
    at com.google.cloud.spanner.SpannerOptions$DefaultSpannerRpcFactory.create(SpannerOptions.java:508)
    at com.google.cloud.spanner.SpannerOptions$DefaultSpannerRpcFactory.create(SpannerOptions.java:503)
    at com.google.cloud.ServiceOptions.getRpc(ServiceOptions.java:602)
    at com.google.cloud.spanner.SpannerOptions.getSpannerRpcV1(SpannerOptions.java:1740)
    at com.google.cloud.spanner.SpannerImpl.<init>(SpannerImpl.java:149)
    at com.google.cloud.spanner.SpannerOptions$DefaultSpannerFactory.create(SpannerOptions.java:498)
    at com.google.cloud.spanner.SpannerOptions$DefaultSpannerFactory.create(SpannerOptions.java:493)
    at com.google.cloud.ServiceOptions.getService(ServiceOptions.java:582)
    at com.google.cloud.spanner.connection.SpannerPool.createSpanner(SpannerPool.java:392)
    at com.google.cloud.spanner.connection.SpannerPool.getSpanner(SpannerPool.java:320)
    at com.google.cloud.spanner.connection.ConnectionImpl.<init>(ConnectionImpl.java:292)
    at com.google.cloud.spanner.connection.ConnectionOptions.getConnection(ConnectionOptions.java:1352)
    at com.google.cloud.spanner.jdbc.AbstractJdbcConnection.<init>(AbstractJdbcConnection.java:64)
    at com.google.cloud.spanner.jdbc.JdbcConnection.<init>(JdbcConnection.java:97)
    at com.google.cloud.spanner.jdbc.JdbcDriver.connect(JdbcDriver.java:219)
    at org.apache.spark.sql.execution.datasources.jdbc.connection.BasicConnectionProvider.getConnection(BasicConnectionProvider.scala:49)
    at org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProviderBase.create(ConnectionProvider.scala:102)
    at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1(JdbcDialects.scala:160)
    at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1$adapted(JdbcDialects.scala:156)
    at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:50)
    at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:473)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:473)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:449)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
    at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
    at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
    at com.google.spanner.spannersparkintegration.SpannerDemo.main(SpannerDemo.java:53)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:569)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1032)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:194)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:217)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1124)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1133)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: com.google.protobuf.MessageOrBuilder
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:641)
    at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:188)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
    ... 85 more

Submission command

gcloud dataproc batches submit --project GCP_PROJECT_ID --region us-east1 spark \
    --class com.google.spanner.spannersparkintegration.SpannerDemo --version 1.2 \
    --jars gs://BUCKET_NAME/jars/SpannerSparkIntegration-1.0-SNAPSHOT.jar \
    --subnet YOUR_SUBNET --service-account YOUR_SERVICE_ACCOUNT \
    --properties spark.executor.instances=2,spark.driver.cores=4,spark.executor.cores=4,spark.dynamicAllocation.executorAllocationRatio=0.3,spark.app.name=SpannerSparkIntegration,spark.dataproc.scaling.version=1

Any additional information below

The same job works without any issues on Serverless Dataproc 1.1.

Thanks!

olavloite commented 1 month ago

@rajc242 I just created a simple test application using only your pom.xml file and then ran mvn package. The result of that is indeed a .jar that does not contain com/google/protobuf/MessageOrBuilder.class, meaning that this error message seems relatively clear. What I see is that:

  1. com/google/protobuf/ is not in the jar due to the <shadedPattern>repackaged.com.google</shadedPattern> configuration in your pom.xml. If you remove that part of the configuration, the protobuf files are added to the jar in the normal directory.
  2. What is also surprising (and potentially the problem here), however, is that there is no repackaged/com/google/protobuf directory in your jar either. Instead, the protobuf package is placed in repackaged/com/google/google/protobuf/ (notice the double google directory). That is because the relocation pattern com replaces only the leading com segment with repackaged.com.google, so com.google.protobuf becomes repackaged.com.google.google.protobuf; see the sketch after this list.
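
A minimal sketch of the relocation as it was presumably intended (assuming you want to keep relocating protobuf at all): match the full package in the pattern, so the prefix replacement cannot double the google segment.

<relocations>
    <relocation>
        <!-- Sketch: relocates the whole com.google.protobuf prefix in one step,
             avoiding the repackaged.com.google.google.protobuf result described above. -->
        <pattern>com.google.protobuf</pattern>
        <shadedPattern>repackaged.com.google.protobuf</shadedPattern>
    </relocation>
</relocations>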

More generally though: using an über-jar (fat jar, jar-with-dependencies, shaded jar, ...) is a very common source of problems. It is much better to modify your build so that it produces a list of jars, instead of trying to put everything into one jar. Single jars are highly prone to problems caused by conflicting files in directories like META-INF/services: if two or more of your dependencies try to add the same service file to the über-jar, only one of them will be kept, and which one is arbitrary. Two sketches follow below.
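
A minimal sketch of the list-of-jars approach (the plugin configuration below is my assumption, not something from this thread): let the maven-dependency-plugin copy the runtime dependencies into a directory, upload those jars to GCS, and pass them together with your thin application jar as a comma-separated list to the --jars flag of gcloud dataproc batches submit.

<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-dependency-plugin</artifactId>
    <executions>
        <execution>
            <id>copy-dependencies</id>
            <phase>package</phase>
            <goals>
                <goal>copy-dependencies</goal>
            </goals>
            <configuration>
                <!-- Sketch: collects all runtime dependency jars in target/libs -->
                <outputDirectory>${project.build.directory}/libs</outputDirectory>
                <includeScope>runtime</includeScope>
            </configuration>
        </execution>
    </executions>
</plugin>

And if you do stay with a shaded jar, the shade plugin's ServicesResourceTransformer at least merges META-INF/services files from all dependencies instead of keeping an arbitrary one:

<transformers>
    <!-- Sketch: add inside the shade plugin's <configuration> element -->
    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>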

rajc242 commented 1 month ago

Thank you for your quick response. We usually follow our Google public documentation, which suggests using the above pom.xml file. I can try removing the repackaging, but I am not sure whether it will create any conflicts with the Spark libraries.

olavloite commented 1 month ago

> Thank you for your quick response. We usually follow our Google public documentation, which suggests using the above pom.xml file. I can try removing the repackaging, but I am not sure whether it will create any conflicts with the Spark libraries.

Thanks. I was not aware that the suggestion to repackage actually comes from the Dataproc documentation. Please let me know whether skipping it works or not.

rajc242 commented 1 month ago

Updated pom.xml file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.google.spanner</groupId>
    <artifactId>SpannerSparkIntegration</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit-platform.version>5.7.2</junit-platform.version>
        <spark.version>3.5.0</spark.version>
        <scala.version>2.12</scala.version>
        <spanner.jdbc.version>2.22.0</spanner.jdbc.version>
    </properties>

<!--    <dependencyManagement>-->
<!--        <dependencies>-->
<!--            <dependency>-->
<!--                <groupId>com.google.cloud</groupId>-->
<!--                <artifactId>libraries-bom</artifactId>-->
<!--                <version>26.45.0</version>-->
<!--                <type>pom</type>-->
<!--                <scope>import</scope>-->
<!--            </dependency>-->
<!--        </dependencies>-->
<!--    </dependencyManagement>-->

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-spanner-jdbc</artifactId>
            <version>${spanner.jdbc.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>3.0.0-M5</version>
                <dependencies>
                    <dependency>
                        <groupId>org.junit.jupiter</groupId>
                        <artifactId>junit-jupiter-engine</artifactId>
                        <version>${junit-platform.version}</version>
                    </dependency>
                </dependencies>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.google.spanner.spannersparkintegration.SpannerDemo</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/maven/**</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
<!--                            <relocations>-->
<!--                                <relocation>-->
<!--                                    <pattern>com</pattern>-->
<!--                                    <shadedPattern>repackaged.com.google</shadedPattern>-->
<!--                                    <includes>-->
<!--&lt;!&ndash;                                        <include>com.google.protobuf.**</include>&ndash;&gt;-->
<!--                                        &lt;!&ndash;                    <include>com.google.common.**</include>&ndash;&gt;-->
<!--                                        &lt;!&ndash;                    <include>io.grpc.internal.**</include>&ndash;&gt;-->
<!--                                    </includes>-->
<!--                                </relocation>-->
<!--                            </relocations>-->
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

I got the same error message. The class appears to be in the jar, but I am not sure why it still throws a NoClassDefFoundError.

jar tf target/SpannerSparkIntegration-1.0-SNAPSHOT.jar | grep MessageOrBuilder

Output

org/sparkproject/spark_core/protobuf/GeneratedMessage$ExtendableMessageOrBuilder.class
org/sparkproject/spark_core/protobuf/GeneratedMessageLite$ExtendableMessageOrBuilder.class
org/sparkproject/spark_core/protobuf/GeneratedMessageV3$ExtendableMessageOrBuilder.class
org/sparkproject/spark_core/protobuf/MessageOrBuilder.class
org/sparkproject/spark_core/protobuf/RepeatedFieldBuilder$MessageOrBuilderExternalList.class
org/sparkproject/spark_core/protobuf/RepeatedFieldBuilderV3$MessageOrBuilderExternalList.class
org/apache/hadoop/thirdparty/protobuf/GeneratedMessage$ExtendableMessageOrBuilder.class
org/apache/hadoop/thirdparty/protobuf/GeneratedMessageLite$ExtendableMessageOrBuilder.class
org/apache/hadoop/thirdparty/protobuf/GeneratedMessageV3$ExtendableMessageOrBuilder.class
org/apache/hadoop/thirdparty/protobuf/MessageOrBuilder.class
org/apache/hadoop/thirdparty/protobuf/RepeatedFieldBuilder$MessageOrBuilderExternalList.class
org/apache/hadoop/thirdparty/protobuf/RepeatedFieldBuilderV3$MessageOrBuilderExternalList.class
org/apache/hadoop/shaded/com/google/protobuf/GeneratedMessage$ExtendableMessageOrBuilder.class
org/apache/hadoop/shaded/com/google/protobuf/GeneratedMessageLite$ExtendableMessageOrBuilder.class
org/apache/hadoop/shaded/com/google/protobuf/MessageOrBuilder.class
org/apache/hadoop/shaded/com/google/protobuf/RepeatedFieldBuilder$MessageOrBuilderExternalList.class
org/apache/orc/protobuf/GeneratedMessage$ExtendableMessageOrBuilder.class
org/apache/orc/protobuf/GeneratedMessageLite$ExtendableMessageOrBuilder.class
org/apache/orc/protobuf/GeneratedMessageV3$ExtendableMessageOrBuilder.class
org/apache/orc/protobuf/MessageOrBuilder.class
org/apache/orc/protobuf/RepeatedFieldBuilder$MessageOrBuilderExternalList.class
org/apache/orc/protobuf/RepeatedFieldBuilderV3$MessageOrBuilderExternalList.class
com/google/rpc/LocalizedMessageOrBuilder.class
io/grpc/binarylog/v1/MessageOrBuilder.class
com/google/protobuf/GeneratedMessage$ExtendableMessageOrBuilder.class
com/google/protobuf/GeneratedMessageLite$ExtendableMessageOrBuilder.class
com/google/protobuf/GeneratedMessageV3$ExtendableMessageOrBuilder.class
com/google/protobuf/MessageOrBuilder.class
com/google/protobuf/RepeatedFieldBuilder$MessageOrBuilderExternalList.class
com/google/protobuf/RepeatedFieldBuilderV3$MessageOrBuilderExternalList.class
rajc242 commented 1 month ago

@olavloite Can we connect offline on the Google side? This is a blocker for migrating our public dataproc-templates repository to Serverless Dataproc 1.2.

rajc242 commented 1 month ago

We were able to make it work with JDBC driver version 2.14.0, due to dependency conflicts with Spark 3.5.1. Please find below a working pom.xml that supports the BigQuery, Bigtable, Spanner, and GCS connectors with Spark.

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.google.spanner</groupId>
    <artifactId>SpannerSparkIntegration</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit-platform.version>5.7.2</junit-platform.version>
        <spark.version>3.5.1</spark.version>
        <scala.version>2.12</scala.version>
        <spark.bigtable.version>0.2.1</spark.bigtable.version>
        <google.cloud.bom.version>26.47.0</google.cloud.bom.version>

        <gpg.skip>true</gpg.skip>
        <revision>0.0.1-SNAPSHOT</revision>

        <avro.version>1.11.3</avro.version>
        <arrow.version>12.0.0</arrow.version>
        <gax.version>2.30.0</gax.version>
        <google-cloud-dataproc.version>4.11.0</google-cloud-dataproc.version>
        <google-cloud-spanner.version>6.50.0</google-cloud-spanner.version>
        <google-cloud-storage.version>2.20.2</google-cloud-storage.version>
        <google-truth.version>1.1.3</google-truth.version>
        <grpc.version>1.55.1</grpc.version>
        <guava.version>32.0.0-jre</guava.version>
        <jackson.version>2.15.2</jackson.version>
        <netty.version>4.1.92.Final</netty.version>
        <paranamer.version>2.8</paranamer.version>
        <protobuf.version>3.23.0</protobuf.version>
        <zstd.version>1.4.9-1</zstd.version>
        <openlineage.version>1.19.0</openlineage.version>
        <deploy.skip>true</deploy.skip>
        <nexus.remote.skip>false</nexus.remote.skip>
        <shade.skip>false</shade.skip>
    </properties>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>aopalliance</groupId>
                <artifactId>aopalliance</artifactId>
                <version>1.0</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.fasterxml.jackson</groupId>
                <artifactId>jackson-bom</artifactId>
                <version>${jackson.version}</version>
                <scope>import</scope>
                <type>pom</type>
            </dependency>
            <dependency>
                <groupId>com.fasterxml.jackson.core</groupId>
                <artifactId>jackson-core</artifactId>
                <version>${jackson.version}</version>
                <scope>compile</scope>
            </dependency>
            <!-- Fixing CVE-2022-42003 -->
            <dependency>
                <groupId>com.fasterxml.jackson.core</groupId>
                <artifactId>jackson-databind</artifactId>
                <version>${jackson.version}</version>
            </dependency>
            <dependency>
                <groupId>com.github.luben</groupId>
                <artifactId>zstd-jni</artifactId>
                <version>${zstd.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.google.api</groupId>
                <artifactId>gax-bom</artifactId>
                <version>${gax.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>com.google.auth</groupId>
                <artifactId>google-auth-library-bom</artifactId>
                <version>1.16.1</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>com.google.cloud</groupId>
                <artifactId>google-cloud-dataproc-bom</artifactId>
                <version>${google-cloud-dataproc.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>com.google.cloud</groupId>
                <artifactId>google-cloud-spanner-bom</artifactId>
                <version>${google-cloud-spanner.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>com.google.cloud</groupId>
                <artifactId>google-cloud-storage</artifactId>
                <version>${google-cloud-storage.version}</version>
            </dependency>
            <dependency>
                <groupId>com.google.cloud.bigdataoss</groupId>
                <artifactId>gcs-connector</artifactId>
                <version>hadoop2-2.2.8</version>
                <classifier>shaded</classifier>
                <exclusions>
                    <exclusion>
                        <groupId>com.google.cloud.bigdataoss</groupId>
                        <artifactId>util-hadoop</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>com.google.errorprone</groupId>
                <artifactId>error_prone_annotations</artifactId>
                <version>2.18.0</version>
            </dependency>
            <dependency>
                <groupId>com.google.code.findbugs</groupId>
                <artifactId>jsr305</artifactId>
                <version>3.0.2</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.google.guava</groupId>
                <artifactId>guava</artifactId>
                <version>${guava.version}</version>
            </dependency>
            <dependency>
                <groupId>com.google.protobuf</groupId>
                <artifactId>protobuf-bom</artifactId>
                <version>${protobuf.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>com.google.truth</groupId>
                <artifactId>truth</artifactId>
                <version>${google-truth.version}</version>
            </dependency>
            <!-- see https://github.com/paul-hammant/paranamer#releases -->
            <dependency>
                <groupId>com.thoughtworks.paranamer</groupId>
                <artifactId>paranamer</artifactId>
                <version>${paranamer.version}</version>
            </dependency>
            <dependency>
                <groupId>commons-codec</groupId>
                <artifactId>commons-codec</artifactId>
                <version>1.15</version>
            </dependency>
            <dependency>
                <groupId>io.grpc</groupId>
                <artifactId>grpc-bom</artifactId>
                <version>${grpc.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>io.netty</groupId>
                <artifactId>netty-bom</artifactId>
                <version>${netty.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>javax.inject</groupId>
                <artifactId>javax.inject</artifactId>
                <version>1</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>javax.annotation</groupId>
                <artifactId>javax.annotation-api</artifactId>
                <version>1.3.2</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.13.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro</artifactId>
                <version>${avro.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro-ipc</artifactId>
                <version>${avro.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro-mapred</artifactId>
                <version>${avro.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.arrow</groupId>
                <artifactId>arrow-compression</artifactId>
                <version>${arrow.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.arrow</groupId>
                <artifactId>arrow-memory-netty</artifactId>
                <version>${arrow.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.arrow</groupId>
                <artifactId>arrow-vector</artifactId>
                <version>${arrow.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-compress</artifactId>
                <version>1.26.0</version>
            </dependency>
            <dependency>
                <groupId>org.mockito</groupId>
                <artifactId>mockito-core</artifactId>
                <version>4.10.0</version>
            </dependency>
            <dependency>
                <groupId>org.conscrypt</groupId>
                <artifactId>conscrypt-openjdk-uber</artifactId>
                <version>2.5.2</version>
                <scope>provided</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-spanner</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-spanner-jdbc</artifactId>
            <version>2.14.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.cloud.spark.bigtable</groupId>
            <artifactId>spark-bigtable_2.12</artifactId>
            <version>0.2.1</version>
        </dependency>
        <dependency>
            <groupId>com.google.cloud.spark</groupId>
            <artifactId>spark-bigquery_2.12</artifactId>
            <version>0.36.4</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-storage</artifactId>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.16</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.mockito</groupId>
            <artifactId>mockito-core</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.google.truth</groupId>
            <artifactId>truth</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>io.openlineage</groupId>
            <artifactId>openlineage-spark_2.12</artifactId>
            <version>${openlineage.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <skip>${shade.skip}</skip>
                            <shadedArtifactAttached>false</shadedArtifactAttached>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.local.dttemplate.spannersparkintegration.SpannerDemo</mainClass>
                                </transformer>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/maven/**</exclude>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <relocations>
                                <relocation>
                                    <pattern>android</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.android</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>autovalue</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.autovalue</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>avro.shaded</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.avro.shaded</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>com.fasterxml</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.com.fasterxml</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>com.github</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.com.github</shadedPattern>
                                    <!-- Need to exclude the zstd library from being repackaged because of https://github.com/luben/zstd-jni#limitations -->
                                    <excludes>
                                        <exclude>com.github.luben.zstd.**</exclude>
                                    </excludes>
                                </relocation>
                                <relocation>
                                    <pattern>com.google</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.com.google</shadedPattern>
                                    <excludes>
                                        <exclude>com.google.cloud.bigquery.connector.common.**</exclude>
                                        <exclude>com.local.dttemplate.**</exclude>
                                    </excludes>
                                </relocation>
                                <relocation>
                                    <pattern>com.thoughtworks.paranamer</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.com.thoughtworks.paranamer</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>com.typesafe</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.com.typesafe</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>io.grpc</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.io.grpc</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>io.netty</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.io.netty</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>io.opencensus</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.io.opencensus</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>io.perfmark</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.io.perfmark</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.apache.arrow</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.apache.arrow</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.apache.beam</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.apache.beam</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.apache.commons</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.apache.commons</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.apache.http</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.apache.http</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.checkerframework</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.checkerframework</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.codehaus.mojo</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.codehaus.mojo</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.json</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.json</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.threeten</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.threeten</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.tukaani.xz</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.tukaani.xz</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>org.xerial.snappy</pattern>
                                    <shadedPattern>com.local.dttemplate.repackaged.org.xerial.snappy</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>META-INF/native/libio_grpc_netty_shaded_netty</pattern>
                                    <shadedPattern>META-INF/native/libcom_google_cloud_spark_spanner_repackaged_io_grpc_netty_shaded_io_grpc_netty</shadedPattern>
                                </relocation>
                                <relocation>
                                    <pattern>META-INF/native/io_grpc_netty_shaded_netty</pattern>
                                    <shadedPattern>META-INF/native/com_google_cloud_spark_spanner_repackaged_io_grpc_netty_shaded_io_grpc_netty</shadedPattern>
                                </relocation>
                            </relocations>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>