aws / sagemaker-spark

A Spark library for Amazon SageMaker.
https://aws.github.io/sagemaker-spark/
Apache License 2.0

SageMakerProtobufFileFormat could not be instantiated #84

Closed · filipe-plutoflume closed this issue 5 years ago

filipe-plutoflume commented 5 years ago


System Information

Describe the problem

I am currently trying to load a file from S3 (or the local filesystem) into my SageMaker notebook instance, but since this morning I am getting the following error:

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding)
    272             path = [path]
    273         if type(path) == list:
--> 274             return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
    275         elif isinstance(path, RDD):
    276             def func(iterator):

~/anaconda3/envs/python3/lib/python3.6/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

~/anaconda3/envs/python3/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o381.json.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider com.amazonaws.services.sagemaker.sparksdk.protobuf.SageMakerProtobufFileFormat could not be instantiated
    at java.util.ServiceLoader.fail(ServiceLoader.java:232)
    at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
    at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
    at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
    at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:44)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:250)
    at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:248)
    at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
    at scala.collection.TraversableLike.filter(TraversableLike.scala:262)
    at scala.collection.TraversableLike.filter$(TraversableLike.scala:262)
    at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:630)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:194)
    at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:391)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)

Minimal repro / logs

Please provide any logs and a bare minimum reproducible test case, as this will be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.

I am currently running the following in an IPython notebook:

from pyspark.sql import DataFrame, Window, SparkSession, Row
from pyspark.sql import functions as f
from pyspark.sql.types import *
import time
import pandas as pd
from os import listdir
from os.path import isfile, join
import sagemaker_pyspark
import sys
classpath = ":".join(sagemaker_pyspark.classpath_jars())
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath).getOrCreate()
spark.conf.set('spark.executor.memory', '2g')
spark.conf.set('spark.executor.cores', '4')
spark.conf.set('spark.cores.max', '4')

df = spark.read.format("s3selectCSV").json(
    's3a://path/to/filename.json'
) # raises exception above
dgokeeffe commented 5 years ago

Seconded - it appears you cannot load any Spark DataFrames from S3 into SageMaker. Even the AWS examples are broken.

EDIT: It appears the conda_python3 environment in SageMaker hasn't been updated - simply run these commands inside any of your notebooks and it should then work correctly.

import sys
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install --upgrade sagemaker_pyspark
filipe-plutoflume commented 5 years ago

My issue with this is that in our production environment we have no internet access outside of AWS, so it would be great if these packages were updated by default.

jesterhazy commented 5 years ago

If you restart your notebook instance, it will be automatically updated to the latest version of the underlying AMI. The AMI is updated daily with the latest versions of sagemaker and sagemaker_pyspark.
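
To confirm after a restart that the environment actually picked up newer packages, a quick check from a notebook cell could look like this (a sketch; it simply prints whatever versions are installed in the conda_python3 environment):

import sys
# Show the installed versions of the two packages the daily AMI update refreshes
!{sys.executable} -m pip show sagemaker sagemaker_pyspark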

jesterhazy commented 5 years ago

Closing this issue. Please reopen if the notebook restart doesn't help!

filipe-plutoflume commented 5 years ago

Hi @jesterhazy, just tried to reboot the instance but am still getting the same error. Could you check if you can reproduce the issue?

jesterhazy commented 5 years ago

@filipe-plutoflume what region is your notebook instance running in? It turns out that the deployment of this fix is still in progress and has not been completed in all regions yet. Some regions have the update already; the rest should have it by the end of the week.

filipe-plutoflume commented 5 years ago

Ok, thanks @jesterhazy. We are in eu-west-1; I will try again tomorrow or Friday and will let you know if that works.

jesterhazy commented 5 years ago

eu-west-1 is in our final group of deploys, so I'd try late tomorrow or sometime on Friday. Sorry for the misinformation on this.

filipe-plutoflume commented 5 years ago

Hi @jesterhazy, I just tested this and it seems to be working. I will close the issue, thanks for your help.

wleepang commented 2 years ago

I encountered this issue again. I recently created my notebook instance (May 3, 2022), so I'm assuming I have the most current version of the AMI. The error is effectively the same, though I'm trying to load CSV and Parquet files:

Py4JJavaError: An error occurred while calling o33.load.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider com.amazonaws.services.sagemaker.sparksdk.protobuf.SageMakerProtobufFileFormat could not be instantiated
    at java.util.ServiceLoader.fail(ServiceLoader.java:232)
    at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
    at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
    at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
    at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:44)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:255)
    at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:249)
    at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
    at scala.collection.TraversableLike.filter(TraversableLike.scala:347)
    at scala.collection.TraversableLike.filter$(TraversableLike.scala:347)
    at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:644)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:728)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:230)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:214)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NoClassDefFoundError: org/apache/spark/sql/execution/datasources/FileFormat$class
    at com.amazonaws.services.sagemaker.sparksdk.protobuf.SageMakerProtobufFileFormat.<init>(SageMakerProtobufFileFormat.scala:41)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
    at java.lang.Class.newInstance(Class.java:442)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
    ... 30 more
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.execution.datasources.FileFormat$class
    at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:419)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:352)
    ... 37 more

Minimal steps to reproduce the error:

  1. Create a notebook instance.
  2. Create a notebook using the "conda_python3" kernel.
  3. Run the following in a cell to set up a local in-memory Spark session and read data from S3:
import sagemaker_pyspark
from pyspark.sql import SparkSession

classpath = ":".join(sagemaker_pyspark.classpath_jars())
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath).getOrCreate()
data = spark.read.parquet("s3a://bucket/path/to/parquet/data/")
saltyJeff commented 2 years ago

I'm getting the same issue on an ml.t3.medium instance with the conda_python3 kernel:

import sagemaker_pyspark
from pyspark.sql import SparkSession

classpath = ":".join(sagemaker_pyspark.classpath_jars())
print(classpath)
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath).getOrCreate()
df = spark.read.parquet("")

My classpath:

/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-core-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-kms-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-s3-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-sagemaker-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-sagemakerruntime-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/aws-java-sdk-sts-1.11.835.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/hadoop-annotations-2.8.1.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/hadoop-auth-2.8.1.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/hadoop-aws-2.8.1.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/hadoop-common-2.8.1.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/htrace-core4-4.0.1-incubating.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/jars/sagemaker-spark_2.11-spark_2.4.0-1.4.2.dev0.jar
saltyJeff commented 2 years ago

I've nailed it down to a version conflict. By default, the conda_python3 environment ships a pyspark that is newer than Spark 2.4.0, while the sagemaker_pyspark jars on the classpath are built against Spark 2.4.0 / Scala 2.11 (see sagemaker-spark_2.11-spark_2.4.0-1.4.2.dev0.jar in the classpath above), which is why SageMakerProtobufFileFormat cannot be instantiated.
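
A rough way to confirm the mismatch from a notebook cell (just a sketch; it only prints the installed pyspark version and the bundled sagemaker-spark jar name):

import os
import pyspark
import sagemaker_pyspark

# The pyspark installed in conda_python3 (newer than 2.4.0 on a recent AMI)
print("pyspark version:", pyspark.__version__)

# The jar name encodes the Spark/Scala versions sagemaker_pyspark was built for
print([os.path.basename(j) for j in sagemaker_pyspark.classpath_jars()
       if "sagemaker-spark" in os.path.basename(j)])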

The trick is to use the following code cell:

!pip install pyspark==2.4.0

We now have this issue:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_27471/68992327.py in <cell line: 1>()
----> 1 import sagemaker_pyspark
      2 from pyspark.sql import SparkSession
      3 
      4 classpath = ":".join(sagemaker_pyspark.classpath_jars())
      5 spark = SparkSession.builder \

~/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/__init__.py in <module>
     17 """
     18 
---> 19 from .wrapper import SageMakerJavaWrapper, Option
     20 from .IAMRoleResource import IAMRole, IAMRoleFromConfig
     21 from .SageMakerClients import SageMakerClients

~/anaconda3/envs/python3/lib/python3.8/site-packages/sagemaker_pyspark/wrapper.py in <module>
     16 from abc import ABCMeta
     17 
---> 18 from pyspark import SparkContext
     19 from pyspark.ml.common import _java2py
     20 from pyspark.ml.wrapper import JavaWrapper

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/__init__.py in <module>
     49 
     50 from pyspark.conf import SparkConf
---> 51 from pyspark.context import SparkContext
     52 from pyspark.rdd import RDD, RDDBarrier
     53 from pyspark.files import SparkFiles

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/context.py in <module>
     29 from py4j.protocol import Py4JError
     30 
---> 31 from pyspark import accumulators
     32 from pyspark.accumulators import Accumulator
     33 from pyspark.broadcast import Broadcast, BroadcastPickleRegistry

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/accumulators.py in <module>
     95     import socketserver as SocketServer
     96 import threading
---> 97 from pyspark.serializers import read_int, PickleSerializer
     98 
     99 

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/serializers.py in <module>
     69     xrange = range
     70 
---> 71 from pyspark import cloudpickle
     72 from pyspark.util import _exception_message
     73 

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/cloudpickle.py in <module>
    143 
    144 
--> 145 _cell_set_template_code = _make_cell_set_template_code()
    146 
    147 

~/anaconda3/envs/python3/lib/python3.8/site-packages/pyspark/cloudpickle.py in _make_cell_set_template_code()
    124         )
    125     else:
--> 126         return types.CodeType(
    127             co.co_argcount,
    128             co.co_kwonlyargcount,

TypeError: an integer is required (got type bytes)

This is because pyspark 2.4.0 doesn't work with the default Python version (3.8).

The first option is to run this magic (which will take forever because conda sucks):

!conda install python=3.7

The second option is to create a new conda environment pinned to Python 3.7, install the correct packages, and then register that environment as a kernel.
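
A rough outline of that second option from notebook cells (a sketch only; the environment name spark_py37, the display name, and the exact package pins are placeholders and may need adjusting):

!conda create -y -n spark_py37 python=3.7
# Install a matching pyspark plus sagemaker_pyspark and ipykernel into the new environment
!/home/ec2-user/anaconda3/envs/spark_py37/bin/pip install pyspark==2.4.0 sagemaker_pyspark ipykernel
# Register the environment as a Jupyter kernel so it appears in the notebook kernel list
!/home/ec2-user/anaconda3/envs/spark_py37/bin/python -m ipykernel install --user --name spark_py37 --display-name conda_spark_py37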