delta-io / delta-examples

Delta Lake examples
Apache License 2.0
208 stars 76 forks source link

Error: runtime error with pyspark quickstart notebook #24

Open silpara opened 1 year ago

silpara commented 1 year ago

I installed the conda environment using `conda env create -f envs/pyspark-330-delta-220` and tried running the notebook `01_quickstart.ipynb`, but I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark = configure_spark_with_delta_pip(builder).getOrCreate()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
    267     sparkConf.set(key, value)
    268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
    270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    271 # by all sessions.
    272 session = SparkSession(sc, options=self._options)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
    481 with SparkContext._lock:
    482     if SparkContext._active_spark_context is None:
--> 483         SparkContext(conf=conf or SparkConf())
    484     assert SparkContext._active_spark_context is not None
    485     return SparkContext._active_spark_context

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:195, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
    189 if gateway is not None and gateway.gateway_parameters.auth_token is None:
    190     raise ValueError(
    191         "You are trying to pass an insecure Py4j gateway to Spark. This"
    192         " is not allowed as it is a security risk."
    193     )
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    196 try:
    197     self._do_init(
    198         master,
    199         appName,
   (...)
    208         udf_profiler_cls,
    209     )

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:417, in SparkContext._ensure_initialized(cls, instance, gateway, conf)
    415 with SparkContext._lock:
    416     if not SparkContext._gateway:
--> 417         SparkContext._gateway = gateway or launch_gateway(conf)
    418         SparkContext._jvm = SparkContext._gateway.jvm
    420     if instance:

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\java_gateway.py:106, in launch_gateway(conf, popen_kwargs)
    103     time.sleep(0.1)
    105 if not os.path.isfile(conn_info_file):
--> 106     raise RuntimeError("Java gateway process exited before sending its port number")
    108 with open(conn_info_file, "rb") as info:
    109     gateway_port = read_int(info)

RuntimeError: Java gateway process exited before sending its port number

I am on a Windows 11 machine.

silpara commented 1 year ago

Update: I installed Java from https://download.oracle.com/java/20/latest/jdk-20_windows-x64_bin.exe, set the JAVA_HOME environment variable, and tried again. I am now getting the following error:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark = configure_spark_with_delta_pip(builder).getOrCreate()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
    267     sparkConf.set(key, value)
    268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
    270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
    271 # by all sessions.
    272 session = SparkSession(sc, options=self._options)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
    481 with SparkContext._lock:
    482     if SparkContext._active_spark_context is None:
--> 483         SparkContext(conf=conf or SparkConf())
    484     assert SparkContext._active_spark_context is not None
    485     return SparkContext._active_spark_context

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:197, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
    195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    196 try:
--> 197     self._do_init(
    198         master,
    199         appName,
    200         sparkHome,
    201         pyFiles,
    202         environment,
    203         batchSize,
    204         serializer,
    205         conf,
    206         jsc,
    207         profiler_cls,
    208         udf_profiler_cls,
    209     )
    210 except BaseException:
    211     # If an error occurs, clean up in order to allow future SparkContext creation:
    212     self.stop()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:282, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls, udf_profiler_cls)
    279 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
    281 # Create the Java SparkContext through Py4J
--> 282 self._jsc = jsc or self._initialize_context(self._conf._jconf)
    283 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
    284 self._conf = SparkConf(_jconf=self._jsc.sc().conf())

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:402, in SparkContext._initialize_context(self, jconf)
    398 """
    399 Initialize SparkContext in function to allow subclass specific initialization
    400 """
    401 assert self._jvm is not None
--> 402 return self._jvm.JavaSparkContext(jconf)

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\java_gateway.py:1585, in JavaClass.__call__(self, *args)
   1579 command = proto.CONSTRUCTOR_COMMAND_NAME +\
   1580     self._command_header +\
   1581     args_command +\
   1582     proto.END_COMMAND_PART
   1584 answer = self._gateway_client.send_command(command)
-> 1585 return_value = get_return_value(
   1586     answer, self._gateway_client, None, self._fqn)
   1588 for temp_arg in temp_args:
   1589     temp_arg._detach()

File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
    324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
    327         "An error occurred while calling {0}{1}{2}.\n".
    328         format(target_id, ".", name), value)
    329 else:
    330     raise Py4JError(
    331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
    332         format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
    at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
    at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
    at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1108)
    at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1094)
    at org.apache.spark.util.Utils$.fetchFile(Utils.scala:579)
    at org.apache.spark.SparkContext.addFile(SparkContext.scala:1647)
    at org.apache.spark.SparkContext.$anonfun$new$13(SparkContext.scala:514)
    at org.apache.spark.SparkContext.$anonfun$new$13$adapted(SparkContext.scala:514)
    at scala.collection.immutable.List.foreach(List.scala:431)
    at org.apache.spark.SparkContext.<init>(SparkContext.scala:514)
    at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
    at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:67)
    at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
    at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:238)
    at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
    at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
    at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
    at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
    at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
    at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
    at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
    at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
    at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
    at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
    at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
    at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
    at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
    at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
    at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
    at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
    at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
    at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
    at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
    at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
    at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
    at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
    at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
    at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
    ... 22 more

handreassa commented 1 year ago

@silpara in case this is not fixed yet, try installing Hadoop directly, as described in their docs (https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems):

How to fix a missing WINUTILS.EXE

You can fix this problem in two ways:

1. Install a full native Windows Hadoop version. The ASF does not currently (September 2015) release such a version; releases are available externally.
2. Or: get the WINUTILS.EXE binary from a Hadoop redistribution. There is a repository of this for some Hadoop versions on GitHub.

Then set the environment variable %HADOOP_HOME% to point to the directory above the BIN dir containing WINUTILS.EXE. Or: run the Java process with the system property hadoop.home.dir set to the home directory.