Open silpara opened 1 year ago
Update: I installed Java from https://download.oracle.com/java/20/latest/jdk-20_windows-x64_bin.exe, set the JAVA_HOME environment variable, and tried again. I am now getting the following error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark = configure_spark_with_delta_pip(builder).getOrCreate()
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
267 sparkConf.set(key, value)
268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
271 # by all sessions.
272 session = SparkSession(sc, options=self._options)
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
481 with SparkContext._lock:
482 if SparkContext._active_spark_context is None:
--> 483 SparkContext(conf=conf or SparkConf())
484 assert SparkContext._active_spark_context is not None
485 return SparkContext._active_spark_context
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:197, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
196 try:
--> 197 self._do_init(
198 master,
199 appName,
200 sparkHome,
201 pyFiles,
202 environment,
203 batchSize,
204 serializer,
205 conf,
206 jsc,
207 profiler_cls,
208 udf_profiler_cls,
209 )
210 except BaseException:
211 # If an error occurs, clean up in order to allow future SparkContext creation:
212 self.stop()
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:282, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls, udf_profiler_cls)
279 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
281 # Create the Java SparkContext through Py4J
--> 282 self._jsc = jsc or self._initialize_context(self._conf._jconf)
283 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
284 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\pyspark\context.py:402, in SparkContext._initialize_context(self, jconf)
398 """
399 Initialize SparkContext in function to allow subclass specific initialization
400 """
401 assert self._jvm is not None
--> 402 return self._jvm.JavaSparkContext(jconf)
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\java_gateway.py:1585, in JavaClass.__call__(self, *args)
1579 command = proto.CONSTRUCTOR_COMMAND_NAME +\
1580 self._command_header +\
1581 args_command +\
1582 proto.END_COMMAND_PART
1584 answer = self._gateway_client.send_command(command)
-> 1585 return_value = get_return_value(
1586 answer, self._gateway_client, None, self._fqn)
1588 for temp_arg in temp_args:
1589 temp_arg._detach()
File ~\anaconda3\envs\pyspark-330-delta-220\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1108)
at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1094)
at org.apache.spark.util.Utils$.fetchFile(Utils.scala:579)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1647)
at org.apache.spark.SparkContext.$anonfun$new$13(SparkContext.scala:514)
at org.apache.spark.SparkContext.$anonfun$new$13$adapted(SparkContext.scala:514)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:514)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:67)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
... 22 more
@silpara in case this is not fixed yet, try installing Hadoop directly, as described in their docs (https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems):
How to fix a missing WINUTILS.EXE: You can fix this problem in two ways. (1) Install a full native Windows Hadoop version. The ASF does not currently (September 2015) release such a version; releases are available externally. Or: (2) get the WINUTILS.EXE binary from a Hadoop redistribution. There is a repository of this for some Hadoop versions on GitHub. Then set the environment variable %HADOOP_HOME% to point to the directory above the BIN dir containing WINUTILS.EXE. Or: run the Java process with the system property hadoop.home.dir set to the home directory.
I installed the conda environment using
conda env create -f envs/pyspark-330-delta-220
and tried running the notebook 01_quickstart.ipynb, but I get the following error. I am on a Windows 11 machine.