oap-project / raydp

RayDP provides simple APIs for running Spark on Ray and integrating Spark with AI libraries.
Apache License 2.0
293 stars 66 forks source link

raydp start occur null error #357

Open sydt2014 opened 1 year ago

sydt2014 commented 1 year ago

```
Traceback (most recent call last):
  File "python/ray/_raylet.pyx", line 1197, in ray._raylet.task_execution_handler
  File "python/ray/_raylet.pyx", line 1100, in ray._raylet.execute_task_with_cancellation_handler
  File "python/ray/_raylet.pyx", line 823, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 1001, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 623, in ray._raylet.store_task_errors
  File "python/ray/_raylet.pyx", line 2563, in ray._raylet.CoreWorker.store_task_outputs
  File "/usr/local/lib/python3.9/site-packages/ray/_private/serialization.py", line 466, in serialize
    return self._serialize_to_msgpack(value)
  File "/usr/local/lib/python3.9/site-packages/ray/_private/serialization.py", line 421, in _serialize_to_msgpack
    value = value.to_bytes()
  File "/usr/local/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes
    serialized_exception=pickle.dumps(self),
  File "/usr/local/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 88, in dumps
    cp.dump(obj)
  File "/usr/local/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 733, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.RLock' object
An unexpected internal error occurred while the worker was executing a task.
```

(RayDPSparkMaster pid=56543) 2023-06-30 10:36:39,370 ERROR worker.py:844 -- Worker exits with an exit code None. (RayDPSparkMaster pid=56543) Traceback (most recent call last): (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 870, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 921, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 877, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 881, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 821, in ray._raylet.execute_task.function_executor (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/_private/function_manager.py", line 670, in actor_method_executor (RayDPSparkMaster pid=56543) return method(ray_actor, *args, *kwargs) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 460, in _resume_span (RayDPSparkMaster pid=56543) return method(self, _args, **_kwargs) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/raydp/spark/ray_cluster_master.py", line 56, in start_up (RayDPSparkMaster pid=56543) self._gateway.jvm.org.apache.spark.deploy.raydp.RayAppMaster.setProperties(jvm_properties) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/py4j/java_gateway.py", line 1321, in call__ (RayDPSparkMaster pid=56543) return_value = get_return_value( (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/py4j/protocol.py", line 326, in get_return_value (RayDPSparkMaster pid=56543) raise Py4JJavaError( (RayDPSparkMaster pid=56543) py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.deploy.raydp.RayAppMaster.setProperties. 
(RayDPSparkMaster pid=56543) : java.lang.NullPointerException (RayDPSparkMaster pid=56543) at java.util.Hashtable.put(Hashtable.java:460) (RayDPSparkMaster pid=56543) at java.util.Properties.setProperty(Properties.java:166) (RayDPSparkMaster pid=56543) at java.lang.System.setProperty(System.java:796) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster$.$anonfun$setProperties$1(RayAppMaster.scala:336) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashMap1.foreach(HashMap.scala:400) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster$.setProperties(RayAppMaster.scala:335) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster.setProperties(RayAppMaster.scala) (RayDPSparkMaster pid=56543) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) (RayDPSparkMaster pid=56543) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) (RayDPSparkMaster pid=56543) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) (RayDPSparkMaster pid=56543) at java.lang.reflect.Method.invoke(Method.java:498) (RayDPSparkMaster pid=56543) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) (RayDPSparkMaster pid=56543) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) (RayDPSparkMaster pid=56543) at py4j.Gateway.invoke(Gateway.java:282) (RayDPSparkMaster pid=56543) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) (RayDPSparkMaster pid=56543) at py4j.commands.CallCommand.execute(CallCommand.java:79) (RayDPSparkMaster pid=56543) at 
py4j.GatewayConnection.run(GatewayConnection.java:238) (RayDPSparkMaster pid=56543) at java.lang.Thread.run(Thread.java:748) (RayDPSparkMaster pid=56543) (RayDPSparkMaster pid=56543) (RayDPSparkMaster pid=56543) During handling of the above exception, another exception occurred: (RayDPSparkMaster pid=56543) (RayDPSparkMaster pid=56543) Traceback (most recent call last): (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 1197, in ray._raylet.task_execution_handler (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 1100, in ray._raylet.execute_task_with_cancellation_handler (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 823, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 1001, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 623, in ray._raylet.store_task_errors (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 2563, in ray._raylet.CoreWorker.store_task_outputs (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/_private/serialization.py", line 466, in serialize (RayDPSparkMaster pid=56543) return self._serialize_to_msgpack(value) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/_private/serialization.py", line 421, in _serialize_to_msgpack (RayDPSparkMaster pid=56543) value = value.to_bytes() (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/exceptions.py", line 32, in to_bytes (RayDPSparkMaster pid=56543) serialized_exception=pickle.dumps(self), (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 88, in dumps (RayDPSparkMaster pid=56543) cp.dump(obj) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 733, in dump (RayDPSparkMaster pid=56543) return Pickler.dump(self, obj) (RayDPSparkMaster pid=56543) 
TypeError: cannot pickle '_thread.RLock' object (RayDPSparkMaster pid=56543) An unexpected internal error occurred while the worker was executing a task. (RayDPSparkMaster pid=56543) Traceback (most recent call last): (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 870, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 921, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 877, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 881, in ray._raylet.execute_task (RayDPSparkMaster pid=56543) File "python/ray/_raylet.pyx", line 821, in ray._raylet.execute_task.function_executor (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/_private/function_manager.py", line 670, in actor_method_executor (RayDPSparkMaster pid=56543) return method(ray_actor, *args, *kwargs) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 460, in _resume_span (RayDPSparkMaster pid=56543) return method(self, _args, **_kwargs) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/raydp/spark/ray_cluster_master.py", line 56, in start_up (RayDPSparkMaster pid=56543) self._gateway.jvm.org.apache.spark.deploy.raydp.RayAppMaster.setProperties(jvm_properties) (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/py4j/java_gateway.py", line 1321, in call__ (RayDPSparkMaster pid=56543) return_value = get_return_value( (RayDPSparkMaster pid=56543) File "/usr/local/lib/python3.9/site-packages/py4j/protocol.py", line 326, in get_return_value (RayDPSparkMaster pid=56543) raise Py4JJavaError( (RayDPSparkMaster pid=56543) py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.deploy.raydp.RayAppMaster.setProperties. 
(RayDPSparkMaster pid=56543) : java.lang.NullPointerException (RayDPSparkMaster pid=56543) at java.util.Hashtable.put(Hashtable.java:460) (RayDPSparkMaster pid=56543) at java.util.Properties.setProperty(Properties.java:166) (RayDPSparkMaster pid=56543) at java.lang.System.setProperty(System.java:796) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster$.$anonfun$setProperties$1(RayAppMaster.scala:336) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashMap1.foreach(HashMap.scala:400) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:728) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster$.setProperties(RayAppMaster.scala:335) (RayDPSparkMaster pid=56543) at org.apache.spark.deploy.raydp.RayAppMaster.setProperties(RayAppMaster.scala) (RayDPSparkMaster pid=56543) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) (RayDPSparkMaster pid=56543) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) (RayDPSparkMaster pid=56543) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) (RayDPSparkMaster pid=56543) at java.lang.reflect.Method.invoke(Method.java:498) (RayDPSparkMaster pid=56543) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) (RayDPSparkMaster pid=56543) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) (RayDPSparkMaster pid=56543) at py4j.Gateway.invoke(Gateway.java:282) (RayDPSparkMaster pid=56543) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) (RayDPSparkMaster pid=56543) at py4j.commands.CallCommand.execute(CallCommand.java:79) (RayDPSparkMaster pid=56543) at 
py4j.GatewayConnection.run(GatewayConnection.java:238) (RayDPSparkMaster pid=56543) at java.lang.Thread.run(Thread.java:748) (RayDPSparkMaster pid=56543)

kira-lin commented 12 months ago

Hi @sydt2014, what versions of Ray and RayDP are you using? If you are using a Ray version later than 2.1.0, please use the RayDP nightly build. Install it via `pip install --pre -U raydp`. Sorry for the inconvenience; we'll make a release soon.