austinoboyle / scrape-linkedin-selenium

`scrape_linkedin` is a python package that allows you to scrape personal LinkedIn profiles & company pages - turning the data into structured json.
MIT License
449 stars 162 forks source link

Scrape in parallel #76

Closed Benhoro closed 3 years ago

Benhoro commented 3 years ago

I got the error below when I tried to run the following code:

```python
from scrape_linkedin import scrape_in_parallel, CompanyScraper

companies = ['facebook', 'google', 'amazon', 'microsoft']

# Scrape all companies, output to 'companies.json' file, use 4 browser instances
scrape_in_parallel(
    scraper_type=CompanyScraper(cookie='AQEDAQS9deUF3aCgAAABc3XQzd4AAAFzmd1R3lYA0tP3bkRPMfs9CnXLRduXshYHDto8gGFV4BMhzRvRdMiuQ1HVCTQ7isAQmOYX3uUnFh1RxGmUSDWCSLH9VAh03SvukDj6JJh98by1F9PMf6gIHvj5', timeout=100),
    items=companies,
    output_file="companies.json",
    num_instances=4
)
```

_RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "C:\Users\HORO BEN\anaconda3\lib\site-packages\joblib\externals\loky\backend\queues.py", line 150, in feed obj = dumps(obj, reducers=reducers) File "C:\Users\HORO BEN\anaconda3\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 247, in dumps dump(obj, buf, reducers=reducers, protocol=protocol) File "C:\Users\HORO BEN\anaconda3\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 240, in dump _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj) File "C:\Users\HORO BEN\anaconda3\lib\site-packages\joblib\externals\cloudpickle\cloudpickle.py", line 482, in dump return Pickler.dump(self, obj) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 437, in dump self.save(obj) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 890, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in 
save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 819, in save_list self._batch_appends(obj) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 846, in _batch_appends save(tmp[0]) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 774, in save_tuple save(element) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File 
"C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 549, in save self.save_reduce(obj=obj, *rv) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 662, in save_reduce save(state) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 504, in save f(self, obj) # Call unbound method with explicit self File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 859, in save_dict self._batch_setitems(obj.items()) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 885, in _batch_setitems save(v) File "C:\Users\HORO BEN\anaconda3\lib\pickle.py", line 524, in save rv = reduce(self.proto) TypeError: can't pickle _thread.lock objects """

The above exception was the direct cause of the following exception:

PicklingError Traceback (most recent call last)

in 8 items=companies, 9 output_file="companies.json", ---> 10 num_instances=4 11 ) ~\anaconda3\lib\site-packages\scrape_linkedin\ParallelScraper.py in scrape_in_parallel(scraper_type, items, output_file, num_instances, temp_dir, driver, driver_options, **kwargs) 31 driver_options=driver_options, 32 **kwargs ---> 33 ) for i in range(num_instances)) 34 35 all_data = {} ~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable) 1015 1016 with self._backend.retrieval_context(): -> 1017 self.retrieve() 1018 # Make sure that we get a last message telling us we are done 1019 elapsed_time = time.time() - self._start_time ~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self) 907 try: 908 if getattr(self._backend, 'supports_timeout', False): --> 909 self._output.extend(job.get(timeout=self.timeout)) 910 else: 911 self._output.extend(job.get()) ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout) 560 AsyncResults.get from multiprocessing.""" 561 try: --> 562 return future.result(timeout=timeout) 563 except LokyTimeoutError: 564 raise TimeoutError() ~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout) 433 raise CancelledError() 434 elif self._state == FINISHED: --> 435 return self.__get_result() 436 else: 437 raise TimeoutError() ~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self) 382 def __get_result(self): 383 if self._exception: --> 384 raise self._exception 385 else: 386 return self._result PicklingError: Could not pickle the task to send it to the workers.
austinoboyle commented 3 years ago

The issue here could be that you don't have enough space on disk to store the results. What available disk space are you working with?

Also, note that there is currently a known issue with the company scraper (#78), so even if you solve the disk space issue, you will likely run into that problem as well.

austinoboyle commented 3 years ago

Closing for lack of response. Feel free to re-open if this is still a problem.