Thanks, @laurasav!
Looks good, but we need some fine-tuning. I tested it, but ran into some issues:
It would be good to have an argument like `maxpages` or `maxstories` (as, e.g., in the tripadvisor-scraper) so that the user can limit the number of items to be scraped.
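A minimal sketch of what I have in mind (`_fetch_pages` and `_parse_page` are hypothetical helpers standing in for the scraper's own logic):

```python
def get(self, save=False, maxpages=None, **kwargs):
    """Yield scraped documents; stop after `maxpages` result pages if set."""
    for pagenr, page in enumerate(self._fetch_pages()):    # hypothetical helper
        if maxpages is not None and pagenr >= maxpages:
            break
        for doc in self._parse_page(page):                 # hypothetical helper
            yield doc
```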
I got an error that should be caught with a try/except construction (see the sketch after the traceback below):
```
Fetching article on index: 29
Fallback-method.
Returning a dictionary.
---------------------------------------------------------------------------
ConnectionRefusedError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/urllib3/connection.py in _new_conn(self)
    158 conn = connection.create_connection(
--> 159 (self._dns_host, self.port), self.timeout, **extra_kw)
    160
/usr/local/lib/python3.6/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     79 if err is not None:
---> 80 raise err
     81
/usr/local/lib/python3.6/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     69 sock.bind(source_address)
---> 70 sock.connect(sa)
     71 return sock
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

NewConnectionError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore)
    113
--> 114 response = self.pool.urlopen(method, url, body, retries=False, headers=self.headers, **kw)
    115 duration = time.time() - start
/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
    639 retries.sleep()
/usr/local/lib/python3.6/dist-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    342 # Disabled, indicate to re-raise the error.
--> 343 raise six.reraise(type(error), error, _stacktrace)
    344
/usr/local/lib/python3.6/dist-packages/urllib3/packages/six.py in reraise(tp, value, tb)
    685 raise value.with_traceback(tb)
--> 686 raise value
    687
/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    599 body=body, headers=headers,
--> 600 chunked=chunked)
    601
/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    353 else:
--> 354 conn.request(method, url, **httplib_request_kw)
    355
/usr/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238 """Send a complete request to the server."""
-> 1239 self._send_request(method, url, body, headers, encode_chunked)
   1240
/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284 body = _encode(body, 'body')
-> 1285 self.endheaders(body, encode_chunked=encode_chunked)
   1286
/usr/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233 raise CannotSendHeader()
-> 1234 self._send_output(message_body, encode_chunked=encode_chunked)
   1235
/usr/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025 del self._buffer[:]
-> 1026 self.send(msg)
   1027
/usr/lib/python3.6/http/client.py in send(self, data)
    963 if self.auto_open:
--> 964 self.connect()
    965 else:
/usr/local/lib/python3.6/dist-packages/urllib3/connection.py in connect(self)
    180 def connect(self):
--> 181 conn = self._new_conn()
    182 self._prepare_conn(conn)
/usr/local/lib/python3.6/dist-packages/urllib3/connection.py in _new_conn(self)
    167 raise NewConnectionError(
--> 168 self, "Failed to establish a new connection: %s" % e)
    169
NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f4762882b00>: Failed to establish a new connection: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

ConnectionError Traceback (most recent call last)
<ipython-input-4-46d44f9339eb> in <module>()
----> 1 myinca.scrapers.junknews_scraper()
~/inca/inca/__main__.py in endpoint(*args, **kwargs)
    255 else:
    256 def endpoint(*args, **kwargs):
--> 257 return method(*args, **kwargs)
    258 return endpoint
    259
~/inca/inca/core/document_class.py in runwrap(self, action, *args, **kwargs)
     38 '''
     39 if action == 'run':
---> 40 return self.run(*args, **kwargs)
     41
     42 if action == 'delay':
~/inca/inca/core/scraper_class.py in run(self, save, *args, **kwargs)
     77 if type(doc)==dict:
     78 doc = self._add_metadata(doc)
---> 79 self._save_document(doc)
     80 else:
     81 doc = self._add_metadata(doc)
~/inca/inca/core/document_class.py in _save_document(self, document, forced)
     78 custom_identifier = None
     79 self._verify(document)
---> 80 insert_document(document, custom_identifier=custom_identifier)
     81
     82 def _save_documents(self, documents, forced=False):
~/inca/inca/core/database.py in insert_document(document, custom_identifier)
    199 if not custom_identifier:
    200 try:
--> 201 doc = client.index(index=elastic_index, doc_type='doc', body=document.get('_source',document))
    202 except ConnectionTimeout:
    203 doc = {'_id':insert_document(document, custom_identifier)}
/usr/local/lib/python3.6/dist-packages/elasticsearch/client/utils.py in _wrapped(*args, **kwargs)
     71 if p in kwargs:
     72 params[p] = kwargs.pop(p)
---> 73 return func(*args, params=params, **kwargs)
     74 return _wrapped
     75 return _wrapper
/usr/local/lib/python3.6/dist-packages/elasticsearch/client/__init__.py in index(self, index, doc_type, body, id, params)
    296 raise ValueError("Empty value passed for a required argument.")
    297 return self.transport.perform_request('POST' if id in SKIP_IN_PATH else 'PUT',
--> 298 _make_path(index, doc_type, id), params=params, body=body)
    299
    300 @query_params('_source', '_source_exclude', '_source_include', 'parent',
/usr/local/lib/python3.6/dist-packages/elasticsearch/transport.py in perform_request(self, method, url, params, body)
    310
    311 try:
--> 312 status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
    313
    314 except TransportError as e:
/usr/local/lib/python3.6/dist-packages/elasticsearch/connection/http_urllib3.py in perform_request(self, method, url, params, body, timeout, ignore)
    121 if isinstance(e, ReadTimeoutError):
    122 raise ConnectionTimeout('TIMEOUT', str(e), e)
--> 123 raise ConnectionError('N/A', str(e), e)
    124
    125 # raise errors based on http status codes, let the client handle those if needed
ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7f4762882b00>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7f4762882b00>: Failed to establish a new connection: [Errno 111] Connection refused)
```
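For illustration, a rough sketch of the kind of guard I mean; the document list and index name are placeholders, but the `ConnectionError` is the one the elasticsearch package actually raises in the traceback above:

```python
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError as ESConnectionError

client = Elasticsearch()                          # assumes a reachable ES instance
documents = [{"url": "https://example.com/1"}]    # placeholder scraped documents

for nr, doc in enumerate(documents):
    try:
        client.index(index="inca", doc_type="doc", body=doc)
    except ESConnectionError as e:
        # Log and skip instead of letting one failed insert abort the whole run
        print("Could not store document {}: {}".format(nr, e))
        continue
```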
Also, in order to be merged, the scraper must run headless so that we can run it on the server.
Thanks for the changes! However, I cannot run it: the line `options.headless = True` causes an `AttributeError: can't set attribute` for me. I think we need to use the method `options.set_headless()` instead.
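For reference, a minimal sketch of the headless setup (assuming Firefox; the scraper's actual browser may differ):

```python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.set_headless()  # older Selenium API; newer releases support `options.headless = True`
# Note: very old Selenium versions take `firefox_options=` instead of `options=`
driver = webdriver.Firefox(options=options)
driver.get("https://example.com")  # placeholder URL
driver.quit()
```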
Another point:

- [ ] change the key `"website"` to `"url"`
Here's the scraper for the Junk News Aggregator site. All in all, the data looks good and clean!
The following functions might still need some fine-tuning; I left comments on them.
Also, I already took care of the issue described in the comment on the lifezette function.