Closed AlexaLempel closed 6 years ago
I don't really know my way around BeautifulSoup, but I modified your get_results method with an added "repost" field and that's been working pretty well for me to filter duplicates out on my own.
def get_results(self, limit=None, start=0, sort_by=None, geotagged=False):
    """
    Get results from Craigslist based on the specified filters.

    This is a generator: it requests result pages one at a time and yields
    one dict per listing, with the keys 'id', 'repost', 'name', 'url',
    'datetime', 'price', 'where', 'has_image', 'has_map' and 'geotag'.

    Arguments:
    limit -- maximum number of results to yield (None means no limit).
    start -- offset of the first result, sent as the 's' query filter.
    sort_by -- key into self.sort_by_options selecting the sort order.
    geotagged -- if True, the results will include the (lat, lng) in the
                 'geotag' attrib (this will make the process a little bit
                 longer).

    Raises ValueError if sort_by is not a key of self.sort_by_options, and
    whatever response.raise_for_status() raises on a failed request.
    """
    if sort_by:
        try:
            self.filters['sort'] = self.sort_by_options[sort_by]
        except KeyError:
            msg = ("'%s' is not a valid sort_by option, "
                   "use: 'newest', 'price_asc' or 'price_desc'" % sort_by)
            self.logger.error(msg)
            raise ValueError(msg)

    total_so_far = start
    results_yielded = 0
    total = 0

    while True:
        self.filters['s'] = start
        response = requests_get(self.url, params=self.filters,
                                logger=self.logger)
        self.logger.info('GET %s', response.url)
        self.logger.info('Response code: %s', response.status_code)
        response.raise_for_status()  # Something failed?

        soup = BeautifulSoup(response.content, 'html.parser')
        if not total:
            totalcount = soup.find('span', {'class': 'totalcount'})
            total = int(totalcount.text) if totalcount else 0

        for row in soup.find_all('p', {'class': 'result-info'}):
            if limit is not None and results_yielded >= limit:
                break
            self.logger.debug('Processing %s of %s results ...',
                              total_so_far + 1, total)

            # The 'data-repost-of' attribute, when present, is read from
            # the enclosing <li>. find_parent() returns None when there is
            # no such ancestor, so guard before touching .attrs.
            listing = row.find_parent('li')
            if listing is not None:
                repost = listing.attrs.get('data-repost-of')
            else:
                repost = None

            link = row.find('a', {'class': 'hdrlnk'})
            post_id = link.attrs['data-id']
            name = link.text
            url = urljoin(self.url, link.attrs['href'])

            time_tag = row.find('time')
            if time_tag:
                posted = time_tag.attrs['datetime']
            else:
                # Fall back to the text before the first ':' in the
                # 'pl' span when the row has no <time> element.
                pl = row.find('span', {'class': 'pl'})
                posted = pl.text.split(':')[0].strip() if pl else None

            price = row.find('span', {'class': 'result-price'})
            where = row.find('span', {'class': 'result-hood'})
            if where:
                where = where.text.strip()[1:-1]  # remove ()
            tags_span = row.find('span', {'class': 'result-tags'})
            tags = tags_span.text if tags_span else ''

            result = {'id': post_id,
                      'repost': repost,
                      'name': name,
                      'url': url,
                      'datetime': posted,
                      'price': price.text if price else None,
                      'where': where,
                      'has_image': 'pic' in tags,
                      # TODO: Look into this, looks like all show map now
                      'has_map': 'map' in tags,
                      'geotag': None}

            if self.custom_result_fields:
                self.customize_result(result, row)

            if geotagged and result['has_map']:
                self.geotag_result(result)

            yield result
            results_yielded += 1
            total_so_far += 1

        # Stop when the caller's limit is reached or when the last page
        # came back with fewer rows than a full page.
        if results_yielded == limit:
            break
        if (total_so_far - start) < RESULTS_PER_REQUEST:
            break
        start = total_so_far
Hey there @AlexaLempel. Thanks for the great suggestions! I've added the repost info in commit 273abda, and the bundle_duplicates filter in commit cc43988.
Thanks!
New version released (1.0.6), including these changes!
It would be nice to add the &bundleDuplicates=1 option to the list of base filters.