asmaier / ImmoSpider

Immospider is a crawler for the Immoscout24 website.
183 stars 49 forks source link

feat/add-similar-objects #7

Open DefJM opened 4 years ago

DefJM commented 4 years ago

Hi @asmaier

I'd like to create a pull request to add "similar objects" in the parsing section of immoscout.py. Can you give me the rights so I can do so? Currently I don't seem to have permission.

Thanks a lot! Jan

# -*- coding: utf-8 -*-
import scrapy
import json
from immospider.items import ImmoscoutItem

class ImmoscoutSpider(scrapy.Spider):
    """Crawl Immoscout24 search result pages and yield one item per listing.

    The start URL is read from ``self.url``. This class never assigns that
    attribute, so it has to be supplied from outside, e.g. as a Scrapy spider
    argument (``scrapy crawl immoscout -a url=<search url>``).
    """

    name = "immoscout"
    allowed_domains = ["immobilienscout24.de"]
    # Example search URLs (kept for reference; the real URL comes from ``self.url``):
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin']
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin/Lichterfelde-Steglitz_Nikolassee-Zehlendorf_Dahlem-Zehlendorf_Zehlendorf-Zehlendorf/2,50-/60,00-/EURO--800,00/-/-/']

    # The immoscout search results are stored as json inside their javascript. This makes the parsing very easy.
    # I learned this trick from https://github.com/balzer82/immoscraper/blob/master/immoscraper.ipynb .
    script_xpath = './/script[contains(., "IS24.resultList")]'
    next_xpath = '//div[@id = "pager"]/div/a/@href'

    # (key in the listing JSON, field name on the item) for attributes that are
    # copied verbatim and only when the listing actually carries them.
    _OPTIONAL_FIELDS = (
        ("builtInKitchen", "kitchen"),
        ("balcony", "balcony"),
        ("garden", "garden"),
        ("privateOffer", "private"),
        ("plotArea", "area"),
        ("cellar", "cellar"),
    )

    def start_requests(self):
        """Yield the single initial request for the externally supplied ``self.url``."""
        yield scrapy.Request(self.url)

    @staticmethod
    def _as_list(value):
        """Return *value* unchanged if it is a list, otherwise wrap it in a one-element list.

        The result JSON stores a single entry as a plain dictionary instead of
        a one-element list, so callers normalise before iterating.
        """
        if isinstance(value, list):
            return value
        return [value]

    def parse(self, response):
        """Extract all listings from a search result page and follow the pager.

        :param response: the Scrapy response for one search result page
        :return: generator of ``ImmoscoutItem`` objects, followed by at most
            one ``scrapy.Request`` for the next result page
        """
        print(response.url)

        script = response.xpath(self.script_xpath).extract_first()
        if script is None:
            # Without the embedded script there is nothing to parse on this
            # page, but pagination below is still attempted.
            self.logger.warning("No result list script found on %s", response.url)
            script_lines = []
        else:
            script_lines = script.split('\n')

        for line in script_lines:
            stripped = line.strip()
            if not stripped.startswith('resultListModel'):
                continue

            # Drop the first 17 characters and the last one so that only the
            # JSON value remains (presumably the 'resultListModel: ' prefix and
            # a trailing comma -- verify against a live page).
            immo_json = json.loads(stripped[17:-1])

            entries = immo_json["searchResponseModel"]["resultlist.resultlist"]["resultlistEntries"][0]["resultlistEntry"]
            # On result pages with just a single result, resultlistEntry is
            # a dictionary rather than a list; normalise before iterating.
            for result in self._as_list(entries):
                yield self.parse_result(result, response)

                # check for and parse "similar objects" with additional matching results in json body
                if "similarObjects" in result:
                    similar_objects = result["similarObjects"][0]["similarObject"]
                    for similar in self._as_list(similar_objects):
                        yield self.parse_data_object(similar, response)

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            # The last pager link is taken to be the "next page" link.
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)

    def parse_result(self, result, response):
        """Build an item from one ``resultlistEntry`` of the search result JSON.

        :param result: a result entry; its ``"resultlist.realEstate"`` value
            holds the listing data
        :type result: dict
        :param response: the response the entry was found on (used to build
            absolute URLs)
        :return: the populated ``ImmoscoutItem``
        :raises KeyError: if the entry has no ``"resultlist.realEstate"`` key
            or the listing lacks a mandatory field
        """
        return self.parse_data_object(result["resultlist.realEstate"], response)

    def parse_data_object(self, data, response):
        """Build an item from a listing data dictionary.

        Used for the ``"resultlist.realEstate"`` payload of regular results and
        for the entries of ``similarObjects``.

        NOTE(review): the exact schema of a ``similarObject`` is not visible
        here; it is assumed to either carry the listing fields directly or to
        wrap them in ``"resultlist.realEstate"`` -- both shapes are accepted.

        :param data: listing data
        :type data: dict
        :param response: the response the listing was found on (used to build
            absolute URLs)
        :return: the populated ``ImmoscoutItem``
        :raises KeyError: if a mandatory field (id, title, address parts,
            price, living space, number of rooms) is missing
        """
        if "resultlist.realEstate" in data:
            data = data["resultlist.realEstate"]

        item = ImmoscoutItem()

        item["immo_id"] = data["@id"]
        item["url"] = response.urljoin("/expose/" + str(data["@id"]))
        item["title"] = data["title"]
        address = data["address"]
        try:
            item["address"] = address["street"] + " " + address["houseNumber"]
        except (KeyError, TypeError):
            # Street or house number not present (or not a string).
            item["address"] = None
        item["city"] = address["city"]
        item["zip_code"] = address["postcode"]
        item["district"] = address["quarter"]

        item["rent"] = data["price"]["value"]
        item["sqm"] = data["livingSpace"]
        item["rooms"] = data["numberOfRooms"]

        if "calculatedPrice" in data:
            item["extra_costs"] = (
                data["calculatedPrice"]["value"] - data["price"]["value"]
            )
        for source_key, item_field in self._OPTIONAL_FIELDS:
            if source_key in data:
                item[item_field] = data[source_key]

        try:
            contact = data["contactDetails"]
            item["contact_name"] = contact["firstname"] + " " + contact["lastname"]
        except (KeyError, TypeError):
            item["contact_name"] = None

        try:
            item["media_count"] = len(data["galleryAttachments"]["attachment"])
        except (KeyError, TypeError):
            item["media_count"] = 0

        try:
            item["lat"] = address["wgs84Coordinate"]["latitude"]
            item["lng"] = address["wgs84Coordinate"]["longitude"]
        except (KeyError, TypeError):
            # Keep the pair consistent: either both coordinates or neither.
            item["lat"] = None
            item["lng"] = None

        print(item)

        return item