scrapfly / scrapfly-scrapers

Web scrapers for popular targets, powered by Scrapfly.io
https://scrapfly.io

crunchbase is responding with empty response body #17

Closed MHossain-dviz closed 7 months ago

MHossain-dviz commented 7 months ago

This is the code:

```python
import asyncio
import json
from typing import Dict, List, TypedDict

import httpx
from loguru import logger as log
from parsel import Selector

BASE_HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-language": "en-US,en;q=0.9",
    "accept-encoding": "gzip, deflate, br",
}


class CompanyData(TypedDict):
    """Type hint for data returned by Crunchbase company page parser"""

    organization: Dict
    employees: List[Dict]


def _parse_organization_data(data: Dict) -> Dict:
    """example that parses main company details from the whole company dataset"""
    properties = data["properties"]
    cards = data["cards"]
    parsed = {
        # there's metadata in the properties field:
        "name": properties["title"],
        "id": properties["identifier"]["permalink"],
        "logo": "https://res.cloudinary.com/crunchbase-production/image/upload/" + properties["identifier"]["image_id"],
        "description": properties["short_description"],
        # but most of the data is in the cards field:
        "semrush_global_rank": cards["semrush_summary"]["semrush_global_rank"],
        "semrush_visits_latest_month": cards["semrush_summary"]["semrush_visits_latest_month"],
        # etc... There's much more data!
    }
    return parsed


def _parse_employee_data(data: Dict) -> List[Dict]:
    """example that parses employee details from the whole employee dataset"""
    parsed = []
    for person in data["entities"]:
        parsed.append({
            "name": person["properties"]["name"],
            "linkedin": person["properties"].get("linkedin"),
            "job_levels": person["properties"].get("job_levels"),
            "job_departments": person["properties"].get("job_departments"),
            # etc...
        })
    return parsed


def _unescape_angular(text):
    """Helper function to unescape Angular quoted text"""
    ANGULAR_ESCAPE = {
        "&a;": "&",
        "&q;": '"',
        "&s;": "'",
        "&l;": "<",
        "&g;": ">",
    }
    for escaped, unescaped in ANGULAR_ESCAPE.items():
        text = text.replace(escaped, unescaped)
    return text


def parse_company(response) -> CompanyData:
    """parse company page for company and employee data"""
    sel = Selector(text=response.text)
    print("Select ", sel)

    # extract the embedded Angular app state data
    app_state_data = sel.css("script#client-app-state::text").get()
    print("app_state_data ", app_state_data)

    # check that app_state_data is not None before parsing it as JSON
    if app_state_data:
        app_state_data = _unescape_angular(app_state_data)
        app_state_data = json.loads(app_state_data)

        # continue with the rest of the parsing logic
        cache_keys = list(app_state_data["HttpState"])
        data_cache_key = next(key for key in cache_keys if "entities/organizations/" in key)
        people_cache_key = next(key for key in cache_keys if "/data/searches/contacts" in key)

        organization = app_state_data["HttpState"][data_cache_key]["data"]
        employees = app_state_data["HttpState"][people_cache_key]["data"]

        return {
            "organization": _parse_organization_data(organization),
            "employees": _parse_employee_data(employees),
        }
    else:
        # handle the case where app_state_data is None
        log.error("Failed to extract app_state_data from the response")
        return {"organization": {}, "employees": []}


async def scrape_company(session: httpx.AsyncClient) -> CompanyData:
    """scrape crunchbase company page for organization and employee data"""
    # note: we use the /people tab because it contains the most data:
    url = "https://www.crunchbase.com/organization/tesla-motors/people"
    await asyncio.sleep(5)
    response = await session.get(url)
    print("response ", response)
    return parse_company(response)
```

Append this to the previous code snippet to run it:

```python
async def run():
    async with httpx.AsyncClient(
        limits=httpx.Limits(max_connections=5),
        timeout=httpx.Timeout(15.0),
        headers=BASE_HEADERS,
        http2=True,
    ) as session:
        data = await scrape_company(session=session)
        print(json.dumps(data, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(run())
```
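
For reference, the unescaping step turns Angular's entity-encoded app state back into plain JSON before `json.loads` runs. A tiny illustration using the `_unescape_angular` helper from the snippet above (the input string is a made-up example, not real Crunchbase output):

```python
# hypothetical escaped snippet, for illustration only
sample = "{&q;HttpState&q;: {&q;GET/v4/data&q;: 1}}"
print(_unescape_angular(sample))
# prints: {"HttpState": {"GET/v4/data": 1}}
```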

And we are getting this response:

```
response  <Response [403 Forbidden]>
Select
app_state_data  None
2023-11-15 18:34:00.016 | ERROR | __main__:parse_company:100 - Failed to extract app_state_data from the response
{ "organization": {}, "employees": [] }
```
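
Note the `403 Forbidden` above: the request is being blocked before any parsing happens, so the missing `app_state_data` is a symptom rather than the cause. One way to surface the block immediately instead of handing the error page to the parser (a small sketch using httpx's built-in status check):

```python
response = await session.get(url)
# httpx does not raise on 4xx/5xx responses by default, so without this
# a block page would be passed to parse_company() silently
response.raise_for_status()
```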

MHossain-dviz commented 7 months ago

Solved this. They actually changed their script id name.
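
In other words, the hard-coded `script#client-app-state` selector no longer matches the tag Crunchbase uses. A sketch of a fallback that does not hard-code the id, assuming only that the app state `<script>` body still contains an `HttpState` key (the heuristic is an assumption, not confirmed against the live page; `find_app_state` is a hypothetical helper name):

```python
from parsel import Selector


def find_app_state(sel: Selector):
    """Locate the Angular app state script without relying on a fixed id."""
    # try the historical id first
    data = sel.css("script#client-app-state::text").get()
    if data:
        return data
    # fall back to scanning all script bodies for the HttpState key;
    # this fallback is an assumption about the page's current structure
    for script in sel.css("script::text").getall():
        if "HttpState" in script:
            return script
    return None
```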