This is the code:
import json
from typing import Dict, List, TypedDict
from datetime import datetime
from typing import Iterator, List, Literal, Tuple
import httpx
from loguru import logger as log
from parsel import Selector
import asyncio
# Default request headers that mimic a regular Chrome browser
# (Crunchbase serves different/blocked content to obvious bot traffic).
BASE_HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    # BUG FIX: the original dict declared "accept-language" twice; the second
    # occurrence ("en-US;en;q=0.9") was malformed (semicolon instead of comma)
    # and silently overrode the first. One correct entry kept.
    "accept-language": "en-US,en;q=0.9",
    # BUG FIX: restored "*/*;q=0.8" — the asterisks were eaten by markdown
    # rendering in the paste ("/;q=0.8").
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
}
class CompanyData(TypedDict):
    """Result shape produced by the Crunchbase company page parser."""

    organization: Dict    # parsed company details (see _parse_organization_data)
    employees: List[Dict]  # parsed employee records (see _parse_employee_data)
def _parse_organization_data(data: Dict) -> Dict:
"""example that parses main company details from the whole company dataset"""
properties = data['properties']
cards = data['cards']
parsed = {
theres meta data in the properties field:
"name": properties['title'],
"id": properties['identifier']['permalink'],
"logo": "https://res.cloudinary.com/crunchbase-production/image/upload/" + properties['identifier']['image_id'],
"description": properties['short_description'],
# but most of the data is in the cards field:
"semrush_global_rank": cards['semrush_summary']['semrush_global_rank'],
"semrush_visits_latest_month": cards['semrush_summary']['semrush_visits_latest_month'],
# etc... There's much more data!
}
return parsed
def _parse_employee_data(data: Dict) -> List[Dict]:
"""example that parses employee details from the whole employee dataset"""
parsed = []
for person in data['entities']:
parsed.append({
"name": person['properties']['name'],
"linkedin": person['properties'].get('linkedin'),
"job_levels": person['properties'].get('job_levels'),
"job_departments": person['properties'].get('job_departments'),
etc...
})
return parsed
def _unescape_angular(text):
"""Helper function to unescape Angular quoted text"""
ANGULARESCAPE = {
"&a;": "&",
"&q;": '"',
"&s;": "'",
"&l;": "<",
"&g;": ">",
}
for from, to in ANGULARESCAPE.items():
text = text.replace(from, to) if text else ""
return text
def parse_company(response) -> CompanyData:
    """Parse a Crunchbase company page for company and employee data.

    Args:
        response: httpx response object whose .text is the company page HTML.

    Returns:
        CompanyData dict; empty organization/employees when the embedded app
        state cannot be found (e.g. the request was blocked — see the captured
        output below, where app_state_data came back None).
    """
    sel = Selector(text=response.text)
    # Crunchbase embeds its Angular app state as escaped JSON in a script tag.
    app_state_data = sel.css("script#client-app-state::text").get()
    # Guard clause: bail out early when the state blob is missing.
    # BUG FIX: removed the leftover print() debug statements (their output is
    # visible leaking into the captured run at the bottom of this paste);
    # failures are reported through the logger instead.
    if not app_state_data:
        log.error("Failed to extract app_state_data from the response")
        return {"organization": {}, "employees": []}
    app_state_data = json.loads(_unescape_angular(app_state_data))
    # HttpState maps cached request URLs -> responses; locate the organization
    # and employee-search entries by their URL fragments.
    cache_keys = list(app_state_data["HttpState"])
    data_cache_key = next(key for key in cache_keys if "entities/organizations/" in key)
    people_cache_key = next(key for key in cache_keys if "/data/searches/contacts" in key)
    organization = app_state_data["HttpState"][data_cache_key]["data"]
    employees = app_state_data["HttpState"][people_cache_key]["data"]
    return {
        "organization": _parse_organization_data(organization),
        "employees": _parse_employee_data(employees),
    }
async def scrape_company(session: httpx.AsyncClient) -> CompanyData:
"""scrape crunchbase company page for organization and employee data"""
    # note: we use the /people tab because it contains the most data
This is the code : `import json from typing import Dict, List, TypedDict from datetime import datetime from typing import Iterator, List, Literal, Tuple
import httpx from loguru import logger as log from parsel import Selector import asyncio BASE_HEADERS = { "accept-language": "en-US,en;q=0.9", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8", "accept-language": "en-US;en;q=0.9", "accept-encoding": "gzip, deflate, br", }
class CompanyData(TypedDict): """Type hint for data returned by Crunchbase company page parser"""
def _parse_organization_data(data: Dict) -> Dict: """example that parses main company details from the whole company dataset""" properties = data['properties'] cards = data['cards'] parsed = {
theres meta data in the properties field:
def _parse_employee_data(data: Dict) -> List[Dict]: """example that parses employee details from the whole employee dataset""" parsed = [] for person in data['entities']: parsed.append({ "name": person['properties']['name'], "linkedin": person['properties'].get('linkedin'), "job_levels": person['properties'].get('job_levels'), "job_departments": person['properties'].get('job_departments'),
etc...
def _unescape_angular(text): """Helper function to unescape Angular quoted text""" ANGULARESCAPE = { "&a;": "&", "&q;": '"', "&s;": "'", "&l;": "<", "&g;": ">", } for from, to in ANGULARESCAPE.items(): text = text.replace(from, to) if text else "" return text
def parse_company(response) -> CompanyData: """parse company page for company and employee data"""
async def scrape_company(session: httpx.AsyncClient) -> CompanyData: """scrape crunchbase company page for organization and employee data"""
note: we use /people tab because it contains the most data:
append this to the previous code snippet to run it:
async def run():
    """Example runner: scrape one company and print the result as JSON."""
    async with httpx.AsyncClient(
        # limit concurrency and enable http2 to look more like a browser
        limits=httpx.Limits(max_connections=5),
        timeout=httpx.Timeout(15.0),
        headers=BASE_HEADERS,
        http2=True,
    ) as session:
        data = await scrape_company(session=session)
        print(json.dumps(data, indent=2, ensure_ascii=False))


# BUG FIX: the original read `if name == "main"` — the markdown paste stripped
# the double underscores from the dunder names.
if __name__ == "__main__":
    asyncio.run(run())
Select
app_state_data None
2023-11-15 18:34:00.016 | ERROR | main:parse_company:100 - Failed to extract app_state_data from the response
{
"organization": {},
"employees": []
}