KianaC23 / Capstone-test-scripts


Crawler fixes #1

Open 10LikeTheNumber opened 3 years ago

10LikeTheNumber commented 3 years ago
import os
import requests
import csv
from bs4 import BeautifulSoup

GENRE_SITE_LIST = [
        "https://www.blackclassicmovies.com/movies-database/action/",
        "https://www.blackclassicmovies.com/movies-database/blaxploitation/",
        "https://www.blackclassicmovies.com/movies-database/comedy-movies/",
        "https://www.blackclassicmovies.com/movies-database/drama-movies-a-l/",
        "https://www.blackclassicmovies.com/movies-database/drama-movies-m-z/",
        "https://www.blackclassicmovies.com/movies-database/family-movies/",
        "https://www.blackclassicmovies.com/movies-database/romance-movies/",
        ]

def persist_html():
    # TODO: Write the fetched html to a file instead of re-crawling the web (one possible sketch follows the script)
    pass

def persist_results(url, results):
    name = url.split("/")[-2]
    path_to_script = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(path_to_script, f"{name}.csv")
    print(f"Writing results to {filename}")
    with open(filename, 'w', newline='') as f:  # newline='' avoids blank rows in the CSV on Windows
        writer = csv.writer(f)
        fields = ["Title", "Year", "Stars", "Director"]
        writer.writerow(fields)
        writer.writerows(results)

def crawl_site():
    for site in GENRE_SITE_LIST:
        results = crawl_movie_genre(site)
        persist_results(site, results)

def crawl_movie_genre(base_url):
    # Crawl the initial genre page
    result_list = []
    results, has_next = parse_page(base_url)
    if results:
        result_list.extend(results)

    index = 2 # pagination starts on the second page
    while has_next and index < 100: # backup exit condition so we never loop forever
        url = f"{base_url.rstrip('/')}/page/{index}" # strip the trailing slash so the paged URL has no double "//"
        results, has_next = parse_page(url) # parse the page and check whether another page follows
        if results:
            result_list.extend(results)
        index = index + 1

    return result_list

def parse_page(url):
    print(f"Parsing url {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
    }
    results = []
    has_next = False
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code == 200:
        print(f"Successfully fetched {url}")
        html_text = r.text
        soup = BeautifulSoup(html_text, 'html.parser')
        mydivs = soup.find_all("div", {"class": "film-data-box"})
        print(f"Found {len(mydivs)} results")
        for div in mydivs:
            title = div.find_all("h3")[0].text.strip()
            # every other <td> holds a value cell: Year, Stars, Director
            year, stars, director = [res.text.strip() for res in div.find_all("td")[1::2]]
            print(f"Appending {title, year, stars, director} to results list")
            results.append((title, year, stars, director))

        has_next = len(soup.find_all("a", {"class": "next page-numbers"})) >= 1
    print(results)
    return (results, has_next)

if __name__ == "__main__":
    crawl_site()
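
For the persist_html TODO above, here is a minimal sketch of one way to cache the raw HTML on disk so later runs can re-parse it without hitting the site again. The cache_dir argument and the filename scheme are assumptions for illustration, not something the current script defines:

import os
from urllib.parse import urlparse

def persist_html(url, html_text, cache_dir="html_cache"):
    # Assumed helper: save the raw HTML next to the script so a later run can re-parse it offline.
    path_to_script = os.path.dirname(os.path.abspath(__file__))
    cache_path = os.path.join(path_to_script, cache_dir)
    os.makedirs(cache_path, exist_ok=True)
    # Flatten the URL path into a filename, e.g. "movies-database_action_page_2.html"
    name = urlparse(url).path.strip("/").replace("/", "_") or "index"
    filename = os.path.join(cache_path, f"{name}.html")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_text)
    return filename

parse_page could then call persist_html(url, html_text) right after a successful fetch, and a later offline pass could read those cached files instead of re-requesting each page.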
10LikeTheNumber commented 3 years ago

Didn't want to set up key management, so leaving a comment, but this should do the trick.