0x776b7364 / toggle.sg-download

Python script to automate Toggle.sg video downloads
MIT License
26 stars 18 forks source link

Proposed fixed code #49

Closed freemanang1989 closed 6 years ago

freemanang1989 commented 6 years ago
#!/usr/bin/env python

import re
import json
import os
import time
import sys
import argparse
import threading
import random
import logging

try:
    import Queue
except ImportError:
    import queue as Queue

try:
    import urllib2 as urllib_request
except ImportError:
    import urllib.request as urllib_request

########## START USER CONFIGURATION ##########

# enabled by default
# disable by using the argument --no-autodl
# for videos, this would auto-select the best quality file
# for episodes, this would auto-select all episodes in the series
AUTO_DOWNLOAD = 1

# enabled by default
# disable by using the argument --no-subs
# if enabled, script will check and download video subtitles if present
CHECK_AND_DOWNLOAD_SUBTITLES = 1

# preferred order of file formats to download
# highest preference is first
# in some cases, mp4 is the only downloadable file even though
#   m3u8 is in the URL list
# each entry is (rank, quality tag found in the URL, file extension)
FILE_PREFERENCES =  [(1,'STB','m3u8'),  # generally 720p, Set-top Box, requires ffmpeg
            (2,'hlstv_hd','m3u8'),
            (3,'web_hd','m3u8'),
            (4,'ADD','mp4'),    # generally 540p, Android device
            (5,'IPAD','m3u8'),  # generally 540p, iPad, requires ffmpeg
            (6,'tablet_hd','m3u8'),
            (7,'IPH','m3u8'),   # generally 360p, iPhone, requires ffmpeg
            (8,'mobile_hd','m3u8'),
            (9,'hlstv_sd','m3u8'),
            (10,'web_sd','m3u8'),
            (11,'tablet_sd','m3u8'),
            (12,'mobile_sd','m3u8')]

# only download direct-accessible files i.e. ignore streaming files
#FILE_PREFERENCES = [(1,'ADD','mp4')]

########## END USER CONFIGURATION ##########

# sample (m3u8 and mp4) links
#url = "http://video.toggle.sg/en/series/sabo/ep12/327339"
#url = "http://video.toggle.sg/en/series/118-catch-up/ep126/328542"
#url = "http://video.toggle.sg/zh/series/118-catch-up/webisodes/document/330134"
#url = "http://video.toggle.sg/en/series/double-bonus/ep23/279063"

# sample wvm link
#url = "http://video.toggle.sg/en/series/marvel-s-agents-of-s-h-i-e-l-d-yr-2/ep6/327671"

# sample episode link
#url = "http://tv.toggle.sg/en/channel8/shows/the-dream-job-tif/episodes"

# regex patterns used to recognise Toggle URLs and scrape page content
# VALID_VIDEO_URL captures the numeric media ID as group 'id'
VALID_VIDEO_URL = r"https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies|tv-show)/.+?/(?P<id>[0-9]+)"
# credentials embedded in the video page's javascript
API_USER_PASS_EXPR = r'apiUser:\s*"(?P<user>[^"]+?)".+?apiPass:\s*"(?P<password>[^"]+?)"'
VALID_EPISODES_URL = r"http?://tv\.toggle\.sg/(?:en|zh)/.+?/episodes"
# content/navigation IDs used to build the episode-pagination request
CONTENT_NAVIGATION_EXPR = r'10, 0,  (?P<content_id>[0-9]+), (?P<navigation_id>[0-9]+), isCatchup'
EPISODE_TITLE_EXPR = r'<title>([\s\S]*?)</title>'
URL_TITLE_EXPR = r'<h4.+?href="([\s\S]*?)">([\s\S]*?)</a>'
# quality tags that may appear in a downloadable file URL
FORMAT_EXPR = r'(?:STB|IPH|IPAD|ADD|mobile_hd|web_hd|hlstv_hd|tablet_hd|mobile_sd|web_sd|hlstv_sd|tablet_sd)'

# recognised input URL categories: single video page vs. episode listing
URL_CATEGORY = ['t_video','t_episodes']

# shared work queue consumed by the Downloader threads
MAIN_DOWNLOAD_QUEUE = Queue.Queue()

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'

# logging attributes
logger = logging.getLogger('download_toggle')
formatter = logging.Formatter('[%(levelname).1s] %(message)s')

## console logging
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)

# Page: http://www.blog.pythonlibrary.org/2012/08/01/python-concurrency-an-example-of-a-queue/
# Author: Mike Driscoll
class Downloader(threading.Thread):
    """Worker thread that pulls (name, url) records off 'queue' and downloads them."""

    def __init__(self, queue):
        # bytes.encode('hex') is Python-2-only; build the 8-char hex thread
        # name portably from 4 random bytes instead.
        hex_name = ''.join('%02x' % b for b in bytearray(os.urandom(4)))
        threading.Thread.__init__(self, name=hex_name)
        self.queue = queue

    def download_file(self, record):
        """Download a single (name, url) record.

        m3u8 URLs are handed to ffmpeg (must be on PATH); mp4/wvm/srt URLs
        are fetched directly over HTTP; anything else is logged and skipped.
        """
        name = record[0]
        url = record[1]

        if (url.lower().endswith("m3u8")):
            logger.debug("Crafting ffmpeg command ...")
            # NOTE(review): the command is built by string concatenation and
            # executed through the shell; 'url' and 'name' are not escaped.
            ffmpeg_download_cmd = "ffmpeg -user-agent \"" + USER_AGENT + "\" -hide_banner -loglevel info -i " + url + " -c copy -bsf:a aac_adtstoasc \"" + name + ".mp4\""
            logger.debug(ffmpeg_download_cmd)
            logger.debug("Executing ffmpeg command ...")
            try:
                download_return_val = os.system(ffmpeg_download_cmd)
            except (KeyboardInterrupt):
                logger.error("Received KeyboardInterrupt. Quitting ...")
                sys.exit(0)

            if (download_return_val == 0):
                logger.info("" + name + ".mp4 file created!")
            else:
                logger.error("ffmpeg file not found, or existing file is for incorrect architecture, or download was interrupted prematurely.")
        elif (url.lower().endswith("mp4") or url.lower().endswith("wvm") or url.lower().endswith("srt")):
            # Page: http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
            # Author: PabloG
            file_name = url.split('/')[-1]
            u = urllib_request.urlopen(url)
            meta = u.info()
            # meta.getheaders() was Python-2-only; .get() works on both the
            # Python 2 mimetools.Message and Python 3 email.message.Message.
            file_size = int(meta.get("Content-Length") or 0)
            print("Downloading: %s Bytes: %s" % (file_name, file_size))

            file_size_dl = 0
            block_sz = 8192
            print("Thread\t\tDownloaded\tPercentage")
            print("-------------------------------")
            with open(file_name, 'wb') as f:
                while True:
                    buffer = u.read(block_sz)
                    if not buffer:
                        break

                    file_size_dl += len(buffer)
                    f.write(buffer)
                    # print progress only occasionally (~0.5% of chunks) so
                    # concurrent threads do not flood the console; skip the
                    # percentage when the server sent no Content-Length
                    if (random.randint(-100,100) == 0 and file_size > 0):
                        status = r"%s   %10d    [%3.2f%%]" % (self.name, file_size_dl, file_size_dl * 100. / file_size)
                        status = status + chr(8)*(len(status)+1)
                        print(status)
            u.close()
        else:
            logger.error("Unhandled file extension: " + url)

        logger.info("Thread %s completed" % (self.name))

    def run(self):
        """Consume records from the queue forever, marking each as done."""
        while True:
            record = self.queue.get()
            logger.info("Thread %s: processing URL %s" % (self.name, record[1]))
            self.download_file(record)
            self.queue.task_done()

def print_script_header():
    """Print the script's banner to stdout."""
    border = "+++++++++++++++++++++++++++++++++++++"
    print("\n" + border)
    print("Toggle video and episodes downloader")
    print(border)

def process_url(url):
    """
    Returns a list of translated URLs to be enqueued and downloaded
    """
    # dispatch on the detected URL category; unknown categories fall through
    handlers = {
        't_video': process_video_url,
        't_episodes': process_episodes_url,
    }
    handler = handlers.get(get_url_category(url))
    if handler is None:
        logger.error("Error: %s is not a valid URL" % (url))
        return []
    return handler(url)

def get_url_category(url):
    """
    Returns the matching URL_CATEGORY of 'url', or None if it matches neither
    """
    # check each known pattern in order; first match wins
    pattern_to_category = (
        (VALID_VIDEO_URL, URL_CATEGORY[0]),
        (VALID_EPISODES_URL, URL_CATEGORY[1]),
    )
    for pattern, category in pattern_to_category:
        if re.match(pattern, url):
            return category
    return None

def process_video_url(t_video_url):
    """
    Returns a list of translated (name, url) tuples from a video URL, else
    returns None if the API credentials cannot be scraped from the page
    """

    queued_urls = []

    logger.info("Toggle video %s detected" % (t_video_url))

    # the numeric media ID is the final path component of the video URL
    mediaID = re.match(VALID_VIDEO_URL, t_video_url).group('id')
    logger.debug("Obtained mediaID = %s" % (mediaID))

    logger.debug("Performing HTTP GET request on Toggle video URL ...")
    t_video_url_req = urllib_request.Request(t_video_url)
    t_video_url_req.add_header('User-Agent', USER_AGENT)
    # decode to text so the str-pattern regexes below work on both Python 2
    # and Python 3 (urlopen().read() returns bytes on Python 3)
    t_video_url_resp = urllib_request.urlopen(t_video_url_req).read().decode('utf-8', 'replace')

    if (logger.isEnabledFor(logging.DEBUG)):
        text_file = open("v1.t_video_url_resp.txt", "w")
        text_file.write("{}".format(t_video_url_resp))
        text_file.close()

    # scrape the tvinci API credentials embedded in the page's javascript
    apiUserPassRegex = re.search(API_USER_PASS_EXPR, t_video_url_resp, flags=re.DOTALL|re.MULTILINE)
    if apiUserPassRegex:
        # the groups are already text; the former .decode("utf-8") calls
        # were Python-2-only and raise AttributeError on Python 3
        apiUserValue = apiUserPassRegex.group("user")
        apiPassValue = apiUserPassRegex.group('password')
        logger.debug("Obtained apiUser = %s" % (apiUserValue))
        logger.debug("Obtained apiPass = %s" % (apiPassValue))
    else:
        logger.warning("Unable to obtain api user / password")
        return None

    # request body for the tvinci GetMediaInfo gateway call
    download_url_params = {
        "initObj": {
            "Locale": {
                "LocaleLanguage": "", "LocaleCountry": "",
                "LocaleDevice": "", "LocaleUserState": 0
            },
            "Platform": 0, "SiteGuid": 0, "DomainID": "0", "UDID": "",
            "ApiUser": apiUserValue, "ApiPass": apiPassValue
        },
        "MediaID": mediaID,
        "mediaType": 0,
    }

    logger.debug("Performing HTTP GET request on download URL ...")
    download_url_req_url = "http://tvpapi.as.tvinci.com/v3_9/gateways/jsonpostgw.aspx?m=GetMediaInfo"
    download_url_req_params = json.dumps(download_url_params).encode("utf-8")
    download_url_resp = urllib_request.urlopen(download_url_req_url, download_url_req_params).read()

    if (logger.isEnabledFor(logging.DEBUG)):
        text_file = open("v2.download_url_resp.txt", "w")
        text_file.write("{}".format(download_url_resp))
        text_file.close()

    logger.debug("Performing JSON parsing ...")
    download_url_resp_json = json.loads(download_url_resp)

    if (logger.isEnabledFor(logging.DEBUG)):
        text_file = open("v3.download_url_resp_json.txt", "w")
        text_file.write("{}".format(json.dumps(download_url_resp_json,indent=4)))
        text_file.close()

    logger.debug("Obtaining media name ...")
    # normalise the media name into a filesystem-safe ASCII identifier
    medianame = re.sub(r"\s+", "_", download_url_resp_json.get("MediaName", "UNKNOWN"))
    medianame = re.sub('[^a-zA-Z0-9-]', '_', medianame)
    try:
        # the former .decode('unicode-escape') was Python-2-only; medianame
        # is already ASCII-safe after the substitutions above
        logger.info("Obtained media name = %s" % (medianame))
    except UnicodeEncodeError:
        # fall back to the numeric ID if the console cannot render the name
        medianame = mediaID
        logger.info("Unicode title encountered. New media name = %s" % (medianame))

    logger.debug("Obtaining URL records from download URL response ...")
    temp_urlList = []
    for fileInfo in download_url_resp_json.get('Files', []):
        urlRecord = fileInfo.get('URL')
        logger.debug("Examining urlRecord %s ...", urlRecord)
        for ext in ["m3u8", "wvm", "mp4"]:
            if urlRecord.startswith('http') and urlRecord.endswith(ext):
                fileformat = re.findall(FORMAT_EXPR, urlRecord, flags=re.DOTALL|re.MULTILINE)
                if fileformat:
                    logger.debug("Appending urlRecord %s to temp_urlList ...", urlRecord)
                    temp_urlList.append((medianame+"_"+fileformat[0],urlRecord))

    # the auto-download function chooses only one URL based on the ranking in FILE_PREFERENCES
    if (AUTO_DOWNLOAD):
        temp_queue1 = Queue.Queue()

        # enqueue candidates in preference order; the first one out wins
        # ('fmt' avoids shadowing the builtin 'format')
        for priority, quality, fmt in FILE_PREFERENCES:
            for url in temp_urlList:
                if re.search(quality, url[1]) and re.search(fmt, url[1]):
                    temp_queue1.put(url)
                    logger.debug("Inserted into temporary queue: %s" % (url[1]))

        if temp_queue1.empty():
            logger.error("No files selected based on FILE_PREFERENCES")
            logger.error("Consider relaxing preference criteria, or setting '--no-autodl'")
        else:
            autoSelectedUrl = temp_queue1.get()
            queued_urls.append(autoSelectedUrl)
            logger.info("Auto-selected URL: %s" % (autoSelectedUrl[1]))
    else:
        logger.debug("Entering video selection function ...")
        queued_urls = user_select_options(temp_urlList)

    logger.debug("Obtaining media duration ...")
    mediaduration = download_url_resp_json.get("Duration") or 0
    logger.debug("Obtained media duration = %s" % (time.strftime("%H hrs %M mins %S secs", time.gmtime(float(mediaduration)))))

    if (CHECK_AND_DOWNLOAD_SUBTITLES):
        logger.debug("Performing HTTP GET request to check for subtitles ...")
        subtitle_link = "https://sub.toggle.sg/toggle_api/v1.0/apiService/getSubtitleFilesForMedia?mediaId=" + mediaID 
        subtitle_link_resp = urllib_request.urlopen(subtitle_link).read()
        logger.debug("Performing JSON parsing ...")
        subtitle_link_resp_json = json.loads(subtitle_link_resp)
        if not subtitle_link_resp_json.get('subtitleFiles', []):
            logger.warning("No subtitles found!")
        for sfile in subtitle_link_resp_json.get('subtitleFiles', []):
            logger.info("Found " + sfile.get('subtitleFileLanguage') + " subtitles! Adding " + sfile.get('subtitleFileUrl') + " to queue list ...")
            queued_urls.append(("Subtitles for "+mediaID,sfile.get('subtitleFileUrl')))

    return queued_urls

def process_episodes_url(t_episodes_url):
    """
    Returns a list of translated (name, url) tuples from an episodes URL,
    else returns None if errors are encountered
    """

    queued_urls = []

    logger.info("Toggle episodes %s detected" % (t_episodes_url))

    logger.debug("Performing HTTP GET request on Toggle episodes URL ...")
    t_episodes_url_req = urllib_request.Request(t_episodes_url)
    t_episodes_url_req.add_header('User-Agent', USER_AGENT)
    # decode to text so the str-pattern regexes below work on both Python 2
    # and Python 3 (urlopen().read() returns bytes on Python 3)
    t_episodes_url_resp = urllib_request.urlopen(t_episodes_url_req).read().decode('utf-8', 'replace')

    contentNavigationRegex = re.search(CONTENT_NAVIGATION_EXPR, t_episodes_url_resp, flags=re.DOTALL|re.MULTILINE)
    # guard the match: the original code called .group() unconditionally and
    # raised AttributeError when the pattern was not found
    if not contentNavigationRegex:
        logger.warning("Unable to obtain content/navigation IDs")
        return None
    contentid = contentNavigationRegex.group("content_id")
    navigationid = contentNavigationRegex.group("navigation_id")

    logger.debug("Obtained content_id = %s" % (contentid))
    logger.debug("Obtained navigation_id = %s" % (navigationid))

    # bail out unless BOTH IDs were captured (the original 'or' test could
    # never trigger because both groups match one-or-more digits)
    if not (contentid and navigationid):
        return None

    # quick and dirty regex: group(1) is the text between the <title> tags,
    # so no fixed-offset slicing of the tags is needed
    episodeTitleRegex = re.search(EPISODE_TITLE_EXPR, t_episodes_url_resp, flags=re.DOTALL|re.MULTILINE)
    # strip non-ASCII portably (str.decode('unicode_escape') was
    # Python-2-only), then collapse whitespace runs into underscores
    seriesTitle = episodeTitleRegex.group(1).encode('ascii', 'ignore').decode('ascii')
    seriesTitle = " ".join(seriesTitle.split())
    seriesTitle = re.sub(r"\s+", "_", seriesTitle)
    logger.info("Series title = %s" % (seriesTitle))

    episodeListUrl = 'http://tv.toggle.sg/en/blueprint/servlet/toggle/paginate?pageSize=99&pageIndex=0&contentId=' + contentid + '&navigationId=' + navigationid + '&isCatchup=1'
    logger.debug("Performing HTTP GET request on Toggle blueprint URL:")
    logger.debug(episodeListUrl)

    episodeList_req = urllib_request.Request(episodeListUrl)
    episodeList_req.add_header('User-Agent', USER_AGENT)
    episodeListResp = urllib_request.urlopen(episodeList_req).read().decode('utf-8', 'replace')

    if (logger.isEnabledFor(logging.DEBUG)):
        text_file = open("e1.episodeListUrl.txt", "w")
        text_file.write("{}".format(episodeListResp))
        text_file.close()

    logger.debug("Parsing blueprint URL output ...")
    urlTitleRegex = re.findall(URL_TITLE_EXPR, episodeListResp, flags=re.DOTALL|re.MULTILINE)

    # build (title, url) tuples, oldest episode first
    episodes_list = []
    for record in reversed(urlTitleRegex):
        episodes_list.append((" ".join(record[1].split()),record[0]))

    # the auto-download function chooses all episodes in the series
    if (AUTO_DOWNLOAD):
        logger.info("Auto-selecting all episodes ...")
        episodes_list_selected = episodes_list
    else:
        logger.debug("Entering episode selection function ...")
        episodes_list_selected = user_select_options(episodes_list)

    logger.debug("Processing selected episodes ...")
    for episode in episodes_list_selected:
        # process_video_url may return None (credentials not found); the
        # original would raise TypeError iterating it
        for record in (process_video_url(episode[1]) or []):
            queued_urls.append(record)

    logger.debug("Completed episodes processing!")
    return queued_urls

def user_select_options(recordsList):
    """
    Returns a list of user-selected names and URLs from 'recordsList'
    recordsList is a list of (title,url) tuples
    """
    # raw_input() was renamed to input() in Python 3; the original call to
    # raw_input() raised NameError there
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    user_selected_records = []

    print("")
    for cnt, record in enumerate(recordsList, start=1):
        print("[%s]: %s" % (cnt, record[0]))

    # keep prompting until at least one valid selection is accepted
    is_invalid_selection = True
    while (is_invalid_selection):
        user_selection_input_list = list(set(read_input('\nEnter selection (delimit multiple selections with space, 0 to select all): ').split()))

        for selection in user_selection_input_list:
            try:
                # out-of-range selections are treated like non-numeric input
                if int(selection) > len(recordsList) or int(selection) < 0:
                    raise ValueError
                if int(selection) == 0:
                    # 0 means "select everything"
                    user_selected_records = recordsList
                else:
                    user_selected_records.append(recordsList[int(selection)-1])
                is_invalid_selection = False
            except ValueError:
                logger.error("Invalid value: %s" % (selection))
                continue

    if user_selected_records:
        logger.info("Selected URL(s):")
        for record in user_selected_records:
            logger.info(record[1])

        if (logger.isEnabledFor(logging.DEBUG)):
            # append the raw selections to a debug trace file
            text_file = open("s1.selected_records.txt", "a")
            for selection in user_selection_input_list:
                try:
                    text_file.write("{}".format(recordsList[int(selection)-1]))
                    text_file.write("{}".format("\n"))
                except (ValueError, IndexError):
                    continue
            text_file.close()

    return user_selected_records

def main():
    """Parse arguments, resolve each input URL into downloadable records,
    then drain the shared queue with daemon Downloader threads."""
    currParam = 0
    parser = argparse.ArgumentParser(description='Download Toggle videos.',add_help=True)
    parser.add_argument('-d','--debug',help="Print debugging statements to stdout and files",
        action="store_const",dest="loglevel",const=logging.DEBUG,default=logging.INFO)
    # type=int makes a non-numeric value fail at parse time instead of at
    # thread-start time
    parser.add_argument('-t','--threads',help="Number of download threads",
        dest="download_threads",type=int,default=2)
    parser.add_argument('--no-autodl',help="Disable auto-download",action='store_true')
    parser.add_argument('--no-subs',help="Disable subtitle downloads",action='store_true')
    parser.add_argument('URL',nargs='+',help="Toggle video or episodes URL")

    args = parser.parse_args()
    totalParams = len(args.URL)
    logger.setLevel(args.loglevel)

    if (logger.getEffectiveLevel() == logging.DEBUG):
        ## file logging
        fh = logging.FileHandler('download_toggle.log')
        fh_formatter = logging.Formatter('[%(asctime)s.%(msecs).03d] [%(levelname).1s] %(message)s', datefmt="%Y-%m-%d %H:%M:%S")
        fh.setFormatter(fh_formatter)
        logger.addHandler(fh)

    print_script_header()

    # these module-level flags are consulted by the URL processors
    global AUTO_DOWNLOAD
    global CHECK_AND_DOWNLOAD_SUBTITLES

    if args.no_autodl is True:
        logger.info("Auto-download is disabled")
        AUTO_DOWNLOAD = 0

    if args.no_subs is True:
        logger.info("Subtitle check is disabled")
        CHECK_AND_DOWNLOAD_SUBTITLES = 0

    try:
        for input_url in args.URL:
            currParam += 1
            print("\n+++++++++++++++++++++++++++++++++++++")
            print("[*] Processing input %i of %i ..." % (currParam, totalParams))
            print("+++++++++++++++++++++++++++++++++++++")

            records_to_enqueue = process_url(input_url)
            if records_to_enqueue:
                for record in records_to_enqueue:
                    MAIN_DOWNLOAD_QUEUE.put(record)
            else:
                logger.warning("Nothing to download for %s" % (input_url))

        if MAIN_DOWNLOAD_QUEUE.empty():
            logger.error("No files in queue")
            sys.exit(0)

        logger.info("Starting download of queued URLs ...")
        for i in range(int(args.download_threads)):
            t = Downloader(MAIN_DOWNLOAD_QUEUE)
            # daemon threads let the process exit even if a download hangs;
            # Thread.setDaemon() is deprecated in favour of the attribute
            t.daemon = True
            t.start()

        # block until every queued record has been marked task_done()
        MAIN_DOWNLOAD_QUEUE.join()

    except (KeyboardInterrupt):
        logger.error("Received KeyboardInterrupt. Quitting ...")
        sys.exit(0)
    except (SystemExit):
        logger.info("Quitting ...")

    logger.info("+++ Script execution complete! +++\n\n")

if __name__ == '__main__':
    main()
0x776b7364 commented 6 years ago

You should have just submitted a PR. ;) I'll review it once I'm back from my overseas trip..

0x776b7364 commented 6 years ago

Closed due to inactivity. Thanks.