Jayapraveen / INE-courses-downloader

Python script to download courses from the INE.com website for personal and educational use
GNU General Public License v3.0
37 stars 19 forks source link

some images does not completely downloaded #38

Open pwnedDesal opened 2 years ago

pwnedDesal commented 2 years ago

I tried to download PenetrationTestingEvasion. if you look on the slide, there is a square that should be an image image image image

ghost commented 2 years ago

i think the slides part is still under production.

what.s the course name? I can give it a look. probably need to update the function to save the slides.

ghost commented 2 years ago

Give me a bit, I can fix it. The logic doesn't take into account that some slides might not have an image.

pwnedDesal commented 2 years ago

i think the slides part is still under production.

what.s the course name? I can give it a look. probably need to update the function to save the slides.

Penetration Testing: Evasion -> Defense Evasion. I can manually download the images from the course. I think the problem is the CloudFront-* cookies. For every course there is a different CloudFront-* downloaded course image From INE website image

ghost commented 2 years ago

make a copy of script then replace this function ( download_slides ) with this updated one

def _slide_headers(host):
    """Build the request headers for ``host`` from the script's module-level settings."""
    return {
        "Host": host,
        "Origin": referer,
        "Authorization": access_token,
        "User-Agent": user_agent,
        "Accept": accept,
        "X-Requested-With": x_requested_with,
        "Accept-Encoding": accept_encodings,
        "sec-fetch-mode": sec_fetch_mode,
        "sec-fetch-dest": sec_fetch_dest,
        "Referer": referer,
    }


def _strip_query(path):
    """Return ``path`` with everything from its last '?' onwards removed."""
    cut = path.rfind('?')
    # rfind() returns -1 when there is no '?'; slicing with -1 would
    # silently chop the last character off the path.
    return path if cut == -1 else path[:cut]


def _script_sources(html):
    """Return the ``src`` value of every ``<script src="..."></script>`` tag in ``html``."""
    # Non-greedy so that several tags on one line are matched one by one.
    tags = re.findall(r'<script src=".*?"></script>', html)
    return [re.findall('"([^"]*)"', tag)[0] for tag in tags]


def _save_stream(response, target):
    """Write the body of a streamed ``requests`` response to ``target`` as bytes."""
    with open(target, 'wb') as fp:
        # iter_content() applies any Content-Encoding (gzip/deflate),
        # unlike copying straight from response.raw.
        for chunk in response.iter_content(chunk_size=65536):
            fp.write(chunk)


def download_slides(uuid, slide_index):
    """Download one slide deck and its assets into a numbered subfolder.

    Fetches the content metadata for ``uuid``; if that request does not
    return HTTP 200 nothing is downloaded. Otherwise the folder
    ``"<slide_index>.<name>/"`` (with a ``data/`` subfolder) is created in
    the current directory and filled with:

    * every attachment listed under ``"files"`` in the metadata,
    * ``index.html`` and the scripts it references (only when
      ``index.html`` is not already present),
    * ``data/slideN.js``, ``data/slideN.css``, ``data/imgN.png`` and
      ``data/fntN.woff``, probed by increasing ``N``.

    Files that already exist on disk are skipped, not re-downloaded.

    Relies on module-level names defined elsewhere in the script: the URL
    templates (``content_url``, ``file_url``, ``slide_url``,
    ``slidejs_url``, ``slidecss_url``, ``slideimg_url``, ``slidefnt_url``),
    the header settings used by ``_slide_headers`` and
    ``fix_string_filename``.

    Args:
        uuid: Identifier substituted into the URL templates.
        slide_index: Prefix used for the output folder name.

    Returns:
        None.
    """
    # content meta
    out = requests.get(content_url.format(uuid), headers=_slide_headers("content-api.ine.com"))
    if out.status_code != 200:
        return

    cookies = out.cookies.get_dict()
    meta = json.loads(out.text)

    # prepare subfolders
    subfolder_name = str(slide_index) + '.' + fix_string_filename(meta['name']) + '/'
    subfolder_name = subfolder_name.replace(":", " - ")
    data_dir = subfolder_name + "data/"
    # makedirs creates the slide folder and its data/ child in one go.
    os.makedirs(data_dir, exist_ok=True)

    # files
    header = _slide_headers("file.rmotr.com")
    for f in meta["files"]:
        out = requests.get(file_url.format(f), headers=header)
        if out.status_code != 200:
            continue
        file_data = json.loads(out.text)
        dl_url = file_data["download_url"]
        # '/' in the reported filename would otherwise be read as a directory separator.
        file_name = file_data["filename"].replace('/', '_')
        out = requests.get(dl_url, stream=True)
        if out.status_code == 200:
            _save_stream(out, subfolder_name + file_name)

    # prepare header for slide content download
    header = _slide_headers("els-cdn.content-api.ine.com")

    # index.html
    index_path = subfolder_name + "index.html"
    if not os.path.exists(index_path):
        out = requests.get(meta["url"], headers=header, cookies=cookies)
        if out.status_code == 200:
            html_out = out.text
            js_paths = _script_sources(html_out)

            # Point the script tags at the query-less names the files are saved under.
            for js_path in js_paths:
                html_out = html_out.replace(js_path, _strip_query(js_path))
            with open(index_path, 'w', encoding='utf-8') as fp:
                fp.write(html_out)

            # browsersupport.js and player.js
            for js_path in js_paths:
                out = requests.get(slide_url.format(uuid) + js_path, headers=header, cookies=cookies)
                if out.status_code == 200:
                    with open(subfolder_name + _strip_query(js_path), 'w', encoding='utf-8') as fp:
                        fp.write(out.text)

    # console
    print('Please wait: Downloading slide content! This takes a long time and will appear stuck...')

    # slideX.js -- numbered from 1; stop at the first index the server does not serve.
    nums = 1
    while True:
        target = data_dir + "slide{}.js".format(str(nums))
        if not os.path.exists(target):
            out = requests.get(slidejs_url.format(uuid, str(nums)), headers=header, cookies=cookies)
            if out.status_code != 200:
                break
            with open(target, 'w', encoding='utf-8') as fp:
                fp.write(str(out.text))
        nums = nums + 1
    # nums is now one past the last slide index that was found or already on disk.

    # slideX.css -- numbered from 1; stop at the first missing index.
    num = 1
    while True:
        target = data_dir + "slide{}.css".format(str(num))
        if not os.path.exists(target):
            out = requests.get(slidecss_url.format(uuid, str(num)), headers=header, cookies=cookies)
            if out.status_code != 200:
                break
            with open(target, 'w', encoding='utf-8') as fp:
                fp.write(out.text)
        num = num + 1

    # imgX.png -- numbered from 0. Image indexes may have gaps, so a missing
    # image does not end the scan; probing only stops once the index has
    # passed the slide counter computed above (a heuristic upper bound).
    num = 0
    while True:
        target = data_dir + "img{}.png".format(str(num))
        if os.path.exists(target):
            num = num + 1
            continue
        out = requests.get(slideimg_url.format(uuid, str(num)), headers=header, cookies=cookies, stream=True)
        if out.status_code == 200:
            _save_stream(out, target)
        num = num + 1
        if out.status_code != 200 and num > nums:
            break

    # fntX.woff -- numbered from 0; stop at the first missing index.
    num = 0
    while True:
        target = data_dir + "fnt{}.woff".format(str(num))
        if not os.path.exists(target):
            out = requests.get(slidefnt_url.format(uuid, str(num)), headers=header, cookies=cookies, stream=True)
            if out.status_code != 200:
                break
            _save_stream(out, target)
        num = num + 1

ghost commented 2 years ago

also delete previous download attempt (folder) so the count works.

this is not a perm fix but will work until can get slide count another way

Ex-Communicad0 commented 1 year ago

`

@dlmart1972 when i add this code i got this error

IndentationError: expected an indented block

ghost commented 1 year ago

It's an indentation error, meaning the indentation of the pasted code is inconsistent.

select the pasted function and hold shift then press tab

after, while all code still selected press tab. this will correct the inconsistent tab error

python is dumb like that