FelixFrog / pdfgrabber

Download PDF books from bSmart, Pearson, Oxford, and many more!
GNU Affero General Public License v3.0
112 stars 16 forks source link

Download da GiuntiTVP #30

Open stepersy opened 1 year ago

stepersy commented 1 year ago

Ciao! Innanzitutto grazie mille per lo stupendo lavoro che stai facendo! Ho visto che è stata aggiunta la possibilità di scaricare da GiuntiTVP mediante One Shot Link. Come si usa? Ho provato a incollare il link di un mio libro (https://mydbook.giuntitvp.it/app/books/GIAC67_G6760798D/pdfParts?prependCollection=BL35LIV07_MYDBOOK2019) ma continua a rispondere "Unable to view this publicly. Aborting...". Grazieee :)

ckhmer1 commented 11 months ago

Ciao, io sono riuscito a farlo usando il seguente script. La login, non so perché, mi restituisce 401, quindi ho usato i cookie prelevati dal browser.

import os
import re

import fitz
import requests

# Cookies sent with every page request (see downloadfile). The values are
# placeholders and must be replaced with real ones before running.
COOKIES = {
    "connect.sid": "XXXXXXXXXXX",
    "elmo_vc": "YYYYYYYYY",
    "PHPSESSID": "ZZZZZ",
    "shbookInitialized": "true",
}

# Identifier of the book; used in the page URL, as the temporary directory
# name and as the base name of the generated PDF.
BOOK_ID = "GT2023_G3452521A"

# Pages 1..LAST_PAGE_NUMBER (inclusive) are downloaded and merged.
LAST_PAGE_NUMBER = 384

# Page URL template; the "%s" placeholder receives the page number.
BOOK_URL = "".join(
    ("https://mydbook.giuntitvp.it/books/", BOOK_ID, "/pdf/pages/%s?type=")
)

# Credentials for login(); placeholders.
USERNAME = "USERNAME@email.com"
PASSWORD = "password"

# Directory in which the downloaded page files are stored.
TEMP_DIR = BOOK_ID

# Matches a `name=` or `value=` attribute that starts the string or follows
# whitespace. The value may be double-quoted, single-quoted or unquoted.
_NAME_VALUE_ATTR = re.compile(
    r"""(?:^|\s)(name|value)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*))"""
)


def getnamevalue(input):
    """Extract the ``name`` and ``value`` attributes of an ``<input>`` tag.

    The attributes are parsed with a regular expression instead of splitting
    on whitespace and ``=``, so values that contain ``=`` (for example
    base64-padded tokens) or spaces are returned in full, and unquoted
    attribute values no longer raise ``IndexError``.

    Args:
        input: Text of a single tag, e.g. ``'input name="lt" value="abc"/>'``.
            (The parameter name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        A ``(name, value)`` tuple, where ``value`` is ``""`` when the tag has
        no ``value`` attribute, or ``None`` when the tag has no non-empty
        ``name`` attribute.
    """
    attrs = {}
    for key, double_quoted, single_quoted, bare in _NAME_VALUE_ATTR.findall(input):
        # Exactly one of the three alternatives can be non-empty; if an
        # attribute is repeated, the last occurrence wins.
        attrs[key] = double_quoted or single_quoted or bare
    name = attrs.get("name", "")
    if not name:
        return None
    return name, attrs.get("value", "")

def getlogindata(username, password):
    """Submit the CAS login form and return the decoded JSON response.

    The login page is fetched first, every ``<input>`` that has a ``name``
    attribute is merged into the form data, and the result is POSTed back to
    the same URL. Both requests share one ``requests.Session`` so cookies set
    by the login page are sent along with the POST.

    Args:
        username: Account user name.
        password: Account password.

    Returns:
        The parsed JSON body when the POST answers with HTTP 200,
        otherwise ``None``.
    """
    service_url = 'https://mydbook.giuntitvp.it/authentication/cas?iframe=false'
    return_url = 'https://mydbook.giuntitvp.it/app/home'
    login_url = "https://centralauthentication.giunti.it/cas/login?service=https://mydbook.giuntitvp.it/authentication/cas?iframe=false&return=https://mydbook.giuntitvp.it/app/home"
    data = {
        "username": username,
        "password": password,
        "submit": "Invia",
        "service": [service_url, service_url],
        "return": [return_url, return_url],
    }

    with requests.Session() as session:
        form_page = session.get(login_url)
        for cookie in iter(form_page.cookies):
            print(cookie.name, cookie.value)

        # Collect the text of every "<input ...>" tag found in the page.
        input_lines = [line for line in form_page.text.split("\n") if '<input' in line]
        input_tags = [
            part
            for line in input_lines
            for part in line.split("<")
            if part.startswith('input')
        ]

        for namevalue in (getnamevalue(tag) for tag in input_tags):
            if namevalue is None:
                continue
            n, v = namevalue
            if n in data:
                # A name that is already present accumulates all its values.
                # NOTE(review): this also applies to "username"/"password" if
                # the form declares inputs with those names, so they would be
                # posted as [credential, form_value] - confirm this is wanted.
                existing = data[n]
                if isinstance(existing, str):
                    data[n] = [existing, v]
                else:
                    data[n].append(v)
            else:
                data[n] = v

        # Debug output; the password is masked so it is not echoed in clear.
        print({**data, "password": "***"})
        response = session.post(login_url, data=data)

    print(response.status_code)
    print(response.encoding)
    print(response.text)
    if response.status_code != 200:
        return None
    # "utf-8-sig" strips a leading byte-order mark before JSON decoding.
    response.encoding = "utf-8-sig"
    return response.json()

def login(username, password):
    """Log in and build the ``"<accessToken>/<userId>"`` token string.

    Args:
        username: Account user name.
        password: Account password.

    Returns:
        ``accessToken + "/" + userId`` on success. ``None`` when the login
        request failed, the response carries an ``"error"`` key, or the
        returned user id is ``0``; a short message is printed in each case.
    """
    logindata = getlogindata(username, password)
    print(logindata)

    # getlogindata() returns None on a non-200 answer, so the falsy check
    # must run before any membership test or .get() call.
    if not logindata:
        print("Login failed!")
        return None

    if "error" in logindata:
        if logindata.get("error") == "1":
            print("Incorrect credentials!")
        else:
            print("Login failed!")
        return None

    userid = str(logindata["userId"])
    if userid == "0":
        print("Unauthorized!")
        return None
    return logindata["accessToken"] + "/" + userid

def downloadfile(url):
    """Download ``url`` and return the response body as ``bytes``.

    The request carries the module-level ``COOKIES`` and a ``Referer`` header
    pointing at the mydbook site.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None``.
    """
    # The context manager releases the streamed connection on every path,
    # including the early return for non-200 answers.
    with requests.get(
        url,
        stream=True,
        headers={"Referer": "https://mydbook.giuntitvp.it"},
        cookies=COOKIES,
    ) as r:
        if r.status_code != 200:
            return None
        # Join the chunks once instead of growing a bytes object with "+=".
        return b"".join(r.iter_content(chunk_size=102400))

def get_page(p):
    """Download page ``p`` and store it as ``TEMP_DIR/page_<p>.jpg``.

    Nothing is written when the download fails or returns an empty body; a
    message is printed instead, because ``create_pdf`` expects one file per
    page.

    Args:
        p: Page number (integer) substituted into ``BOOK_URL``.
    """
    url = BOOK_URL % str(p)
    dta = downloadfile(url)
    if not dta:
        print("Page %d could not be downloaded" % p)
        return
    # NOTE(review): the URL path contains "/pdf/pages/" while the file is
    # saved with a .jpg extension - confirm the actual payload format.
    with open(os.path.join(TEMP_DIR, "page_%d.jpg" % p), "wb") as f:
        f.write(dta)

def download_book():
    """Fetch pages 1 through ``LAST_PAGE_NUMBER``, printing each page number first."""
    page_no = 1
    while page_no <= LAST_PAGE_NUMBER:
        print(page_no)
        get_page(page_no)
        page_no += 1

def create_pdf():
    """Merge the downloaded page files into ``<BOOK_ID>.pdf``.

    Each ``TEMP_DIR/page_<n>.jpg`` for ``n`` in ``1..LAST_PAGE_NUMBER`` is
    converted to a one-page PDF and placed on a new page of the same size in
    the output document. Every document opened here is closed again, also
    when a page fails to open or convert.
    """
    doc = fitz.open()  # output PDF that collects the pictures
    try:
        for p in range(1, LAST_PAGE_NUMBER + 1):
            f = os.path.join(TEMP_DIR, "page_%d.jpg" % p)
            img = fitz.open(f)  # open the picture as a document
            try:
                rect = img[0].rect  # picture dimensions
                pdfbytes = img.convert_to_pdf()  # make a PDF stream
            finally:
                img.close()  # no longer needed
            imgPDF = fitz.open("pdf", pdfbytes)  # open the stream as a PDF
            try:
                # New page with the picture's dimensions; the image fills it.
                page = doc.new_page(width=rect.width, height=rect.height)
                page.show_pdf_page(rect, imgPDF, 0)
            finally:
                imgPDF.close()
            print(p)

        doc.save("%s.pdf" % BOOK_ID)
    finally:
        doc.close()

if __name__ == "__main__":
    # Script entry point: make sure the page directory exists, download
    # every page, then assemble the PDF.
    os.makedirs(TEMP_DIR, exist_ok=True)
    # login(USERNAME, PASSWORD) is left disabled; requests are authenticated
    # with the cookies in COOKIES instead.
    download_book()
    create_pdf()
P1zz454 commented 2 weeks ago

Ho provato a creare un file .py con lo script di @ckhmer1, tenendo aperto il libro digitale. Non so se quello che sto facendo sia giusto: ha creato solo dei JPEG non visualizzabili.