anilabhadatta / educative.io_scraper

Educative.io Course Downloader developed using Python and Selenium. Refer Readme.md for setup instructions.
MIT License
170 stars 57 forks source link

iframe elements not saved correct #121

Closed anilabhadatta closed 7 months ago

anilabhadatta commented 7 months ago
image image

This issue was fixed previously, but due to a recent change in SingleFile the iframe elements are no longer saved correctly, although they are saved properly when using the extension. I need a fix, or an understanding of why SingleFile can save them through the extension but not when the JS library is executed manually in the console.

anilabhadatta commented 7 months ago

/**
 * Injects a script element into the top-level document and, on request,
 * into every reachable frame at any nesting depth.
 *
 * @param {HTMLScriptElement} scriptTag - Script element to inject. The
 *     element itself is appended to `document.head`; frames receive deep
 *     clones of it.
 * @param {string} location - Pass "iframe" to also inject a clone into each
 *     `<frame>`/`<iframe>` document (recursively). Any other value injects
 *     into the top-level document only.
 */
function injectScriptToHTML(scriptTag, location) {
    // Returns the frame's document, or null when it cannot be reached
    // (e.g. a cross-origin frame, where the access is null or throws).
    function getFrameDocument(frame) {
        try {
            return frame.contentDocument
                || (frame.contentWindow && frame.contentWindow.document)
                || null;
        } catch (error) {
            return null;
        }
    }

    // Appends a clone of scriptTag to every frame under `doc`, depth-first.
    function injectIntoFrames(doc) {
        doc.querySelectorAll('frame, iframe').forEach(frame => {
            const frameDocument = getFrameDocument(frame);
            if (!frameDocument) {
                return;
            }
            const targetElement = frameDocument.body || frameDocument.documentElement;
            if (!targetElement) {
                return;
            }
            targetElement.appendChild(scriptTag.cloneNode(true));
            // Recurse so frames nested at any depth are covered, and frames
            // without children are simply a no-op instead of a crash.
            injectIntoFrames(frameDocument);
        });
    }

    if (location === "iframe") {
        injectIntoFrames(document);
    }
    // The top-level document always receives the original element.
    document.head.appendChild(scriptTag);
}

        /**
         * Downloads a JavaScript file and wraps its source in an inline
         * `<script>` element (not yet attached to any document).
         *
         * @param {string} url - Absolute URL of the script to download.
         * @returns {Promise<HTMLScriptElement|null>} Resolves with the script
         *     element, or with `null` when the download fails (network error
         *     or non-2xx HTTP status). The failure is logged, never thrown.
         */
        function createScriptTagFromURL(url) {
            return fetch(url)
                .then(response => {
                    // fetch() only rejects on network failure; without this
                    // check a 404/500 error page would be injected as code.
                    if (!response.ok) {
                        throw new Error(`HTTP ${response.status} while fetching ${url}`);
                    }
                    return response.text();
                })
                .then(data => {
                    var scriptElement = document.createElement('script');
                    scriptElement.type = 'text/javascript';
                    scriptElement.textContent = data;
                    return scriptElement;
                })
                .catch(error => {
                    console.error('Error loading script:', error);
                    return null;
                });
        }
        // Stash any page-provided `define`/`require` under `__define`/`__require`
        // and clear the originals before the library scripts are injected.
        // NOTE(review): presumably this stops the injected bundles from
        // detecting an AMD/CommonJS loader on the page — confirm.
        window.__define = window.define;
        window.__require = window.require;
        window.define = undefined;
        window.require = undefined;

        // `var` is kept for these top-level names so the snippet can be
        // re-run in the same console without redeclaration errors.
        var baseurl = 'https://anilabhadatta.github.io/SingleFile/';
        var urls = [
        'lib/single-file-bootstrap.js',
        'lib/single-file-hooks-frames.js',
        'lib/single-file-frames.js',
        'lib/single-file.js'
        ];
        var fullUrls = urls.map(url => baseurl + url);

        // Download everything in parallel, but inject only once all downloads
        // have settled and strictly in the order declared in `urls`; injecting
        // from each download's own callback made the order depend on network
        // timing.
        Promise.all(fullUrls.map(url => createScriptTagFromURL(url)))
            .then(scriptTags => {
                scriptTags.forEach((scriptTag, i) => {
                    // A null entry means that download failed (already logged).
                    if (!scriptTag) {
                        return;
                    }
                    // Entries 1 and 2 (the two "*-frames" scripts) are also
                    // injected into the frames; the others go to the top-level
                    // document only.
                    if (i === 1 || i === 2) {
                        injectScriptToHTML(scriptTag, "iframe");
                    } else {
                        injectScriptToHTML(scriptTag, "");
                    }
                });
            })
            .catch(error => {
                console.error('Error injecting scripts:', error);
            });

This should fix it.

gildas-lormeau commented 7 months ago

Something similar to the code below (not tested) should work in a more generic way.

/**
 * Appends a deep clone of `scriptTag` to `doc`, then does the same for the
 * document of every `<frame>`/`<iframe>` found in `doc`, recursively.
 *
 * @param {HTMLScriptElement} scriptTag - Script element to clone into each document.
 * @param {Document} [doc=document] - Document to start from.
 */
function injectScriptToHTML(scriptTag, doc = document) {
    const host = doc.body || doc.documentElement;
    host.appendChild(scriptTag.cloneNode(true));

    for (const frame of doc.querySelectorAll("frame, iframe")) {
        const innerDocument = frame.contentDocument;
        // Frames whose document is not available are skipped.
        if (!innerDocument) {
            continue;
        }
        injectScriptToHTML(scriptTag, innerDocument);
    }
}

/// ...
// `doc` defaults to the top-level document, so this single call covers the
// page and every nested frame whose contentDocument is available.
injectScriptToHTML(scriptTag);
/// ...
anilabhadatta commented 7 months ago

@gildas-lormeau Thank you, the recursive solution fixed it. I will still try to implement the CDP solution, though.

anilabhadatta commented 7 months ago

@gildas-lormeau #122 fixed it for SeleniumBase as well. For some reason SeleniumBase needs a new tab to be opened to avoid the CORS issue (strange, and hard to detect). I will check whether the CDP implementation can replace the existing approach. Thanks a lot for your support.

anilabhadatta commented 7 months ago

The CDP implementation is done. Closing this issue.