machawk1 / warcreate

Chrome extension to "Create WARC files from any webpage"
https://warcreate.com
MIT License
205 stars 13 forks source link

WARCreate contribution from WAIL-WARCreate core contributor or making contributions upstream to the parent repo #93

Closed N0taN3rd closed 7 years ago

N0taN3rd commented 7 years ago

Awesome Project!

Apologies for the super large diffs in this PR; if you want, I can break it up. Just let me know. I wanted to contribute back to WARCreate after using it as a basis for WAIL-WARCreate, as without WARCreate my node.js version would not exist.

Addressing #84

bumped standard version 3.0.0 -> 10.0.2 added devDependency snazzy (https://github.com/feross/snazzy) to help in reporting linting errors

added to package.json scripts:

added standard configuration to package.json to keep it from wrongly reporting globals such as chrome as no-undef addresses

ran linting-errors-fix addresses

Output from standard before this PR

warcreatelintbeforeme

Output from standard at commit 56b4ade of this PR :godmode:

warcreatelintafterme

fixes for #88 #84 #78 #17

refactored warcGenerator.js:

fix for #82 and rework for #90

refactored content.js per #90 and to ensure all of the page's resources are in the warc file, which facilitated the fix for #82 by re-adding the writing of the jsData from content.js in warcGenerator.js

I added the async attribute to

/*
   per #90 added the async modifier to the function
   NOTE: any error thrown in this function that is not caught explicitly
   will cause an unhandled rejection error. This may or may not
   be a node.js only thing (will cause node to halt in a future release)
   but such errors should be handled anyway as a matter of good standards and practices
   */
  port.onMessage.addListener(async function (msg) {
   .....

and "promisfied" fetchImage, fetchScriptData, fetchCssData. Promises used are 100% built in :)

/**
 * Promise-based variant of the fetchImage method (same functionality, per #90).
 *
 * Downloads `u` as an arraybuffer via XHR, stores the resulting Uint8Array in
 * `ret[u]`, removes `u` from `imgObjs`, and writes the bytes (as a plain array
 * of numbers) to chrome.storage.local under the key `u`.
 *
 * Resolves (with no value) once the storage write has completed.
 * Rejects with chrome.runtime.lastError if the storage write reported an error,
 * or with the XHR error event if the request's onerror handler fired.
 */
function fetchImagePromise (u, ret, imgObjs) {
  return new Promise((resolve, reject) => {
    const request = new XMLHttpRequest()
    request.open('GET', u, true)
    request.responseType = 'arraybuffer'

    request.onerror = (event) => {
      console.log('Error fetchImagePromise content.js', event)
      reject(event)
    }

    request.onload = function () {
      const bytes = new Uint8Array(this.response)
      // chrome.storage is handed a plain Array of numbers rather than the typed array
      const plainBytes = Array.from(bytes)

      ret[u] = bytes
      delete imgObjs[u]

      chrome.storage.local.set({ [u]: plainBytes }, () => {
        const failure = chrome.runtime.lastError
        if (!failure) {
          resolve()
          return
        }
        console.error('Error in set data')
        console.error(failure)
        reject(failure)
      })
    }

    request.send()
  })
}

Then when calling the function

        try {
          await fetchCssDataPromise(document.styleSheets[ss].href, styleSheetData)
        } catch (error) {
          console.error('there was an error fetching css data content.js', error)
        }

finally when generating the WARC

    arrayBuffers.push(str2ab(`${respHeader}${respContent}${WARCEntryCreator.warcRecordSeparator}`))
      delete responsesToConcatenate[requestHeader]
    } else if (responseHeaders[requestHeader] && helperREs.jsregexp.exec(responseHeaders[requestHeader]) !== null) {
      // for #82
      var jsRespHeader = `${responseHeaders[requestHeader]}${WARCEntryCreator.warcRecordSeparator}`
      var jsRespContent
      var jsIdx = 0
      var jsURIsLen = jsURIs.length
      for (; jsIdx < jsURIsLen; jsIdx++) {
        if (requestHeader === jsURIs[jsIdx]) {
          jsRespContent = `${jsData[jsURIs.indexOf(requestHeader)]}${WARCEntryCreator.warcRecordSeparator}`
          break
        }
      }
      var jsRHSTemp = WARCEntryCreator.makeWarcResponseHeaderWith(requestHeader, now, warcConcurrentTo, jsRespHeader + jsRespContent)
      var jsResponseHeaderString = `${jsRHSTemp}${WARCEntryCreator.CRLF}`
      arrayBuffers.push(str2ab(jsResponseHeaderString))

      arrayBuffers.push(str2ab(`${jsRespHeader}${jsRespContent}${WARCEntryCreator.warcRecordSeparator}`))
      delete responsesToConcatenate[requestHeader]
    }

fix for #79

by adding a listener to onSendHeaders. Avoid duplicates by keeping track per URL which headers have been added so far

/**
 * address #79 by keeping track per URL what headers we have already concatenated.
 * Dictionary keyed by request URL; each value is a Set of the header names
 * already written for that URL. Declared as a plain object (not an Array)
 * because it is only ever indexed by URL strings.
 */
var requestHeadersTracking = {}

/**
 * Stores the HTTP request line and request headers into `requestHeaders`,
 * keyed by request URL, and (re)starts the per-URL set of header names
 * already written (`requestHeadersTracking`).
 * issue #79, these headers are not available here:
 * Authorization,Cache-Control,Connection,Content-Length,Host,If-Modified-Since,If-None-Match,If-Range
 * Partial-Data,Pragma,Proxy-Authorization,Proxy-Connection,Transfer-Encoding
 * see https://developer.chrome.com/extensions/webRequest
 */
chrome.webRequest.onBeforeSendHeaders.addListener(function (req) {
  // Derive the request target (path + query) with the URL parser. The previous
  // regex (/[a-zA-Z0-9]\//) assumed the host ends in an alphanumeric character
  // and so threw or produced a wrong path for hosts such as "[::1]" or
  // "example.com." (trailing dot). Any fragment is intentionally excluded.
  const parsedUrl = new URL(req.url)
  const path = `${parsedUrl.pathname}${parsedUrl.search}`

  // per #79 keep track of already concatenated headers for warc string
  let seenNames = requestHeadersTracking[req.url]
  if (seenNames === null || seenNames === undefined) {
    seenNames = new Set()
    requestHeadersTracking[req.url] = seenNames
  } else {
    seenNames.clear()
  }

  let headerText = `${req.method} ${path} HTTP/1.1${CRLF}`
  // requestHeaders is an array of {name, value}; iterate values, not indices
  for (const header of req.requestHeaders || []) {
    headerText += `${header.name}: ${header.value}${CRLF}`
    seenNames.add(header.name)
  }
  requestHeaders[req.url] = headerText
}, {urls: ['http://*/*', 'https://*/*'], tabId: currentTabId}, ['requestHeaders', 'blocking'])

/**
 * Appends to `requestHeaders[req.url]` any request headers that were not yet
 * written for that URL, using `requestHeadersTracking[req.url]` to skip header
 * names already present.
 * fix for issue #79, see explanation in onBeforeSendHeaders and documentation for requestHeadersTracking
 */
chrome.webRequest.onSendHeaders.addListener(function (req) {
  const seenNames = requestHeadersTracking[req.url]
  // Nothing was recorded for this URL by onBeforeSendHeaders (no request line
  // to append to): skip instead of throwing on `.has` or appending to undefined.
  if (!seenNames || requestHeaders[req.url] === undefined) {
    return
  }

  // requestHeaders is an array of {name, value}; iterate values, not indices
  for (const header of req.requestHeaders || []) {
    if (!seenNames.has(header.name)) {
      requestHeaders[req.url] += `${header.name}: ${header.value}${CRLF}`
      seenNames.add(header.name)
    }
  }
}, {urls: ['http://*/*', 'https://*/*'], tabId: currentTabId}, ['requestHeaders'])

fixed #80

the metadata information for CSS is no longer E =EMBED_MISC; it is now E link/@href

     // outlinks as CSS
      $(document.styleSheets).each(function () {
        var cssHref = $(this).attr('href')
        // Skip sheets without an href (inline <style> sheets), and test for key
        // presence rather than truthiness: the registry stores '' (falsy), so a
        // `!registry[key]` check would never suppress duplicates.
        if (cssHref && !Object.prototype.hasOwnProperty.call(outlinksAddedRegistry, cssHref)) {
          outlinksAddedRegistry[cssHref] = ''
          // fixes #80
          outlinks.push(`${cssHref} E link/@href`)
        }
      })

fixed #81

the reason the script tag links were missing from the warc metadata was that the code checking for their existence was looking for the href attribute, not the src attribute. Not to be rhetorical, but this error was simply due to a copy-pasta error: it was the exact code used for outlinks as CSS, and have I ever made my share of these :feelsgood:

 // outlinks as JavaScripts
      $(document.scripts).each(function () {
        var scriptSrc = $(this).attr('src')
        // Only include the externally embedded JS, not the inline. Test for key
        // presence rather than truthiness: the registry stores '' (falsy), so a
        // `!registry[key]` check would never suppress duplicates.
        if (scriptSrc && !Object.prototype.hasOwnProperty.call(outlinksAddedRegistry, scriptSrc)) {
          outlinksAddedRegistry[scriptSrc] = ''
          outlinks.push(`${scriptSrc} E script/@src`)
        }
      })

commented out in html/background.html

Per authors own comments

<!-- To convert the really high character numbers that are exhibited in image data. 
Keep this until Chrome supports https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint in EC6 --> 
<script src="../js/es6-shim.js"></script> 

String.fromCodePoint is a built in string function now

N0taN3rd commented 7 years ago

@machawk1 hey after this PR the decrease in linting errors is 97.39413680781759% take that travis :stuck_out_tongue_closed_eyes:

machawk1 commented 7 years ago

Thanks for the updates, @N0taN3rd . You were definitely the guy for the job with your familiarity w/ WARCreate's flow in your adaptation for WAIL-WARCreate. Given the WARCreate Chrome extension is overdue for this overhaul and update to ES6, don't bother breaking it up into separate pull requests. I will review the code and functionality and then merge if all looks right functionality-wise. Stand by.