andykais / scrape-pages

generalized scraper using a single instruction set for any site that can be statically scraped
https://scrape-pages.js.org
MIT License
6 stars 2 forks source link

support persistent variables (e.g. access tokens) #32

Open andykais opened 4 years ago

andykais commented 4 years ago

We need a way to fetch a value once (e.g. an access token) and reuse it throughout the config. There are two options:

We could simply store the last value produced by every scraper. This is reasonable, since we have no way to distinguish individual requests outside of the parse step anyway.

/**
 * Proposed config: each scraper implicitly exposes its most recent parsed
 * value as `<scraperName>.lastValue`, which later scrapers can reference
 * from their templates.
 */
const config: ConfigInit = {
  // variables referenced as {{ username }} / {{ password }} in the templates below
  input: ['username', 'password'],
  scrapers: {
    // requests the login endpoint and parses `session.token` out of the JSON response
    login: {
      download: 'http://example.com/login?username={{ username }}&password={{ password }}',
      parse: {
        format: 'json',
        selector: 'session.token'
      }
    },
    // requests a page, sending the value parsed by `login` in the cookie header
    pageBehindLogin: {
      download: {
        urlTemplate: 'http://example.com/account',
        headerTemplates: {
          cookie: 'token: {{{ login.lastValue }}}' // the important line
        }
      }
    }
  },
  // run `login` first, then `pageBehindLogin` for each value it produces
  run: {
    scraper: 'login',
    forEach: {
      scraper: 'pageBehindLogin'
    }
  }
}

Alternatively, we can set variables explicitly. This feels less declarative, so I am less inclined to use it.

/**
 * Proposed config: a scraper explicitly publishes its parsed value under a
 * named global via `setValueAsGlobal`, and other scrapers read it back as
 * `globals.<name>` in their templates.
 */
const config: ConfigInit = {
  // variables referenced as {{ username }} / {{ password }} in the templates below
  input: ['username', 'password'],
  scrapers: {
    // requests the login endpoint and parses `session.token` out of the JSON response
    login: {
      download: 'http://example.com/login?username={{ username }}&password={{ password }}',
      parse: {
        format: 'json',
        selector: 'session.token'
      },
      // name under which the parsed value is published for other scrapers
      setValueAsGlobal: 'token'
    },
    // requests a page, sending the published `token` global in the cookie header
    pageBehindLogin: {
      download: {
        urlTemplate: 'http://example.com/account',
        headerTemplates: {
          cookie: 'token: {{{ globals.token }}}'
        }
      }
    }
  },
  // run `login` first, then `pageBehindLogin` for each value it produces
  run: {
    scraper: 'login',
    forEach: {
      scraper: 'pageBehindLogin'
    }
  }
}