eeshi / node-scrapy

Simple, lightweight and expressive web scraping with Node.js
MIT License
154 stars 27 forks source link

social: { stars: null, forks: null } }, files: null } with the example code. #18

Closed calendarbase closed 8 years ago

calendarbase commented 8 years ago

At first I thought it was because the project was moved, but I get `files: null` on every GitHub project.

{ author: 'expressjs',
  repo: null,
  stats: 
   { commits: '1,609',
     branches: '3',
     releases: '0',
     contributors: '216',
     social: { stars: null, forks: null } },
  files: null }

My node.js code:

var scrapy = require('node-scrapy')
  , url = 'https://github.com/expressjs/expressjs.com'
  , model =
    { author: '.author',
      repo: '.js-current-repository',
      stats:
       { commits: '.commits .num',
         branches: '.numbers-summary > li.commits + li .num',
         releases: '.numbers-summary > li.commits + li + li .num',
         contributors: '.numbers-summary > li.commits + li + li + li .num',
         social:
          { stars: '.star-button + .social-count',
            forks: '.fork-button + .social-count' } },
      files: '.js-directory-link' }

scrapy.scrape(url, model, function(err, data) {
    if (err) return console.error(err)
    console.log(data)
});
calendarbase commented 8 years ago

This works:

// Scrape the eeshi/node-scrapy repository page on GitHub and print the result.
const scrapy = require('node-scrapy');

const url = 'https://github.com/eeshi/node-scrapy';

// Scraping model: every key of the printed result is paired with the CSS
// selector used to locate its value on the page.
const model = {
  author: '.author',
  repo: '.js-current-repository',
  stats: {
    commits: '.commits .num',
    branches: '.numbers-summary > li.commits + li .num',
    releases: '.numbers-summary > li.commits + li + li .num',
    contributors: '.numbers-summary > li.commits + li + li + li .num',
    social: {
      stars: '.star-button + .social-count',
      forks: '.fork-button + .social-count',
    },
  },
  // Broader selector than '.js-directory-link' for the file listing.
  files: '.content',
};

scrapy.scrape(url, model, (err, data) => {
  // On failure, report the error and skip printing any data.
  if (err) {
    console.error(err);
    return;
  }
  console.log(data);
});

But this repo is tricky:

// Scrape the spf13/hugoThemes repository page on GitHub and print the result.
const scrapy = require('node-scrapy');

const url = 'https://github.com/spf13/hugoThemes';

// Scraping model: every key of the printed result is paired with the CSS
// selector used to locate its value on the page.
const model = {
  author: '.author',
  repo: '.js-current-repository',
  stats: {
    commits: '.commits .num',
    branches: '.numbers-summary > li.commits + li .num',
    releases: '.numbers-summary > li.commits + li + li .num',
    contributors: '.numbers-summary > li.commits + li + li + li .num',
    social: {
      stars: '.star-button + .social-count',
      forks: '.fork-button + .social-count',
    },
  },
  // Plain selector string: yields the entries' text, not their link targets.
  files: '.content',
};

scrapy.scrape(url, model, (err, data) => {
  // On failure, report the error and skip printing any data.
  if (err) {
    console.error(err);
    return;
  }
  console.log(data);
});

The result is commit messages and I can't get the href:

files: 
   [ 'Failed to load latest commit information.',
     'academic @ 1a20f5e',
     'agency @ 6415e68',
     'aglaus @ f0328f8',
     'air @ 0e1c3b0',
     'allegiant @ 6c49d13',
     'angels-ladder @ bbe249f',
     'artists @ baf5506',
     'aurora @ d760f75',
     'base16 @ 058fd51',
     'beautifulhugo @ ecb7e94',
     'beg @ 80a6b59',
     'blackburn @ 8103131',
     'bleak @ 13bf460',
     'bootie-docs @ e14a0e4',
     'bootstrap @ d8219ca',
     'cactus @ 7921c05',
     'casper @ 0c6470f',
     'cocoa @ e32b495',
     'creative @ 5c1fdb2',
     'crisp @ e99ce5d',
     'detox @ b56f0e2',
     'freelancer @ e8dd137',
     'future-imperfect @ 2db677f',
     'ghostwriter @ f3662c3',
     'gindoro @ 6bfc86b',
     'github-project-landing-page @ 9d8a23c',
     'greyshade @ 28fb061',
     'grid-side @ d384625',
     'heather-hugo @ cb93a3b',
     'herring-cove @ fe7ce04',
     'hikari @ ecfaed7',
     'html5 @ 9a3e7a0',
     'hugo-base-theme @ f78b4d1',
     'hugo-bootstrap-premium @ d1f1b8c',
     'hugo-bootswatch @ 4367ac1',
     'hugo-darkdoc-theme @ 7b2069f',
     'hugo-geo @ b14540d',
     'hugo-h5bp @ 7741bbc',
     'hugo-icarus @ ff019b8',
     'hugo-identity-theme @ 79e03ea',
     'hugo-incorporated @ 3d21a63',
     'hugo-lithium-theme @ 08ba4a9',
     'hugo-mdl @ 79c6f0e',
     'hugo-minimalist @ 21036c6',
     'hugo-multi-bootswatch @ 0f12d1b',
     'hugo-octopress @ bcd9bcd',
     'hugo-pacman-theme @ 604c57f',
     'hugo-phlat-theme @ 168cd40',
     'hugo-plus @ bba9290',
     'hugo-theme-arch @ 31a5301',
     'hugo-theme-geppaku @ d56d47c',
     'hugo-theme-learn @ 75df02c',
     'hugo-uno @ a66c2db',
     'hugo-zen @ ff5a388',
     'hugoscroll @ 6f6ce41',
     'hurock @ 13081b8',
     'hyde @ a04b9e1',
     'hyde-x @ ee61d83',
     'hyde-y @ 493bf99',
     'internet-weblog @ 6a9c9a5',
     'journal @ e543e4b',
     'landing-page-hugo @ 9280715',
     'lanyon @ 0c3da68',
     'liquorice @ b2b6d57',
     'material-design @ 0d88058',
     'material-docs @ e133b47',
     'material-lite @ 24a23e3',
     'next @ b8c8076',
     'nofancy @ 151dd32',
     'persona @ afb2d45',
     'pixyll @ 50dad69',
     'polymer @ d8be002',
     'projecthub @ d0fb856',
     'purehugo @ 3c4ee78',
     'redlounge @ 7853d1e',
     'robust @ 69ce3f1',
     'rocktopus @ c47bcd7',
     'shiori @ e23d4a1',
     'simple-a @ 3338b25',
     'simple-hugo @ 20d9086',
     'slender @ 0874af0',
     'slim @ a491cb0',
     'startbootstrap-clean-blog @ c3ff77a',
     'steam @ d917cfa',
     'strata @ 253c8bb',
     'tachyons @ a98c649',
     'tinyce @ ec650df',
     'twentyfourteen @ 83f0a78',
     'type @ e8a2c26',
     'vienna @ 890e97c',
     '.gitmodules',
     'LICENSE',
     'README.md' ] }
stefanmaric commented 8 years ago

Hi @calendarbase, thanks for reaching out.

Yes, GitHub's site has been updated since the examples were written.

I think this is what you want:

// Scrape the spf13/hugoThemes repository page on GitHub using selectors that
// match the updated page markup, then print the result.
const scrapy = require('node-scrapy');

const url = 'https://github.com/spf13/hugoThemes';

// Scraping model: string values are CSS selectors; object values pair a
// `selector` with a `get` option naming the attribute to read.
const model = {
  author: '.author',
  // Read the repository URL from the Open Graph meta tag's `content` attribute.
  repo: {
    selector: 'meta[property="og:url"]',
    get: 'content',
  },
  stats: {
    commits: '.commits .num',
    branches: '.numbers-summary > li.commits + li .num',
    releases: '.numbers-summary > li.commits + li + li .num',
    contributors: '.numbers-summary > li.commits + li + li + li .num',
    social: {
      stars: '.social-count[href$=stargazers]',
      forks: '.social-count[href$=network]',
    },
  },
  // Collect the link target (`href`) of every anchor in the file listing.
  files: {
    selector: '.content a[href]',
    get: 'href',
  },
};

scrapy.scrape(url, model, (err, data) => {
  // On failure, report the error and skip printing any data.
  if (err) {
    console.error(err);
    return;
  }
  console.log(data);
});

It outputs:

{ author: 'spf13',
  repo: 'https://github.com/spf13/hugoThemes',
  stats: 
   { commits: '271',
     branches: '1',
     releases: '0',
     contributors: '31',
     social: { stars: '414', forks: '95' } },
  files: 
   [ '/gcushen/hugo-academic/tree/1a20f5e6d70908a2f41f1c6d331361c68024b062',
     '/digitalcraftsman/hugo-agency-theme/tree/6415e68a97f7cab3e04497e087ce348e1ca7574e',
     '/dim0627/hugo_theme_aglaus/tree/f0328f8f825591b1efa0f677ce9e3c3691b37e60',
     '/syui/hugo-theme-air/tree/0e1c3b0bfb335dfa23f9ec1198e24628a3df707b',
     '/brycematheson/allegiant/tree/6c49d136e538d514fd3bbc4371104bfcb1d99814',
     '/tanksuzuki/angels-ladder/tree/bbe249fcc3483b20e4db2c59350bbf507bb857a6',
     '/digitalcraftsman/hugo-artists-theme/tree/baf55066fec9c97ab590bdbac71d8bb4e36cf87c',
     '/coryshaw/hugo-aurora-theme/tree/d760f752f18361025a8b397ca4fd081787757bcf',
     '/htdvisser/hugo-base16-theme/tree/058fd51e22c705f080a0fe09d998ad5af1394d12',
     '/halogenica/beautifulhugo/tree/ecb7e949e7c08d982d77b16be53e89e7202f5ae6',
     '/dim0627/hugo_theme_beg/tree/80a6b59904f76ab2b994f231374e0bb700361ff4',
     '/yoshiharuyamashita/blackburn/tree/8103131976363a8e6d53abc8ca01cee1fc6871c8',
     '/Zenithar/hugo-theme-bleak/tree/13bf4609193d972828256164c629beccb9613576',
     '/key-amb/hugo-theme-bootie-docs/tree/e14a0e4fec4720c7a30cf186f623d18259b1dd14',
     '/mmrath/hugo-bootstrap/tree/d8219ca9bb7decbdf50b516e44955c7d0a0d29bb',
     '/digitalcraftsman/hugo-cactus-theme/tree/7921c05d7e32df62f43b0df46b501f455182e290',
     '/vjeantet/hugo-theme-casper/tree/0c6470f66b81e9bf24aa29c5b9e1f1db289368d1',
     '/nishanths/cocoa-hugo-theme/tree/e32b495b664ea8cc156a9fa38a3157077837672b',
     '/digitalcraftsman/hugo-creative-theme/tree/5c1fdb2dd5f60dd6ce0d28ce940cdddcfe403fe6',
     '/Zenithar/hugo-theme-crisp/tree/e99ce5df55f8184af72c0d3e6878c99a47e3fdf2',
     '/allnightgrocery/hugo-theme-blueberry-detox/tree/b56f0e23827959e53f58f3819061a15da00df204',
     '/digitalcraftsman/hugo-freelancer-theme/tree/e8dd1373964d5b838d130b5b2b9b2a04ba3c88d8',
     '/jpescador/hugo-future-imperfect/tree/2db677fe337a2bab43b776318559f86db9316daf',
     '/jbub/ghostwriter/tree/f3662c30747ad192f8dceee43e094f70ad1e9963',
     '/cdipaolo/gindoro/tree/6bfc86bd85d02e524b69aca3bfd134abd5cecab7',
     '/oarrabi/github-project-landing-page/tree/9d8a23c233479f0411c3547ba2fb0fc14df6ca6f',
     '/cxfksword/greyshade/tree/28fb061bb674a2add89724dfbbf167f88f381d40',
     '/chipsenkbeil/grid-side/tree/d3846256bb43410ddcd0288a60884506abeb42ff',
     '/hbpasti/heather-hugo/tree/cb93a3bd5fffcb1c5f0c9bc4623c92819a0f0a9c',
     '/spf13/herring-cove/tree/fe7ce044ce05343dd38e3076d1d9d44215c99f5a',
     '/digitalcraftsman/hugo-hikari-theme/tree/ecfaed71d673a17add5bb94d07fbd42c89c4c4e9',
     '/simonmika/hugo-theme-html5/tree/9a3e7a0b479c6d06147c05759723d312c90cf0aa',
     '/crakjie/hugo-base-theme/tree/f78b4d181b8dc970abde3a704524ac259af74f68',
     '/appernetic/hugo-bootstrap-premium/tree/d1f1b8cbc7d67fb7b06957cf3eca8b6b62c83fdb',
     '/nilproductions/hugo-bootswatch/tree/4367ac101b911780b8ce8050cbca102bb0e6f93f',
     '/adejoux/hugo-darkdoc-theme/tree/7b2069f14038c3e1cb0792e040a1a91c1480ec22',
     '/alexurquhart/hugo-geo/tree/b14540d1d84ae3c129d2b76a25cab89860a5eda5',
     '/garvincasimir/hugo-h5bp-simple/tree/7741bbce77c9fd66843ca102ae4ed118b42dd06e',
     '/digitalcraftsman/hugo-icarus-theme/tree/ff019b8cac212562bcd39dc0c64c7d3e1bf8ae5c',
     '/aerohub/hugo-identity-theme/tree/79e03ea6a44842be039984b08659b37238bb53bb',
     '/nilproductions/hugo-incorporated/tree/3d21a638bbad6b3902873cfe8a13cb15d6d3ce0a',
     '/jrutheiser/hugo-lithium-theme/tree/08ba4a9261d876837416d1129714e50d79a1b5cb',
     '/jchatkinson/HugoMDL/tree/79c6f0ec4ceab910e169c759924320b8b007cae0',
     '/digitalcraftsman/hugo-minimalist-theme/tree/21036c6936a63789a2b7edc7b88b99dcb425a4a2',
     '/mpas/hugo-multi-bootswatch/tree/0f12d1be8d1d4ff84f79f228d6c0584ff631e8fc',
     '/parsiya/Hugo-Octopress/tree/bcd9bcd32cc2810858c223c3f316420eb1b49795',
     '/coderzh/hugo-pacman-theme/tree/604c57f4184ccbeef8340208c5900fe269e968f5',
     '/nraboy/hugo-phlat-theme/tree/168cd40e198b248e9fe6d620a4140861ef46b282',
     '/H4tch/hugo-plus/tree/bba92902cd393af3b538380ed032df32d0cfc54c',
     '/syui/hugo-theme-arch/tree/31a5301662204879642302e7c1fdfc00f8c033b3',
     '/masa0221/hugo-theme-geppaku/tree/d56d47c483c84c514de5b37727ee008ef9b95c62',
     '/matcornic/hugo-theme-learn/tree/75df02c3ab7dc5e8fe81c5e220e3e7c54bea3252',
     '/SenjinDarashiva/hugo-uno/tree/a66c2dbb189c9eb6ee8ca89ec5342a38edcb0625',
     '/rakuishi/hugo-zen/tree/ff5a38825a65229bca110739c261d7e782f751e7',
     '/SenjinDarashiva/hugoscroll/tree/6f6ce41f4791f507f257c09a7f45ca306894d6f9',
     '/TiTi/hurock/tree/13081b8ef3c29d3ab27e25ea21a7a510e53c3376',
     '/spf13/hyde/tree/a04b9e15746f679a3a6d8f325b82e2392b20d380',
     '/zyro/hyde-x/tree/ee61d837b64e6a15adf12ee3f231dc29ede871bd',
     '/enten/hyde-y/tree/493bf9942c9cd88f714f6616dead1bec71c5c8f2',
     '/jnjosh/internet-weblog/tree/6a9c9a54690ced05ecf72e17c693da17221e3d85',
     '/mpas/hugo-journal/tree/e543e4bfa9de79306e906f34da42a8ee7ac48bc8',
     '/crakjie/landing-page-hugo/tree/9280715492960afb82dc5a3add39c9f458d5a41e',
     '/tummychow/lanyon-hugo/tree/0c3da68b8e1cc9b7616c977aab34d7cd7e283da9',
     '/eliasson/liquorice/tree/b2b6d571e0fb2ab4470964320bebf1beb418c812',
     '/pdevty/material-design/tree/0d88058fead04a70d14f34552e07a1fd81911ca6',
     '/digitalcraftsman/hugo-material-docs/tree/e133b473c44fef8f13813bd660d8975a272c4005',
     '/SamuelDebruyn/hugo-material-lite/tree/24a23e3be3f7f9b2f9d825b5ab4aa51f9cf1818c',
     '/leopku/hugo-theme-next/tree/b8c8076d32755b3921ec3e461c77b6c67716150f',
     '/gizak/nofancy/tree/151dd320af1f43d7c783f80d2a98df6ccc59b26f',
     '/pcdummy/hugo-theme-persona/tree/afb2d458f97b6fe1c295afbcd336b3c6e71028d0',
     '/azmelanar/hugo-theme-pixyll/tree/50dad694aaf8edbadc2e100b30d8742dad5f0030',
     '/pdevty/polymer/tree/d8be0028b80dcf7026489598f4f8905ecc291611',
     '/vjeantet/hugo-theme-projecthub/tree/d0fb856de1d22a396659a9c3deeeddeb4eeb256e',
     '/dplesca/purehugo/tree/3c4ee78cb7905f00545498733c8f7f6c18bc360c',
     '/tmaiaroto/hugo-redlounge/tree/7853d1eb5de8976a3087db0bff3f865b8efbd72d',
     '/dim0627/hugo_theme_robust/tree/69ce3f191a948ddeb323de3e836d0152f8525a30',
     '/esell/rocktopus/tree/c47bcd755c4fd25a50bdff09fc1064da6d29bbe0',
     '/chibicode/hugo-theme-shiori/tree/e23d4a165a25e08d054ccce96f072989138347c5',
     '/AlexFinn/simple-a/tree/3338b258ebccf0ed027fe9cb9f988fd023364a9c',
     '/druzza/simple-hugo/tree/20d9086e64e8493b931a419e52866e007e8699e7',
     '/CrimsonRay/slender/tree/0874af0b51a2ce3e7603e5f0f2d76ef1eb05e15e',
     '/zhe/hugo-theme-slim/tree/a491cb0d5d01970c83d81cd58cb204091d085ee3',
     '/humboldtux/startbootstrap-clean-blog/tree/c3ff77a0e2b29f5be3e6812139f4d23945d8a3ff',
     '/digitalcraftsman/hugo-steam-theme/tree/d917cfa3ac9b89ca2adca2ed796d03a915a74d3c',
     '/digitalcraftsman/hugo-strata-theme/tree/253c8bba874a9bb60ef9bd1ba925fc52d1a795a7',
     '/marloncabrera/tachyons/tree/a98c649c0afc175d2332c9ba9a6e0cc92aa7346b',
     '/roperzh/tinyce-hugo-theme/tree/ec650dfeeda9ae94152e51f0b8b01d61dbbf9e0c',
     '/jaden/twentyfourteen/tree/83f0a782af8c678f0017d56c2ac8478c31cf8c52',
     '/digitalcraftsman/hugo-type-theme/tree/e8a2c263a3d1bbe1bfd53ab0892b0ad0620769a0',
     '/keichi/vienna/tree/890e97c5a052984d04294862ac60c7e0bfc6a934',
     '/spf13/hugoThemes/blob/master/.gitmodules',
     '/spf13/hugoThemes/blob/master/LICENSE',
     '/spf13/hugoThemes/blob/master/README.md' ] }

Regards!

calendarbase commented 7 years ago

Thanks! I saw this now. Looks like a useful solution. Don't remember what I finally used. A strange combination of Osmosis and RB/J I think.