appjitsu opened this issue 8 years ago
So far I have this in content.js:
chrome.runtime.onMessage.addListener(
  function(request, sender, sendResponse) {
    if (request.message === "clicked_browser_action") {
      var firstHref = $("a[href^='http']").eq(0).attr("href");
      console.log(firstHref);
      console.log("window: ", window);

      var jobList = $('a.main-headline').scrape({
        url: 'href',
        title: 'text'
      });

      // var jobList = window.artoo.scrape('a.main-headline', {
      //   url: 'href',
      //   title: 'text'
      // });

      console.log("jobList: ", jobList);
    }
  }
);
and this in background.js:
chrome.browserAction.onClicked.addListener(function() {
  // Send a message to the active tab
  chrome.tabs.query({active: true, currentWindow: true}, function(tabs) {
    var activeTab = tabs[0];
    chrome.tabs.sendMessage(activeTab.id, {"message": "clicked_browser_action"});
  });
});
In the manifest I added jQuery:
"content_scripts": [{
"matches": ["http://*/*", "https://*/*"],
"js": ["chrome/jquery-2.2.0.min.js", "chrome/content.js"],
"run_at": "document_end"
}],
jQuery works fine inside content.js, but I am unsure how to use artoo from there.
var jobList = $('a.main-headline').scrape({
gives:
Error in event handler for runtime.onMessage: TypeError: $(...).scrape is not a function
I got it to work!
So I added "build/artoo.chrome.js" to the list of scripts:
"content_scripts": [{
"matches": ["http://*/*", "https://*/*"],
"js": ["chrome/jquery-2.2.0.min.js", "build/artoo.chrome.js",
"chrome/content.js"
],
"run_at": "document_end"
}],
Then I commented out the injection code in background.js. Works perfectly!
If you have any feedback on my approach, please let me know. Thanks!
This seems to be a good approach @appjitsu. Don't hesitate to come back if you have issues with the lib when running your content script.
I am attempting to scrape https://www.linkedin.com/vsearch/j?orig=JSHP&keywords=PHP+Developer&distance=50&locationType=I&countryCode=us&trk=two_box_geo_fill.
I want to grab the job title, company, etc. from the detail pages.
I can get the array of links on the page, but when I ask the spider to scrape each of those links, I never get anything back for the elements I am asking for. I used the goodreads example as my baseline. Any help would be appreciated.
var urls = artoo.scrape('a.main-headline', 'href');
console.log("urls: ", urls);

// Deploying an ajax spider
artoo.ajaxSpider(
  // Feeding the spider with our urls
  urls,

  // Registering some parameters
  {
    // We want the html to be parsed with jQuery
    jquerify: true,

    // Function executed at each step
    process: function($page, i) {
      artoo.log.debug('Fetching page #' + i);

      // Returning the results of scraping the page
      // TODO: adjust the scraping to fit your tastes
      return $page.scrape({
        title: {
          sel: 'h1'
        },
        author: {
          sel: '[itemprop=name]'
        }
      });
    },

    // A limit for dev purposes (remove it if you want everything)
    limit: 2,

    // Callback for when we are done
    done: function(books) {
      // Inform we are done with a sound
      artoo.beep('original');

      // Showing results
      console.log(books);

      // let's download our data
      artoo.savePrettyJson(books, 'books.json');
    }
  }
);
This is the console output:
urls: ["https://www.linkedin.com/jobs2/view/105910449?trk=…0%2CVSRPtargetId%3A105910449%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/86741421?trk=v…90%2CVSRPtargetId%3A86741421%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/97607083?trk=v…90%2CVSRPtargetId%3A97607083%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/95582213?trk=v…90%2CVSRPtargetId%3A95582213%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/99614314?trk=v…90%2CVSRPtargetId%3A99614314%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/92597211?trk=v…90%2CVSRPtargetId%3A92597211%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/97663826?trk=v…90%2CVSRPtargetId%3A97663826%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/99641565?trk=v…90%2CVSRPtargetId%3A99641565%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/95578916?trk=v…90%2CVSRPtargetId%3A95578916%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/99667665?trk=v…90%2CVSRPtargetId%3A99667665%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/96519810?trk=v…90%2CVSRPtargetId%3A96519810%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/92595388?trk=v…90%2CVSRPtargetId%3A92595388%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/93866028?trk=v…90%2CVSRPtargetId%3A93866028%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/87874693?trk=v…90%2CVSRPtargetId%3A87874693%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/96524207?trk=v…90%2CVSRPtargetId%3A96524207%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/93814206?trk=v…90%2CVSRPtargetId%3A93814206%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/86793592?trk=v…90%2CVSRPtargetId%3A86793592%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/98108669?trk=v…90%2CVSRPtargetId%3A98108669%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/99692742?trk=v…90%2CVSRPtargetId%3A99692742%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/99618918?trk=v…90%2CVSRPtargetId%3A99618918%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/98183858?trk=v…90%2CVSRPtargetId%3A98183858%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/97644983?trk=v…90%2CVSRPtargetId%3A97644983%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/92580011?trk=v…90%2CVSRPtargetId%3A92580011%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/93804129?trk=v…90%2CVSRPtargetId%3A93804129%2CVSRPcmpt%3Aprimary", "https://www.linkedin.com/jobs2/view/92547832?trk=v…90%2CVSRPtargetId%3A92547832%2CVSRPcmpt%3Aprimary"]
artoo.chrome.js:2114 [artoo]: debug - Fetching page #0
artoo.chrome.js:2114 [artoo]: debug - Fetching page #1
content.js:86 [Array[1], Array[1]]
The values in each result object are empty strings.
Hello @appjitsu. When I run the above code in the page you linked, I get the following JSON:
[
  [
    {
      "title": "PHP Developer",
      "author": "XebiaLabsXebiaLabs"
    }
  ],
  [
    {
      "title": "PHP Developer",
      "author": "ProgrexionProgrexion"
    }
  ]
]
What browser are you using?
I am using Google Chrome v48.0.2564.82 m.
And you don't get the same result as I do?
No, I get the same thing as yesterday; an empty result.
Here is the project: https://www.dropbox.com/s/rw94ggly2j9jyq4/artoo.zip?dl=0
[
  [
    {
      "title": "",
      "author": ""
    }
  ],
  [
    {
      "title": "",
      "author": ""
    }
  ]
]
That's very weird indeed. If you visit the pages by hand and use the same scraper you are using with the spider, does it work?
I commented out everything else and just ran the following directly on the detail page and it works fine.
var data = artoo.scrape('body', {
  title: {
    sel: 'h1'
  },
  author: {
    sel: '[itemprop=name]'
  }
});
console.log("data: ", JSON.stringify(data));
I get the following data:
[{
  "title": "PHP DeveloperMeet XebiaLabsCompanySimilar JobsPHP Developer - PHP, HTML5, CSS3Senior Web Developer - WordpressResponsive Web Developer / Demand GenerationWeb Developer Web Developer: Html, Javascript, CSS, and JQuery. Top Technology Company. Web DeveloperSoftware Application DeveloperFront End Developer - HTML, CSS, JavascriptWeb Developer *College Grads Welcome! People also viewedPHP DeveloperPHP DeveloperWeb DeveloperPHP DeveloperPHP DeveloperPHP DeveloperPHP DeveloperPHP DeveloperPHP DeveloperPHP Developer Current job:PHP Developer",
  "author": "XebiaLabs"
}]
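For what it's worth, the concatenated "title" above suggests the h1 selector matched a heading that also contains the related-jobs text. A narrower sketch might look like the following (the selectors are guesses based on the markup quoted later in this thread, not confirmed against the live page):

var data = artoo.scrape('div.top-row', {
  title: {
    sel: 'h1.title'
  },
  author: {
    sel: '[itemprop=name]'
  }
});
console.log("data: ", JSON.stringify(data));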
Then try disabling jquerify and log what string the spider gives you in the process callback. Note also that you should probably be using scrapeOne rather than scrape, since you only need one result and not a list.
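A minimal sketch of that debugging step, assuming artoo's documented ajaxSpider behavior (with jquerify disabled, the process callback should receive the raw response text rather than a jQuery-wrapped document):

artoo.ajaxSpider(urls, {
  // jquerify is off, so data is the raw response string
  process: function(data, i) {
    artoo.log.debug('Raw response for page #' + i + ':');
    console.log(data); // inspect what the spider actually fetched
    return data;
  },
  limit: 2,
  done: function(pages) {
    console.log(pages);
  }
});

If the logged HTML contains the elements you expect, re-enable jquerify and swap scrapeOne in for scrape to get a single object per page instead of a one-element array.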
Are there any known problems with using jQuery 2.2.0?
I don't think it could affect our case if jQuery follows semver specs. But just in case, try with 2.1.3.
I still cannot get the detail-page scraper to work when I start it from the search results page. I can get the URLs on the search results page, and going directly to a detail page and scraping it works.
I tried jQuery v2.1.3 and that had no positive effect.
Can you please download from the following link and try it on your end? https://www.dropbox.com/s/rw94ggly2j9jyq4/artoo.zip?dl=0
Search results page: https://www.linkedin.com/vsearch/j?orig=JSHP&keywords=PHP+Developer&distance=50&locationType=I&countryCode=us&trk=two_box_geo_fill
var scraper_link = {
  iterator: 'li.job',
  data: {
    url: {
      sel: 'a.result-image',
      attr: 'href'
    }
  },
  params: {
    limit: 1
  }
};

var scraper = {
  iterator: 'div.top-row',
  data: {
    title: {
      sel: 'div.content > div.first-line > div.titleAndTags > h1.title'
    }
  },
  params: {
    done: function(data) {
      console.log("scraper data: ", data);
      artoo.s.pushTo('jobs_list', data);
    }
  }
};

var urls = artoo.scrape(scraper_link);
console.log("urls: ", urls);

artoo.ajaxSpider(urls, {
  throttle: 3000,
  scrape: scraper,
  done: function(data) {
    artoo.log.debug('Finished retrieving data. Downloading...');
    artoo.saveJson(artoo.s.get('jobs_list'), {
      filename: 'jobs.json'
    });
    artoo.s.remove('jobs_list');
  }
});
@Yomguithereal have you had a chance to try the code above?
Not yet, sorry. Can you remind me next week, I won't have time before then.
Sure thing! We're all busy, so I totally understand. ;)
I think the reason it's not working is that the page is not fully rendered when the crawler gets the HTML. The page loads a script once it has finished loading that puts all of the content in its proper place. What do you do in this instance? Is there any way to tell the crawler to wait?
@appjitsu, if you are crawling client-side with an ajax spider, no, you cannot. Ajax requests will only send you raw HTML.
If you are crawling server-side, then yes, you can wait. There is even artoo.waitFor to help you do so.
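For reference, a hedged sketch of how artoo.waitFor is typically used: it polls a condition and fires a callback once the condition holds. The parameter names below follow artoo's docs, but verify them against your version:

artoo.waitFor(
  // Condition polled until it returns true
  function() {
    return $('h1.title').length > 0;
  },
  // Callback fired once the condition holds
  function() {
    console.log('title rendered: ', $('h1.title').text());
  },
  // Polling interval and timeout, in milliseconds
  {interval: 50, timeout: 5000}
);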
Ok, I am back. Did you make any progress on your side?
No, not really. How would you use the waitFor method with a spider?
The spider uses ajax, so the data it returns is static. There is no real way around it. You are reaching the limits of what you can achieve in a browser. If the other pages you are fetching are dynamic and need to execute JS to display the data you want, then this starts to be difficult. Reverse-engineering the target's ajax calls might be easier, then.
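To illustrate the reverse-engineering idea: if the detail page fills itself from a JSON endpoint, you can often call that endpoint directly instead of scraping the rendered DOM. The URL and response shape below are purely hypothetical; you would need to find the real call in the browser's network panel:

// Hypothetical endpoint and fields, for illustration only
$.getJSON('https://example.com/jobs/api/view/105910449', function(job) {
  artoo.s.pushTo('jobs_list', {
    title: job.title,
    company: job.company
  });
});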
Another way to crawl is to load the page in a tab and grab the content once the page has finished loading. I just tried it with the link @appjitsu posted and it seems to work fine. Repository here: https://github.com/nobodxbodon/ChromeCrawlerWildSpider
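Roughly, the tab-based approach looks like the sketch below. This is an illustrative outline, not code from the linked repo; the 'scrape_page' message name is made up, and the content script in the opened tab would need a matching onMessage handler that scrapes and calls sendResponse:

// Open the URL in a background tab, wait for it to finish loading,
// then ask the content script in that tab to scrape and reply.
function crawlInTab(url) {
  chrome.tabs.create({url: url, active: false}, function(tab) {
    chrome.tabs.onUpdated.addListener(function listener(tabId, info) {
      if (tabId === tab.id && info.status === 'complete') {
        chrome.tabs.onUpdated.removeListener(listener);
        chrome.tabs.sendMessage(tab.id, {message: 'scrape_page'}, function(result) {
          console.log('scraped: ', result);
          chrome.tabs.remove(tab.id); // close the tab when done
        });
      }
    });
  });
}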
So I have the repo cloned and it is running fine as-is. Now, if I wanted to customize this for my own needs, where do I put the code? It's not readily apparent to us mere mortals.
Something like: