Closed lost-RD closed 8 years ago
cat test.lua
#!/usr/bin/env luajit
local cURL = require"cURL";
local html = require"htmlparser"
-- Seed the PRNG once from the wall clock so that each run can pick a different
-- user agent. A single call is enough: math.randomseed returns nothing on
-- Lua 5.1/LuaJIT, so nesting calls or summing timestamps adds no randomness
-- and only risks pushing the seed outside the 32-bit integer range.
math.randomseed(os.time())

-- Pool of browser User-Agent strings; one is chosen at random per run.
-- The trailing comments name the browser/OS that each string itself advertises.
local UAs = {
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Windows 7, 64-bit
  "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Ubuntu, 32-bit
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.22", -- Chrome 41, Windows 7, 64-bit
  "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2357.10 Safari/537.22", -- Chrome 43, Windows XP
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/42.0.2311.82 Safari/537.22", -- Chrome 42, Mac OS X 10.7.5
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 64-bit
  "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 32-bit
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", -- Internet Explorer 9, Windows 7, 64-bit
  -- NOTE(review): real Windows 8.1 reports "Windows NT 6.3"; string kept as-is, confirm whether that matters.
  "Mozilla/5.0 (compatible; MSIE 11.0; Windows NT 8.1; WOW64; Trident/7.0)", -- Internet Explorer 11, 64-bit
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17", -- Safari 6.0.2, Mac OS X 10.8.2
}

-- Extra HTTP request headers sent with the request.
local headers = {
  "Accept: text/*",
  "Accept-Language: ru,en",
  "Accept-Charset: utf-8,cp1251,koi8-r,iso-8859-5,*",
  "Cache-Control: no-cache",
}
-- Fetch base_url with a randomly chosen user agent, parse the response body
-- with htmlparser, and print the content of every element matched by the
-- 'div.col h4' selector.
local base_url = "http://pastebin.com/raw/cahjwTjC"

local c = cURL.easy_init()
c:setopt_httpheader(headers)
c:setopt_cookiefile("") -- presumably enables libcurl's in-memory cookie handling; confirm against the binding docs
c:setopt_followlocation(1)
c:setopt_useragent(UAs[math.random(1, #UAs)])
c:setopt_url(base_url)

-- Collect body chunks in a table and join them once at the end; repeated
-- string concatenation per chunk is quadratic in the response size.
local chunks = {}
c:setopt_writefunction(function(chunk)
  chunks[#chunks + 1] = chunk
  return true
end)

-- Run the transfer under pcall so the handle is closed even if it raises.
local ok, err = pcall(c.perform, c)
c:close()
if not ok then
  io.stderr:write("request for ", base_url, " failed: ", tostring(err), "\n")
  os.exit(1)
end

local main_page = html.parse(table.concat(chunks))('div.col h4')
for _, e in ipairs(main_page) do
  print(e:getcontent())
end
os.exit(0)
luajit test.lua
Prices and research
<span class="arrow-down "></span>
Products
<span class="arrow-down "></span>
Services
<span class="arrow-down "></span>
Listings
<span class="arrow-down "></span>
Education
<span class="arrow-down "></span>
About ASX
<span class="arrow-down "></span>
Regulation
<span class="arrow-down "></span>
Here's an example of a page that the parser fails on:
http://pastebin.com/cahjwTjC