msva / lua-htmlparser

An HTML parser for lua.
231 stars 44 forks source link

Parse fails on some pages #41

Closed lost-RD closed 8 years ago

lost-RD commented 8 years ago

Here's an example of a page that the parser fails on:

http://pastebin.com/cahjwTjC

msva commented 8 years ago

cat test.lua

#!/usr/bin/env luajit
local cURL = require"cURL";
local html = require"htmlparser"

-- Seed the PRNG once from the current time so that math.random yields a
-- different sequence on each run (runs started within the same second share
-- a seed). A single call is sufficient; nesting a second randomseed call
-- inside the argument adds no extra entropy.
math.randomseed(os.time())

-- Pool of browser User-Agent strings; one entry is picked at random for the
-- request. The trailing comments summarise the browser/OS that each string
-- itself advertises.
local UAs = {
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Windows 7, 64bit
  "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:37.0) Gecko/20130101 Firefox/37.0", -- Firefox 37, Ubuntu, 32bit
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.22", -- Chrome 41, Windows 7, 64bit
  "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2357.10 Safari/537.22", -- Chrome 43, Windows XP
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/42.0.2311.82 Safari/537.22", -- Chrome 42, Mac OS X 10.7
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 64bit
  "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/43.0.2537.10 Safari/537.22", -- Chrome 43, Linux, 32bit
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", -- Internet Explorer 9, Windows 7, 64bit
  "Mozilla/5.0 (compatible; MSIE 11.0; Windows NT 8.1; WOW64; Trident/7.0)", -- Internet Explorer 11, Windows 8.1, 64bit
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17", -- Safari 6.0.2, Mac OS X 10.8
}

-- Extra HTTP request headers handed to cURL: accept any text content,
-- prefer Russian then English, list the acceptable charsets, and ask for an
-- uncached response.
local headers = {
  "Accept: text/*",
  "Accept-Language: ru,en",
  "Accept-Charset: utf-8,cp1251,koi8-r,iso-8859-5,*",
  "Cache-Control: no-cache",
}

-- Fetch the sample page with a randomly chosen User-Agent, parse it with
-- htmlparser and print the content of every element matching 'div.col h4'.
local c = cURL.easy_init()
c:setopt_httpheader(headers)
-- NOTE(review): per libcurl's CURLOPT_COOKIEFILE documentation an empty file
-- name turns on the cookie engine without loading any cookies from disk.
c:setopt_cookiefile("")
c:setopt_followlocation(1)

local base_url = "http://pastebin.com/raw/cahjwTjC"

-- The write callback may be invoked many times per response. Collect the
-- pieces in a table and join them once afterwards: appending to a string on
-- every call would re-copy the whole body received so far each time.
local chunks = {}

c:setopt_useragent(UAs[math.random(1, #UAs)])
c:setopt_url(base_url)
c:setopt_writefunction(function(result)
  chunks[#chunks + 1] = result
  return true
end)
c:perform()

local buf = table.concat(chunks)
chunks = nil -- drop the per-chunk copies now that the body is joined

-- Calling the parsed root node with a selector string returns the matches.
local main_page = html.parse(buf)('div.col h4')
buf = "" -- the raw body is no longer needed once parsed

for _, e in ipairs(main_page) do
  print(e:getcontent())
end

c:close()
os.exit(0)

luajit test.lua

Prices and research
              <span class="arrow-down "></span>

Products
              <span class="arrow-down "></span>

Services
              <span class="arrow-down "></span>

Listings
              <span class="arrow-down "></span>

Education
              <span class="arrow-down "></span>

About ASX
              <span class="arrow-down "></span>

Regulation
              <span class="arrow-down "></span>