Closed ghost closed 4 years ago
@Unumus Normally I am using 5-10 connections to get ~200 - 300 items per minute on a local machine. The system is not supposed to shut down like this in any case. So let's explore the case and maybe improve Crawly's stability.
I would ask for the following information:
After several tries, the behaviour was not identical: in other runs it works well. I could scrape 1969 items before the system crashed. Here is a new type of error.
12:47:14.629 [error] GenServer Crawly.RequestsStorage terminating
** (stop) exited in: GenServer.call(#PID<0.425.0>, {:store, %Crawly.Request{headers: [], middlewares: [Crawly.Middlewares.DomainFilter, Crawly.Middlewares.UniqueRequest, Crawly.Middlewares.UserAgent], options: [], prev_response: %HTTPoison.Response{body: "\r\n\r\n<!doctype html>\r\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7 ie\" lang=\"en\"> <![endif]-->\r\n<!--[if IE 7]> <html class=\"no-js lt-ie9 lt-ie8 ie\" lang=\"en\"> <![endif]-->\r\n<!--[if IE 8]> <html class=\"no-js lt-ie9 ie\" lang=\"en\"> <![endif]-->\r\n<!--[if gte IE 9]> <html class=\"ie no-js\" lang=\"en\"> <![endif]-->\r\n<!--[if !IE]><!-->\r\n<html class=\"no-js\" lang=\"en\">\r\n<!-- <![endif]-->\r\n<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><meta charset=\"utf-8\" /><meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\" /><script type=\"text/javascript\">window.NREUM||(NREUM={});NREUM.info = {\"beacon\":\"bam.nr-data.net\",\"errorBeacon\":\"bam.nr-data.net\",\"licenseKey\":\"cb2fc5a887\",\"applicationID\":\"107236984\",\"transactionName\":\"NFEHZhYDDEpUB0FcXA0bN1cVFwdKQUtBUF4TWARGAU0yS1oAQFZHIFURVwMNEEA=\",\"queueTime\":0,\"applicationTime\":886,\"agent\":\"\",\"atts\":\"\"}</script><script type=\"text/javascript\">(window.NREUM||(NREUM={})).loader_config={licenseKey:\"cb2fc5a887\",applicationID:\"107236984\"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var i=n[t]={exports:{}};e[t][0].call(i.exports,function(n){var i=e[t][1][n];return r(i||n)},i,i.exports)}return n[t].exports}if(\"function\"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(e,n,t){function r(){}function i(e,n,t){return function(){return o(e,[u.now()].concat(f(arguments)),n?null:this,t),n?void 0:this}}var o=e(\"handle\"),a=e(4),f=e(5),c=e(\"ee\").get(\"tracer\"),u=e(\"loader\"),s=NREUM;\"undefined\"==typeof window.newrelic&&(newrelic=s);var 
p=[\"setPageViewName\",\"setCustomAttribute\",\"setErrorHandler\",\"finished\",\"addToTrace\",\"inlineHit\",\"addRelease\"],l=\"api-\",d=l+\"ixn-\";a(p,function(e,n){s[n]=i(l+n,!0,\"api\")}),s.addPageAction=i(l+\"addPageAction\",!0),s.setCurrentRouteName=i(l+\"routeName\",!0),n.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r.prototype={createTracer:function(e,n){var t={},r=this,i=\"function\"==typeof n;return o(d+\"tracer\",[u.now(),e,t],r),function(){if(c.emit((i?\"\":\"no-\")+\"fn-start\",[u.now(),r,i],t),i)try{return n.apply(this,arguments)}catch(e){throw c.emit(\"fn-err\",[arguments,this,e],t),e}finally{c.emit(\"fn-end\",[u.now()],t)}}}};a(\"actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get\".split(\",\"),function(e,n){m[n]=i(d+n)}),newrelic.noticeError=function(e,n){\"string\"==typeof e&&(e=new Error(e)),o(\"err\",[e,u.now(),!1,n])}},{}],2:[function(e,n,t){function r(e,n){var t=e.getEntries();t.forEach(function(e){\"first-paint\"===e.name?c(\"timing\",[\"fp\",Math.floor(e.startTime)]):\"first-contentful-paint\"===e.name&&c(\"timing\",[\"fcp\",Math.floor(e.startTime)])})}function i(e,n){var t=e.getEntries();t.length>0&&c(\"lcp\",[t[t.length-1]])}function o(e){if(e instanceof s&&!l){var n,t=Math.round(e.timeStamp);n=t>1e12?Date.now()-t:u.now()-t,l=!0,c(\"timing\",[\"fi\",t,{type:e.type,fid:n}])}}if(!(\"init\"in NREUM&&\"page_view_timing\"in NREUM.init&&\"enabled\"in NREUM.init.page_view_timing&&NREUM.init.page_view_timing.enabled===!1)){var a,f,c=e(\"handle\"),u=e(\"loader\"),s=NREUM.o.EV;if(\"PerformanceObserver\"in window&&\"function\"==typeof window.PerformanceObserver){a=new PerformanceObserver(r),f=new PerformanceObserver(i);try{a.observe({entryTypes:[\"paint\"]}),f.observe({entryTypes:[\"largest-contentful-paint\"]})}catch(p){}}if(\"addEventListener\"in document){var 
l=!1,d=[\"click\",\"keydown\",\"mousedown\",\"pointerdown\",\"touchstart\"];d.forEach(function(e){document.addEventListener(e,o,!1)})}}},{}],3:[function(e,n,t){function r(e,n){if(!i)return!1;if(e!==i)return!1;if(!n)return!0;if(!o)return!1;for(var t=o.split(\".\"),r=n.split(\".\"),a=0;a<r.length;a++)if(r[a]!==t[a])return!1;return!0}var i=null,o=null,a=/Version\\/(\\S+)\\s+Safari/;if(navigator.userAgent){var f=navigator.userAgent,c=f.match(a);c&&f.indexOf(\"Chrome\")===-1&&f.indexOf(\"Chromium\")===-1&&(i=\"Safari\",o=c[1])}n.exports={agent:i,version:o,match:r}},{}],4:[function(e,n,t){function r(e,n){var t=[],r=\"\",o=0;for(r in e)i.call(e,r)&&(t[o]=n(r,e[r]),o+=1);return t}var i=Object.prototype.hasOwnProperty;n.exports=r},{}],5:[function(e,n,t){func" <> ..., headers: [{"Date", "Thu, 19 Mar 2020 10:47:10 GMT"}, {"Content-Type", "text/html; charset=utf-8"}, {"Transfer-Encoding", "chunked"}, {"Connection", "keep-alive"}, {"Set-Cookie", "__cfduid=d53fbe3c8646bf32a8864ce2e207e0b101584614829; expires=Sat, 18-Apr-20 10:47:09 GMT; path=/; domain=.homebase.co.uk; HttpOnly; SameSite=Lax; Secure"}, {"Cache-Control", "no-cache, no-store"}, {"Pragma", "no-cache"}, {"Expires", "-1"}, {"Vary", "Accept-Encoding"}, {"Set-Cookie", "Bunnings.Device=default; path=/"}, {"Set-Cookie", "ASP.NET_SessionId=erdggz52redh5gzpjc4qsxeo; path=/; HttpOnly; SameSite=Lax"}, {"Set-Cookie", "Bunnings.Device=default; path=/"}, {"Set-Cookie", "ASP.NET_SessionId=erdggz52redh5gzpjc4qsxeo; path=/; HttpOnly; SameSite=Lax"}, {"Set-Cookie", "Bunnings.UserType=RetailUser; path=/"}, {"Set-Cookie", "Bunnings.NearestStoreID=335; expires=Fri, 19-Mar-2021 10:47:09 GMT; path=/"}, {"Set-Cookie", "Bunnings.NearestStoreName=Milton Keynes; expires=Fri, 19-Mar-2021 10:47:09 GMT; path=/"}, {"Set-Cookie", "Bunnings.NearestStoreNumber=0335; expires=Fri, 19-Mar-2021 10:47:09 GMT; path=/"}, {"Set-Cookie", "Bunnings.Region=Homebase; expires=Fri, 19-Mar-2021 10:47:09 GMT; path=/"}, {"Set-Cookie", 
"Bunnings.NearestStoreDisableOnlineOrderFulfilment=False; expires=Fri, 19-Mar-2021 10:47:10 GMT; path=/"}, {"Set-Cookie", "Bunnings.NearestStoreIsRemote=False; expires=Fri, 19-Mar-2021 10:47:09 GMT; path=/"}, {"Set-Cookie", "Bunnings.SelectedSuburbID=; expires=Mon, 09-Mar-2020 10:47:09 GMT; path=/"}, {"Set-Cookie", "Bunnings.SelectedPostcode=; expires=Mon, 09-Mar-2020 10:47:09 GMT; path=/"}, {"Set-Cookie", "__AntiXsrfToken=93447a2d6de14d08ba0333baa9bf33bd; path=/; HttpOnly"}, {"Set-Cookie", "__RequestVerificationToken=UaiwRUprXoSpPJl3guwJCOltQvtR8aOHTSnW4wrSw3AISghNpd5m4BloNGeXpez7SnwKHyPUXU_qxEmxKjHda0oKlyo1; path=/; HttpOnly"}, {"Set-Cookie", "CookieBannerDisplay=false; domain=www.homebase.co.uk; expires=Thu, 16-Apr-2020 09:47:09 GMT; path=/; HttpOnly"}, {"COMMERCE-SERVER-SOFTWARE", "Commerce Server, Enterprise Edition"}, {"X-Server-Region", "Homebase"}, {"X-Frame-Options", "SAMEORIGIN"}, {"Set-Cookie", "ApplicationGatewayAffinity=d5f42a2203cf101bdb79095de889e7ef00167d78417b6877d252c776b358cba7;Path=/;Domain=www.homebase.co.uk"}, {"Set-Cookie", "ApplicationGatewayAffinityCORS=d5f42a2203cf101bdb79095de889e7ef00167d78417b6877d252c776b358cba7;Path=/;Domain=www.homebase.co.uk;SameSite=None;Secure"}, {"CF-Cache-Status", "DYNAMIC"}, {"Expect-CT", "max-age=604800, report-uri=\"https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct\""}, {"Server", "cloudflare"}, {"CF-RAY", "57669d1d1f1ed240-SOF"}], request: %HTTPoison.Request{body: "", headers: [{"User-Agent", "Crawly Bot 1.0"}], method: :get, options: [], params: %{}, url: "https://www.homebase.co.uk/our-range/storage-and-home/furniture/kids-furniture/kids-chest-of-drawers"}, request_url: "https://www.homebase.co.uk/our-range/storage-and-home/furniture/kids-furniture/kids-chest-of-drawers", status_code: 200}, retries: 0, url: "https://www.homebase.co.uk/softline-2-4-chest-of-drawers_p529432"}}, 5000)
** (EXIT) no process: the process is not alive or there's no process currently associated with the given name, possibly because its application isn't started
(elixir 1.10.1) lib/gen_server.ex:1023: GenServer.call/3
(elixir 1.10.1) lib/enum.ex:783: Enum."-each/2-lists^foreach/1-0-"/2
(elixir 1.10.1) lib/enum.ex:783: Enum.each/2
(crawly 0.8.0) lib/crawly/requests_storage/requ (truncated)
My config file
use Mix.Config
# in config.exs
# Crawly runtime configuration.
# NOTE(review): closespider_itemcount: 1000 closes the spider once ~1000 items
# are scraped (see the "closespider_itemcount achieved" log message later in
# this thread) — raise it (e.g. 10_000_000) for full crawls, or disable it.
config :crawly,
# presumably a shutdown/inactivity timeout — confirm against Crawly docs
closespider_timeout: 10,
# concurrency per domain; values above ~20 are reported to overload the queue
concurrent_requests_per_domain: 100,
# stop after this many scraped items — the cause of the early stop reported below
closespider_itemcount: 1000,
# request middlewares, applied in order to every outgoing request
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.UserAgent
],
# item pipelines, applied in order to every scraped item
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :sku, :price]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"} # NEW IN 0.7.0
],
port: 4001
Here is the code I played around with for the Homebase crawler.
defmodule Tools do
  @moduledoc """
  Crawly spider for www.homebase.co.uk.

  Follows category tiles and product tiles from every page, and attempts to
  extract a product item (title, sku, price) from each response. Non-product
  pages yield an item with empty fields, which the Validate pipeline is
  expected to drop.
  """
  @behaviour Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://www.homebase.co.uk"

  @impl Crawly.Spider
  def init() do
    [
      start_urls: [
        "https://www.homebase.co.uk/"
      ]
    ]
  end

  @impl Crawly.Spider
  def parse_item(response) do
    product_categories = extract_href(response, ".article-tiles a")
    product_pages = extract_href(response, "a.product-tile ")

    # Normalize to absolute URLs *before* deduplicating: the original code
    # ran Enum.uniq/1 on the raw hrefs, so a relative and an absolute form
    # of the same link could both survive and be requested twice.
    requests =
      (product_pages ++ product_categories)
      |> Enum.map(&build_absolute_url/1)
      |> Enum.uniq()
      |> Enum.map(&Crawly.Utils.request_from_url/1)

    item = %{
      title: extract_text(response, ".page-title h1"),
      sku: extract_text(response, ".product-header-heading span"),
      price: extract_text(response, ".price-value [itemprop=priceCurrency]")
    }

    %Crawly.ParsedItem{:items => [item], :requests => requests}
  end

  # Concatenated text of all nodes matching `selector` in the response body.
  defp extract_text(response, selector) do
    response.body
    |> Floki.find(selector)
    |> Floki.text()
  end

  # `href` attributes of all nodes matching `selector` in the response body.
  defp extract_href(response, selector) do
    response.body
    |> Floki.find(selector)
    |> Floki.attribute("href")
  end

  # Resolve a possibly-relative URL against the spider's base URL.
  defp build_absolute_url(url), do: URI.merge(base_url(), url) |> to_string()
end
Yes, this is the death of the request storage — at first glance it looked like an error in hackney (e.g. hackney did not handle the empty-queue timeouts). In any case, I will try to reproduce the error locally to come up with some improvements for this case. Otherwise, can you downgrade the concurrency to 10 to see how stable it is?
Before I reduce concurrency to 10, I should note that I re-ran the same spider with concurrency 100. This time it finished normally.
13:00:35.075 [info] Current crawl speed is: 629 items/min
13:00:35.075 [info] Stopping Tools, closespider_itemcount achieved
But the Tools.jl file has only 1112 scraped items.
My current issue is that due to the COVID 19 I am at home with very basic internet connection. So it's hard to reproduce it. Could I ask you to commit it somewhere on github, so it's easy to just clone & run it.
But the Tools.jl file has only 1112 scraped items.
You need to tweak the closespider_itemcount: 1000, E.g. try to have something like 10_000_000 there to see how it works..
Yeah, locally I can't get even a half of your speed, to see the error. I will be able to test once the COVID disaster ends... in the ideal scenario, luckily for us. However, I will try to invent a proper benchmarking system for Crawly.
I could scrape 7096 items, with concurrency of 100.
I will report new issues here if they appear.
But the Tools.jl file has only 1112 scraped items.
You need to tweak the closespider_itemcount: 1000, E.g. try to have something like 10_000_000 there to see how it works..
Yeah, locally I can't get even a half of your speed, to see the error. I will be able to test once the COVID disaster ends... in the ideal scenario, luckily for us. However, I will try to invent a proper benchmarking system for Crawly.
In my own usage, this was also a mystery for me when I was setting up my crawler, as the close count is set by default. I think adding some logging to advise users to tweak this setting when the itemcount is reached would help
@Unumus @Ziinc I kind of agree that 1000 is kind of strange and not intuitive at all. Shall we make it disabled by default?
Sounds good to disable. Maybe can replace it with reporting the total items scraped count, which, if I'm not wrong, is already stored in the worker state.
@oltarasenko Sorry for the long reply — I was busy with another project.
Disabled it by default, it seems like a good idea!
Ok, done!
It looks like it is easy to overflow the queue, bringing the entire system down. This happens when I set concurrency higher than ~20. What is the recommended concurrency setting for Crawly? What performance can you achieve with it?