zrashwani / arachnid

Crawl all unique internal links found on a given website, and extract SEO related information - supports javascript based sites
MIT License
253 stars 60 forks source link

404 error is hardcoded #26

Closed mkantautas closed 7 years ago

mkantautas commented 7 years ago

Hello, the error_code appears to be hardcoded to always return 404, but in real life we often deal with other codes such as 403 or 500. It would be nice to see a bit more information - I know this is not difficult to check. :)

For example, the method could look something like this:

/**
 * Request a URL with cURL and return the HTTP status code of the response.
 *
 * @param string $a URL to request.
 *
 * @return int HTTP status code reported by cURL (e.g. 200, 403, 404, 500);
 *             0 when the cURL handle could not be created or no response
 *             code was received.
 */
function check_http_code($a)
{
    $ch = curl_init();
    if ($ch === false) {
        // No handle means no request can be made; report "no response code".
        return 0;
    }

    curl_setopt($ch, CURLOPT_URL, $a);
    // Return the body instead of printing it; only the status code is needed,
    // so the body is discarded and response headers are not requested.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_exec($ch);

    // Ask cURL for the status code directly rather than building the full info array.
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return $code;
}
zrashwani commented 7 years ago

Hello, I cannot reproduce the issue you have: the status_code value is filled correctly with the corresponding http_response_code. I tried crawling http://zrashwani.com/test-crawler/index.php for testing and got the following result for the broken links:

LinksCollection {#80 ▼
  #items: array:4 [▼
    "/test-crawler/test-403.php" => array:10 [▼
      "original_urls" => array:1 [▶]
      "links_text" => array:1 [▶]
      "absolute_url" => "http://zrashwani.com/test-crawler/index.phptest-403.php"
      "external_link" => false
      "visited" => false
      "frequency" => 1
      "source_link" => "http://zrashwani.com/test-crawler/index.php"
      "depth" => 1
      "status_code" => 403
      "error_message" => 403
    ]
    "/test-crawler/test-500.php" => array:10 [▼
      "original_urls" => array:1 [▶]
      "links_text" => array:1 [▶]
      "absolute_url" => "http://zrashwani.com/test-crawler/index.phptest-500.php"
      "external_link" => false
      "visited" => false
      "frequency" => 1
      "source_link" => "http://zrashwani.com/test-crawler/index.php"
      "depth" => 1
      "status_code" => 500
      "error_message" => 500
    ]
    "/test-crawler/test-400.php" => array:10 [▼
      "original_urls" => array:1 [▶]
      "links_text" => array:1 [▶]
      "absolute_url" => "http://zrashwani.com/test-crawler/index.phptest-400.php"
      "external_link" => false
      "visited" => false
      "frequency" => 1
      "source_link" => "http://zrashwani.com/test-crawler/index.php"
      "depth" => 1
      "status_code" => 400
      "error_message" => 400
    ]
    "/test-crawler/broken-link.php" => array:10 [▼
      "original_urls" => array:1 [▶]
      "links_text" => array:1 [▶]
      "absolute_url" => "http://zrashwani.com/test-crawler/index.phpbroken-link.php"
      "external_link" => false
      "visited" => false
      "frequency" => 1
      "source_link" => "http://zrashwani.com/test-crawler/index.php"
      "depth" => 1
      "status_code" => 404
      "error_message" => 404
    ]
  ]
}

Please provide me with more information if the status_code is set incorrectly for a specific domain you have, so I can trace the problem.

mkantautas commented 7 years ago

OK, it looks like this was a false positive. It seems to work in further tests.