MMihut opened this issue 9 years ago
[ http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/ ]
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

class LinkParser(HTMLParser):
    # Collect the href of every <a> tag, resolved against the page's base URL.
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        self.baseUrl = url
        response = urlopen(url)
        # Use a substring check: servers often send "text/html; charset=UTF-8",
        # which the original exact comparison would reject.
        contentType = response.getheader('Content-Type') or ''
        if 'text/html' in contentType:
            htmlBytes = response.read()
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # Breadth-first crawl: keep visiting pages until the word is found,
    # the queue is empty, or maxPages is reached.
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
            pagesToVisit = pagesToVisit + links
            print(" Success!")
        except Exception:
            print(" Failed!")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")
A simple Python web crawler. The soup is beautiful: BeautifulSoup is an HTML/XML parser for Python, and this little library is very useful when parsing the HTML code of a website. In this project it is used to build our tiny web crawler. If you do not have BeautifulSoup, I suggest you download it from here (DOWNLOAD). The serious work:
The entire code is as given below.
from BeautifulSoup import *
from urlparse import *
import urllib2

linksarray = []
page = 'http://www.mathsisfun.com/'  # this value will be used to complete relative links
c = urllib2.urlopen('http://www.mathsisfun.com')
data = c.read()
soup = BeautifulSoup(data)

links = soup.findAll('a')  # finds all the links in the page
for link in links:
    str_links = link.get('href')
    linksarray.append(page + str(str_links))

# Write every collected URL into links2.html as an HTML hyperlink.
file_links = open('links2.html', 'w')
for linking in range(len(linksarray)):
    hyperlink = '<a href="' + linksarray[linking] + '">' + linksarray[linking] + '</a>'
    file_links.write(hyperlink)
file_links.close()

# Open each collected link and save its HTML under its index as the file name.
for i in range(len(linksarray)):
    try:
        nextdata = urllib2.urlopen(linksarray[i])
        namestr = str(i)
        name = namestr + ".html"
        data2 = nextdata.read()
        file1 = open(name, 'w')
        file1.write(data2)
        file1.close()
        print i
    except:
        print "could not open link:", linksarray[i]
What this code does is fetch the data (the full page HTML) from a given website, find all the links in it, save them into an HTML file called links2.html, and then open each of those links and save the page data to a file named after its index.
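For anyone on Python 3, here is a minimal sketch of the same idea using the current bs4 package (install with pip install beautifulsoup4) and the standard urllib modules; the seed URL and output file names simply mirror the snippet above and are illustrative:

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

page = 'http://www.mathsisfun.com/'  # seed URL (illustrative)
data = urlopen(page).read()
soup = BeautifulSoup(data, 'html.parser')

# Collect every href, resolving relative links against the seed URL.
linksarray = [urljoin(page, a.get('href')) for a in soup.find_all('a') if a.get('href')]

# Write the collected URLs into links2.html as hyperlinks.
with open('links2.html', 'w') as file_links:
    for url in linksarray:
        file_links.write('<a href="' + url + '">' + url + '</a>\n')

# Fetch each link and save its HTML to a file named after its index.
for i, url in enumerate(linksarray):
    try:
        with open(str(i) + '.html', 'wb') as out:
            out.write(urlopen(url).read())
        print(i)
    except Exception:
        print("could not open link:", url)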
Search Google for an example of web-crawler code in Python.