Open MMihut opened 9 years ago
http://williamjturkel.net/2013/09/29/writing-a-simple-web-spider-using-command-line-tools-in-linux/
import sys import re import urllib2 import urlparse tocrawl = set(["http://www.facebook.com/"]) crawled = set([]) keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent="\'["\']\s/>') linkregex = re.compile('<a\shref=[\'|"](.?)[\'"].*?>') while 1: try: crawling = tocrawl.pop() print crawling except KeyError: raise StopIteration url = urlparse.urlparse(crawling) try: response = urllib2.urlopen(crawling) except: continue msg = response.read() startPos = msg.find('
class MySpider(BaseSpider):
    """Spider whose single start URL is supplied at construction time.

    The URL is read from the ``start_url`` keyword argument, e.g.
    ``MySpider(start_url="http://example.com/")``.
    """

    name = 'my_spider'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and set ``start_urls``.

        Args:
            *args: Positional arguments passed through to ``BaseSpider``.
            **kwargs: Keyword arguments passed through to ``BaseSpider``.
                ``start_url``, when present and non-empty, becomes the
                only entry of ``self.start_urls``.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        start_url = kwargs.get('start_url')
        # A missing start_url used to yield [None], i.e. a list holding a
        # non-URL entry; an empty list states "nothing to crawl" instead.
        self.start_urls = [start_url] if start_url else []
Écrire ou récupérer le code d'une application en ligne de commande pour le codage d'un Web Crawler.