[root@localhost woaidu_crawler]# scrapy crawl woaidu
Unhandled error in Deferred:
Unhandled Error
Traceback (most recent call last):
File "/usr/lib64/python2.7/site-packages/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 163, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 167, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 1181, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- ---
File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 1039, in _inlineCallbacks
result = g.send(result)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 90, in crawl
six.reraise(*exc_info)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 72, in crawl
self.engine = self._create_engine()
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 97, in _createengine
return ExecutionEngine(self, lambda : self.stop())
File "/usr/lib64/python2.7/site-packages/scrapy/core/engine.py", line 69, in init
self.scraper = Scraper(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/core/scraper.py", line 71, in init
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/pipelines/media.py", line 51, in from_crawler
pipe = cls.from_settings(crawler.settings)
File "/usr/lib64/python2.7/site-packages/scrapy/pipelines/images.py", line 95, in from_settings
return cls(store_uri, settings=settings)
exceptions.TypeError: __init__() got an unexpected keyword argument 'settings'
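
The last frame shows that this Scrapy version constructs the image pipeline as cls(store_uri, settings=settings), so the TypeError means a custom subclass of ImagesPipeline in the project overrides __init__ with the old single-argument signature. Below is a minimal sketch of a version-tolerant override, assuming the subclass in question is WoaiduCoverImage from the ITEM_PIPELINES further down (its real code is not shown here):

# Hypothetical sketch -- the real class lives in
# woaidu_crawler/pipelines/cover_image.py, which is not shown in this post.
from scrapy.pipelines.images import ImagesPipeline

class WoaiduCoverImage(ImagesPipeline):
    def __init__(self, *args, **kwargs):
        # Accept and forward whatever arguments the running Scrapy version
        # passes: newer versions call cls(store_uri, settings=settings),
        # older ones call cls(store_uri) only.
        super(WoaiduCoverImage, self).__init__(*args, **kwargs)
        # ... custom initialisation would go here ...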
The contents of settings.py are as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Scrapy settings for woaidu_crawler project

import os

PROJECT_DIR = os.path.abspath(os.path.dirname(__file__))

BOT_NAME = 'woaidu_crawler'

SPIDER_MODULES = ['woaidu_crawler.spiders']
NEWSPIDER_MODULE = 'woaidu_crawler.spiders'

DOWNLOAD_DELAY = 1
CONCURRENT_ITEMS = 100
CONCURRENT_REQUESTS = 16
# The maximum number of concurrent (i.e. simultaneous) requests that
# will be performed to any single domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0
DEPTH_LIMIT = 0
DEPTH_PRIORITY = 0
DNSCACHE_ENABLED = True

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
#SCHEDULER = 'scrapy.core.scheduler.Scheduler'

# AutoThrottle extension
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3.0
AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD = 10  # How many responses should pass to perform concurrency adjustments.
# XXX: Scrapy's item pipelines are ordered! An item goes through all the
# pipelines in the order given, so if a pipeline changes the item and
# returns it, the new item is passed on to the next pipeline.
# XXX: notice:
#   if you want to use shard mongodb, you need MongodbWoaiduBookFile and ShardMongodbPipeline;
#   if you want to use single mongodb, you need WoaiduBookFile and SingleMongodbPipeline.
#ITEM_PIPELINES = ['woaidu_crawler.pipelines.cover_image.WoaiduCoverImage',
#                  'woaidu_crawler.pipelines.bookfile.WoaiduBookFile',
#                  'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile',
#                  'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile',
#                  'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline',
#                  'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline',
#                  'woaidu_crawler.pipelines.final_test.FinalTestPipeline',]

#ITEM_PIPELINES = ['woaidu_crawler.pipelines.WoaiduBookFile',]

#ITEM_PIPELINES = {'woaidu_crawler.pipelines.cover_image.WoaiduCoverImage': 300,
#                  'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile': 400,
#                  'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile': 500,
#                  'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline': 600,
#                  'woaidu_crawler.pipelines.final_test.FinalTestPipeline': 700,}

ITEM_PIPELINES = {'woaidu_crawler.pipelines.cover_image.WoaiduCoverImage': 300,
                  'woaidu_crawler.pipelines.bookfile.WoaiduBookFile': 400,
                  'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile': 500,
                  'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline': 600,
                  'woaidu_crawler.pipelines.final_test.FinalTestPipeline': 700,}
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/book_covor_image')
IMAGES_EXPIRES = 30
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 0
IMAGES_MIN_WIDTH = 0
COOKIES_ENABLED = False
USER_AGENT = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'
DOWNLOADER_MIDDLEWARES = {
'woaidu_crawler.contrib.downloadmiddleware.google_cache.GoogleCacheMiddleware':50,
}
GOOGLE_CACHE_DOMAINS = ['www.woaidu.org',]
# To make RotateUserAgentMiddleware enabled:
#USER_AGENT = ''
FILE_EXPIRES = 30
BOOK_FILE_EXPIRES = 30
FILE_STORE = os.path.join(PROJECT_DIR, 'media/files')
BOOK_FILE_STORE = os.path.join(PROJECT_DIR, 'media/book_files')
# For more MIME types for files, you can visit:
# http://mimeapplication.net/
BOOK_FILE_CONTENT_TYPE = ['application/file', 'application/zip', 'application/octet-stream',
                          'application/x-zip-compressed', 'application/x-octet-stream',
                          'application/gzip', 'application/pdf', 'application/ogg',
                          'application/vnd.oasis.opendocument.text',
                          'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                          'application/x-dvi', 'application/x-rar-compressed', 'application/x-tar',
                          'multipart/x-zip', 'application/x-zip', 'application/x-winzip',
                          'application/x-compress', 'application/x-compressed', 'application/x-gzip',
                          'zz-application/zz-winassoc-arj', 'application/x-stuffit',
                          'application/arj', 'application/x-arj', 'multipart/x-tar', 'text/plain',]
URL_GBK_DOMAIN = ['www.paofuu.com', 'down.wmtxt.com', 'www.txt163.com', 'down.txt163.com',
                  'down.sjtxt.com:8199', 'file.txtbook.com.cn', 'www.yyytxt.com',
                  'www.27xs.org', 'down.dusuu.com:8199', 'down.txtqb.cn']
ATTACHMENT_FILENAME_UTF8_DOMAIN = []
FILE_EXTENTION = ['.doc','.txt','.docx','.rar','.zip','.pdf']
Drop_NoneBookFile = True
LOG_FILE = "logs/scrapy.log"
STATS_CLASS = 'woaidu_crawler.statscol.graphite.RedisGraphiteStatsCollector'
GRAPHITE_HOST = '127.0.0.1'
GRAPHITE_PORT = 2003
GRAPHITE_IGNOREKEYS = []

SingleMONGODB_SERVER = "localhost"
SingleMONGODB_PORT = 27017
SingleMONGODB_DB = "books_fs"

ShardMONGODB_SERVER = "localhost"
ShardMONGODB_PORT = 27017
ShardMONGODB_DB = "books_mongo"
GridFs_Collection = "book_file"

SCHEDULER = "woaidu_crawler.scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False
SCHEDULER_QUEUE_CLASS = 'woaidu_crawler.scrapy_redis.queue.SpiderPriorityQueue'
Is the configuration wrong?