elliotgao2 / gain

Web crawling framework based on asyncio.
GNU General Public License v3.0
2.03k stars 207 forks source link

add encoding #36

Closed allphfa closed 6 years ago

allphfa commented 6 years ago

request.py

import asyncio

from .log import logger

# Install uvloop's event loop policy when the optional ``uvloop`` package can
# be imported. An ImportError raised anywhere in this block is ignored, in
# which case asyncio's current event loop policy is left unchanged.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

async def fetch(url, spider, session, semaphore):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address passed to ``session.get``.
        spider: Object supplying request settings. ``spider.headers`` is
            either the headers value itself or a zero-argument callable
            returning it. An optional ``spider.encoding`` attribute names
            the codec used to decode the body; ``'utf-8'`` is used when the
            attribute is absent.
        session: Object whose ``get(url, headers=...)`` returns an async
            context manager yielding the response.
        semaphore: Async context manager held for the whole request
            (e.g. an ``asyncio.Semaphore``).

    Returns:
        The decoded body when the response status is 200 or 201, otherwise
        ``None``. ``None`` is also returned, after logging, when the request
        or the decoding raises an ``Exception``.
    """
    # ``with (await semaphore)`` was removed in Python 3.9; ``async with`` is
    # the supported way to hold an asyncio semaphore.
    async with semaphore:
        try:
            if callable(spider.headers):
                headers = spider.headers()
            else:
                headers = spider.headers
            codec = getattr(spider, 'encoding', 'utf-8')

            async with session.get(url, headers=headers) as response:
                if response.status in (200, 201):
                    return await response.text(encoding=codec)
                logger.error('Error: {} {}'.format(url, response.status))
                return None
        except Exception as exc:
            # Stay best-effort for ordinary failures, but record them, and do
            # not trap BaseException (KeyboardInterrupt, SystemExit, and task
            # cancellation on Python 3.8+).
            logger.error('Error: {} {!r}'.format(url, exc))
            return None

test.py

class MySpider(Spider):
    """Example spider for blog.sciencenet.cn that declares a GBK encoding.

    The ``encoding`` attribute is the one ``fetch`` looks up on the spider to
    choose the codec for decoding response bodies.
    """

    concurrency = 5
    encoding = 'gbk'
    start_url = 'http://blog.sciencenet.cn/home.php?mod=space&uid=40109&do=blog&view=me&from=space&page=1'
    # Raw string so that ``\d`` reaches the regex engine instead of being an
    # invalid string escape sequence.
    parsers = [Parser(r'http://blog.sciencenet.cn/home.php.*?page=\d+', Post)]
allphfa commented 6 years ago

I forgot an important parameter in the code above: the decoding call should also pass `errors='ignore'`:

data = await response.text(encoding=codec,errors='ignore')