共被编辑 3 次

版本 更新时间 贡献者 编辑原因 操作
#r3 2017年08月04日 疯狂王子10 更新问题 查看

pyspider超时 HTTP 599: Operation timed out after .....

图片描述

Retry了几次之后,系统感觉就不再动了,好像任务停止了一样
图片描述
图片描述

源码如下:
图片描述

class Handler(BaseHandler):
    """pyspider crawler for hangkong.citytt.com airline listing pages.

    on_start seeds the listing page once a day; index_page fans out to each
    airline detail link; detail_page scrapes name / IATA code / website.
    """

    crawl_config = {
    }

    def __init__(self):
        # Deal is a project-local persistence helper (defined elsewhere
        # in this file) — presumably writes results to disk; verify.
        self.deal = Deal()

    @every(minutes=24 * 60)  # re-run the seed once every 24 hours
    def on_start(self):
        self.crawl('http://hangkong.citytt.com/hk-1/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)  # treat a fetched index as fresh for 10 days
    def index_page(self, response):
        # Follow every absolute link inside the listing container.
        for each in response.doc('.list_centaa a[href^="http"]').items():
            # Generous timeouts (seconds) because detail pages were
            # timing out with HTTP 599 mid-transfer.
            self.crawl(each.attr.href, callback=self.detail_page,
                       connect_timeout=50, timeout=200,
                       save={'name': each.text()})

    @config(priority=2)
    def detail_page(self, response):
        html = response.doc('div.center_rt').html()
        # Non-greedy, DOTALL match of the three <br>-separated fields.
        pattern = re.compile(u'航空公司:(.*?)<br.*?IATA代码:(.*?)<br.*?官方网站:(.*?)<br', re.S)
        items = re.findall(pattern, html)
        result = ''
        for item in items:
            print(item)
            # BUG FIX: accumulate with += — the original used plain
            # assignment, so only the LAST match survived the loop.
            result += "[%s,%s,%s],\n" % (item[0], item[1], item[2])

        print(result)
        #self.deal.saveBrief(result, DIR_PATH, "aaa")
        return {
            "url": result,
            "name": response.save['name'],
        }
#r2 2017年08月04日 疯狂王子10 更新问题 查看

pyspider超时 HTTP 599: Operation timed out after .....

图片描述

Retry了几次之后,系统感觉就不再动了,好像任务停止了一样
图片描述
图片描述

源码如下:
图片描述

#r1 2017年08月04日 疯狂王子10 创建问题 查看

pyspider超时 HTTP 599: Operation timed out after .....

taskid
c54f7587c4e70eaa421a6d6a43eb66a6
lastcrawltime
1501813934.95 (25 minutes ago)
updatetime
1501813934.95 (25 minutes ago)
exetime
1501817534.95 (Aug 4, 2017 at 3:32)
track.fetch 200010.0ms
{
"content": "",
"encoding": null,
"error": "HTTP 599: Operation timed out after 199984 milliseconds with 712274 out of 1387088 bytes received",
"headers": {},
"ok": false,
"redirect_url": null,
"status_code": 599,
"time": 200.00999999046326
}
track.process 0.0ms
HTTP 599: Operation timed out after 199984 milliseconds with 712274 out of 1387088 bytes received
_handler.py", line 196, in run_task

    result = self._run_task(task, response)
  File "c:\python27\lib\site-packages\pyspider\libs\base_handler.py", line 175, in _run_task
    response.raise_for_status()
  File "c:\python27\lib\site-packages\pyspider\libs\response.py", line 172, in raise_for_status
    six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())
  File "c:\python27\lib\site-packages\pyspider\fetcher\tornado_fetcher.py", line 378, in http_fetch
    response = yield gen.maybe_future(self.http_client.fetch(request))
  File "c:\python27\lib\site-packages\tornado\gen.py", line 1055, in run
    value = future.result()
  File "c:\python27\lib\site-packages\tornado\concurrent.py", line 238, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 3, in raise_exc_info
Exception: HTTP 599: Operation timed out after 199984 milliseconds with 712274 out of 1387088 bytes received

{
"exception": "HTTP 599: Operation timed out after 199984 milliseconds with 712274 out of 1387088 bytes received",
"follows": 0,
"logs": "_handler.py", line 196, in run_taskn result = self._run_task(task, response)n File "c:\python27\lib\site-packages\pyspider\libs\base_handler.py", line 175, in _run_taskn response.raise_for_status()n File "c:\python27\lib\site-packages\pyspider\libs\response.py", line 172, in raise_for_statusn six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())n File "c:\python27\lib\site-packages\pyspider\fetcher\tornado_fetcher.py", line 378, in http_fetchn response = yield gen.maybe_future(self.http_client.fetch(request))n File "c:\python27\lib\site-packages\tornado\gen.py", line 1055, in runn value = future.result()n File "c:\python27\lib\site-packages\tornado\concurrent.py", line 238, in resultn raise_exc_info(self._exc_info)n File "<string>", line 3, in raise_exc_infon Exception: HTTP 599: Operation timed out after 199984 milliseconds with 712274 out of 1387088 bytes receivedn",
"ok": false,
"result": null,
"time": 0.0
}
schedule
{
"exetime": 1501817534.954,
"priority": 2,
"retried": 2
}
process
{
"callback": "detail_page"
}
fetch
{
"connect_timeout": 50,
"save": {

"name": "春秋航空公司"

},
"timeout": 200
}

图片描述

后面感觉就不再动了图片描述

代码如下

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# Created on 2017-08-03 15:12:21

# Project: aircompany

from pyspider.libs.base_handler import *
import re

DIR_PATH = 'd:/aircompany'
class Handler(BaseHandler):
    """pyspider crawler for hangkong.citytt.com airline listing pages.

    on_start seeds the listing page once a day; index_page fans out to each
    airline detail link; detail_page scrapes name / IATA code / website and
    appends the result to disk via Deal.saveBrief.
    """

    crawl_config = {
    }

    def __init__(self):
        # Deal (defined below in this file) persists scraped text to disk.
        self.deal = Deal()

    @every(minutes=24 * 60)  # re-run the seed once every 24 hours
    def on_start(self):
        self.crawl('http://hangkong.citytt.com/hk-1/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)  # treat a fetched index as fresh for 10 days
    def index_page(self, response):
        # Follow every absolute link inside the listing container.
        for each in response.doc('.list_centaa a[href^="http"]').items():
            # Generous timeouts (seconds) because detail pages were
            # timing out with HTTP 599 mid-transfer.
            self.crawl(each.attr.href, callback=self.detail_page,
                       connect_timeout=50, timeout=200,
                       save={'name': each.text()})

    @config(priority=2)
    def detail_page(self, response):
        html = response.doc('div.center_rt').html()
        # Non-greedy, DOTALL match of the three <br>-separated fields.
        pattern = re.compile(u'航空公司:(.*?)<br.*?IATA代码:(.*?)<br.*?官方网站:(.*?)<br', re.S)
        items = re.findall(pattern, html)
        result = ''
        for item in items:
            print(item)
            # BUG FIX: accumulate with += — the original used plain
            # assignment, so only the LAST match survived the loop.
            result += "[%s,%s,%s],\n" % (item[0], item[1], item[2])

        print(result)
        self.deal.saveBrief(result, DIR_PATH, "aaa")
        return {
            "url": result,
            "name": response.save['name'],
        }

import os

class Deal:
    """Filesystem helper that persists scraped text under DIR_PATH."""

    def __init__(self):
        # Normalize the configured directory to a trailing slash and
        # make sure it exists before any writes happen.
        self.path = DIR_PATH
        if not self.path.endswith('/'):
            self.path = self.path + '/'
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def saveBrief(self, content, dir_path, name):
        """Append text *content* to <dir_path>/<name>.txt encoded as UTF-8.

        Binary append mode is used because we write the already-encoded
        bytes; this is correct on both Python 2 and 3 (text mode + bytes
        raises TypeError on Python 3). The context manager guarantees the
        handle is closed even if the write raises — the original leaked
        the handle on error.
        """
        file_name = dir_path + "/" + name + ".txt"
        with open(file_name, "ab") as f:
            f.write(content.encode('utf-8'))