Spider 部分代码如下:
class JobboleSpider(scrapy.Spider):
    """Crawl blog.jobbole.com list pages and follow each post's detail page."""
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Read the max page number from page 1, then request the other pages.

        问题一 fix: page 1 IS this very response. Requesting
        .../all-posts/page/1/ just redirects back to /all-posts/, which is
        the redirect you saw. So: extract the posts of page 1 directly from
        `response`, and only build URLs for pages 2..max_page.
        """
        try:
            # a.page-numbers texts hold the pagination labels; index 2 is
            # assumed to be the last page number — TODO confirm on the live page.
            max_page = int(response.css("a.page-numbers::text").extract()[2])
        except (IndexError, ValueError):
            # Original code only printed and fell through, leaving max_page
            # undefined (NameError below). Fall back to a single page.
            print("max_page_error")
            max_page = 1
        # Page 1: parse the response we already have instead of re-requesting it.
        for request in self.href_parse(response):
            yield request
        # Pages 2..max_page: build each list-page URL.
        for pg in range(2, max_page + 1):
            page_url = "http://blog.jobbole.com/all-posts/page/" + str(pg) + "/"
            # Typo fix: callback was self.href_parsen, which does not exist.
            yield Request(url=page_url, callback=self.href_parse)

    def href_parse(self, response):
        """Yield one detail-page Request per post on a list page."""
        sub_node = response.css("div.post.floated-thumb > div.post-thumb")
        for sub in sub_node:
            image_url = sub.css("img::attr(src)").extract_first()
            post_href = sub.css("a::attr(href)").extract_first()
            if not post_href:
                # Skip malformed entries instead of requesting url=None.
                continue
            # Bug fix: `return` stopped after the FIRST post of each page;
            # `yield` emits a Request for every post.
            yield Request(url=post_href,
                          meta={"image_url": parse.urljoin(response.url,
                                                           image_url or "")},
                          callback=self.detail_parse)

    def detail_parse(self, response):
        """Build the item from a post detail page."""
        item = JobBboleArticleItem()
        item["images_url"] = [response.meta.get("image_url", "")]
        # Bug fix: the item was built but never emitted to the pipelines.
        yield item
Pipelines 部分代码如下:
class JobboleJsonPipeline(object):
    """Hand-rolled pipeline that writes each item as one JSON line
    to jobbole1.json."""

    def __init__(self):
        # codecs.open so non-ASCII (Chinese) text is written as UTF-8.
        self.file = codecs.open("jobbole1.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass the item on."""
        # Local variable instead of self.row — the row is not shared state.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        Bug fix: Scrapy calls `close_spider` on pipelines automatically.
        The original `spider_closed(self, spdier)` is only invoked if you
        connect it to the spider_closed *signal*, so the file was never
        closed — which is also why PyCharm/Windows may complain the file
        is still in use when you try to delete it (问题2).
        """
        self.file.close()

    # Backward-compatible alias for any code that called the old name.
    spider_closed = close_spider
class JobboleJsonExporterPinpeline(object):
    """Pipeline that delegates JSON serialization to scrapy's
    JsonItemExporter (writes jobbole2.json)."""

    def __init__(self):
        # The exporter wants a binary file handle; it encodes by itself.
        out = open("jobbole2.json", "wb")
        exporter = JsonItemExporter(out, encoding="utf-8", ensure_ascii=False)
        exporter.start_exporting()  # opens the JSON array in the file
        self.file = out
        self.expoter = exporter

    def process_item(self, item, spider):
        """Hand each item to the exporter, then return it unchanged."""
        self.expoter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the JSON array and release the file handle."""
        self.expoter.finish_exporting()
        self.file.close()
问题2:在 PyCharm 中删除刚才导出的 JSON 文件时,会弹出一个提示窗口(原帖截图缺失;通常是文件仍被进程占用的提示)。
望求解!