# -*- coding: utf-8 -*-
import scrapy
from pm25.items import Pm25Item
import re
class InfospSpider(scrapy.Spider):
    """Spider for the one-day air-quality ranking page of pm25.com.

    ``parse`` yields one ``Pm25Item`` per ranking row.  Yielding (rather than
    collecting the items in a list and returning it) lets Scrapy hand each
    item to the item pipelines as soon as it is produced; both forms are
    accepted by Scrapy, the generator form just avoids the temporary list.
    """

    name = "infosp"
    allowed_domains = ["pm25.com"]
    start_urls = ['http://www.pm25.com/rank/1day.html', ]

    # Compiled once for the class; raw string so ``\d`` is not treated as a
    # (deprecated) string escape.
    _DATE_RE = re.compile(r"\d+-\d+-\d+")

    # XPath of the element holding the ranking date, relative to the page.
    _DATE_XPATH = "/html/body/div[4]/div/div/div[2]/span"

    # XPath selecting every ranking row (one ``<li>`` per row).
    _ROWS_XPATH = "/html/body/div[5]/div/div[3]/ul[2]/li"

    # (item field, XPath relative to a ranking row) for every per-row value.
    _ROW_XPATHS = (
        ("rank", "span[1]/text()"),
        ("quality", "span/em/text()"),
        ("city", "a/text()"),
        ("province", "span[3]/text()"),
        ("aqi", "span[4]/text()"),
        ("pm25", "span[5]/text()"),
    )

    def parse(self, response):
        """Extract the ranking rows from *response*.

        Args:
            response: the downloaded ranking page.

        Yields:
            Pm25Item: a fresh item per complete row, carrying the page-wide
            ``date`` plus the row's rank, quality, city, province, AQI and
            PM2.5 values as scraped text.
        """
        # The date appears once on the page and is shared by all rows.
        date_html = response.xpath(self._DATE_XPATH).extract_first()
        date_match = self._DATE_RE.search(date_html or "")
        if date_match is None:
            # Layout changed or page incomplete: report it instead of dying
            # with an IndexError.
            self.logger.warning("No ranking date found on %s", response.url)
            return
        date = date_match.group()

        for row in response.xpath(self._ROWS_XPATH):
            values = {
                field: row.xpath(xpath).extract_first()
                for field, xpath in self._ROW_XPATHS
            }
            # Rows missing any cell (e.g. a header <li>) are skipped rather
            # than emitted with stale or absent values.
            if None in values.values():
                continue
            # A new item per row: re-yielding one shared, mutated instance
            # would make every consumer holding a reference see only the
            # last row's data.
            yield Pm25Item(date=date, **values)
import time
class Pm25Pipeline(object):
    """Item pipeline that appends each item to a per-day text file.

    Scrapy calls ``process_item`` once for every single item the spider
    yields, so *item* is one dict-like record, not a collection of records.
    """

    # Column order of every output line.
    _COLUMNS = ("date", "rank", "quality", "province", "city", "aqi", "pm25")

    def process_item(self, item, spider):
        """Append *item* as one tab-separated line to ``<yymmdd>.txt``.

        The file name is derived from the local date at the time of the call
        and the file is created in the current working directory.

        Args:
            item: dict-like record holding a text value for every name in
                ``_COLUMNS``.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged *item*, so that later pipelines can process it.

        Raises:
            KeyError: if *item* lacks one of the expected fields.
        """
        # Local import keeps this block self-contained; io.open accepts
        # ``encoding`` on both Python 2 and Python 3.
        import io

        fname = time.strftime("%y%m%d", time.localtime()) + ".txt"
        line = u"\t".join(item[column] for column in self._COLUMNS) + u"\n"
        # Explicit UTF-8: the scraped values contain non-ASCII text and the
        # platform default encoding may not be able to represent it.
        with io.open(fname, "a", encoding="utf-8") as f:
            f.write(line)
        return item
import scrapy
class Pm25Item(scrapy.Item):
    """One row of the air-quality ranking, with every value kept as text."""

    # Date the ranking applies to, in digits-dash-digits-dash-digits form.
    date = scrapy.Field()
    # Position of the city in the ranking.
    rank = scrapy.Field()
    # Air-quality grade label shown for the city.
    quality = scrapy.Field()
    # Province the city belongs to.
    province = scrapy.Field()
    # City name.
    city = scrapy.Field()
    # AQI value shown in the ranking row.
    aqi = scrapy.Field()
    # PM2.5 value shown in the ranking row.
    pm25 = scrapy.Field()
