# items.py
import scrapy

class SunspiderItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    con = scrapy.Field()  # complaint body text
# spiders/sun.py
import scrapy
from sunspider.items import SunspiderItem

class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wzzdg.sun0769.com']
    url = "http://wzzdg.sun0769.com/political/index/politicsNewest?id=1&page="
    offset = 1
    start_urls = [url + str(offset)]

    def parse(self, response):
        # Collect detail-page links from the list page
        links = response.xpath('//span[@class="state3"]/a/@href').extract()
        for link in links:
            dataurl = "http://wzzdg.sun0769.com" + link
            yield scrapy.Request(dataurl, callback=self.parse_item)
        # Follow the next two list pages
        if self.offset <= 2:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        item = SunspiderItem()
        item['url'] = response.url
        # extract()[0] raises IndexError if the page carries no data
        item['title'] = response.xpath('//p[@class="focus-details"]/text()').extract()[0]
        item['con'] = "".join(response.xpath('//div[@class="details-box"]/pre/text()').extract())
        yield item  # without this, no item ever reaches the pipeline
# pipelines.py
class SunspiderPipeline(object):
    def __init__(self):
        # utf-8 so Chinese text is written correctly regardless of platform default
        self.file = open('sun.text', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        print("------------------------------------------------------")
        content = str(item) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
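Note that the pipeline only runs if it is registered in the project settings. A minimal sketch, assuming the project package is named sunspider as the import above suggests:

# settings.py (sketch -- package name assumed from "from sunspider.items import ...")
ITEM_PIPELINES = {
    'sunspider.pipelines.SunspiderPipeline': 300,  # 300 is the pipeline's priority
}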
Output after running:
2022-05-22 16:07:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://wzzdg.sun0769.com/political/politics/index?id=566958> (referer: http://wzzdg.sun0769.com/poli...)
2022-05-22 16:07:16 [scrapy.core.engine] INFO: Closing spider (finished)
2022-05-22 16:07:16 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 19649,
'downloader/request_count': 49,
'downloader/request_method_count/GET': 49,
'downloader/response_bytes': 631990,
'downloader/response_count': 49,
'downloader/response_status_count/200': 48,
'downloader/response_status_count/502': 1,
'elapsed_time_seconds': 1.366505,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 5, 22, 8, 7, 16, 989949),
'httpcompression/response_bytes': 2623212,
'httpcompression/response_count': 48,
'log_count/DEBUG': 49,
'log_count/INFO': 10,
'request_depth_max': 3,
'response_received_count': 48,
'retry/count': 1,
'retry/reason_count/502 Bad Gateway': 1,
'scheduler/dequeued': 49,
'scheduler/dequeued/memory': 49,
'scheduler/enqueued': 49,
'scheduler/enqueued/memory': 49,
'start_time': datetime.datetime(2022, 5, 22, 8, 7, 15, 623444)}
This kind of result usually means access is being restricted, for example via required cookies or request-rate limits. When the spider extracts no information at all, as here, you can basically conclude it is a user-agent restriction.
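A common first step is to send a browser-like User-Agent and slow the crawl down. A minimal sketch of the relevant settings.py entries (the User-Agent string and delay value are just examples, not values confirmed to work for this site):

# settings.py (sketch -- example values, adjust as needed)
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/100.0.4896.127 Safari/537.36')
ROBOTSTXT_OBEY = False   # the site's robots.txt may block the default Scrapy bot
DOWNLOAD_DELAY = 1       # throttle requests to avoid rate limiting
COOKIES_ENABLED = True   # keep session cookies if the site requires them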