Python scrapy爬出来的text文档始终是空的,反复检查未找到问题?

import scrapy
class SunspiderItem(scrapy.Item):
    """Container for one scraped complaint post."""

    # source page URL
    url = scrapy.Field()
    # post title
    title = scrapy.Field()
    # post body text
    con = scrapy.Field()

import scrapy
from sunspider.items import SunspiderItem
class SunSpider(scrapy.Spider):
    """Crawl the newest-politics listing pages on wzzdg.sun0769.com and
    scrape each detail page into a SunspiderItem.
    """

    name = 'sun'
    allowed_domains = ['wzzdg.sun0769.com']
    url = "http://wzzdg.sun0769.com/political/index/politicsNewest?id=1&page="
    offset = 1
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Extract detail-page links from one listing page and follow them,
        then queue the next listing page (pages 1..3)."""
        links = response.xpath('//span[@class="state3"]/a/@href').extract()
        for link in links:
            dataurl = "http://wzzdg.sun0769.com" + link
            yield scrapy.Request(dataurl, callback=self.parse_item)
        # Paginate: also crawl pages 2 and 3.
        if self.offset <= 2:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        """Scrape one detail page into an item and yield it.

        BUG FIX: the original built the item but never yielded it, so no
        item ever reached the pipeline and the output file stayed empty.
        """
        item = SunspiderItem()
        item['url'] = response.url
        titles = response.xpath('//p[@class="focus-details"]/text()').extract()
        # Guard against pages where the title xpath matches nothing
        # (the original extract()[0] would raise IndexError and kill the item).
        item['title'] = titles[0] if titles else ''
        item['con'] = "".join(
            response.xpath('//div[@class="details-box"]/pre/text()').extract()
        )
        yield item  # without this, the pipeline receives nothing

class SunspiderPipeline(object):
    """Append every scraped item to 'sun.text', one item per line."""

    def __init__(self):
        # Open with an explicit utf-8 encoding: the scraped text is Chinese,
        # and relying on the platform default (e.g. GBK on Windows) can raise
        # UnicodeEncodeError on write — another way the file ends up empty.
        self.file = open('sun.text', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item and append it to the output file."""
        print("------------------------------------------------------")
        self.file.write(str(item) + "\n")
        return item

    def close_spider(self, spider):
        """Release the file handle when the spider finishes."""
        self.file.close()

运行后显示结果:
2022-05-22 16:07:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://wzzdg.sun0769.com/political/politics/index?id=566958> (referer: http://wzzdg.sun0769.com/poli...)
2022-05-22 16:07:16 [scrapy.core.engine] INFO: Closing spider (finished)
2022-05-22 16:07:16 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 19649,
'downloader/request_count': 49,
'downloader/request_method_count/GET': 49,
'downloader/response_bytes': 631990,
'downloader/response_count': 49,
'downloader/response_status_count/200': 48,
'downloader/response_status_count/502': 1,
'elapsed_time_seconds': 1.366505,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 5, 22, 8, 7, 16, 989949),
'httpcompression/response_bytes': 2623212,
'httpcompression/response_count': 48,
'log_count/DEBUG': 49,
'log_count/INFO': 10,
'request_depth_max': 3,
'response_received_count': 48,
'retry/count': 1,
'retry/reason_count/502 Bad Gateway': 1,
'scheduler/dequeued': 49,
'scheduler/dequeued/memory': 49,
'scheduler/enqueued': 49,
'scheduler/enqueued/memory': 49,
'start_time': datetime.datetime(2022, 5, 22, 8, 7, 15, 623444)}

阅读 2.7k
2 个回答

这种情况一般是被限制访问。比如设置 cookie、限制访问频率等。
你这种完全抓取不到信息的,基本可以认定为:user-agent 限制。
(补充:从你贴出的日志看,48 个响应状态码都是 200,说明页面其实已经成功抓取;更直接的原因是 parse_item 里构造了 item 却没有 `yield item`,pipeline 收不到任何数据,所以文件始终为空。另外建议 open 文件时显式指定 encoding='utf-8',避免在 Windows 默认编码下写中文报错。)

# settings.py 文件

# Chrome版
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"

已参与了 SegmentFault 思否社区 10 周年「问答」打卡 ,欢迎正在阅读的你也加入。

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进