Attached is the source of the spider file.
import json

import scrapy
from openhub.items import OpenhubItem


class ProjectSpider(scrapy.Spider):
    name = 'project'
    # allowed_domains = []
    start_urls = ['https://www.openhub.net/p?page=1&query=']
    url = 'https://www.openhub.net/p?page={}&query='
    page = 1
    def parse(self, response):
        div_list = response.xpath("//div[@class='well searchable']")
        for index, odiv in enumerate(div_list):
            # create an item for this listing entry
            item = OpenhubItem()
            # extract the listing-page fields
            post = odiv.xpath(".//div/a[@class='pull-left logo']/img/@src").extract_first()
            # 10 projects per listing page, so derive the sequential id
            # from the page number and the position on the page
            item['pid'] = (self.page - 1) * 10 + index + 1
            item['post'] = post if post else ''
            licenses = odiv.xpath(".//div[@class='licenses pull-right']").xpath('string(.)').extract()
            # strip('Licenses:') would remove any of those characters from both
            # ends (mangling names such as 'Apache License'), so remove the
            # literal label instead
            lic = ','.join(licenses).replace('\n', '').replace('Licenses:', '').strip()
            item['licenses'] = lic if lic else 'No declared licenses'
            href = odiv.xpath(".//div/a[@class='pull-left logo']/@href").extract_first()
            item['openhub_url'] = 'https://www.openhub.net' + href
            time = odiv.xpath(".//div[@id='inner_content']/div/p/i/abbr/@title").extract_first()
            item['time'] = time if time else ''
            detail_url = item['openhub_url']
            # dedupe: only re-crawl a project when it is new or its
            # last-updated timestamp changed since the previous run
            is_existed = False
            need_update = False
            try:
                with open('time222.txt', 'r', encoding='utf-8') as f:
                    for lines in f:
                        line = json.loads(lines.strip('\n'))
                        if item['openhub_url'] == line.get('openhub_url'):
                            is_existed = True
                            if item['time'] != line.get('time'):
                                need_update = True
            except FileNotFoundError:
                # first run: no history file yet, crawl everything
                pass
            if not is_existed or need_update:
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
        # schedule the next listing page until the last known page
        if self.page <= 47517:
            self.page += 1
            url = self.url.format(self.page)
            yield scrapy.Request(url=url, callback=self.parse)
    # parse the project detail page
    def parse_detail(self, response):
        # retrieve the item passed through response.meta
        item = response.meta['item']
        item['title'] = response.xpath('//div[@id="project_header"]/div[1]/h1/a/text()').extract_first(default='')
        summary = response.xpath("//section[@id='project_summary']").xpath('string(.)').extract_first(default='')
        item['summary'] = summary.strip(' \n').replace('\n', '')
        score = response.xpath('//*[@id="i_use_this_container"]/div/a/text()').extract_first()
        item['score'] = score if score else ''
        tags = response.xpath("//p[@class='tags']").xpath('string(.)').extract_first(default='')
        item['tags'] = tags.strip('\n')
        info = response.xpath("//div[@class='well']/ul").xpath('string(.)').extract_first(default='')
        item['info'] = info.strip('\n').replace('\n', '')
        organiz_text = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-7'][1]/a/text()").extract_first()
        item['organiz_text'] = organiz_text if organiz_text else ''
        organiz_url = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-7'][1]/a/@href").extract_first()
        item['organiz_url'] = 'https://www.openhub.net' + organiz_url if organiz_url else ''
        language = response.xpath("//table[@class='table table-striped unstyled pull-left language_table']//tr").xpath('string(.)').extract()
        item['language'] = ' '.join(language).replace('\n', '') if language else ''
        user_rate = response.xpath('//div[@id="community_rating"]/div[1]/div[1]/text()').extract_first()
        if user_rate:
            # remove the literal label; strip() with that string would also
            # eat characters from the number itself
            item['user_rate'] = user_rate.replace('users rate this project:', '').strip()
        else:
            item['user_rate'] = ''
        rating = response.xpath('//div[@id="community_rating"]/div[1]/div[2]/div/div/div/text()').extract_first()
        item['rating'] = rating.strip('\n') if rating else ''
        # follow-up pages: the "Links" section and the code locations page
        link = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-5 text-right text-right']/a/@href").extract_first()
        a_list = response.xpath("//div[@class='well']/div[@class='col-xs-12']/div[@class='col-xs-5 text-right']/a/@href").extract_first()
        if a_list:
            code_link = 'https://www.openhub.net' + a_list
            # pass a copy: both follow-up callbacks mutate and yield the item,
            # so sharing one object would let the two overwrite each other
            yield scrapy.Request(url=code_link, callback=self.parse_detail_two2, meta={'item': item.copy()})
        if link:
            link_url = 'https://www.openhub.net' + link
            yield scrapy.Request(url=link_url, callback=self.parse_detail_two, meta={'item': item})
    # parse the code locations (enlistments) page
    def parse_detail_two2(self, response):
        item = response.meta['item']
        # a == 0 marks records carrying one code location each
        item['a'] = 0
        tr_list = response.xpath("//table[@class='table table-striped']/tbody/tr")
        for otr in tr_list:
            item['code_url'] = (otr.xpath(".//td[@class='col-md-4']/text()").extract_first() or '').strip('\n')
            item['scmtype'] = (otr.xpath(".//td[@class='col-md-2']/text()").extract_first() or '').strip('\n')
            item['update_status'] = (otr.xpath(".//td[@class='col-md-3']/text()").extract_first() or '').strip('\n')
            yield item
    # parse the second-level detail page (the "Links" section)
    def parse_detail_two(self, response):
        item = response.meta['item']
        well_list = response.xpath("//div[@class='well']")
        for owell in well_list:
            # relative XPath keeps the query scoped to this well block
            if owell.xpath(".//div[@class='col-md-3 margin_bottom_25']"):
                link_type = owell.xpath(".//h3[@class='strong pull-left margin_bottom_15 margin_top_0']/text()").extract()
                name = owell.xpath(".//div[@class='col-md-3 margin_bottom_25']/text()").extract()
                url = owell.xpath(".//div[@class='col-md-6']/h5/a/@href").extract()
                item['type'] = ''.join(link_type).strip('\n')
                item['name'] = ''.join(name).strip('\n')
                item['url'] = ''.join(url).strip('\n')
                # a == 1 marks records carrying one links section each
                item['a'] = 1
                yield item
        # a == 2 marks the final record for this project
        item['a'] = 2
        yield item
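One note on the dedupe block in parse(): time222.txt is reopened and re-parsed for every listing entry, so each page costs ten full scans of the history file. Below is a minimal sketch of loading the history once at startup instead, assuming the same one-JSON-object-per-line format; the seen_times attribute is a name introduced here for illustration, not part of the original spider.

import json

import scrapy


class ProjectSpider(scrapy.Spider):
    name = 'project'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # hypothetical attribute for this sketch:
        # maps openhub_url -> the time recorded on the previous run
        self.seen_times = {}
        try:
            with open('time222.txt', 'r', encoding='utf-8') as f:
                for line in f:
                    record = json.loads(line)
                    self.seen_times[record.get('openhub_url')] = record.get('time')
        except FileNotFoundError:
            pass  # first run: no history yet, crawl everything

With that in place, the per-item file scan in parse() collapses to a dictionary lookup: fetch known = self.seen_times.get(item['openhub_url']) and schedule the detail request only when known is None or known != item['time'].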
Could you post the statistics from the end of the finished run? That would show whether the site is blocking you.
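For reference, Scrapy prints a "Dumping Scrapy stats" block at the end of every crawl log, and the same numbers can be logged explicitly from the spider via the built-in stats collector. A minimal sketch of a method to drop into ProjectSpider (closed() is a standard Scrapy hook called when the spider finishes):

    def closed(self, reason):
        # the same statistics Scrapy dumps at the end of the log:
        # request/response counts, HTTP status codes, retries, item counts
        stats = self.crawler.stats.get_stats()
        self.logger.info('Spider closed (%s), final stats: %s', reason, stats)

If the downloader/response_status_count entries show many 403 or 429 responses, the site is likely throttling the crawler; raising DOWNLOAD_DELAY or setting AUTOTHROTTLE_ENABLED = True in settings.py are the usual first mitigations.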