我用的是splash 爬取,返回的数据10%-20%和页面不一样. 80%是正常数据
试过爬取第一、二页,重复爬取。不一样的数据位置会变化,一条数据有时会正常爬到,有时不正常。
是网站随机对这返回数据 运算了吗?怎么能拿到原始数据。
或者怎么才能根据获取的不正常数据,通过运算转回对的数据。
找了一天,没找到答案。。也找不出它转码的规律。
求大神帮忙~~~~~
代码如下:
# -*- coding: utf-8 -*-
import json
import datetime
import scrapy
from scrapy import Spider
from nbaOdds.items import NbaOddsItem
from scrapy_splash import SplashRequest
from scrapy.http.headers import Headers
RENDER_HTML_URL = "http://127.0.0.1:8050/render.html"
class NbaOddsAHSpider(Spider):
name = "odds"
allowed_domains = ["oddsportal.com"]
start_urls = []
start_url ="http://www.oddsportal.com/basketball/usa/nba-2014-2015/results/#/page/"
number_pages = 3
# start_urls.append(start_url)
for x in range(1, number_pages):
start_urls.append(start_url + str(x))
#转换队名
def convert_name(self, name):
if name == "Atlanta Hawks":
return "ATL"
elif name == "Boston Celtics":
return "BOS"
elif name == "Brooklyn Nets":
return "BRK"
elif name == "Cleveland Cavaliers":
return "CLE"
elif name == "Chicago Bulls":
return "CHI"
elif name == "Charlotte Hornets":
return "CHO"
elif name == "Dallas Mavericks":
return "DAL"
elif name == "Denver Nuggets":
return "DEN"
elif name == "Detroit Pistons":
return "DET"
elif name == "Golden State Warriors":
return "GSW"
elif name == "Houston Rockets":
return "HOU"
elif name == "Indiana Pacers":
return "IND"
elif name == "Los Angeles Clippers":
return "LAC"
elif name == "Los Angeles Lakers":
return "LAL"
elif name == "New York Knicks":
return "NYK"
elif name == "New Orleans Pelicans":
return "NOP"
elif name == "Memphis Grizzlies":
return "MEM"
elif name == "Miami Heat":
return "MIA"
elif name == "Minnesota Timberwolves":
return "MIN"
elif name == "Milwaukee Bucks":
return "MIL"
elif name == "Oklahoma City Thunder":
return "OKC"
elif name == "Orlando Magic":
return "ORL"
elif name == "Philadelphia 76ers":
return "PHI"
elif name == "Portland Trail Blazers":
return "POR"
elif name == "Phoenix Suns":
return "PHO"
elif name == "Sacramento Kings":
return "SAC"
elif name == "San Antonio Spurs":
return "SAS"
elif name == "Toronto Raptors":
return "TOR"
elif name == "Utah Jazz":
return "UTA"
elif name == "Washington Wizards":
return "WAS"
def start_requests(self):
for url in self.start_urls:
body = json.dumps({"url":url, "wait":5})
headers = Headers({'Content-Type': 'application/json',
#'Content-Type':'text/javascript;charset=UTF-8',
#'Content-Type': 'text/javascript; charset=UTF-8',
#"Host":"www.oddsportal.com",
#"Referer":"http://www.oddsportal.com/basketball/usa/nba-2015-2016/results/",
"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"
})
yield SplashRequest(url, self.parse, args={
'wait': 0.5, 'html': 1, 'timeout': 3600,
}, headers=headers,
)
def parse(self, response):
base_url = "http://www.oddsportal.com"
headers = Headers({'Content-Type': 'application/json',
#'Content-Type': 'text/javascript;charset=UTF-8',
"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"
})
games = response.xpath("//tr[contains(@class, 'deactivate')]")
for game in games:
child_url = base_url + game.xpath("td[contains(@class,'table-participant')]/a/@href").extract()[0] + "#over-under;8"
yield SplashRequest(child_url, self.parse_ah_page, args={
'wait': 0.5, 'html': 1, 'timeout': 3600,
}, headers=headers,
)
def parse_ah_page(self, response):
item = NbaOddsItem()
matchup = response.xpath("//*[@id='col-content']/h1/text()").extract()[0]
matchup_arr = matchup.strip().split('-')
item["home_team_name"] = self.convert_name(matchup_arr[0].strip())
item["away_team_name"] = self.convert_name(matchup_arr[1].strip())
date_n_time = response.xpath("//*[@id='col-content']/p[1]/text()").extract()[0]
tmp = date_n_time.split(',')
date_time = datetime.datetime.strptime(tmp[1].strip() + " " + tmp[2].strip(), "%d %b %Y %H:%M")
date_time += datetime.timedelta(0,0,0,0,0,-5)
date_time = date_time.strftime('%I:%M %p, %b %d, %Y')
item["date"] = date_time
##########################
#获取 整个表
# Asian handicap -7.5 95.2%1.273.80(1)Compare odds
# Asian handicap -6.5 (0)Compare odds
ah_odds = response.xpath("//*[contains(@class, 'table-header-light')]")
ah_odds_list = []
for ah_odd in ah_odds:
#每次只获取一个 左边 分数
#如 Asian handicap -7.5
odds_number_temp = ah_odd.xpath("strong/a/text()").extract()[0]
#分割 就只剩 -7.5
odds_number = float(odds_number_temp.split()[-1])
#每次只获取一个 右边的开这个odds的有几个 Compare odds
# 如 (1)
odds_cnt_temp = ah_odd.xpath("span[contains(@class,'odds-cnt')]/text()").extract()[0]
#print odds_cnt_temp
# 剩下1,去除() 去除第一个和最后一个字符
odds_cnt = int(odds_cnt_temp[1:-1])
odds_over = 0
odds_under = 0
if odds_cnt > 0:
#获取第一个odds,
#如 1.27
# 3.80
odds = ah_odd.xpath("span[contains(@class,'chunk-odd')]/a/text()").extract()
#如果 odds 有数据,0为客odds 1为主odds
if len(odds) > 1:
# 判断是否有错误数据 XX/XX
#---------------下面这段是有时返回和页面不同的数据---------
if "/" in odds[1]:
# 分子
molecule = float(odds[0].split("/")[0])
# 分母
Denominator = float(odds[0].split("/")[1])
odds_under = molecule/Denominator+1
molecule = float(odds[1].split("/")[0])
# 分母
Denominator = float(odds[1].split("/")[1])
odds_over = molecule/Denominator+1
#---------------上面这段是有时返回和页面不同的数据---------
else:
odds_over = float(odds[1])
odds_under = float(odds[0])
ah_odds_list.append((odds_number, odds_cnt, odds_over, odds_under))
print (odds_number, odds_cnt, odds_over, odds_under)
ah_odds_list = sorted(ah_odds_list, reverse=True, key=lambda a: a[1])
ah_odds_list = ah_odds_list[0]
item['over_1'] = ah_odds_list[2]
item['part_1_ou'] = ah_odds_list[0]
item['under_1'] = ah_odds_list[3]
yield item
第一段'over_1'和'under_1'错误的数据,第二段为正常
2017-10-25 21:53:07 [scrapy] DEBUG: Crawled (200) <GET http://www.oddsportal.com/basketball/usa/nba-2014-2015/milwaukee-bucks-chicago-bulls-rPsOwGX8/#over-under;8 via http://127.0.0.1:8050/render.html> (referer: None)
2017-10-25 21:53:07 [stdout] INFO: (47.0, 1, -125.0, 104.0)
2017-10-25 21:53:07 [stdout] INFO: (47.5, 4, -109.0, -110.0)
2017-10-25 21:53:07 [stdout] INFO: (48.0, 5, -105.0, -117.0)
2017-10-25 21:53:07 [stdout] INFO: (48.5, 0, 0, 0)
2017-10-25 21:53:07 [stdout] INFO: ('insert:', 12)
2017-10-25 21:53:07 [scrapy] DEBUG: Scraped from <200 http://www.oddsportal.com/basketball/usa/nba-2014-2015/milwaukee-bucks-chicago-bulls-rPsOwGX8/#over-under;8>
{'away_team_name': 'CHI',
'date': '05:30 PM, Apr 25, 2015',
'home_team_name': 'MIL',
'over_1': -105.0,
'part_1_ou': 48.0,
'under_1': -117.0}
2017-10-25 21:53:11 [scrapy] DEBUG: Crawled (200) <GET http://www.oddsportal.com/basketball/usa/nba-2014-2015/new-orleans-pelicans-golden-state-warriors-Ei2haTbg/#over-under;8 via http://127.0.0.1:8050/render.html> (referer: None)
2017-10-25 21:53:11 [stdout] INFO: (51.5, 1, 1.77, 2.1)
2017-10-25 21:53:11 [stdout] INFO: (52.0, 5, 1.87, 1.97)
2017-10-25 21:53:11 [stdout] INFO: (52.5, 4, 1.9, 1.91)
2017-10-25 21:53:11 [stdout] INFO: (53.0, 0, 0, 0)
2017-10-25 21:53:11 [stdout] INFO: ('insert:', 13)
2017-10-25 21:53:11 [scrapy] DEBUG: Scraped from <200 http://www.oddsportal.com/basketball/usa/nba-2014-2015/new-orleans-pelicans-golden-state-warriors-Ei2haTbg/#over-under;8>
{'away_team_name': 'GSW',
'date': '08:00 PM, Apr 25, 2015',
'home_team_name': 'NOP',
'over_1': 1.87,
'part_1_ou': 52.0,
'under_1': 1.97}
爬虫获取的错误数据,和对应网页的数据.大概有10%+会出现错误数据,其他正常
另一种错误数据格式如 "20/25" ,对应正常数据 "1.8" ,
我在代码中转换用, 20/25+1 转换了
错误的数据我也找不到规律转回正确的。。。