I have a spider that crawls the Chengdu Lianjia site (cd.lianjia.com), but Lianjia rate-limits traffic. I work around that by sending the same cookies and headers as my browser; whenever I hit the limit I solve the captcha by hand and the spider keeps running. The problem is that it keeps hitting URLs it has already crawled, and the data is not being updated either: most of what comes back is duplicate data.
The core code is as follows:
import re
import time

import requests
import scrapy
from bs4 import BeautifulSoup

# LianjiaItem (items.py) and trans (the cookie-string helper) are defined elsewhere in the project.


class LianjiaSpider(scrapy.Spider):
    name = 'lianjiaspider'
    start_urls = 'http://cd.lianjia.com/ershoufang/'
    cookie = trans.stringToDict()  # reuse the browser's cookie string as a dict
    headers = {
        'Host': "cd.lianjia.com",
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
    }

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls, headers=self.headers, method='GET',
                             cookies=self.cookie, callback=self.parse)

    def parse(self, response):
        body = response.body.decode('utf-8')
        soup = BeautifulSoup(body, 'lxml')
        area_div = soup.select('div[data-role="ershoufang"]')
        area_list = area_div[0].find_all('a')
        for area in area_list:
            try:
                area_han = area.string                 # district name
                area_pin = area['href'].split('/')[2]  # pinyin slug
                area_url = 'http://cd.lianjia.com/ershoufang/{}/'.format(area_pin)
                print(area_url)
                yield scrapy.Request(url=area_url, headers=self.headers, cookies=self.cookie,
                                     callback=self.detail_url, meta={"id1": area_han, "id2": area_pin})
            except Exception:
                pass

    def get_detail_info(self, item, url):
        # Follow each listing link and pull the coordinates and transaction info
        contents = requests.get(url, headers=self.headers, cookies=self.cookie)
        body = contents.content.decode('utf-8')
        soup = BeautifulSoup(body, 'lxml')
        transaction_div = soup.find('div', 'transaction')
        transaction_lis = transaction_div.find_all('li')
        item['last_buy_time'] = transaction_lis[2].text[4:]
        item['publish_time'] = transaction_lis[0].text[4:]
        regex = r'resblockPosition(.+)'
        items = re.search(regex, body)
        content = items.group()[:-1]                   # longitude/latitude
        longitude_latitude = content.split(':')[1]
        item['location'] = longitude_latitude[1:-1]
        id_regex = r'houseId(.+)'
        ids = re.search(id_regex, body)
        house_id_str = ids.group()[:-1]                # house id
        house_id = house_id_str.split(':')[1]
        item['house_id'] = house_id[1:-1]

    def detail_url(self, response):
        for i in range(1, 101):
            # page i of this district's listings
            url = 'http://cd.lianjia.com/ershoufang/{}/pg{}/'.format(response.meta["id2"], str(i))
            time.sleep(2)
            try:
                print('Crawling: {}'.format(url))
                contents = requests.get(url, headers=self.headers, cookies=self.cookie)
                body = contents.content.decode('utf-8')
                soup = BeautifulSoup(body, 'lxml')
                house_ul = soup.find('ul', 'sellListContent')
                houselist = house_ul.find_all('li')
                for house in houselist:
                    try:
                        item = LianjiaItem()
                        item['title'] = house.find('div', 'title').a.string
                        item['community'] = house.find('div', 'houseInfo').text.split('|')[0]
                        item['model'] = house.find('div', 'houseInfo').text.split('|')[1]
                        area_str = house.find('div', 'houseInfo').text.split('|')[2]
                        area_match = re.findall(r'\d+', area_str)
                        if len(area_match) == 2:
                            item['area'] = float(area_match[0] + '.' + area_match[1])
                        else:
                            item['area'] = float(area_match[0])
                        focus_num_str = house.find('div', 'followInfo').text.split('/')[0]
                        focus_num_match = re.findall(r'\d+', focus_num_str)
                        item['focus_num'] = focus_num_match[0]
                        watch_num_str = house.find('div', 'followInfo').text.split('/')[1]
                        watch_num_match = re.findall(r'\d+', watch_num_str)
                        item['watch_num'] = watch_num_match[0]
                        item['price'] = float(house.find('div', 'totalPrice').span.string) * 10000
                        average_price_str = house.find('div', 'unitPrice').span.string
                        average_price_match = re.findall(r'\d+', average_price_str)
                        item['average_price'] = average_price_match[0]
                        item['link'] = house.find('div', 'title').a['href']
                        item['city'] = response.meta["id1"]
                        self.get_detail_info(item, item['link'])
                    except Exception as e:
                        print(str(e))
                    yield item
            except Exception:
                pass
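As I understand it, the listing pages and the detail pages are fetched with requests.get inside the callbacks, so those URLs never go through Scrapy's scheduler and its built-in RFPDupeFilter never sees them. Just to frame the question, this is a rough sketch of what yielding them as scrapy.Request instead might look like (parse_page and parse_detail are illustrative method names, they are not in my current code):

    def detail_url(self, response):
        # One request per result page instead of requests.get, so the scheduler
        # can drop page URLs it has already seen.
        for i in range(1, 101):
            url = 'http://cd.lianjia.com/ershoufang/{}/pg{}/'.format(response.meta["id2"], str(i))
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookie,
                                 callback=self.parse_page, meta=response.meta)

    def parse_page(self, response):
        soup = BeautifulSoup(response.body.decode('utf-8'), 'lxml')
        for house in soup.find('ul', 'sellListContent').find_all('li'):
            link = house.find('div', 'title').a['href']
            # Duplicate listing URLs are filtered by the default dupefilter.
            yield scrapy.Request(url=link, headers=self.headers, cookies=self.cookie,
                                 callback=self.parse_detail, meta={'city': response.meta['id1']})

parse_detail would then build and yield the LianjiaItem from the detail page, instead of calling requests.get from inside the list callback.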
The full code is here: https://github.com/BlackKnigh...
I'm not sure whether I should switch to a Redis-backed spider (scrapy-redis), or whether anyone has a better way to deduplicate the URLs or the data?
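For reference, my understanding is that switching to scrapy-redis is mostly a settings change plus a RedisSpider subclass. This is only a sketch based on its documentation, assuming a Redis server on localhost; I have not tried it against this spider, and LianjiaRedisSpider is just a placeholder name:

    # settings.py: keep the request queue and the URL fingerprint set in Redis,
    # so deduplication persists across restarts and across manual captcha pauses.
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER_PERSIST = True              # do not clear the queue/fingerprints on close
    REDIS_URL = 'redis://localhost:6379'

    # spider side: read start URLs from a Redis list instead of start_urls
    from scrapy_redis.spiders import RedisSpider

    class LianjiaRedisSpider(RedisSpider):
        name = 'lianjia_redis'
        redis_key = 'lianjiaspider:start_urls'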
First collect all the links that need to be crawled into a to-crawl list, then remove each one from the list as soon as it has been crawled.
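A minimal sketch of that idea, assuming the finished URLs are kept in a plain text file (the file name and class name are arbitrary): check each listing link against the set before requesting it, and append it to the file once its item has been scraped.

    import os

    class CrawledUrlStore:
        """Remembers which listing URLs are already done, across restarts."""

        def __init__(self, path='crawled_urls.txt'):
            self.path = path
            self.seen = set()
            if os.path.exists(path):
                with open(path, encoding='utf-8') as f:
                    self.seen = {line.strip() for line in f if line.strip()}

        def is_done(self, url):
            return url in self.seen

        def mark_done(self, url):
            # Record the URL in memory and append it to the file immediately,
            # so progress survives a crash or a captcha pause.
            if url not in self.seen:
                self.seen.add(url)
                with open(self.path, 'a', encoding='utf-8') as f:
                    f.write(url + '\n')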