问题描述
我用scrapy爬取链家新房数据,由于有多个城市,所以start_urls中会有90多个城市的url,然后爬取每个城市的新房数据,结果发现数据不对,在深圳的数据库中出现了上海的房产数据,想问下可能是什么原因造成的
项目地址如下
https://github.com/yawuplus/G...
问题出现的环境背景及自己尝试过哪些方法
我觉得是由于scrapy同时并发爬取多个城市数据造成的数据问题
相关代码
首先获取每个城市的 url
def get_all_city():
response = requests.get('https://www.lianjia.com/city/', headers=header)
data = re.findall(re.compile('<li><a href="https://(\w+).lianjia.com/">(.+?)</a></li>'),
response.text)
all_city_url_list = ['https://{}.lianjia.com/'.format(tuple[0]) for tuple in data]
return all_city_url_list
import scrapy
from ..common import *
from scrapy import Request
import time
from scrapy.log import logger
from bs4 import BeautifulSoup
from ..items import NewhouseItem
class NewhouseSpider(scrapy.Spider):
    """Crawl Lianjia new-house ("楼盘") listing pages for every city.

    Bug fix (the cross-city data mixing reported in the question): the
    original stored the current city in ``self.city`` / ``self.city_url``
    while yielding requests for *all* cities.  Scrapy schedules requests
    concurrently, so by the time ``parse`` ran, those shared instance
    attributes already pointed at a later city — which is exactly how
    Shanghai rows ended up in the Shenzhen table.  Per-request state must
    travel with the request itself, here via ``Request.meta``.
    """

    name = 'newhouse'
    allowed_domains = ['lianjia.com']
    start_urls = []
    house = []
    # Kept only for backward compatibility with any external readers;
    # no longer used to pass state into parse().
    city_url = ''
    city = ''

    def __init__(self, *args, **kwargs):
        # NOTE: the original spelled this ``__int__`` — a typo, so the
        # method was never invoked.  Also call super() so Scrapy's own
        # Spider initialisation (name/kwargs handling) still runs.
        super().__init__(*args, **kwargs)
        self.start_urls = get_all_city()
        self.city_map = all_city_map()

    def start_requests(self):
        """Yield one request per listing page per city.

        The city name and base URL ride along in ``Request.meta`` instead
        of being written to ``self`` — requests are processed
        asynchronously, so instance attributes are shared across all
        in-flight cities and would race.
        """
        for city, url in all_city_map().items():
            page = get_new_house_page(url)
            logger.info('{} 一共有 {} 页'.format(city, page))
            for i in range(1, page + 1):
                crawl_url = '{}/loupan/pg{}'.format(url, str(i))
                yield Request(crawl_url, self.parse, dont_filter=True,
                              meta={'city': city, 'city_url': url})

    @staticmethod
    def _first(li, cls):
        """Return the first tag with CSS class *cls* inside *li*, or None."""
        found = li.find_all(attrs={'class': cls})
        return found[0] if found else None

    def parse(self, response):
        """Parse one listing page; yield a NewhouseItem per 楼盘 entry."""
        # Per-request city info set in start_requests() — never read it
        # from self, which is shared by all concurrent requests.
        city = response.meta['city']
        city_url = response.meta['city_url']

        soup = BeautifulSoup(response.text, 'lxml')
        ul = soup.find_all(attrs={'class': 'resblock-list post_ulog_exposure_scroll has-results'})
        for li in ul:
            item = NewhouseItem()
            item['city'] = city

            # The 'name' tag is also the anchor carrying the detail-page link.
            tag = self._first(li, 'name')
            item['loupan'] = tag.text if tag else NOT_EXIST_STR
            item['loupan_url'] = (city_url[:-1] + tag.attrs['href']) if tag else NOT_EXIST_STR

            tag = self._first(li, 'resblock-type')
            item['wuye_type'] = tag.text if tag else NOT_EXIST_STR

            tag = self._first(li, 'sale-status')
            item['sale_status'] = tag.text if tag else NOT_EXIST_STR

            tag = self._first(li, 'lj-lazy')
            item['img_url'] = tag.attrs['data-original'] if tag else NOT_EXIST_STR

            tag = self._first(li, 'resblock-location')
            item['location'] = tag.text.replace('\n', '').split('/') if tag else NOT_EXIST_STR

            tag = self._first(li, 'resblock-room')
            item['huxing'] = tag.text.replace('\n', '') if tag else NOT_EXIST_STR

            tag = self._first(li, 'resblock-area')
            if tag:
                area_text = (tag.text.replace(' ', '').replace('\n', '')
                             .replace(' 建面', '').replace('㎡', ''))
                item['area'] = []
                if area_text:
                    for part in area_text.split('-'):
                        if part.isdigit():
                            item['area'].append(int(part))
                        else:
                            # NOTE(review): appending NOT_EXIST_LIST (not a
                            # number) mirrors the original — confirm intended.
                            item['area'].append(NOT_EXIST_LIST)
            else:
                item['area'] = NOT_EXIST_LIST

            tag = self._first(li, 'number')
            if tag:
                price_text = tag.text.replace(' ', '')
                item['main_price'] = int(price_text) if price_text.isdigit() else NOT_EXIST_NUM
            else:
                item['main_price'] = NOT_EXIST_NUM

            tag = self._first(li, 'desc')
            item['main_price_desc'] = tag.text.replace('\xa0', '') if tag else NOT_EXIST_STR

            tag = self._first(li, 'second')
            if tag:
                price_text = (tag.text.replace(' 总价', '')
                              .replace(' ', '').replace(' 万 / 套起', ''))
                item['second_price'] = int(price_text) if price_text.isdigit() else NOT_EXIST_NUM
            else:
                item['second_price'] = NOT_EXIST_NUM

            tag = self._first(li, 'resblock-tag')
            if tag:
                # .split('\n') leaves empty strings at both ends; slice them off.
                parts = tag.text.split('\n')
                item['tag'] = parts[1:-1]
            else:
                item['tag'] = []

            item['crawl_time'] = int(time.time())
            yield item
### 你期待的结果是什么?实际看到的错误信息又是什么?
兄弟,你是怎么解决的?我也遇到了这个问题。