汽车之家车型的简单爬取
spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2-only workaround: switch the implicit str/unicode codec to UTF-8 so
# the Chinese text handled by this spider can be concatenated and printed.
# Both the reload() builtin and sys.setdefaultencoding() are absent on
# Python 3 (where text is already unicode), so the hack is skipped there
# instead of crashing the module at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl the per-initial-letter car listing pages of autohome.com.cn.

    One listing page is requested for every letter A-Z.  ``parse`` walks each
    brand (``<dl>``), each sub-brand heading inside it and each car listed
    under that sub-brand, and yields one ``carItem`` per car.
    """

    # Spider name
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Left empty here; start_requests() fills an instance-level copy.
    start_urls = [
    ]
    # Custom settings: send this spider's items to CarPipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter travels in ``meta['word']`` so ``parse`` can record it as
        the item's ``first_word``.  The request is built inside the letter
        loop; building it in a second loop would leave ``word`` stuck at the
        last letter for every request.
        """
        # All initial letters
        words = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        # Rebind on the instance so the shared class-level list is never
        # mutated and repeated calls do not accumulate duplicate URLs.
        self.start_urls = []
        for word in words:
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            self.start_urls.append(url)
            yield Request(url, meta={'word': word})

    def parse(self, response):
        """Extract every car on one listing page.

        Args:
            response: listing page response; ``response.meta['word']`` must
                hold the initial letter the page was requested for.

        Yields:
            carItem: one item per car that has both a name and a URL.
            Brands without a name and cars without a name/URL are logged and
            skipped instead of aborting the whole page.
        """
        self.logger.debug('url: %s', response.url)
        word = response.meta['word']
        brand_count = len(response.xpath('//dl'))
        for brand_num in range(1, brand_count + 1):
            # XPath positions are 1-based.
            brand_xpath = '//dl[' + str(brand_num) + ']'
            # Brand name
            brand = response.xpath(brand_xpath + '/dt/div[1]/a/text()').extract_first()
            if brand is None:
                self.logger.warning('brand #%d on %s has no name, skipped',
                                    brand_num, response.url)
                continue
            self.logger.debug('brand: %s', brand)
            # Brand logo
            brand_logo_url = response.xpath(
                brand_xpath + '/dt//img[1]/@src').extract_first(default='')
            # Sub-brand names and the pages they link to
            brand_items = response.xpath(
                brand_xpath + '/dd//div[@class="h3-tit"]/a/text()').extract()
            brand_item_urls = response.xpath(
                brand_xpath + '/dd//div[@class="h3-tit"]/a/@href').extract()
            for brand_item_num, brand_item in enumerate(brand_items, 1):
                # The link is only logged; tolerate a missing href.
                if brand_item_num <= len(brand_item_urls):
                    brand_item_url = brand_item_urls[brand_item_num - 1]
                else:
                    brand_item_url = ''
                self.logger.debug('brand_item: %s', brand_item)
                self.logger.debug('brand_item_url: %s', brand_item_url)
                # All cars of this sub-brand: the N-th heading is paired with
                # the N-th "rank-list-ul" list of the same brand.
                cars = response.xpath(
                    brand_xpath + '/dd//ul[@class="rank-list-ul"]['
                    + str(brand_item_num) + ']/li[@id]')
                self.logger.debug('cars_count: %d', len(cars))
                for car in cars:
                    # Car name and the page it links to
                    name = car.xpath('./h4/a/text()').extract_first()
                    url = car.xpath('./h4/a/@href').extract_first()
                    if name is None or url is None:
                        self.logger.warning(
                            'car without name/url under %s / %s on %s, skipped',
                            brand, brand_item, response.url)
                        continue
                    # Quoted price range (lowest-highest)
                    price = car.xpath('./div[1]/a/text()').extract_first(default='')
                    min_price, max_price = self._split_price(price)
                    self.logger.debug('car: %s max_price: %s min_price: %s',
                                      name, max_price, min_price)
                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    yield car_item

    @staticmethod
    def _split_price(price):
        """Split a "low-high万" price string into ``(min_price, max_price)``.

        Anything that is not exactly two '-'-separated parts (including an
        empty string) yields the placeholder '暂无' for both values.
        """
        price_base = '万'
        prices = price.split('-')
        if len(prices) != 2:
            return '暂无', '暂无'
        # Only the upper bound carries the unit suffix.
        return prices[0], prices[1].replace(price_base, '')
item
# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """One scraped car model together with its brand context."""

    # --- the car itself ---
    # Model name
    name = scrapy.Field()
    # URL of the model's introduction page
    url = scrapy.Field()
    # Lowest quoted price, unit: 万 (ten thousand)
    min_price = scrapy.Field()
    # Highest quoted price, unit: 万 (ten thousand)
    max_price = scrapy.Field()

    # --- where the car belongs ---
    # Initial letter of the brand
    first_word = scrapy.Field()
    # Brand name
    brand = scrapy.Field()
    # Brand logo URL
    brand_logo_url = scrapy.Field()
    # Sub-brand name
    brand_item = scrapy.Field()
mongo_car
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar():
    """Store scraped car items in MongoDB.

    Each item is spread over three collections of the ``car`` database:
    ``brand``, ``brand_item`` (sub-brand) and ``car``.  The public
    ``*_exist`` / ``insert_*`` / ``update_brand`` helpers take the database
    wrapper as their ``db`` argument.
    NOTE(review): ``db.set(db_name, set_name)`` is assumed to return a
    pymongo-style collection (``find_one`` / ``insert_one`` / ``update_one``)
    - confirm against ``mininova.mongodb.Mongo``.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        """Open the connection described by ``mongo_setting``."""
        self.db = Mongo(mongo_setting['mongo_host'], mongo_setting['mongo_port'],
                        mongo_setting['mongo_user'], mongo_setting['mongo_password'])

    def insert(self, item):
        """Upsert the brand, sub-brand and car described by ``item``.

        Args:
            item: mapping with the keys ``name``, ``url``, ``max_price``,
                ``min_price``, ``brand``, ``brand_logo_url``, ``brand_item``
                and ``first_word``.

        Returns:
            bool: True when the car document exists after the call (it was
            already stored or has just been inserted), False otherwise.
        """
        # Brand: the logo is written on the first insert as well as on
        # updates, so a brand seen only once still gets its logo_url.
        brand_where = {'name': item['brand']}
        brand_doc = {'name': item['brand'], 'first_word': item['first_word'],
                     'logo_url': item['brand_logo_url']}
        if self.brand_exist(self.db, brand_where):
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')
        else:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')

        # Sub-brand
        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item:
            print('brand_item_exist!')
        else:
            brand_item = {'name': item['brand_item'], 'first_word': item['first_word'],
                          'brand_id': brand['_id']}
            brand_item = self.insert_brand_item(self.db, brand_item)
            print('brand_item insert ok!')

        # Car: looked up by name within its sub-brand.  A filter written with
        # the key 'name' twice keeps only the last value, which would drop
        # the sub-brand from the lookup entirely.
        car_where = {'name': item['name'], 'brand_item_id': brand_item['_id']}
        car = self.car_exist(self.db, car_where)
        if car:
            print('car_exist!')
        else:
            car = {'name': item['name'], 'url': item['url'],
                   'max_price': item['max_price'], 'min_price': item['min_price'],
                   'first_word': item['first_word'], 'brand_id': brand['_id'],
                   'brand_item_id': brand_item['_id']}
            car = self.insert_car(self.db, car)
            print('car insert ok!')
        # bool() also maps a failed read-back (None) to False.
        return bool(car)

    def _find_one(self, db, set_name, where):
        """Return the first document matching ``where``, or False if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` via ``$set`` and return the matching doc or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        exist = my_set.find_one(brand_where)
        return False if exist is None else exist

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find_one(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand document matching ``brand_item``, or False."""
        return self._find_one(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find_one(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert_and_fetch(db, self.car_set_name, car)
pipeline
from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that stores every scraped car through ``MongoCar``."""

    def __init__(self):
        # Created lazily and then reused, so one MongoCar serves the whole
        # crawl instead of a new one being built for every item.
        self.mongo_car = None

    def process_item(self, item, spider):
        """Persist ``item`` and hand it on to the next pipeline stage.

        Args:
            item: the scraped car item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``.  Scrapy passes the return value to later
            pipeline components, so returning nothing would feed them None.
        """
        if self.mongo_car is None:
            self.mongo_car = MongoCar()
        self.mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
setting
# MongoDB connection parameters, read by MongoCar.__init__ under these keys.
# NOTE(review): the host and credentials look like placeholders - replace
# them with real values before running.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用。你还可以使用 @ 来通知其他用户。