1

汽车之家车型的简单爬取
spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2 only: force the process-wide default string encoding to UTF-8 so
# that implicit str/unicode conversions of the scraped Chinese text work.
# reload(sys) is what makes sys.setdefaultencoding reachable again there.
# On Python 3 `reload` is not a builtin and the default is already UTF-8, so
# the whole step is skipped instead of raising NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl the Autohome per-letter index pages and build one carItem per car.

    One request is issued per capital letter A-Z. Each response is walked
    brand by brand, then brand sub-category by sub-category, then car by car,
    using positional XPath expressions rooted at the page's <dl> elements.
    """

    # Spider name
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Intentionally empty: the index URLs are generated in start_requests().
    start_urls = [
    ]
    # Per-spider settings: route scraped items through CarPipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one Request per index letter, tagged with that letter.

        The letter is passed in ``meta['word']`` so parse() can store it as
        the item's ``first_word``. It is bound inside the same loop that
        builds the URL; reading it after a separate loop would tag every
        request with the last letter. The class-level ``start_urls`` list is
        no longer appended to, so repeated calls do not accumulate URLs.
        """
        for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            url = 'https://www.autohome.com.cn/grade/carhtml/' + letter + '.html'
            yield Request(url, meta={'word': letter})

    def parse(self, response):
        """Extract every car listed on one letter index page.

        Reads the index letter from ``response.meta['word']``.

        Returns a list of carItem objects, one per car entry found.

        Raises IndexError when an expected node (brand name, logo, car name,
        car link or price text) is missing, because the first match is taken
        unconditionally.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        total_cars = []
        # Only the number of <dl> nodes is needed; the nodes themselves are
        # re-selected by position below.
        brand_count = len(response.xpath('//dl'))
        for brand_pos in range(1, brand_count + 1):
            # XPath prefix of the current brand
            brand_xpath = '//dl[' + str(brand_pos) + ']'
            # Brand name
            brand = response.xpath(brand_xpath + '/dt/div[1]/a/text()').extract()[0]
            print('brand:'+brand)
            # Brand logo
            brand_logo_url = response.xpath(brand_xpath + '/dt//img[1]/@src').extract()[0]
            # Brand sub-categories: names and the pages they link to
            title_xpath = brand_xpath + '/dd//div[@class="h3-tit"]/a'
            brand_items = response.xpath(title_xpath + '/text()').extract()
            brand_item_urls = response.xpath(title_xpath + '/@href').extract()
            for item_pos, brand_item in enumerate(brand_items, 1):
                brand_item_url = brand_item_urls[item_pos - 1]
                print('brand_item:'+brand_item)
                print('brand_item_url:'+brand_item_url)
                # The car list matching this sub-category is selected by the
                # same position as the sub-category title.
                cars_xpath = (brand_xpath + '/dd//ul[@class="rank-list-ul"]['
                              + str(item_pos) + ']/li[@id]')
                car_count = len(response.xpath(cars_xpath))
                print('cars_count:'+str(car_count))
                for car_pos in range(1, car_count + 1):
                    car_xpath = cars_xpath + '[' + str(car_pos) + ']'
                    # Car name
                    car_name = response.xpath(car_xpath + '/h4/a/text()').extract()[0]
                    # Page of the car
                    car_url = response.xpath(car_xpath + '/h4/a/@href').extract()[0]
                    # Quoted price text, expected as "<min>-<max>万"
                    price = response.xpath(car_xpath + '/div[1]/a/text()').extract()[0]
                    prices = price.split('-')
                    price_base = '万'
                    if len(prices) != 2:
                        # Anything that is not a "min-max" range is stored as
                        # the "not available" marker.
                        max_price = '暂无'
                        min_price = '暂无'
                    else:
                        max_price = str(prices[1].replace(price_base,''))
                        min_price = str(prices[0])
                    print('car:'+car_name+' max_price:'+str(max_price)+' min_price:'+str(min_price)+' price_base:'+price_base)
                    car_item = carItem()
                    car_item['name'] = car_name
                    car_item['url'] = car_url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    total_cars.append(car_item)
        return total_cars

item

# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """Scraped record for one car, together with its brand context."""

    # --- brand ---
    # Brand name
    brand = scrapy.Field()
    # Brand logo
    brand_logo_url = scrapy.Field()
    # First letter of the brand
    first_word = scrapy.Field()
    # Name of the brand sub-category
    brand_item = scrapy.Field()

    # --- car ---
    # Name of the specific car
    name = scrapy.Field()
    # URL of the car's introduction page
    url = scrapy.Field()
    # Lowest quoted price, unit: 万 (ten thousand)
    min_price = scrapy.Field()
    # Highest quoted price, unit: 万 (ten thousand)
    max_price = scrapy.Field()

mongo_car

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar(object):
    """Store scraped car items in MongoDB as brand, brand-item and car documents.

    All three collections live in the database named by ``db_name``. The
    ``db`` object is the project's Mongo wrapper; the only thing this class
    uses from it is ``db.set(db_name, set_name)``, which must return an
    object offering ``find_one``, ``insert_one`` and ``update_one``.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        """Open the connection using the values in ``mongo_setting``."""
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password']
        )

    def insert(self, item):
        """Upsert the brand, then insert the brand item and car if missing.

        ``item`` must provide the keys ``brand``, ``brand_logo_url``,
        ``first_word``, ``brand_item``, ``name``, ``url``, ``max_price`` and
        ``min_price``.

        Returns True when a car document is available after the call
        (either it already existed or it was just inserted), else False.
        """
        brand_where = {'name': item['brand']}
        # The logo is written on insert as well as on update, so a brand seen
        # only once still gets its logo_url stored.
        brand_fields = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_fields)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_fields)
            print('brand_exist!')

        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name within its brand item. A dict
        # literal with two 'name' keys would keep only the last one and match
        # on the car name alone, merging same-named cars of different brand
        # items into one document.
        car_where = {
            'name': item['name'],
            'brand_item_id': brand_item['_id'],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')

        return car is not None and car is not False

    def _find_or_false(self, db, set_name, where):
        """Return the first document in ``set_name`` matching ``where``, or False."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert ``doc`` into ``set_name`` and return it as read back.

        The read-back uses ``doc`` itself as the filter.
        NOTE(review): presumably insert_one adds ``_id`` to ``doc`` in place
        (as pymongo does), making that filter unique - confirm against the
        Mongo wrapper.
        """
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """``$set`` the fields of ``brand`` on the brand matching ``brand_where``.

        Returns the matching document after the update, or False if none.
        """
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        exist = my_set.find_one(brand_where)
        return False if exist is None else exist

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find_or_false(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return it as read back."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the brand-item document matching ``brand_item``, or False."""
        return self._find_or_false(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a brand-item document and return it as read back."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find_or_false(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return it as read back."""
        return self._insert_and_fetch(db, self.car_set_name, car)

pipeline

from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that stores every scraped car item through MongoCar."""

    # MongoCar helper shared by all items handled by this pipeline instance;
    # created on first use so that one is not built per item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and hand it on to the next pipeline stage.

        The item is returned because Scrapy passes the return value of
        process_item to the following pipeline component; returning nothing
        would feed None to any later stage.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item

setting

# MongoDB connection parameters: host, port, user name and password.
# The values shown are placeholders.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)

冰茶么么哒
31 声望2 粉丝