# image.png

# image.png

import re
import os
import csv
import json
import time
import requests
from urllib.parse import urlencode


class Spider:
    """Crawl the kuwo.cn artist-list API, export rows to CSV and save avatars.

    ``run()`` drives the whole crawl: it builds the page URLs (``mkurl``),
    fetches each page (``get_page``), extracts the artist fields
    (``parse_page``), appends one CSV row per artist (``save_data``) and
    downloads the artist picture (``img_downloader``).
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Keys read from every artist record, in the order parse_page yields them.
    _FIELDS = ('artistFans', 'albumNum', 'mvNum', 'musicNum',
               'name', 'aartist', 'pic')

    def __init__(self):
        self.path = '.'
        self.img_path = 'imgs'
        self.csvfilename = 'datas.csv'
        self.logfilename = 'run.log'

        # Running index used as a prefix for each saved image file name.
        self.flag = 0
        # Duration of the last run() in seconds; 0.0 until run() has finished,
        # so the ``time`` property is always safe to read.
        self.runtime = 0.0

    def _csv_path(self):
        """Return the path of the CSV output file."""
        return os.path.join(self.path, self.csvfilename)

    def run(self):
        """Run the full crawl and record the elapsed time in ``self.runtime``.

        Any CSV left over from a previous run is removed first, then a header
        row is written followed by one row per artist.
        """
        start = time.time()
        try:
            try:
                os.remove(self._csv_path())
            except FileNotFoundError:
                # Nothing to clean up on a first run.
                pass
            self.save_data(['名字', '别名', '粉丝数', '专辑数', 'mv数', '音乐发行数量', '头像链接'])

            for url in self.mkurl():
                page = self.get_page(url)
                for (artistFans, albumNum, mvNum, musicNum,
                     name, aartist, pic) in self.parse_page(page):
                    self.save_data([name, aartist, artistFans,
                                    albumNum, mvNum, musicNum, pic])
                    self.img_downloader(pic, name)
        finally:
            # Record the duration even when the crawl aborts part-way.
            self.runtime = time.time() - start

    def mkurl(self):
        """Yield the artist-list API URL for pages 1 through 3."""
        params = {
            'category': '0',
            'prefix': '',
            'pn': '3',
            'rn': '102',
            'httpsStatus': '1',
            'reqId': '01a9c5f0-bb51-11ea-a4ae-6f62032e0cbe',
        }
        for page in range(1, 4):
            # 'pn' already exists in params, so overriding it keeps its
            # position in the query string.
            yield ('http://www.kuwo.cn/api/www/artist/artistInfo?'
                   + urlencode(dict(params, pn=page)))

    def get_page(self, url):
        """Fetch ``url`` with the site's expected headers and return the response.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status.
        """
        headers = {
            'Cookie': 'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1593486940; _ga=GA1.2.2081093336.1593486940;'
            ' _gid=GA1.2.1984756662.1593486940; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1593505734;'
            ' kw_token=TBONPSLXNY',
            'csrf': 'TBONPSLXNY',
            'Host': 'www.kuwo.cn',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://www.kuwo.cn/singers',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }
        response = requests.get(url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        return response

    @staticmethod
    def _field(item, key):
        """Return ``item[key]`` as a stripped string, or '' when absent/None."""
        value = item.get(key)
        return '' if value is None else str(value).strip()

    def parse_page(self, response):
        """Yield one tuple of strings per artist found in ``response``.

        The tuple order is ``(artistFans, albumNum, mvNum, musicNum, name,
        aartist, pic)``. Values are read straight from the decoded JSON, so a
        key's position in the object and quotes/commas inside a value cannot
        affect the result. A payload without ``data``/``artistList`` yields
        nothing.
        """
        payload = response.json() or {}
        artists = (payload.get('data') or {}).get('artistList') or []
        for item in artists:
            yield tuple(self._field(item, key) for key in self._FIELDS)

    def save_data(self, item):
        """Append ``item`` as one row to the CSV file and print a progress mark."""
        print('-', end='')
        # utf_8_sig writes a BOM when the file is first created (nothing is
        # re-emitted on later appends); newline='' lets the csv module control
        # line endings.
        with open(self._csv_path(), 'a', encoding='utf_8_sig', newline='') as csvfile:
            csv.writer(csvfile).writerow(item)

    @staticmethod
    def _safe_filename(name):
        """Replace characters that are invalid in file names with '_'."""
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(name))

    def img_downloader(self, url, name):
        """Download the image at ``url`` into ``self.img_path``.

        The file is named ``<flag>-<name>.jpg``; ``self.flag`` is incremented
        after each image that is actually saved. A missing URL or a failed
        download is skipped so one bad picture does not abort the crawl.
        """
        if not url:
            # Artist record had no picture link; nothing to fetch.
            return
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }
        try:
            response = requests.get(url, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
            # Avoid saving an HTML error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            print('\nskip image {}: {}'.format(url, exc))
            return

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.img_path, exist_ok=True)
        filename = '{}-{}.jpg'.format(self.flag, self._safe_filename(name))
        with open(os.path.join(self.img_path, filename), 'wb') as f:
            f.write(response.content)
        self.flag += 1

    @property
    def time(self):
        """Human-readable summary of the last run's duration."""
        return '总共用时:{}秒'.format(self.runtime)


if __name__ == '__main__':
    # Script entry point: build a crawler and run the complete crawl once.
    crawler = Spider()
    crawler.run()

# universe_king
# 3.4k 声望678 粉丝