本文从 https://www.rankedftw.com/ladder/lotv/1v1/mmr/ 爬取了星际2天梯数据,并存储到本地的 MySQL 数据库中,共爬得 32 万条数据。

import requests  
from bs4 import BeautifulSoup  
import pymysql.cursors  
from config import *  
from multiprocessing import Pool  

# Connect to the database.
# Keyword arguments are used because PyMySQL 1.0+ made the connection
# parameters keyword-only; `host`, `user`, `password`, `db` and `port` are
# expected to come from the star-import of `config`.
# NOTE(review): this connection is opened at import time and is the one the
# Pool workers use as well; under a fork start method the workers would share
# a single socket -- confirm, and consider one connection per worker.
connection = pymysql.connect(
    host=host,
    user=user,
    password=password,
    database=db,
    port=port,
)
cursor = connection.cursor()


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx status code.
    """
    # Without a timeout requests waits indefinitely, so one hung connection
    # would stall the calling worker for good.
    r = requests.get(url, timeout=timeout)
    # Fail here instead of handing an error page to the HTML parser.
    r.raise_for_status()
    return r.text
   

def save_to_db(values):
    """Insert one row into the table named by ``tabel``.

    A failed insert is rolled back and reported on stdout; the error is not
    propagated to the caller.

    Args:
        values: Column values for the row, in the table's column order.
    """
    # One %s placeholder per value: the driver does the quoting, so scraped
    # names containing quotes or backslashes can neither break the statement
    # nor inject SQL. The table name cannot be a bound parameter, so it is
    # still formatted in.
    placeholders = ', '.join(['%s'] * len(values))
    insert_ = 'INSERT INTO {} VALUES ({})'.format(tabel, placeholders)
    try:
        # execute() returns the affected row count; commit only when a row
        # was actually written.
        if cursor.execute(insert_, tuple(values)):
            connection.commit()
    except Exception as e:
        # Best-effort insert: undo, report, and let the caller carry on.
        # (Exception rather than a bare except, so Ctrl-C still interrupts.)
        connection.rollback()
        print('插入失败', values, e)


def _parse_row(row):
    """Return the ordered list of column values for one ladder table row."""
    numbers = row.find_all('td', class_='number')
    images = row.find_all('td', class_='img')

    # The league icon may be absent; '0' is stored in that case.
    league_img = images[1].find('img', class_='league')
    if league_img is not None:
        # File name of the icon with its last 10 characters cut off.
        league = league_img.get('src').split('/')[-1][:-10]
    else:
        league = '0'

    infomation = {
        'Rank': int(numbers[0].text),
        'Name': row.find('span', class_='name').text,
        'MMR': int(numbers[1].text),
        'Points': int(numbers[2].text),
        'Wins': int(numbers[3].text),
        'Losses': int(numbers[4].text),
        # Stored as a constant 0; the sixth number cell is never read.
        'Played': 0,
        'WinRate': numbers[6].text,
        'Age': numbers[7].text,
        # Two characters taken from the tail of the region icon path.
        'Region': images[0].find('img').get('src')[-12:-10],
        'League': league,
        'Tier': int(images[2].text),
        'Race': row.find('img', class_='race').get('src').split('/')[-1][:-10],
    }
    # Dict insertion order is the column order handed to save_to_db().
    return list(infomation.values())


def get_info(html):
    """Parse one ladder page and store every player row in the database.

    Args:
        html: Markup of a ladder page containing a ``table.team-size-1``.

    Raises:
        AttributeError, IndexError, ValueError: When the page or a row does
            not have the expected structure.
    """
    soup = BeautifulSoup(html, 'lxml')
    infos = soup.find('table', class_='team-size-1').find_all('tr')

    # The first <tr> is skipped (presumably the header row -- confirm);
    # every remaining row is parsed and saved individually.
    for row in infos[1:]:
        save_to_db(_parse_row(row))


def main(offset):
    """Fetch and store the ladder page for one page index.

    The page index is multiplied by 100 to form the ``offset`` query
    parameter. Any ``Exception`` raised along the way is printed rather
    than propagated, so the function always returns ``None``.
    """
    try:
        page_url = 'https://www.rankedftw.com/ladder/lotv/1v1/mmr/?offset={}'.format(100 * offset)
        get_info(get_html(page_url))
    except Exception as err:
        print(err)


if __name__ == '__main__':
    # Fan the 3254 page indices out over a pool of worker processes.
    # The context manager tears the pool down even when map() raises, and
    # the finally clause makes sure the database connection is closed.
    try:
        with Pool() as pool_:
            pool_.map(main, range(3254))
    finally:
        connection.close()

HH丶丶
29 声望8 粉丝