Python 网络爬虫：myip.ms

目标网站：https://myip.ms
难度：10 颗星
具有极强的反爬虫能力：会封 IP
'''

------------------------------
https://myip.ms/browse/web_hosting/1/countryID/ALA%5EASM

------------------------------
'''

import os
import csv
import time
import random
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


class LoopOver(Exception):
    """Signal that iteration over a result page has finished.

    ``Spider.parse_page`` raises this when a row cannot be paired with its
    detail row, or when the page holds fewer than 50 rows.
    """

    def __init__(self, *args, **kwargs):
        """Accept any positional or keyword arguments without using them."""
        return None


class Spider:
    def __init__(self) -> None:
        """Set up file names, start a Chrome session, and define the HTML templates.

        Creating an instance launches a Chrome browser through Selenium.
        """
        # CSV storage: working directory, input list, default output and log file names.
        self.path = '.'
        self.inputfilename = 'country.csv'
        self.csvfilename = 'datas.csv'
        self.logfilename = 'run.log'

        # No options are configured on this object before it is handed to Chrome.
        options = webdriver.ChromeOptions()

        self.browser = webdriver.Chrome(options=options)
        # Explicit wait bound to the browser, constructed with a timeout value of 20.
        self.wait = WebDriverWait(self.browser, 20)

        # Link template; the single placeholder is filled with a country id by the callers.
        self.listurl = 'https://myip.ms/browse/web_hosting/1/countryID/{}'

        # Prefix prepended to the relative links extracted in parse_list.
        self.host = 'https://myip.ms'

        # HTML template for the list export: first placeholder is the paragraph text
        # (save_html_list passes the country), second is the concatenated <tr> rows.
        self.tempalte = '''
<p>
    {}
</p>
<table border="5">
    <thead class="tableFloatingHeaderOriginal">
        <tr valign="middle">
            <th class="nobackgroundimage" align="center" style="width: 32px;">
                No
            </th>
            <th colfirst="ip_owners" align="center" title-orig="Hosting Company" class="header" style="width: 163px;">
                Hosting Company</th>

            <th align="center" title-orig="Website/s" class="header" style="width: 114px;">
                Website/s</th>
            <th align="center" title-orig="Total Websites use this company IPs" class="header headerSortUp"
                style="width: 92px;">
                Total Websites use this company IPs</th>
            <th align="center" title-orig="TOP Websites use this company IPs" class="header" style="width: 77px;">
                TOP Websites use this company IPs</th>
            <th align="center" title-orig="Diagram" class="header" style="width: 38px;">
                Record Update Time</th>
        </tr>
    </thead>
    <tbody>
        {}
    </tbody>
</table>
        
        '''

        # HTML template for a detail page export: the single placeholder receives
        # the concatenated <tr> rows (see save_html_page).
        self.tempalte_page = '''
<table border="5">
    <thead class="tableFloatingHeaderOriginal">
        <tr valign="middle">
            <th class="nobackgroundimage" align="center" style="width: 44px;"><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen Mode</span></div><div class="edit-icon-tmp normal ui-button ui-widget ui-state-default ui-corner-all ui-button-text-icon-primary" role="button" title-orig="View Table in Full-screen Mode" style="position: absolute; z-index: 1001; left: 5px; top: 568.6px; display: none;"><span class="ui-button-icon-primary ui-icon ui-icon-arrow-4-diag"></span><span class="ui-button-text">Full-screen 
Mode</span></div>No</th>
            <th colfirst="sites" align="center" title-orig="Web Site" class="header" style="width: 153px;">
            Web Site</th>
            <th align="center" title-orig="Website IP Address" class="header" style="width: 144px;">
            Website IP Address</th>
            <th align="center" title-orig="Web Hosting Company / IP Owner" class="header" style="width: 178px;">
                Website IPV6 Address</th>
            <th align="center" title-orig="Web Hosting / Server IP Location" class="header" style="width: 134px;">
                World Site Popular</th>
            <th align="center" title-orig="Web Hosting City" class="header" style="width: 105px;">
                World Site Popular Rating</th>
            <th align="center" title-orig="World Site Popular Rating" class="header headerSortDown" style="width: 86px;">
            DNS Records</th>
            <th align="center" title-orig="Diagram" class="header" style="width: 38px;">
            Record Update Time</th>
            </tr>
    </thead>
    <tbody>
        {}
    </tbody>
</table>
                '''

    def turn2filename(self, dst):
        """Return ``dst`` with every character unsuitable for a file name removed.

        The removed characters are: backslash, slash, colon, asterisk,
        question mark, double quote, angle brackets and the pipe.
        """
        cleaned = dst
        # Strip the characters one at a time, in the same order as before.
        for forbidden in ('\\', '/', ':', '*', '?', '"', '<', '>', '|'):
            cleaned = cleaned.replace(forbidden, '')
        return cleaned

    def run(self):
        strat = time.time()

        self.get_input()
        # 71
        for c, cid in self.datas[115:116]:
            print('>>> ', c, self.listurl.format(cid))

            for item_index, item in enumerate(self.parse_list(self.get_list(self.listurl.format(cid)))):
                if item[1] == '- No Records Found -':
                    item[0] = c
                if c in ['British Indian Ocean Territory', 'Brunei', 'Bulgaria']:
                    self.save_data(item=item, filename=self.turn2filename(c) + '.csv')
                else:
                    self.save_data(item=item, filename='data.csv')
            time.sleep(0)

            end = time.time()

            self.runtime = end - strat
            print('用时{}'.format(self.runtime))

        end = time.time()

        self.runtime = end - strat

    def get_input(self):
        with open(self.inputfilename, 'r', encoding='utf_8') as f:
            reader = csv.reader(f)
            self.datas = [i for i in list(reader) if i]

    def mkurl(self, kw):
        for i in range(0, 1):
            yield self.listurl.format(kw, i * 10)

    def get_list(self, url):
        """Load ``url`` in the browser and return the browser object.

        The method retries forever until a load attempt completes:

        * If the results table appears within the wait timeout, the browser
          is returned.
        * If the wait times out and the page shows the "a Robot" check, the
          captcha submit button is clicked and the page is loaded again.
        * If the wait times out on any other page, the browser is returned
          as it is.
        * On any other error the attempt is repeated; when the browser ended
          up on a different URL, the browser is restarted first.

        Returns:
            The Selenium browser (``self.browser``) positioned on the page.
        """
        table_locator = (
            By.XPATH, '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]')
        while True:
            try:
                self.browser.get(url)
                try:
                    self.wait.until(
                        EC.presence_of_element_located(table_locator))
                except TimeoutException:
                    # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                    # which no longer exists in current Selenium releases.
                    banner = self.browser.find_element(
                        By.XPATH, '/html/body/div[2]/div/div/div/center')
                    if 'a Robot' in banner.text:
                        self.browser.find_element(
                            By.XPATH, '//*[@id="captcha_submit"]').click()
                        time.sleep(1)
                        # Hand control to the retry branch below.
                        raise RuntimeError(
                            'robot check submitted, reloading {}'.format(url))
                return self.browser
            except Exception as error:
                print('error >>> ', error)
                # A different URL means we were redirected away; start over
                # with a fresh browser session.
                if self.browser.current_url != url:
                    self.browser.quit()
                    self.browser = webdriver.Chrome()
                    self.wait = WebDriverWait(self.browser, 20)
                    time.sleep(1)

    def parse_list(self, response):
        """Yield one record per non-"expand" row of the ``web_hosting_tbl`` table.

        Args:
            response: Object exposing the page HTML as ``page_source``
                (the Selenium browser returned by ``get_list``).

        Yields:
            A list ``[country_name, No, Hosting_Company, Website,
            Total_Websites_use_this_company_IPs,
            TOP_Websites_use_this_company_IPs, record_update_time, url]``
            where ``url`` is ``self.host`` plus the link found in the second
            cell. Any value whose node is missing is the empty string.
        """
        html = etree.HTML(response.page_source)

        def pop(attr):
            # First match, trimmed, with newlines and double spaces removed.
            return attr[0].strip().replace(
                '\n', '').replace('  ', '') if attr else ''

        for tr in html.xpath('//*[@id="web_hosting_tbl"]/tbody/tr[not(contains(@class,"expand"))]'):
            # Guard the first cell like every other column, so a row without
            # text there yields '' instead of raising IndexError.
            number_nodes = tr.xpath('./td[1]/text()')
            No = number_nodes[0].strip() if number_nodes else ''

            Hosting_Company = pop(tr.xpath('./td[2]/a/text()'))

            page_url = pop(tr.xpath('./td[2]/a/@href'))

            country_name = pop(tr.xpath('./td[3]/a/text()'))

            Website = pop(tr.xpath('./td[4]/a/text()'))

            Total_Websites_use_this_company_IPs = pop(
                tr.xpath('./td[5]/a/text()'))

            TOP_Websites_use_this_company_IPs = pop(
                tr.xpath('./td[6]/a/text()'))

            record_update_time = pop(
                tr.xpath('./td[7]/text()'))

            yield [country_name, No, Hosting_Company, Website, Total_Websites_use_this_company_IPs,
                   TOP_Websites_use_this_company_IPs, record_update_time, self.host + page_url]

    def get_page(self, url):
        """Load the detail page ``url`` in the browser and return the browser object.

        The method retries forever until a load attempt completes:

        * If the results table appears within the wait timeout, the browser
          is returned.
        * If the wait times out, the timeout is printed. When the page shows
          the "a Robot" check, the captcha submit button is clicked and the
          page is loaded again after 5 seconds; otherwise the browser is
          returned as it is.
        * On any other error the attempt is repeated; when the browser ended
          up on a different URL, the browser is restarted and the method
          waits 100 seconds first.

        Returns:
            The Selenium browser (``self.browser``) positioned on the page.
        """
        table_locator = (
            By.XPATH, '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]')
        while True:
            try:
                self.browser.get(url)
                try:
                    self.wait.until(
                        EC.presence_of_element_located(table_locator))
                except TimeoutException as error:
                    print('//*[@id="sites_tbl" or @id ="web_hosting_tbl"] error', error)

                    # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                    # which no longer exists in current Selenium releases.
                    banner = self.browser.find_element(
                        By.XPATH, '/html/body/div[2]/div/div/div/center')
                    if 'a Robot' in banner.text:
                        self.browser.find_element(
                            By.XPATH, '//*[@id="captcha_submit"]').click()
                        time.sleep(5)
                        # Hand control to the retry branch below.
                        raise RuntimeError(
                            'robot check submitted, reloading {}'.format(url))
                return self.browser
            except Exception as error:
                print('error >>> ', error)
                # A different URL means we were redirected away; start over
                # with a fresh browser session after a long pause.
                if self.browser.current_url != url:
                    self.browser.quit()
                    self.browser = webdriver.Chrome()
                    self.wait = WebDriverWait(self.browser, 20)
                    time.sleep(100)

    def parse_page(self, response):
        """Yield one record per site row of the results table on a detail page.

        Each regular row is paired by position with the "expand" row of the
        same index, which supplies the IPv6 address, popularity text, DNS
        records and update time.

        Args:
            response: Object exposing the page HTML as ``page_source``.

        Yields:
            ``[No, web_site, web_site_ip_address, web_site_ipv6_address,
            website_popularity, website_popularity_rating, dns_records,
            record_update_time]``.

        Raises:
            LoopOver: when a row or its paired "expand" row is missing, or,
                after all rows were yielded, when the page held fewer than
                50 rows (the page HTML is then written to ``error.html``).
        """
        text = response.page_source
        html = etree.HTML(text)

        def first_text(nodes):
            # First match, trimmed, with newlines and double spaces removed.
            if not nodes:
                return ''
            return nodes[0].strip().replace('\n', '').replace('  ', '')

        all_rows_xpath = '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]/tbody/tr[not(contains(@class,"expand"))]'
        nth_row_xpath = '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]/tbody/tr[not(contains(@class,"expand"))][{}]'
        nth_detail_xpath = '//*[@id="sites_tbl" or @id ="web_hosting_tbl"]/tbody/tr[contains(@class,"expand")][{}]'

        row_count = len(html.xpath(all_rows_xpath))
        print('len is ', row_count)
        try:
            position = 1
            while position <= row_count:
                tr = html.xpath(nth_row_xpath.format(position))[0]
                tre = html.xpath(nth_detail_xpath.format(position))[0]

                # Values taken from the regular row.
                No = first_text(tr.xpath('./td[1]/text()'))
                web_site = first_text(tr.xpath('./td[2]/a/text()'))
                web_site_ip_address = first_text(tr.xpath('./td[3]/a/text()'))
                website_popularity_rating = first_text(
                    tr.xpath('./td[7]/span/text()'))

                # Values taken from the paired "expand" row.
                web_site_ipv6_address = first_text(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"IPv6")]/../following-sibling::*[1]//a/text()'))
                website_popularity = first_text(
                    tre.xpath('./td[1]/div/span[@class="bold arial grey"]/text()'))
                dns_records = '\n'.join(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"DNS")]/../following-sibling::*[1]//a/text()'))
                record_update_time = first_text(tre.xpath(
                    './td[1]/div[@class="stitle"]/b[contains(text(),"Record Update Time")]/../following-sibling::div/text()'))

                yield [No, web_site, web_site_ip_address, web_site_ipv6_address, website_popularity,
                       website_popularity_rating, dns_records, record_update_time]
                position += 1
        except IndexError:
            raise LoopOver
        if row_count < 50:
            # Keep the short page on disk for inspection before stopping.
            with open('error.html', 'w', encoding='utf-8') as dump:
                dump.write(text)
            raise LoopOver

    def save_data(self, filename=None, path=None, item=None):
        if not filename:
            filename = self.csvfilename
        if not path:
            path = self.path

        '''
        保存文件
        '''
        with open('{}/{}'.format(path, filename), 'a', encoding='utf_8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def save_log(self, info):
        with open(self.logfilename, 'a', encoding='utf-8') as f:
            f.write(info + '   ' + time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime()) + '\n')

    def save_html_list(self, country, items, filename=None, path=None):
        tr = ''
        for item in items:
            t = ''
            for index, it in enumerate(item):
                if index == 1:
                    td = '<td><a href="./data/{}-{}.html">{}</a></td>'.format(
                        country, it.replace("\\", "").replace("/", "").replace(":", "").replace("*", "").replace("?",
                                                                                                                 "").replace(
                            "\"", "").replace("<", "").replace(">", "").replace("|", ""), it)
                else:
                    td = '<td>{}</td>'.format(it)
                t += td
            tr += '<tr>' + t + '</tr>'
        with open('main.html', 'a', encoding='utf-8') as f:
            f.write(self.tempalte.format(country, tr))

    def save_html_page(self, country, items, filename=None, path=None, it=None):
        if not os.path.exists(path):
            os.mkdir(path)
        tr = ''
        for index, item in enumerate(items):
            t = ''
            for it in item:
                td = '<td>{}</td>'.format(it)
                t += td
            tr += '<tr>' + t + '</tr>'
        with open('./{}/{}'.format(path, filename), 'w', encoding='utf-8') as f:
            f.write(self.tempalte_page.format(tr))

    @property
    def time(self):
        return '总共用时：{}秒'.format(self.runtime)


if __name__ == '__main__':
    # Script entry point: build the crawler (this starts Chrome), run it,
    # and print the total run time.
    spider = Spider()
    try:
        spider.run()
        print(spider.time)  # total run time
    finally:
        # Always end the browser session so Chrome and its driver process
        # are not left running, even when the crawl raises.
        spider.browser.quit()
python网络爬虫 myip.ms

universe_king

引用和评论

apple silicon 的 mac 上有哪些好用的安卓 app 模拟器？

python与nodejs哪个性能高

Anaconda安装教程以及Anaconda和pip配置国内镜像

如何减少跨团队交付摩擦？——基于 DevOps 与敏捷的最佳实践

Python 描述符

科学计算编程涉及到的技术栈简介

使用 chardet 判断文件编码需要注意的坑——过大的文件会导致高耗时