4

国家企业信用公示系统的爬取

1. 网站分析

1.1 获取首页

  • 通过 requests.get 直接请求网站首页,会返回 521 状态码,响应体是一段 js 代码。这是因为网站采用了加速乐(jsl)反爬技术:服务器在返回页面前先检查客户端的 cookie 是否正确,如果不正确,就返回 521 状态码和一段 js 代码,同时通过 Set-Cookie 下发第一个 cookie(__jsluid);返回的 js 代码在浏览器中执行后又会生成第二个 cookie(__jsl_clearance)。只有把这两个 cookie 一起发送给服务器,才会返回正确的网页内容。
  • 解决方法:
    通过python的 execjs 执行返回的js,拿到新的cookie,和第一次请求的cookie一起发送给服务器。
  • 具体操作:

    # First request through the shared session (self.cookie_url is presumably the
    # site's home page -- confirm). The sample output below shows the body is a
    # JavaScript challenge rather than the real page.
    response = self.session.get(self.cookie_url)
    # Raw response body: the obfuscated <script>...</script> challenge.
    js_code1 = response.text
    print(js_code1)
    # Cookies set by this first response (__jsluid in the sample output below).
    print(response.cookies)

    输出

    <script>var x="toLowerCase@@@@267@@@@window@36@@@@firstChild@div@@catch@@var@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@@dpUWKvmM@@substr@@f@53@30@@split@@String@e@location@cookie@g@if@3D@@1@@for@@RegExp@1500@0@@try@pathname@@@@@length@@@@@reverse@1553939630@DOMContentLoaded@@Path@@19@eval@@https@parseInt@chars@innerHTML@charAt@return@2@while@@match@attachEvent@@challenge@@@Sat@@search@10@charCodeAt@createElement@hgCxEv@JgSe0upZ@__jsl_clearance@Array@addEventListener@50@@2FP@setTimeout@a@false@fromCharCode@8@Mar@@0xEDB88320@href@@captcha@replace@function@0xFF@@new@@@d@onreadystatechange@document@@join@@GMT@@toString@Expires@W7@else@@BWEUk".replace(/@*$/,"").split("@"),y="j 3=34(){2r('y.30=y.1e+y.2f.33(/[\\?|&]32-2a/,\\'\\')',1a);3c.10='2l=1p.5|1b|'+(34(){j 3=2m(+[[[-~{}+(+!-[])+(+!-[])]*(-~{}+(+!-[])+(+!-[]))]]),1g=['%',[[(+!-[])+(+!-[])]/~~''+[[]][1b]][1b].22(24),'3n%2q',[{}+[]+[]][1b].22(-~~~''-~[-~~~''+(-~~~''<<-~~~'')+((+!-[])+(+!-[])^-~[])]),'3k',(!{}+[]+[[]][1b]).22(-~!{})+[[(+!-[])+(+!-[])]/~~''+[[]][1b]][1b].22(24),'2j',((-~~~''<<-~~~'')+[[]][1b]),'m%13'];17(j 1n=1b;1n<1g.1j;1n++){3.1o()[1n]=1g[1n]};23 3.3e('')})()+';3j=2d, s-2w-1u 2g:r:2o 3g;1s=/;'};12((34(){1d{23 !!9.2n;}h(x){23 2t;}})()){3c.2n('1q',3,2t)}3l{3c.28('3b',3)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}</script>
    <RequestsCookieJar[<Cookie __jsluid=30d33a1b087a857d4fd13e63223f63c9 for www.gsxt.gov.cn/>]>
    

    对这段代码进行格式化(此时代码仍处于混淆状态),得到

    <script>
     var x = "toLowerCase@@@@267@@@@window@36@@@@firstChild@div@@catch@@var@rOm9XFMtA3QKV7nYsPGT4lifyWwkq5vcjH2IdxUoCbhERLaz81DNB6@@dpUWKvmM@@substr@@f@53@30@@split@@String@e@location@cookie@g@if@3D@@1@@for@@RegExp@1500@0@@try@pathname@@@@@length@@@@@reverse@1553939630@DOMContentLoaded@@Path@@19@eval@@https@parseInt@chars@innerHTML@charAt@return@2@while@@match@attachEvent@@challenge@@@Sat@@search@10@charCodeAt@createElement@hgCxEv@JgSe0upZ@__jsl_clearance@Array@addEventListener@50@@2FP@setTimeout@a@false@fromCharCode@8@Mar@@0xEDB88320@href@@captcha@replace@function@0xFF@@new@@@d@onreadystatechange@document@@join@@GMT@@toString@Expires@W7@else@@BWEUk".replace(/@*$/, "").split("@"),
     y = "j 3=34(){2r('y.30=y.1e+y.2f.33(/[\\?|&]32-2a/,\\'\\')',1a);3c.10='2l=1p.5|1b|'+(34(){j 3=2m(+[[[-~{}+(+!-[])+(+!-[])]*(-~{}+(+!-[])+(+!-[]))]]),1g=['%',[[(+!-[])+(+!-[])]/~~''+[[]][1b]][1b].22(24),'3n%2q',[{}+[]+[]][1b].22(-~~~''-~[-~~~''+(-~~~''<<-~~~'')+((+!-[])+(+!-[])^-~[])]),'3k',(!{}+[]+[[]][1b]).22(-~!{})+[[(+!-[])+(+!-[])]/~~''+[[]][1b]][1b].22(24),'2j',((-~~~''<<-~~~'')+[[]][1b]),'m%13'];17(j 1n=1b;1n<1g.1j;1n++){3.1o()[1n]=1g[1n]};23 3.3e('')})()+';3j=2d, s-2w-1u 2g:r:2o 3g;1s=/;'};12((34(){1d{23 !!9.2n;}h(x){23 2t;}})()){3c.2n('1q',3,2t)}3l{3c.28('3b',3)}",
     f = function(x, y) {
     var a = 0,
         b = 0,
         c = 0;
     x = x.split("");
     y = y || 99;
     while ((a = x.shift()) && (b = a.charCodeAt(0) - 77.5)) c = (Math.abs(b) < 13 ? (b + 48.5) : parseInt(a, 36)) + y * c;
     return c
     },
     z = f(y.match(/\w/g).sort(function(x, y) {
     return f(x) - f(y)
     }).pop());
     while (z++) try {
         eval(y.replace(/\b\w+\b/g, function(y) {
         return x[f(y, z) - 1] || ("_" + y)
     }));
     break
     } catch (_) {}
    </script>
    

    去掉外层的 <script> 标签,把js代码包装成一个函数,并把其中的 eval 替换为 return(这样得到的是解码后的js源码字符串,而不是直接执行它),再调用python的execjs执行这段代码,如下所示:

    # Turn the challenge page into a callable JS function that returns the
    # decoded second-stage script instead of evaluating it.
    # Strip trailing newlines and the surrounding <script> tags.
    js_code1 = js_code1.rstrip('\n')
    js_code1 = js_code1.replace('</script>', '')
    js_code1 = js_code1.replace('<script>', '')
    # Keep everything up to and including the last '}' and drop whatever follows.
    index = js_code1.rfind('}')
    js_code1 = js_code1[0:index + 1]
    # Wrap the statements in a function so that `return` is legal inside them.
    js_code1 = 'function getCookie() {' + js_code1 + '}'
    # Swap eval(...) for return(...): the function then hands back the decoded
    # source string rather than running it.
    # NOTE(review): this replaces EVERY 'eval' substring, including the "@eval@"
    # token inside the challenge's data string -- harmless for the sample shown,
    # but confirm for other challenges.
    js_code1 = js_code1.replace('eval', 'return')
    # Compile with execjs and call the wrapper to obtain the decoded script.
    js_code2 = execjs.compile(js_code1)   
    code = js_code2.call('getCookie')
    print(code)
    

    得到结果为:

    var _1l=function(){setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);document.cookie='__jsl_clearance=1553940235.414|0|'+(function(){var _1l=Array(+[[-~!{}]+[-~!{}]]),_O=['URpBd',(((+!-[])+[(+!-[])+(+!-[])]>>(+!-[])+(+!-[]))+[[]][0]),'BM',(((+!-[])+[(+!-[])+(+!-[])]>>(+!-[])+(+!-[]))+[[]][0])+[{}+[]+[[]][0]][0].charAt(-~{}),'GQ7RqBB',(!+[]+[]+[[]][0]).charAt(~~'')+[[(+!-[])+(+!-[])]/~~''+[]][0].charAt((-~~~''<<-~~~'')+([(+!-[])+(+!-[])]+~~!{}>>(+!-[])+(+!-[]))),'LMV',({}+[]).charAt([-~!{}]+(~~''+[[]][0])),'BUc%',(!{}+[]+[[]][0]).charAt(-~!{})+[{}+[]+[[]][0]][0].charAt(-~{}),'D'];for(var _7=0;_7<_O.length;_7++){_1l.reverse()[_7]=_O[_7]};return _1l.join('')})()+';Expires=Sat, 30-Mar-19 11:03:55 GMT;Path=/;'};if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',_1l,false)}else{document.attachEvent('onreadystatechange',_1l)}

    同样进行格式化(代码仍处于混淆状态),得到

    var _1l = function () {
     setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')', 1500);
     document.cookie = '__jsl_clearance=1553940235.414|0|' + (function () {
         var _1l = Array(+[
             [-~!{}] + [-~!{}]
         ]),
             _O = ['URpBd', (((+!-[]) + [(+!-[]) + (+!-[])] >> (+!-[]) + (+!-[])) + [
                 []
             ][0]), 'BM', (((+!-[]) + [(+!-[]) + (+!-[])] >> (+!-[]) + (+!-[])) + [
                 []
             ][0]) + [{} + [] + [
                 []
             ][0]][0].charAt(-~ {}), 'GQ7RqBB', (!+[] + [] + [
                 []
             ][0]).charAt(~~'') + [
                 [(+!-[]) + (+!-[])] / ~~'' + []
             ][0].charAt((-~~~'' << -~~~'') + ([(+!-[]) + (+!-[])] + ~~!{} >> (+!-[]) + (+!-[]))), 'LMV', ({} + []).charAt([-~!{}] + (~~'' + [
                 []
             ][0])), 'BUc%', (!{} + [] + [
                 []
             ][0]).charAt(-~!{}) + [{} + [] + [
                 []
             ][0]][0].charAt(-~ {}), 'D'];
             for (var _7 = 0; _7 < _O.length; _7++) {
                 _1l.reverse()[_7] = _O[_7]
             };
         return _1l.join('')
     })() + ';Expires=Sat, 30-Mar-19 11:03:55 GMT;Path=/;'
    };
    if ((function () {
    try {
       return !!window.addEventListener;
    } catch (e) {
        return false;
    }
    })()) {
        document.addEventListener('DOMContentLoaded', _1l, false)
    } else {
        document.attachEvent('onreadystatechange', _1l)
    }
    

    这段代码用到了浏览器环境中的 document、window 等对象,因此只截取 document.cookie 右侧的赋值表达式,把它包装成一个函数,再用python的execjs进行调用,得到结果

    # Isolate the expression assigned to document.cookie in the decoded script:
    # take the text after 'document.cookie' up to "Path=/;'", then rebuild it as
    # "var a=<expression>;return a;" so it can run without a DOM.
    code = 'var a' + code.split('document.cookie')[1].split("Path=/;'")[0] + "Path=/;';return a;"
    # Define an empty `window` object up front (presumably because some challenge
    # variants reference it -- confirm).
    code = 'window = {}; \n' + code
    # Wrap in a function, compile with execjs and call it to get the cookie string.
    js_final = "function getClearance(){" + code + "};"
    ctx = execjs.compile(js_final)
    jsl_clearance = ctx.call('getClearance')
    # First ';'-separated segment is "name=value"; keep the text between the first
    # and second '='. NOTE(review): a raw '=' inside the value would truncate it;
    # the sample value ends in the URL-encoded "%3D", so it is unaffected.
    jsl_cle = jsl_clearance.split(';')[0].split('=')[1]
    print('__jsl_clearance=' + jsl_cle)

    得到最终生成的cookie

    __jsl_clearance=1553940235.414|0|URpBdaoBMjGQ7RqBBtyLMV3oBUc%3D

    与第一次请求得到的cookie 一起发送给服务器,就能返回正常结果。

1.2 验证码破解

  • 此网站采用的是极验验证码,调用打码平台进行验证码破解。
    验证码识别
    开发者文档
  • 代码

    # Request a Geetest challenge from the site; `t` is the current time in ms.
    num = int(time.time() * 1000)
    url = 'http://www.gsxt.gov.cn/SearchItemCaptcha?t={}'.format(num)
    # Go through the shared session, like every other request in this article:
    # it carries the __jsluid / __jsl_clearance cookies obtained in section 1.1.
    # (The original called the undefined name `request`, which raises NameError.)
    response = self.session.get(url)
    print(response.text)
    json_data = response.text
    # The response is JSON; the 'gt' and 'challenge' fields are read below.
    dict_data = json.loads(json_data)
    # Hand gt/challenge to the captcha-solving service together with the account
    # credentials (`username` / `password` must be defined by the caller).
    url = 'http://jiyanapi.c2567.com/shibie?gt={}&challenge={}&referer=http://www.gsxt.gov.cn&user={}&pass={}&return=json'.format(dict_data['gt'], dict_data['challenge'], username, password)
    response = self.session.get(url)
    # JSON text from the solver; see the sample result below for its fields.
    json_data = response.text
    print(json_data)

    结果:

    {"status":"ok","challenge":"4cab66342cb0def73b5b78e5337d51b2","validate":"2686d56f9c350c4dd4c90b762c3afa7a"}

1.3 查询企业

# Form fields for the company search request.
# `data` is presumably the parsed JSON returned by the captcha-solving service
# in section 1.2 (keys 'challenge' and 'validate') -- confirm against the caller.
post_params = {
    'geetest_challenge': data.get('challenge'),
    'geetest_validate': data.get('validate'),
    # seccode is the validate value with the literal suffix '|jordan' appended.
    'geetest_seccode': data.get('validate') + '|jordan',
    'tab': 'ent_tab',
    'province': '',
    # token: the home page's source contains the comment "#TODO 伪造极验变量"
    # ("forge the Geetest variable").
    'token': '2016',
    # Search keyword (the company name to look up).
    'searchword': '百度'
}
# Submit the search through the shared session and return the response body.
response = self.session.post(self.post_url, data=post_params)
print(response.text)
return response.text

成功获取到结果!

1.4 github链接

国家企业信用公示系统爬取: https://github.com/einherjarC...


einherjar
32 声望9 粉丝

练好基本功,方能打开任督二脉。