requests库如何爬取含有验证码,需要cookies的ajax动态网页?

需要爬取如下网站:https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list
目的是在该网站上进行搜索。但这个网页是通过 AJAX 动态加载的,请求需要携带 cookies,并且 POST 的内容中含有验证码,而验证码每秒更新。请问如何爬取这样的网页?

在这里提供一个查询示例:
项目代码:F030203
资助类别:面上项目
批准年度:2017

post的网页
clipboard.png

post的数据
clipboard.png

源代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/4/15 18:12

import requests,json,zlib,gzip,re

# Session object intended for the requests against the search endpoint.
s = requests.session()

# curl.txt is presumably a "copy as cURL" capture of the browser's search
# request; the rest of the script pulls the sessionidindex cookie and the
# --data payload out of this text.
with open('curl.txt') as curl_file:
    para = curl_file.read()

# Endpoint the search grid is POSTed to; the checkcode query parameter is
# left empty here.
url = 'https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list?flag=grid&checkcode='

# Browser-like request headers sent with every POST. Accept-Encoding and
# Cookie are deliberately left disabled: cookies are kept in a separate dict.
headers = {
    'Origin': 'https://isisn.nsfc.gov.cn',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'application/xml, text/xml, */*; q=0.01',
    'Referer': 'https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    # 'Cookie': 'THFqhTnW0hPXnGjMZxctP5lYgKqRyDyDspJ20mjQJ8T12MG5JpxY!330819558!-2052098913; test=69345741; isisn=98184645; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=zh_CN; JSESSIONID=Zd1uNLn4tg6QFEWhXZ6Hc8e0ldqtAwWS0NN5mmerlfSyLVoYJe5T!1578882446'
}

# Cookie values copied from a browser session. The sessionidindex entry is
# only a placeholder: it is overwritten just below.
cookies = {
    'sessionidindex': 'Nhd1hT2D2bLsDX0fbYPH6gGbpNvFGhG177Dr3BksGFj1MB11czXc!-877234612!180665615',
    'test': '69345741',
    'isisn': '98184645',
    'org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE': 'zh_CN',
    'JSESSIONID': 'Zd1uNLn4tg6QFEWhXZ6Hc8e0ldqtAwWS0NN5mmerlfSyLVoYJe5T!1578882446',
}

# Replace the placeholder with the first sessionidindex value found in the
# captured curl text (raises IndexError when the text contains none).
session_ids = re.findall('sessionidindex=(.*?);',para)
cookies['sessionidindex'] = session_ids[0]

# data = {'_search':'false',
# 'nd':'1523792584670',#######
# 'rows':10,
# 'page':'1',
# 'sidx':'',
# 'sord':'desc',
# 'searchString':'resultDate%5E%3AprjNo%253A%252Cctitle%253A%252CpsnName%253A%252CorgName%253A%252CsubjectCode%253AF030203.%25E5%25A4%258D%25E6%259D%2582%25E7%25B3%25BB%25E7%25BB%259F%25E5%258F%258A%25E5%25A4%258D%25E6%259D%2582%25E7%25BD%2591%25E7%25BB%259C%25E7%2590%2586%25E8%25AE%25BA%25E4%25B8%258E%25E6%2596%25B9%25E6%25B3%2595%252Cf_subjectCode_hideId%253AF030203%252CsubjectCode_hideName%253AF030203.%25E5%25A4%258D%25E6%259D%2582%25E7%25B3%25BB%25E7%25BB%259F%25E5%258F%258A%25E5%25A4%258D%25E6%259D%2582%25E7%25BD%2591%25E7%25BB%259C%25E7%2590%2586%25E8%25AE%25BA%25E4%25B8%258E%25E6%2596%25B9%25E6%25B3%2595%252CkeyWords%253A%252Ccheckcode%253A837c%252CgrantCode%253A218%252CsubGrantCode%253A%252ChelpGrantCode%253A%252Cyear%253A2005%252Csqdm%253AF030203%5Btear%5Dsort_name1%5E%3ApsnName%5Btear%5Dsort_name2%5E%3AprjNo%5Btear%5Dsort_order%5E%3Adesc'
# }
#
# data['nd'] = re.findall('nd=(.*?)&',para)[0]
# data['searchString'] = re.findall('searchString=(.*?)\'',para)[0]
# Take the first single-quoted --data argument from the captured curl text
# as the raw POST body (raises IndexError when there is none).
payload_matches = re.findall('--data \'(.*?)\'',para)
data = payload_matches[0]

# Debug output: show the session cookie and the payload that will be sent.
print(cookies['sessionidindex'])
print(data)
# print(data['nd'])
# print(data['searchString'])


# Turn the captured payload into a template with a "{}" slot for the year.
# Matching any digits (not just the literal 2005) keeps the substitution
# working when the request in curl.txt was captured with a different year;
# otherwise every iteration would silently re-send the same payload.
data = re.sub(r'year%253A\d+', 'year%253A{}', data)

# NOTE: range(2005, 2017) covers 2005..2016 inclusive; 2017 is not queried.
for year in range(2005,2017):
    # Send through the session and attach the cookies explicitly. A bare
    # requests.post() call here would send no cookies at all, even though
    # the session and the cookie dict are prepared above.
    r = s.post(url, data=data.format(year), headers=headers, cookies=cookies)
    print(r.text)

阅读 3.4k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题