I'm trying to crawl paper data from Web of Science by author name. I submit the author name and SID with a POST request, and r.text comes back with the correct page content; I can even match the information I need out of it. But when I try to get the results URL and the page count (so I can go on to crawl the remaining pages), r.url does not return the real results URL, even though r.text is correct! Very strange.
This has had me stuck for days. Any help would be greatly appreciated!
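As far as I understand, requests fills in r.url with the URL that was actually fetched after following any redirects; it is never taken from the response body. So a POST that the server answers directly with 200 (no redirect) leaves r.url equal to the endpoint you posted to. A minimal sketch against httpbin.org, purely for illustration:

import requests

# r.url reflects the request that was actually made (after redirects),
# not a link from the returned HTML.
r = requests.post('http://httpbin.org/post', data={'q': 'test'})
print(r.url)      # http://httpbin.org/post -- the endpoint itself
print(r.history)  # []  -- an empty list means no redirect happened

My full script is below: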
import re
import time

import requests
from lxml import etree
authors = ['HUANG J X']
root = 'http://apps.webofknowledge.com'
# The landing page redirects to a URL that carries a fresh session ID (SID).
resp = requests.get(root)
sid = re.findall(r'SID=\w+&', resp.url)[0].replace('SID=', '').replace('&', '')
root_url = 'http://apps.webofknowledge.com/UA_GeneralSearch.do'
count = 0
basic_urls = []
pages = []
for author in authors:
    print(author)
    time.sleep(1)
    count += 1
    # Refresh the SID every 100 queries so it does not go stale.
    if count % 100 == 0:
        resp = requests.get(root)
        sid = re.findall(r'SID=\w+&', resp.url)[0].replace('SID=', '').replace('&', '')
"""headers = {
'Origin': 'http://apps.webofknowledge.com',
'Referer': 'http://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=5B3frlTpw4JHeQwYMgR&preferencesSaved=',
'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
'Content-Type': 'application/x-www-form-urlencoded'
}"""
    headers = {
        'Origin': 'http://apps.webofknowledge.com',
        'Referer': 'http://apps.webofknowledge.com/Search.do?product=UA&SID=6E6M5icjM8KCcO9BZx4&search_mode=GeneralSearch&preferencesSaved=',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    form_data = {
        'fieldCount': 1,
        'action': 'search',
        'product': 'UA',
        'search_mode': 'GeneralSearch',
        'SID': sid,
        'max_field_count': 25,
        'formUpdated': 'true',
        'value(input1)': author,
        'value(select1)': 'AU',
        'value(hidInput1)': '',
        'limitStatus': 'collapsed',
        'ss_lemmatization': 'On',
        'ss_spellchecking': 'Suggest',
        'SinceLastVisit_UTC': '',
        'SinceLastVisit_DATE': '',
        'period': 'Range Selection',
        'range': 'ALL',
        'startYear': '1900',
        'endYear': '2018',
        'update_back2search_link_param': 'yes',
        'ssStatus': 'display:none',
        'ss_showsuggestions': 'ON',
        'ss_query_language': 'auto',
        'ss_numDefaultGeneralSearchFields': 1,
        'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
    }
    session = requests.Session()
    r = session.post(root_url, data=form_data, headers=headers, timeout=30)
    # The returned page content is correct!
    print(r.text)
    r.encoding = r.apparent_encoding
    tree = etree.HTML(r.text)
    basic_url = r.url if r.url.strip() else 'no record'
    page_nodes = tree.xpath('//span[@id="pageCount.top"]/text()')
    page = page_nodes[0] if page_nodes else 'no record'
    # The page count is extracted fine, but r.url is NOT the real results URL!!
    print(page, basic_url)
    basic_urls.append(basic_url)
    pages.append(page)
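In case it helps with reproducing this: checking r.history shows whether the POST was redirected at all, and my suspicion is that the real results URL has to be pulled out of the returned HTML rather than read from r.url. A rough sketch of what I mean; the 'Summary.do' substring is only a guess at the Web of Science link pattern, not something I have verified:

    # Diagnostic: was the POST redirected? An empty list means no,
    # so r.url is simply the endpoint the form was posted to.
    print(r.history)

    # Hypothetical: fish candidate results-page links out of the returned HTML.
    # The 'Summary.do' filter is an assumption about the WoS URL pattern.
    hrefs = tree.xpath('//a/@href')
    candidates = [h for h in hrefs if 'Summary.do' in h]
    print(candidates[:5])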