用 requests.post 可以得到正常的返回值,而用 scrapy.FormRequest 发送同样的请求却返回 400 错误。
import scrapy
import requests
import json
class testform(scrapy.Spider):
    """Spider that runs a GraphQL ``Search`` query against jiqizhixin.com.

    The endpoint expects a JSON document in the request body. The request is
    therefore built with a plain ``scrapy.Request`` carrying a JSON-serialized
    body, not with ``scrapy.FormRequest``: ``FormRequest`` url-encodes
    ``formdata`` (and cannot represent the nested ``variables`` dict), which
    does not match the declared ``application/json`` content type and is
    rejected by the server with HTTP 400.
    """

    name = 'testform'

    def start_requests(self):
        """Yield the single POST request carrying the GraphQL search query."""
        url = 'https://www.jiqizhixin.com/graphql'
        # NOTE: no hard-coded 'Content-Length' header here. A fixed value
        # (previously '320') does not match the real body length once the
        # payload changes; the HTTP layer computes the correct value itself.
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': "application/json",
            'Host': 'www.jiqizhixin.com',
            'Origin': 'https://www.jiqizhixin.com',
            'Referer': 'https://www.jiqizhixin.com/categories/zi-xun',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            # NOTE(review): this CSRF token was copied from a browser session
            # and presumably expires - confirm whether it must be refreshed.
            'X-CSRF-Token': 'DfeIz+CdfLWv86lTgKmBu5xDl+hpbIxoE8wmxWIuhzECfwvWfO62+uO2dtqXaKh4nzSM4xRh3/TsTKrEbw73/Q==',
        }
        payload = {
            'operationName': 'Search',
            'query': "query Search($cursor: String, $count: Int, $keywords: String!, $filter_tags: [String]) {\n elastic_search(first: $count, after: $cursor, keywords: $keywords, filter_tags: $filter_tags) {\n edges {\n node {\n id\n title\n tag_list\n content\n author_name\n categories\n cover_image_url\n published_at\n friendly_path\n __typename\n }\n __typename\n }\n total_count\n pageInfo {\n endCursor\n hasNextPage\n __typename\n }\n __typename\n }\n}\n",
            'variables': {'count': 50, 'keywords': 'NLP', 'filter_tags': []},
        }
        # Send the payload as a JSON body, equivalent to
        # ``requests.post(url, data=json.dumps(payload), headers=headers)``.
        # The blocking ``requests.post`` call that used to sit here was only a
        # debugging comparison (its result was never used) and was removed.
        yield scrapy.Request(
            url,
            method='POST',
            body=json.dumps(payload),
            headers=headers,
            callback=self.parse,
        )

    def parse(self, response):
        """Print the response object returned for the search request."""
        print(response)
原网页是 https://www.jiqizhixin.com/ ,点击右上角的搜索按钮,在搜索框输入 NLP,通过 Chrome 开发者工具分析发现,页面向 https://www.jiqizhixin.com/graphql 发送了 POST 请求。
'Content-Type' 为 "application/json" 时,发送的参数在 Request Payload 里面,用 requests 发送时应使用 json=postdata。
'Content-Type' 为 "application/x-www-form-urlencoded" 时,发送的参数在 Form Data 里面,用 requests 发送时应使用 data=postdata;FormRequest 的 formdata 和 data=postdata 的编码规则似乎是一样的。