# coding=utf-8
import requests
from selenium import webdriver
import time
class JzSpider:
    """Log in to itjuzi.com with a real browser, then download the radar page.

    The login is driven through Selenium; the cookies the browser receives are
    then reused for a plain ``requests`` download of ``start_url``, and the
    decoded HTML is written to a text file.
    """

    LOGIN_URL = "https://www.itjuzi.com/user/login?flag=radar&redirect=/company"

    def __init__(self, username="13333331328", password="lz133333333334"):
        """Set up the target URL, request headers and login credentials.

        Args:
            username: Account name typed into the login form.
            password: Password typed into the login form.
        """
        # NOTE(review): the default credentials are kept only for backward
        # compatibility; real secrets should not be committed to source.
        self.username = username
        self.password = password
        self.start_url = "http://radar.itjuzi.com//company"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
            # The value must not repeat the header name.
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Connection": "keep-alive",
            # "br" is deliberately not advertised: requests only decodes
            # brotli when an optional brotli package is installed, and a
            # still-compressed body would break the UTF-8 decode below.
            "Accept-Encoding": "gzip, deflate",
        }
        # Filled by run() with the cookies of the logged-in browser session.
        self.cookies = {}

    def parse_url(self, url, proxies=None, timeout=30):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Address to request.
            proxies: Optional requests-style proxy mapping, e.g.
                ``{"http": "http://host:port"}``. ``None`` means no proxy.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as ``str``.

        Raises:
            requests.RequestException: On connection problems or timeout.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(
            url,
            headers=self.headers,
            cookies=self.cookies or None,
            proxies=proxies,
            timeout=timeout,
        )
        return response.content.decode("utf-8")

    def save_content_list(self, content, filename="Jz.txt"):
        """Write *content* to *filename* as UTF-8, replacing any old file."""
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        print("保存成功")

    def run(self):
        """Log in through Chrome, then download and save the start page."""
        # Imported here so the file-level import block stays untouched.
        from selenium.webdriver.common.by import By

        driver = webdriver.Chrome()
        try:
            # Original author's question: requesting this URL with
            # driver.get() returned 403 - is the IP banned? How can a proxy
            # IP be configured, or is there another workaround?
            driver.get(self.LOGIN_URL)
            driver.find_element(By.ID, "create_account_email").send_keys(self.username)
            driver.find_element(By.ID, "create_account_password").send_keys(self.password)
            # Fixed pause kept from the original before submitting the form.
            time.sleep(8)
            driver.find_element(By.ID, "login_btn").click()
            # Give the site a moment to finish the login round trip so the
            # session cookies exist before they are copied.
            time.sleep(3)
            # Without these cookies the requests call below is anonymous and
            # the browser login has no effect on it.
            self.cookies = {c["name"]: c["value"] for c in driver.get_cookies()}
        finally:
            # Always release the browser process, even when login fails.
            driver.quit()

        html_str = self.parse_url(self.start_url)
        self.save_content_list(html_str)
# Run the spider only when this file is executed as a script.
if __name__ == '__main__':
    Jz_spider = JzSpider()
    Jz_spider.run()
403 Forbidden 错误,大多是因为请求被服务器屏蔽,服务器拒绝返回内容。
一般可以通过更换服务器 IP 或设置代理服务器后再去爬取。
最好的办法,是模拟浏览器的人工访问来采集爬取,例如:
selenium + xvfb + firefox + 代理 IP
下面是我的解决方案,仅供参考,相互学习。