Python 爬虫

1、爬取网页的通用代码框架

import requests  
def getHtmlText(url, timeout=10):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response body decoded with the encoding detected from its
        content, or the string "产生异常" when the request fails (connection
        error, timeout, invalid URL, or a 4xx/5xx status).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
        # Decode with the encoding guessed from the body instead of relying
        # on the charset declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are turned into the error marker;
        # programming errors and KeyboardInterrupt still propagate.
        return "产生异常"
  
if __name__ == "__main__":
    # Demo: download one news list page and print its HTML.
    url = "http://news.fznews.com.cn/shehui/list.shtml"
    HtmlText = getHtmlText(url)
    print(HtmlText)

2、爬取照片代码

import requests  
def getPicture(url, timeout=10):
    """Download a resource (e.g. an image) and return its raw bytes.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response body as ``bytes``, or the string "产生异常" when the
        request fails (connection error, timeout, invalid URL, or a 4xx/5xx
        status). Callers must check the type before writing the result to a
        binary file.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
        return r.content
    except requests.RequestException:
        # Only request-related failures are turned into the error marker;
        # programming errors and KeyboardInterrupt still propagate.
        return "产生异常"
  
if __name__ == "__main__":
    # Demo: download one picture and save it to a fixed local path.
    picurl = "http://img0.dili360.com/pic/2019/10/23/5db027e9441a73i93221149.jpg"
    path = "C://Users//fuxingyu//Desktop//abc.jpg"
    Pic = getPicture(picurl)
    # getPicture returns a str error marker on failure; only bytes can be
    # written to a file opened in binary mode, so check before writing.
    if isinstance(Pic, bytes):
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(Pic)
    else:
        print(Pic)

或者使用下面的写法（以 URL 中的文件名保存，文件已存在时不重复下载）：

import requests  
import os  
# Download one file and save it under `root`, named after the last segment of
# the URL. Nothing is downloaded when the target file already exists.
url = "https://pic.rmb.bdstatic.com/1cf349c922d2e0faa054de841535a0788853.gif"
root = "C://Users//fuxingyu//Desktop//"
path = os.path.join(root, url.split('/')[-1])
try:
    # Create the target directory when it is missing. os.path.split() returns
    # a non-empty tuple and therefore cannot serve as an existence test.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        r = requests.get(url, timeout=10)
        # Fail on 4xx/5xx instead of saving an error page as the file.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已经存在")
except (requests.RequestException, OSError):
    # Covers network/HTTP failures as well as file-system errors.
    print("爬取失败")

python爬虫

爬取爬虫框架的通用代码

2、爬取照片代码

叫我瞄大人

引用和评论

获取某网页基金净值

如何减少跨团队交付摩擦？——基于 DevOps 与敏捷的最佳实践

科学计算编程涉及到的技术栈简介

使用 chardet 判断文件编码需要注意的坑——过大的文件会导致高耗时

Python3 格式化时间（qbit）

本地使用PaddleOCR进行图片识别获得文字（返回JSON）

manus 的替代品有哪些？使用LLM大模型技术做手机/网页/浏览器自动化操作技术汇总