Here is the full code. Please help me sort out a few of the problems in it, thanks.
# -*- coding:utf-8 -*-
'''
MBA college news collector
'''
import re                # regular expression module (URL / date matching)
import bs4               # DOM parsing module (BeautifulSoup)
import pymysql           # MySQL database driver
import urllib            # URL handling package
import urllib.request    # HTTP access module
import urllib.error      # HTTP error handling module
import time              # time module
import random            # random number module
import lxml              # lxml module
from lxml.html.clean import Cleaner, clean_html  # HTML cleaning utilities (clean_html is not used below)
import selenium          # imported but not used below
# from datetime import *
# Database connection parameters
db_config = {
    'host': '127.0.0.1',
    'port': '3306',
    'username': 'root',
    'password': 'error.error',
    'database': 'hxedu',
    'charset': 'utf8'
}
# Connect to the database
connect = pymysql.Connect(
    host=db_config['host'],          # database host, e.g. localhost / 127.0.0.1
    port=int(db_config['port']),     # database port
    user=db_config['username'],      # database user name
    passwd=db_config['password'],    # database password
    db=db_config['database'],        # database name
    charset=db_config['charset']     # connection character set
)
cursor = connect.cursor()
cursor.execute("SELECT * FROM bs_college_rules WHERE isdelete=0")  # fetch every row of the URL rule table
results = cursor.fetchall()
for row in results:
    collegeid = row[1]
    index_url = row[2]
    html_code = row[3]
    home_url = row[4]
    columns_url = row[5]
    column_img = row[6]
    column_parent = row[7]
    column_title = row[8]
    column_keywords = row[9]
    column_description = row[10]
    column_pub_date = row[11]
    column_content = row[12]
    # Crawl settings
    maxcount = 1000      # maximum number of articles to collect
    home = index_url     # start page
    url_set = set()      # URLs waiting to be crawled
    url_old = set()      # URLs already processed
    # Collect links from the start page
    try:
        html = urllib.request.urlopen(home, None, timeout=10).read().decode(html_code)
        soup = bs4.BeautifulSoup(html, 'html.parser')
        pattern = columns_url  # regular expression for column/article links
        links = soup.find_all('a', href=re.compile(pattern))  # find every <a> whose href matches the pattern
        for link in links:
            link_href = re.search(pattern, link['href']).group(0)  # strip the leading relative '..' from the href
            url_set.add(link_href)  # queue the URL for crawling
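        # Note: the hrefs collected here can still be site-relative (no scheme or host);
        # that is why home_url is prepended before such links are fetched in the loop below.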
        # Article definition
        class Article(object):
            def __init__(self):
                self.url = None          # article URL
                self.collegeid = None    # ID of the college the article belongs to
                self.title = None        # article title
                self.keywords = None     # page keywords (SEO)
                self.description = None  # page description
                self.date = None         # publication date
                self.images = None       # thumbnail
                self.content = None      # main article body
                self.read = None         # read count
                self.like = None         # like count
                self.createtime = None   # time the article was crawled
        # Work through the URL queue
        count = 0
        while len(url_set) != 0:
            try:
                # Take one URL from the queue; pop it only once so no queued URL is lost
                url = url_set.pop()
                if not re.search(r'https?://', url):
                    url = home_url + url  # relative link: prepend the site root
                url_old.add(url)  # mark the URL as processed
                # Download the page
                html = urllib.request.urlopen(url, None, timeout=10).read().decode(html_code)
                # Parse the DOM
                soup = bs4.BeautifulSoup(html, 'html.parser')
                pattern = columns_url  # link matching rule
                links = soup.find_all('a', href=re.compile(pattern))
                # Queue newly discovered links
                for link in links:
                    if link['href'] not in url_old:
                        link_href = re.search(pattern, link['href']).group(0)  # strip the leading relative '..'
                        url_set.add(link_href)
                # Skip articles that are already stored
                sql = "SELECT newsid FROM bs_news WHERE isdelete=0 AND scoure_url = %s"
                data = (url,)
                cursor.execute(sql, data)  # parameterized query: pymysql quotes and escapes the value
                if cursor.rowcount != 0:
                    sql = "INSERT INTO python_error_info( url, title, errordate ) VALUES (%s, %s, %s)"
                    print(url + "数据已存在")
                    errorurl = url
                    errortitle = "数据已存在"
                    errordate = time.time()
                    data = (errorurl, errortitle, errordate)
                    cursor.execute(sql, data)
                    connect.commit()
                    continue  # skip to the next URL so the duplicate is not inserted again
                # Extract the article fields; the column_* values come from bs_college_rules
                # and are evaluated here as Python expressions
                article = Article()
                article.url = url
                page = eval(column_parent)            # outermost tag wrapping the article body
                article.title = eval(column_title)    # article title
                if eval(column_keywords):
                    article.keywords = eval(column_keywords)  # page keywords (SEO)
                else:
                    article.keywords = article.title + '--商学院头条'
                if eval(column_description):
                    article.description = eval(column_description)  # page description
                else:
                    article.description = article.title + '--商学院头条'
                # Publication date, e.g. "2019-05-08" or "2019年5月8日"
                article.date = re.search(r'\d{4}(-|\S+)\d{1,2}(-|\S+)\d{1,2}', eval(column_pub_date)).group(0)
                # Convert the date string to a Unix timestamp
                if re.search(r'\d+年\d+月\d+', article.date):
                    timeArray = time.strptime(article.date, "%Y年%m月%d")
                else:
                    timeArray = time.strptime(article.date, "%Y-%m-%d")
                article.date = int(time.mktime(timeArray))
                clean_style = Cleaner(style=True, page_structure=False, safe_attrs_only=False)  # strip style tags/attributes
                article.content = clean_style.clean_html(str(eval(column_content)))  # article body HTML
                # Rewrite relative <img src> attributes in the captured content to absolute URLs
                data_img = page.find_all('img', attrs={'src': True})
                for img in data_img:
                    img_src = img.attrs['src']
                    if not re.search(r'(http|https)://', img_src):  # leave srcs that are already absolute untouched
                        article.content = article.content.replace(img_src, home_url + img_src)
                # Rewrite relative <video src> attributes to absolute URLs
                data_video = page.find_all('video', attrs={'src': True})
                for video in data_video:
                    video_src = video.attrs['src']
                    if not re.search(r'(http|https)://', video_src):
                        article.content = article.content.replace(video_src, home_url + video_src)
                article.images = 1                        # thumbnail placeholder
                article.read = random.randint(300, 800)   # read count
                article.collegeid = collegeid             # college the article belongs to
                article.like = random.randint(30, 80)     # like count
                article.createtime = time.time()          # crawl time
                # Store the article (parameterized query, so quotes inside the content are handled safely)
                sql = ("INSERT INTO bs_news( scoure_url, title, keywords, description, content, images, "
                       "pub_date, capture_date, collegeid, readcount, likecount ) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                data = (article.url, article.title, article.keywords, article.description, article.content,
                        article.images, article.date, article.createtime, article.collegeid, article.read, article.like)
                cursor.execute(sql, data)
                connect.commit()
            except Exception as e:
                # Log the failure and move on to the next URL
                sql = "INSERT INTO python_error_info( url, title, errordate ) VALUES (%s, %s, %s)"
                errorurl = url  # record the page that actually failed, not just the start page
                errortitle = str(e)
                errordate = time.time()
                data = (errorurl, errortitle, errordate)
                cursor.execute(sql, data)
                connect.commit()
                continue
            else:
                print(article.title)
                count += 1
            finally:
                # Stop once enough articles have been collected
                if count == maxcount:
                    break
    except Exception as e:
        # The start page itself could not be processed
        # print(home + e)
        sql = "INSERT INTO python_error_info( url, title, errordate ) VALUES (%s, %s, %s)"
        url = home
        title = "请求失败"
        errordate = time.time()
        data = (url, title, errordate)
        cursor.execute(sql, data)
        connect.commit()
# Close the database connection
cursor.close()
connect.close()
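
A note on the INSERT statements: the execute() calls above pass the values as a second argument instead of formatting them into the SQL text, because a single quote inside a title, a content block, or an exception message makes the old sql % data form produce invalid SQL. A minimal standalone sketch of that pattern, reusing the python_error_info table and the connection settings from the script; the inserted values are just placeholders:

import time
import pymysql

connect = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                          passwd='error.error', db='hxedu', charset='utf8')
cursor = connect.cursor()

sql = "INSERT INTO python_error_info( url, title, errordate ) VALUES (%s, %s, %s)"
data = ('http://example.com/news/1.html', "a title with a ' quote in it", time.time())
cursor.execute(sql, data)   # pymysql escapes and quotes each value itself
connect.commit()

cursor.close()
connect.close()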
What is the exception you get, and which URL is it crawling when it happens?
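
To see that, the quickest change is to print the failing URL together with the full traceback at the moment the exception is caught, instead of only storing str(e) in python_error_info. A minimal sketch, assuming you can rerun the script locally; the fetch helper below is only for illustration and is not part of the script above, but the same two lines, print(url) and traceback.print_exc(), can simply be added at the top of the existing except Exception as e: blocks:

import traceback
import urllib.request

def fetch(url, encoding='utf-8', timeout=10):
    # Download one page; on failure, report which URL was being crawled
    # and print the complete traceback so the real exception is visible.
    try:
        return urllib.request.urlopen(url, None, timeout=timeout).read().decode(encoding)
    except Exception:
        print('failed while crawling:', url)
        traceback.print_exc()
        return None

# Example: html = fetch('http://example.com/news/index.html', 'utf-8')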