#####coding=utf-8
import urllib
import urllib.request
import pymysql
import time
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
import pymongo
from pymongo import MongoClient
import gridfs
##### Obtain the MongoClient connected to the local MongoDB instance (default port).
client = pymongo.MongoClient("localhost", 27017)
##### Select the `news` database; all functions below read/write db.newstitle.
db = client.news
##### Crawl the news list pages.
def start_crawler(max_pages=1):
    """Crawl the sxcoal.com senior-search news list and store the headlines.

    Fetches ``max_pages`` result pages (default 1, matching the original
    single-page behavior), parses each with :func:`get_page_news`, and hands
    the parsed rows to :func:`to_mysql`.  Stops early when a page yields no
    news items.

    Args:
        max_pages: number of list pages to fetch; defaults to 1.
    """
    # Hoisted out of the loop: identical for every request.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    page_num = 1
    while page_num <= max_pages:
        url = "http://www.sxcoal.com/news/seniorsearch?GeneralNewsSearch%5Bcategory_id%5D%5B0%5D=1&GeneralNewsSearch%5Bnews_industry_ids%5D=&GeneralNewsSearch%5Bnews_tag_ids%5D=&GeneralNewsSearch%5Bport_ids%5D=&GeneralNewsSearch%5Bprov_area_id%5D=&page={}&per-page=10".format(page_num)
        print(url)
        page_num += 1
        # NOTE(review): the original also built an unused
        # urllib.request.Request here; requests does the real fetch.
        response = requests.get(url, headers=headers)
        one_page = get_page_news(response.text)
        time.sleep(1)  # be polite to the server between requests
        if one_page:
            to_mysql(one_page)
            time.sleep(1)
        else:
            # An empty page means no more results — stop crawling.
            break
    print('新闻抓取完毕')
##### Persistence hook for one page of (title, url, date) rows.
def to_mysql(one_page):
    """Echo the parsed rows.

    Despite the name, no MySQL write happens here in the visible code —
    this is currently just a debugging print of the page's rows.
    """
    print(one_page)
def get_page_news(content):
    """Parse one search-result page and store each headline in MongoDB.

    For every ``div.artnr`` item the title, link and date are extracted,
    inserted into ``db.newstitle`` tagged with ``label='www.sxcoal.com'``,
    and collected into the returned list.

    Returns:
        list of (title, url, date) tuples, one per news item found.
    """
    soup = BeautifulSoup(content, 'lxml')
    items = []
    for art in soup.find_all("div", class_="artnr"):
        title = art.select('h4')[0].text
        link = art.a['href']
        # The <span class="spandate"> text holds "标签 日期"; keep the date part.
        pub_date = art.p.find('span', class_='spandate').string.split(" ")[1]
        record = {
            'title': title,
            'url': link,
            'date': pub_date,
            'type': 'news',
            'label': 'www.sxcoal.com',
        }
        db.newstitle.insert_one(record)
        items.append((title, link, pub_date))
    return items
##### Fetch the full article body for every stored headline.
def get_new_body():
    """Download each stored article and attach its body to its document.

    :func:`get_news_linksfrom_database` yields projected documents of the
    shape ``{'_id': ObjectId, 'url': str}``.  The original code passed the
    WHOLE document to ``requests.get``, which raised
    ``requests.exceptions.InvalidSchema`` ("No connection adapters were
    found for ...") — the fix is to extract ``doc['url']``.

    Also fixed here:
      * ``get_news_text`` was called twice per link (one result discarded);
      * ``print('_id')`` printed the literal string instead of the id;
      * the body was inserted as a new, detached ``{'newsbody': ...}``
        document; it is now written onto the headline document it belongs to.
    """
    for doc in get_news_linksfrom_database():
        news_body = get_news_text(doc['url'])
        print(doc['_id'])
        # Attach the body to the existing headline document.
        db.newstitle.update_one(
            {'_id': doc['_id']},
            {'$set': {'newsbody': news_body}},
        )
    print("新闻主体完毕!")
def get_news_linksfrom_database():
    """Return a cursor over stored sxcoal headlines, projected to _id and url.

    Falls back to an empty list if the query yields a falsy result.
    """
    cursor = db.newstitle.find(
        {'label': 'www.sxcoal.com'},
        {'_id': 1, 'url': 1},
    )
    return cursor or []
def get_news_text(url):
    """Download ``url`` and return the HTML of its ``div#Zoom``, or ``None``.

    Bug fixed: the original wrapped ``str(soup.find(...))`` in a bare
    ``except`` — but ``soup.find`` does not raise when the div is missing,
    it returns ``None``, so the function returned the four-character string
    ``'None'`` (``str(None)``) instead of ``None``.  We now test explicitly.

    A request timeout is added so a stalled server cannot hang the crawler;
    network errors still propagate to the caller as before.
    """
    html = requests.get(url, timeout=10)
    # Use the response's detected encoding so Chinese text decodes correctly.
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    zoom = soup.find('div', {'id': 'Zoom'})
    return str(zoom) if zoom is not None else None
if __name__ == '__main__':
    # Step 1: crawl the news headlines (currently disabled).
    ##### start_crawler()
    # Step 2: fetch each stored article's full body.
    get_new_body()
# --- Pasted forum question (not code; commented out so the file parses) ---
# start_crawler() 执行没有问题 (start_crawler() runs fine)
# get_new_body() 执行报错 (get_new_body() raises):
#   requests.exceptions.InvalidSchema: No connection adapters were found for
#   "{'url': 'http://www.sxcoal.com/news/4564136/info', '_id': ObjectId('5a0ab7b7d1e44d23281339e6')}"
# 求大神解答 — cause: the whole Mongo document (a dict), not doc['url'],
# was being passed to requests.get().