想要实现的功能:爬取新闻首页,拿到新闻链接,然后逐篇爬取新闻,把爬到的新闻内容插入到 MongoDB 中。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pyspider.libs.base_handler import *
from pymongo import *
# Connect to MongoDB using MongoClient's default connection settings.
client = MongoClient()
# Database that holds the crawl output.
db = client['result']
# Collection that Handler.detail_page inserts each scraped article into.
col = db['kr']
# Disabled call; presumably meant to empty the collection before a fresh crawl.
#col.remove()
class Handler(BaseHandler):
    """pyspider handler that scrapes 36kr news articles into MongoDB.

    Flow: ``on_start`` fetches the news index, ``index_page`` schedules one
    fetch per article link found there, and ``detail_page`` extracts each
    article and stores it in the module-level ``col`` collection.

    The ``print`` calls are progress markers for the pyspider debugger; the
    single-argument parenthesized form prints the same on Python 2 and 3.
    """

    def on_start(self):
        """Schedule the fetch of the news index page (JS-rendered)."""
        print('-' * 10)
        self.crawl(
            'http://36kr.com/news',
            headers={
                'User-Agent': (
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/54.0.2840.71 Safari/537.36'
                ),
            },
            fetch_type='js',
            callback=self.index_page,
            connect_timeout=50,
            timeout=200,
        )

    def index_page(self, response):
        """Schedule a detail fetch for every article link on the index page.

        Args:
            response: pyspider response for the index page.
        """
        print('+' * 10)
        for item in response.doc('div.intro').items():
            print('=' * 10)
            href = item('h3>a').attr.href
            # An intro block without a link yields None; there is nothing
            # to crawl for it, so skip it instead of scheduling a bad task.
            if not href:
                continue
            self.crawl(href, fetch_type='js', callback=self.detail_page)

    def detail_page(self, response):
        """Extract one article, store it in MongoDB and return it.

        Args:
            response: pyspider response for an article page.

        Returns:
            dict with the keys ``url``, ``title`` and ``content``.
        """
        print('#' * 10)
        obj = {
            "url": response.url,
            "title": response.doc('title').text(),
            "content": response.doc('section.textblock:first').text()
        }
        # PyMongo adds an ``_id`` ObjectId to the document it is given.
        # Insert a copy so the dict returned as the task result keeps only
        # plain values. ``insert_one`` replaces the deprecated ``insert``.
        col.insert_one(dict(obj))
        return obj
点击 run 之后,
还要点向右的白色箭头;
之后必须再点一次白色箭头,detail_page 才会执行。
不然怎么叫「单步调试」呢?
要让任务自动执行,需要到 dashboard 上把项目状态改为 RUNNING(或 DEBUG),然后点 run。