Python crawler: crawling 1000 pages starting from the Baidu Baike "Python" entry

The code is as follows:
spider_main.py:

# coding:utf8

import html_outputer, html_downloader, url_manager, html_parser


class SpiderMain(object):

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParse()
        self.html_outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d :%s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.html_outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except:
                print("craw failed")

        self.html_outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

 

html_downloader.py:

'''
Created on 2017-7-3

@author: Administrator
'''
import urllib.request

class HtmlDownloader(object):
    
    
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        
        if response.getcode() != 200:
            return None
        
        return response.read()
    
    
    

html_parser.py:

# coding:utf-8
from bs4 import BeautifulSoup
import re
from urllib import parse 


class HtmlParse(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
    
    



html_outputer.py:

# coding:utf8
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write('<html>')
        fout.write("<head><meta http-equiv='content-type' content='text/html;charset=utf-8'></head>")
        fout.write('<body>')
        fout.write('<table>')

        # ascii
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode(encoding='UTF-8'))
            fout.write('<td>%s</td>' % data['summary'].encode(encoding='UTF-8'))
            fout.write('</tr>')

        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')



url_manager.py:
'''
Created on 2017-7-3

@author: Administrator
'''


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        # add a single URL to the manager
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # add a batch of URLs
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        # whether there are URLs still waiting to be crawled
        return len(self.new_urls) != 0

    def get_new_url(self):
        # pop() fetches a URL from the set and removes it at the same time;
        # the URL is then recorded as already crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    
  

But when I run spider_main.py, the result looks like this:

[screenshot: actual output]

The correct result should look like this:

[screenshot: expected output]

What is the cause of this?

1 Answer

HtmlOutputer::output_html:

        fout = open('output.html', 'wb')
        # ...
        for data in self.datas:
            fout.write(b'<tr>')
            fout.write(b'<td>%s</td>' % data['url'].encode(encoding='UTF-8'))
            fout.write(b'<td>%s</td>' % data['title'].encode(encoding='UTF-8'))
            fout.write(b'<td>%s</td>' % data['summary'].encode(encoding='UTF-8'))
            fout.write(b'</tr>')
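
The root cause: in Python 3, str.encode() returns a bytes object, and formatting bytes into a normal str with %s writes its repr (something like b'...') into the file, so the generated page ends up full of byte literals instead of readable text. The answer above fixes this by opening the file in binary mode ('wb') and writing bytes consistently, which means every value, including the URL, has to be encoded.

A minimal alternative sketch (my own variant, assuming Python 3; not the answerer's code): keep text mode, pass an explicit encoding to open(), and drop the .encode() calls entirely:

# coding:utf8
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # Text mode with an explicit encoding: write str directly,
        # so no b'...' reprs can leak into the HTML.
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            fout.write("<head><meta charset='utf-8'></head>")
            fout.write('<body>')
            fout.write('<table>')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'])
                fout.write('<td>%s</td>' % data['summary'])
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

Either way, the rule is not to mix str and bytes in the same file object: either everything is encoded and written to a file opened with 'wb', or everything stays str and the file is opened in text mode with encoding='utf-8'. The with block also guarantees the file is closed and flushed, which the original output_html never does explicitly.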