爬取京东页面的文本为乱码

我使用 Beautiful Soup 解析京东的页面,把里面的文本全部提取出来,但是打印的时候发现全是乱码。京东的页面是使用 UTF-8 编码的,我在把提取出的文本转码成 GBK 时却遇到了错误。

下面是代码,请指教。

#encoding=gbk
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Comment
from bs4 import Doctype
import urllib2

def walker(soup, indent):
    """Collect the text of every string node beneath ``soup``.

    The tree is walked depth-first. Each non-blank string is stripped of
    surrounding whitespace, encoded as UTF-8 and prefixed with ``indent``;
    one extra tab is added to the indent for every level of nesting.

    Args:
        soup: A node exposing ``name`` and, when ``name`` is not None,
            an iterable ``children`` attribute.
        indent: Prefix placed in front of every string found directly
            under ``soup``.

    Returns:
        The concatenated pieces as a UTF-8 encoded byte string (``""`` when
        nothing is found), so callers must ``.decode('utf-8')`` the result
        before re-encoding it.
    """
    if soup.name is None:
        return ""

    parts = []
    for child in soup.children:
        if isinstance(child, NavigableString):
            # Strip before testing: a whitespace-only node is "empty" no
            # matter how long it is, and a single real character is not.
            stripped = unicode(child).strip()
            if stripped:
                parts.append(indent + stripped.encode('utf-8'))
        else:
            # String nodes are leaves; only other children are recursed into.
            parts.append(walker(child, indent + "\t"))
    # Accumulate every piece instead of overwriting what earlier siblings
    # contributed.
    return "".join(parts)

if __name__ == "__main__":
    soup = BeautifulSoup( urllib2.urlopen("http://item.jd.com/1592573020.html").read()) 
    doctypes=soup.findAll(text=lambda text: isinstance(text, Doctype))
    [doctype.extract() for doctype in doctypes]
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    [comment.extract() for comment in comments]

    for script in soup("script"):
        script.extract()
    for noscript in soup("noscript"):
        noscript.extract()
    for style in soup("style"):
        style.extract()
    text=walker(soup, "")
    print "text", text.decode('utf-8').encode('gbk') #这里会出错
阅读 5.3k
2 个回答
text.decode('utf-8').encode('gbk')
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题