try:
req = urllib.request.Request(url, headers=hds[page_num%len(hds)])
source_code = urllib.request.urlopen(req).read()
plain_text=str(source_code)
except:
print ("Error.")
continue
soup = BeautifulSoup(plain_text, from_encoding='utf-8')
list_soup = soup.find('div', {'class': 'mod book-list'})
try_times+=1;
if list_soup==None and try_times<200:
continue
elif list_soup==None or len(list_soup)<=1:
break # Break when no informatoin got after 200 times requesting
for book_info in list_soup.findAll('dd'):
title = book_info.find('a', {'class':'title'}).string.strip()
desc = book_info.find('div', {'class':'desc'}).string.strip()
desc_list = desc.split('/')
book_url = book_info.find('a', {'class':'title'}).get('href')
#输出爬取到的书籍的标题
print(title)
上面的代码是爬取豆瓣网的爬虫。为什么我爬取到的书籍标题(title)显示为一串形如 \xe4\xb8\xad 的 UTF-8 字节转义序列,而不是正常的字符串?如下图:
由于获取到的标题(title)其实仍然是str类型,所以无法使用decode函数解码,请问还有什么办法可以解决吗?
试一试