我使用python爬虫抓取了一段网页
如果直接输出网页,中文会是乱码。
我使用如下方法解码
html.decode('gbk')
现在出现了这样的一个问题。
我抓取的网页类似于博客,我希望将博客的标题作为我保存文件的文件名
但有如下报错
开篇
Traceback (most recent call last):
File "D:/Users/rongweiwei799/PycharmProjects/untitled1/Fregment Post.py", line 97, in <module>
f = open('%s' %result[0],'w')
IOError: [Errno 22] invalid mode ('w') or filename: u'\r\n\u5f00\u7bc7\r\n'
“开篇”为标题名
代码如下
# coding:utf-8
import sys
import urllib2
import urllib
import cookielib
import re
# Python 2 idiom: reload sys so that setdefaultencoding is callable again, then
# make UTF-8 the process-wide default codec for implicit str <-> unicode
# conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
# Set the login username / password here.
username = "rongweiwei799"
password = "XXX"
def make_cookie(name, value, domain, path='/'):
    """Build a ``cookielib.Cookie`` that can be added to a ``CookieJar``.

    The cookie is version 0, not restricted to a port, not secure and has
    no expiry time.

    Args:
        name: Cookie name.
        value: Cookie value.
        domain: Domain the cookie applies to, e.g. ``'emp.paic.com.cn'`` or
            ``'.paic.com.cn'``.
        path: URL path the cookie applies to; defaults to ``'/'``.

    Returns:
        The newly constructed ``cookielib.Cookie``.
    """
    return cookielib.Cookie(
        version=0,
        name=name,
        value=value,
        port=None,
        port_specified=False,
        domain=domain,
        domain_specified=True,
        # Previously hard-coded to False, which mis-described domains such as
        # '.paic.com.cn'; derive the flag from the domain actually given.
        domain_initial_dot=domain.startswith('.'),
        path=path,
        path_specified=True,
        secure=False,
        expires=None,
        discard=False,
        comment=None,
        comment_url=None,
        rest=None,
        rfc2109=False)
# Set up the cookie container and the User-Agent.
__cookie = cookielib.CookieJar()
__req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie))
# Default headers sent with every request made through this opener.
__req.addheaders = [
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('User-Agent', 'Mozilla/5.0,(Windows NT 6.1; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0')
]
# Install the opener globally so the plain urllib2.urlopen() calls that follow
# go through it and therefore share the cookie jar.
urllib2.install_opener(__req)
# Request the home page first to obtain the cookie (this creates the session).
request = urllib2.Request('http://profile.paic.com.cn/profile/')
urllib2.urlopen(request).read()
# Convert the username to upper case.
username = username.upper()
# Build the form data to submit.
postdata = {'j_username': username, 'j_password': password,}
# Supplying a data argument makes urllib2 send this request as a POST.
req = urllib2.Request('http://profile.paic.com.cn/profile/j_security_check', urllib.urlencode(postdata))
# Add a cookie (this cookie is written by JavaScript in the page).
#__cookie.set_cookie(make_cookie('E2E.loginUserName', username, 'emp.paic.com.cn'))
# Add the Referer header.
req.add_header('Referer', 'http://profile.paic.com.cn/profile/',)
# Send the request.
result = urllib2.urlopen(req)
# Output the result.
#print result.read().decode('gbk').encode('utf-8')
#Logfile = []
# Title pattern: the text between the bold title <div> opening and its </div>.
w1 = r'font-weight:bold">'
w2 = r'</div>'
pat = re.compile(w1 + '(.*?)' + w2, re.S)
# Body pattern: the text inside the "divLogContent" <div>.
w3 = r'class="divLogContent"[>]'
w4 = r'</div>'
bat = re.compile(w3 + '(.*?)' + w4, re.S)
# Strips <p> / </p> tags from the page before the body is extracted.
_P_TAG = re.compile(r'[</]+?p>')
# Characters that Windows rejects in file names, plus ASCII control characters
# (which covers any \r or \n left inside the title).
_BAD_FILENAME_CHARS = re.compile(u'[\\\\/:*?"<>|\\x00-\\x1f]')


def _safe_filename(title, fallback):
    """Turn a scraped title into a string usable as a file name.

    The title captured from the page is surrounded by '\\r\\n' (for example
    u'\\r\\n\\u5f00\\u7bc7\\r\\n'), and open() rejects such a name with
    "IOError: [Errno 22] invalid mode ('w') or filename".  Surrounding
    whitespace is stripped, forbidden characters are replaced by '_', and
    trailing dots/spaces are dropped.

    Args:
        title: Raw title text captured from the page.
        fallback: Name to use when nothing usable is left of the title.

    Returns:
        The cleaned file name, or ``fallback`` if the result would be empty.
    """
    name = _BAD_FILENAME_CHARS.sub(u'_', title.strip()).rstrip(u' .')
    return name or fallback


for log_id in range(1, 20):
    Log = urllib2.urlopen('http://profile.paic.com.cn/profile/log/viewlog.shtml?id=%d' % log_id)
    try:
        # The site serves GBK-encoded pages.
        buff = Log.read().decode('gbk')
    finally:
        Log.close()
    result = pat.findall(buff)
    if not result:
        # No title found for this id; nothing to save.
        continue
    print(result[0])
    Result = _P_TAG.sub('', buff)
    content = bat.findall(Result)
    # 'with' guarantees the file is closed even if printing or writing fails,
    # and the file is only ever touched when it was actually opened.
    with open(_safe_filename(result[0], u'log_%d' % log_id), 'wb') as f:
        for entry in content:
            print(entry)
            # Encode explicitly instead of relying on the default codec.
            f.write(entry.encode('utf-8'))
HTML 源代码如下
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>PROFILE系统</title>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=GBK"/>
<script type="text/javascript" src="/profile/js/jquery-1.7.1.min.js"></script>
<link rel="stylesheet" href="/profile/style/style.css" type="text/css" media="screen, projection" />
<link rel="stylesheet" type="text/css" href="/profile/ke/plugins/code/prettify.css" />
<script type="text/javascript" charset="utf-8" src="/profile/ke/plugins/code/prettify.js"></script>
<script type="text/javascript">
$(function(){
prettyPrint();
});
</script>
<link rel="stylesheet" href="/profile/style/jqModal.css" type="text/css" media="screen, projection"/>
<script type="text/javascript" src="/profile/js/jqModal.js"></script>
<script type="text/javascript" src="/profile/js/util.js"></script>
<script type="text/javascript" src="/profile/js/jquery.form.js"></script>
<script type="text/javascript" src="/profile/js/input_position.js"></script>
<script type="text/javascript" src="/profile/js/textarea_edit.js"></script>
<script type="text/javascript" src="/profile/js/jquery.lightbox-0.5.js"></script>
<link rel="stylesheet" type="text/css" href="/profile/style/jquery.lightbox-0.5.css" media="screen" />
<script type="text/javascript">
var weibo_maxlen = 200;
$(function(){
$('#txtReplyContent').keydown(function(){
var word_len = $(this).val().length;
$('#spnWeiboLength').text(weibo_maxlen - word_len >= 0 ? weibo_maxlen - word_len : 0);
if(word_len > weibo_maxlen){
$(this).val($(this).val().substring(0, weibo_maxlen));
return false;
}
});
$('#favorite').click(function(){
if(confirm('确定收藏吗?')){
var targetId = $(this).attr('targetId');
var targetType = $(this).attr('targetType');
$.get('/profile/user/addFavorite.shtml',{targetId:targetId,targetType:targetType},function(data){
if($.trim(data) == 'ok'){
showPopMsg('收藏成功!');
$('#favoriteCount').text(parseInt($('#favoriteCount').text(),10) + 1);
}else{
showPopMsg($.trim(data));
}
});
}
});
$('body').click(function(){
$('#faceBox').hide();
});
loadReply();
$('.divLogContent img').lightBox();
});
function loadReply(){
$('#divReply').load('/profile/reply/showReply.shtml?rt=LOG&rtid=5', function(){
$('#spnReplyCount').text($("#hidReplySize").val());
});
}
var replySaveOption = {
index:0,
btn:null,
txt:null,
spn:null,
url: '/profile/reply/saveReply.shtml',
beforeSubmit:function(a,b,c){
if($.trim(this.txt.val()) != ''){
this.spn.text('').hide();
$('#spnMsg0').fadeIn('slow');
setTimeout("$('#spnMsg0').fadeOut('slow')",3000);
this.btn.attr('disabled', 'true');
}else{
this.spn.text('说点什么吧!').show();
return false;
}
},
success: function(data, status) {
if($.trim(data) == '1'){
this.btn.removeAttr('disabled');
this.txt.val('');
loadReply();
}
}
};
function saveReply(index){
if($('#hidReplyType' + index).val() != ''){
replySaveOption.index = index;
replySaveOption.btn = $('#btnSaveReply' + index);
replySaveOption.txt = $('#txtReplyContent' + index);
replySaveOption.spn = $('#spnWarn' + index);
$('#formReply' + index).ajaxSubmit(replySaveOption);
}
}
</script>
</head>
<body>
<script type="text/javascript">
$(function(){
checkNotify();
getUserNeedDoTaskCount();
setInterval('getUserNeedDoTaskCount()', 60000);
window.setInterval('checkNotify();', 15 * 1000);
$('#btnSearchAll').click(function(){
var keyword = $.trim($('#txtKeyword').val());
if(keyword != ''){
$('#k').val(escape(keyword));
$('#formSearch').attr('action','/profile/search/searchindex.shtml').submit();
}else{
alert('请先输入搜索的内容');
return;
}
});
$('#searchCategory').click(function(){
var offset = $(this).offset();
$('#searchCategorysBox').toggle();
$('#searchCategorysBox').offset({top : offset.top + 24, left : offset.left});
});
$('#searchCategorysBox a').click(function(){
var t = $(this).attr('t');
$('#t').val(t);
$('#searchCategory').text($(this).text());
$('#searchCategorysBox').hide();
});
$('.taskCountTip').click(function(){
var taskCount = $(this).text();
if(taskCount > 0){
var taskType = $(this).attr('target');
var color = {'CASE':'#66CCFF','CHANGE':'#8F2C11','REQUEST':'green','STEP':'#FFAA00'};
var taskListBox = $('#taskListBox');
var coverBox = $('#coverBox');
taskListBox.load('/profile/user/getUserNeedToDoList.shtml?type='+taskType,function(){
var offset = $('.headLinkBox').offset();
taskListBox.css({width:'700px',top : offset.top + 40,left : offset.left - 3,border : '1px solid #333'});
taskListBox.slideDown("slow");
//$('.tbTaskList tr:first-child').css({'background':'#EEE','border-top':'2px solid ' + eval('color.' + taskType)});
//$('.tbTaskList tr').css({'border-bottom':'1px solid #CCC'});
coverBox.show();
coverBox.height($('body').height());
$('#slideup').click(function(){
$('#taskListBox').slideUp("slow").delay(800);
coverBox.hide();
});
});
}
});
});
function ajaxCallback(){
if($.trim($('#divNotify').text()) != ''){
$('#divNotify').slideDown();
}else{
$('#divNotify').slideUp();
}
if($("#hidHonorIDs").length > 0 && $("#hidHonorIDs").val() != ''){
if($('#divHonor').is(':hidden')){
showHonor(1, $("#hidHonorIDs").val());
}
}
}
function checkNotify(){
asyLoadDataHtml('/profile/user/getUserNotify.shtml','','divNotify', ajaxCallback);
}
function showHonor(page, hids){
$('#divHonor #ifrmHonor').attr('src','/profile/honor/getUserNotify.shtml?p='+ page +'&hids=' + hids);
$("#divHonor").jqm({modal : true,width : 500}).jqmShow();
}
function closeHonorDialog(){
$("#divHonor").jqm().jqmHide();
}
function getUserNeedDoTaskCount(uid){
var url = "/profile/user/getUserNeedDoCount.shtml";
if(uid){
url += "?uid=" + uid;
}
asyLoadDataJson(url, '', function(data){
for(var key in data){
if($('#spnCount' + key).hasClass('divHeadTitleUp')){
if(data[key] != 0){
$('#spnCount' + key).text(data[key]).addClass('divHeadTitleInfo');
}else{
$('#spnCount' + key).text('').removeClass('divHeadTitleInfo');
}
}else{
$('#spnCount' + key).show();
//$('#spnCount' + key).add($('#spnCount' + key).parent()).css({'background-color': '#B74939'});
$('#spnCount' + key).text(data[key]);
if(data[key] == 0){
$('#spnCount' + key).hide();
//$('#spnCount' + key).add($('#spnCount' + key).parent()).css({'background-color': ''});
}
}
}
});
}
</script>
<div id="divNotify" style="display: none;"></div>
<div class="jqmWindow" id="divHonor" style="padding:0px;background:transparent;border:0px">
<iframe style="height:320px;width:100%;background:transparent" scrolling="no" frameborder="0" id="ifrmHonor" allowtransparency="true"></iframe>
</div>
<div id="head">
<div style="width: 1004px;margin-left: auto;margin-right: auto;height:40px;padding:0px">
<div class="d_f" style="margin:0px;padding:0px;height:40px">
<a href="/profile/welcome.shtml" style="padding:0px"><img src="/profile/images/logo.jpg" border="0" /></a>
</div>
<div style="padding:0px;margin:0px" class="d_f">
<div>
<div style="margin:0px" class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f"></div>
<div class="divHeadTitleUp d_f" id="spnCountPROJECT"></div>
<div class="clear"></div>
</div>
<div class="headLinkBox">
<a href="/profile/home/myhome.shtml">首页</a>
|
<a href="/profile/log/mylog.shtml">日志</a>
|
<a href="/profile/weibo/getWeiboView.shtml?uid=RONGWEIWEI799">动态</a>
|
<a href="/profile/user/myfriend.shtml">关注</a>
|
<a href="/profile/reply/getReplyMeList.shtml">评论</a>
|
<a href="/profile/honor/getMyHonorView.shtml?cmd=in">评价</a>
|
<a href="/profile/vote/queryVoteList.shtml">投票</a>
|
<a href="/profile/project/myprojectlist.shtml?type=minedo">项目</a>
</div>
</div>
<div class="headTaskTips">
<a href="javascript:void(0);" class="caseTip"><span id="spnCountCASE" target="CASE" class="taskCountTip" title="待处理CASE数" style="display: none;"></span></a>
<a href="javascript:void(0);" class="requestTip"><span id="spnCountREQUEST" target="REQUEST" class="taskCountTip" title="待处理REQUEST数" style="display: none;"></span></a>
<a href="javascript:void(0);" class="changeTip"><span id="spnCountCHANGE" target="CHANGE" class="taskCountTip" title="待处理CHANGE数" style="display: none;"></span></a>
<a href="javascript:void(0);" class="stepTip"><span id="spnCountSTEP" target="STEP" class="taskCountTip" title="待处理STEP数" style="display: none;"></span></a>
</div>
<div class="searchItem">
<img src="/profile/images/search.png" style="cursor:hand;vertical-align:bottom" id="btnSearchAll"/>
</div>
<div class="searchItem">
<input type="text" id="txtKeyword" name="txtKeyword" style="vertical-align:bottom;padding-bottom:0;padding-bottom:1px\9;width: 110px;"/>
<form method="get" id="formSearch" target="_blank">
<input type="hidden" id="k" name="k"/>
<input type="hidden" id="t" name="t" value="LOG"/>
</form>
</div>
<div class="searchItem">
<a href="javascript:void(0);" id="searchCategory">日志</a>
</div>
<div id="searchCategorysBox">
<a href="javascript:void(0);" t="LOG">日志</a>
<a href="javascript:void(0);" t="user">用户</a>
<a href="javascript:void(0);" t="WEIBO">动态</a>
<a href="javascript:void(0);" t="HONOR">评价</a>
<a href="javascript:void(0);" t="PROJECT">项目</a>
</div>
<div class="clear"></div>
</div>
</div>
<div id="taskListBox" style="position:absolute;display: none;background: #FFF;overflow:hidden;z-index:99;">
</div>
<div id="coverBox" style="position:absolute;display:none;left:0;top:0;width:100%;height:100%;background:#000;opacity: 0.8;filter:alpha(opacity=80);z-index:90;"></div>
<div id="main">
<script type="text/javascript">
$(function(){
$('#btnCancelFriend').click(function(){
var userID = $('#hidUserID').val();
if(confirm('确认取消关注' + userID + "?")){
$.post(
'/profile/user/cancelFriend.shtml?fid=' + userID,
function(data){
if($.trim(data) == '1'){
window.location.reload();
}else{
msgBox('取消失败,请联系管理员!');
}
}
);
}
});
$('#btnMakeFriend').click(function(){
var userID = $('#hidUserID').val();
$.post(
'/profile/user/insertFriend.shtml?fid=' + userID,
function(data){
if($.trim(data) == '1'){
window.location.reload();
}else{
msgBox('关注失败,请联系管理员!');
}
}
);
});
$('#btnGood').click(function(){
var userID= $("#hidUserID").val();
location.href="/profile/honor/addHonorView.shtml?uid="+userID+"&i=1";
});
$('#btnBad').click(function(){
var userID= $("#hidUserID").val();
location.href="/profile/honor/addHonorView.shtml?uid="+userID+"&i=-1";
});
});
</script>
<div id="left">
<input type="hidden" id="hidUserID" value="LINCHENG530">
<div style="background:#9B0200;"></div>
<div id="leftmenu" class="main_menu" style="padding:5px;">
<div class="" style="width:100%;padding-left:13px;padding-bottom:5px">
<div>
<span class="face_bg_mid">
<img class="faceMid" src="/profile/user/getUserFace.shtml?uid=LINCHENG530&type=big"/>
</span>
</div>
<div>
<input type="button" id="btnMakeFriend" value="关注" class="btn3"/>
</div>
</div>
</div>
</div>
<div id="middle">
<div id="title">
<img src="/profile/images/page_text.gif"/> 林城的日志
</div>
<div class="d_l_1" style="width:650px;">
<div class="divLogTitle" style="padding:5px;line-height:20px;border-bottom:1px solid #ddd;border-top:1px solid #ccc;background:#eee;height:20px;font-weight:bold">
开篇
</div>
<div style="text-align:right;height:20px;line-height:20px;color:#666">
2012-09-12 14:57:15
分类:未分类
权限:完全公开
</div>
<div class="divLogContent"><p>
开篇第一章:
</p>
<p>
profile第一版上线测试!
</p>
<br />
</div>
<div style="color:#666">
转发[0]
评论[<span id="spnReplyCount"></span>]
<a href="javascript:void(0);" id="favorite" targetId="5" targetType="LOG">收藏[<span id="favoriteCount">0</span>]</a>
</div>
<div style="border-bottom:1px solid #eee">
<div class="d_f">
<span class="face_bg_small">
<img src="/profile/user/getUserFace.shtml?uid=RONGWEIWEI799&type=small"/>
</span>
</div>
<div class="d_f">
<form method="post" id="formReply0">
<div>
<textarea name="txtReplyContent" id="txtReplyContent0"
onkeydown="return editorKeyDown();"
onkeyup="editorKeyUp(this);"
onclick="editorClick(this);" style="width:500px;height:150"></textarea>
</div>
<div style="text-align:right;padding:2px">
<span class="d_f" style="margin:0px;color:#666">剩余字符<span id="spnWeiboLength">200</span></span>
<span class="d_f" style="margin:0;"><img title="表情" width="20" target="txtReplyContent0" class="imgFace" onclick="addFace($(this))" style="position:relative;top:-2px;" src="/profile/images/face/14.gif" /></span>
<span id="spnWarn0" class="spnWarn" style="display:none;"></span>
<span id="spnMsg0" class="spnMsg" style="display:none;">发布成功!</span>
<input type="checkbox" class="chkbox" name="chkWeibo" />同时转发
<input type="button" class="btn1" id="btnSaveReply0" value="评论" onclick="saveReply('0');"></input></div>
<input type="hidden" value="5" name="hidReplyToID" id="hidReplyToID"></input>
<input type="hidden" value="LINCHENG530" name="hidReplyToOwner"></input>
<input type="hidden" value="LOG" name="hidReplyType" id="hidReplyType0"></input>
<input type="hidden" value="0" name="hidReplyOrginalID"></input>
</form>
</div>
<div class="clear"></div>
</div>
<div id="divReply">
</div>
</div>
<!-- 分类 -->
<div class="d_r_1">
<div id="divLogType">
<div style="font-weight:bold">日志分类:</div>
<div>
<ul style="padding:0px">
<li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530&t=104">工作点滴</a></li>
<li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530&t=0">未分类</a></li>
<li><a href="/profile/log/otherlog.shtml?uid=LINCHENG530">全部日志</a></li>
</ul>
</div>
</div>
</div>
</div>
<div class="clear"></div>
</div>
<div id="foot">
©2012 基础架构应用平台领域 TK.Unit Some rights reserved.
</div>
</body>
</html>
u'\r\n\u5f00\u7bc7\r\n' 中的 \r\n 是回车换行符,文件名里不能包含这类字符,因此这是一个无效的文件名,和编码无关。先去掉标题首尾的空白字符再用作文件名即可,例如 open(result[0].strip(), 'w')。