自编脚本,目的是从自己的服务器中连续下载两个压缩的CSV文件,解压缩并合并CSV,为数据分析前道工序。
问题出在连续下载部分,怪异的是Windows下工作正常,Linux下第二次下载总是失败。以下是部分代码。
import getopt, os, sys
from datetime import datetime, timedelta, date
import time
import zipfile
import shutil
import logging
import urllib, urllib2
import socket
local = "http://127.0.0.1:8888/static/archive"
def deflat_rename(date, snr):
zip_fn = "%s.csv.zip"%(snr)
zfile = zipfile.ZipFile(zip_fn, 'r')
de_fn = "%s.csv"%(snr)
fn_date = datetime.strftime(date, "%Y%m%d")
fn = "%s_%s.csv"%(snr, fn_date)
if de_fn in zfile.namelist():
data = zfile.read(de_fn)
with open(de_fn, 'w+b') as f:
f.write(data)
zfile.close()
if os.path.exists(fn):
print(fn)
os.remove(fn)
else:
print("not exist")
print(de_fn)
os.rename(de_fn,fn)
if os.path.exists(zip_fn):
os.remove(zip_fn)
return fn
def download_file(res, snr):
f = None
fn = "%s.csv.zip"%(snr)
try:
f = urllib2.urlopen(res)
data = f.read()
with open(fn, "wb") as code:
code.write(data)
print "download %s done"%(fn)
except urllib2.URLError, e:
print e
return None
except urllib2.HTTPError, e:
print e
return None
finally:
if None != f:
f.close()
return fn
def download(start, stop, snr):
if 'iZ2573cw0yvZ' == socket.gethostname():
host = local
else:
host = remote
url_date = datetime.strftime(start, "%Y/%m/%d")
url = "%s/%s/%s.csv.zip"%(host, url_date, snr)
log("info","url: %s"%(url))
fn_date = datetime.strftime(start, "%Y%m%d")
fn = "%s_%s.csv"%(snr, fn_date)
f = download_file(url, snr)
if f is None:
log("info","No such file on %s."%(url))
f1 = deflat_rename(start, snr)
url_date = datetime.strftime(stop, "%Y/%m/%d")
url = "%s/%s/%s.csv.zip"%(remote, url_date, snr)
fn_date = datetime.strftime(stop, "%Y%m%d")
fn = "%s_%s.csv"%(snr, fn_date)
f = download_file(url, snr)
if f is None:
log("info","No such file on %s."%(url))
f2 = deflat_rename(stop, snr)
return f1, f2
Linux下错误:
./merge2report.py --date=20161201 --snr=A2H470049
download A2H470049.csv.zip done
A2H470049_20161130.csv
A2H470049.csv
<urlopen error [Errno 110] Connection timed out>
Traceback (most recent call last):
File "./merge2report.py", line 245, in <module>
main()
File "./merge2report.py", line 236, in main
f1, f2 = download(b, e, snr)
File "./merge2report.py", line 119, in download
f2 = deflat_rename(stop, snr)
File "./merge2report.py", line 66, in deflat_rename
zfile = zipfile.ZipFile(zip_fn, 'r')
File "/usr/lib/python2.7/zipfile.py", line 701, in __init__
self.fp = open(file, modeDict[mode])
IOError: [Errno 2] No such file or directory: 'A2H470049.csv.zip'
Windows下结果
> merge2report.py --date=20161201 --snr=A2H470049
download A2H470049.csv.zip done
A2H470049_20161130.csv
A2H470049.csv
download A2H470049.csv.zip done
A2H470049_20161201.csv
A2H470049.csv
关键在<urlopen error [Errno 110] Connection timed out>这句上。什么导致第二次下载超时?
Python的官方文档说明,如果你没有指定一个timeout,那么它就会使用socket的timeout值.而socket实际上是由操作系统提供的,所以timeout值依赖于操作系统.
有可能是你的这个任务恰好处在windows的socket超时之内而在linux的socket超时之外.