```python
import re

# filepath and strip_detail() are defined elsewhere in my script
logfile = open(filepath, 'r')
# source_ip_dict = {}
res_url_dict = {}
from_url_dict = {}
category_dict = {}
print('start.....')
for line in logfile:
    line = line.strip()
    if line == "":
        continue
    # Match the HTTP method at the start of the quoted request field, e.g. '"GET /'.
    # {3,7} covers methods from GET (3 letters) up to OPTIONS (7 letters).
    reg = '"[GETPUOSHADINS]{3,7} /'
    url_start = re.compile(reg)
    re_result = url_start.findall(line)
    if len(re_result) >= 1:
        # Everything between the method and the next space is the requested path.
        res_url = '"' + line.split(re_result[0])[1].split(' ')[0]
        category = strip_detail(res_url.split('/'))
        if len(category) >= 1:
            # Trying to pick up the crawler names here (this never matches).
            if category[0] in ['360Spider', 'bingbot', 'Baiduspider', 'Googlebot',
                               'MediavBot', 'DotBot', 'YisouSpider', 'YandexBot']:
                category_dict[category[0]] = category_dict.get(category[0], 0) + 1
            for cate in category:
                if cate.find('category') != -1:
                    category_dict['category'] = category_dict.get('category', 0) + 1
        # Skip static resources, count every other URL.
        if res_url.endswith(('.jpg', '.css', '.js', '.png', '.gif')):
            pass
        elif res_url.find('.css?') != -1 or res_url.find('.js?') != -1:
            pass
        else:
            res_url_dict[res_url] = res_url_dict.get(res_url, 0) + 1
logfile.close()
```
The above is my code.
This is the log format:
```
61.182.137.6 - - [21/Apr/2017:00:00:37 +0800] 0 "HEAD / HTTP/1.1" 200 - "-" "Baidu-YunGuanCe-SLABot(ce.baidu.com)"
123.125.71.89 - - [21/Apr/2017:00:00:38 +0800] 0 "GET /article/515140 HTTP/1.1" 200 10315 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/s...)"
216.244.66.229 - - [21/Apr/2017:00:00:39 +0800] 0 "GET /article/330012 HTTP/1.1" 200 29593 "-" "Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.o... help@moz.com)"
```
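As far as I can tell, every line follows the same pattern, and the crawler name sits in the last double-quoted field (the User-Agent), not in the URL path. Here is how I read the format, applied to one sample line; the group names are just labels I made up, not anything official:

```python
import re

# My own reading of the log format; only the groups I care about are named.
LOG_PATTERN = re.compile(
    r'^(?P<ip>\S+) \S+ \S+ \[(?P<time>[^\]]+)\] \d+ '
    r'"(?P<request>[^"]*)" (?P<status>\d+) \S+ '
    r'"(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"$'
)

sample = ('123.125.71.89 - - [21/Apr/2017:00:00:38 +0800] 0 '
          '"GET /article/515140 HTTP/1.1" 200 10315 "-" '
          '"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/s...)"')

m = LOG_PATTERN.match(sample)
if m:
    print(m.group('request'))     # GET /article/515140 HTTP/1.1
    print(m.group('user_agent'))  # Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/s...)
```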
I want to extract the spider type from each log line and count how many requests each crawler made, i.e. crawlers such as '360Spider', 'bingbot', 'Baiduspider', 'Googlebot', 'MediavBot', 'DotBot', 'YisouSpider'. No matter what I try, I can't get them to match. Any help would be much appreciated.
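For what it's worth, the result I'm after is a per-crawler count, roughly like the sketch below. It assumes the right place to look is the last quoted field (the User-Agent) rather than the URL path, and it reuses the `filepath` variable from my script above; I'm not sure this is the correct approach:

```python
import re
from collections import Counter

# Same crawler list as in my code above.
SPIDERS = ['360Spider', 'bingbot', 'Baiduspider', 'Googlebot',
           'MediavBot', 'DotBot', 'YisouSpider', 'YandexBot']

spider_counts = Counter()

with open(filepath, 'r') as logfile:  # filepath as in my script above
    for line in logfile:
        line = line.strip()
        if not line:
            continue
        # The User-Agent is the last double-quoted field on the line.
        quoted = re.findall(r'"([^"]*)"', line)
        if not quoted:
            continue
        user_agent = quoted[-1]
        for spider in SPIDERS:
            if spider in user_agent:
                spider_counts[spider] += 1
                break

print(spider_counts)
```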