
How to Analyze Logs with a Python Script

2018-11-20 14:19:00 海静

  How do you analyze logs with a Python script? It's simple. The script focuses on URL crawl counts and traffic data, with a few finer breakdowns. The macro-level numbers from the Guangnian (光年) log analyzer never seemed very useful to me, so I didn't add anything like them. Because I bolted on features whenever something came to mind, I've forgotten what some of the variables are even for, so the whole thing looks rather convoluted. Performance is so-so: on an i3 processor, a log of 1 GB or so takes 3 to 4 minutes.


  Run it as: python <script name> log_file seo_file

  log_file: the log file to analyze; seo_file: an output file for the SEO-traffic lines, name it whatever you like...
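
  For example, assuming the script is saved as log_analysis.py and the log is access.log (both names made up here):

  python log_analysis.py access.log seo_traffic.txt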

  The script's output includes:

  Spider crawl volume (total crawls)

  Crawl volume for each page type (broken down by spider; includes both unique and total crawl counts, so you can compare whether there's much repeat crawling...)

  SEO traffic (the search engines counted are listed in the 'seo_traffic_req' variable; add your own if those aren't enough)


  SEO traffic per page type

  Baidu SEO traffic and 360 SEO traffic per page type

  Spider status code summary

  Baidu referrer keywords
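
  Note that the script reads the HTTP status code as the 10th space-separated field (line.split(' ')[9]), so your log layout must match; adjust the index if it doesn't. A hypothetical line in a format it can parse (the leading vhost field is an assumption, not part of the stock combined format):

  example.com 127.0.0.1 - - [20/Nov/2018:14:19:00 +0800] "GET /abc/123.html HTTP/1.1" 200 5120 "https://www.baidu.com/s?wd=python" "Mozilla/5.0"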

  #coding:utf-8
  #weblog analytics
  import re
  import sys
  import urllib
  import os

  input_file,seo_file = sys.argv[1:3] #the input log file name, and the output file for SEO-traffic lines
  seo_url = open(seo_file,'a')
  #fenci = open(fenci_file,'a') #to write Baidu referrer keywords to a separate file, define fenci_file and uncomment
  baidu_seo = open('baiduseo.txt','a')

  #Regexes for the page URLs whose crawl and traffic data you want: one pattern per
  #page type, as many as you need -- adjust to your own site (strictly, the dot in
  #'.html' should be escaped as '\.html', though '.' matches the literal dot too)

  mulu_re = [
      '/abc/[0-9]+.html',
      '/abc/g[0-9]+/[a-z]+.html'
  ]

  #Spiders to count; adjust to your needs
  kz_spider = [
      'Baiduspider.*search/spider.html' #matching only 'Baiduspider' could count fake spiders too, hence the longer pattern..
      #'360Spider',
      #'Googlebot',
      #'Sogou'
  ]
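
  #For reference (an assumption based on Baidu's published UA format), a genuine
  #Baidu crawler request usually carries a user agent like:
  #  Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
  #so requiring both 'Baiduspider' and 'search/spider.html' filters most impostors;
  #a reverse-DNS check on the client IP would be stricter still.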

  weblog = open(input_file).read()

  #Baidu referrer keyword: the wd= or word= query parameter
  word_re = re.compile(r'\.baidu\.com/.*?(?:wd|word)=(.*?)[&"]')
  #referrer patterns that identify search-engine (SEO) visits
  seo_traffic_req = re.compile(r'(so.com/.*?q=|360.cn/.*?q=|baidu.com/.*wd=|baidu.com/.*word=|so.com/.*q=|sogou.com/.*query=|youdao.com/.*q=|yahoo.com/.*p=|bing.com/.*q=|google.com/.*q=)')
  baidu_seo_re = re.compile(r'(baidu.com/.*wd=|baidu.com/.*word=)')

  seo_traffic = 0 #total SEO visits
  seo_baidu = 0 #Baidu SEO visits
  pagecode = {} #status code -> count
  baidupagecode = {} #declared but never used below

  def spider_zq(spider):
      #total number of log entries matching this spider
      req = re.compile(spider)
      data = len(re.findall(req,weblog))
      return data

  def url_spider_zq(zz,spider):
      #crawl counts for one URL pattern by one spider
      url_re = zz + '.*' + spider
      req = re.compile(url_re)
      data_one = len(list(set(re.findall(req,weblog)))) #unique crawls (set() dedupes identical matches, a rough proxy for unique URLs)
      data_two = len(re.findall(req,weblog)) #total crawls
      #e = '%.2f%%'% (float('%.1f'%(data_two-data_one))/data_two)
      return data_one,data_two
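
  #e.g. if Baiduspider hit /abc/1.html three times (identical log fields each time)
  #and /abc/2.html once, url_spider_zq would return (2, 4) -- hypothetical numbers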

  print "\n"

  print "<-------------------------------每个蜘蛛的总抓取量---------------------------------->"

  for spider in kz_spider:

  print spider + "总抓取量:",spider_zq(spider)

  print "\n"

  print "<-------------------------------蜘蛛目次抓取量---------------------------------->"

  for spider in kz_spider:

  print spider+"目次抓取量:","\n"

  for zz in mulu_re:

  print zz,":",url_spider_zq(zz,spider)

  print "\n"

  print "<-------------------------------SEO总流量---------------------------------->"

  for line in open(input_file):

  data = re.search(seo_traffic_req,line)

  baidu = re.search(baidu_seo_re,line)

  if data:

  seo_traffic += 1

  seo_url.write(line+'\n')

  else:

  continue

  if baidu:

  seo_baidu += 1

  baidu_seo.write(line+'\n')

  else:

  continue

  code = line.split(' ')[9]

  if code.isdigit():

  if code in pagecode:

  pagecode[code] += 1

  else:

  pagecode[code] = 1

  print 'SEO流量:',seo_traffic,"\n"

  baidu_seo.close()

  seo_url.close()

  seo_mulu = open(seo_file).read()

  baiduseo = open('baiduseo.txt').read()

  print "<-------------------------------SEO目次流量---------------------------------->"

  print "网站目次SEO流量统计:","\n"

  for line in mulu_re:

  req = re.compile(line)

  seo_data = len(re.findall(req,seo_mulu))

  print line,seo_data

  print "\n"

  print "<-------------------------------百度 SEO目次流量---------------------------------->"

  print "网站目次SEO流量统计:","\n"

  for line in mulu_re:

  req = re.compile(line)

  seo_data = len(re.findall(req,baiduseo))

  print line,seo_data

  print "\n"

  print "<-------------------------------360 SEO目次流量---------------------------------->"

  print "360 SEO流量统计:","\n"

  for line in mulu_re:

  line_360 = line + ".*(so.com|360.cn)/.*?q="

  req = re.compile(line_360)

  seo_data_360 = len(re.findall(req,seo_mulu))

  print line,seo_data_360

  print "\n"

  print "<-------------------------------蜘蛛形态码---------------------------------->"

  pagecode_sort = sorted(pagecode.iteritems(), key=lambda d:d[1], reverse = True)

  print pagecode_sort
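
  #hypothetical output shape (the counts are invented for illustration):
  #[('200', 15321), ('304', 2210), ('404', 187)]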

  print "\n"

  os.remove('baiduseo.txt')

  os.remove(seo_file) #delete this line if you want to keep the SEO-traffic portion of the log

  #Uncomment below (and the fenci file handle near the top) to extract the Baidu
  #referrer keywords; it must run before os.remove(seo_file) above, since it reads that file
  #for line in open(seo_file):
  #    word = re.search(word_re,line)
  #    if not word:
  #        continue
  #    kw = urllib.unquote_plus(word.group(1))
  #    if 'ie=utf-8' not in line:
  #        kw = kw.decode('gb2312','ignore').encode('utf-8') #gb2312-encoded Baidu referrers -> utf-8
  #    fenci.write(kw+"\n")
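
  As a minimal sketch of what that keyword block does (the sample query string below is made up; Python 2, like the rest of the script):

  #coding:utf-8
  import urllib
  q = 'python%E6%97%A5%E5%BF%97' #hypothetical utf-8 percent-encoded Baidu query 'python日志'
  print urllib.unquote_plus(q) #prints the decoded utf-8 bytes: python日志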

Please credit the source when reprinting.