我在服务器上有每天切割nginx日志的习惯,所以针对每天各大搜索引擎的来访,总能记录到一些404页面信息。以往我只是偶尔人工分析一下日志,但是对于日志信息很多的朋友来说,人工筛选可能不是一件容易的事情。为此我自己慢慢研究了一点点,把谷歌、百度、搜搜、360搜索、宜搜、搜狗、必应等搜索引擎的404访问生成为一个txt文本文件,直接上代码test.php。
<?php
// Usage: test.php?s=google[&t=YYYY/MM/DD]
// Extracts every URL that returned 404 to a given search-engine spider from
// that day's (default: yesterday's) nginx access log, caches the result in a
// per-day txt file, and serves it as plain text.

header('Content-type: text/plain');

/**
 * Pull all 404'd request paths for one spider out of raw access-log text.
 *
 * @param string $log    raw nginx access-log contents
 * @param string $spider User-Agent substring identifying the spider
 * @param string $domain site root prepended to each extracted path
 * @return string one absolute URL per CRLF-terminated line
 */
function extract_404_urls($log, $spider, $domain) {
    // preg_quote keeps the spider token literal; the dot in "1.1" is escaped
    // so it no longer matches an arbitrary character.
    $pattern = '/GET (.*) HTTP\/1\.1" 404.*' . preg_quote($spider, '/') . '/';
    preg_match_all($pattern, $log, $matches);
    $out = '';
    foreach ($matches[1] as $path) {
        $out .= $domain . $path . "\r\n";
    }
    return $out;
}

// Optional ?t= lets the caller pick a day; otherwise default to yesterday.
$time      = isset($_GET['t']) ? $_GET['t'] : '';
$time_unix = strtotime($time);
if ($time_unix !== false) {
    $yesterday = $time_unix;
} else {
    $yesterday = strtotime(date('Y/m/d')) - 86400; // midnight today minus one day
}

$domain  = 'http://www.markdream.com';
$spiders = array(
    'baidu'  => 'Baiduspider',
    '360'    => '360Spider',
    'google' => 'Googlebot',
    'soso'   => 'Sosospider',
    'sogou'  => 'Sogou web spider',
    'easou'  => 'EasouSpider',
    'bing'   => 'bingbot',
);

// Whitelist check on ?s= — anything not in the map is rejected outright,
// which also keeps $spider from ever carrying untrusted input.
$s = isset($_GET['s']) ? $_GET['s'] : '';
if (!array_key_exists($s, $spiders)) exit();
$spider = $spiders[$s];

$path = '/opt/nginx/logs/markdream/' . date('Y/m/d', $yesterday) . '/access_www.txt';
$file = $s . '_' . date('ymd', $yesterday) . '.txt';

if (!file_exists($file)) {
    // Guard against a missing/unreadable log for that day instead of
    // caching a warning-tainted empty result.
    $in = @file_get_contents($path);
    if ($in === false) exit();
    file_put_contents($file, extract_404_urls($in, $spider, $domain));
}
echo file_get_contents($file);
好,就这样了。没有什么高深的技术,只有动手写的过程。
附:nginx日志切割脚本
#!/bin/bash
# Nginx log rotation — run daily at 00:00 via cron:
#   0 0 * * * root /opt/nginx/logs/cutlog.sh

# Where nginx writes its live logs
logs_path="/opt/nginx/logs/"
# Archive root (browsable over HTTP)
display_path="/home/website/pony/"

# Compute yesterday's date ONCE; calling `date` three separate times could
# straddle midnight and yield an inconsistent %Y/%m/%d path.
yesterday=$(date -d "yesterday" +"%Y/%m/%d")
day_path="${display_path}${yesterday}"

mkdir -p "${day_path}"

# Archive yesterday's logs
mv "${logs_path}access.log" "${day_path}/access.txt"
mv "${logs_path}error.log"  "${day_path}/error.txt"

# USR1 tells the nginx master process to reopen its log files
kill -USR1 "$(cat /opt/nginx/logs/nginx.pid)"