linux三剑客实战-作业贴

课后作业 url_summary

找出访问量最高的页面地址,借助 sed 进行统计分析。

首先对相似的 url 进行聚类,聚类的规则如下(使用 sed 或者 awk):

  • /topics/16689/replies/124751/edit 把数字替换为 /topics/id/replies/id/edit
  • /_img/uploads/photo/2018/c54755ee-6bfd-489a-8a39-81a1d7551cbd.png!large 变成 /_img/uploads/photo/2018/id.png!large
  • /topics/9497 改成 /topics/id
  • url中?后面的参数全部去掉
  • 其他规则参考如上

输出

  • 取出请求量top 10的聚类url

类似
100 urlxxx
98 urlxxx

# Assignment template: replace the no-op below with a pipeline that prints the
# 10 most requested clustered URLs of nginx.log as "<count> <url>" lines.
url_summary(){
    : # TODO: implement; Bash rejects a function whose body has no command
}

答案

# Print the 10 most requested URL clusters of an nginx access log as
# "<count> <url>" lines, busiest first.
# Arguments: $1 - access log to read (optional, defaults to nginx.log)
# Outputs:   `uniq -c` style lines on stdout
#
# Clustering rules, applied in order to field 7 (the request path):
#   1. drop the query string, so its contents never look like path segments
#   2. on /topics URLs, turn every numeric path segment into "id"
#   3. on /photo/ URLs, turn the image base name into "id"
#   4. on avatar URLs, turn everything from the first numeric segment into "id"
url_summary() {
    local log_file=${1:-nginx.log}

    awk '{print $7}' "$log_file" |
        sed \
            -e 's#[?].*##' \
            -e '/\/topics/s#/[0-9][0-9]*#/id#g' \
            -e '/\/photo\//s#/[a-z0-9-]*\.#/id.#' \
            -e '/uploads\/user\/avatar/s#/[0-9].*#/id#' |
        sort | uniq -c | sort -nr | head -n 10
}


# Top 10 /topics URLs, with every run of digits collapsed to "id".
awk '{print $7}' nginx.log | sed -n '/^\/topics/p' | sed -e 's/[0-9][0-9]*/id/g' | sort | uniq -c | sort -rn | head -10
# Top 10 /_img URLs, with the image base name collapsed to "id" (extension kept).
awk '{print $7}' nginx.log | sed -n '/^\/_img/p' | sed -e 's/[0-9a-zA-Z-]*\.\(png\|jpg\|jpeg\|gif\)/id.\1/g' | sort | uniq -c | sort -rn | head -10
# Top 10 values of field 11 that contain "url", query string removed.
# uniq -c only merges adjacent duplicates, so the input has to be sorted first.
awk '{print $11}' nginx.log | sed -n '/url/p' | sed -e 's/[?][0-9a-zA-Z=&_-]*//g' | sort | uniq -c | sort -rn | head -10

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Numeric path segments become "id"; whatever follows the first "id/" up to
# the last dot is collapsed to "id" as well (image and file names).
url_summary(){
# The query string is removed first: if it were stripped last, a dot inside
# it (e.g. "?x=1.5") would be picked up by the file-name rule.
awk '{print $7}' nginx.log |
  sed -e 's/?.*//' \
      -e 's/\/[0-9]\+/\/id/g' \
      -e 's/id\/.*\./id\/id./g' |
  sort | uniq -c | sort -rn | head -10
}
# Variant of url_summary that only collapses "*.png!large" image names.
# Prints the 10 most requested URL clusters in nginx.log as "<count> <url>".
url_summary2(){
# Drop the query string before clustering so a ".png!large" inside a query
# parameter cannot swallow part of the real path.
awk '{print $7}' nginx.log |
  sed -e 's/?.*//' \
      -e 's/\/[0-9]\+/\/id/g' \
      -e 's/id\/.*\.png!large/id\/id.png!large/g' |
  sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Every run of digits becomes "id", the query string and any "!suffix" are
# dropped, and a file name that follows an "/id/" directory becomes "id".
url_summary ()
{
    # The last rule must re-emit the slash it matched ("\/id"); the earlier
    # "\id" spelling dropped it and produced paths such as "photoid/id.png".
    awk '{print $7}' nginx.log |
        sed -e 's/[0-9]\+/id/g' \
            -e 's/?.*//g' \
            -e 's/!.*//g' \
            -e 's/\/id\/.*\./\/id\/id./g' |
        sort | uniq -c | sort -rn | head -10
}

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# All five substitutions run in a single sed process, in this order:
# digits -> "id", drop the query string, collapse the file name after an
# "/id/" directory, drop any "!suffix", drop everything from the first "%".
url_summary(){
awk '{print $7}' nginx.log |
  sed -e 's/[0-9]\+/id/g' \
      -e 's/?.*//g' \
      -e 's/\/id\/.*\./\/id\/id\./g' \
      -e 's/!.*//g' \
      -e 's/%.*//g' |
  sort |
  uniq -c |
  sort -rn |
  head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, every run of digits becomes "id", and any
# "name." path component (a file base name) becomes "id.".
url_summary(){
# The digit rule needs at least one digit: "[0-9]*" also matches the empty
# string, which with the g flag inserts "id" between every character.
awk '{print $7}' nginx.log |
  sed -e 's/[?].*//' \
      -e 's/[0-9][0-9]*/id/g' \
      -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
  sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, numeric path segments become "id", and any
# "name." path component (a file base name) becomes "id.".
url_summary(){
# A numeric segment is "/" plus one or more digits, with no trailing slash
# required, so that a final segment such as /topics/9497 is clustered too.
awk '{print $7}' nginx.log |
  sed -e 's/[?].*//' \
      -e 's/\/[0-9][0-9]*/\/id/g' \
      -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
  sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
url_summary(){
# sed rules, applied in order to each request path:
#   1. every "/<digits>" becomes "/id"
#   2. the first "/<name>." component becomes "/id."
#   3. the query string is removed
local -a rules=(
  -e 's/\/[0-9]\+/\/id/g'
  -e 's/\/[0-9a-zA-Z\-]\+\./\/id./'
  -e 's/\?.*//'
)
awk '{print $7}' nginx.log |
  sed "${rules[@]}" |
  sort |
  uniq -c |
  sort -rn |
  head -10
}

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped and every run of digits becomes "id".
url_summary(){
# Plain ASCII quotes are required: typographic quotes are ordinary characters
# to the shell. sed has no "\d", so digits are spelled as a bracket class.
awk '{print $7}' nginx.log |
  sed -e 's/[?].*//' \
      -e 's/[0-9][0-9]*/id/g' |
  sort | uniq -c | sort -rn | head -10
}

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Rules, in order:
#   1. drop the query string, including the "?"
#   2. numeric path segments become "id", except on avatar/photo URLs, whose
#      numeric directories (e.g. the upload year) are kept
#   3. any "name." path component (a file base name) becomes "id."
url_summary(){
# The avatar/photo exclusion is a line address. A bracket expression such as
# [^avatar,phot] matches a single character and would consume the character
# in front of the slash instead of excluding those words.
awk '{print $7}' nginx.log |
  sed -E \
      -e 's/[?].*//' \
      -e '/\/(avatar|photo)\//!s#/[0-9]+#/id#g' \
      -e 's#/[0-9a-zA-Z-]*\.#/id.#g' |
  sort | uniq -c | sort -nr | head -10
}
1 个赞

更新了下:

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Topic and reply ids become "id", the query string and any "!suffix" are
# dropped, and a file base name becomes "id".
url_summary ()
{
    # The file-name class includes "-" so that UUID-style image names
    # (c54755ee-6bfd-...) collapse as well.
    awk '{print $7}' nginx.log |
        sed -e 's/\/topics\/[0-9]\+/\/topics\/id/g' \
            -e 's/\/replies\/[0-9]\+/\/replies\/id/g' \
            -e 's/?.*//g' \
            -e 's/!.*//g' \
            -e 's/\/[0-9a-zA-Z-]*\./\/id\./g' |
        sort | uniq -c | sort -rn | head -10
}

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, topic and reply ids become "id", and the base
# name of .png/.jpg/.jpeg files becomes "id".
url_summary(){
# Ids need at least one digit ("[0-9]*" also matches nothing and would turn
# /topics/abc into /topics/idabc). The dot before each extension is escaped
# so that it only matches a literal ".".
awk '{ print $7 }' nginx.log |
  sed -e 's/[?].*//' \
      -e 's/\/replies\/[0-9][0-9]*/\/replies\/id/g' \
      -e 's/\/topics\/[0-9][0-9]*/\/topics\/id/g' \
      -e 's/[0-9a-z_-]*\.png/id.png/g' \
      -e 's/[0-9a-z_-]*\.jpg/id.jpg/g' \
      -e 's/[0-9a-z_-]*\.jpeg/id.jpeg/g' |
  sort | uniq -c | sort -nr | head -10
}

# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, UUID-style names (five hex groups joined by
# "-") become "id", and numeric path segments of any length become "id".
url_summary(){
	# The UUID rule runs before the digit rule: otherwise a UUID starting
	# with digits would turn into "id..." and no longer be pure hex. Its
	# groups are hex-only, so the match cannot run across "/" or ".".
	awk '{print $7}' nginx.log |
		sed -E \
			-e 's#\?.*##' \
			-e 's#/[0-9a-fA-F]+(-[0-9a-fA-F]+){4}#/id#g' \
			-e 's#/[0-9]+#/id#g' |
		sort | uniq -c | sort -nr | head -10
}
# Print three top-10 lists from nginx.log, one after another, each as
# "<count> <value>" lines with the busiest entry first:
#   1. /topics request paths with every "/<digits>" turned into "/id"
#   2. /_img request paths with every "<name>." turned into "id."
#   3. field 11 values with the "?..." query part removed
url_summary(){
# 1. topics: keep only matching paths, then cluster numeric segments
awk '{print $7}' nginx.log |
  sed -e '/^\/topics/!d' -e 's/\/[0-9]\+/\/id/g' |
  sort |
  uniq -c |
  sort -rn |
  head -10

# 2. images: keep only matching paths, then cluster file base names
awk '{print $7}' nginx.log |
  sed -e '/^\/_img/!d' -e 's/[0-9a-zA-Z\-]\+\./id\./g' |
  sort |
  uniq -c |
  sort -rn |
  head -10

# 3. field 11 with its query string stripped
awk '{print $11}' nginx.log |
  sed -e 's/\?[0-9a-zA-Z\=\&\_\%\.]\+//g' |
  sort |
  uniq -c |
  sort -rn |
  head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Clustering is done entirely in awk on a copy of field 7.
url_summary(){

awk '{
  url = $7
  sub(/[?].*/, "", url)                       # drop the query string
  gsub(/\/[0-9]+/, "/id", url)                # numeric path segments
  gsub(/\/[0-9a-z-]+[.]png/, "/id.png", url)  # png base names
  print url
}' nginx.log | sort | uniq -c | sort -rn | head -10

}
# Print three top-10 lists from nginx.log, one after another, each as
# "<count> <value>" lines with the busiest entry first:
#   1. /topics request paths with every run of digits turned into "id"
#   2. /_img request paths with every "<name>." turned into "id."
#   3. field 11 values with the "?..." query part removed
url_summary(){
awk '{print $7}' nginx.log | sed -n '/^\/topics/p' | sed -e 's/[0-9]\+/id/g' | sort | uniq -c | sort -rn | head -10
awk '{print $7}' nginx.log | sed -n '/^\/_img/p' | sed -e 's/[0-9a-zA-Z-]\+\./id\./g' | sort | uniq -c | sort -rn | head -10
awk '{print $11}' nginx.log | sed -e 's/[?][0-9a-zA-Z=&_%.]\+//g' | sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# One sed process applies the rules in order: stand-alone numbers become
# "id", .png and .jpg base names become "id", then the query string is cut.
url_summary ()
{
    awk '{print $7}' nginx.log |
        sed -e 's/\b[0-9]\+\b/id/g' \
            -e 's/\b[0-9a-zA-Z\-]*\.png\b/id\.png/g' \
            -e 's/\b[0-9a-zA-Z\-]*\.jpg\b/id\.jpg/g' \
            -e 's/?.*//g' |
        sort |
        uniq -c |
        sort -nr |
        head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, every run of digits becomes "id", and the
# base name of .png/.jpg files becomes "id" while its directory is kept.
url_summary ()
{
      # The image rules match a single path component ("[^/]*"); a greedy
      # ".*" from the first slash would replace the whole path and merge
      # every image URL into one bucket.
      awk '{print $7}' nginx.log |
          sed -e 's/[?].*//' \
              -e 's/[0-9]\+/id/g' \
              -e 's/\/[^\/]*\.png/\/id.png/g' \
              -e 's/\/[^\/]*\.jpg/\/id.jpg/g' |
          sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# The query string is dropped, topic ids and the id in front of "/edit"
# become "id", and any "name." path component becomes "id.".
url_summary() {
    # "{" must be followed by whitespace and "}" preceded by a newline or ";",
    # otherwise Bash does not read them as the function body delimiters.
    # Ids need at least one digit so that e.g. /user/edit is left alone.
    awk '{print $7}' nginx.log |
        sed -e 's/?.*$//' \
            -e 's/topics\/[0-9][0-9]*/topics\/id/g' \
            -e 's/[0-9][0-9]*\/edit/id\/edit/g' \
            -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
        sort | uniq -c | sort -rn | head -10
}
# Print the 10 most requested URL clusters in nginx.log as "<count> <url>".
# Rules, in order: digit runs -> "id"; cut the query string; cut any
# "!suffix"; collapse the file name that follows an "/id/" directory.
url_summary ()
{
    # The final replacement starts with "\/id" so the slash consumed by the
    # pattern is written back; "\id" would glue the directory to its parent.
    awk '{print $7}' nginx.log \
        | sed -e 's/[0-9]\+/id/g' \
              -e 's/?.*//g' \
              -e 's/!.*//g' \
              -e 's/\/id\/.*\./\/id\/id./g' \
        | sort \
        | uniq -c \
        | sort -rn \
        | head -10
}