课后作业 url_summary
借助 sed 的统计分析，找出访问量最高的页面地址。
首先对相似的url进行聚类，聚类规则如下（可用sed或者awk实现）：
- /topics/16689/replies/124751/edit 把数字替换为 /topics/id/replies/id/edit
- /_img/uploads/photo/2018/c54755ee-6bfd-489a-8a39-81a1d7551cbd.png!large 变成 /_img/uploads/photo/2018/id.png!large
- /topics/9497 改成 /topics/id
- url中?后面的参数全部去掉
- 其他规则参考如上
输出
类似
100 urlxxx
98 urlxxx
url_summary(){
  # Exercise stub: replace the no-op below with your implementation.
  # Bash rejects a function whose body is empty, so ':' keeps the
  # template loadable until it is filled in.
  :
}
答案
# url_summary: count requests per normalized URL in ./nginx.log and print
# "<count> <url>" lines, most-requested first.
#
# Reads field 7 of each log line and clusters similar URLs:
#   1. drop the query string (everything from the first '?');
#   2. on lines matching topics...replies, turn every "/<digits>/" into "/id/";
#   3. on /photo/ paths, replace the first "/<name>." with "/id." (the
#      extension and any "!size" suffix are kept);
#   4. on uploads/user/avatar paths, cut from the first "/<digit>" onward
#      and append "/id";
#   5. on any path containing /topics, turn every remaining "/<digits>" into
#      "/id" -- without appending a slash, so /topics/9497 becomes /topics/id.
url_summary() {
  awk '{print $7}' nginx.log |
    sed \
      -e 's#?.*##' \
      -e '/topics.*replies/s#/[0-9][0-9]*/#/id/#g' \
      -e '/\/photo\//s#/[a-z0-9-]*\.#/id.#' \
      -e '/uploads\/user\/avatar/s#/[0-9].*#/id#' \
      -e '/\/topics/s#/[0-9][0-9]*#/id#g' |
    sort | uniq -c | sort -nr
}
cream
(Hei蛋炒饭)
2
# Top 10 /topics paths from ./nginx.log, with every run of 1-9 digits
# replaced by "id".
awk '{print $7}' nginx.log |
  sed -n '/^\/topics/p' |
  sed -e 's/[0-9]\{1,9\}/id/g' |
  sort | uniq -c | sort -rn | head -10

# Top 10 /_img paths, with "<name>.<png|jpg|jpeg|gif>" collapsed to "id"
# (the extension is dropped by the replacement).
awk '{print $7}' nginx.log |
  sed -n '/^\/_img/p' |
  sed -e 's/[0-9a-zA-Z\-]*\.\(png\|jpg\|jpeg\|gif\)/id/g' |
  sort | uniq -c | sort -rn | head -10

# Field 11 values that contain "url", with the "?..." parameter run removed.
# NOTE(review): field 11 looks like the referer in the usual nginx combined
# format, and the leading \? relies on GNU sed treating it as a literal '?'
# -- confirm both.
# uniq -c only merges adjacent duplicates, so the input is sorted first, and
# the result is ranked like the two pipelines above.
awk '{print $11}' nginx.log |
  sed -n '/url/p' |
  sed -e 's/\(\?[0-9a-zA-Z\=\&\_\-]*\)//g' |
  sort | uniq -c | sort -rn | head -10
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. drop the query string first, so that dots inside parameters cannot be
#      picked up by the greedy rule in step 3;
#   2. replace every "/<digits>" with "/id";
#   3. collapse everything between an "id/" and the last "." to "id/id."
#      (this folds file names that follow a numeric directory).
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/?.*//' \
        -e 's/\/[0-9]\+/\/id/g' \
        -e 's/id\/.*\./id\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
# url_summary2: top 10 normalized URLs (field 7 of ./nginx.log) with counts.
# Steps, in order: "/<digits>" -> "/id"; anything between "id/" and
# ".png!large" -> "id/id.png!large"; then cut from the first '?'.
url_summary2() {
  awk '{print $7}' nginx.log \
    | sed \
        -e 's/\/[0-9]\+/\/id/g' \
        -e 's/id\/.*\.png!large/id\/id.png\!large/g' \
        -e 's/?.*//g' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. every run of digits becomes "id";
#   2. everything from the first '?' is dropped;
#   3. everything from the first '!' is dropped;
#   4. the text between "/id/" and the last "." collapses to "/id/id.".
#      The replacement keeps the leading slash, so ".../photo/id/x.png"
#      becomes ".../photo/id/id.png" rather than ".../photoid/id.png".
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/[0-9]\+/id/g' \
        -e 's/?.*//g' \
        -e 's/!.*//g' \
        -e 's/\/id\/.*\./\/id\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
# url_summary: top 10 normalized URLs (field 7 of ./nginx.log) with counts.
# The substitutions run in this order on each URL:
#   digits -> "id"; cut at '?'; "/id/<anything>." -> "/id/id.";
#   cut at '!'; cut at '%'.
url_summary() {
  awk '{print $7}' nginx.log \
    | sed \
        -e 's/[0-9]\+/id/g' \
        -e 's/?.*//g' \
        -e 's/\/id\/.*\./\/id\/id\./g' \
        -e 's/!.*//g' \
        -e 's/%.*//g' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. every non-empty run of digits becomes "id" (the pattern requires at
#      least one digit; "[0-9]*" would also match the empty string and
#      insert "id" between every character);
#   2. everything from the first '?' is dropped;
#   3. every "/<alnum-or-dash name>." becomes "/id." (file names).
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/[0-9][0-9]*/id/g' \
        -e 's/?.*//' \
        -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. every "/<digits>" becomes "/id" -- no trailing slash is required, so
#      an id at the very end of the path (/topics/9497) is covered too;
#   2. everything from the first '?' is dropped;
#   3. every "/<alnum-or-dash name>." becomes "/id." (file names).
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/\/[0-9][0-9]*/\/id/g' \
        -e 's/?.*//' \
        -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
kers
(坨坨君)
8
# url_summary: top 10 normalized URLs (field 7 of ./nginx.log) with counts.
# Order of rewrites: every "/<digits>" -> "/id"; the first
# "/<alnum-or-dash name>." -> "/id."; finally the query-string rule.
url_summary() {
  awk '{print $7}' nginx.log \
    | sed \
        -e 's/\/[0-9]\+/\/id/g' \
        -e 's/\/[0-9a-zA-Z\-]\+\./\/id./' \
        -e 's/\?.*//' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7: every run of digits becomes "id", then
# everything from the first '?' is dropped.
# Plain ASCII quotes are required: typographic quotes are ordinary
# characters to the shell, so $7 would be expanded by the shell instead of
# by awk. sed has no \d class, hence [0-9].
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/[0-9][0-9]*/id/g' \
        -e 's/?.*//' |
    sort | uniq -c | sort -rn | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. drop the query string entirely (everything from the first '?');
#   2. on paths that do NOT contain /avatar/ or /photo/, replace every
#      "/<digits>" with "/id". The exclusion is done with a line address:
#      a bracket expression such as [^avatar,phot] only matches a single
#      character and that character is consumed by the substitution.
#      NOTE(review): skipping avatar/photo paths is my reading of what the
#      original character class was meant to do -- confirm.
#   3. replace every "/<alnum-or-dash name>." with "/id." (file names).
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/?.*//' \
        -e '/\/\(avatar\|photo\)\//!s/\/[0-9][0-9]*/\/id/g' \
        -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
    sort | uniq -c | sort -nr | head -10
}
1 个赞
更新了下:
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. "/topics/<digits>"  -> "/topics/id";
#   2. "/replies/<digits>" -> "/replies/id";
#   3. everything from the first '?' is dropped;
#   4. everything from the first '!' is dropped;
#   5. every "/<alnum-or-dash name>." becomes "/id.". The class includes
#      '-' so that UUID-style file names collapse as well.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/\/topics\/[0-9]\+/\/topics\/id/g' \
        -e 's/\/replies\/[0-9]\+/\/replies\/id/g' \
        -e 's/?.*//g' \
        -e 's/!.*//g' \
        -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
Hurt
(hurt)
12
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. drop the query string (everything from the first '?');
#   2. "/replies/<digits>" -> "/replies/id" (no trailing slash required);
#   3. "/topics/<digits>"  -> "/topics/id" (at least one digit required, so
#      non-numeric children such as /topics/last are left alone);
#   4. "<name>.png|.jpg|.jpeg" -> "id.<ext>", where <name> is made of
#      lowercase letters, digits, '_' and '-'; the dot is matched literally.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/?.*//' \
        -e 's/\/replies\/[0-9][0-9]*/\/replies\/id/g' \
        -e 's/\/topics\/[0-9][0-9]*/\/topics\/id/g' \
        -e 's/[0-9a-z_-]*\.png/id.png/g' \
        -e 's/[0-9a-z_-]*\.jpg/id.jpg/g' \
        -e 's/[0-9a-z_-]*\.jpeg/id.jpeg/g' |
    sort | uniq -c | sort -nr | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7 (extended regular expressions):
#   1. drop the query string (everything from the first '?');
#   2. every "/<digits>" becomes "/id", whatever the number of digits;
#   3. a path segment made of five dash-separated groups (UUID-like) becomes
#      "/id". The groups exclude '/' so the match stays inside one segment,
#      and the last group stops at '.' so the file extension is kept.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -E \
        -e 's#\?.*##' \
        -e 's#/[0-9]+#/id#g' \
        -e 's#/([^/-]+-){4}[^/.]+#/id#g' |
    sort | uniq -c | sort -nr | head -10
}
xml
(xiaomaolv)
14
# url_summary: print three independent top-10 lists from ./nginx.log, each
# as "<count> <value>" lines in descending count order:
#   1. field-7 values starting with /topics, with "/<digits>" -> "/id";
#   2. field-7 values starting with /_img, with "<alnum-or-dash name>."
#      -> "id.";
#   3. field-11 values after the "?..." parameter rule has been applied.
url_summary() {
  awk '{print $7}' nginx.log \
    | sed -e '/^\/topics/!d' -e 's/\/[0-9]\+/\/id/g' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10

  awk '{print $7}' nginx.log \
    | sed -e '/^\/_img/!d' -e 's/[0-9a-zA-Z\-]\+\./id\./g' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10

  awk '{print $11}' nginx.log \
    | sed -e 's/\?[0-9a-zA-Z\=\&\_\%\.]\+//g' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -10
}
xym
(徐燕民)
15
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7, done in awk on a copy of the field:
#   1. drop the query string; '?' is written as [?] because a regular
#      expression that starts with a bare '?' is not portable across awks;
#   2. every "/<digits>" becomes "/id" (gsub is a no-op when nothing
#      matches, so no match() guard is needed);
#   3. "/<lowercase-alnum-or-dash name>.png" becomes "/id.png", with the
#      dot matched literally.
url_summary() {
  awk '{
    url = $7
    sub(/[?].*/, "", url)
    gsub(/\/[0-9]+/, "/id", url)
    gsub(/\/[0-9a-z-]+\.png/, "/id.png", url)
    print url
  }' nginx.log |
    sort | uniq -c | sort -rn | head -10
}
# url_summary: print three independent lists from ./nginx.log, each as
# "<count> <value>" lines:
#   1. field-7 values starting with /topics, digits replaced by "id";
#      the 10 highest counts, in ascending order (sort -n | tail);
#   2. field-7 values starting with /_img, "<alnum-or-dash name>."
#      replaced by "id."; the 10 highest counts, in ascending order;
#   3. field-11 values after the "?..." parameter rule; the 10 highest
#      counts, in descending order.
# NOTE(review): field 11 looks like the referer in the usual nginx combined
# format rather than the request URL -- confirm this is what was intended.
# The function body is closed with '}'; an opening brace in that position
# leaves the definition unterminated.
url_summary(){
  awk '{print $7}' nginx.log | sed -n '/^\/topics/p' | sed -e 's/[0-9]\+/id/g' | sort | uniq -c | sort -n | tail -10
  awk '{print $7}' nginx.log | sed -n '/^\/_img/p' | sed -e 's/[0-9a-zA-Z-]\+\./id\./g' | sort | uniq -c | sort -n | tail -10
  awk '{print $11} ' nginx.log | sed -e 's/\?[0-9a-zA-Z=\&\_\%\.]\+//g' | sort | uniq -c | sort -rn | head -10
}
Yuki
(Momo)
17
# url_summary: top 10 normalized URLs (field 7 of ./nginx.log) with counts.
# Rewrites applied in order to each URL:
#   whole-word digit runs -> "id"; "<alnum-or-dash name>.png" -> "id.png";
#   "<alnum-or-dash name>.jpg" -> "id.jpg"; cut from the first '?'.
url_summary() {
  awk '{print $7}' nginx.log \
    | sed \
        -e 's/\b[0-9]\+\b/id/g' \
        -e 's/\b[0-9a-zA-Z\-]*\.png\b/id\.png/g' \
        -e 's/\b[0-9a-zA-Z\-]*\.jpg\b/id\.jpg/g' \
        -e 's/?.*//g' \
    | sort \
    | uniq -c \
    | sort -nr \
    | head -10
}
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. drop the query string (everything from the first '?');
#   2. every run of digits becomes "id";
#   3. the last path segment before ".png" / ".jpg" becomes "id". The
#      segment pattern excludes '/', so only the file name is replaced and
#      the directory part of the URL is kept.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/?.*//' \
        -e 's/[0-9]\+/id/g' \
        -e 's#/[^/]*\.png#/id.png#g' \
        -e 's#/[^/]*\.jpg#/id.jpg#g' |
    sort | uniq -c | sort -rn | head -10
}
Akien
(Akien)
19
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. "topics/<digits>" -> "topics/id";
#   2. "<digits>/edit"   -> "id/edit";
#      (both require at least one digit, so an already-normalized
#      "/topics/id/edit" is not turned into "/topics/idid/edit")
#   3. everything from the first '?' is dropped;
#   4. every "/<alnum-or-dash name>." becomes "/id." (file names).
# Bash needs whitespace after '{' and a newline or ';' before '}'.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/topics\/[0-9][0-9]*/topics\/id/g' \
        -e 's/[0-9][0-9]*\/edit/id\/edit/g' \
        -e 's/?.*$//g' \
        -e 's/\/[0-9a-zA-Z-]*\./\/id./g' |
    sort | uniq -c | sort -rn | head -10
}
to_yang
(to_yang)
20
# url_summary: print the 10 most requested normalized URLs in ./nginx.log
# as "<count> <url>" lines, highest count first.
#
# Normalization of field 7:
#   1. every run of digits becomes "id";
#   2. everything from the first '?' is dropped;
#   3. everything from the first '!' is dropped;
#   4. the text between "/id/" and the last "." collapses to "/id/id.".
#      The replacement starts with an escaped slash so the path separator
#      in front of "id" is preserved.
url_summary() {
  awk '{print $7}' nginx.log |
    sed -e 's/[0-9]\+/id/g' \
        -e 's/?.*//g' \
        -e 's/!.*//g' \
        -e 's/\/id\/.*\./\/id\/id./g' |
    sort | uniq -c | sort -rn | head -10
}