Scripts

替换class文件的方式发布爬虫

# Hot-patch a single compiled class on every host in ip_array, then bounce
# the spider's Tomcat so the new class is loaded.
# ip_array is empty in this snippet; fill in the target hosts before running.
ip_array=()
spider='CommonSpiderService'
port=9471
class_file='BaseSpiderService.class'
class_dir='/data/server/test_apps/CommonSpiderService/WEB-INF/classes/com/prime/crawler/commonspiderservice/base'
catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
for ip in "${ip_array[@]}"; do
    # Remove the old class first, then copy the freshly built one in its place.
    ssh -t -p 22 "root@${ip}" rm -f "${class_dir}/${class_file}"
    sleep 1
    scp -r "${class_file}" "root@${ip}:${class_dir}/"
    sleep 1
    # Pause between stop and start, as the original sequence does.
    ssh -t -p 22 "root@${ip}" "${catalina}" stop -force
    sleep 10
    ssh -t -p 22 "root@${ip}" nohup "${catalina}" start
done

网站屏蔽测试

# Settings for the site-blocking probe.
# Single quotes keep every value literal: in particular the cookie contains
# "$Zllll..." which double quotes would expand as an (unset) variable and
# silently drop from the cookie value.
url='http://weixin.sogou.com/weixin?query=%E5%85%B3%E9%94%AE&sourceid=inttime_day&type=2&tsn=1&interation='
cookie='SUV=0086783674E4D0BE570CAF47B2933328; IPLOC=CN3100; SUID=BED0E4742624930A00000000570CAF47; SUID=BED0E4741810990A00000000570CAF47; weixinIndexVisited=1; CXID=95C63F56F467B23D9F7222E34D5C5D4E; pgv_pvi=1084406784; ld=$Zllllllll2g4he7qeD7bCteYJCg4hesNnFyskllll1llllljs@@@@@@@@@@@@@@; ABTEST=5|1463469726|v1; ad=3W9jFZllll2g93mElllllVNwJ4GlllllNnFyskllll9lllllpZlll5@@@@@@@@@@; SNUID=56380C9CE7E2D3F41A680619E802F1C5; sct=174; JSESSIONID=aaaX5ObW0L-VCgrUG3euv; LSTMV=142%2C190; LCLKINT=40614'
# Seconds to wait between two requests.
delay=60
# Substring that must appear in the response for the fetch to count as a match.
feature='class="np"'
# Holds the body of the most recent response.
source=""
# Fetch $url with browser-like request headers and store the response body
# in the global variable `source`.
# Globals: url (read), source (written)
# Note: inside double quotes a backslash before ';' is kept literally, so the
# header values are written without it; the Accept header uses the standard
# "xhtml+xml" media type and "q=" weights.
function spiderHtml(){
    source=$( curl -s \
    -H "Referer:$url" \
    -H "User-agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0" \
    -H "Connection:keep-alive" \
    -H "Accept-Language:zh-CN,zh;q=0.8,en-us;q=0.5,en;q=0.3" \
    -H "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
    "$url" )
}
# -H "Cookie:$cookie" \
# Probe loop: fetch the page up to 500000 times, logging for each attempt
# whether the response contains $feature. The failure counter resets on a
# match; the loop stops once it exceeds 3.
flag=True
errorCount=0
{
    for (( i = 1; i <= 500000; i++ )); do
        spiderHtml
        sourceLen=${#source}
        currentTime=$(date +%Y-%m-%d:%H:%M:%S)
        case $source in
            *"$feature"*)
                printf '%s %s isMatch->True courceLen->%s delay->%s errorCount->%s\n' \
                    "$currentTime" "$i" "$sourceLen" "$delay" "$errorCount"
                errorCount=0
                ;;
            *)
                errorCount=$(( errorCount + 1 ))
                printf '%s %s isMatch->False courceLen->%s delay->%s errorCount->%s\n' \
                    "$currentTime" "$i" "$sourceLen" "$delay" "$errorCount"
                # Give up once the failure counter goes above 3.
                if (( errorCount > 3 )); then
                    break
                fi
                ;;
        esac
        sleep $delay
    done
} || {
    # Runs only when the group above ends with a non-zero status.
    echo "something unknown happens"
    echo "something unknown happens">catch
}

查看爬虫是否正常运行

# Print the last 555 lines of the CommonSpiderService Tomcat log for each host.
# The host list after `in` is empty in this snippet, so the loop body does not
# run as written; add the target IPs before using it.
for ip in ; do
  # Pass the address as data, not as the printf format string.
  printf '%s ' "$ip"
  #ssh -t -p 22 root@$ip 'if grep -q BaseSpiderService /data/server/tomcats_test/tomcat7_9471_CommonSpiderService/logs/catalina.out; then echo yes; else echo no; fi'
  ssh -t -p 22 "root@${ip}" 'tail -555 /data/server/tomcats_test/tomcat7_9471_CommonSpiderService/logs/catalina.out'
  sleep 1
done

发布爬虫

# Full redeploy: wipe the remote app directories, upload the new builds, then
# restart the three spider Tomcats and the prepare-service Tomcat on each host.
# The host list was previously assigned to `p_array` while the loop iterated
# over `ip_array`; the assignment now uses the name the loop reads.
ip_array=('120.24.94.155' '120.24.171.220' '120.24.166.167')
port_array=(9471 9472 9473)
spider_array=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')
prepare_catalina='/data/server/tomcats/tomcat7_8081_prepareservice/bin/catalina.sh'

for ip in "${ip_array[@]}"; do
    # The globs are quoted so they are expanded by the remote shell, never
    # against the local filesystem.
    ssh -t -p 22 "root@${ip}" rm -rf '/data/server/test_apps/*'
    ssh -t -p 22 "root@${ip}" rm -rf '/data/server/apps/*'

    # These globs are local on purpose: they select the files to upload.
    scp -r /home/paul.ge/crawler/* "root@${ip}:/data/server/test_apps/"
    scp -r /home/paul.ge/prepare/* "root@${ip}:/data/server/apps/"
    for i in "${!port_array[@]}"; do
        port=${port_array[$i]}
        spider=${spider_array[$i]}
        catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
        ssh -t -p 22 "root@${ip}" "${catalina}" stop -force
        sleep 10
        ssh -t -p 22 "root@${ip}" nohup "${catalina}" start
    done

    ssh -t -p 22 "root@${ip}" "${prepare_catalina}" stop -force
    sleep 10
    ssh -t -p 22 "root@${ip}" nohup "${prepare_catalina}" start
done

修改tomcat内存

# Back up catalina.sh of the WebClientSpiderService Tomcat, rewrite its two
# JAVA_OPTS lines (heap and PermGen sizes), then restart that Tomcat.
tomcat_catalina=/data/server/tomcats_test/tomcat7_9473_WebClientSpiderService/bin/catalina.sh

# `yes` answers any overwrite prompt cp may ask.
yes | cp "${tomcat_catalina}" "${tomcat_catalina}.bak"

# Heap size: replace the whole line that starts with JAVA_OPTS='-Xms
sed -i "s/^JAVA_OPTS='-Xms.*$/JAVA_OPTS='-Xms128m -Xmx256m'/g" /data/server/tomcats_test/tomcat7_9473_*/bin/catalina.sh
#sed -i "s/^JAVA_OPTS='-Xms.*$/JAVA_OPTS='-Xms64m -Xmx128m'/g" /data/server/tomcats_test/tomcat7_947[123]_*/bin/catalina.sh

# PermGen size: replace the whole line that starts with JAVA_OPTS="$
sed -i 's/^JAVA_OPTS="\$.*$/JAVA_OPTS="\$JAVA_OPTS -server -XX:PermSize=128M -XX:MaxPermSize=256m"/g' /data/server/tomcats_test/tomcat7_9473_*/bin/catalina.sh
#sed -i 's/^JAVA_OPTS="\$.*$/JAVA_OPTS="\$JAVA_OPTS -server -XX:PermSize=64M -XX:MaxPermSize=128m"/g' /data/server/tomcats_test/tomcat7_947[123]_*/bin/catalina.sh

"${tomcat_catalina}" stop -force
sleep 10
nohup "${tomcat_catalina}" start

重启阿里云

#######################################
# Restart the spider Tomcat that belongs to a given port on a remote host.
# Arguments:
#   $1 - target as "<ipv4><sep><port>", e.g. 120.24.94.155:9471, where <sep>
#        is any single non-digit character.
# Behaviour:
#   Does nothing when the target cannot be parsed or the port is not one of
#   the known spider ports (9471, 9472, 9473).
# Returns: 0 when nothing was done; otherwise the status of the last command.
#######################################
restart_remote_spider() {
  local target=${1:-}
  local -a ports=(9471 9472 9473)
  local -a spiders=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')
  # Dots are escaped and the separator must be a non-digit; otherwise a bare
  # address such as 120.24.94.155 would be split into ip=120.24.94.1, port=5.
  local -r target_re='([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)[^0-9]([0-9]+)'
  local host port spider catalina idx

  [[ $target =~ $target_re ]] || return 0
  host=${BASH_REMATCH[1]}
  port=${BASH_REMATCH[2]}

  for (( idx = 0; idx < ${#ports[@]}; idx++ )); do
    # `[ -eq ]` compares in decimal, so a port with a leading zero still works.
    [ "$port" -eq "${ports[idx]}" ] || continue
    spider=${spiders[idx]}
    catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
    ssh -t -p 22 "root@${host}" "${catalina}" stop -force
    sleep 10
    ssh -t -p 22 "root@${host}" nohup "${catalina}" start
  done
}

restart_remote_spider "${1:-}"

检查爬虫状态,如需要,重启爬虫

# Health check: probe each local spider over HTTP and restart
# its Tomcat when the service URL does not answer with status 200.
HOME_DR="/home/cron/check_service"
# mkdir -p succeeds silently when the directory already exists.
mkdir -p "$HOME_DR"

cd "$HOME_DR" || echo "warning: cannot cd to $HOME_DR" >&2

# Address of eth1, taken from the "inet addr:" field of ifconfig output.
ip=$(ifconfig eth1 | grep -Po '(?<=inet addr:).*?(?=  )')
if [[ -z "$ip" ]]; then
  # Without an address every probe would fail and every Tomcat would be
  # restarted, so stop here instead.
  echo "error: could not determine the address of eth1" >&2
  exit 1
fi

port_array=(9471 9472 9473)
spider_array=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')

for i in "${!port_array[@]}"; do
  port=${port_array[$i]}
  spider=${spider_array[$i]}
  addr="http://${ip}:${port}/${spider}/service"
  # -I sends a HEAD request; only the status code is kept.
  http_code=$(curl -I -o /dev/null -s -w '%{http_code}' "$addr")
  if [[ "$http_code" != "200" ]]; then
    echo "restart"
    catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
    "$catalina" stop -force
    sleep 20
    "$catalina" start
  fi
done

列出文件

#######################################
# For every directory in the current working directory, print its name
# followed by a space and then the listing of its contents.
# Outputs: writes the listing to stdout.
#######################################
list_subdirs() {
  local entry
  # Glob instead of parsing `ls`, so names with spaces stay intact.
  for entry in *; do
    [[ -d "$entry" ]] || continue
    printf '%s ' "$entry"
    # The subshell keeps the caller's working directory untouched even when
    # the directory cannot be entered (no stray `cd ..`).
    ( cd -- "$entry" && ls . )
  done
}

list_subdirs

微信阅读数

# POST an empty body to the getappmsgext URL of mp.weixin.qq.com, sending
# headers that present the request as coming from the WeChat Android client.
curl -d "" \
  -H "User-Agent:Mozilla/5.0 (Linux; U; Android 4.1.1; zh-cn; M032 Build/JRO03H) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 MicroMessenger/6.0.0.54_r849063.501 NetType/WIFI" \
  -H "Connection:keep-alive" \
  -H "Accept-Language:zh-CN, en-US" \
  -H "X-Requested-With:com.tencent.mm" \
  -H "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
  "http://mp.weixin.qq.com/mp/getappmsgext?__biz=MzA5OTA0NDU1NQ==&mid=2656035351&idx=4&sn=873958b95cbd2ede12d31f47cb4e698d&is_only_read=1&uin=MjYxNjgzNjMzNg==&key=77421cf58af4a65366dd4f8ad2bdd71dbc0db80fa7c75b58135d6a6095fd5d90d960b7c7920621f6f1217c8334b52b1a"

删除日志


# Delete access logs whose date suffix is more than 30 days in the past.
LOG_FILE_DIR='/root/willow/logs/'
# Extended regex; capture group 1 is the date part of the file name.
LOG_FILE_PATTERN='^access_log\.([0-9]+-[0-9]+-[0-9]+)'
CURRENT_DATE=$(date +%Y-%m-%d)
SECONDS_PER_MONTH=$(( 30 * 24 * 60 * 60 ))

#######################################
# Remove dated access logs older than SECONDS_PER_MONTH from a directory.
# Globals:   LOG_FILE_DIR, LOG_FILE_PATTERN, CURRENT_DATE, SECONDS_PER_MONTH (read)
# Arguments: $1 - directory to clean (defaults to LOG_FILE_DIR)
# Outputs:   prints "rm -f <path>" for every file it removes
# Returns:   non-zero only when CURRENT_DATE cannot be converted to a timestamp
#######################################
purge_old_logs() {
  local dir=${1:-$LOG_FILE_DIR}
  local now_epoch log_path log_file log_date log_epoch

  # Normalise to exactly one trailing slash so paths concatenate cleanly.
  dir="${dir%/}/"
  now_epoch=$(date +%s -d "$CURRENT_DATE") || return 1

  for log_path in "$dir"*; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$log_path" ]] || continue
    log_file=${log_path##*/}
    [[ $log_file =~ $LOG_FILE_PATTERN ]] || continue
    log_date=${BASH_REMATCH[1]}
    # Skip names whose date part is not a real date instead of breaking the
    # arithmetic below.
    log_epoch=$(date +%s -d "$log_date" 2>/dev/null) || continue
    if (( now_epoch - log_epoch > SECONDS_PER_MONTH )); then
      echo "rm -f $log_path"
      rm -f -- "$log_path"
    fi
  done
}

purge_old_logs "$LOG_FILE_DIR"

results matching ""

    No results matching ""