Scripts
替换class文件的方式发布爬虫
# Hot-deploy a single recompiled class (BaseSpiderService.class, taken from
# the current directory) to every host in ip_array, then restart the spider's
# Tomcat instance on that host.
ip_array=() # target hosts; empty here, fill in before running
spider='CommonSpiderService'
port=9471
class_file='BaseSpiderService.class'
remote_class_dir='/data/server/test_apps/CommonSpiderService/WEB-INF/classes/com/prime/crawler/commonspiderservice/base'
catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"

for ip in "${ip_array[@]}"; do
  # Remove the old class first, then upload the replacement.
  ssh -t -p 22 "root@${ip}" rm -f "${remote_class_dir}/${class_file}"
  sleep 1
  if ! scp -r "${class_file}" "root@${ip}:${remote_class_dir}/"; then
    # The old class is already gone; restarting now would boot Tomcat
    # without it, so leave the running instance alone on this host.
    printf 'upload of %s to %s failed; Tomcat not restarted\n' \
      "${class_file}" "${ip}" >&2
    continue
  fi
  sleep 1
  ssh -t -p 22 "root@${ip}" "${catalina}" stop -force
  # Pause between the forced stop and the start.
  sleep 10
  ssh -t -p 22 "root@${ip}" nohup "${catalina}" start
done
网站屏蔽测试
# Settings for the "is the site blocking us?" probe.
# Single quotes keep every value literal: the cookie contains "$Zllll...",
# which bash would expand as an (unset) variable inside double quotes and
# silently drop from the value.
url='http://weixin.sogou.com/weixin?query=%E5%85%B3%E9%94%AE&sourceid=inttime_day&type=2&tsn=1&interation='
# NOTE(review): hard-coded session cookie; it is only referenced by a
# commented-out curl header. Consider loading it from a file or the environment.
cookie='SUV=0086783674E4D0BE570CAF47B2933328; IPLOC=CN3100; SUID=BED0E4742624930A00000000570CAF47; SUID=BED0E4741810990A00000000570CAF47; weixinIndexVisited=1; CXID=95C63F56F467B23D9F7222E34D5C5D4E; pgv_pvi=1084406784; ld=$Zllllllll2g4he7qeD7bCteYJCg4hesNnFyskllll1llllljs@@@@@@@@@@@@@@; ABTEST=5|1463469726|v1; ad=3W9jFZllll2g93mElllllVNwJ4GlllllNnFyskllll9lllllpZlll5@@@@@@@@@@; SNUID=56380C9CE7E2D3F41A680619E802F1C5; sct=174; JSESSIONID=aaaX5ObW0L-VCgrUG3euv; LSTMV=142%2C190; LCLKINT=40614'
# Seconds to wait between two requests.
delay=60
# Substring that must appear in the returned HTML for a fetch to count as OK.
feature='class="np"'
# Holds the HTML body of the most recent fetch.
source=""
#######################################
# Fetch $url with browser-like request headers and keep the response body.
# Globals:   url (read), source (written)
# Arguments: none
# Returns:   the exit status of curl
#######################################
spiderHtml() {
  # ';' needs no escaping inside double quotes; a backslash there would be
  # sent to the server as part of the header value.
  source=$(
    curl -s \
      -H "Referer:$url" \
      -H "User-agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0" \
      -H "Connection:keep-alive" \
      -H "Accept-Language:zh-CN,zh;q=0.8,en-us;q=0.5,en;q=0.3" \
      -H "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
      "$url"
  )
}
# Disabled header; add it to the curl call above to send the session cookie:
#   -H "Cookie:$cookie"
# Probe loop: fetch the page every $delay seconds and log whether the expected
# marker ($feature) is present in the HTML. The consecutive-miss counter is
# reset on every hit; the loop stops once it exceeds 3.
flag=True
errorCount=0
{
  for i in {1..500000}; do
    spiderHtml
    sourceLen=${#source}
    currentTime=$(date +%Y-%m-%d:%H:%M:%S)
    if [[ $source != *"$feature"* ]]; then
      # Miss: count it first so the log line shows the updated counter.
      errorCount=$((errorCount + 1))
      printf '%s %s isMatch->False courceLen->%s delay->%s errorCount->%s\n' \
        "$currentTime" "$i" "$sourceLen" "$delay" "$errorCount"
      if ((errorCount > 3)); then
        break
      fi
    else
      # Hit: log the counter as it was, then reset it.
      printf '%s %s isMatch->True courceLen->%s delay->%s errorCount->%s\n' \
        "$currentTime" "$i" "$sourceLen" "$delay" "$errorCount"
      errorCount=0
    fi
    sleep "$delay"
  done
} || {
  # Runs only when the group above ends with a non-zero status.
  printf '%s\n' "something unknown happens"
  printf '%s\n' "something unknown happens" >catch
}
查看爬虫是否正常运行
# Print the last 555 lines of the CommonSpiderService Tomcat log of each host,
# prefixed by the host's address.
check_ip_array=() # hosts to inspect; empty here, fill in before running
for ip in "${check_ip_array[@]}"; do
  # Never use data as the printf format string.
  printf '%s ' "$ip"
  # Alternative yes/no check for the class name in the log:
  #ssh -t -p 22 "root@$ip" 'if grep -q BaseSpiderService /data/server/tomcats_test/tomcat7_9471_CommonSpiderService/logs/catalina.out; then echo yes; else echo no; fi'
  ssh -t -p 22 "root@$ip" 'tail -n 555 /data/server/tomcats_test/tomcat7_9471_CommonSpiderService/logs/catalina.out'
  sleep 1
done
发布爬虫
# Full deployment: wipe and re-upload the spider apps and the prepare service
# on every host, then restart the three spider Tomcats and the prepare-service
# Tomcat.
# The host list must be named ip_array, because that is what the loop reads.
ip_array=('120.24.94.155' '120.24.171.220' '120.24.166.167')
port_array=(9471 9472 9473)
spider_array=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')
prepare_catalina='/data/server/tomcats/tomcat7_8081_prepareservice/bin/catalina.sh'

for ip in "${ip_array[@]}"; do
  # The remote globs are quoted so that the REMOTE shell expands them;
  # unquoted, the local shell would expand them first whenever the same
  # paths happen to exist locally.
  ssh -t -p 22 "root@${ip}" 'rm -rf /data/server/test_apps/*'
  ssh -t -p 22 "root@${ip}" 'rm -rf /data/server/apps/*'

  # The local globs below are intentionally unquoted (expanded on this machine).
  if ! scp -r /home/paul.ge/crawler/* "root@${ip}:/data/server/test_apps/" ||
    ! scp -r /home/paul.ge/prepare/* "root@${ip}:/data/server/apps/"; then
    # Restarting on top of an incomplete upload would bring up broken apps.
    printf 'upload to %s failed; Tomcats not restarted\n' "${ip}" >&2
    continue
  fi

  for i in "${!port_array[@]}"; do
    port=${port_array[$i]}
    spider=${spider_array[$i]}
    catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
    ssh -t -p 22 "root@${ip}" "${catalina}" stop -force
    sleep 10
    ssh -t -p 22 "root@${ip}" nohup "${catalina}" start
  done

  ssh -t -p 22 "root@${ip}" "${prepare_catalina}" stop -force
  sleep 10
  ssh -t -p 22 "root@${ip}" nohup "${prepare_catalina}" start
done
修改tomcat内存
# Change the JVM memory options of the WebClientSpiderService Tomcat (port
# 9473) by rewriting the JAVA_OPTS lines in its catalina.sh, then restart it.
catalina_sh='/data/server/tomcats_test/tomcat7_9473_WebClientSpiderService/bin/catalina.sh'

# Back up first ("yes" answers a possible overwrite prompt). The in-place
# edits only run when the backup succeeded, so a failed copy never leaves a
# modified script without a .bak next to it.
if yes | cp "${catalina_sh}" "${catalina_sh}.bak"; then
  # Heap size: -Xms128m / -Xmx256m.
  sed -i "s/^JAVA_OPTS='-Xms.*$/JAVA_OPTS='-Xms128m -Xmx256m'/g" /data/server/tomcats_test/tomcat7_9473_*/bin/catalina.sh
  # Variant with smaller values, applied to all three spiders (9471-9473):
  #sed -i "s/^JAVA_OPTS='-Xms.*$/JAVA_OPTS='-Xms64m -Xmx128m'/g" /data/server/tomcats_test/tomcat7_947[123]_*/bin/catalina.sh
  # PermGen size: 128M initial / 256m maximum.
  sed -i 's/^JAVA_OPTS="\$.*$/JAVA_OPTS="\$JAVA_OPTS -server -XX:PermSize=128M -XX:MaxPermSize=256m"/g' /data/server/tomcats_test/tomcat7_9473_*/bin/catalina.sh
  # Variant with smaller values, applied to all three spiders (9471-9473):
  #sed -i 's/^JAVA_OPTS="\$.*$/JAVA_OPTS="\$JAVA_OPTS -server -XX:PermSize=64M -XX:MaxPermSize=128m"/g' /data/server/tomcats_test/tomcat7_947[123]_*/bin/catalina.sh

  "${catalina_sh}" stop -force
  sleep 10
  nohup "${catalina_sh}" start
else
  printf 'backup of %s failed; nothing was changed\n' "${catalina_sh}" >&2
fi
重启阿里云
# Restart one spider Tomcat on a remote host.
# Usage: <script> <ip><sep><port>      e.g.  120.24.94.155:9471
# <sep> is any single non-digit character; <port> selects the spider
# (9471, 9472 or 9473). Other ports are ignored.

#######################################
# Split "<ip><sep><port>" into the globals ip and port.
# Globals:   ip, port (written on success)
# Arguments: $1 - target string
# Returns:   0 if $1 contains a dotted IPv4-style address followed by a
#            separator and a port, 1 otherwise
#######################################
parse_spider_target() {
  # The dots are escaped and the separator must be a non-digit; with bare
  # '.' wildcards a plain digit string such as "1234567890" was accepted.
  local target_re='([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)[^0-9]([0-9]+)'
  [[ ${1-} =~ $target_re ]] || return 1
  ip=${BASH_REMATCH[1]}
  port=${BASH_REMATCH[2]}
}

if parse_spider_target "${1-}"; then
  port_array=(9471 9472 9473)
  spider_array=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')
  for ((i = 0; i < ${#port_array[@]}; i++)); do
    spider=${spider_array[$i]}
    if [ "$port" -eq "${port_array[$i]}" ]; then
      catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
      ssh -t -p 22 "root@${ip}" "${catalina}" stop -force
      sleep 10
      ssh -t -p 22 "root@${ip}" nohup "${catalina}" start
    fi
  done
else
  printf 'usage: %s <ip>:<port>  (port one of 9471 9472 9473)\n' "${0##*/}" >&2
fi
检查爬虫状态,如需要,重启爬虫
# Health check: request each spider's /service URL on this machine's eth1
# address and restart the matching Tomcat whenever the answer is not HTTP 200.
HOME_DR="/home/cron/check_service"
mkdir -p "$HOME_DR"
cd "$HOME_DR" || printf 'warning: cannot cd to %s\n' "$HOME_DR" >&2

# Address of eth1 as printed by ifconfig in the "inet addr:a.b.c.d " format.
ip=$(ifconfig eth1 | grep -Po '(?<=inet addr:).*?(?= )')
port_array=(9471 9472 9473)
spider_array=('CommonSpiderService' 'WechatSpiderService' 'WebClientSpiderService')

if [[ -z "$ip" ]]; then
  # With an empty address every probe fails, which would restart all
  # three Tomcats on every run; refuse to continue instead.
  printf 'cannot determine the eth1 address; skipping health check\n' >&2
else
  for i in "${!port_array[@]}"; do
    port=${port_array[$i]}
    spider=${spider_array[$i]}
    addr="http://${ip}:${port}/${spider}/service"
    # HEAD request; only the status code is kept.
    http_code=$(curl -I -o /dev/null -s -w '%{http_code}' "$addr")
    if [[ "$http_code" != "200" ]]; then
      echo "restart"
      catalina="/data/server/tomcats_test/tomcat7_${port}_${spider}/bin/catalina.sh"
      "$catalina" stop -force
      sleep 20
      "$catalina" start
    fi
  done
fi
列出文件
#######################################
# Print each sub-directory of the current directory, followed by a space and
# the listing of its contents.
# Arguments: none
# Outputs:   "<dir> " followed by the output of ls for that directory
#######################################
list_subdirs() {
  local dir
  # Glob instead of parsing ls, so names containing spaces stay intact.
  for dir in */; do
    [[ -d "$dir" ]] || continue # pattern matched nothing
    dir=${dir%/}
    # Fixed format string: a '%' in a directory name is printed literally.
    printf '%s ' "$dir"
    # List in place rather than cd in / cd out: a failed cd followed by
    # "cd .." would leave the loop in the wrong directory.
    ls -- "$dir"
  done
}

list_subdirs
微信阅读数
# POST an empty body to the WeChat getappmsgext endpoint for one article,
# sending the headers of the WeChat (MicroMessenger) Android client.
curl \
  -d '' \
  -H 'User-Agent:Mozilla/5.0 (Linux; U; Android 4.1.1; zh-cn; M032 Build/JRO03H) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 MicroMessenger/6.0.0.54_r849063.501 NetType/WIFI' \
  -H 'Connection:keep-alive' \
  -H 'Accept-Language:zh-CN, en-US' \
  -H 'X-Requested-With:com.tencent.mm' \
  -H 'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
  'http://mp.weixin.qq.com/mp/getappmsgext?__biz=MzA5OTA0NDU1NQ==&mid=2656035351&idx=4&sn=873958b95cbd2ede12d31f47cb4e698d&is_only_read=1&uin=MjYxNjgzNjMzNg==&key=77421cf58af4a65366dd4f8ad2bdd71dbc0db80fa7c75b58135d6a6095fd5d90d960b7c7920621f6f1217c8334b52b1a'
删除日志
# Delete access logs whose embedded date is more than 30 days old.
LOG_FILE_DIR='/root/willow/logs/'
# Extended regex matched against the file name; group 1 is the date. The
# character after "access_log" may be anything.
LOG_FILE_PATTERN='^access_log.([0-9]+-[0-9]+-[0-9]+)'
CURRENT_DATE=$(date +%Y-%m-%d)
SECONDS_PER_MONTH=$((30 * 24 * 60 * 60))

#######################################
# Remove dated access logs older than SECONDS_PER_MONTH from one directory.
# Globals:   LOG_FILE_PATTERN, CURRENT_DATE, SECONDS_PER_MONTH (read)
# Arguments: $1 - directory to clean (trailing slash optional)
# Outputs:   one "rm -f <path>" line per deleted file
# Returns:   1 if no directory is given or CURRENT_DATE cannot be parsed
#######################################
purge_old_logs() {
  [[ -n "${1-}" ]] || return 1
  local dir="${1%/}/"
  local path name log_date now_s log_s
  now_s=$(date +%s -d "$CURRENT_DATE") || return 1
  # Glob instead of parsing ls, so unusual file names stay intact.
  for path in "$dir"*; do
    [[ -e "$path" || -L "$path" ]] || continue # pattern matched nothing
    name=${path##*/}
    [[ "$name" =~ $LOG_FILE_PATTERN ]] || continue
    log_date=${BASH_REMATCH[1]}
    # A name such as access_log.2020-99-99 has no valid date: skip it
    # instead of breaking the arithmetic below.
    log_s=$(date +%s -d "$log_date") || continue
    if ((now_s - log_s > SECONDS_PER_MONTH)); then
      echo "rm -f $path"
      rm -f -- "$path"
    fi
  done
}

purge_old_logs "$LOG_FILE_DIR"