#!/usr/bin/env bash
###############################################################################
# name:Robot
# date:2012-11-09
# desc:download porn pictures from baixingsex
###############################################################################
# Exit codes returned by Robot.sh.
readonly FIRSTUSE=64         # started without any argument (usage shown)
readonly USER_CHOOSE_QUIT=65 # user answered "n"/"no" at the confirmation
readonly UNKNOWN_CHOICE=66   # confirmation answer was neither yes nor no
readonly BAD_ARGUMENTS=67    # command-line arguments failed validation
###############################################################################
# Usage: print the command-line synopsis and the table of numeric -t types.
# FIXME: use literal instead of numeric
# Arguments: none
# Outputs:   the help text on stdout
###############################################################################
function Usage {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'EOF'
Usage: ./Robot.sh [-o outdir] [-t type] [-s startpage] [-e endpage]
 -t 42 zipai
    43 siwa
    44 toupai
    45 yazhou
    46 oumei
    47 linglei
    48 mingxing
EOF
}
###############################################################################
# RobotImpl: loop over the forum index pages and, for each thread listed on a
# page, download every image referenced by that thread.
#
# Arguments:
#   $1 - output directory (created when missing); it becomes the working
#        directory and receives one sub-directory per thread title plus a
#        "cache" directory holding the fetched HTML pages
#   $2 - numeric forum type, used to build forum-<type>-<page>.html
#   $3 - first index page (inclusive)
#   $4 - last index page (inclusive)
# Outputs:
#   stdout: "<page url>\t<cache file name>" per page and
#           "<thread url>\t<title>" per thread
#   stderr: one line per download that failed
# Returns:
#   1 when the output directory cannot be created/entered, 0 otherwise
###############################################################################

# Download URL $1 to file $2 unless $2 already exists (cache hit).
# The transfer goes to "$2.part" (resumed with -c on a later run) and is only
# renamed to $2 once wget succeeds, so an aborted download is never mistaken
# for a complete cached file.
_robot_fetch() {
  local url=$1 dest=$2
  [[ -e "$dest" ]] && return 0
  # retry 5 times, waiting 30s; stdin detached so the caller's read loops
  # can never be consumed by the downloader
  if wget -t 5 -w 30 -c "$url" -O "${dest}.part" </dev/null >/dev/null 2>&1; then
    mv -f -- "${dest}.part" "$dest"
  else
    return 1
  fi
}

# Print "<thread url>\t<title>" for every thread link found between a
# '<tbody id="normalthread' line and the following '</tbody>' in file $1.
_robot_list_threads() {
  awk '
    /<tbody id="normalthread/ {
      # stop at end of file instead of spinning forever on truncated HTML
      if ((getline line) <= 0) exit
      while (line !~ /<\/tbody>/) {
        if ((getline line) <= 0) exit
        if (line ~ /atarget/) {
          split(line, a, "\"")                      # a[2]: first quoted value
          rst = substr(line, index(line, ">") + 1)  # text after the first tag
          to = index(rst, "<")
          printf("%s\t%s\n", a[2], substr(rst, 1, to - 1))
        }
      }
      next
    }
  ' "$1"
}

function RobotImpl {
  local dir=$1
  local type=$2
  local s=$3
  local e=$(($4 + 1)) # exclusive upper bound of the page loop
  # cache wget files
  local cache="cache"
  local urlPrefix="http://news.baisex.me/forum-${type}-"
  local urlSuffix=".html"
  local curPage abslUrl pageHtml
  local threadUrl titlename threadFile
  local each_pic_url ext picIdx

  # change working-directory
  mkdir -p -- "${dir}/${cache}" || return 1
  cd -- "$dir" || return 1

  for ((curPage = s; curPage < e; curPage++)); do
    abslUrl="${urlPrefix}${curPage}${urlSuffix}"
    pageHtml="${type}-${curPage}.html"
    printf '%s\t%s\n' "$abslUrl" "$pageHtml"

    if ! _robot_fetch "$abslUrl" "${cache}/${pageHtml}"; then
      printf 'RobotImpl: cannot fetch %s\n' "$abslUrl" >&2
      continue
    fi

    # Split on the tab only, so titles containing spaces stay intact.
    while IFS=$'\t' read -r threadUrl titlename; do
      [[ -n "$threadUrl" && -n "$titlename" ]] || continue
      printf '%s\t%s\n' "$threadUrl" "$titlename"

      # A "/" in a title or url would silently create nested paths.
      titlename=${titlename//\//_}
      mkdir -p -- "$titlename" || continue

      # download each thread content without img (cached like index pages)
      threadFile="${cache}/${threadUrl//\//_}"
      if ! _robot_fetch "http://news.baisex.me/${threadUrl}" "$threadFile"; then
        printf 'RobotImpl: cannot fetch thread %s\n' "$threadUrl" >&2
        continue
      fi

      # Every line containing "onload" contributes its first quoted value as
      # a picture url. The index counts every such line, even skipped ones,
      # so numbering stays stable across re-runs.
      picIdx=0
      while IFS= read -r each_pic_url; do
        picIdx=$((picIdx + 1))
        [[ -n "$each_pic_url" ]] || continue
        ext=${each_pic_url##*.}
        ext=${ext%%[?#]*} # drop a query string / fragment
        if [[ -z "$ext" || "$ext" == */* ]]; then
          ext="jpg" # the url had no usable file extension
        fi
        _robot_fetch "$each_pic_url" "${titlename}/${picIdx}.${ext}" ||
          printf 'RobotImpl: cannot fetch picture %s\n' "$each_pic_url" >&2
      done < <(awk '/onload/ { split($0, a, "\""); print a[2] }' "$threadFile")
    done < <(_robot_list_threads "${cache}/${pageHtml}")
  done
  return 0
}
###############################################################################
# Robot: check arguments and call RobotImpl.
#
# Arguments (any order, unknown words are ignored):
#   -o outdir     output directory      (default /mnt/windows/baixingsex)
#   -t type       numeric type 42..48   (default 42)
#   -s startpage  first page            (default 1)
#   -e endpage    last page, >= start   (default 10)
#   -y            skip the "Are you sure?" prompt (for unattended restarts)
# Globals read: FIRSTUSE, USER_CHOOSE_QUIT, UNKNOWN_CHOICE, BAD_ARGUMENTS
#   (the literal codes 64..67 are used when they are unset)
# Side effects: writes the original argument list to ./Robot.arg right before
#   the download starts, so a crashed run can be restarted with the same
#   arguments.
# Exits: FIRSTUSE without arguments, BAD_ARGUMENTS on invalid values,
#   USER_CHOOSE_QUIT / UNKNOWN_CHOICE depending on the prompt answer.
# Returns: the status of RobotImpl when the download was started.
###############################################################################
function Robot {
  # set arguments default value
  local dir=/mnt/windows/baixingsex
  local type=42
  local start=1
  local end=10
  local assume_yes=0
  local y_or_n
  local saved_args="$*"
  local -a table=(zipai siwa toupai yazhou oumei linglei mingxing)

  if [[ $# -eq 0 ]]; then
    Usage
    exit "${FIRSTUSE:-64}"
  fi

  # parse command-line arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -o | -t | -s | -e)
        # every one of these options needs a value
        if [[ $# -lt 2 ]]; then
          Usage
          exit "${BAD_ARGUMENTS:-67}"
        fi
        case "$1" in
          -o) dir=$2 ;;
          -t) type=$2 ;;
          -s) start=$2 ;;
          -e) end=$2 ;;
        esac
        shift 2
        ;;
      -y)
        assume_yes=1
        shift
        ;;
      *) # default
        shift
        ;;
    esac
  done

  # check arguments: the regex guard rejects non-numeric input outright;
  # 10# forces base 10 so "042" is not read as octal
  if [[ -z "$dir" ]] ||
    ! [[ "$type" =~ ^[0-9]+$ && "$start" =~ ^[0-9]+$ && "$end" =~ ^[0-9]+$ ]] ||
    ((10#$type < 42 || 10#$type > 48 || 10#$start > 10#$end)); then
    Usage
    exit "${BAD_ARGUMENTS:-67}"
  fi
  type=$((10#$type))
  start=$((10#$start))
  end=$((10#$end))

  printf '%s\n' "dir=$dir type=${table[type - 42]} start=$start end=$end"
  if ((assume_yes)); then
    y_or_n=y
  else
    printf '%s' "Are you sure?[y/n]"
    read -r y_or_n
  fi

  case "$y_or_n" in
    y | yes)
      # save arg to Robot.arg: in case of crash we could restart it
      printf '%s\n' "$saved_args" > Robot.arg
      RobotImpl "$dir" "$type" "$start" "$end"
      ;;
    n | no)
      exit "${USER_CHOOSE_QUIT:-65}"
      ;;
    *)
      exit "${UNKNOWN_CHOICE:-66}"
      ;;
  esac
}
###############################################################################
# main entry: forward the script's arguments to Robot unchanged.
# "$@" keeps each argument as one word (an -o value containing spaces
# survives), whereas an unquoted $* would re-split and glob-expand them.
###############################################################################
Robot "$@"
#!/usr/bin/env bash
###############################################################################
# name:Daemon.sh
# date:2012-11-09
# desc:run in background and monitor Robot process restart it when dead
###############################################################################
ROBOT=Robot.sh # script name looked for in the process list
LOG=Robot.log  # file receiving the dead/restart/alive records
ARG=Robot.arg  # file holding the arguments of the last Robot run
# Append one "<YYYY-mm-dd HH:MM:SS>  [<state>]" record to log file $1.
_daemon_log() {
  printf '%s  [%s]\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$2" >> "$1"
}

#######################################
# Daemon: poll every 30 seconds whether a process whose command line contains
# the Robot script name is running, log its state, and restart it with the
# arguments saved in the arg file when it is gone. Never returns.
# Globals read (with the file's defaults when unset):
#   ROBOT - script name to look for and to launch as ./$ROBOT
#   LOG   - log file, receives [dead] / [restart] / [alive] records
#   ARG   - file holding the argument list of the last run
# Outputs: a note on stderr when there are no saved arguments to restart with
#######################################
function Daemon {
  local robot=${ROBOT:-Robot.sh}
  local log=${LOG:-Robot.log}
  local argfile=${ARG:-Robot.arg}
  local pattern
  local -a args

  # pgrep takes a regular expression: match the dots literally
  pattern=${robot//./\\.}

  while true; do
    # pgrep -f matches the full command line and never reports itself; its
    # output may hold several PIDs, so only emptiness is tested
    if [[ -z "$(pgrep -f "$pattern")" ]]; then
      # Robot is dead log time first
      _daemon_log "$log" dead

      # reload arg from the arg file; read -a splits on blanks without the
      # glob expansion an unquoted $arg would undergo
      args=()
      if [[ -r "$argfile" ]]; then
        read -r -a args < "$argfile"
      fi

      if ((${#args[@]} > 0)); then
        # -y asks Robot.sh to skip its interactive confirmation: a background
        # job has no terminal to answer it. Robot.sh ignores options it does
        # not know, so the extra flag is harmless.
        nohup "./${robot}" -y "${args[@]}" &
        _daemon_log "$log" restart
      else
        printf 'Daemon: %s is missing or empty, nothing to restart\n' \
          "$argfile" >&2
      fi
    else
      # too verbose ?
      _daemon_log "$log" alive
    fi
    # relax CPU
    sleep 30
  done
}
###############################################################################
# main entry: start the monitoring loop. Daemon loops forever, so nothing
# placed after this call is reached.
###############################################################################
Daemon
#!/usr/bin/env bash
###############################################################################
# name:Shutdown.sh
# date:2012-11-10
# desc:shutdown Robot.sh and Daemon.sh
###############################################################################
ROBOT=Robot.sh   # script name of the downloader process
DAEMON=Daemon.sh # script name of the monitoring process
#######################################
# shutdown: terminate the Daemon and Robot processes, wait briefly, then
# report whether both are gone together with the number of distinct thread
# titles recorded in /mnt/windows/baixingsex/Robot.download.
# Globals read (with the file's defaults when unset):
#   ROBOT  - script name of the downloader process
#   DAEMON - script name of the monitoring process
# Outputs: progress and result messages on stdout
# Exits:   0 when both processes are gone, 1 when one of them survived
#######################################
function shutdown {
  local waittime=2
  local download_list=/mnt/windows/baixingsex/Robot.download
  local robot_pattern=${ROBOT:-Robot.sh}
  local daemon_pattern=${DAEMON:-Daemon.sh}
  local titlenum=0
  local -a robot_pids=() daemon_pids=()

  # pgrep takes a regular expression: match the dots literally
  robot_pattern=${robot_pattern//./\\.}
  daemon_pattern=${daemon_pattern//./\\.}

  echo -e "shutdown..."
  # one array element per PID, so several matching processes are handled
  mapfile -t robot_pids < <(pgrep -f "$robot_pattern")
  mapfile -t daemon_pids < <(pgrep -f "$daemon_pattern")

  # the daemon goes first, otherwise it could respawn the robot
  if ((${#daemon_pids[@]} > 0)); then
    kill "${daemon_pids[@]}"
  fi
  if ((${#robot_pids[@]} > 0)); then
    kill "${robot_pids[@]}"
  fi

  sleep "$waittime"

  mapfile -t robot_pids < <(pgrep -f "$robot_pattern")
  mapfile -t daemon_pids < <(pgrep -f "$daemon_pattern")

  if ((${#daemon_pids[@]} == 0 && ${#robot_pids[@]} == 0)); then
    # count distinct values of the second tab-separated column; a missing
    # list simply counts as zero
    if [[ -r "$download_list" ]]; then
      titlenum=$(awk -F '\t' '!seen[$2]++ { n++ } END { print n + 0 }' \
        "$download_list")
    fi
    echo -e "Robot has been shutdown"
    echo -e "total $titlenum threads downloaded"
    exit 0
  fi

  echo -e "Failed to shutdown Robot"
  exit 1
}
###############################################################################
# main entry: run the shutdown sequence. The shutdown function ends the
# script itself by calling exit, so nothing after this call is reached.
###############################################################################
shutdown
# Reposted from: https://www.cnblogs.com/Anney/archive/2012/11/10/2763550.html
# Related resources: google-group-crawler: fetch (almost) original messages from a Google Groups archive. Your data is yours - source code