简单介绍:
说明: Shinken是一个网络监控平台,可以通过一系列直观的方式监控网络内的各种健康状况.Shinken脱胎于Nagios,其实Shinken这个项目本身就是一帮Nagios项目的人无法忍受Nagios,自己跳出来重新用纯Python重构了一下,甚至完全兼容Nagios的配置文件.
相关地址:
官网地址: http://www.shinken-monitoring.org/
官网文档: http://shinken.readthedocs.io/en/latest/
论坛地址: http://shinken.readthedocs.io/en/latest/
架构一览:
组件介绍:
shinken-arbiter
说明: shinken-arbiter节点读取配置,然后将配置切分后分发到多个shinken-scheduler节点
shinken-broker
说明: shinken-broker节点负责导出和管理shinken-scheduler节点中的数据
shinken-scheduler
说明: shinken-scheduler节点负责管理shinken-poller节点和shinken-reactionner节点任务调度
shinken-poller
说明: shinken-poller节点通过各类插件执行shinken-scheduler节点的任务,获取各种监控指标
shinken-reactionner
说明: shinken-reactionner节点负责在满足触发条件时发送预警通知事件
shinken-receiver
说明: shinken-receiver为可选节点,用于特殊环境下的数据汇总与统一转发
源码分析:
#!/bin/sh
# Copyright (C) 2009-2014:
# Gabes Jean, naparuba@gmail.com
# Gerhard Lausser, Gerhard.Lausser@consol.de
# Gregory Starck, g.starck@gmail.com
# Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken. If not, see <http://www.gnu.org/licenses/>.
### BEGIN INIT INFO
# Provides: shinken
# Required-Start: $network $remote_fs
# Required-Stop: $network $remote_fs
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Shinken monitoring daemon
# Description: Shinken is a monitoring tool composed of many separated modules:
# - arbiter : the main one : control everything else.
# - scheduler : receives checks/actions from arbiter. Schedules & forwards them to pollers.
# - poller : receives the checks from a scheduler. Launch them and returns results
# - broker : manage results by looking at scheduler. Like export to flat file or db.
# - reactionner : manage the failed checks by looking at scheduler.
# - receiver : manage all passive data
### END INIT INFO
### Chkconfig Header
# Shinken Starts Shinken daemons
#
# chkconfig: 345 99 01
# description: Start Shinken daemons
# Reference:
# http://refspecs.linuxfoundation.org/LSB_4.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
NAME="shinken"
# Every daemon this init script knows how to drive.
AVAIL_MODULES="scheduler poller reactionner broker receiver arbiter"
## SHINKEN_MODULE_FILE is set by shinken-* when one of those is calling us;
## only a direct invocation advertises the module list in the usage text.
case "$SHINKEN_MODULE_FILE" in
    "")
        SCRIPTNAME=$0
        _usage_mods_="[ <$AVAIL_MODULES> ]"
        ;;
    *)
        SCRIPTNAME=$SHINKEN_MODULE_FILE
        ;;
esac
# Absolute directory holding this script, e.g. /etc/init.d when started as
# "/etc/init.d/shinken restart". All expansions are quoted so that a path
# containing whitespace still resolves.
curpath=$(cd "$(dirname "$0")" && pwd)
#echo curpath is $curpath filename is $(basename "$0")
## Default paths, derived from the script location unless already set:
##   BIN = <curpath>/..   VAR = <curpath>/../../var   ETC = <curpath>/../../etc
## NOTE(review): for /etc/init.d/shinken this gives BIN=/etc, which looks
## wrong for the shinken-* executables; presumably the defaults file that is
## sourced later overrides it - confirm.
test "$BIN" || BIN=$(cd "$curpath/.." && pwd)
test "$VAR" || VAR=$(cd "$curpath/../../var" && pwd)
test "$ETC" || ETC=$(cd "$curpath/../../etc" && pwd)
# Environment handed to the daemons.
export PATH="${PATH:+$PATH:}/usr/sbin:/bin:/sbin"
export LANG=en_US.UTF8
export LC_ALL=en_US.UTF8
export PYTHONIOENCODING=utf8
export PYTHONUNBUFFERED="0"
export TZ=:/etc/localtime
# also unset http proxy, because pycurl is using it and this is bad, very bad :)
unset http_proxy
unset https_proxy
# Locate the Python interpreter used to launch the daemons.
# Probes python2.4 .. python2.7 on PATH and keeps the LAST one found; when
# none of them exists it falls back to a plain "python".
# Globals:  PYTHON (written) - "pythonX.Y", or the resolved path of "python"
#           LASTFOUND, versions, v (written)
# Outputs:  "python found" when the fallback is used,
#           "No python interpreter found!" when nothing is available
# Exits:    2 when no interpreter can be found at all
pythonver() {
    versions="2.4 2.5 2.6 2.7"
    LASTFOUND=""
    # Is there any python here?
    for v in $versions; do
        # command -v is the POSIX builtin lookup; `which` is an external
        # tool that is not guaranteed to be installed.
        if command -v "python$v" > /dev/null 2>&1; then
            LASTFOUND="python$v"
        fi
    done
    if [ -z "$LASTFOUND" ]; then
        # Finally try to find a default python
        if ! command -v python > /dev/null 2>&1; then
            echo "No python interpreter found!"
            exit 2
        fi
        echo "python found"
        LASTFOUND=$(command -v python)
    fi
    PYTHON=$LASTFOUND
}
# Resolve the interpreter now; pythonver stores it in PYTHON.
pythonver
# If the shinken **lib** is installed in a non standard place
# (not in /usr/lib/python*), uncomment the line below:
#export PYTHONPATH="${PATH:+$PATH:}/opt/shinken"
# To force a specific Python version instead, uncomment:
#export PYTHON=python2.7
# Defaults; the command line parsed further down may replace them.
SUBMODULES=""
CMD=""
DEBUG=false
## The "default shinken cfg file" can be overridden by the caller, e.g.:
##   bash -c "SHINKEN_DEFAULT_FILE=$your_own_default_file $init_path/shinken $action $args"
## When it is unset or empty it falls back to /etc/default/$NAME.
: "${SHINKEN_DEFAULT_FILE:=/etc/default/$NAME}"
#echo "Using $SHINKEN_DEFAULT_FILE .."
# Print the command-line synopsis on stdout.
# Globals: SCRIPTNAME, _usage_mods_ (read)
usage() {
    printf '%s\n' \
        "Usage: $SCRIPTNAME [ -d ] {start|stop|status|restart|reload|force-reload|check} $_usage_mods_" \
        "-d start requested module(s) in debug mode, only useful with start|restart"
}
# An optional leading -d turns on debug mode for the requested module(s).
if [ "$1" = "-d" ]; then
    DEBUG="1"
    shift
fi
if [ $# -eq 0 ]; then
    usage >&2
    exit 2
fi
# CMD is the action: start stop status restart reload force-reload check.
CMD=$1
shift
# SUBMODULES is the list of daemons named after the action (may be empty).
SUBMODULES=$*
# Reads configuration variable file if it is present
[ -r "$SHINKEN_DEFAULT_FILE" ] && . "$SHINKEN_DEFAULT_FILE"
# Without explicit modules the action applies to all of them; otherwise every
# requested name must be one of AVAIL_MODULES.
if [ -z "$SUBMODULES" ]; then
    SUBMODULES=$AVAIL_MODULES
else
    # check given modules
    for mod1 in $SUBMODULES; do
        # Quoted, whole-word membership test; an unquoted `[ $a = $b ]`
        # misbehaves on empty or glob-like input.
        case " $AVAIL_MODULES " in
            *" $mod1 "*) ;;
            *) usage >&2 ; exit 2 ;;
        esac
    done
fi
# Now look if some required variables are pre defined:
# SHINKENCFG defaults to $ETC/shinken.cfg (ETC may have been replaced by the
# defaults file sourced above).
if ! test "$SHINKENCFG"; then
    SHINKENCFG="$ETC/shinken.cfg"
fi
# If var or run dir is missing, create them and chown them
#[ ! -d $VAR ] && mkdir -p $VAR && chown $SHINKENUSER:$SHINKENGROUP $VAR
#[ ! -d $RUN ] && mkdir -p $RUN && chown $SHINKENUSER:$SHINKENGROUP $RUN
# Now place us in our var directory so even our arbiter will be
# happy for opening its pid and cmd files
cd "$VAR" || echo "WARNING: could not change directory to '$VAR'" >&2
#echo BIN=$BIN
#echo VAR=$VAR
#echo ETC=$ETC
#set -xv
# Report success through log_end_msg (status 0).
# Any extra arguments are forwarded one-to-one; "$@" keeps each argument
# intact instead of re-splitting and globbing it as an unquoted $* would.
echo_success() {
    log_end_msg 0 "$@"
}
# Report failure through log_end_msg (status 1).
# Any extra arguments are forwarded one-to-one; "$@" keeps each argument
# intact instead of re-splitting and globbing it as an unquoted $* would.
echo_failure() {
    log_end_msg 1 "$@"
}
#log_end_msg
# Pull in distribution settings and shell function libraries; each file is
# optional and only sourced when it exists.
# Load the VERBOSE setting and other rcS variables
if [ -f /etc/default/rcS ]; then
    . /etc/default/rcS
fi
# Source function library.
if [ -f /etc/rc.d/init.d/functions ]; then
    . /etc/rc.d/init.d/functions
fi
if [ -f /lib/lsb/init-functions ]; then
    . /lib/lsb/init-functions
fi
################################################
#
# Print the pid FILE path of a submodule (not the pid itself).
# Arguments: $1 - module name, e.g. scheduler
# Globals:   <MODULE>PID (read) - optional override, e.g. SCHEDULERPID
#            RUN (read)         - directory used for the default path
# Outputs:   the override when set and non-empty, else $RUN/<module>d.pid
#
getpidfile() {
    local mod varname pidfile
    mod=$1
    varname="$(printf '%s' "$mod" | tr 'a-z' 'A-Z')PID"
    # Indirect read of the variable named by $varname (plain sh has no
    # ${!name}). $mod must be a plain identifier; the main script only passes
    # names validated against AVAIL_MODULES.
    eval "pidfile=\${$varname:-}"
    if [ -n "$pidfile" ]; then
        printf '%s\n' "$pidfile"
    else
        printf '%s\n' "$RUN/${mod}d.pid"
    fi
}
# Print the pid recorded in a submodule's pid file.
# Arguments: $1 - module name
# Outputs:   the pid file content; nothing when the file is missing or empty
getmodpid() {
    local mod pidfile
    mod=$1
    pidfile=$(getpidfile "$mod")
    # Quoted so that a pid file path containing whitespace is handled.
    if [ -s "$pidfile" ]; then
        cat -- "$pidfile"
    fi
}
# Print the debug log path of a submodule.
# Arguments: $1 - module name, e.g. poller
# Globals:   <MODULE>DEBUGFILE (read) - optional override, e.g. POLLERDEBUGFILE
#            VAR (read)               - directory used for the default path
# Outputs:   the override when set and non-empty, else $VAR/<module>-debug.log
getdebugfile() {
    local mod varname debugfile
    mod=$1
    varname="$(printf '%s' "$mod" | tr 'a-z' 'A-Z')DEBUGFILE"
    # Indirect read of the variable named by $varname (plain sh has no
    # ${!name}). $mod must be a plain identifier; the main script only passes
    # names validated against AVAIL_MODULES.
    eval "debugfile=\${$varname:-}"
    if [ -n "$debugfile" ]; then
        printf '%s\n' "$debugfile"
    else
        printf '%s\n' "${VAR}/${mod}-debug.log"
    fi
}
#
# Print the run state of one submodule.
# Arguments: $1 - module name
# Outputs:   one status line on stdout (through log_failure_msg when the
#            recorded process no longer exists)
# Returns:   0 running
#            1 pid file present but no such process
#            3 pid file missing or unreadable
#            4 pid file empty
#
do_status() {
    local mod pidfile pid
    mod=$1
    pidfile=$(getpidfile "$mod")
    [ -e "$pidfile" ] || {
        echo "$mod NOT RUNNING (pidfile ($pidfile) not exist)"
        return 3
    }
    [ -r "$pidfile" ] || {
        echo "$mod NOT RUNNING (pidfile ($pidfile) unreadable)"
        return 3
    }
    pid=$(cat "$pidfile")
    if [ -z "$pid" ]; then
        echo "$mod NOT RUNNING (pid file empty)"
        return 4
    fi
    if ! ps -p "$pid" >/dev/null 2>&1; then
        log_failure_msg "$mod NOT RUNNING (process $pid doesn't exist?)"
        return 1
    fi
    echo "$mod RUNNING (pid $pid)"
    return 0
}
#
# Launch one submodule: runs $PYTHON $BIN/shinken-<module> -d -c <config>.
# Arguments: $1 - module name
# Globals:   BIN, PYTHON, DEBUG, SHINKENCFG, SHINKENSPECIFICCFG,
#            <MODULE>CFG (read); DEBUGCMD (written when DEBUG is 1)
# Outputs:   "OK", or "FAILED: <last line of the daemon output> ..."
# Returns:   0 started, 1 the daemon command failed, 5 executable not found
#
do_start() {
    local mod modfilepath cfgvar modinifile output rc resfile
    mod=$1
    modfilepath="$BIN/shinken-${mod}"
    [ -e "$modfilepath" ] || {
        log_failure_msg "FAILED: did not find $mod file ($modfilepath) ; are you sure shinken-$mod is installed?"
        return 5
    }
    # In debug mode pass "--debug <file>". DEBUGCMD is expanded UNQUOTED below
    # on purpose so that it splits into the option and its value.
    [ "$DEBUG" = 1 ] && DEBUGCMD="--debug $(getdebugfile "$mod")"
    # Arbiter shinken.cfg, and the other OTHERd.ini
    if [ "$mod" != "arbiter" ]; then
        # The ini path comes from the variable <MODULE>CFG, e.g. POLLERCFG.
        cfgvar="$(printf '%s' "$mod" | tr '[:lower:]' '[:upper:]')CFG"
        # Indirect read (plain sh has no ${!name}); $mod must be an identifier.
        eval "modinifile=\${$cfgvar:-}"
        output=$($PYTHON "$modfilepath" -d -c "${modinifile}" $DEBUGCMD 2>&1)
        rc=$?
    elif ! test "$SHINKENSPECIFICCFG"; then
        output=$($PYTHON "$modfilepath" -d -c "$SHINKENCFG" $DEBUGCMD 2>&1)
        rc=$?
    else
        # SHINKENSPECIFICCFG, when set, is passed as a second -c file.
        output=$($PYTHON "$modfilepath" -d -c "$SHINKENCFG" -c "$SHINKENSPECIFICCFG" $DEBUGCMD 2>&1)
        rc=$?
    fi
    # debug:
    #resfile="/tmp/bad_start_for_$mod"
    #echo "$output" > "$resfile" || true
    if [ $rc != 0 ]; then
        # Keep the full output for inspection and show only its last line.
        # NOTE(review): fixed file name under /tmp; kept for compatibility.
        resfile="/tmp/bad_start_for_$mod"
        echo "$output" > "$resfile" || true
        output=$(echo "$output" | tail -1)
        echo "FAILED: $output (full output is in $resfile)"
        return 1
    fi
    echo "OK"
    return 0
}
#
# Helper: print the pids of the daemon's main process ($2) and of its direct
# children, restricted to processes whose ps line mentions shinken-$1.
# PID/PPID columns are compared exactly, so pid 42 no longer matches 1423.
#
_shinken_live_pids() {
    ps -aef | awk -v pid="$2" -v pat="shinken-$1" \
        '($2 == pid || $3 == pid) && index($0, pat) { print $2 }'
}
#
# Helper: poll up to three times, one second apart; returns 0 as soon as no
# process of module $1 / main pid $2 is left, 1 otherwise.
#
_shinken_wait_gone() {
    local attempt
    for attempt in 1 2 3; do
        [ -z "$(_shinken_live_pids "$1" "$2")" ] && return 0
        sleep 1
    done
    return 1
}
#
# Stop one submodule: signal the pid from its pid file, then escalate to the
# remaining processes with SIGTERM and finally SIGKILL.
# Arguments: $1 - module name
# Outputs:   "OK", "NOT RUNNING", the do_status text, or a FAILED report
# Returns:   0 stopped or was not running; 1 processes survived kill -9
#
do_stop() {
    local mod pid statusoutput cpid remaining
    mod=$1
    pid=$(getmodpid "$mod")
    # A module that is not running counts as stopped.
    statusoutput=$(do_status "$mod") || {
        echo "$statusoutput"
        return 0
    }
    if [ -z "$pid" ]; then
        echo "NOT RUNNING"
        return 0
    fi
    kill "$pid"
    sleep 1
    ## TODO: instead of 'sleep 1': wait up to when pid file is removed (with timeout)?
    if _shinken_wait_gone "$mod" "$pid"; then
        echo "OK"
        return 0
    fi
    echo "there are still remaining processes to $mod running.. ; trying to kill them (SIGTERM).."
    for cpid in $(_shinken_live_pids "$mod" "$pid"); do
        kill "$cpid" > /dev/null 2>&1
    done
    if _shinken_wait_gone "$mod" "$pid"; then
        echo "OK"
        return 0
    fi
    echo "there are still remaining processes to $mod running.. ; trying to kill -9 them.."
    for cpid in $(_shinken_live_pids "$mod" "$pid"); do
        kill -9 "$cpid" > /dev/null 2>&1
    done
    sleep 1
    remaining=$(_shinken_live_pids "$mod" "$pid")
    if [ -n "$remaining" ]; then
        echo "FAILED: one or more process for $mod are still running after kill -9!"
        # Unquoted on purpose: joins the pids on one line.
        echo "Remaining processes are (pids=$(echo $remaining)):"
        # Build "-p <pid>" pairs for ps.
        set --
        for cpid in $remaining; do
            set -- "$@" -p "$cpid"
        done
        ps -lf "$@"
        echo "You should check this."
        return 1
    fi
    echo "OK"
    return 0
}
#
# Config check: runs the arbiter in verify mode, i.e.
#   $PYTHON $BIN/shinken-arbiter -v -c $SHINKENCFG [-c $SHINKENSPECIFICCFG]
# Arguments: $1 - module name used for the debug log file name; falls back to
#                 the global $mod, then to "arbiter"
# Globals:   DEBUG, VAR, BIN, PYTHON, SHINKENCFG, SHINKENSPECIFICCFG (read);
#            DEBUGCMD (written when DEBUG is 1)
# Outputs:   the arbiter's stdout and stderr, merged on stdout
# Returns:   the arbiter's exit status
#
do_check() {
    local target
    # The debug log name used to depend solely on whatever the caller had
    # left in the global $mod; the explicit argument now takes precedence.
    target=${1:-${mod:-arbiter}}
    # DEBUGCMD is expanded unquoted below on purpose (option + value).
    [ "$DEBUG" = 1 ] && DEBUGCMD="--debug $VAR/${target}-debug.log"
    if ! test "$SHINKENSPECIFICCFG"; then
        $PYTHON "$BIN/shinken-arbiter" -v -c "$SHINKENCFG" $DEBUGCMD 2>&1
    else
        $PYTHON "$BIN/shinken-arbiter" -v -c "$SHINKENCFG" -c "$SHINKENSPECIFICCFG" $DEBUGCMD 2>&1
    fi
}
############################
# "start" action for one module: skip it when it already runs, otherwise
# launch it through do_start and report the outcome.
# Arguments: $1 - module name
# Returns:   do_start's status (or log_warning_msg's when already running)
do_start_() {
    echo "Starting $1: "
    status=$(do_status "$1")
    rc=$?
    case $rc in
        0)
            log_warning_msg "Already running"
            return
            ;;
    esac
    # arbiter is special:
    # it doesn't actually declare a "workdir" properties in its config
    # so we have explicitely to cd to the "VAR" directory.
    # so that the default pidfile ( == nagios lock_file) which is now "arbiterd.pid"
    # will be created at the correct place.
    # TODO: check if other possibility wouldn't be better:
    # declare a "workdir" properties for the arbiter module definition.. in shinken-specific.cfg.
    # but if the lock_file path is absolute then this 'cd' isn't required.
    [ "$1" = "arbiter" ] && cd "$VAR"
    # Delegate the actual launch to do_start.
    if startoutput=$(do_start "$1"); then
        rc=0
        echo_success
    else
        rc=$?
        echo "$startoutput"
        echo_failure
    fi
    return $rc
}
# "stop" action for one module: query its status, stop it through do_stop
# and report the outcome.
# Arguments: $1 - module name
# Returns:   do_status's non-zero status when the module is not running,
#            otherwise do_stop's status
do_stop_() {
    local statusoutput stopoutput failuremsg rc
    echo "Stopping $1"
    statusoutput=$(do_status "$1")
    rc=$?
    if [ $rc -ne 0 ]; then
        failuremsg="Couldn't get status of $1: $statusoutput"
    else
        stopoutput=$(do_stop "$1" 2>&1)
        rc=$?
        [ $rc -ne 0 ] && failuremsg="Couldn't stop $1: $stopoutput"
    fi
    if [ $rc -ne 0 ]; then
        log_failure_msg "$failuremsg"
        echo_failure
    else
        echo_success
    fi
    return $rc
}
# "restart" action for one module: stop it, then start it again. For the
# arbiter the configuration is checked first and a failed check aborts.
# Arguments: $1 - module name
# Returns:   1 when the arbiter config check fails, else do_start's status
do_restart_() {
    mod="$1"
    echo "Restarting $mod"
    case "$mod" in
        arbiter)
            do_check_ "$mod"
            checkrc=$?
            [ $checkrc -eq 0 ] || return 1
            ;;
    esac
    # The stop output is captured but not shown; only the start is reported.
    stopoutput=$(do_stop "$mod")
    if startoutput=$(do_start "$mod"); then
        rc=0
        echo_success
    else
        rc=$?
        log_failure_msg "$startoutput"
        echo_failure
    fi
    return $rc
}
# "force-reload" action for one module: identical to a restart.
# Arguments: $1 - module name (quoted so it is passed on as a single word)
do_force_reload_() {
    do_restart_ "$1"
}
# "reload" action for one module. It is implemented as a stop followed by a
# start; for the arbiter the configuration is checked first and a failed
# check aborts.
# Arguments: $1 - module name
# Returns:   1 when the arbiter config check fails, else do_start's status
do_reload_() {
    mod="$1"
    echo "Reloading $mod"
    case "$mod" in
        arbiter)
            do_check_ "$mod"
            checkrc=$?
            [ $checkrc -eq 0 ] || return 1
            ;;
    esac
    # The stop output is captured but not shown; only the start is reported.
    stopoutput=$(do_stop "$mod")
    if startoutput=$(do_start "$mod"); then
        rc=0
        echo_success
    else
        rc=$?
        log_failure_msg "$startoutput"
        echo_failure
    fi
    return $rc
}
# "status" action for one module: print its state and report the outcome.
# Arguments: $1 - module name
# Returns:   do_status's status. It used to be whatever echo_success or
#            echo_failure happened to return; it is now explicit, like the
#            other do_*_ handlers.
do_status_() {
    local rc
    mod=$1
    echo "Checking status of $mod"
    do_status "$1"
    rc=$?
    if [ $rc -eq 0 ]; then
        echo_success
    else
        echo_failure
    fi
    return $rc
}
# "check" action: run do_check, save its complete output to
# /tmp/shinken_checkconfig_result and report the outcome.
# Arguments: $1 - module name, handed on to do_check
# Returns:   do_check's status
do_check_() {
    local output rc check_res_file tmp_res_file
    echo "Doing config check"
    output=$(do_check "$1" 2>&1)
    rc=$?
    check_res_file="/tmp/shinken_checkconfig_result"
    # The output goes to a mktemp file first and is then renamed to the
    # well-known path; a mktemp failure is reported instead of cascading into
    # redirect and mv errors.
    if tmp_res_file=$(mktemp /tmp/shinken_checkconfig_resultXXXXXXXX); then
        echo "$output" > "$tmp_res_file"
        mv "$tmp_res_file" "$check_res_file"
    else
        log_warning_msg "could not create a temporary file, check output was not saved"
    fi
    if [ $rc -eq 0 ]; then
        echo_success
    else
        # Only the last output line is shown; the rest is in the result file.
        output=$(echo "$output" | tail -1)
        log_warning_msg "full result is in ${check_res_file}"
        log_failure_msg "ConfigCheck failed: $output"
        echo_failure
    fi
    return $rc
}
# "checkconfig" action: an alias for "check".
# Arguments: $1 - module name, handed on to do_check_
do_checkconfig_() {
    do_check_ "$1"
}
############################
# Run one action on each listed module by calling do_<action>_ <module>.
# Arguments: $1 - action (start stop status restart reload force-reload
#                 check ...); hyphens are mapped to underscores so that
#                 "force-reload" reaches do_force_reload_
#            $2 - whitespace separated module names
# Returns:   0 when every handler succeeded, 1 when at least one failed
do_cmd_on() {
    local action mods return_value
    action=$(printf '%s' "$1" | sed 's/-/_/g')
    mods=$2
    return_value=0
    # NOTE: mod is deliberately not local; do_check in this file reads the
    # global $mod.
    for mod in $mods; do
        # If at least one action fails, the return value is 1.
        "do_${action}_" "$mod" || return_value=1
    done
    return $return_value
}
# Entry point: dispatch on $CMD through do_cmd_on.
############################
## Main:
case "$CMD" in
    start|stop|restart|status)
        do_cmd_on "$CMD" "$SUBMODULES"
        ;;
    force-reload)
        # The handler is named do_force_reload_, so the action is passed with
        # an underscore. This arm was unreachable while "force-reload" was
        # also listed in the first pattern.
        do_cmd_on "force_reload" "$SUBMODULES"
        ;;
    check|checkconfig|reload)
        # These actions always target the arbiter only.
        do_cmd_on "$CMD" "arbiter"
        ;;
    *)
        usage >&2
        exit 2
        ;;
esac
最终目标:
说明: 目前市面上的监控系统(Zabbix/Nagios/Ganglia/OneAPM/Cacti/监控宝/Open-falcon/OWL/Zenoss/Hyperic HQ/OpenNMS/360网站服务监控/阿里云监控/百度云观测/小蜜蜂网站监测)很少使用Python作为后端实现.之前自己也尝试编写过一套基于Redis的全自动插件式监控系统xmzoomeye,但自从看到Shinken这套纯Python实现的完整监控系统解决方案,我觉得这正是我需要的.接下来我会和大家一起分析Shinken整个监控框架的源码,最后我们自己实现一套监控框架~
本文出自 “@湖北@白头发” 博客,请务必保留此出处http://xmdevops.blog.51cto.com/11144840/1867125
源码分析_Shinken-2.4.0001.启动脚本/etc/init.d/shinken源码分析?
原文:http://xmdevops.blog.51cto.com/11144840/1867125