#!/bin/bash
# Environment setup for the monitoring stack: resolves the install root,
# pins component versions, creates the working directory tree and puts the
# private bin directory at the front of PATH.
set -e

# Pinned upstream release versions.
PROMETHEUS_VERSION="2.51.2"
LOKI_VERSION="3.3.2"
TEMPO_VERSION="2.7.1"
OTELCOL_VERSION="0.120.0"
GRAFANA_VERSION="11.4.0"

# Everything lives next to this script, regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export MONITORING_DIR="$SCRIPT_DIR"

# Per-component working directories.
BIN_DIR="$MONITORING_DIR/bin"
PROMETHEUS_DIR="$MONITORING_DIR/prometheus"
GRAFANA_DIR="$MONITORING_DIR/grafana"
LOKI_DIR="$MONITORING_DIR/loki"
TEMPO_DIR="$MONITORING_DIR/tempo"
OTEL_DIR="$MONITORING_DIR/otelcol"

# Location of the Grafana tree unpacked under BIN_DIR.
GRAFANA_HOME="$BIN_DIR/grafana-v${GRAFANA_VERSION}"

# Create the whole directory layout up front (idempotent thanks to -p).
for _monitoring_subdir in \
  "$BIN_DIR" \
  "$PROMETHEUS_DIR/data" \
  "$PROMETHEUS_DIR/logs" \
  "$GRAFANA_DIR/data" \
  "$GRAFANA_DIR/logs" \
  "$GRAFANA_DIR/provisioning/datasources" \
  "$GRAFANA_DIR/provisioning/dashboards" \
  "$LOKI_DIR/data/chunks" \
  "$LOKI_DIR/data/rules" \
  "$LOKI_DIR/data/storage" \
  "$LOKI_DIR/data/compactor" \
  "$LOKI_DIR/logs" \
  "$TEMPO_DIR/data/traces" \
  "$TEMPO_DIR/data/wal" \
  "$TEMPO_DIR/data/generator/wal" \
  "$TEMPO_DIR/logs" \
  "$OTEL_DIR/logs"; do
  mkdir -p "$_monitoring_subdir"
done
unset _monitoring_subdir

# Binaries placed in BIN_DIR take precedence over system-wide ones.
export PATH="$BIN_DIR:$PATH"
#######################################
# Make sure an executable is available, downloading it if necessary.
# Does nothing when NAME already resolves via PATH. Otherwise downloads URL,
# unpacks it (.tar.gz/.tgz/.zip), locates the file called ARC_NAME inside and
# installs it as $BIN_DIR/NAME with the executable bit set.
# Globals:   BIN_DIR (read)
# Arguments: $1 - command name to check for / install as
#            $2 - download URL of the release archive
#            $3 - file name inside the archive (defaults to $1)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; exits the script with status 1 on any failure
#######################################
_ensure_bin() {
  local name="$1"
  local url="$2"
  local arc_name="${3:-$name}"

  command -v "$name" &>/dev/null && return 0

  echo " 下载 $name ..."
  local tmp
  tmp=$(mktemp -d) || exit 1

  # The work runs in a subshell so its EXIT trap removes the scratch dir on
  # every path, including the error exits. A RETURN trap would not fire on
  # `exit` and would remain installed in the caller's scope afterwards.
  (
    trap 'rm -rf -- "$tmp"' EXIT

    if ! curl -fsSL --retry 3 -o "$tmp/arc" "$url"; then
      echo "错误: 下载 $name 失败,URL: $url" >&2
      exit 1
    fi

    # `set -e` is not honoured inside a `( ... ) ||` list, so every step
    # that may fail is checked explicitly.
    case "$url" in
      *.tar.gz|*.tgz) tar -xzf "$tmp/arc" -C "$tmp" || exit 1 ;;
      *.zip) unzip -q "$tmp/arc" -d "$tmp" || exit 1 ;;
    esac

    local bin
    bin=$(find "$tmp" -name "$arc_name" -type f | head -1)
    if [[ -z "$bin" ]]; then
      echo "错误: 在下载包内找不到 $arc_name" >&2
      exit 1
    fi

    cp -- "$bin" "$BIN_DIR/$name" || exit 1
    chmod +x "$BIN_DIR/$name" || exit 1
  ) || exit 1

  echo " $name 已安装: $BIN_DIR/$name"
}
#######################################
# Make sure Grafana is available, downloading the OSS tarball if necessary.
# Does nothing when `grafana` or `grafana-server` is on PATH, or when a
# previously unpacked copy exists at $GRAFANA_HOME/bin/grafana. Otherwise
# downloads the linux-amd64 release tarball and unpacks it into $BIN_DIR.
# Globals:   GRAFANA_VERSION, GRAFANA_HOME, BIN_DIR (read)
# Arguments: none
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; exits the script with status 1 on any failure
#######################################
_ensure_grafana() {
  command -v grafana &>/dev/null && return 0
  command -v grafana-server &>/dev/null && return 0
  [[ -x "$GRAFANA_HOME/bin/grafana" ]] && return 0

  echo " 下载 Grafana ${GRAFANA_VERSION} ..."
  local url="https://dl.grafana.com/oss/release/grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz"
  local tmp
  tmp=$(mktemp -d) || exit 1

  # Subshell + EXIT trap: the scratch dir is removed on every path, including
  # the error exits, and no trap is left installed in the caller's scope.
  (
    trap 'rm -rf -- "$tmp"' EXIT

    if ! curl -fsSL --retry 3 -o "$tmp/grafana.tar.gz" "$url"; then
      echo "错误: 下载 Grafana 失败,URL: $url" >&2
      exit 1
    fi

    # `set -e` is not honoured inside a `( ... ) ||` list; check explicitly.
    tar -xzf "$tmp/grafana.tar.gz" -C "$BIN_DIR" || exit 1
  ) || exit 1

  echo " Grafana 已安装: $GRAFANA_HOME"
}
# Install any missing component binaries before starting services.
echo "检查依赖..."

# Release archive locations, derived from the pinned versions.
_gh_base="https://github.com"
_prometheus_url="${_gh_base}/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz"
_loki_url="${_gh_base}/grafana/loki/releases/download/v${LOKI_VERSION}/loki-linux-amd64.zip"
_tempo_url="${_gh_base}/grafana/tempo/releases/download/v${TEMPO_VERSION}/tempo_${TEMPO_VERSION}_linux_amd64.tar.gz"
_otelcol_url="${_gh_base}/open-telemetry/opentelemetry-collector-releases/releases/download/v${OTELCOL_VERSION}/otelcol-contrib_${OTELCOL_VERSION}_linux_amd64.tar.gz"

_ensure_bin prometheus "$_prometheus_url"
# Inside the Loki zip the binary is named loki-linux-amd64, hence the third argument.
_ensure_bin loki "$_loki_url" "loki-linux-amd64"
_ensure_bin tempo "$_tempo_url"
_ensure_bin otelcol-contrib "$_otelcol_url"
_ensure_grafana

unset _gh_base _prometheus_url _loki_url _tempo_url _otelcol_url
echo
# Write the Grafana datasource provisioning file.
# The delimiter is quoted so nothing in the YAML is expanded by the shell.
# Notes on the content:
#   * YAML nesting is significant: each datasource is one list item and its
#     settings are indented beneath it.
#   * Grafana itself interpolates $VAR / ${VAR} in provisioning files, so the
#     literal Grafana template variable has to be written as $${__value.raw}.
#   * Prometheus gets an explicit uid because Tempo's serviceMap refers to
#     the datasource by uid "prometheus".
cat > "$GRAFANA_DIR/provisioning/datasources/datasources.yml" <<'EOF'
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://localhost:9090
    uid: prometheus
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "10s"
      httpMethod: "POST"
  - name: Loki
    type: loki
    access: proxy
    url: http://localhost:3100
    editable: true
    jsonData:
      derivedFields:
        - name: TraceID
          matcherRegex: 'trace_id=(\w+)'
          url: "$${__value.raw}"
          datasourceUid: tempo
  - name: Tempo
    type: tempo
    access: proxy
    url: http://localhost:3200
    editable: true
    uid: tempo
    jsonData:
      httpMethod: GET
      serviceMap:
        datasourceUid: prometheus
EOF

# Write the dashboard provider config; $GRAFANA_DIR is expanded by the shell
# so the provider points at this installation's dashboards directory.
cat > "$GRAFANA_DIR/provisioning/dashboards/default.yml" <<EOF
apiVersion: 1
providers:
  - name: Default
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: $GRAFANA_DIR/provisioning/dashboards
EOF

# Publish the bundled dashboard into the provisioned dashboards directory.
cp "$MONITORING_DIR/grafana-dashboard.json" \
  "$GRAFANA_DIR/provisioning/dashboards/yuanrong-monitoring.json"
#######################################
# Start a service in the background unless a matching process exists.
# Arguments: $1 - display label used in messages
#            $2 - pattern handed to `pgrep -f` to detect a running instance
#            $3 - log file receiving the service's stdout and stderr
#            $4.. - command and arguments to launch under nohup
# Outputs:   status messages on stdout
#######################################
_launch_unless_running() {
  local label="$1"
  local running_pattern="$2"
  local log_file="$3"
  shift 3

  echo "启动 ${label}..."
  if pgrep -f "$running_pattern" &>/dev/null; then
    echo " ${label} 已在运行"
    return 0
  fi

  nohup "$@" > "$log_file" 2>&1 &
  echo " ${label} 已启动 (PID: $!)"
}

_launch_unless_running "Loki" "loki.*loki-config.yaml" "$LOKI_DIR/logs/loki.log" \
  loki \
  -config.file="$MONITORING_DIR/loki-config.yaml" \
  -config.expand-env=true

_launch_unless_running "Tempo" "tempo.*tempo-config.yaml" "$TEMPO_DIR/logs/tempo.log" \
  tempo \
  -config.file="$MONITORING_DIR/tempo-config.yaml"

_launch_unless_running "OTel Collector" "otelcol.*otel-collector-config.yaml" "$OTEL_DIR/logs/otelcol.log" \
  otelcol-contrib \
  --config="$MONITORING_DIR/otel-collector-config.yaml"

_launch_unless_running "Prometheus" "prometheus.*prometheus.yml" "$PROMETHEUS_DIR/logs/prometheus.log" \
  prometheus \
  --config.file="$MONITORING_DIR/prometheus.yml" \
  --storage.tsdb.path="$PROMETHEUS_DIR/data" \
  --web.listen-address=:9090 \
  --web.enable-lifecycle \
  --web.enable-remote-write-receiver
# Short pause before launching Grafana (presumably to let the backends
# started above finish binding their ports).
sleep 2

echo "启动 Grafana..."
# pgrep patterns are extended regular expressions, so alternation is a bare
# `|`; the BRE-style `\|` would be a literal pipe character and never match.
if pgrep -f "grafana.* server|grafana-server" &>/dev/null; then
  echo " Grafana 已在运行"
else
  # Paths and port are passed via the environment for both launch styles.
  export GF_PATHS_DATA="$GRAFANA_DIR/data"
  export GF_PATHS_LOGS="$GRAFANA_DIR/logs"
  export GF_PATHS_PROVISIONING="$GRAFANA_DIR/provisioning"
  export GF_SERVER_HTTP_PORT=3000

  if command -v grafana-server &>/dev/null; then
    # System package install: legacy grafana-server entry point.
    nohup grafana-server \
      --homepath=/usr/share/grafana \
      --config=/etc/grafana/grafana.ini \
      > "$GRAFANA_DIR/logs/grafana.log" 2>&1 &
  else
    # Prefer the copy unpacked under BIN_DIR; otherwise use `grafana` from PATH.
    local_bin="$GRAFANA_HOME/bin/grafana"
    grafana_homepath="$GRAFANA_HOME"
    if [[ ! -x "$local_bin" ]]; then
      if ! local_bin=$(command -v grafana); then
        echo "错误: 找不到 grafana 可执行文件" >&2
        exit 1
      fi
      # The downloaded tree does not exist in this case, so do not point
      # --homepath at it; use the same system home as the grafana-server
      # branch. NOTE(review): assumes a package-style install — confirm.
      grafana_homepath=/usr/share/grafana
    fi
    nohup "$local_bin" server \
      --homepath="$grafana_homepath" \
      cfg:paths.data="$GRAFANA_DIR/data" \
      cfg:paths.logs="$GRAFANA_DIR/logs" \
      cfg:paths.provisioning="$GRAFANA_DIR/provisioning" \
      cfg:server.http_port=3000 \
      > "$GRAFANA_DIR/logs/grafana.log" 2>&1 &
  fi
  echo " Grafana 已启动 (PID: $!)"
fi
# Final banner: service endpoints, log file locations and how to stop.
cat <<EOF

==========================================
监控栈已启动
==========================================
Prometheus: http://localhost:9090
Grafana: http://localhost:3000 (admin/admin)
Loki: http://localhost:3100
Tempo: http://localhost:3200
OTel (OTLP): grpc://localhost:4317 http://localhost:4318
OTel self-mon: http://localhost:8888

日志目录:
 Loki: $LOKI_DIR/logs/loki.log
 Tempo: $TEMPO_DIR/logs/tempo.log
 OTel: $OTEL_DIR/logs/otelcol.log
 Prometheus: $PROMETHEUS_DIR/logs/prometheus.log
 Grafana: $GRAFANA_DIR/logs/grafana.log

停止服务: ./stop-monitoring.sh
==========================================
EOF