Get the FREE Ultimate OpenClaw Setup Guide →
npx machina-cli add skill chaterm/terminal-skills/maintenance --openclaw
Files (1)
SKILL.md
12.8 KB

OpenClaw 运维与故障修复

概述

OpenClaw 日常运维操作、备份恢复、故障修复和高可用管理指南。

日常运维

服务管理

# Via systemd
systemctl start openclaw-server
systemctl stop openclaw-server
systemctl restart openclaw-server
systemctl status openclaw-server

# Via the bundled control script
/opt/openclaw/bin/openclaw-server.sh start
/opt/openclaw/bin/openclaw-server.sh stop
/opt/openclaw/bin/openclaw-server.sh restart
/opt/openclaw/bin/openclaw-server.sh status

# Via Docker Compose
docker-compose start
docker-compose stop
docker-compose restart
docker-compose ps

# Via Kubernetes: rolling restart of the server deployment, then scale the
# worker deployment to 5 replicas (both in the "openclaw" namespace)
kubectl rollout restart deployment/openclaw-server -n openclaw
kubectl scale deployment/openclaw-worker --replicas=5 -n openclaw

健康检查

# Service health check (response pretty-printed with jq)
curl -s http://localhost:8080/api/health | jq .

# Cluster health check
curl -s http://localhost:8080/api/cluster/health | jq .

# Automated health-check script: probes the health endpoint and posts an
# alert to the webhook whenever the HTTP status is anything other than 200.
#!/bin/bash
# --max-time bounds the request so an unresponsive server cannot hang an
# unattended run; when curl cannot get a response it reports the code as 000,
# which also takes the alert branch below.
HEALTH=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" http://localhost:8080/api/health)
if [[ "$HEALTH" != "200" ]]; then
    echo "OpenClaw 服务异常,状态码: $HEALTH"
    # Send the alert (also time-bounded so a dead webhook cannot block the script)
    curl --max-time 10 -X POST https://webhook.example.com/alert \
      -H "Content-Type: application/json" \
      -d '{"message": "OpenClaw 服务异常"}'
fi

日志管理

# Install the logrotate policy for OpenClaw logs: daily rotation, 30 rotations
# kept, compression delayed by one cycle, new log files created as
# openclaw:openclaw with mode 644.
# "sharedscripts" makes the postrotate reload run once per rotation pass
# instead of once for every *.log file matched by the glob.
cat > /etc/logrotate.d/openclaw << 'EOF'
/opt/openclaw/logs/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 644 openclaw openclaw
    sharedscripts
    postrotate
        systemctl reload openclaw-server > /dev/null 2>&1 || true
    endscript
}
EOF

# Manual log cleanup: delete rotated logs (*.log.*) older than 30 days and
# task logs older than 7 days
find /opt/openclaw/logs -name "*.log.*" -mtime +30 -delete
find /opt/openclaw/logs/tasks -name "*.log" -mtime +7 -delete

# Archive the whole log directory into a date-stamped tarball under /backup
tar -czf /backup/openclaw-logs-$(date +%Y%m%d).tar.gz /opt/openclaw/logs/

备份与恢复

数据库备份

# Full backup (bare -p prompts for the password); stored routines and
# triggers are included, output goes to a timestamped .sql file
mysqldump -h localhost -u openclaw -p \
  --single-transaction \
  --routines \
  --triggers \
  openclaw > /backup/openclaw_$(date +%Y%m%d_%H%M%S).sql

# Compressed backup: dump piped through gzip into a date-stamped .sql.gz
mysqldump -h localhost -u openclaw -p openclaw | gzip > /backup/openclaw_$(date +%Y%m%d).sql.gz

# Scheduled backup script: dumps the "openclaw" database to a timestamped,
# gzip-compressed file under /backup/mysql and prunes dumps older than 7 days.
# Environment: MYSQL_PASSWORD (required) - password of MySQL user "openclaw".
#!/bin/bash
# pipefail matters here: without it a failed mysqldump is masked by gzip's
# exit status and an empty archive would be reported as a good backup.
set -euo pipefail

: "${MYSQL_PASSWORD:?MYSQL_PASSWORD must be set}"

BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$BACKUP_DIR/openclaw_$DATE.sql.gz"
mkdir -p "$BACKUP_DIR"

# Dump into a temporary name and rename only on success, so a broken dump is
# never left behind under a name that looks like a valid backup.
trap 'rm -f -- "$BACKUP_FILE.tmp"' EXIT

# The password is handed over through MYSQL_PWD rather than -p<password> so it
# does not show up in the process list.
# NOTE(review): MYSQL_PWD is marked deprecated in MySQL 8.0 docs; switch to an
# option file (--defaults-extra-file) if your client no longer honors it.
MYSQL_PWD="$MYSQL_PASSWORD" mysqldump -h localhost -u openclaw \
  --single-transaction \
  openclaw | gzip > "$BACKUP_FILE.tmp"
mv -- "$BACKUP_FILE.tmp" "$BACKUP_FILE"

# Remove backups older than 7 days (only reached when the new dump succeeded)
find "$BACKUP_DIR" -name "openclaw_*.sql.gz" -mtime +7 -delete

echo "备份完成: $BACKUP_FILE"

数据库恢复

# Stop the service before touching the database
systemctl stop openclaw-server

# Restore from a gzip-compressed dump
gunzip < /backup/openclaw_20240115.sql.gz | mysql -h localhost -u openclaw -p openclaw

# Or restore directly from an uncompressed dump
mysql -h localhost -u openclaw -p openclaw < /backup/openclaw_20240115.sql

# Start the service again
systemctl start openclaw-server

配置备份

# Back up the configuration directory together with the systemd unit files
tar -czf /backup/openclaw-config-$(date +%Y%m%d).tar.gz \
  /opt/openclaw/conf/ \
  /etc/systemd/system/openclaw*.service

# Back up the Docker deployment: compose file, .env and config directory
tar -czf /backup/openclaw-docker-config-$(date +%Y%m%d).tar.gz \
  /opt/openclaw/docker-compose.yml \
  /opt/openclaw/.env \
  /opt/openclaw/config/

完整备份

#!/bin/bash
# full_backup.sh - full backup of the OpenClaw database, configuration and
# Redis snapshot into a dated directory under /backup/openclaw, plus a
# manifest describing the backup.
#
# Environment:
#   DB_PASSWORD  (required) password of MySQL user "openclaw"
#   REDIS_RDB    (optional) path of the Redis dump file;
#                defaults to /var/lib/redis/dump.rdb
#
# Exits non-zero as soon as any backup step fails.
set -euo pipefail

: "${DB_PASSWORD:?DB_PASSWORD must be set}"

BACKUP_DIR="/backup/openclaw/$(date +%Y%m%d)"
REDIS_RDB="${REDIS_RDB:-/var/lib/redis/dump.rdb}"
# Upper bound, in seconds, on how long to wait for the Redis snapshot.
readonly BGSAVE_TIMEOUT=300
mkdir -p "$BACKUP_DIR"

echo "开始备份..."

# 1. Database
# MYSQL_PWD keeps the password out of the process list; pipefail (above) makes
# a failed mysqldump abort the script instead of leaving an empty archive.
echo "备份数据库..."
MYSQL_PWD="$DB_PASSWORD" mysqldump -h localhost -u openclaw \
  --single-transaction openclaw | gzip > "$BACKUP_DIR/database.sql.gz"

# 2. Configuration
echo "备份配置文件..."
tar -czf "$BACKUP_DIR/config.tar.gz" /opt/openclaw/conf/

# 3. Redis (if needed)
# BGSAVE returns immediately while the snapshot is written in the background,
# so a fixed sleep can copy a stale or half-written file. Instead, wait until
# LASTSAVE moves past the value it had before the snapshot was requested.
echo "备份 Redis..."
last_save=$(redis-cli -h localhost LASTSAVE)
redis-cli -h localhost BGSAVE
waited=0
until [[ "$(redis-cli -h localhost LASTSAVE)" != "$last_save" ]]; do
  if (( waited >= BGSAVE_TIMEOUT )); then
    echo "Redis BGSAVE 在 ${BGSAVE_TIMEOUT} 秒内未完成" >&2
    exit 1
  fi
  sleep 1
  waited=$((waited + 1))
done
cp -- "$REDIS_RDB" "$BACKUP_DIR/redis.rdb"

# 4. Manifest
# The version lookup is best-effort: the backup is still valid when the API
# is unreachable, so fall back to "unknown" instead of aborting.
echo "生成备份清单..."
version=$(curl -s --max-time 10 http://localhost:8080/api/version | jq -r .version) \
  || version="unknown"
cat > "$BACKUP_DIR/manifest.txt" << EOF
备份时间: $(date)
数据库: database.sql.gz
配置: config.tar.gz
Redis: redis.rdb
OpenClaw 版本: $version
EOF

echo "备份完成: $BACKUP_DIR"

故障修复

服务无法启动

端口占用

# Check which processes hold the ports
netstat -tlnp | grep -E "8080|9090"
lsof -i :8080

# Stop whatever is LISTENING on 8080.
# -sTCP:LISTEN restricts the match to the listener; a plain "-i :8080" also
# returns client processes that merely have a connection to that port.
# SIGTERM is sent first so the process can shut down cleanly; SIGKILL is only
# used on PIDs that are still alive after the grace period. Nothing is run
# when no listener is found.
mapfile -t pids < <(lsof -t -iTCP:8080 -sTCP:LISTEN)
if (( ${#pids[@]} > 0 )); then
  kill "${pids[@]}"
  sleep 5
  for pid in "${pids[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid"
    fi
  done
fi

# Or change the configuration to use a different port
vim /opt/openclaw/conf/application.yml
# edit server.port

配置错误

# Validate the configuration syntax
/opt/openclaw/bin/openclaw-server.sh validate

# Inspect the last 100 lines of the startup log
tail -100 /opt/openclaw/logs/openclaw-server.log

# Common configuration problems
# 1. Malformed YAML - check the indentation
# 2. Wrong database connection string
# 3. Environment variables not set

权限问题

# Fix ownership and modes of the installation directory
chown -R openclaw:openclaw /opt/openclaw
chmod 755 /opt/openclaw/bin/*.sh
chmod 644 /opt/openclaw/conf/*.yml

# Fix ownership and mode of the log directory
chown -R openclaw:openclaw /opt/openclaw/logs
chmod 755 /opt/openclaw/logs

数据库问题修复

连接失败

# Test the connection
mysql -h localhost -u openclaw -p -e "SELECT 1"

# Reset the password of the application user
mysql -u root -p << 'EOF'
ALTER USER 'openclaw'@'%' IDENTIFIED BY 'new_password';
FLUSH PRIVILEGES;
EOF

# Show the grants of the application user
mysql -u root -p -e "SHOW GRANTS FOR 'openclaw'@'%'"

# Re-grant full privileges on the openclaw schema
mysql -u root -p << 'EOF'
GRANT ALL PRIVILEGES ON openclaw.* TO 'openclaw'@'%';
FLUSH PRIVILEGES;
EOF

表损坏修复

# Check all tables of the openclaw database
mysqlcheck -u root -p --check openclaw

# Repair all tables
# NOTE(review): --repair presumably only works for storage engines that
# support REPAIR TABLE - confirm the engine of these tables before relying on it.
mysqlcheck -u root -p --repair openclaw

# Repair specific tables (task, execution)
mysqlcheck -u root -p --repair openclaw task
mysqlcheck -u root -p --repair openclaw execution

数据不一致修复

# Clean up zombie tasks: mark as FAILED every task that is still RUNNING,
# whose worker is not ONLINE, and that has not been updated for over an hour
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'FAILED',
  error_message = 'Worker offline - auto recovered'
WHERE status = 'RUNNING'
  AND worker_id NOT IN (SELECT id FROM worker WHERE status = 'ONLINE')
  AND update_time < DATE_SUB(NOW(), INTERVAL 1 HOUR);
EOF

# Reset stuck scheduling: tasks left in ASSIGNED for more than 10 minutes go
# back to PENDING with no worker attached
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'PENDING', worker_id = NULL
WHERE status = 'ASSIGNED'
  AND update_time < DATE_SUB(NOW(), INTERVAL 10 MINUTE);
EOF

Redis 问题修复

连接问题

# Test the connection
redis-cli -h localhost ping

# Restart Redis
systemctl restart redis

# Inspect the last 100 lines of the Redis log
tail -100 /var/log/redis/redis-server.log

内存问题

# Inspect memory usage
redis-cli info memory

# Delete the task-log keys. This removes EVERY key matching the pattern, not
# only expired ones.
# -r skips the redis-cli call entirely when the scan returns no keys (a bare
# "redis-cli del" is an error); -d '\n' treats each output line as one key so
# keys containing spaces or quotes are passed through intact (GNU xargs).
redis-cli --scan --pattern "openclaw:task:log:*" | xargs -r -d '\n' redis-cli del

# Set the memory limit and the eviction policy
# NOTE(review): CONFIG SET presumably only affects the running instance -
# confirm the same values are also written to redis.conf.
redis-cli config set maxmemory 2gb
redis-cli config set maxmemory-policy allkeys-lru

数据恢复

# Restore from an RDB file: stop Redis, put the backup in place as dump.rdb,
# hand ownership to the redis user, then start Redis again
systemctl stop redis
cp /backup/redis.rdb /var/lib/redis/dump.rdb
chown redis:redis /var/lib/redis/dump.rdb
systemctl start redis

# Verify the restore by listing the keyspace statistics
redis-cli info keyspace

Worker 故障修复

Worker 无法注册

# Check network connectivity to the server on port 9090
telnet openclaw-server 9090
nc -zv openclaw-server 9090

# Check the firewall
iptables -L -n | grep 9090
firewall-cmd --list-ports

# Open the port permanently and reload the firewall rules
firewall-cmd --add-port=9090/tcp --permanent
firewall-cmd --reload

# Show the server/host/port settings in the worker configuration
grep -E "server|host|port" /opt/openclaw/conf/worker.yml

Worker 频繁离线

# Tune the heartbeat settings
vim /opt/openclaw/conf/worker.yml
# Increase the heartbeat interval and timeout
# heartbeat:
#   interval: 10000
#   timeout: 60000

# Check system resources
# pgrep -d, joins the PIDs with commas, which is the list format top -p
# expects; the default newline-separated output breaks as soon as more than
# one worker process matches.
top -p "$(pgrep -d, -f openclaw-worker)"
free -h

# Check network stability (summary lines of 100 pings)
ping -c 100 openclaw-server | tail -5

Worker 任务堆积

# Show the load of each worker (name, running tasks, task limit)
curl -s http://localhost:8080/api/workers | jq '.[] | {name, runningTasks, maxTasks}'

# Increase the worker thread count
vim /opt/openclaw/conf/worker.yml
# threads: 16

# Or scale out to 5 worker containers
docker-compose up -d --scale openclaw-worker=5

任务故障修复

批量重试失败任务

# Retry through the API: all FAILED tasks inside the given time window
curl -X POST http://localhost:8080/api/tasks/batch-retry \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${TOKEN}" \
  -d '{"status": "FAILED", "startTime": "2024-01-15T00:00:00", "endTime": "2024-01-15T23:59:59"}'

# Reset through the database: FAILED tasks created in the window go back to
# PENDING with the retry counter cleared and no worker attached
mysql -u root -p openclaw << 'EOF'
UPDATE task SET status = 'PENDING', retry_count = 0, worker_id = NULL
WHERE status = 'FAILED'
  AND create_time BETWEEN '2024-01-15 00:00:00' AND '2024-01-15 23:59:59';
EOF

清理过期任务

# Purge execution records older than 30 days
mysql -u root -p openclaw << 'EOF'
DELETE FROM execution
WHERE create_time < DATE_SUB(NOW(), INTERVAL 30 DAY);
EOF

# Delete task logs older than 7 days
find /opt/openclaw/logs/tasks -name "*.log" -mtime +7 -delete

终止卡住的任务

# Kill a single task through the API (task id 12345 as an example)
curl -X POST http://localhost:8080/api/tasks/12345/kill \
  -H "Authorization: Bearer ${TOKEN}"

# Kill several tasks in one request
curl -X POST http://localhost:8080/api/tasks/batch-kill \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${TOKEN}" \
  -d '{"taskIds": ["12345", "12346", "12347"]}'

高可用运维

主备切换

# Show the current leader node
curl -s http://localhost:8080/api/cluster/leader | jq .

# Manually transfer leadership (planned maintenance)
# The body is JSON, so declare it explicitly: with -d alone curl sends
# "application/x-www-form-urlencoded". This also matches the other JSON POST
# calls in this guide.
curl -X POST http://localhost:8080/api/admin/cluster/transfer-leader \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${TOKEN}" \
  -d '{"targetNode": "node-2"}'

# Check the result of the transfer
curl -s http://localhost:8080/api/cluster/leader | jq .

节点上下线

# Gracefully take a node offline (stop accepting new tasks, let running ones finish)
curl -X POST http://localhost:8080/api/admin/nodes/node-1/drain \
  -H "Authorization: Bearer ${TOKEN}"

# Check the node status
curl -s http://localhost:8080/api/admin/nodes/node-1/status | jq .

# Bring the node back online
curl -X POST http://localhost:8080/api/admin/nodes/node-1/resume \
  -H "Authorization: Bearer ${TOKEN}"

滚动升级

# Kubernetes rolling upgrade: point the server deployment at the new image,
# then wait for the rollout to finish
kubectl set image deployment/openclaw-server \
  openclaw-server=openclaw/openclaw-server:v2.0.0 -n openclaw

kubectl rollout status deployment/openclaw-server -n openclaw

# Docker Compose upgrade: pull new images, then recreate the worker and the
# server one after the other without touching their dependencies
docker-compose pull
docker-compose up -d --no-deps openclaw-worker
docker-compose up -d --no-deps openclaw-server

# Verify the running version after the upgrade
curl -s http://localhost:8080/api/version | jq .

监控告警

Prometheus 监控配置

# prometheus.yml
# Scrape job "openclaw": collects metrics from openclaw-server:8080 at the
# /actuator/prometheus path.
scrape_configs:
  - job_name: 'openclaw'
    static_configs:
      - targets: ['openclaw-server:8080']
    metrics_path: '/actuator/prometheus'

告警规则

# alerting_rules.yml
# Alert rules for the "openclaw" group.
groups:
  - name: openclaw
    rules:
      # Critical: the "openclaw" scrape target has reported up == 0 for 1 minute.
      - alert: OpenClawServerDown
        expr: up{job="openclaw"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "OpenClaw Server 宕机"

      # Warning: fewer than one worker online for 2 minutes.
      - alert: OpenClawWorkerOffline
        expr: openclaw_worker_online_total < 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "所有 Worker 离线"

      # Warning: task queue larger than 1000 entries for 5 minutes.
      - alert: OpenClawTaskQueueHigh
        expr: openclaw_task_queue_size > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "任务队列堆积"

      # Warning: ratio of the 5-minute failed-task rate to the 5-minute
      # completed-task rate stays above 0.1 for 5 minutes.
      - alert: OpenClawTaskFailRateHigh
        expr: rate(openclaw_task_failed_total[5m]) / rate(openclaw_task_completed_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "任务失败率过高"

运维脚本

一键健康检查

#!/bin/bash
# health_check.sh - one-shot health report for an OpenClaw host: systemd
# units, API health endpoint, MySQL and Redis connectivity, disk and memory.
#
# Environment:
#   DB_PASSWORD  password of MySQL user "openclaw" (treated as empty if unset)
#
# "set -e" is deliberately not used: every check must run and be reported
# even when an earlier one fails.

#######################################
# Print "<label>: 运行中" when the systemd unit is active, otherwise
# "<label>: 已停止".
# Arguments: $1 - label to print, $2 - systemd unit name
# Outputs:   one status line on stdout
#######################################
report_unit() {
  local label=$1
  local unit=$2
  # --quiet suppresses systemctl's own "active"/"inactive" line so only the
  # labelled status is printed; if/else avoids the "a && b || c" pitfall.
  if systemctl is-active --quiet "$unit"; then
    echo "$label: 运行中"
  else
    echo "$label: 已停止"
  fi
}

echo "=== OpenClaw 健康检查 ==="
echo "时间: $(date)"
echo ""

# Service status
echo "1. 服务状态"
report_unit "Server" openclaw-server
report_unit "Worker" openclaw-worker
echo ""

# API health (time-bounded so a hung server cannot stall the report)
echo "2. API 健康"
curl -s --max-time 10 http://localhost:8080/api/health | jq .
echo ""

# Database connectivity
# The password goes through MYSQL_PWD instead of -p<password>: it stays out of
# the process list, and an unset DB_PASSWORD no longer turns into a bare "-p"
# that blocks on an interactive password prompt.
echo "3. 数据库连接"
if MYSQL_PWD="${DB_PASSWORD:-}" mysql -u openclaw -e "SELECT 1" > /dev/null 2>&1; then
  echo "MySQL: 正常"
else
  echo "MySQL: 异常"
fi
echo ""

# Redis connectivity
echo "4. Redis 连接"
if redis-cli ping > /dev/null 2>&1; then
  echo "Redis: 正常"
else
  echo "Redis: 异常"
fi
echo ""

# Disk space
echo "5. 磁盘空间"
df -h /opt/openclaw
echo ""

# Memory usage
echo "6. 内存使用"
free -h
echo ""

echo "=== 检查完成 ==="

运维检查清单

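| 检查项 | 频率 | 命令/操作 |
| --- | --- | --- |
| 服务状态 | 每分钟 | 健康检查 API |
| 日志错误 | 每小时 | 检查 error.log |
| 磁盘空间 | 每天 | df -h |
| 数据库备份 | 每天 | 备份脚本 |
| 日志清理 | 每周 | 日志轮转 |
| 配置备份 | 每周 | 配置备份脚本 |
| 安全更新 | 每月 | 版本检查和升级 |
| 性能分析 | 每月 | 监控指标分析 |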

Source

git clone https://github.com/chaterm/terminal-skills.git

View on GitHub: https://github.com/chaterm/terminal-skills/blob/main/openclaw/maintenance/SKILL.md

Overview

这是 OpenClaw 的日常运维、备份与恢复、故障修复以及高可用管理的完整指南。你将学习通过 Systemd、脚本、Docker Compose 或 Kubernetes 来管理服务,执行健康检查、日志轮转、数据库与配置备份,以及在故障情况下的快速诊断与修复流程。

How This Skill Works

该技能通过一套可执行的命令、脚本和配置模板,覆盖服务管理、健康监控、日志管理、备份与恢复,以及故障修复步骤。你可以使用 systemctl、/opt/openclaw/bin/openclaw-server.sh、docker-compose、kubectl 等工具来控制服务,以及 mysqldump、redis-cli 等进行数据备份与修复操作。日志轮转、备份脚本与完整备份流程均在技能中给出具体实现,便于你在真实环境中落地执行。

When to Use It

  • 需要对 OpenClaw 实例进行日常运维与状态监控时
  • 服务不可用或健康端点返回异常时进行故障排查与修复时
  • 需要定期备份数据库、配置及关键数据并具备恢复能力时
  • 需要针对日志进行轮转、清理与归档以控制磁盘使用时
  • 需要扩展或修复 Worker 集群、任务队列及相关组件时

Quick Start

  1. 选择部署方式并验证环境:确定你要使用 Systemd、脚本方式、Docker Compose 还是 Kubernetes 部署;确保 OpenClaw 实例可访问并具备必要的权限。
  2. 执行健康检查以确认当前状态:curl -s http://localhost:8080/api/health 等端点,必要时查看集群健康:curl -s http://localhost:8080/api/cluster/health。
  3. 配置并执行日志与备份:设置日志轮转(如 /etc/logrotate.d/openclaw)、运行数据库备份脚本或 full_backup.sh,确保备份定期执行并可恢复。
  4. 在出现故障时参考故障修复章节逐步定位并修复:查看日志、检查端口、验证权限、必要时进行数据库/Redis/Worker 的修复与恢复。

Best Practices

  • 始终在受控环境中测试变更(端口、配置、扩缩容等)后再在生产环境落地
  • 对关键组件实施滚动重启与最小化停机时间的策略(如 Kubernetes 的滚动更新或 Docker Swarm/Compose 的分阶段重启)
  • 对备份设定明确的保留策略(至少 7–30 天)并定期执行恢复演练
  • 将健康检查端点与告警系统紧密整合,确保服务健康异常时能触发告警并进入自动化修复流程
  • 定期清理日志、过期备份与历史记录,避免磁盘占用过高并影响性能

Example Use Cases

  • 使用 Systemd 管理 OpenClaw 服务:systemctl start/stop/restart/status,与 openclaw-server.sh 脚本组合使用实现不同场景的启动方式
  • 实现定期数据库备份与冷备:mysqldump + gzip,结合全量 + 增量策略及 7 天以上的备份清理
  • 日志轮转与归档:配置 /etc/logrotate.d/openclaw,定期归档日志至 /backup 目录并实现日志压缩与重载服务
  • 故障修复流程演练:端口被占用时自动切换端口、修复配置错误、修复权限问题并重新启动服务
  • Redis 与任务队列的恢复演练:在 Redis 出现内存瓶颈或连接问题时进行重启、内存调优和数据恢复,并对 Worker 的注册和心跳进行排错与优化

Frequently Asked Questions

Add this skill to your agents

Related Skills

Sponsor this space

Reach thousands of developers