# Python Ops Automation in Practice: 10 Ready-to-Use Scripts
Ops work is full of repetitive tasks, and a few Python scripts can save a lot of time. This post collects 10 scripts I actually use day to day; you can copy them and use them as-is.
## Script 1: Run Commands on Multiple Servers over SSH
```python
#!/usr/bin/env python3
"""Run a command on multiple servers in parallel."""
import paramiko
from concurrent.futures import ThreadPoolExecutor

SERVERS = [
    {"host": "192.168.1.10", "user": "root", "password": "xxx"},
    {"host": "192.168.1.11", "user": "root", "password": "xxx"},
    {"host": "192.168.1.12", "user": "root", "password": "xxx"},
]

def run_command(server, command):
    """Execute a command on a single server."""
    try:
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(
            hostname=server["host"],
            username=server["user"],
            password=server["password"],
            timeout=10
        )
        stdin, stdout, stderr = ssh.exec_command(command)
        result = stdout.read().decode().strip()
        error = stderr.read().decode().strip()
        ssh.close()
        return {
            "host": server["host"],
            "success": True,
            "output": result,
            "error": error
        }
    except Exception as e:
        return {
            "host": server["host"],
            "success": False,
            "error": str(e)
        }

def batch_run(command):
    """Run the command on all servers concurrently."""
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(run_command, s, command) for s in SERVERS]
        results = [f.result() for f in futures]
    for r in results:
        print(f"\n{'='*50}")
        print(f"Host: {r['host']}")
        if r['success']:
            print(f"Output:\n{r['output']}")
        else:
            print(f"Error: {r['error']}")

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python batch_ssh.py 'command'")
        sys.exit(1)
    batch_run(sys.argv[1])
```
Usage:

```bash
python batch_ssh.py "df -h"
python batch_ssh.py "systemctl status nginx"
```
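Hard-coding root passwords works for a quick test, but SSH keys are the safer default. A minimal sketch of the change, assuming a private key at `/root/.ssh/id_rsa` (adjust the path for your environment):

```python
import paramiko

def connect_with_key(server, key_path="/root/.ssh/id_rsa"):
    """Open an SSH connection using a private key (key path is an assumption)."""
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(
        hostname=server["host"],
        username=server["user"],
        key_filename=key_path,  # hypothetical key location; change as needed
        timeout=10
    )
    return ssh
```

With keys in place, the `SERVERS` entries no longer need a `password` field.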
## Script 2: Server Resource Monitoring
```python
#!/usr/bin/env python3
"""Monitor CPU, memory, and disk usage on a server."""
import psutil
import time
from datetime import datetime

def get_system_info():
    """Collect current system resource usage."""
    # CPU
    cpu_percent = psutil.cpu_percent(interval=1)
    cpu_count = psutil.cpu_count()
    # Memory
    memory = psutil.virtual_memory()
    mem_total = memory.total / (1024**3)  # GB
    mem_used = memory.used / (1024**3)
    mem_percent = memory.percent
    # Disk
    disk = psutil.disk_usage('/')
    disk_total = disk.total / (1024**3)
    disk_used = disk.used / (1024**3)
    disk_percent = disk.percent
    # Network (cumulative counters since boot)
    net = psutil.net_io_counters()
    bytes_sent = net.bytes_sent / (1024**2)  # MB
    bytes_recv = net.bytes_recv / (1024**2)
    return {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "cpu_percent": cpu_percent,
        "cpu_count": cpu_count,
        "mem_total_gb": round(mem_total, 2),
        "mem_used_gb": round(mem_used, 2),
        "mem_percent": mem_percent,
        "disk_total_gb": round(disk_total, 2),
        "disk_used_gb": round(disk_used, 2),
        "disk_percent": disk_percent,
        "net_sent_mb": round(bytes_sent, 2),
        "net_recv_mb": round(bytes_recv, 2),
    }

def monitor(interval=5, alert_cpu=80, alert_mem=80, alert_disk=90):
    """Poll continuously and print alerts when thresholds are exceeded."""
    print("Monitoring started... (Ctrl+C to quit)")
    while True:
        info = get_system_info()
        # Print a snapshot
        print(f"\n[{info['time']}]")
        print(f"CPU: {info['cpu_percent']}% ({info['cpu_count']} cores)")
        print(f"Memory: {info['mem_used_gb']}/{info['mem_total_gb']}GB ({info['mem_percent']}%)")
        print(f"Disk: {info['disk_used_gb']}/{info['disk_total_gb']}GB ({info['disk_percent']}%)")
        # Threshold checks
        alerts = []
        if info['cpu_percent'] > alert_cpu:
            alerts.append(f"⚠️ High CPU usage: {info['cpu_percent']}%")
        if info['mem_percent'] > alert_mem:
            alerts.append(f"⚠️ High memory usage: {info['mem_percent']}%")
        if info['disk_percent'] > alert_disk:
            alerts.append(f"⚠️ High disk usage: {info['disk_percent']}%")
        for alert in alerts:
            print(alert)
            # Notification logic (webhook, e-mail, etc.) could go here
        time.sleep(interval)

if __name__ == "__main__":
    monitor()
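```

The monitor only prints alerts to stdout. As one possible implementation of the notification hook mentioned in the comment, here is a sketch that pushes alerts to a webhook; the URL is a placeholder, and the JSON payload format depends on your chat tool (Slack, DingTalk, etc.):

```python
import requests

WEBHOOK_URL = "https://example.com/webhook"  # placeholder; use your own endpoint

def send_alert(message):
    """POST an alert to a webhook; the payload shape is tool-specific."""
    try:
        requests.post(WEBHOOK_URL, json={"text": message}, timeout=5)
    except requests.RequestException as e:
        print(f"Failed to send alert: {e}")
```

Call `send_alert(alert)` inside the `for alert in alerts:` loop.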
## Script 3: Log Analysis and Statistics
```python
#!/usr/bin/env python3
"""Analyze an Nginx access log."""
import re
from collections import Counter
from datetime import datetime

def analyze_nginx_log(log_file, top_n=10):
    """Summarize IPs, URLs, status codes, and hourly traffic."""
    ip_counter = Counter()
    url_counter = Counter()
    status_counter = Counter()
    hour_counter = Counter()
    # Regex for the default "combined" Nginx log format
    pattern = r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'
    with open(log_file, 'r') as f:
        for line in f:
            match = re.match(pattern, line)
            if match:
                ip = match.group(1)
                time_str = match.group(2)
                request = match.group(3)
                status = match.group(4)
                ip_counter[ip] += 1
                status_counter[status] += 1
                # Extract the URL from "METHOD /path HTTP/1.1"
                if request:
                    parts = request.split()
                    if len(parts) >= 2:
                        url_counter[parts[1]] += 1
                # Extract the hour of the request
                try:
                    dt = datetime.strptime(time_str.split()[0], "%d/%b/%Y:%H:%M:%S")
                    hour_counter[dt.hour] += 1
                except ValueError:
                    pass
    # Print the summary
    print("=" * 60)
    print(f"Log file: {log_file}")
    print(f"Total requests: {sum(ip_counter.values())}")
    print("=" * 60)
    print(f"\n📊 Top {top_n} IPs:")
    for ip, count in ip_counter.most_common(top_n):
        print(f"  {ip}: {count}")
    print(f"\n📊 Top {top_n} URLs:")
    for url, count in url_counter.most_common(top_n):
        print(f"  {url}: {count}")
    print(f"\n📊 Status code distribution:")
    for status, count in sorted(status_counter.items()):
        print(f"  {status}: {count}")
    print(f"\n📊 Requests per hour:")
    for hour in range(24):
        count = hour_counter.get(hour, 0)
        bar = '█' * (count // 100)  # one block per 100 requests
        print(f"  {hour:02d}:00 | {bar} {count}")

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python log_analyzer.py /var/log/nginx/access.log")
        sys.exit(1)
    analyze_nginx_log(sys.argv[1])
```
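Rotated logs are usually gzip-compressed (`access.log.1.gz` and so on). A small assumed extension that lets the same analyzer read both plain and compressed files:

```python
import gzip

def open_log(log_file):
    """Open a plain or gzip-compressed log file in text mode."""
    if log_file.endswith(".gz"):
        return gzip.open(log_file, "rt")
    return open(log_file, "r")
```

Then replace `open(log_file, 'r')` in `analyze_nginx_log` with `open_log(log_file)`.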
## Script 4: File Backup
```python
#!/usr/bin/env python3
"""Automatically back up a directory."""
import os
import tarfile
import time
from datetime import datetime

def backup(source_dir, backup_dir, keep_days=7):
    """Archive source_dir into backup_dir and prune old archives."""
    # Make sure the backup directory exists
    os.makedirs(backup_dir, exist_ok=True)
    # Timestamped archive name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_name = f"backup_{timestamp}.tar.gz"
    backup_path = os.path.join(backup_dir, backup_name)
    print(f"Backing up: {source_dir}")
    print(f"Target: {backup_path}")
    # Create the compressed archive
    with tarfile.open(backup_path, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))
    # Report the archive size
    size_mb = os.path.getsize(backup_path) / (1024 * 1024)
    print(f"Backup finished! Size: {size_mb:.2f} MB")
    # Prune old backups
    clean_old_backups(backup_dir, keep_days)
    return backup_path

def clean_old_backups(backup_dir, keep_days):
    """Delete backups older than keep_days."""
    now = time.time()
    cutoff = now - (keep_days * 86400)
    for filename in os.listdir(backup_dir):
        filepath = os.path.join(backup_dir, filename)
        if os.path.isfile(filepath) and filename.startswith("backup_"):
            if os.path.getmtime(filepath) < cutoff:
                os.remove(filepath)
                print(f"Deleted old backup: {filename}")

if __name__ == "__main__":
    # Configuration
    SOURCE = "/var/www/html"
    BACKUP_DIR = "/data/backups"
    KEEP_DAYS = 7
    backup(SOURCE, BACKUP_DIR, KEEP_DAYS)
```
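If the source directory contains things you don't want archived (caches, VCS metadata), `tarfile.add` accepts a `filter` callable: return `None` from it to skip an entry. A sketch; the excluded names below are my own assumptions:

```python
EXCLUDE_NAMES = {"__pycache__", ".git", "node_modules"}  # adjust to taste

def exclude_filter(tarinfo):
    """Skip archive entries under any excluded directory name."""
    if any(part in EXCLUDE_NAMES for part in tarinfo.name.split("/")):
        return None
    return tarinfo

# In backup(): tar.add(source_dir, arcname=..., filter=exclude_filter)
```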
## Script 5: Port Scanning
```python
#!/usr/bin/env python3
"""Scan a host for open TCP ports."""
import socket
from concurrent.futures import ThreadPoolExecutor

def scan_port(host, port, timeout=1):
    """Try to connect to a single port; return it if open."""
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        result = sock.connect_ex((host, port))
        sock.close()
        return port if result == 0 else None
    except OSError:
        return None

def scan_host(host, port_range=(1, 1024), workers=100):
    """Scan a port range on a host concurrently."""
    print(f"Scanning {host} ports {port_range[0]}-{port_range[1]}...")
    open_ports = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(scan_port, host, port): port
            for port in range(port_range[0], port_range[1] + 1)
        }
        for future in futures:
            result = future.result()
            if result:
                open_ports.append(result)
    open_ports.sort()
    print(f"\nOpen ports ({len(open_ports)} found):")
    for port in open_ports:
        service = get_service_name(port)
        print(f"  {port}/tcp {service}")
    return open_ports

def get_service_name(port):
    """Map well-known ports to service names."""
    services = {
        21: "ftp", 22: "ssh", 23: "telnet", 25: "smtp",
        53: "dns", 80: "http", 110: "pop3", 143: "imap",
        443: "https", 3306: "mysql", 5432: "postgresql",
        6379: "redis", 8080: "http-proxy", 27017: "mongodb"
    }
    return services.get(port, "unknown")

if __name__ == "__main__":
    import sys
    host = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
    scan_host(host)
```
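Even with 100 workers, scanning ports 1-1024 takes a while. When you only care about the usual suspects, scanning a fixed list is much faster. A sketch that reuses `scan_port` and `get_service_name` from the script above:

```python
COMMON_PORTS = [21, 22, 25, 53, 80, 110, 143, 443, 3306, 5432, 6379, 8080, 27017]

def scan_common(host, workers=20):
    """Scan only a fixed list of common service ports."""
    with ThreadPoolExecutor(max_workers=workers) as executor:
        results = executor.map(lambda p: scan_port(host, p), COMMON_PORTS)
    open_ports = sorted(p for p in results if p)
    for port in open_ports:
        print(f"  {port}/tcp {get_service_name(port)}")
    return open_ports
```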
## Script 6: MySQL Database Backup
```python
#!/usr/bin/env python3
"""Back up MySQL databases with mysqldump."""
import os
import subprocess
from datetime import datetime

def backup_mysql(host, user, password, database, backup_dir):
    """Dump a single database to a gzip-compressed SQL file."""
    os.makedirs(backup_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{database}_{timestamp}.sql.gz"
    filepath = os.path.join(backup_dir, filename)
    # Build the mysqldump command (note: the password is visible in `ps`;
    # see the option-file variant after this script for a safer approach)
    cmd = f"mysqldump -h {host} -u {user} -p'{password}' {database} | gzip > {filepath}"
    print(f"Backing up database: {database}")
    try:
        subprocess.run(cmd, shell=True, check=True)
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        print(f"Backup succeeded: {filepath} ({size_mb:.2f} MB)")
        return filepath
    except subprocess.CalledProcessError as e:
        print(f"Backup failed: {e}")
        return None

def backup_all_databases(host, user, password, backup_dir):
    """Dump all databases into one archive."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"all_databases_{timestamp}.sql.gz"
    filepath = os.path.join(backup_dir, filename)
    cmd = f"mysqldump -h {host} -u {user} -p'{password}' --all-databases | gzip > {filepath}"
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Full backup succeeded: {filepath}")
        return filepath
    except subprocess.CalledProcessError as e:
        print(f"Backup failed: {e}")
        return None

if __name__ == "__main__":
    # Configuration
    HOST = "localhost"
    USER = "root"
    PASSWORD = "your_password"
    DATABASE = "mydb"
    BACKUP_DIR = "/data/mysql_backups"
    backup_mysql(HOST, USER, PASSWORD, DATABASE, BACKUP_DIR)
```
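Passing the password with `-p'...'` exposes it in `ps` output and shell history. `mysqldump` can instead read credentials from an option file via `--defaults-extra-file` (which must come before other options). A sketch of that variant; the temp-file handling is my own addition:

```python
import os
import subprocess
import tempfile

def backup_mysql_safe(host, user, password, database, filepath):
    """Dump a database without putting the password on the command line."""
    # Write credentials to a short-lived option file
    with tempfile.NamedTemporaryFile("w", suffix=".cnf", delete=False) as f:
        f.write(f"[client]\nhost={host}\nuser={user}\npassword={password}\n")
        cnf_path = f.name
    try:
        cmd = f"mysqldump --defaults-extra-file={cnf_path} {database} | gzip > {filepath}"
        subprocess.run(cmd, shell=True, check=True)
    finally:
        os.remove(cnf_path)  # never leave credentials on disk
```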
## Script 7: Process Monitoring and Auto-Restart
```python
#!/usr/bin/env python3
"""Watch a process and restart it if it dies."""
import subprocess
import time
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def check_process(name):
    """Return True if a matching process is running.

    Note: pgrep -f matches the full command line, so pick a pattern
    that won't also match this monitor script itself.
    """
    try:
        result = subprocess.run(
            ["pgrep", "-f", name],
            capture_output=True,
            text=True
        )
        return result.returncode == 0
    except Exception:
        return False

def start_process(command):
    """Start the process via its start command."""
    try:
        subprocess.Popen(command, shell=True)
        return True
    except Exception as e:
        logging.error(f"Failed to start: {e}")
        return False

def monitor(process_name, start_command, interval=30):
    """Check the process periodically and restart it when it's down."""
    logging.info(f"Monitoring process: {process_name}")
    while True:
        if not check_process(process_name):
            logging.warning(f"Process {process_name} is not running, starting it...")
            if start_process(start_command):
                logging.info("Process started")
                # Notification logic could go here
            else:
                logging.error("Failed to start the process")
        else:
            logging.debug(f"Process {process_name} is running normally")
        time.sleep(interval)

if __name__ == "__main__":
    # Configuration
    PROCESS_NAME = "my-service"
    START_COMMAND = "systemctl start my-service"
    CHECK_INTERVAL = 30
    monitor(PROCESS_NAME, START_COMMAND, CHECK_INTERVAL)
```
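One gap in `monitor()`: if the service is crash-looping, it will be restarted every 30 seconds forever. A sketch of a variant that gives up after several consecutive failed starts (the limit of 5 is an arbitrary choice); it reuses `check_process` and `start_process` from above:

```python
def monitor_with_limit(process_name, start_command, interval=30, max_restarts=5):
    """Like monitor(), but stop after too many consecutive failed starts."""
    failures = 0
    while failures < max_restarts:
        if not check_process(process_name):
            logging.warning(f"Process {process_name} is down, restarting...")
            if start_process(start_command):
                failures = 0  # reset on a successful start
            else:
                failures += 1
        time.sleep(interval)
    logging.error(f"Giving up after {max_restarts} failed restarts")
```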
## Script 8: Batch File Modification
```python
#!/usr/bin/env python3
"""Batch-replace text across files."""
import os
import re

def replace_in_file(filepath, old_text, new_text, use_regex=False):
    """Replace text in one file; return True if it changed."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    if use_regex:
        new_content = re.sub(old_text, new_text, content)
    else:
        new_content = content.replace(old_text, new_text)
    if content != new_content:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(new_content)
        return True
    return False

def batch_replace(directory, old_text, new_text, extensions=None, use_regex=False):
    """Walk a directory and apply the replacement to matching files."""
    if extensions is None:
        extensions = ['.py', '.txt', '.conf', '.yml', '.yaml', '.json']
    modified_files = []
    for root, dirs, files in os.walk(directory):
        # Skip hidden directories
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for filename in files:
            if any(filename.endswith(ext) for ext in extensions):
                filepath = os.path.join(root, filename)
                try:
                    if replace_in_file(filepath, old_text, new_text, use_regex):
                        modified_files.append(filepath)
                        print(f"Modified: {filepath}")
                except Exception as e:
                    print(f"Skipping {filepath}: {e}")
    print(f"\nModified {len(modified_files)} file(s) in total")
    return modified_files

if __name__ == "__main__":
    import sys
    if len(sys.argv) < 4:
        print("Usage: python batch_replace.py <directory> <old_text> <new_text>")
        sys.exit(1)
    batch_replace(sys.argv[1], sys.argv[2], sys.argv[3])
```
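Before rewriting files in bulk, it's worth previewing what would change. A minimal dry-run variant to drop into the same script (plain-text matching only, as an assumption):

```python
def batch_preview(directory, old_text, extensions=None):
    """List files containing old_text without modifying anything."""
    if extensions is None:
        extensions = ['.py', '.txt', '.conf', '.yml', '.yaml', '.json']
    for root, dirs, files in os.walk(directory):
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for filename in files:
            if not any(filename.endswith(ext) for ext in extensions):
                continue
            filepath = os.path.join(root, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    if old_text in f.read():
                        print(f"Would modify: {filepath}")
            except (UnicodeDecodeError, OSError) as e:
                print(f"Skipping {filepath}: {e}")
```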
## Script 9: SSL Certificate Expiry Check
```python
#!/usr/bin/env python3
"""Check SSL certificate expiry dates."""
import ssl
import socket
from datetime import datetime

def check_ssl_cert(domain, port=443):
    """Fetch and inspect a domain's SSL certificate."""
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, port), timeout=10) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as ssock:
                cert = ssock.getpeercert()
                # notAfter is in GMT, so compare against UTC
                expire_date = datetime.strptime(cert['notAfter'], '%b %d %H:%M:%S %Y %Z')
                days_left = (expire_date - datetime.utcnow()).days
                return {
                    "domain": domain,
                    "issuer": dict(x[0] for x in cert['issuer']).get('organizationName', 'Unknown'),
                    "expire_date": expire_date.strftime("%Y-%m-%d"),
                    "days_left": days_left,
                    "status": "OK" if days_left > 30 else ("WARNING" if days_left > 7 else "CRITICAL")
                }
    except Exception as e:
        return {
            "domain": domain,
            "error": str(e),
            "status": "ERROR"
        }

def check_domains(domains):
    """Check a list of domains and print a summary table."""
    print(f"{'Domain':<30} {'Expires':<15} {'Days left':<10} {'Status'}")
    print("=" * 70)
    for domain in domains:
        result = check_ssl_cert(domain)
        if "error" in result:
            print(f"{domain:<30} {'ERROR':<15} {'-':<10} {result['error']}")
        else:
            status_icon = "✅" if result['status'] == 'OK' else ("⚠️" if result['status'] == 'WARNING' else "❌")
            print(f"{domain:<30} {result['expire_date']:<15} {result['days_left']:<10} {status_icon}")

if __name__ == "__main__":
    domains = [
        "google.com",
        "github.com",
        "baidu.com",
        # add the domains you want to monitor
    ]
    check_domains(domains)
```
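To run this from cron and only hear about problems, one option is to exit non-zero whenever any certificate is not OK; cron then mails you the output. A sketch, reusing `check_ssl_cert` from above:

```python
import sys

def check_and_exit(domains):
    """Exit 1 if any domain is WARNING/CRITICAL/ERROR, so cron can flag it."""
    results = [check_ssl_cert(d) for d in domains]
    bad = [r for r in results if r["status"] != "OK"]
    for r in bad:
        print(f"{r['domain']}: {r.get('error', r['status'])}")
    sys.exit(1 if bad else 0)
```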
## Script 10: Service Health Check
```python
#!/usr/bin/env python3
"""Check the health of HTTP services."""
import requests
import time
from concurrent.futures import ThreadPoolExecutor

SERVICES = [
    {"name": "Homepage", "url": "https://example.com", "timeout": 5},
    {"name": "API service", "url": "https://api.example.com/health", "timeout": 5},
    {"name": "Admin panel", "url": "https://admin.example.com", "timeout": 5},
]

def check_service(service):
    """Check a single service and measure its response time."""
    try:
        start = time.time()
        resp = requests.get(
            service["url"],
            timeout=service.get("timeout", 10),
            verify=True
        )
        elapsed = (time.time() - start) * 1000  # ms
        return {
            "name": service["name"],
            "url": service["url"],
            "status": resp.status_code,
            "time_ms": round(elapsed, 2),
            "ok": 200 <= resp.status_code < 400
        }
    except requests.exceptions.Timeout:
        return {"name": service["name"], "url": service["url"], "error": "timeout", "ok": False}
    except requests.exceptions.SSLError:
        return {"name": service["name"], "url": service["url"], "error": "SSL error", "ok": False}
    except Exception as e:
        return {"name": service["name"], "url": service["url"], "error": str(e), "ok": False}

def check_all():
    """Check all services concurrently and print a summary."""
    print(f"{'Service':<20} {'Status':<10} {'Response':<15} {'Result'}")
    print("=" * 60)
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(check_service, SERVICES))
    for r in results:
        if r["ok"]:
            print(f"{r['name']:<20} {r['status']:<10} {str(r['time_ms']) + 'ms':<15} ✅")
        else:
            error = r.get("error", f"HTTP {r.get('status', 'Unknown')}")
            print(f"{r['name']:<20} {error:<10} {'-':<15} ❌")
    # Summary line
    ok_count = sum(1 for r in results if r["ok"])
    print(f"\nTotal: {ok_count}/{len(results)} services healthy")
    return results

if __name__ == "__main__":
    check_all()
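```

A transient network blip can make a healthy service look down. A simple hedge is to re-check a failed service before reporting it; the retry count and delay below are arbitrary choices, and the function reuses `check_service` from above:

```python
def check_with_retry(service, retries=2, delay=3):
    """Re-check a failing service a few times before declaring it down."""
    result = check_service(service)
    for _ in range(retries):
        if result["ok"]:
            break
        time.sleep(delay)
        result = check_service(service)
    return result
```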
## Running the Scripts on Remote Servers
Most of these scripts need to run on the servers themselves. What if the servers sit on an internal network? My approach is to use 星空组网 to join my local machine and the servers into one virtual network, then SSH in directly:

```bash
# virtual IP assigned after joining the network
ssh root@192.168.188.10 "python3 /opt/scripts/monitor.py"
```

Or use the batch runner from Script 1 to execute across multiple servers at once.
## Summary

These 10 scripts cover the main day-to-day ops scenarios:
| Script | Purpose |
|---|---|
| batch_ssh | Batch command execution |
| monitor | Resource monitoring |
| log_analyzer | Log analysis |
| backup | File backup |
| port_scan | Port scanning |
| mysql_backup | Database backup |
| process_monitor | Process supervision |
| batch_replace | Batch file edits |
| ssl_check | Certificate expiry checks |
| health_check | Service health checks |
Adapt these scripts to your own needs, or combine them. Questions welcome in the comments~
