# kubecni

#!/usr/bin/env python3

"""
Kubernetes CNI Hotfix Tool
-------------------------

Dependencies:
- docker-py: Docker API client
- psutil: Process and system utilities
- dbus-python: D-Bus interface for Python
- systemd-python: systemd integration
- python-daemon: Daemon process support
- pyinstaller: Binary packaging
- tqdm: For displaying progress bars, useful for indicating the progress of time-consuming operations like waiting for a container to restart.

To install dependencies, run:
    sudo yum install systemd-devel
    sudo yum install dbus-devel
    pip install docker tqdm psutil dbus-python systemd-python python-daemon pyinstaller

Build instructions:
    pip install pyinstaller
    pyinstaller --name=kubecni --onefile --hidden-import=systemd --hidden-import=dbus --hidden-import=shutil --hidden-import=daemon --hidden-import=psutil --hidden-import=tqdm kubecni.py

Service Type: forking (creates daemon process)

# Start a service
systemd_manager.StartUnit('my_service.service', 'replace')

# Stop a service
systemd_manager.StopUnit('my_service.service', 'replace')

# Enable a service
systemd_manager.EnableUnitFiles(['my_service.service'], False, True)

# Disable a service
systemd_manager.DisableUnitFiles(['my_service.service'], False)

# Reload the systemd manager
systemd_manager.Reload()

# List all units
units = systemd_manager.ListUnits()
for unit in units:
    print(unit)

# Get a specific unit
unit = systemd_manager.GetUnit('my_service.service')

"""

import docker
import psutil
import time
import os
import sys
import shutil
import logging
import dbus
import daemon
from daemon import pidfile
from systemd import journal, daemon as systemd_daemon
import subprocess
import socket

# Configure root logging once at import time: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FixedSizeStack:
    """Bounded buffer that keeps only the most recently pushed values.

    Despite the name, eviction is first-in-first-out: once ``size`` values
    are held, pushing another one discards the oldest.
    """

    def __init__(self, size=3):
        # Maximum number of values retained (3 unless the caller overrides it)
        self.size = size
        # Stored values, ordered oldest (index 0) to newest
        self.stack = []

    def push(self, value):
        """Append ``value``, dropping the oldest entry first when full."""
        already_full = self.size <= len(self.stack)
        if already_full:
            self.stack.pop(0)
        self.stack.append(value)

    def get_stack(self):
        """Return the underlying list (not a copy), oldest value first."""
        return self.stack

    def __len__(self):
        """Return the number of values currently held."""
        held = len(self.stack)
        return held

class FlannelMonitor:
    def __init__(self):
        """
        Open the connections the monitor works with.

        Creates a Docker client from the environment, a logger named
        'FlannelMonitor', a system D-Bus connection, and a proxy for the
        systemd Manager interface on that bus.
        """
        self.logger = logging.getLogger('FlannelMonitor')
        self.docker_client = docker.from_env()
        self.system_bus = dbus.SystemBus()

        # Proxy for the systemd manager object; used to look up units
        manager_object = self.system_bus.get_object(
            'org.freedesktop.systemd1',
            '/org/freedesktop/systemd1',
        )
        self.systemd_manager = dbus.Interface(manager_object, 'org.freedesktop.systemd1.Manager')

    def check_service(self, service_name):
        """
        Query systemd over D-Bus for the ActiveState of ``<service_name>.service``.

        Parameters:
            service_name (str): Unit name without the ".service" suffix.

        Returns:
            bool: True when ActiveState equals 'active'; False otherwise,
            including when any D-Bus call fails (the failure is logged).
        """
        unit_name = f"{service_name}.service"
        try:
            unit_path = self.systemd_manager.GetUnit(unit_name)
            # Unit properties are exposed through the generic Properties interface
            properties = dbus.Interface(
                self.system_bus.get_object('org.freedesktop.systemd1', str(unit_path)),
                'org.freedesktop.DBus.Properties',
            )
            active_state = properties.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
        except dbus.DBusException as e:
            self.logger.error(f"Error checking {service_name}: {e}")
            return False

        self.logger.info(f"{service_name} active state: {active_state}")
        return active_state == 'active'

    def check_kubelet_service(self):
        """
        Check if the kubelet, docker, and containerd services are running.
        
        Returns:
            bool: True if all services are running, False otherwise.
        """
        result = (
            self.check_service('kubelet') and
            self.check_service('docker') and
            self.check_service('containerd')
        )
        self.logger.info(f"Kubelet, Docker, and Containerd services running: {result}")
        return result

    def get_flannel_container_id(self):
        """
        Get the container ID and name of the flannel container.
        
        Returns:
            tuple: (container_id, container_name) if found, otherwise (None, None).
        """
        try:
            flannel_container = None
            pod_container = None

            containers = self.docker_client.containers.list(
                filters={'status': 'running'}
            )
            for container in containers:
                if 'k8s_kube-flannel' in container.name:
                    flannel_container = container
                elif 'k8s_POD_kube-flannel' in container.name:
                    pod_container = container

            if flannel_container and pod_container:
                self.logger.info(f"Found flannel container: {flannel_container.name}")
                return flannel_container.id, flannel_container.name
            else:
                if not flannel_container:
                    self.logger.error("No running k8s_kube-flannel container found")
                if not pod_container:
                    self.logger.error("No running k8s_POD_kube-flannel container found")
                return None, None
        except Exception as e:
            self.logger.error(f"Error getting flannel container ID: {e}")
            return None, None

    def get_flannel_pid(self):
        """
        Find the PID of the first process whose name contains 'flannel'.

        Returns:
            int: The PID of the flannel process if found, otherwise None
            (also None when the process scan itself fails; the error is logged).
        """
        try:
            for proc in psutil.process_iter(['name', 'pid']):
                # psutil stores None for an attribute it could not read
                # (e.g. access denied); treat that as "no name" so a single
                # unreadable process does not abort the whole scan.
                name = proc.info.get('name') or ''
                if 'flannel' in name:
                    self.logger.info(f"Found flannel process with PID: {proc.info['pid']}")
                    return proc.info['pid']
            self.logger.error("Flannel process not found")
            return None
        except Exception as e:
            self.logger.error(f"Error getting flannel PID: {e}")
            return None

    def check_cni0_device(self):
        """
        Check if the cni0 network device exists and log its addresses.

        Returns:
            bool: True if the cni0 device exists, otherwise False
            (also False when the interface query fails; the error is logged).
        """
        try:
            # Take a single snapshot so the existence test and the logged
            # details describe the same state of the interface table.
            cni0_info = psutil.net_if_addrs().get('cni0')
            if cni0_info is None:
                self.logger.info("cni0 device does not exist.")
                return False

            self.logger.info("cni0 device exists.")
            self.logger.info("cni0 device properties:")
            for addr in cni0_info:
                self.logger.info(f" - Address: {addr.address}, Family: {addr.family}, Netmask: {addr.netmask}, Broadcast: {addr.broadcast}")
            return True
        except Exception as e:
            self.logger.error(f"Error checking cni0 device: {e}")
            return False

    def check_cni0_ip(self):
        """
        Check if the cni0 network device has an IPv4 address.

        Returns:
            bool: True if cni0 has at least one AF_INET address, otherwise
            False (device missing, no IPv4 address, or the query failed).
        """
        try:
            # Single snapshot: avoids a lookup error if the device disappears
            # between an existence test and a second query.
            cni0_info = psutil.net_if_addrs().get('cni0')
            if cni0_info is None:
                self.logger.info("cni0 device does not exist.")
                return False

            # Only the first IPv4 entry matters for the check and the log line
            ipv4 = next((addr for addr in cni0_info if addr.family == socket.AF_INET), None)
            if ipv4 is None:
                self.logger.error("cni0 does not have an IP address configured.")
                return False

            self.logger.info(f"cni0 has an IP address: {ipv4.address}")
            return True
        except Exception as e:
            self.logger.error(f"Error checking cni0 IP configuration: {e}")
            return False

    def create_cni0_device(self, ip_suffix, netmask, broadcast_suffix):
        """
        Create the cni0 network device using the flannel binary.
        
        Parameters:
            ip_suffix (str): The last octet of the IP address for cni0.
            netmask (str): The subnet mask for the network.
            broadcast_suffix (str): The last octet of the broadcast address.
        
        Returns:
            bool: True if the cni0 device is created successfully, otherwise False.
        """
        try:
            flannel_bin_path = '/opt/cni/bin/flannel'
            if not os.path.exists(flannel_bin_path):
                self.logger.error(f"Flannel binary not found at {flannel_bin_path}")
                return False

            flannel_pid = self.get_flannel_pid()
            container_id, container_name = self.get_flannel_container_id()
            if not flannel_pid or not container_id:
                self.logger.error("Flannel process or container ID not found")
                return False

            env = os.environ.copy()
            env.update({
                'CNI_COMMAND': 'ADD',
                'CNI_CONTAINERID': container_id,
                'CNI_NETNS': f'/proc/{flannel_pid}/ns/net',
                'CNI_IFNAME': 'cni0',
                'CNI_PATH': '/opt/cni/bin'
            })

            # Log the environment variables
            self.logger.info(f"Environment variables for cni0 creation: {env}")

            config_path = '/etc/cni/net.d/10-flannel.conflist'
            if not os.path.exists(config_path):
                self.logger.error(f"Configuration file {config_path} not found")
                return False

            # Log the contents of the configuration file
            with open(config_path, 'r') as f:
                config_content = f.read()
                self.logger.info(f"Contents of {config_path}:\n{config_content}")

                result = subprocess.run(
                    [flannel_bin_path],
                    input=config_content,
                    env=env,
                    universal_newlines=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
            
            if result.returncode != 0:
                self.logger.error(f"Failed to create cni0 device: {result.stderr.strip()}")
                return False  # Return False immediately if creation fails

            self.logger.info("cni0 device created successfully")
            return True
        except Exception as e:
            self.logger.error(f"Error creating cni0 device: {e}")
            return False

    def fix_cni0_ip(self, ip_suffix, netmask, broadcast_suffix):
        """
        Fix the IP address of the cni0 network device.
        
        Parameters:
            ip_suffix (str): The last octet of the IP address for cni0.
            netmask (str): The subnet mask for the network.
            broadcast_suffix (str): The last octet of the broadcast address.
        
        Returns:
            bool: True if the IP address is set successfully, otherwise False.
        """
        try:
            subnet_env_path = '/run/flannel/subnet.env'
            if not os.path.exists(subnet_env_path):
                self.logger.error(f"Subnet configuration file {subnet_env_path} not found")
                return False

            # Log the contents of the subnet configuration file
            with open(subnet_env_path, 'r') as f:
                subnet_content = f.read()
                self.logger.info(f"Contents of {subnet_env_path}:\n{subnet_content}")

                for line in subnet_content.splitlines():
                    if line.startswith('FLANNEL_SUBNET='):
                        subnet = line.split('=')[1].strip()
                        break
                else:
                    self.logger.error("FLANNEL_SUBNET configuration not found")
                    return False

            base_ip = '.'.join(subnet.split('.')[:-1])
            cmd = f"ip addr add {base_ip}.{ip_suffix}/{netmask} brd {base_ip}.{broadcast_suffix} dev cni0"
            result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            if result.returncode == 0:
                self.logger.info("cni0 IP address set successfully")
            else:
                self.logger.error(f"Failed to set cni0 IP address: {result.stderr}")
            return result.returncode == 0
        except Exception as e:
            self.logger.error(f"Error setting cni0 IP address: {e}")
            return False
        
    def delete_cni0_ip(self):
        """
        Remove every IP address from the cni0 device via ``ip addr flush``.

        Returns:
            bool: True if the command exits with status 0, otherwise False
            (non-zero exit or an error while running it; both are logged).
        """
        try:
            flush = subprocess.run(
                "ip addr flush dev cni0",
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )

            if flush.returncode != 0:
                self.logger.error(f"Failed to delete IP address for cni0: {flush.stderr.strip()}")
                return False

            self.logger.info("Existing IP address for cni0 deleted successfully")
            return True
        except Exception as e:
            self.logger.error(f"Error deleting IP address for cni0: {e}")
            return False
        
    def monitor_loop(self, ip_suffix, netmask, broadcast_suffix, log_interval=1):
        """
        Run forever, repairing the cni0 device when flannel changes.

        Each cycle notifies systemd, sleeps log_interval seconds, and, if the
        kubelet/docker/containerd check passes, either creates a missing cni0
        device or, when the flannel PID differs from the previous one,
        flushes and re-assigns the cni0 address. Exceptions inside a cycle
        are logged and the loop continues.

        Args:
            ip_suffix (str): The last octet of the IP address for cni0.
            netmask (str): The subnet mask for the network.
            broadcast_suffix (str): The last octet of the broadcast address.
            log_interval (int): Seconds slept at the start of every cycle.
        """
        self.logger.info("CNI monitoring service started")

        # Tell systemd start-up has finished
        systemd_daemon.notify('READY=1')

        # Holds at most the two most recent flannel PIDs, oldest first
        flanneld_pids = FixedSizeStack(size=2)

        while True:
            try:
                # Keep-alive ping for systemd
                systemd_daemon.notify('WATCHDOG=1')

                self.logger.info("Monitoring CNI status...")
                time.sleep(log_interval)

                if not self.check_kubelet_service():
                    continue

                pid_now = self.get_flannel_pid()
                if pid_now:
                    flanneld_pids.push(pid_now)

                if not self.check_cni0_device():
                    self.logger.info("cni0 device not found, waiting for 60 seconds before attempting to create...")
                    time.sleep(60)

                    if self.create_cni0_device(ip_suffix, netmask, broadcast_suffix):
                        self.logger.info("cni0 device created successfully")
                    else:
                        self.logger.error("Failed to create cni0 device")
                    continue

                # A PID comparison needs two recorded samples
                if len(flanneld_pids) != 2:
                    continue

                previous_pid, latest_pid = flanneld_pids.get_stack()
                if latest_pid == previous_pid:
                    self.logger.info(f"flanneld_pids {len(flanneld_pids)}")
                    continue

                self.logger.info("Detected change in flanneld PIDs, checking cni0 device...")

                # Flush the old address first, then assign the new one
                if not self.delete_cni0_ip():
                    self.logger.error("Failed flush ip to delete existing IP address for cni0")
                elif not self.fix_cni0_ip(ip_suffix, netmask, broadcast_suffix):
                    self.logger.error("Failed to set IP address for cni0")
                else:
                    self.logger.info("cni0 IP address set successfully.")

            except Exception as e:
                self.logger.error(f"Error in monitoring loop: {e}")


def reload_systemd():
    """Ask the systemd manager, over the system D-Bus, to reload its configuration.

    Failures are logged and never raised to the caller.
    """
    try:
        manager_object = dbus.SystemBus().get_object(
            'org.freedesktop.systemd1',
            '/org/freedesktop/systemd1',
        )
        manager = dbus.Interface(manager_object, 'org.freedesktop.systemd1.Manager')
        manager.Reload()
        logging.info("Systemd manager configuration reloaded successfully.")
    except dbus.DBusException as e:
        logging.error(f"Error reloading systemd manager configuration: {e}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")

def install_service(force=False):
    """Install, enable and start the kubecni systemd service.

    Stops and disables a running instance first, copies the current
    executable to /usr/bin/kubecni, writes the unit file, then enables and
    (re)starts the unit and checks that it became active.

    Parameters:
        force (bool): Reinstall even if the unit file already exists.

    Returns:
        bool: True if the service is already installed (and force is False)
        or ends up active after installation; False on any failure.
    """
    service_name = 'kubecni'  # Define the service name
    unit_name = f'{service_name}.service'
    service_path = f'/etc/systemd/system/{service_name}.service'
    pid_file = '/var/run/kubecni.pid'
    target_binary = '/usr/bin/kubecni'

    # Reload systemd manager configuration
    logging.info("Reloading systemd manager configuration...")

    reload_systemd()

    # Check if the service is running and stop it if necessary
    try:
        if os.path.exists(service_path) and not force:
            logging.info("Service is already installed. Use --force to reinstall.")
            return True  # Return True to indicate the service is installed

        bus = dbus.SystemBus()
        systemd_manager = dbus.Interface(
            bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1'),
            'org.freedesktop.systemd1.Manager'
        )
        # LoadUnit, unlike GetUnit, also succeeds when the unit is not
        # currently loaded, so a first-time install does not log an error.
        unit = systemd_manager.LoadUnit(unit_name)
        unit_proxy = bus.get_object('org.freedesktop.systemd1', str(unit))
        unit_interface = dbus.Interface(unit_proxy, 'org.freedesktop.DBus.Properties')
        active_state = unit_interface.Get('org.freedesktop.systemd1.Unit', 'ActiveState')

        if active_state == 'active':
            logging.info("Stopping the running service before installation...")
            systemd_manager.StopUnit(unit_name, 'replace')
            # StopUnit only queues the job; give the service time to stop
            time.sleep(10)

            systemd_manager.DisableUnitFiles([unit_name], False)

            # Pause again before re-checking the state
            time.sleep(10)

            # Check again if the service is still active
            active_state = unit_interface.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
            if active_state == 'active':
                logging.error("Service could not be stopped. Please stop it manually before reinstalling.")
                return False  # Exit if the service is still active

            # Reload systemd daemon
            reload_systemd()
    except Exception as e:
        # Best effort: a failed pre-check is logged and installation proceeds
        logging.error(f"Error checking or stopping service: {e}")

    service_content = f"""[Unit]
Description=Kubernetes CNI Hotfix Monitor
After=network.target docker.service kubelet.service
Requires=docker.service kubelet.service

[Service]
Type=forking
ExecStart=/usr/bin/kubecni --daemon=true
PIDFile={pid_file}
Restart=always

[Install]
WantedBy=multi-user.target
"""

    try:
        # Remove existing PID file if it exists
        if os.path.exists(pid_file):
            os.remove(pid_file)
            logging.info(f"Removed existing PID file: {pid_file}")

        # In a PyInstaller bundle sys.executable is the binary's full path,
        # whereas sys.argv[0] may be just the command name typed by the user.
        if getattr(sys, 'frozen', False):
            source_binary = sys.executable
        else:
            source_binary = os.path.abspath(sys.argv[0])

        # Copy executable (skipped when already running from the target path,
        # where copying a file onto itself would fail)
        try:
            if os.path.exists(target_binary) and os.path.samefile(source_binary, target_binary):
                logging.info("Executable is already installed at /usr/bin/kubecni")
            else:
                shutil.copy2(source_binary, target_binary)
                logging.info("Executable copied to /usr/bin/kubecni")
        except Exception as e:
            logging.error(f"Error copying executable: {e}")
            return False  # Exit if copying fails

        # Create service file
        with open(service_path, 'w') as f:
            f.write(service_content)
        logging.info(f"Service file created at: {service_path}")

        # Reload systemd daemon so it picks up the new unit file
        reload_systemd()

        # Enable service using dbus
        bus = dbus.SystemBus()
        systemd_manager = dbus.Interface(
            bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1'),
            'org.freedesktop.systemd1.Manager'
        )
        systemd_manager.EnableUnitFiles([unit_name], False, True)
        # Pause after enabling before reloading again
        time.sleep(10)

        # Reload systemd daemon
        reload_systemd()

        # Start the service
        systemd_manager.RestartUnit(unit_name, 'replace')
        # RestartUnit only queues the job; wait before reading the state
        time.sleep(10)

        # Read the resulting state of the unit
        unit = systemd_manager.LoadUnit(unit_name)
        unit_proxy = bus.get_object('org.freedesktop.systemd1', str(unit))
        unit_interface = dbus.Interface(unit_proxy, 'org.freedesktop.DBus.Properties')
        active_state = unit_interface.Get('org.freedesktop.systemd1.Unit', 'ActiveState')

        if active_state == 'active':
            logging.info("Service installed and running successfully")
            return True
        else:
            logging.error("Service installed but failed to start")
            return False

    except Exception as e:
        logging.error(f"Error during installation: {e}")
        return False

def uninstall_service():
    """Disable and stop the kubecni service, then remove its unit and PID files.

    The files are removed only once the unit no longer reports 'active'.
    All errors are logged; nothing is raised and nothing is returned.
    """
    service_name = 'kubecni'  # Define the service name
    unit_name = f'{service_name}.service'
    service_path = f'/etc/systemd/system/{service_name}.service'
    pid_file = '/var/run/kubecni.pid'

    try:
        # Connect to the D-Bus system bus
        bus = dbus.SystemBus()
        systemd_manager = dbus.Interface(
            bus.get_object('org.freedesktop.systemd1', '/org/freedesktop/systemd1'),
            'org.freedesktop.systemd1.Manager'
        )

        # Reload systemd manager configuration before stopping the service
        reload_systemd()

        # Disable the service
        logging.info("Disabling the service...")
        systemd_manager.DisableUnitFiles([unit_name], False)
        logging.info("Service disabled.")

        # Stop the service
        logging.info("Service stopping in progress...")
        systemd_manager.StopUnit(unit_name, 'replace')

        # LoadUnit, unlike GetUnit, still resolves the unit if systemd has
        # already unloaded it after the stop.
        unit = systemd_manager.LoadUnit(unit_name)
        unit_proxy = bus.get_object('org.freedesktop.systemd1', str(unit))
        unit_interface = dbus.Interface(unit_proxy, 'org.freedesktop.DBus.Properties')

        # StopUnit only queues the stop job, so the unit may still report
        # 'active' for a moment; wait a bounded time for it to wind down.
        deadline = time.monotonic() + 10
        while True:
            active_state = unit_interface.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
            if active_state not in ('active', 'deactivating') or time.monotonic() >= deadline:
                break
            time.sleep(0.5)

        if active_state != 'active':
            logging.info("Service has stopped successfully.")

            # Remove the service file
            if os.path.exists(service_path):
                os.remove(service_path)
                logging.info(f"Service file removed: {service_path}")

            # Remove the PID file if it exists
            if os.path.exists(pid_file):
                os.remove(pid_file)
                logging.info(f"PID file removed: {pid_file}")

            # Reload systemd manager configuration after removing the service
            reload_systemd()
        else:
            logging.error("Service failed to stop.")

    except dbus.DBusException as e:
        logging.error(f"DBus error: {e}")
    except Exception as e:
        logging.error(f"Error uninstalling service: {e}")

def main():
    """Command-line entry point for kubecni.

    Accepts --install, --uninstall, --execute, --daemon, --force and --help,
    each either bare (meaning "true") or as ``--name=value``. --help prints
    the usage line and exits 0 without requiring root; every other action
    requires root. The process always terminates through sys.exit().
    """
    # Log version information
    version = "2.0.0"  # Specify the version here
    logging.info(f"Kubecni version: {version}")

    # Configuration for IP settings
    ip_suffix = '1'
    netmask = '24'
    broadcast_suffix = '255'

    # Default log interval
    log_interval = 5  # Set your desired log interval here (in seconds)

    usage = "Usage: kubecni [--install=true | --uninstall=true | --execute=true | --daemon=true | --help]"
    valid_args = {'--install', '--uninstall', '--execute', '--daemon', '--help', '--force'}

    # Parse "--name=value" pairs; a bare "--name" counts as "--name=true"
    provided_args = {}
    unknown_args = []
    for arg in sys.argv[1:]:
        name, has_value, value = arg.partition('=')
        if name in valid_args:
            provided_args[name] = value if has_value else 'true'
        else:
            unknown_args.append(arg)

    # Help is answered first so it works without root privileges
    if '--help' in provided_args:
        logging.info(usage)
        sys.exit(0)

    if os.geteuid() != 0:
        logging.error("Root privileges are required")
        sys.exit(1)

    if not provided_args and not unknown_args:
        logging.error("No arguments provided. Use --help for usage information.")
        sys.exit(1)

    if unknown_args:
        logging.error("Invalid argument(s) provided. Use --help for usage information.")
        logging.info(usage)
        sys.exit(1)

    def enabled(flag):
        """Return True when *flag* was given with the value 'true'."""
        return provided_args.get(flag, 'false').lower() == 'true'

    if enabled('--install'):
        # Propagate the installer's result as the process exit status
        sys.exit(0 if install_service(enabled('--force')) else 1)

    if enabled('--uninstall'):
        uninstall_service()
        sys.exit(0)

    if enabled('--execute'):
        monitor = FlannelMonitor()

        # Check system status before proceeding
        if not monitor.check_kubelet_service():
            monitor.logger.error("System check failed. Exiting.")
            sys.exit(1)  # Exit if the system check fails

        # Check if cni0 device exists
        if not monitor.check_cni0_device():
            monitor.logger.info("cni0 device not found, attempting to create...")
            if monitor.create_cni0_device(ip_suffix, netmask, broadcast_suffix):
                monitor.logger.info("cni0 device created successfully")
            else:
                monitor.logger.error("Failed to create cni0 device")
                sys.exit(1)  # Exit if the device creation fails

        if monitor.check_cni0_ip():
            # Clear the existing IP address on cni0 before setting a new one
            if not monitor.delete_cni0_ip():
                monitor.logger.error("Failed flush ip to delete existing IP address for cni0")
                sys.exit(1)
        else:
            monitor.logger.info("cni0 does not have a valid IP address configured, attempting to set IP...")

        if not monitor.fix_cni0_ip(ip_suffix, netmask, broadcast_suffix):
            monitor.logger.error("Failed to set IP address for cni0")
            sys.exit(1)
        monitor.logger.info("cni0 IP address set successfully.")
        sys.exit(0)

    if enabled('--daemon'):
        logging.info("Starting the daemon process...")  # Log output for starting the daemon
        pid_file = '/var/run/kubecni.pid'  # Specify the PID file path
        # NOTE(review): no stdout/stderr is passed to DaemonContext, so output
        # from the stream handler configured by basicConfig is presumably
        # discarded once detached - confirm, and consider a journal handler.
        with daemon.DaemonContext(
            pidfile=pidfile.TimeoutPIDLockFile(pid_file),  # Manage the PID file
            working_directory='/',
            umask=0o022,
            detach_process=True
        ):
            monitor = FlannelMonitor()
            monitor.monitor_loop(ip_suffix, netmask, broadcast_suffix, log_interval)  # Pass log_interval here
        sys.exit(0)

    # Only non-action input was given (e.g. --force alone or "--install=false")
    logging.error("No action selected. Use --help for usage information.")
    logging.info(usage)
    sys.exit(1)

# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

  

# Blog page footer (not part of the program): posted on 2025-04-23 19:17 by 吃草的青蛙 - views (19), comments (0), bookmark, report
#
# Navigation