RSI-PI/src/RSIPI/rsi_client.py
Adam bb65500082 Complete Phase 2: Auto-reconnection and stability testing
Implement automatic connection recovery and long-duration testing infrastructure
to complete Phase 2 (Network Reliability) of the RSIPI improvement roadmap.

New Features:
- Auto-reconnection manager with configurable retry strategies
  - IMMEDIATE: Reconnect without delay
  - LINEAR_BACKOFF: Incremental retry delays
  - EXPONENTIAL_BACKOFF: Exponential retry delays
- Background watchdog monitoring (checks every 2 seconds)
- Reconnection statistics tracking (attempts, failures, timestamps)
- Optional callbacks for reconnection events (success/failure)
- 24-hour stability test script with comprehensive reporting
  - Configurable test duration and sample intervals
  - Real-time health monitoring and progress logging
  - Detailed JSON reports with timing and network statistics
  - Human-readable summary with health percentage

Modified Files:
- src/RSIPI/rsi_client.py
  - Added auto-reconnect integration with enable_auto_reconnect parameter
  - Start/stop auto-reconnect monitor in lifecycle methods
  - Clear metrics on reconnection to reset statistics

New Files:
- src/RSIPI/auto_reconnect.py (241 lines)
  - AutoReconnectManager class with background monitoring thread
  - ReconnectStrategy enum for retry behavior configuration
  - Watchdog timeout detection and automatic recovery
  - Reconnection verification with health checks

- tests/stability_test.py (365 lines)
  - StabilityTest class for long-duration testing
  - Command-line interface with argparse
  - Automatic log file generation with timestamps
  - Sample collection with configurable intervals
  - Statistical analysis and reporting
  - Graceful interruption handling (KeyboardInterrupt)

Phase 2 Status: COMPLETE
- Timing instrumentation (commit 6e8ea2e)
- Watchdog timer (commit 6e8ea2e)
- Network quality monitoring (commit 6e8ea2e)
- DiagnosticsAPI implementation (commit 6e8ea2e)
- Auto-reconnection with graceful recovery (this commit)
- 24-hour stability test infrastructure (this commit)

Next: Run stability test, then proceed to Phase 3 (KRL Coordination)
2026-01-17 00:12:44 +00:00

300 lines
11 KiB
Python

import logging
import multiprocessing
import time
from enum import Enum, auto
from threading import Lock, Thread
from typing import Optional
from .config_parser import ConfigParser
from .network_handler import NetworkProcess
from .safety_manager import SafetyManager
from .exceptions import RSIStateError, RSIInvalidTransition, RSIClientNotReady
from .auto_reconnect import AutoReconnectManager, ReconnectStrategy
class ClientState(Enum):
    """Connection states for RSIClient.

    Members are numbered with ``auto()`` in declaration order. Which moves
    between states are allowed is defined by ``RSIClient._VALID_TRANSITIONS``.
    """

    # After __init__: network process spawned, start signal not yet sent
    INITIALIZED = auto()
    # start() has been entered; start signal is being sent to the network process
    STARTING = auto()
    # Start signal sent; the client control loop is active
    RUNNING = auto()
    # Shutdown in progress
    STOPPING = auto()
    # Fully stopped, cannot be restarted directly (use reconnect)
    STOPPED = auto()
    # Error state
    ERROR = auto()
class RSIClient:
    """Main RSI API class that integrates network, config handling, and message processing.

    The constructor parses the configuration, creates the shared resources
    (manager dicts, events, command queue) and spawns a NetworkProcess.
    start() then sends the start signal and blocks in a control loop until
    stop() is called or the stop event is set. Lifecycle changes are tracked
    with ClientState and checked against _VALID_TRANSITIONS.
    """

    # Valid state transitions
    _VALID_TRANSITIONS = {
        ClientState.INITIALIZED: {ClientState.STARTING, ClientState.STOPPING},
        ClientState.STARTING: {ClientState.RUNNING, ClientState.STOPPING, ClientState.ERROR},
        ClientState.RUNNING: {ClientState.STOPPING, ClientState.ERROR},
        ClientState.STOPPING: {ClientState.STOPPED, ClientState.ERROR},
        ClientState.STOPPED: {ClientState.INITIALIZED},  # Via reconnect
        ClientState.ERROR: {ClientState.STOPPING, ClientState.INITIALIZED},  # Via reconnect
    }

    def __init__(
        self,
        config_file: str,
        rsi_limits_file: Optional[str] = None,
        enable_auto_reconnect: bool = False,
        auto_reconnect_retries: int = 5,
        auto_reconnect_delay: float = 5.0
    ) -> None:
        """
        Initialize RSI client with configuration and safety limits.

        Spawns the network process immediately; communication is only
        triggered later, when start() sets the start event.

        Args:
            config_file: Path to RSI_EthernetConfig.xml
            rsi_limits_file: Optional path to .rsi.xml safety limits file
            enable_auto_reconnect: Enable automatic reconnection on communication loss
            auto_reconnect_retries: Maximum reconnection attempts (0 = unlimited)
            auto_reconnect_delay: Base delay between retries in seconds
        """
        logging.info(f"Loading RSI configuration from {config_file}...")

        self._state: ClientState = ClientState.INITIALIZED
        self._state_lock: Lock = Lock()

        self.config_parser: ConfigParser = ConfigParser(config_file, rsi_limits_file)
        network_settings = self.config_parser.get_network_settings()

        # Resources shared with the network process
        self.manager: multiprocessing.Manager = multiprocessing.Manager()
        self.send_variables = self.manager.dict(self.config_parser.send_variables)
        self.receive_variables = self.manager.dict(self.config_parser.receive_variables)
        self.stop_event: multiprocessing.Event = multiprocessing.Event()
        self.start_event: multiprocessing.Event = multiprocessing.Event()
        self.command_queue: multiprocessing.Queue = multiprocessing.Queue()

        self.safety_manager: SafetyManager = SafetyManager(self.config_parser.safety_limits)

        # Shared logging state (readable from parent process)
        self._logging_active = multiprocessing.Value('b', False)

        # Shared metrics dictionary (Phase 2)
        self.metrics_dict = self.manager.dict()

        # Create NetworkProcess but don't start communication yet
        self.network_process: NetworkProcess = self._spawn_network_process(network_settings)

        self.logger: Optional[object] = None  # Reserved for future use
        self.running: bool = False
        self.thread: Optional[Thread] = None

        # Auto-reconnect manager (Phase 2)
        self.auto_reconnect_manager: Optional[AutoReconnectManager] = None
        if enable_auto_reconnect:
            self.auto_reconnect_manager = AutoReconnectManager(
                client=self,
                enabled=True,
                max_retries=auto_reconnect_retries,
                retry_delay=auto_reconnect_delay,
                strategy=ReconnectStrategy.LINEAR_BACKOFF
            )
            logging.info("Auto-reconnect enabled")

    def _spawn_network_process(self, network_settings) -> "NetworkProcess":
        """
        Create and start a NetworkProcess wired to the current shared resources.

        The events, queue and shared dicts are read from ``self`` at call
        time, so callers must create (or re-create) them beforehand.

        Args:
            network_settings: Mapping providing the "ip" and "port" entries

        Returns:
            The newly started NetworkProcess
        """
        process = NetworkProcess(
            network_settings["ip"],
            network_settings["port"],
            self.send_variables,
            self.receive_variables,
            self.stop_event,
            self.config_parser,
            self.start_event,
            self.command_queue,
            self.metrics_dict
        )
        # Share the logging_active flag
        process.logging_active = self._logging_active
        process.start()
        return process

    @property
    def state(self) -> ClientState:
        """Get current client state (thread-safe)."""
        with self._state_lock:
            return self._state

    def _transition_to(self, new_state: ClientState) -> bool:
        """
        Attempt to transition to a new state.

        The check and the update happen under the state lock. An invalid
        transition is logged as a warning and leaves the state unchanged.

        Args:
            new_state: Target state to transition to

        Returns:
            True if transition was valid and completed, False otherwise
        """
        with self._state_lock:
            if new_state in self._VALID_TRANSITIONS.get(self._state, set()):
                old_state = self._state
                self._state = new_state
                logging.debug(f"State transition: {old_state.name} -> {new_state.name}")
                return True

            logging.warning(
                f"Invalid state transition attempted: {self._state.name} -> {new_state.name}"
            )
            return False

    def start(self) -> None:
        """
        Send start signal to NetworkProcess and run control loop.

        Transitions through STARTING → RUNNING states and then blocks the
        calling thread in the control loop until stop() is called or the
        stop event is set.

        Raises:
            RSIClientNotReady: If client is not in appropriate state to start
            RSIStateError: If the transition to RUNNING is rejected
        """
        if not self._transition_to(ClientState.STARTING):
            error_msg = f"Cannot start from state {self.state.name}"
            logging.error(error_msg)
            raise RSIClientNotReady(error_msg)

        # Bind this run's stop event once. reconnect() replaces
        # self.stop_event, and a loop that re-read the attribute could pick
        # up the new (unset) event and keep running after being stopped.
        stop_event = self.stop_event

        logging.info("RSIClient sending start signal to NetworkProcess...")
        self.start_event.set()

        if not self._transition_to(ClientState.RUNNING):
            error_msg = "Failed to transition to RUNNING state"
            logging.error(error_msg)
            raise RSIStateError(error_msg)

        self.running = True
        logging.info("RSI Client Started")

        # Start auto-reconnect monitor (Phase 2)
        if self.auto_reconnect_manager:
            self.auto_reconnect_manager.start()

        try:
            while self.running and not stop_event.is_set():
                # Wait on the event instead of sleeping so the loop ends as
                # soon as stop() sets it, not up to 2 seconds later.
                stop_event.wait(timeout=2)
        except KeyboardInterrupt:
            self.stop()
        except Exception as e:
            logging.error(f"RSI Client encountered an error: {e}")
            self._transition_to(ClientState.ERROR)
            raise

    def stop(self) -> None:
        """
        Stop the network process and the client thread safely.

        Does nothing if the client is already STOPPED or STOPPING. Otherwise
        it signals the stop event, gives the network process 3 seconds to
        exit before terminating it, joins the control thread created by
        reconnect() (if any), stops the auto-reconnect monitor and finally
        attempts the transition to STOPPED.
        """
        if self.state in (ClientState.STOPPED, ClientState.STOPPING):
            logging.debug("Already stopped or stopping")
            return

        if not self._transition_to(ClientState.STOPPING):
            logging.warning("Could not transition to STOPPING state")
            # Continue anyway to ensure cleanup

        logging.info("Stopping RSI Client...")
        self.running = False
        self.stop_event.set()

        if self.network_process and self.network_process.is_alive():
            self.network_process.join(timeout=3)
            if self.network_process.is_alive():
                logging.warning("Forcing network process termination...")
                self.network_process.terminate()
                self.network_process.join()

        if self.thread and self.thread.is_alive():
            self.thread.join(timeout=2)
        self.thread = None

        # Stop auto-reconnect monitor (Phase 2)
        if self.auto_reconnect_manager:
            self.auto_reconnect_manager.stop()

        self._transition_to(ClientState.STOPPED)
        logging.info("RSI Client Stopped")

    def reconnect(self) -> None:
        """
        Reconnect the network process safely.

        Stops existing connection, resets state, and creates fresh
        network process with new communication resources. start() is then
        run on a new daemon thread, so this method returns without blocking.
        """
        logging.info("Reconnecting RSI Client network...")

        # Stop if currently running
        if self.state in (ClientState.RUNNING, ClientState.STARTING):
            self.stop()

        # Covers the states stop() was not called for (e.g. INITIALIZED, ERROR)
        if self.network_process and self.network_process.is_alive():
            self.stop_event.set()
            self.network_process.terminate()
            self.network_process.join()

        # Reset to initialized state (set directly, not via _transition_to)
        with self._state_lock:
            self._state = ClientState.INITIALIZED

        # Fresh new events and queue
        self.stop_event = multiprocessing.Event()
        self.start_event = multiprocessing.Event()
        self.command_queue = multiprocessing.Queue()

        # Reset metrics dictionary (Phase 2)
        self.metrics_dict.clear()

        # Create new network process
        network_settings = self.config_parser.get_network_settings()
        self.network_process = self._spawn_network_process(network_settings)

        # Fresh control thread
        self.thread = Thread(target=self.start, daemon=True)
        self.thread.start()

    def is_running(self) -> bool:
        """
        Check if client is in running state.

        Returns:
            True if currently running
        """
        return self.state == ClientState.RUNNING

    def is_stopped(self) -> bool:
        """
        Check if client is fully stopped.

        Returns:
            True if in STOPPED state
        """
        return self.state == ClientState.STOPPED

    def start_logging(self, filename: str) -> None:
        """
        Start CSV logging to the specified file.

        The request is only queued here; it is carried out by whatever
        consumes the command queue.

        Args:
            filename: Path to output CSV file
        """
        self.command_queue.put({'action': 'start_logging', 'filename': filename})

    def stop_logging(self) -> None:
        """Stop CSV logging (queues a 'stop_logging' command)."""
        self.command_queue.put({'action': 'stop_logging'})

    def is_logging_active(self) -> bool:
        """
        Check if CSV logging is currently active.

        Returns:
            True if logging is active
        """
        # The shared Value uses typecode 'b', so .value is an int;
        # convert it to honour the declared bool return type.
        return bool(self._logging_active.value)