RSI-PI/tests/stability_test.py
Adam bb65500082 Complete Phase 2: Auto-reconnection and stability testing
Implement automatic connection recovery and long-duration testing infrastructure
to complete Phase 2 (Network Reliability) of the RSIPI improvement roadmap.

New Features:
- Auto-reconnection manager with configurable retry strategies
  - IMMEDIATE: Reconnect without delay
  - LINEAR_BACKOFF: Incremental retry delays
  - EXPONENTIAL_BACKOFF: Exponential retry delays
- Background watchdog monitoring (checks every 2 seconds)
- Reconnection statistics tracking (attempts, failures, timestamps)
- Optional callbacks for reconnection events (success/failure)
- 24-hour stability test script with comprehensive reporting
  - Configurable test duration and sample intervals
  - Real-time health monitoring and progress logging
  - Detailed JSON reports with timing and network statistics
  - Human-readable summary with health percentage

Modified Files:
- src/RSIPI/rsi_client.py
  - Added auto-reconnect integration with enable_auto_reconnect parameter
  - Start/stop auto-reconnect monitor in lifecycle methods
  - Clear metrics on reconnection to reset statistics

New Files:
- src/RSIPI/auto_reconnect.py (241 lines)
  - AutoReconnectManager class with background monitoring thread
  - ReconnectStrategy enum for retry behavior configuration
  - Watchdog timeout detection and automatic recovery
  - Reconnection verification with health checks

- tests/stability_test.py (365 lines)
  - StabilityTest class for long-duration testing
  - Command-line interface with argparse
  - Automatic log file generation with timestamps
  - Sample collection with configurable intervals
  - Statistical analysis and reporting
  - Graceful interruption handling (KeyboardInterrupt)

Phase 2 Status: COMPLETE
- Timing instrumentation (commit 6e8ea2e)
- Watchdog timer (commit 6e8ea2e)
- Network quality monitoring (commit 6e8ea2e)
- DiagnosticsAPI implementation (commit 6e8ea2e)
- Auto-reconnection with graceful recovery (this commit)
- 24-hour stability test infrastructure (this commit)

Next: Run stability test, then proceed to Phase 3 (KRL Coordination)
2026-01-17 00:12:44 +00:00

336 lines
11 KiB
Python

"""
24-Hour RSI Stability Test
Long-duration stability test for RSIPI network communication.
Monitors connection health, tracks metrics, and generates detailed
performance reports.
Usage:
    python stability_test.py [--duration HOURS] [--config CONFIG_FILE] [--output OUTPUT_FILE] [--interval SECONDS]
Example:
    # Run for 24 hours
    python stability_test.py --duration 24
    # Run for 1 hour with custom config
    python stability_test.py --duration 1 --config custom_config.xml
    # Quick 5-minute test
    python stability_test.py --duration 0.083  # 5 minutes
"""
import argparse
import datetime
import json
import logging
import os
import statistics
import sys
import time
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from RSIPI import RSIAPI
class StabilityTest:
    """Long-duration stability test for RSI communication.

    Intended call sequence is ``setup()`` followed by ``run()``. ``run()``
    always ends by calling ``_cleanup()``, which stops the API, writes the
    JSON report to ``output_file`` and prints a summary.
    """

    def __init__(
        self,
        config_file: str,
        duration_hours: float,
        output_file: str,
        check_interval: float = 60.0
    ):
        """
        Initialize stability test.

        Args:
            config_file: Path to RSI config file
            duration_hours: Test duration in hours
            output_file: Path for results JSON file
            check_interval: How often to sample metrics (seconds)
        """
        self.config_file = config_file
        self.duration_hours = duration_hours
        self.output_file = output_file
        self.check_interval = check_interval
        self.start_time = None   # set by run(), epoch seconds
        self.end_time = None     # set when run() finishes, epoch seconds
        self.samples = []        # one dict per successful _collect_sample()
        self.error_count = 0     # number of failed sample collections
        self.api = None          # RSIAPI instance, created by setup()

    def setup(self) -> None:
        """Set up logging and RSI connection.

        Raises:
            RuntimeError: If the API does not report itself as running a
                few seconds after ``start()``.
        """
        # Configure logging (timestamped log file in the working directory
        # plus the console).
        log_name = f'stability_test_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_name),
                logging.StreamHandler(),
            ]
        )
        logging.info("=== RSI Stability Test ===")
        logging.info(f"Config: {self.config_file}")
        logging.info(f"Duration: {self.duration_hours} hours")
        logging.info(f"Check interval: {self.check_interval}s")
        logging.info(f"Output: {self.output_file}")
        logging.info("=" * 50)

        # Initialize API with auto-reconnect enabled
        self.api = RSIAPI(
            self.config_file,
            enable_auto_reconnect=True,
            auto_reconnect_retries=0,  # Unlimited retries
            auto_reconnect_delay=10.0
        )
        logging.info("Starting RSI communication...")
        self.api.start()

        # Wait for connection to stabilize
        time.sleep(3)
        if not self.api.is_running():
            raise RuntimeError("Failed to start RSI communication")
        logging.info("✅ RSI communication started successfully")

    def run(self) -> None:
        """Run the stability test.

        Samples metrics every ``check_interval`` seconds until
        ``duration_hours`` has elapsed or the user interrupts with Ctrl+C.
        A failure while collecting one sample is logged and counted but
        does not end the test. Cleanup and report generation always run.
        """
        self.start_time = time.time()
        end_time = self.start_time + (self.duration_hours * 3600)
        sample_count = 0
        self.error_count = 0
        logging.info(f"Test started at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logging.info(f"Will run until {datetime.datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')}")
        try:
            while time.time() < end_time:
                try:
                    # Collect metrics sample
                    sample = self._collect_sample()
                    self.samples.append(sample)
                    sample_count += 1

                    # Log progress
                    elapsed_hours = (time.time() - self.start_time) / 3600
                    remaining_hours = self.duration_hours - elapsed_hours
                    progress = (elapsed_hours / self.duration_hours) * 100
                    self._log_progress(
                        sample, elapsed_hours, remaining_hours,
                        progress, sample_count, self.error_count
                    )
                except Exception as e:
                    # Keep the long-running test alive on a bad sample.
                    self.error_count += 1
                    logging.error(f"Error collecting sample: {e}")

                # Sleep until the next check, but never past the deadline,
                # so the test does not overrun by up to one full interval.
                remaining_seconds = end_time - time.time()
                if remaining_seconds <= 0:
                    break
                time.sleep(min(self.check_interval, remaining_seconds))
        except KeyboardInterrupt:
            logging.warning("\n⚠️ Test interrupted by user")
        finally:
            self.end_time = time.time()
            self._cleanup()

    def _collect_sample(self) -> dict:
        """Collect a single metrics sample.

        Returns:
            A dict with the current wall-clock ``timestamp`` and the values
            read from ``self.api.diagnostics.get_stats()``; keys missing
            from the stats fall back to 0, ``False`` or ``[]``.
        """
        stats = self.api.diagnostics.get_stats()
        sample = {
            'timestamp': time.time(),
            'mean_cycle_time': stats.get('mean_cycle_time', 0),
            'jitter': stats.get('jitter', 0),
            'packet_loss_rate': stats.get('packet_loss_rate', 0),
            'ipoc_gap_rate': stats.get('ipoc_gap_rate', 0),
            'total_cycles': stats.get('total_cycles', 0),
            'is_healthy': stats.get('is_healthy', False),
            'warnings': stats.get('warnings', []),
            'uptime': stats.get('uptime', 0),
        }
        return sample

    def _log_progress(
        self,
        sample: dict,
        elapsed_hours: float,
        remaining_hours: float,
        progress: float,
        sample_count: int,
        error_count: int
    ) -> None:
        """Log current progress.

        Args:
            sample: Sample dict as produced by ``_collect_sample``.
            elapsed_hours: Hours since the test started.
            remaining_hours: Hours left until the planned end.
            progress: Completion percentage.
            sample_count: Number of samples collected so far.
            error_count: Number of failed collections so far (accepted for
                interface compatibility; not part of the log line).
        """
        # NOTE(review): the healthy icon is an empty string in this copy of
        # the source; possibly a glyph lost in transit -- confirm.
        health_icon = "" if sample['is_healthy'] else "⚠️"
        # Jitter is scaled by 1000 and labelled ms, i.e. it is treated as
        # seconds here; the loss rate is printed as-is with a percent sign.
        logging.info(
            f"{health_icon} Progress: {progress:.1f}% | "
            f"Elapsed: {elapsed_hours:.2f}h | "
            f"Remaining: {remaining_hours:.2f}h | "
            f"Samples: {sample_count} | "
            f"Jitter: {sample['jitter']*1000:.2f}ms | "
            f"Loss: {sample['packet_loss_rate']:.2f}%"
        )
        for warning in sample['warnings']:
            logging.warning(f" ⚠️ {warning}")

    def _cleanup(self) -> None:
        """Clean up and generate report.

        Stops the API (if one was created), then writes the JSON report and
        prints the summary. A failure while stopping the API is logged and
        does not prevent the report from being saved.
        """
        logging.info("\n=== Test Complete ===")

        # Stop RSI
        logging.info("Stopping RSI communication...")
        if self.api is not None:
            try:
                self.api.stop()
            except Exception:
                # The collected data is more valuable than a clean stop;
                # record the failure and carry on to the report.
                logging.exception("Error while stopping RSI communication")

        # Generate report
        logging.info("Generating report...")
        report = self._generate_report()

        # Save results
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        logging.info(f"✅ Report saved to: {self.output_file}")

        # Print summary
        self._print_summary(report)

    def _generate_report(self) -> dict:
        """Generate comprehensive test report.

        Returns:
            ``{'error': 'No samples collected'}`` when there are no samples;
            otherwise a dict with ``test_info``, ``health_summary``,
            ``timing_stats``, ``network_stats`` and ``final_metrics``.
        """
        if not self.samples:
            return {'error': 'No samples collected'}

        # Calculate statistics (all lists are non-empty past the guard above)
        jitter_values = [s['jitter'] for s in self.samples]
        packet_loss_values = [s['packet_loss_rate'] for s in self.samples]
        cycle_time_values = [s['mean_cycle_time'] for s in self.samples]
        total_samples = len(self.samples)
        healthy_samples = sum(1 for s in self.samples if s['is_healthy'])
        unhealthy_samples = total_samples - healthy_samples

        report = {
            'test_info': {
                'config_file': self.config_file,
                'duration_hours': self.duration_hours,
                'start_time': datetime.datetime.fromtimestamp(self.start_time).isoformat(),
                'end_time': datetime.datetime.fromtimestamp(self.end_time).isoformat(),
                'actual_duration_hours': (self.end_time - self.start_time) / 3600,
                'total_samples': total_samples,
                # Failed collections would otherwise be invisible in the report.
                'collection_errors': self.error_count,
            },
            'health_summary': {
                'healthy_samples': healthy_samples,
                'unhealthy_samples': unhealthy_samples,
                'health_percentage': (healthy_samples / total_samples) * 100,
            },
            'timing_stats': {
                'mean_cycle_time_ms': statistics.mean(cycle_time_values) * 1000,
                'min_cycle_time_ms': min(cycle_time_values) * 1000,
                'max_cycle_time_ms': max(cycle_time_values) * 1000,
                'mean_jitter_ms': statistics.mean(jitter_values) * 1000,
                'max_jitter_ms': max(jitter_values) * 1000,
            },
            'network_stats': {
                'mean_packet_loss_percent': statistics.mean(packet_loss_values),
                'max_packet_loss_percent': max(packet_loss_values),
            },
            'final_metrics': self.samples[-1],
        }
        return report

    def _print_summary(self, report: dict) -> None:
        """Print human-readable summary.

        Args:
            report: Dict returned by ``_generate_report``. The error-only
                form (no samples) is reported as such instead of raising.
        """
        print("\n" + "=" * 60)
        print("STABILITY TEST SUMMARY")
        print("=" * 60)

        if 'error' in report:
            # Nothing was sampled, so there are no statistics to show.
            print(f"\nNo results to summarize: {report['error']}")
            print("=" * 60)
            return

        info = report['test_info']
        health = report['health_summary']
        timing = report['timing_stats']
        network = report['network_stats']

        print(f"\nTest Duration: {info['actual_duration_hours']:.2f} hours")
        print(f"Total Samples: {info['total_samples']}")
        print(f"\nHealth: {health['health_percentage']:.1f}% healthy")
        print(f" Healthy samples: {health['healthy_samples']}")
        print(f" Unhealthy samples: {health['unhealthy_samples']}")
        print(f"\nTiming Performance:")
        print(f" Mean cycle time: {timing['mean_cycle_time_ms']:.2f}ms")
        print(f" Cycle time range: {timing['min_cycle_time_ms']:.2f} - {timing['max_cycle_time_ms']:.2f}ms")
        print(f" Mean jitter: {timing['mean_jitter_ms']:.2f}ms")
        print(f" Max jitter: {timing['max_jitter_ms']:.2f}ms")
        print(f"\nNetwork Quality:")
        print(f" Mean packet loss: {network['mean_packet_loss_percent']:.3f}%")
        print(f" Max packet loss: {network['max_packet_loss_percent']:.3f}%")

        # At least 95% healthy samples counts as a pass.
        health_icon = "✅ PASS" if health['health_percentage'] >= 95 else "⚠️ NEEDS IMPROVEMENT"
        print(f"\nOverall Result: {health_icon}")
        print("=" * 60)
import statistics
def main(argv=None):
    """Main entry point.

    Parses the command line, builds a ``StabilityTest`` and runs it.

    Args:
        argv: Optional list of argument strings; when ``None`` argparse
            reads ``sys.argv[1:]`` as before.

    Exits with argparse's usage error (status 2) when ``--duration`` is
    not positive or ``--interval`` is negative.
    """
    parser = argparse.ArgumentParser(description='RSI 24-Hour Stability Test')
    parser.add_argument(
        '--duration',
        type=float,
        default=24.0,
        help='Test duration in hours (default: 24)'
    )
    parser.add_argument(
        '--config',
        type=str,
        default='RSI_EthernetConfig.xml',
        help='Path to RSI config file'
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output JSON file (default: stability_test_TIMESTAMP.json)'
    )
    parser.add_argument(
        '--interval',
        type=float,
        default=60.0,
        help='Check interval in seconds (default: 60)'
    )
    args = parser.parse_args(argv)

    # Reject values that cannot produce a meaningful run: a non-positive
    # duration never samples, and a negative interval makes time.sleep raise.
    # The "not (x > 0)" form also rejects NaN.
    if not (args.duration > 0):
        parser.error('--duration must be greater than 0')
    if not (args.interval >= 0):
        parser.error('--interval must not be negative')

    # Generate default output filename if not specified
    if args.output is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        args.output = f'stability_test_{timestamp}.json'

    # Run test
    test = StabilityTest(
        config_file=args.config,
        duration_hours=args.duration,
        output_file=args.output,
        check_interval=args.interval
    )
    test.setup()
    test.run()


if __name__ == '__main__':
    main()