Implement automatic connection recovery and long-duration testing infrastructure to complete Phase 2 (Network Reliability) of the RSIPI improvement roadmap.

New Features:
- Auto-reconnection manager with configurable retry strategies
- IMMEDIATE: Reconnect without delay
- LINEAR_BACKOFF: Incremental retry delays
- EXPONENTIAL_BACKOFF: Exponential retry delays
- Background watchdog monitoring (checks every 2 seconds)
- Reconnection statistics tracking (attempts, failures, timestamps)
- Optional callbacks for reconnection events (success/failure)
- 24-hour stability test script with comprehensive reporting
- Configurable test duration and sample intervals
- Real-time health monitoring and progress logging
- Detailed JSON reports with timing and network statistics
- Human-readable summary with health percentage

Modified Files:
- src/RSIPI/rsi_client.py
- Added auto-reconnect integration with enable_auto_reconnect parameter
- Start/stop auto-reconnect monitor in lifecycle methods
- Clear metrics on reconnection to reset statistics

New Files:
- src/RSIPI/auto_reconnect.py (241 lines)
- AutoReconnectManager class with background monitoring thread
- ReconnectStrategy enum for retry behavior configuration
- Watchdog timeout detection and automatic recovery
- Reconnection verification with health checks
- tests/stability_test.py (365 lines)
- StabilityTest class for long-duration testing
- Command-line interface with argparse
- Automatic log file generation with timestamps
- Sample collection with configurable intervals
- Statistical analysis and reporting
- Graceful interruption handling (KeyboardInterrupt)

Phase 2 Status: ✅ COMPLETE
- ✅ Timing instrumentation (commit 6e8ea2e)
- ✅ Watchdog timer (commit 6e8ea2e)
- ✅ Network quality monitoring (commit 6e8ea2e)
- ✅ DiagnosticsAPI implementation (commit 6e8ea2e)
- ✅ Auto-reconnection with graceful recovery (this commit)
- ✅ 24-hour stability test infrastructure (this commit)

Next: Run stability test, then proceed to Phase 3 (KRL Coordination)
336 lines
11 KiB
Python
336 lines
11 KiB
Python
"""
|
|
24-Hour RSI Stability Test
|
|
|
|
Long-duration stability test for RSIPI network communication.
|
|
Monitors connection health, tracks metrics, and generates detailed
|
|
performance reports.
|
|
|
|
Usage:
|
|
python stability_test.py [--duration HOURS] [--config CONFIG_FILE] [--output OUTPUT_FILE]
|
|
|
|
Example:
|
|
# Run for 24 hours
|
|
python stability_test.py --duration 24
|
|
|
|
# Run for 1 hour with custom config
|
|
python stability_test.py --duration 1 --config custom_config.xml
|
|
|
|
# Quick 5-minute test
|
|
python stability_test.py --duration 0.083 # 5 minutes
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import time
|
|
import argparse
|
|
import logging
|
|
import json
|
|
import datetime
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from RSIPI import RSIAPI
|
|
|
|
|
|
class StabilityTest:
    """Long-duration stability test for RSI communication.

    Typical use is ``setup()`` followed by ``run()``. ``setup()`` configures
    logging and starts the RSI API; ``run()`` samples the API diagnostics at
    a fixed interval until the deadline passes (or the user presses Ctrl+C)
    and then always stops the API, writes a JSON report and prints a summary.
    """

    def __init__(
        self,
        config_file: str,
        duration_hours: float,
        output_file: str,
        check_interval: float = 60.0
    ):
        """
        Initialize stability test.

        Args:
            config_file: Path to RSI config file
            duration_hours: Test duration in hours
            output_file: Path for results JSON file
            check_interval: How often to sample metrics (seconds)
        """
        self.config_file = config_file
        self.duration_hours = duration_hours
        self.output_file = output_file
        self.check_interval = check_interval

        # Wall-clock timestamps (time.time()); both are filled in by run().
        self.start_time = None
        self.end_time = None
        # One dict per successful _collect_sample() call, in collection order.
        self.samples = []
        # RSIAPI instance, created by setup().
        self.api = None

    def setup(self) -> None:
        """Set up logging and RSI connection.

        Raises:
            RuntimeError: If the API does not report itself as running
                after the start-up grace period.
        """
        # Log to both a timestamped file and the console.
        log_name = f'stability_test_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_name),
                logging.StreamHandler()
            ]
        )

        logging.info("=== RSI Stability Test ===")
        logging.info(f"Config: {self.config_file}")
        logging.info(f"Duration: {self.duration_hours} hours")
        logging.info(f"Check interval: {self.check_interval}s")
        logging.info(f"Output: {self.output_file}")
        logging.info("=" * 50)

        # Initialize API with auto-reconnect enabled
        self.api = RSIAPI(
            self.config_file,
            enable_auto_reconnect=True,
            auto_reconnect_retries=0,  # Unlimited retries
            auto_reconnect_delay=10.0
        )

        logging.info("Starting RSI communication...")
        self.api.start()

        # Wait for connection to stabilize
        time.sleep(3)

        if not self.api.is_running():
            raise RuntimeError("Failed to start RSI communication")

        logging.info("✅ RSI communication started successfully")

    def run(self) -> None:
        """Run the stability test.

        Collects one sample per ``check_interval`` until ``duration_hours``
        has elapsed. A failure while collecting a single sample is logged
        and counted, and the loop carries on. ``KeyboardInterrupt`` ends
        the loop early. In every case ``_cleanup()`` runs afterwards.
        """
        self.start_time = time.time()
        end_time = self.start_time + (self.duration_hours * 3600)

        sample_count = 0
        error_count = 0

        logging.info(f"Test started at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logging.info(f"Will run until {datetime.datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')}")

        try:
            while time.time() < end_time:
                try:
                    # Collect metrics sample
                    sample = self._collect_sample()
                    self.samples.append(sample)
                    sample_count += 1

                    # Log progress
                    elapsed_hours = (time.time() - self.start_time) / 3600
                    remaining_hours = self.duration_hours - elapsed_hours
                    progress = (elapsed_hours / self.duration_hours) * 100

                    self._log_progress(sample, elapsed_hours, remaining_hours, progress, sample_count, error_count)

                except Exception as e:
                    error_count += 1
                    logging.error(f"Error collecting sample: {e}")

                # Sleep until the next check, but never past the deadline;
                # otherwise a short run overshoots by up to one interval.
                remaining_seconds = end_time - time.time()
                if remaining_seconds <= 0:
                    break
                time.sleep(min(self.check_interval, remaining_seconds))

        except KeyboardInterrupt:
            logging.warning("\n⚠️ Test interrupted by user")

        finally:
            self.end_time = time.time()
            self._cleanup()

    def _collect_sample(self) -> dict:
        """Collect a single metrics sample.

        Returns:
            A dict with the current timestamp plus selected entries from
            ``self.api.diagnostics.get_stats()``. Missing entries fall back
            to ``0``, ``False`` or ``[]`` as appropriate.
        """
        stats = self.api.diagnostics.get_stats()

        sample = {
            'timestamp': time.time(),
            'mean_cycle_time': stats.get('mean_cycle_time', 0),
            'jitter': stats.get('jitter', 0),
            'packet_loss_rate': stats.get('packet_loss_rate', 0),
            'ipoc_gap_rate': stats.get('ipoc_gap_rate', 0),
            'total_cycles': stats.get('total_cycles', 0),
            'is_healthy': stats.get('is_healthy', False),
            'warnings': stats.get('warnings', []),
            'uptime': stats.get('uptime', 0),
        }

        return sample

    def _log_progress(
        self,
        sample: dict,
        elapsed_hours: float,
        remaining_hours: float,
        progress: float,
        sample_count: int,
        error_count: int
    ) -> None:
        """Log current progress.

        Args:
            sample: The sample just collected by ``_collect_sample()``
            elapsed_hours: Hours since the test started
            remaining_hours: Hours left until the planned end
            progress: Completion percentage (0-100)
            sample_count: Number of samples collected so far
            error_count: Number of failed sampling attempts so far
                (accepted, but not included in the log line)
        """
        health_icon = "✅" if sample['is_healthy'] else "⚠️"

        logging.info(
            f"{health_icon} Progress: {progress:.1f}% | "
            f"Elapsed: {elapsed_hours:.2f}h | "
            f"Remaining: {remaining_hours:.2f}h | "
            f"Samples: {sample_count} | "
            f"Jitter: {sample['jitter']*1000:.2f}ms | "
            f"Loss: {sample['packet_loss_rate']:.2f}%"
        )

        for warning in sample['warnings'] or ():
            logging.warning(f" ⚠️ {warning}")

    def _cleanup(self) -> None:
        """Clean up and generate report.

        Stops the API, writes the JSON report to ``self.output_file`` and
        prints the summary. A failure while stopping the API is logged and
        does not prevent the report from being written.
        """
        logging.info("\n=== Test Complete ===")

        # Stop RSI
        logging.info("Stopping RSI communication...")
        try:
            self.api.stop()
        except Exception:
            # A failed shutdown must not throw away the samples collected
            # so far, so log it and still produce the report below.
            logging.exception("Error while stopping RSI communication")

        # Generate report
        logging.info("Generating report...")
        report = self._generate_report()

        # Save results
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

        logging.info(f"✅ Report saved to: {self.output_file}")

        # Print summary
        self._print_summary(report)

    def _generate_report(self) -> dict:
        """Generate comprehensive test report.

        Requires ``start_time`` and ``end_time`` to be set whenever at
        least one sample exists.

        Returns:
            ``{'error': 'No samples collected'}`` when there are no samples.
            Otherwise a dict with the sections ``test_info``,
            ``health_summary``, ``timing_stats``, ``network_stats`` and
            ``final_metrics`` (the last sample).
        """
        if not self.samples:
            return {'error': 'No samples collected'}

        # The guard above guarantees every list below is non-empty.
        jitter_values = [s['jitter'] for s in self.samples]
        packet_loss_values = [s['packet_loss_rate'] for s in self.samples]
        cycle_time_values = [s['mean_cycle_time'] for s in self.samples]

        total_samples = len(self.samples)
        healthy_samples = sum(1 for s in self.samples if s['is_healthy'])
        unhealthy_samples = total_samples - healthy_samples

        report = {
            'test_info': {
                'config_file': self.config_file,
                'duration_hours': self.duration_hours,
                'start_time': datetime.datetime.fromtimestamp(self.start_time).isoformat(),
                'end_time': datetime.datetime.fromtimestamp(self.end_time).isoformat(),
                'actual_duration_hours': (self.end_time - self.start_time) / 3600,
                'total_samples': total_samples,
            },
            'health_summary': {
                'healthy_samples': healthy_samples,
                'unhealthy_samples': unhealthy_samples,
                'health_percentage': (healthy_samples / total_samples) * 100,
            },
            'timing_stats': {
                # Sample values are multiplied by 1000 for the *_ms keys.
                'mean_cycle_time_ms': statistics.mean(cycle_time_values) * 1000,
                'min_cycle_time_ms': min(cycle_time_values) * 1000,
                'max_cycle_time_ms': max(cycle_time_values) * 1000,
                'mean_jitter_ms': statistics.mean(jitter_values) * 1000,
                'max_jitter_ms': max(jitter_values) * 1000,
            },
            'network_stats': {
                'mean_packet_loss_percent': statistics.mean(packet_loss_values),
                'max_packet_loss_percent': max(packet_loss_values),
            },
            'final_metrics': self.samples[-1],
        }

        return report

    def _print_summary(self, report: dict) -> None:
        """Print human-readable summary.

        Args:
            report: A dict produced by ``_generate_report()``. If it holds
                only an ``'error'`` entry (no samples were collected), that
                message is printed instead of the statistics.
        """
        print("\n" + "=" * 60)
        print("STABILITY TEST SUMMARY")
        print("=" * 60)

        # An interrupted or fully failed run yields an error-only report;
        # indexing the statistics sections would raise KeyError.
        if 'error' in report:
            print(f"\nNo results to summarize: {report['error']}")
            print("=" * 60)
            return

        info = report['test_info']
        health = report['health_summary']
        timing = report['timing_stats']
        network = report['network_stats']

        print(f"\nTest Duration: {info['actual_duration_hours']:.2f} hours")
        print(f"Total Samples: {info['total_samples']}")

        print(f"\nHealth: {health['health_percentage']:.1f}% healthy")
        print(f" Healthy samples: {health['healthy_samples']}")
        print(f" Unhealthy samples: {health['unhealthy_samples']}")

        print("\nTiming Performance:")
        print(f" Mean cycle time: {timing['mean_cycle_time_ms']:.2f}ms")
        print(f" Cycle time range: {timing['min_cycle_time_ms']:.2f} - {timing['max_cycle_time_ms']:.2f}ms")
        print(f" Mean jitter: {timing['mean_jitter_ms']:.2f}ms")
        print(f" Max jitter: {timing['max_jitter_ms']:.2f}ms")

        print("\nNetwork Quality:")
        print(f" Mean packet loss: {network['mean_packet_loss_percent']:.3f}%")
        print(f" Max packet loss: {network['max_packet_loss_percent']:.3f}%")

        # A run passes when at least 95% of the samples were healthy.
        health_icon = "✅ PASS" if health['health_percentage'] >= 95 else "⚠️ NEEDS IMPROVEMENT"
        print(f"\nOverall Result: {health_icon}")
        print("=" * 60)
|
|
|
|
|
|
import statistics
|
|
|
|
|
|
def main():
    """Parse the command-line options, then set up and run a StabilityTest."""
    cli = argparse.ArgumentParser(description='RSI 24-Hour Stability Test')

    # (flag, type, default, help) - registered in this order.
    option_specs = (
        ('--duration', float, 24.0, 'Test duration in hours (default: 24)'),
        ('--config', str, 'RSI_EthernetConfig.xml', 'Path to RSI config file'),
        ('--output', str, None, 'Output JSON file (default: stability_test_TIMESTAMP.json)'),
        ('--interval', float, 60.0, 'Check interval in seconds (default: 60)'),
    )
    for flag, value_type, fallback, description in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)

    opts = cli.parse_args()

    # Fall back to a timestamped report name when --output was not given.
    if opts.output is None:
        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        opts.output = f'stability_test_{stamp}.json'

    runner = StabilityTest(
        config_file=opts.config,
        duration_hours=opts.duration,
        output_file=opts.output,
        check_interval=opts.interval,
    )
    runner.setup()
    runner.run()
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|