# "py" — stray markdown code-fence tag left over from a paste; commented out so the module parses
#!/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import copy
import csv
import json
import os
import random
import re
import shlex
import signal
import subprocess
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import stat_transaction_count  # reuse transaction/cycle count helpers
class Colors:
    """ANSI escape sequences used to colorize terminal output."""
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    CYAN = '\033[96m'
    WHITE = '\033[97m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
class TestResult:
    """Lifecycle state, timing, and metadata of a single regression test run."""

    def __init__(self, name: str, config: str = "default"):
        self.name = name
        self.config = config
        # PENDING, RUNNING, PASS, FAIL, TIMEOUT, ERROR, RERUN PASS, RERUN FAIL
        self.status = "PENDING"
        self.start_time = None
        self.end_time = None
        self.duration = 0
        self.log_file = ""
        self.job_id = None
        self.retry_count = 0
        self.retry_started = False   # set once a retry submission has been issued
        self.error_msg = ""
        self.coverage_db = ""
        self.estimated_duration = 0  # estimated duration in seconds
        self.seed = None
        self.opts = []
        self.is_retry = False        # marks a retry case

    def start(self):
        """Mark the test as running and stamp its start time."""
        self.start_time = time.time()
        self.status = "RUNNING"

    def finish(self, status: str, error_msg: str = ""):
        """Record completion; PASS/FAIL become RERUN PASS/RERUN FAIL for retried cases."""
        self.end_time = time.time()
        self.duration = (self.end_time - self.start_time) if self.start_time else 0
        retried = self.is_retry or self.retry_count > 0
        if retried and status == "PASS":
            self.status = "RERUN PASS"
        elif retried and status == "FAIL":
            self.status = "RERUN FAIL"
        else:
            self.status = status
        self.error_msg = error_msg

    def get_duration_str(self) -> str:
        """Format the duration as '1h2m3s' / '2m5s' / '7s'; 'N/A' when unset."""
        if self.duration == 0:
            return "N/A"
        hours, rem = divmod(int(self.duration), 3600)
        minutes, seconds = divmod(rem, 60)
        if hours > 0:
            return f"{hours}h{minutes}m{seconds}s"
        if minutes > 0:
            return f"{minutes}m{seconds}s"
        return f"{seconds}s"
class RegressionRunner:
    """Regression test runner: submits tests to LSF, tracks status, and reports results."""

    def __init__(self, args):
        self.args = args
        self.tests = []
        self.results = {}
        self.start_time = time.time()
        # Create timestamp for regression run
        self.regression_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Extract directory name from current simulation directory
        current_dir = os.path.basename(os.getcwd())
        # Extract xxx part from sim_xxx pattern
        if current_dir.startswith('sim_'):
            dir_suffix = current_dir[4:]  # Remove 'sim_' prefix
        else:
            dir_suffix = ''
        self.regression_dir_name = f"regression_{dir_suffix}_{self.regression_timestamp}"
        # Set up directories based on output path
        self._setup_directories()
        self.history_db_file = Path("test_history.json")  # Historical data file
        self.test_history = self.load_test_history()  # Load historical test data
        self.job_ids = []
        self.submitted_jobs = []  # Track submitted job IDs
        self.submitted_results = []  # Track submitted test results
        # Map job_id -> full opcode used for submission, to allow direct resubmission without lookups
        self.job_meta: Dict[str, Dict] = {}
        self.running_jobs = 0  # Track number of running jobs (RUN status)
        self.pending_jobs = 0  # Track number of pending jobs (PEND status)
        self.lock = threading.Lock()
        # Delay first status summary until the print interval elapses (default: 30 minutes)
        self.last_status_print = time.time()
        self.status_print_interval = args.status_interval * 60  # Status print interval (seconds)
        self._stop_status_thread = False  # Control status print thread stop
        # Create necessary directories immediately
        self._create_directories()
        # Initialize real-time report generation
        self.report_update_interval = 30  # Update report every 30 seconds
        self.last_report_update = time.time()
        self.real_time_report_path = self.report_dir / "zregress_report.log"
        # Load error monitor state if exists
        # (load_error_monitor_state is defined elsewhere in this file — outside this view)
        self.load_error_monitor_state()
        # Initialize error monitoring
        self.error_monitor_interval = args.error_monitor_interval * 60  # Convert minutes to seconds
        self.last_error_monitor_time = time.time()
        self.log_read_positions = {}  # Track last read position for each log file
        # Track last time each log file produced new content; used to detect hung simulations
        self.log_last_update_times: Dict[str, float] = {}
        # Configurable hang timeout in seconds (no new log lines)
        self.hang_timeout_seconds = (
            getattr(args, 'hang_timeout_minutes', 30) * 60
            if hasattr(args, 'hang_timeout_minutes') and args.hang_timeout_minutes is not None
            else 30 * 60
        )
        # Configurable PEND timeout in seconds (for jobs waiting for resources)
        self.pend_timeout_seconds = (
            getattr(args, 'pend_timeout_minutes', None) * 60
            if hasattr(args, 'pend_timeout_minutes') and args.pend_timeout_minutes is not None
            else None
        )
        self.error_keywords = ['UVM_ERROR', 'UVM_FATAL', 'Solver failed', 'FATAL', 'Error', 'Offending']
        # Set up signal handling
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        # Initialize random seed for better randomization
        random.seed(time.time())
        # Add flag to control validation frequency
        self._validation_count = 0
        self._last_validation_time = 0
        # Auto-restart configuration
        self.auto_restart = getattr(args, 'auto_restart', False)
        self.restart_interval_hours = getattr(args, 'restart_interval_hours', None)
        self.restart_count = 0  # Track number of restarts
        self.max_restarts = getattr(args, 'max_restarts', None)  # Maximum number of restarts (None = unlimited)
        self.first_run_start_time = time.time()  # Track first run start time for interval-based restart

    def _setup_directories(self):
        """Set up regression directories"""
        # Set up regression directory
        if hasattr(self.args, 'output_dir') and self.args.output_dir:
            base_output_dir = Path(self.args.output_dir)
            self.regression_dir = base_output_dir / self.regression_dir_name
        else:
            self.regression_dir = Path(self.regression_dir_name)
        # Set up subdirectories
        self.log_dir = self.regression_dir / "logs"
        self.report_dir = self.regression_dir / "report_log"
        self.coverage_dir = self.regression_dir / "coverage"
        self.wave_dir = self.regression_dir / "waves"

    def _create_directories(self):
        """Create all necessary directories"""
        self.regression_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.report_dir.mkdir(parents=True, exist_ok=True)
        self.coverage_dir.mkdir(parents=True, exist_ok=True)
        self.wave_dir.mkdir(parents=True, exist_ok=True)
        # Directory setup completed

    def _get_output_dir_path(self):
        """Get the output directory path"""
        if hasattr(self.args, 'output_dir'):
            return self.args.output_dir
        return "."

    def load_test_history(self) -> Dict:
        """Load historical test data"""
        if not self.history_db_file.exists():
            return {}
        try:
            with open(self.history_db_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Unable to load historical test data: {e}{Colors.END}")
            return {}

    def save_test_history(self):
        """Save historical test data"""
        # Update historical data
        for result_key, result in self.results.items():
            if result.status in ["PASS", "RERUN PASS"] and result.duration > 0:
                test_config_key = f"{result.name}:{result.config}"
                # If it's a new test, initialize history record
                if test_config_key not in self.test_history:
                    self.test_history[test_config_key] = {
                        'durations': [],
                        'avg_duration': 0,
                        'last_duration': 0,
                        'count': 0
                    }
                # Update historical data
                history = self.test_history[test_config_key]
                history['durations'].append(result.duration)
                # Only keep the last 10 runs
                if len(history['durations']) > 10:
                    history['durations'] = history['durations'][-10:]
                history['avg_duration'] = sum(history['durations']) / len(history['durations'])
                history['last_duration'] = result.duration
                history['count'] += 1
        # Save to file
        try:
            with open(self.history_db_file, 'w') as f:
                json.dump(self.test_history, f, indent=2)
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Unable to save historical test data: {e}{Colors.END}")

    def estimate_test_duration(self, test_name: str, config: str) -> float:
        """Estimate test duration (seconds)"""
        test_config_key = f"{test_name}:{config}"
        # If historical data exists, use average duration
        if test_config_key in self.test_history:
            return self.test_history[test_config_key]['avg_duration']
        # If no specific test history, try to use average of tests with same config
        config_tests = [k for k in self.test_history.keys() if k.endswith(f":{config}")]
        if config_tests:
            avg_duration = sum(self.test_history[k]['avg_duration'] for k in config_tests) / len(config_tests)
            return avg_duration
        # If no historical data at all, use default value (5 minutes)
        return 300

    def signal_handler(self, signum, frame):
        """Signal handler"""
        print(f"\n{Colors.YELLOW}Received signal {signum}, cleaning up...{Colors.END}")
        self.cleanup()
        sys.exit(1)

    def cleanup(self):
        """Clean up resources"""
        # Save error monitor state
        self.save_error_monitor_state()
        if self.args.mode == "lsf" and self.job_ids:
            print(f"{Colors.YELLOW}Cancelling LSF jobs...{Colors.END}")
            for job_id in self.job_ids:
                try:
                    subprocess.run(["bkill", str(job_id)],
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=10)
                except:
                    # Best-effort kill during shutdown; ignore bkill failures/timeouts
                    pass

    def load_test_list(self, test_file: str, default_config: str = None) -> List[Tuple[str, str]]:
        """Load test list, returns list of (test_name, config)"""
        tests = []
        try:
            with open(test_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        # Check if line contains config info (format: test_name:config)
                        if ':' in line:
                            test_name, config = line.split(':', 1)
                            tests.append((test_name.strip(), config.strip()))
                        else:
                            # Use default config
                            tests.append((line, default_config))
        except FileNotFoundError:
            print(f"{Colors.RED}Error: Test list file not found {test_file}{Colors.END}")
            sys.exit(1)
        return tests

    def load_test_cases(self, test_files: List[str]) -> List[Dict]:
        """Load the test define"""
        cases = []
        for file in test_files:
            with open(file, "r") as f:
                cases.extend(json.load(f))
        return cases
    def load_failed_regression_cases(self, failed_regression_file: str) -> List[Dict]:
        """Load failed test cases from failed regression JSON file"""
        try:
            with open(failed_regression_file, "r") as f:
                failed_cases = json.load(f)
            print(f"Loaded {len(failed_cases)} failed test cases from {failed_regression_file}")
            # Convert failed regression format back to standard test case format
            standard_cases = []
            for failed_case in failed_cases:
                # Extract the original test case data, removing the failure-specific fields
                standard_case = {}
                for key, value in failed_case.items():
                    if not key.startswith('actual_') and key not in ['log_file', 'retry_count', 'failure_timestamp', 'original_repeat']:
                        standard_case[key] = value
                # Restore original repeat count if it was modified
                if 'original_repeat' in failed_case:
                    standard_case['repeat'] = failed_case['original_repeat']
                standard_cases.append(standard_case)
            print(f"Converted {len(standard_cases)} failed test cases to standard format")
            return standard_cases
        except Exception as e:
            print(f"Error loading failed regression file {failed_regression_file}: {e}")
            return []

    def load_regression_list_cases(self, regression_list_file: str) -> List[Dict]:
        """Load test cases from regression list JSON file"""
        try:
            with open(regression_list_file, "r") as f:
                test_cases = json.load(f)
            print(f"Loaded {len(test_cases)} test cases from regression list: {regression_list_file}")
            # Validate that each test case has required fields
            valid_cases = []
            for i, case in enumerate(test_cases):
                if not isinstance(case, dict):
                    print(f"Warning: Test case {i} is not a dictionary, skipping")
                    continue
                if 'name' not in case:
                    print(f"Warning: Test case {i} missing 'name' field, skipping")
                    continue
                # Set default values for optional fields
                if 'config' not in case:
                    case['config'] = 'default'
                if 'repeat' not in case:
                    case['repeat'] = 1
                if 'timeout' not in case:
                    case['timeout'] = 60
                if 'opts' not in case:
                    case['opts'] = []
                if 'group' not in case:
                    case['group'] = ['default']
                valid_cases.append(case)
            print(f"Validated {len(valid_cases)} test cases from regression list")
            return valid_cases
        except Exception as e:
            print(f"Error loading regression list file {regression_list_file}: {e}")
            return []

    def filter_cases(self, cases: List[Dict], groups: List[str]) -> List[Dict]:
        """Select the group by tag"""
        # get test cases by group
        return [case for case in cases if set(groups).issubset(set(case["group"]))]

    def submit_compile(self, que: str, dienum: str, rtl_ver: str, mode: str, define: str = None) -> Dict:
        """Submit the elab and compile"""
        result = {
            "name": "compile",
            "status": "PENDING",
        }
        try:
            # Build output directory path: self.args.output_dir
            output_dir = self.args.output_dir
            # Using output directory for compile
            # Construct bsub command
            cmd = ["bsub"]
            # Add LSF parameters
            cmd.extend(["-q", que])
            # Add resource reservation for compile job
            resource_requests = []
            # Add memory reservation if specified
            if hasattr(self.args, 'memory') and self.args.memory is not None:
                memory_mb = self.args.memory * 1024  # Convert GB to MB
                resource_requests.append(f"rusage[mem={memory_mb}]")
            # Memory reservation configured for compile job
            # Add CPU selection for compile job (always request at least 1 CPU)
            cpu_cores = getattr(self.args, 'cpu_cores', 1)
            # Build resource request string with select and rusage
            resource_string = f"select[ncpus>={cpu_cores}]"
            if resource_requests:
                # NOTE(review): each entry already contains "rusage[...]", so this produces
                # "rusage[rusage[mem=...]]" — likely the entries should be bare "mem=..."; confirm
                # against the LSF -R syntax before changing.
                resource_string += f" rusage[{','.join(resource_requests)}]"
            cmd.extend(["-R", resource_string])
            # Resource request configured for compile job
            # Set job name and output
            cmd.extend([
                "-J", f"pre_jobs",
                "make", f'pre_full_run',
                #f'DUT_VER={rtl_ver}',
                #f'die_num={dienum}',
                #f'WORK_DIR={output_dir}',
                f'mode={mode}'
                #f'p2_mode={mode}'
            ])
            if define is not None:
                cmd.extend([f'def+={define}'])
            # Compile command prepared
            # Submit job
            output = subprocess.check_output(cmd, shell=False)
            job_id = self.parse_job_id(output)
            result["job_id"] = job_id
            result["status"] = "SUBMITTED"
        except subprocess.CalledProcessError as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = str(e)
        except Exception as e:
            result["status"] = f"ERROR: {str(e)}"
        return result

    def gen_test_case(self, case: Dict, w_dir: str, log_dir: str, que: str, specified_seed: str = None) -> List[Dict]:
        """Generate test case commands"""
        opcodes = []
        for repeat in range(case["repeat"]):
            # Construct bsub command
            cmd = ["bsub"]
            # Add LSF parameters
            cmd.extend(["-q", que])
            # Memory reservation handling
            memory_gb = None
            # Priority 1: Check if memory is specified in the test case JSON
            if "memory" in case and case["memory"]:
                try:
                    memory_gb = int(case["memory"])
                    # Using memory from JSON configuration
                except (ValueError, TypeError):
                    print(f" Warning: Invalid memory value in JSON: {case['memory']}")
            # Priority 2: Use command line argument if JSON doesn't have memory
            if memory_gb is None and hasattr(self.args, 'memory') and self.args.memory is not None:
                memory_gb = self.args.memory
                # Using memory from command line
            # Add resource reservation to bsub command
            resource_requests = []
            # Add memory reservation if specified
            if memory_gb:
                memory_mb = memory_gb * 1024  # Convert GB to MB
                resource_requests.append(f"rusage[mem={memory_mb}]")
            # Memory reservation configured
            # Add CPU selection (always request at least 1 CPU)
            cpu_cores = getattr(self.args, 'cpu_cores', 1)
            # Build resource request string with select and rusage
            resource_string = f"select[ncpus>={cpu_cores}]"
            if resource_requests:
                # NOTE(review): same doubled-rusage wrapping as in submit_compile — verify.
                resource_string += f" rusage[{','.join(resource_requests)}]"
            cmd.extend(["-R", resource_string])
            # Resource request configured
            # Legacy memory handling (commented out in original)
            # if "memory" in case:
            #     if case["memory"] != "":
            #         cmd.extend(["-M", str(self.parse_memory(case["memory"]))])
            # Use specified seed if provided, otherwise generate unique seed
            if specified_seed is not None:
                seed = int(specified_seed)
                # Using specified seed for repeat
            else:
                # Generate unique seed for each test case, opts, and repeat
                # Include opts in seed generation to ensure different opts get different seeds
                opts_str = "_".join(case["opts"]) if case["opts"] else "no_opts"
                unique_seed_base = hash(case["name"] + opts_str + str(repeat) + str(int(time.time() * 1000)))
                seed = abs(unique_seed_base) % 10000
                # Generated seed for repeat with opts
            if "lmn" in case:
                lmn = case["lmn"]
            else:
                lmn = ""
            # Set job name and output
            cmd.extend([
                "-J", f"TEST_{case['name']}_{repeat}",
                "make", f'batch_run',
                f'tc={case["name"]}',
                f'pl=UVM_LOW',
                f'timestamp=N',
                f'timeout={case["timeout"]}',
                f'WORK_DIR={w_dir}',
                f'LOGDIR={str(self.log_dir)}',  # Point to the logs directory
                f'WAVEDIR={str(self.wave_dir)}',  # Add wave directory parameter
                f'wave={"fsdb" if self.args.wave else "null"}',
                f'seed={seed}',
                f'lmn={lmn}'
            ])
            # Add coverage parameter if specified
            if hasattr(self.args, 'cov') and self.args.cov:
                cmd.extend([f'cov={self.args.cov}'])
                # Coverage parameter configured
            # Debug: print timeout value
            # Test case timeout configured
            # Add VCS optimization options
            if hasattr(self.args, 'vcs_optimize') and self.args.vcs_optimize:
                vcs_cores = getattr(self.args, 'vcs_cores', 4)
                cmd.extend([f'opts+=+VCS_PARALLEL={vcs_cores}'])
                cmd.extend([f'opts+=+VCS_OPTIMIZE=1'])
                # VCS optimization configured
            if hasattr(self.args, 'vcs_xa') and self.args.vcs_xa:
                cmd.extend([f'opts+=+VCS_XA=1'])
                # VCS-XA acceleration configured
            # Add optional parameters
            # Adding opts configuration
            for opt in case["opts"]:
                if opt:  # Only add non-empty opts
                    cmd.extend([f'opts+=+{opt}'])
                    # Added opt configuration
            # Submit job - use the regression-specific log directory
            # Create test-specific log directory under logs/
            # Build log file name robustly to avoid extra underscores when fields are empty
            opts_str = "_".join([o for o in (case.get("opts") or []) if o])
            test_log_dir = self.log_dir / case['name']
            test_log_dir.mkdir(parents=True, exist_ok=True)
            name_parts = [case['name'], str(seed)]
            if opts_str:
                name_parts.append(opts_str)
            else:
                name_parts.append("no_opts")
            if lmn:
                name_parts.append(lmn)
            safe_name = "_".join(name_parts)
            log_file = str(test_log_dir / f"{safe_name}.log")
            # Log file configured
            opcodes.append({
                "cmd": cmd,
                "case": case,
                "id": repeat,
                "log_path": log_file,
                "seed": str(seed)
            })
        return opcodes

    def submit_test_case(self, opcode: Dict) -> Dict:
        """Submit one test to LSF"""
        result = {
            "name": opcode["case"].get("name", "unknown"),
            "status": "PENDING",
            "seed": opcode["seed"],
            "id": opcode["id"]
        }
        try:
            # Submit cmd with detailed error capture
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} Submitting job: {opcode['case'].get('name', 'unknown')} seed={opcode['seed']}")
            print(f"INFO: {timestamp} Command: {' '.join(opcode['cmd'])}")
            # Use subprocess.run to capture both stdout and stderr
            process = subprocess.run(opcode["cmd"], shell=False,
                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                     universal_newlines=True, timeout=120)
            # Check return code
            if process.returncode != 0:
                result["status"] = "SUBMIT_FAIL"
                result["error"] = f"Command failed with return code {process.returncode}"
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL")
                print(f"INFO: {timestamp} Return code: {process.returncode}")
                print(f"INFO: {timestamp} STDOUT: {process.stdout}")
                print(f"INFO: {timestamp} STDERR: {process.stderr}")
                return result
            # Parse job ID from output
            job_id = self.parse_job_id(process.stdout.encode())
            if job_id == "UNKNOWN":
                result["status"] = "SUBMIT_FAIL"
                result["error"] = "Failed to parse job ID from LSF output"
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {job_id}] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL")
                print(f"INFO: {timestamp} LSF Output: {process.stdout}")
                print(f"INFO: {timestamp} LSF Error: {process.stderr}")
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                opts_str = "_".join(opcode['case'].get('opts', [])) if opcode['case'].get('opts') else "no_opts"
                print(f"INFO: {timestamp} [jobid {job_id}] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMITTED")
                result["job_id"] = job_id
                result["status"] = "SUBMITTED"
                result["error"] = ''
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                if job_id not in self.submitted_jobs:
                    self.submitted_jobs.append(job_id)
                # Store additional test case info for later reference
                result["case_name"] = opcode['case'].get('name', 'unknown')
                result["case_seed"] = opcode['seed']
                # Persist the full opcode for this job so FAIL handling can resubmit directly
                try:
                    self.job_meta[str(job_id)] = copy.deepcopy(opcode)
                    result["opcode"] = self.job_meta[str(job_id)]
                except Exception:
                    # Best-effort; do not block on deepcopy issues
                    self.job_meta[str(job_id)] = opcode
                # Also update the corresponding TestResult object
                # Use the full unique key to find the correct TestResult
                test_name = opcode['case'].get('name', 'unknown')
                config = opcode['case'].get('config', 'default')
                seed = opcode['seed']
                opts_str = "_".join(opcode['case'].get('opts', [])) if opcode['case'].get('opts') else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                # Store unique_key alongside opcode for direct updates later
                try:
                    self.job_meta[str(job_id)]["unique_key"] = unique_key
                except Exception:
                    pass
                result["unique_key"] = unique_key
                if unique_key in self.results:
                    self.results[unique_key].job_id = job_id
                    self.results[unique_key].seed = seed
                    if opcode.get('log_path'):
                        self.results[unique_key].log_file = opcode['log_path']
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {job_id}")
                else:
                    print(f"Warning: TestResult not found for key: {unique_key}")
        except subprocess.TimeoutExpired as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = f"Command timeout: {str(e)}"
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL: Timeout")
        except subprocess.CalledProcessError as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = str(e)
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL: {str(e)}")
        except Exception as e:
            result["status"] = f"ERROR: {str(e)}"
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} ERROR: {str(e)}")
        return result

    def get_test_info_by_job_id(self, job_id: str) -> Optional[Dict]:
        """Get test case information by job ID"""
        # First search through submitted results to find matching job_id
        for result in getattr(self, 'submitted_results', []):
            if result.get('job_id') == job_id:
                # Try multiple field names to find the test name
                test_name = result.get('case_name') or result.get('name')
                test_seed = result.get('case_seed') or result.get('seed')
                if test_name and test_seed:
                    return {
                        'name': test_name,
                        'seed': test_seed,
                        'id': result.get('id', 'unknown')
                    }
        # If not found in submitted_results, search through self.results
        for result_key, result in self.results.items():
            if hasattr(result, 'job_id') and result.job_id == job_id:
                return {
                    'name': result.name,
                    'seed': getattr(result, 'seed', 'unknown'),
                    'id': getattr(result, 'id', 'unknown')
                }
        # Debug info (muted)
        # print(f"DEBUG: Could not find test info for job_id {job_id}")
        # print(f"DEBUG: submitted_results count: {len(getattr(self, 'submitted_results', []))}")
        # print(f"DEBUG: self.results count: {len(self.results)}")
        # for i, result in enumerate(getattr(self, 'submitted_results', [])[:3]):
        #     print(f"DEBUG: submitted_results[{i}]: {result}")
        # for i, (key, result) in enumerate(list(self.results.items())[:3]):
        #     print(f"DEBUG: results[{i}] {key}: job_id={getattr(result, 'job_id', 'None')}")
        return None

    def _resubmit_from_stored_opcode(self, job_id: str):
        """Directly resubmit a failed job using the stored opcode, avoiding any name/seed lookup."""
        stored = self.job_meta.get(str(job_id))
        if not stored:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] No stored opcode found; cannot direct-resubmit")
            return
        # CRITICAL FIX: Check retry limit before proceeding
        current_retry_attempt = stored.get('retry_attempt', 0)
        max_retries = getattr(self.args, 'retry', 0)
        if max_retries <= 0:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Retry disabled (max_retries={max_retries})")
            return
        if current_retry_attempt >= max_retries:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Max retries reached ({current_retry_attempt}/{max_retries}), stopping retry")
            return
        try:
            # Clone and mark as retry while keeping same seed/opts/config
            opcode = copy.deepcopy(stored)
            opcode['retry_attempt'] = current_retry_attempt + 1
            opcode['retry_seed'] = opcode.get('seed', 'unknown')
        except Exception:
            opcode = stored
            opcode['retry_attempt'] = current_retry_attempt + 1
            opcode['retry_seed'] = opcode.get('seed', 'unknown')
        # Backup the original log before resubmitting to avoid it being overwritten by the retry
        try:
            # get_test_log_path_by_job_id is defined elsewhere in this file — outside this view
            original_log_path = self.get_test_log_path_by_job_id(job_id)
            if original_log_path and os.path.exists(original_log_path):
                p = Path(original_log_path)
                backup_path = p.with_name(p.stem + '_bak.log') if p.suffix == '.log' else Path(str(p) + '_bak.log')
                if not backup_path.exists():
                    os.rename(str(p), str(backup_path))
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Backed up log: {p} -> {backup_path}")
        except Exception as _e:
            print(f"WARNING: Failed to backup log for job {job_id}: {_e}")
        # Force wave dump on retry: ensure wave=fsdb in command
        try:
            cmd_list = opcode.get('cmd', [])
            if isinstance(cmd_list, list):
                replaced = False
                for i, token in enumerate(cmd_list):
                    if isinstance(token, str) and token.startswith('wave='):
                        if token != 'wave=fsdb':
                            cmd_list[i] = 'wave=fsdb'
                        replaced = True
                        break
                if not replaced:
                    cmd_list.append('wave=fsdb')
                # Keep rerun tag consistent where applicable for standard resubmits (no harm if already present)
                if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list):
                    cmd_list.append('lmn=rerun')
                opcode['cmd'] = cmd_list
        except Exception:
            pass
        # Ensure corresponding TestResult exists/updated based on unique_key (bypass name/seed mapping)
        unique_key = stored.get('unique_key')
        if not unique_key:
            # Fallback: compute from opcode
            test_name = opcode.get('case', {}).get('name', 'unknown')
            config = opcode.get('case', {}).get('config', 'default')
            seed_val = opcode.get('seed', 'unknown')
            opts_str = "_".join(opcode.get('case', {}).get('opts', [])) if opcode.get('case', {}).get('opts') else "no_opts"
            unique_key = f"{test_name}:{config}:{seed_val}:{opts_str}"
        else:
            # Parse parts just for constructing missing TestResult if needed
            try:
                name_part, config_part, seed_part, _ = unique_key.split(':', 3)
            except ValueError:
                name_part = opcode.get('case', {}).get('name', 'unknown')
                config_part = opcode.get('case', {}).get('config', 'default')
                seed_part = opcode.get('seed', 'unknown')
        # Create or update TestResult entry directly by unique_key
        try:
            if unique_key not in self.results:
                # Build a new TestResult with available meta
                new_name = locals().get('name_part', opcode.get('case', {}).get('name', 'unknown'))
                new_cfg = locals().get('config_part', opcode.get('case', {}).get('config', 'default'))
                self.results[unique_key] = TestResult(new_name, new_cfg)
                self.results[unique_key].seed = locals().get('seed_part', opcode.get('seed', 'unknown'))
                self.results[unique_key].opts = opcode.get('case', {}).get('opts', [])
                try:
                    self.results[unique_key].estimated_duration = self.estimate_test_duration(new_name, new_cfg)
                except Exception:
                    pass
            else:
                # Update existing TestResult for retry
                existing_result = self.results[unique_key]
                existing_result.retry_count = opcode.get('retry_attempt', 1)
                existing_result.is_retry = True
                existing_result.retry_started = True
                # Create a new unique key for retry cases to track them separately
                retry_unique_key = f"{unique_key}_retry_{opcode.get('retry_attempt', 1)}"
                if retry_unique_key not in self.results:
                    retry_name = locals().get('name_part', opcode.get('case', {}).get('name', 'unknown'))
                    retry_cfg = locals().get('config_part', opcode.get('case', {}).get('config', 'default'))
                    retry_result = TestResult(retry_name, retry_cfg)
                    retry_result.seed = locals().get('seed_part', opcode.get('seed', 'unknown'))
                    retry_result.opts = opcode.get('case', {}).get('opts', [])
                    retry_result.retry_count = opcode.get('retry_attempt', 1)
                    retry_result.is_retry = True
                    retry_result.retry_started = True
                    try:
                        retry_result.estimated_duration = self.estimate_test_duration(retry_name, retry_cfg)
                    except Exception:
                        pass
                    self.results[retry_unique_key] = retry_result
        except Exception:
            pass
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        case_name = opcode.get('case', {}).get('name', 'unknown')
        case_seed = opcode.get('seed', 'unknown')
        print(f"INFO: {timestamp} [jobid {job_id}] Directly resubmitting FAIL case {case_name} seed={case_seed} (retry {opcode.get('retry_attempt', 1)}/{max_retries})")
        # Submit the retry job
        result = self.submit_test_case(opcode)
        if result["status"] == "SUBMITTED":
            # Store the retry job metadata
            retry_job_id = result["job_id"]
            # NOTE(review): retry_unique_key is only bound in the "unique_key already in
            # self.results" branch above; when a fresh TestResult was created instead, the
            # next line raises NameError — confirm and guard before relying on this path.
            opcode['unique_key'] = retry_unique_key
            self.job_meta[str(retry_job_id)] = opcode
            # Update the retry TestResult with the new job ID
            if retry_unique_key in self.results:
                self.results[retry_unique_key].job_id = retry_job_id
                self.results[retry_unique_key].status = "PENDING"
            # Add to submitted jobs list
            if retry_job_id not in self.submitted_jobs:
                self.submitted_jobs.append(retry_job_id)
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {retry_job_id}] {case_name} seed={case_seed} Submitted (retry {opcode.get('retry_attempt', 1)}/{max_retries})")
        else:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Retry submission failed: {result.get('error', 'Unknown error')}")
def update_test_result_with_job_info(self, test_name: str, job_id: str, seed: str, log_file: Optional[str] = None):
    """Attach LSF job information to the first TestResult whose name matches.

    Args:
        test_name: Test case name to look up in self.results.
        job_id: LSF job id assigned at submission.
        seed: Simulation seed for this run.
        log_file: Optional path to the simulation log for this job.
    """
    for result_key, result in self.results.items():
        if result.name == test_name:
            result.job_id = job_id
            # FIX: TestResult.__init__ always defines `seed` (as None), so the old
            # `hasattr` guard never fired and the seed was never stored. Only
            # overwrite when no real seed has been recorded yet.
            if getattr(result, 'seed', None) is None:
                result.seed = seed
            # Store log file path when provided
            if log_file:
                result.log_file = log_file
            # Transition PENDING -> RUNNING once the job is actually submitted
            if result.status == "PENDING":
                result.start()  # sets status to RUNNING and records start_time
            break

def update_test_result_status(self, job_id: str, status: str):
    """Record a final status for the TestResult (and submitted_results entry) owning job_id."""
    # Find the test result by job_id
    for result_key, result in self.results.items():
        if hasattr(result, 'job_id') and result.job_id == job_id:
            result.finish(status, "")
            break
    # Mirror the status into the raw submission records as well
    for result in getattr(self, 'submitted_results', []):
        if result.get('job_id') == job_id:
            result['status'] = status
            break

def _update_test_result_status(self, unique_key: str, job_id: str, seed: str, status: str = "PENDING") -> bool:
    """Helper to update a TestResult (looked up by unique_key) consistently.

    Returns:
        True when the key existed and was updated, False otherwise.
    """
    if unique_key in self.results:
        self.results[unique_key].job_id = job_id
        self.results[unique_key].seed = seed
        self.results[unique_key].status = status
        return True
    print(f"Warning: TestResult not found for key: {unique_key}")
    return False

def get_test_status_display(self, job_id: str, status: str) -> str:
    """Return the '[jobid N] name seed=S' display string for a job.

    `status` is accepted for interface compatibility but not used here.
    """
    test_info = self.get_test_info_by_job_id(job_id)
    if test_info:
        return f"[jobid {job_id}] {test_info['name']} seed={test_info['seed']}"
    # Unknown mapping happens when TestResult hasn't been recorded yet; mute noisy label
    return f"[jobid {job_id}]"

def parse_job_id(self, output: bytes) -> str:
    """Extract the numeric job id from raw LSF submission output.

    Expects a line like 'Job <12345> is submitted to queue <normal>'.
    Returns 'UNKNOWN' when no id can be parsed.
    """
    try:
        output_str = output.decode('utf-8')
        match = re.search(r'Job <(\d+)>', output_str)
        if match:
            return match.group(1)
        return "UNKNOWN"
    except Exception:
        return "UNKNOWN"

def parse_memory(self, memory_str: str) -> int:
    """Convert a memory spec ('4GB', '512MB' or plain MB number) to integer MB.

    Falls back to 4000 MB (~4GB) on any parse error.
    """
    try:
        if memory_str.endswith('GB'):
            return int(float(memory_str[:-2]) * 1024)
        elif memory_str.endswith('MB'):
            return int(float(memory_str[:-2]))
        else:
            return int(memory_str)
    except Exception:
        return 4000  # Default to 4GB

def _find_result_for_job(self, job_id):
    """Return the TestResult whose (name, seed) matches job_id's metadata, or None."""
    test_info = self.get_test_info_by_job_id(job_id)
    if not test_info:
        return None
    for _, result_obj in self.results.items():
        if result_obj.name == test_info['name'] and getattr(result_obj, 'seed', '') == test_info['seed']:
            return result_obj
    return None

def _resolve_completed_job(self, result_obj, job_id, status) -> str:
    """Decide PASS/FAIL for a job LSF reports as finished and record it on result_obj.

    Shared by the three status-correction passes in run_compile_and_regression
    (previously copy-pasted). For DONE jobs the simulation log is still checked
    for failures/runtime errors; for EXIT/TERM/KILL jobs a PASS in the log
    overrides the LSF status.

    Returns:
        A verdict code: 'PASS', 'PASS_NO_LOG', 'PASS_FROM_LOG', 'FAIL' or
        'FAIL_RUNTIME_ERR' (map to text via _verdict_label).
    """
    if status == "DONE":
        # Even for DONE status, check the log file for errors
        log_file_path = self.get_test_log_path_by_job_id(job_id)
        if log_file_path and os.path.exists(log_file_path):
            test_passed = self.check_test_result(log_file_path)
            has_runtime_errors = self._check_for_runtime_errors(log_file_path)
            if test_passed and not has_runtime_errors:
                result_obj.finish("PASS", "")
                return "PASS"
            if test_passed and has_runtime_errors:
                # Test passed but had runtime errors - mark as FAIL with error info
                result_obj.finish("FAIL", "Test passed but had runtime errors (running but had error)")
                result_obj.error_detected = True
                return "FAIL_RUNTIME_ERR"
            result_obj.finish("FAIL", "Test failed (from log file)")
            return "FAIL"
        # No log file available, assume PASS for DONE status
        result_obj.finish("PASS", "")
        return "PASS_NO_LOG"
    # LSF reports EXIT/TERM/KILL: trust the simulation log if it already shows PASS
    log_file_path = self.get_test_log_path_by_job_id(job_id)
    if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
        result_obj.finish("PASS", "")
        return "PASS_FROM_LOG"
    result_obj.finish("FAIL", f"Job status: {status}")
    return "FAIL"

def _verdict_label(self, verdict: str, annotate_from_log: bool = True) -> str:
    """Map a _resolve_completed_job verdict code to its log message text.

    annotate_from_log=False reproduces the first correction pass, which printed
    a bare 'PASS' even when the verdict came from the log.
    """
    labels = {
        "PASS": "PASS",
        "PASS_NO_LOG": "PASS (no log file)",
        "PASS_FROM_LOG": "PASS (from log)" if annotate_from_log else "PASS",
        "FAIL": "FAIL",
        "FAIL_RUNTIME_ERR": "FAIL (running but had error)",
    }
    return labels[verdict]

def run_compile_and_regression(self, dienum: str, rtl_ver: str, mode: str, define: str = None) -> bool:
    """Run the complete compile + regression flow.

    Steps: (1) submit and wait for the compile job (unless bypassed and the
    compile outputs already exist), (2) load/filter/deduplicate test cases and
    submit them under the max_concurrent limit, (3) monitor jobs to completion
    and reconcile every TestResult against LSF status and simulation logs.

    Returns:
        True on successful completion of the flow, False on compile failure
        or when no test cases could be loaded.
    """
    print(f"{Colors.BLUE}=== Starting Compile and Regression Flow ==={Colors.END}")
    # Build output directory path: self.args.output_dir
    output_dir = self.args.output_dir
    print(f"Using compile output directory: {output_dir}")
    print(f"Using regression directory: {self.regression_dir}")
    # Check if compile should be skipped
    skip_compile = self.should_skip_compile()
    if skip_compile:
        print(f"{Colors.YELLOW}Compile step is set to be bypassed{Colors.END}")
        # Bypass only takes effect when compile artifacts already exist
        if self.check_compile_files_exist(output_dir, dienum, rtl_ver, mode):
            print(f"{Colors.GREEN}Existing compile files found, skipping compile step{Colors.END}")
            compile_required = False
        else:
            print(f"{Colors.YELLOW}No existing compile files found, compile step is required{Colors.END}")
            compile_required = True
    else:
        compile_required = True
    # Step 1: Submit compile job (if required)
    if compile_required:
        print(f"Step 1: Submitting compile job...")
        compile_result = self.submit_compile(self.args.queue, dienum, rtl_ver, mode, define)
        if compile_result["status"] == "SUBMITTED":
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {compile_result['job_id']}] compile_job SUBMITTED")
            print(f"Waiting for compile job to complete...")
            compile_success = self.wait_for_job_completion(compile_result["job_id"])
            # If compile failed, exit immediately
            if not compile_success:
                print(f"{Colors.RED}Compilation failed! Exiting without running regression tests.{Colors.END}")
                return False
            # Verify compile was successful by checking for output files
            if not self.check_compile_files_exist(output_dir, dienum, rtl_ver, mode):
                print(f"{Colors.RED}Error: Compile job completed but no output files found{Colors.END}")
                print(f"{Colors.RED}Compilation verification failed! Exiting without running regression tests.{Colors.END}")
                return False
            print(f"{Colors.GREEN}Compile job completed successfully{Colors.END}")
        else:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] compile_job SUBMIT_FAIL: {compile_result.get('error', 'Unknown error')}")
            return False
    else:
        print(f"Step 1: Compile step skipped (bypass enabled and files exist)")
    # Step 2: Generate and submit test cases
    print(f"Step 2: Generating and submitting test cases...")
    # Load test cases from one of three sources: failed-regression file,
    # explicit regression list, or the default json_list manifest.
    if hasattr(self.args, 'failed_regression') and self.args.failed_regression:
        print(f"Loading failed test cases from: {self.args.failed_regression}")
        test_cases = self.load_failed_regression_cases(self.args.failed_regression)
        if not test_cases:
            print(f"{Colors.RED}Error: No test cases loaded from failed regression file{Colors.END}")
            return False
        print(f"Loaded {len(test_cases)} failed test cases for re-run")
    elif hasattr(self.args, 'list') and self.args.list:
        # Load from regression list file in ../def/case_def/ directory
        regression_list_path = os.path.join(os.getcwd(), "..", "def", "case_def", self.args.list)
        print(f"Loading test cases from regression list: {regression_list_path}")
        test_cases = self.load_regression_list_cases(regression_list_path)
        if not test_cases:
            print(f"{Colors.RED}Error: No test cases loaded from regression list file{Colors.END}")
            return False
        print(f"Loaded {len(test_cases)} test cases from regression list")
    else:
        # Load from normal test files listed in ../def/json_list
        cur_path = os.getcwd()
        test_file_list_name = cur_path + "/../def/json_list"
        test_file_list = []
        with open(test_file_list_name, 'r') as f:
            for line in f:
                if line != '\n':
                    file_path = cur_path + "/../def" + line
                    test_file_list.append(file_path.replace('\n', ''))
        test_cases = self.load_test_cases(test_file_list)
        print(f"Loaded {len(test_cases)} test cases from files")
    # Filter test cases by group (only if groups are specified)
    if self.args.groups:
        test_cases = self.filter_cases(test_cases, self.args.groups)
        print(f"Filtered to {len(test_cases)} test cases for groups: {self.args.groups}")
    else:
        print(f"No group filter applied, using all {len(test_cases)} test cases")
    # Generate test case commands
    all_opcodes = []
    for case in test_cases:
        print(f"Processing test case: {case['name']} with repeat={case.get('repeat', 1)}")
        # Use regression-specific log directory for simulation output
        sim_output_dir = str(self.regression_dir)
        print(f" Using simulation output directory: {sim_output_dir}")
        opcodes = self.gen_test_case(case, output_dir, sim_output_dir, self.args.queue)
        all_opcodes.extend(opcodes)
    print(f"Generated {len(all_opcodes)} test case commands")
    print("Removing duplicate opcodes...")
    # Deduplicate on (name, seed, repeat-id)
    seen_identifiers = set()
    unique_opcodes = []
    for opcode in all_opcodes:
        unique_id = f"{opcode['case']['name']}_{opcode['seed']}_{opcode['id']}"
        if unique_id not in seen_identifiers:
            seen_identifiers.add(unique_id)
            unique_opcodes.append(opcode)
    all_opcodes = unique_opcodes
    print(f"After removing duplicates: {len(all_opcodes)} unique test case commands")
    # Initialize test results for all test cases - ensure no duplicates
    print(f"Initializing test results for {len(all_opcodes)} opcodes...")
    unique_test_keys = set()
    for opcode in all_opcodes:
        test_name = opcode["case"].get("name", "unknown")
        config = opcode["case"].get("config", "default")
        seed = opcode["seed"]
        opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
        # Unique key combines test name, config, seed, and opts
        unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
        if unique_key not in unique_test_keys:
            unique_test_keys.add(unique_key)
            if unique_key not in self.results:
                self.results[unique_key] = TestResult(test_name, config)
                self.results[unique_key].seed = seed
                self.results[unique_key].opts = opcode["case"].get("opts", [])
                self.results[unique_key].estimated_duration = self.estimate_test_duration(test_name, config)
    print(f"Initialized {len(unique_test_keys)} unique test results")
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    print(f"INFO: {timestamp} Submitting test cases with max concurrent limit: {self.args.max_concurrent}")
    # Initial job submission up to max_concurrent
    initial_submit_count = min(self.args.max_concurrent, len(all_opcodes))
    sim_cases_num = len(all_opcodes)
    print(f"Initial submission: will submit {initial_submit_count} jobs")
    for _ in range(initial_submit_count):
        if all_opcodes:
            opcode = all_opcodes.pop(0)
            result = self.submit_test_case(opcode)
            self.submitted_results.append(result)
            if result["status"] == "SUBMITTED":
                # Don't increment running_jobs yet - wait for actual RUN status
                if result["job_id"] not in self.submitted_jobs:
                    self.submitted_jobs.append(result["job_id"])
                # Update corresponding TestResult object
                test_name = opcode["case"].get("name", "unknown")
                config = opcode["case"].get("config", "default")
                seed = opcode["seed"]
                opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                if self._update_test_result_status(unique_key, result["job_id"], seed, "PENDING"):
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {result['job_id']} in initial submission")
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {result['job_id']}] {result['name']} seed={result.get('seed', 'unknown')} PENDING")
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {result['name']} seed={result.get('seed', 'unknown')} SUBMIT_FAIL: {result.get('error', 'Unknown error')}")
            # Delay between submissions to prevent server overload
            time.sleep(2)
    print(f"Initial submission complete. Submitted jobs: {len(self.submitted_jobs)}, Pending opcodes: {len(all_opcodes)}")
    # Clean any duplicate job IDs that may have been added
    self._clean_submitted_jobs()
    # Initialize job status counts after initial submission
    if self.submitted_jobs:
        print(f"Initializing job status counts for {len(self.submitted_jobs)} submitted jobs...")
        initial_status_changes = {}
        for job_id in self.submitted_jobs:
            status = self.check_lsf_job_status(int(job_id))
            initial_status_changes[int(job_id)] = status
            print(f"DEBUG: Job {job_id} initial status: {status}")
        self._update_job_status_counts(initial_status_changes)
        print(f"DEBUG: After initialization - running_jobs: {self.running_jobs}, pending_jobs: {self.pending_jobs}")
    # Main loop: monitor jobs and submit new ones as slots become available
    last_status_print_time = time.time()  # last 30s status print
    last_status_log_time = time.time()    # last 60s RERUN status log
    # Start status monitoring thread for LSF regression
    self.status_thread = threading.Thread(target=self._status_print_thread, daemon=True)
    self.status_thread.start()
    print(f"{Colors.BLUE}Started status monitoring thread for real-time report updates{Colors.END}")
    # Start the job-monitor thread early so a blocked main loop cannot starve
    # the later monitor phase (translated from original Chinese comment).
    try:
        if not hasattr(self, 'monitor_thread') or not getattr(self, 'monitor_thread', None) or not self.monitor_thread.is_alive():
            # Do not pass the shared list reference; let the function pick up the live list
            self.monitor_thread = threading.Thread(target=self.monitor_all_jobs, daemon=True)
            self.monitor_thread.start()
            print(f"{Colors.BLUE}Started early monitor_all_jobs thread{Colors.END}")
    except Exception:
        pass
    print(f"DEBUG: Loop condition check - all_opcodes: {len(all_opcodes)}, running_jobs: {self.running_jobs}, pending_jobs: {self.pending_jobs}")
    print(f"DEBUG: Loop condition result: {bool(all_opcodes or (self.running_jobs > 0 or self.pending_jobs > 0))}")
    while all_opcodes or (self.running_jobs > 0 or self.pending_jobs > 0):
        # The monitor thread, when alive, owns job lifecycle handling
        monitor_alive = False
        try:
            monitor_alive = hasattr(self, 'monitor_thread') and self.monitor_thread and self.monitor_thread.is_alive()
        except Exception:
            monitor_alive = False
        if self.submitted_jobs and not monitor_alive:
            # Lightweight accounting only; lifecycle handled by monitor thread
            status_changes = {}
            for job_id in self.submitted_jobs[:]:
                status_changes[job_id] = self.check_lsf_job_status(int(job_id))
            self._update_job_status_counts(status_changes)
            try:
                self.update_real_time_report()
            except Exception:
                pass
            # Print status summary
            try:
                total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                total_test_cases = self.get_total_test_cases_count()
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} Status: Running={self.running_jobs}, Pending={self.pending_jobs}, Had Rerun={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}, Total={total_test_cases}")
            except Exception:
                pass
        elif monitor_alive:
            # When monitor thread owns job lifecycle, perform a lightweight status update only
            try:
                self._update_lsf_job_statuses()
                self.update_real_time_report()
            except Exception:
                pass
        # Submit new jobs if we have capacity and pending opcodes
        while all_opcodes and self.running_jobs < self.args.max_concurrent:
            opcode = all_opcodes.pop(0)
            result = self.submit_test_case(opcode)
            self.submitted_results.append(result)
            if result["status"] == "SUBMITTED":
                # Don't increment running_jobs yet - wait for actual RUN status
                if result["job_id"] not in self.submitted_jobs:
                    self.submitted_jobs.append(result["job_id"])
                # Update corresponding TestResult object
                test_name = opcode["case"].get("name", "unknown")
                config = opcode["case"].get("config", "default")
                seed = opcode["seed"]
                opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                if self._update_test_result_status(unique_key, result["job_id"], seed, "PENDING"):
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {result['job_id']} in main loop")
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {result['job_id']}] {result['name']} seed={result.get('seed', 'unknown')} PENDING")
                # Show regression status after each submission
                total_test_cases = self.get_total_test_cases_count()
                self.show_regression_status(self.running_jobs, self.pending_jobs, total_test_cases)
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {result['name']} seed={result.get('seed', 'unknown')} SUBMIT_FAIL: {result.get('error', 'Unknown error')}")
            # Delay between submissions to prevent server overload
            time.sleep(2)
        # If we're waiting for jobs to complete, show status every 30s / log every 60s
        if (self.running_jobs > 0 or self.pending_jobs > 0) and not all_opcodes:
            current_time = time.time()
            if current_time - last_status_print_time >= 30:
                # Update LSF job statuses before showing status
                self._update_lsf_job_statuses()
                total_test_cases = self.get_total_test_cases_count()
                self.show_regression_status(self.running_jobs, self.pending_jobs, total_test_cases)
                last_status_print_time = current_time
                self.update_real_time_report()
            if current_time - last_status_log_time >= 60:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                try:
                    total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                except Exception:
                    total_reruns = 0
                try:
                    pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                    fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                    rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                    rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                except Exception:
                    pass_count = fail_count = rerun_pass_count = rerun_fail_count = 0
                print(f"INFO: {timestamp} Status: RUNNING={self.running_jobs}, PENDING={self.pending_jobs}, Total={sim_cases_num}, RERUNS={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}")
                last_status_log_time = current_time
        # Poll interval for the main loop
        time.sleep(3)
    # Step 3: Monitor all jobs
    print(f"Step 3: Monitoring all jobs...")
    successful_submissions = len([r for r in self.submitted_results if r["status"] == "SUBMITTED"])
    failed_submissions = len([r for r in self.submitted_results if r["status"] != "SUBMITTED"])
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    print(f"INFO: {timestamp} Submission Summary: {successful_submissions} successful, {failed_submissions} failed")
    # Clean any duplicate job IDs before final status check
    self._clean_submitted_jobs()
    # Final status check: ensure all submitted jobs have their final status
    print(f"Performing final status check for all submitted jobs...")
    for job_id in self.submitted_jobs[:]:  # iterate a copy; we mutate the list
        status = self.check_lsf_job_status(int(job_id))
        if status in ["DONE", "EXIT", "TERM", "KILL"]:
            test_info = self.get_test_info_by_job_id(job_id)
            if test_info:
                test_name = test_info['name']
                seed = test_info['seed']
                found_result = self._find_result_for_job(job_id)
                if found_result:
                    verdict = self._resolve_completed_job(found_result, job_id, status)
                    label = self._verdict_label(verdict, annotate_from_log=False)
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} {label}")
                else:
                    print(f"Warning: TestResult not found for {test_name} seed={seed}")
            self.submitted_jobs.remove(job_id)
    # Additional check: reconcile any TestResult still marked RUNNING
    for result_key, result_obj in self.results.items():
        if result_obj.status == "RUNNING" and hasattr(result_obj, 'job_id') and result_obj.job_id:
            try:
                status = self.check_lsf_job_status(int(result_obj.job_id))
                if status in ["DONE", "EXIT", "TERM", "KILL"]:
                    verdict = self._resolve_completed_job(result_obj, result_obj.job_id, status)
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> {self._verdict_label(verdict)}")
            except Exception as e:
                print(f"Warning: Could not check status for job {result_obj.job_id}: {e}")
    # Final validation: ensure no RUNNING status remains if all jobs are done
    if len(self.submitted_jobs) == 0:
        print(f"All jobs completed, ensuring no RUNNING status remains...")
        for result_key, result_obj in self.results.items():
            if result_obj.status == "RUNNING":
                # Job completed but status wasn't updated
                if hasattr(result_obj, 'job_id') and result_obj.job_id:
                    try:
                        status = self.check_lsf_job_status(int(result_obj.job_id))
                        if status in ["DONE", "EXIT", "TERM", "KILL"]:
                            verdict = self._resolve_completed_job(result_obj, result_obj.job_id, status)
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> {self._verdict_label(verdict)}")
                    except Exception as e:
                        print(f"Warning: Could not check final status for job {result_obj.job_id}: {e}")
                else:
                    # No job_id, mark as PENDING
                    result_obj.status = "PENDING"
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> PENDING (no job_id)")
    # Final status refresh - recheck all test results based on log files
    print(f"{Colors.BLUE}Performing final status refresh based on log files...{Colors.END}")
    self._final_status_refresh()
    self.monitor_all_jobs(self.submitted_jobs)
    # Stop all monitoring threads after all jobs are completed
    print(f"{Colors.BLUE}Stopping all monitoring threads...{Colors.END}")
    self._stop_status_thread = True
    if hasattr(self, 'status_thread') and self.status_thread.is_alive():
        self.status_thread.join(timeout=5)
        print(f"{Colors.BLUE}Status monitoring thread stopped{Colors.END}")
    return True

def wait_for_job_completion(self, job_id: str) -> bool:
    """Block until the compile job finishes; True on DONE, False on failure.

    A streak of UNKNOWN statuses (bjobs not reporting the job) longer than the
    threshold is treated as failure. NOTE: the original version had unreachable
    sleep/print statements after the `while True` loop; they have been removed.
    """
    print(f"Waiting for job {job_id} to complete...")
    unknown_count = 0
    max_unknown_threshold = 20  # Allow more UNKNOWN status for compile jobs
    while True:
        status = self.check_lsf_job_status(int(job_id))
        if status == "DONE":
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] compile_job PASS :)")
            return True  # Compilation successful
        elif status in ["EXIT", "TERM", "KILL"]:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] compile_job FAIL :(")
            return False  # Compilation failed
        elif status == "UNKNOWN":
            unknown_count += 1
            if unknown_count >= max_unknown_threshold:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {job_id}] compile_job FAIL: unknown_lsf_status")
                return False  # Compilation failed due to unknown status
            print(f"Job {job_id} status: {status} (unknown count: {unknown_count})")
        else:
            # Reset unknown counter for other statuses
            unknown_count = 0
            print(f"Job {job_id} status: {status}")
        time.sleep(10)

def monitor_all_jobs(self, job_list=None):
    """Monitor submitted LSF jobs until completion, deciding PASS/FAIL per job.

    Uses batched bjobs queries; DONE jobs are judged by their simulation log,
    EXIT/TERM/KILL jobs may be overridden to PASS by the log, and persistent
    UNKNOWN jobs fail after a threshold. Failed jobs are resubmitted when
    --retry is enabled. Mutates self.running_jobs / self.pending_jobs.
    """
    if job_list is None:
        job_list = self.submitted_jobs
    self._clean_submitted_jobs()
    print(f"Monitoring {len(job_list)} submitted jobs...")
    max_unknown_count = {}
    max_unknown_threshold = 10
    while job_list:
        completed_jobs = []
        job_ids = [int(job_id) for job_id in job_list]
        status_map = self.batch_check_job_status(job_ids)
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        # Show lightweight progress for RUN/PEND
        for job_id, status in status_map.items():
            if status in ["RUN", "PEND"]:
                print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} {status}")
        for job_id in list(job_list):
            int_job_id = int(job_id)
            status = status_map.get(int_job_id, "UNKNOWN")
            if status == "DONE":
                # Decide by log contents
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                test_passed = False
                has_runtime_errors = False
                if log_file_path and os.path.exists(log_file_path):
                    test_passed = self.check_test_result(log_file_path)
                    has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                # FIX: found_result can legitimately be None (job meta not yet
                # recorded); the original dereferenced it unconditionally.
                found_result = self._find_result_for_job(job_id)
                if test_passed and not has_runtime_errors:
                    if found_result:
                        found_result.finish("RERUN PASS" if found_result.is_retry else "PASS", "")
                    self.update_test_result_status(job_id, "PASS")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} PASS")
                    completed_jobs.append(job_id)
                elif test_passed and has_runtime_errors:
                    # Test passed but had runtime errors - mark as FAIL
                    error_msg = "Test passed but had runtime errors (running but had error)"
                    if found_result:
                        found_result.finish("RERUN FAIL" if found_result.is_retry else "FAIL", error_msg)
                        found_result.error_detected = True
                    self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} FAIL (running but had error)")
                    # Centralized retry on FAIL - _resubmit_from_stored_opcode enforces retry limits
                    if getattr(self.args, 'retry', 0) > 0:
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
                else:
                    # Test failed - mark as FAIL
                    if found_result:
                        found_result.finish("RERUN FAIL" if found_result.is_retry else "FAIL", "DONE but log indicates failure")
                    self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} FAIL")
                    if getattr(self.args, 'retry', 0) > 0:
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
            elif status in ["EXIT", "TERM", "KILL"]:
                # Prefer log PASS override if available
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                # FIX: the original read `found_result` here although it was only
                # assigned inside the DONE branch (stale value or NameError).
                found_result = self._find_result_for_job(job_id)
                if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
                    if found_result and found_result.is_retry:
                        found_result.finish("PASS", "")  # TestResult.finish maps to RERUN PASS
                    else:
                        self.update_test_result_status(job_id, "PASS")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} PASS")
                    completed_jobs.append(job_id)
                else:
                    if found_result and found_result.is_retry:
                        found_result.finish("FAIL", f"Job status: {status}")
                    else:
                        self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL")
                    if getattr(self.args, 'retry', 0) > 0:
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
            elif status == "UNKNOWN":
                # Use log to decide if possible; else threshold-based retry
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                if log_file_path and os.path.exists(log_file_path):
                    if self.check_test_result(log_file_path):
                        self.update_test_result_status(job_id, "PASS")
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} PASS (from log)")
                        completed_jobs.append(job_id)
                    else:
                        self.update_test_result_status(job_id, "FAIL")
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL (from log)")
                        if getattr(self.args, 'retry', 0) > 0:
                            self._resubmit_from_stored_opcode(job_id)
                        completed_jobs.append(job_id)
                else:
                    # Track UNKNOWN streak
                    if job_id not in max_unknown_count:
                        max_unknown_count[job_id] = 0
                    max_unknown_count[job_id] += 1
                    if max_unknown_count[job_id] >= max_unknown_threshold:
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL: unknown_lsf_status")
                        if getattr(self.args, 'retry', 0) > 0:
                            self._resubmit_from_stored_opcode(job_id)
                        completed_jobs.append(job_id)
            else:
                # Reset UNKNOWN counter for stable statuses
                if job_id in max_unknown_count:
                    max_unknown_count[job_id] = 0
        # Remove completed jobs from monitoring list
        for job_id in completed_jobs:
            if job_id in job_list:
                job_list.remove(job_id)
            if job_id in max_unknown_count:
                del max_unknown_count[job_id]
        # Recalculate running/pending counts from current statuses
        try:
            remaining_ids = [int(j) for j in job_list]
            remaining_statuses = {jid: status_map.get(jid, "UNKNOWN") for jid in remaining_ids}
            self.running_jobs = sum(1 for s in remaining_statuses.values() if s == "RUN")
            self.pending_jobs = sum(1 for s in remaining_statuses.values() if s == "PEND")
        except Exception:
            pass
        # Print status summary after job completion
        if completed_jobs:
            try:
                total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                total_test_cases = self.get_total_test_cases_count()
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} Completed {len(completed_jobs)} jobs. Status: Running={self.running_jobs}, Pending={self.pending_jobs}, Had Rerun={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}, Total={total_test_cases}")
            except Exception:
                pass
        if job_list:
            time.sleep(5)
    # Grace period: if new jobs appear (e.g. retries) after list became empty, resume monitoring
    try:
        for _ in range(3):
            if len(self.submitted_jobs) > 0:
                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} New jobs detected after completion, resuming monitoring...")
                return self.monitor_all_jobs(self.submitted_jobs)
            time.sleep(5)
    except Exception:
        pass
    try:
        self.running_jobs = 0
        self.pending_jobs = 0
    except Exception:
        pass
    print("All jobs completed!")
    self._stop_status_thread = True
    if hasattr(self, 'status_thread') and self.status_thread.is_alive():
        self.status_thread.join(timeout=5)
        print(f"{Colors.BLUE}Stopped status monitoring thread{Colors.END}")
    # Also stop monitor thread if it exists
    if hasattr(self, 'monitor_thread') and self.monitor_thread.is_alive():
        print(f"{Colors.BLUE}Stopping monitor thread{Colors.END}")
        # Note: monitor_thread is a daemon thread, it will stop when main thread exits
def check_lsf_job_status(self, job_id: int) -> str:
    """Check the LSF status of a single job via ``bjobs``.

    Args:
        job_id: LSF job id to query.

    Returns:
        The raw LSF status string from the third column of ``bjobs``
        output (e.g. "RUN", "PEND", "DONE", "EXIT"), or "UNKNOWN" when
        the job is no longer in the queue or the query fails.
    """
    try:
        result = subprocess.run(
            ["bjobs", "-noheader", str(job_id)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,  # universal_newlines instead of text= for older Python compatibility
            timeout=30
        )
        if result.returncode == 0:
            lines = result.stdout.strip().split('\n')
            if lines and lines[0]:
                parts = lines[0].split()
                if len(parts) >= 3:
                    # Third column is the job status; return it verbatim
                    # (callers interpret RUN/PEND/DONE/EXIT/... themselves).
                    return parts[2]
            else:
                # Job not found in queue, might have completed
                print(f"INFO: Job {job_id} not found in queue, checking if completed")
                return "UNKNOWN"
        else:
            # Command failed, print error details
            print(f"Warning: bjobs command failed for job {job_id}")
            print(f"Return code: {result.returncode}")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
    except subprocess.TimeoutExpired:
        print(f"Warning: bjobs command timeout for job {job_id}")
    except FileNotFoundError:
        print(f"Warning: bjobs command not found, LSF may not be available")
    except Exception as e:
        print(f"Warning: Error checking job {job_id} status: {e}")
    return "UNKNOWN"

def get_job_details(self, job_id: int) -> Dict:
    """Get detailed job information including failure reasons.

    Parses ``bjobs -l`` output line by line for status, exit code and
    reason, timestamps, and resource usage.

    Args:
        job_id: LSF job id to query.

    Returns:
        Dict of parsed fields; unparsed fields stay ``None`` (status
        stays "UNKNOWN"). On any failure a minimal dict with an
        ``"error"`` key is returned instead of raising.
    """
    try:
        result = subprocess.run(
            ["bjobs", "-l", str(job_id)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            timeout=60
        )
        details = {
            "job_id": job_id,
            "status": "UNKNOWN",
            "exit_code": None,
            "exit_reason": None,
            "submission_time": None,
            "start_time": None,
            "finish_time": None,
            "cpu_time": None,
            "max_memory": None,
            "output_file": None,
            "error_file": None
        }
        if result.returncode == 0:
            content = result.stdout
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                # Keyword-based field extraction from bjobs -l free-form text.
                # "Exit Status" is tested before "Exit Reason" so the elif
                # chain routes each line to exactly one field.
                if "Job Status" in line:
                    details["status"] = line.split()[-1]
                elif "Exit Status" in line:
                    details["exit_code"] = line.split()[-1]
                elif "Exit Reason" in line:
                    details["exit_reason"] = line.split(":", 1)[-1].strip()
                elif "Submitted" in line:
                    details["submission_time"] = line.split(":", 1)[-1].strip()
                elif "Started" in line:
                    details["start_time"] = line.split(":", 1)[-1].strip()
                elif "Finished" in line:
                    details["finish_time"] = line.split(":", 1)[-1].strip()
                elif "CPU time used" in line:
                    details["cpu_time"] = line.split(":", 1)[-1].strip()
                elif "MAX MEM" in line:
                    details["max_memory"] = line.split(":", 1)[-1].strip()
                elif "Output file" in line:
                    details["output_file"] = line.split(":", 1)[-1].strip()
                elif "Error file" in line:
                    details["error_file"] = line.split(":", 1)[-1].strip()
        return details
    except Exception as e:
        print(f"Warning: Error getting job details for {job_id}: {e}")
        return {"job_id": job_id, "status": "UNKNOWN", "error": str(e)}

def batch_check_job_status(self, job_ids: List[int]) -> Dict[int, str]:
    """Batch check multiple job statuses to reduce LSF calls.

    Args:
        job_ids: job ids to query with one ``bjobs`` invocation.

    Returns:
        Mapping of job id -> LSF status string. Jobs missing from the
        ``bjobs`` output map to "UNKNOWN". If the batch call fails,
        falls back to one :meth:`check_lsf_job_status` call per job.
    """
    if not job_ids:
        return {}
    try:
        # Single bjobs call with all ids (removed the unused
        # pre-joined id string the previous version built).
        result = subprocess.run(
            ["bjobs", "-noheader"] + [str(j) for j in job_ids],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,  # universal_newlines instead of text= for older Python compatibility
            timeout=45
        )
        status_map = {}
        if result.returncode == 0:
            for line in result.stdout.strip().split('\n'):
                if line.strip():
                    parts = line.split()
                    if len(parts) >= 3:
                        status_map[int(parts[0])] = parts[2]
        # Fill in UNKNOWN for jobs not found
        for job_id in job_ids:
            if job_id not in status_map:
                status_map[job_id] = "UNKNOWN"
        return status_map
    except Exception as e:
        print(f"Warning: Batch job status check failed: {e}")
        # Fall back to individual checks
        return {job_id: self.check_lsf_job_status(job_id) for job_id in job_ids}
def check_test_result(self, log_file: str) -> bool:
    """Check test result - Enhanced version that prioritizes UVM Report Summary.

    Classification order:
      1. "Report counts by severity" block (UVM Report Summary) — authoritative.
      2. Literal "TEST CASE FAILED" anywhere in the log.
      3. Error patterns found BEFORE the summary marker, combined with the
         position of a PASS line (a PASS printed after the last error is
         treated as recovery; a PASS printed before an error is "running but
         had error" and fails).

    Returns True only when the log is classified as a pass; any read or
    parse failure returns False.
    """
    if not os.path.exists(log_file):
        return False
    try:
        with open(log_file, 'r') as f:
            content = f.read()
        # PRIORITY 1: Check UVM Report Summary first - this is the most reliable indicator
        try:
            # Grab the severity-count block up to the next "**" line or EOF.
            severity_block_match = re.search(r"\*\*\s*Report counts by severity[\s\S]*?(?:\n\*\*|\Z)", content, re.IGNORECASE)
            if severity_block_match:
                severity_block = severity_block_match.group(0)
                def _extract_count(label: str) -> int:
                    # Pull "<label> : <n>" out of the severity block; 0 when absent.
                    m = re.search(rf"{label}\s*:\s*(\d+)", severity_block, re.IGNORECASE)
                    return int(m.group(1)) if m else 0
                summary_error_count = _extract_count('UVM_ERROR')
                summary_fatal_count = _extract_count('UVM_FATAL')
                # If UVM Report Summary shows 0 errors and 0 fatals, check for TEST CASE PASSED
                if summary_error_count == 0 and summary_fatal_count == 0:
                    # Check for TEST CASE PASSED - this is the definitive indicator
                    if re.search(r'TEST CASE PASSED', content, re.IGNORECASE):
                        print(f"DEBUG: UVM Report Summary shows 0 errors/0 fatals + TEST CASE PASSED found -> PASS")
                        return True
                    else:
                        print(f"DEBUG: UVM Report Summary shows 0 errors/0 fatals but no TEST CASE PASSED -> FAIL")
                        return False
                else:
                    # UVM Report Summary shows errors/fatals - definitely FAIL
                    print(f"DEBUG: UVM Report Summary indicates FAIL (UVM_ERROR={summary_error_count}, UVM_FATAL={summary_fatal_count})")
                    return False
        except Exception:
            # Non-fatal; fall back to detailed checks below
            pass
        # Also check for "TEST CASE FAILED" pattern
        if re.search(r'TEST CASE FAILED', content, re.IGNORECASE):
            print(f"DEBUG: 'TEST CASE FAILED' found in log -> FAIL")
            return False
        # CRITICAL FIX: First check for errors before summary section
        # Locate the boundary where summary starts (errors after this should be ignored)
        summary_markers = [
            r'UVM Report catcher Summary',
            r'UVM Report Summary'
        ]
        summary_idx = len(content)
        for marker in summary_markers:
            m = re.search(marker, content)
            if m:
                summary_idx = min(summary_idx, m.start())
        # Region to search for real errors (before summary)
        error_region = content[:summary_idx]
        # Error detection (only before summary)
        # NOTE(review): bare 'Error' is a broad, case-sensitive substring here
        # but matched case-insensitively in the tail checks below — confirm
        # the asymmetry is intentional.
        error_patterns = [r'UVM_ERROR', r'UVM_FATAL', r'SCOREBOARD_MISMATCH', r'Solver failed', r'Error', r'Offending']
        first_error_match = None
        last_error_idx = -1
        for pat in error_patterns:
            for m in re.finditer(pat, error_region):
                if first_error_match is None:
                    first_error_match = m
                last_error_idx = max(last_error_idx, m.start())
        # Non-recoverable rule: any UVM_FATAL before summary is immediate FAIL
        if re.search(r'UVM_FATAL', error_region):
            print("DEBUG: UVM_FATAL found before summary -> immediate FAIL")
            return False
        # PASS detection (stricter to avoid accidental matches)
        # Accept common canonical PASS lines only, anchored to line start
        pass_match = None
        pass_patterns = [
            r'^\s*TEST CASE PASSED\b',
            r'^\s*UVM_.*?TEST PASSED\b',
            r'^\s*SIMULATION PASSED\b',
        ]
        for _pat in pass_patterns:
            _m = re.search(_pat, content, re.MULTILINE)
            if _m:
                pass_match = _m
                break
        # CRITICAL FIX: New logic to handle "running but had error" cases
        # If there are errors before summary, check if PASS comes after the last error
        if first_error_match is not None:
            if pass_match:
                pass_idx = pass_match.start()
                # Only PASS if PASS comes AFTER the last error (indicating recovery)
                if pass_idx > last_error_idx:
                    # Before returning PASS, ensure no tail errors after summary
                    tail_region = content[summary_idx:]
                    if re.search(r'(UVM_ERROR|UVM_FATAL|SCOREBOARD_MISMATCH|Solver failed|Error|Offending)', tail_region, re.IGNORECASE):
                        print("DEBUG: Errors found after summary (tail region) -> treat as FAIL")
                        return False
                    print(f"DEBUG: Test PASSED after errors - PASS at {pass_idx}, last error at {last_error_idx}")
                    return True
                else:
                    # PASS came before or at the same time as error - this is "running but had error"
                    # Persist a .error.log next to the original log with context for triage.
                    error_pos = first_error_match.start()
                    start_pos = max(0, error_pos - 100)
                    end_pos = min(len(error_region), error_pos + 200)
                    error_context = error_region[start_pos:end_pos].strip()
                    error_log_file = Path(log_file).with_suffix('.error.log')
                    with open(error_log_file, 'w') as ef:
                        ef.write(f"Original Log File: {log_file}\n")
                        ef.write(f"Error Type: {first_error_match.group(0)}\n")
                        ef.write(f"Error Context:\n{error_context}\n")
                        ef.write(f"PASS position: {pass_idx}, Last error position: {last_error_idx}\n")
                        ef.write(f"Decision: FAIL - PASS came before/at same time as error\n")
                    print(f"DEBUG: Test FAILED - PASS at {pass_idx}, last error at {last_error_idx} (running but had error)")
                    return False
            else:
                # No PASS found, but there are errors - definitely FAIL
                error_pos = first_error_match.start()
                start_pos = max(0, error_pos - 100)
                end_pos = min(len(error_region), error_pos + 200)
                error_context = error_region[start_pos:end_pos].strip()
                error_log_file = Path(log_file).with_suffix('.error.log')
                with open(error_log_file, 'w') as ef:
                    ef.write(f"Original Log File: {log_file}\n")
                    ef.write(f"Error Type: {first_error_match.group(0)}\n")
                    ef.write(f"Error Context:\n{error_context}\n")
                    ef.write(f"Decision: FAIL - No PASS found, but errors exist\n")
                print(f"DEBUG: Test FAILED - No PASS found, but errors exist")
                return False
        # No errors found before summary
        if pass_match:
            # Additional guard: if tail (after summary) contains errors due to log stitching, FAIL
            tail_region = content[summary_idx:]
            if re.search(r'(UVM_ERROR|UVM_FATAL|SCOREBOARD_MISMATCH|Solver failed|Error|Offending)', tail_region, re.IGNORECASE):
                print("DEBUG: Tail errors detected after summary despite PASS -> FAIL")
                return False
            # No errors and PASS found - definitely PASS
            print(f"DEBUG: Test PASSED - No errors found, PASS exists")
            return True
        else:
            # No errors and no PASS - treat as not passed
            print(f"DEBUG: Test not passed - No errors found, but no PASS either")
            return False
    except Exception as e:
        print(f"{Colors.RED}Error checking log file: {e}{Colors.END}")
        return False

def _final_status_refresh(self):
    """Final status refresh - recheck all test results based on log files.

    For every result that still has a non-final status and a job_id,
    reclassify from its log via :meth:`check_test_result` and set
    PASS/FAIL (or RERUN PASS/RERUN FAIL when the result is a retry).
    """
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    refreshed_count = 0
    for result_key, result_obj in self.results.items():
        # Skip if no job_id or already in final state
        if not hasattr(result_obj, 'job_id') or not result_obj.job_id:
            continue
        # Skip if already in final PASS/FAIL state
        if result_obj.status in ["PASS", "FAIL", "RERUN PASS", "RERUN FAIL", "ERROR", "TIMEOUT"]:
            continue
        try:
            # Get log file path
            log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
            if not log_file_path or not os.path.exists(log_file_path):
                continue
            # Check log file for TEST CASE PASSED
            test_passed = self.check_test_result(log_file_path)
            if test_passed:
                # Test passed - update status based on retry context
                if result_obj.is_retry:
                    old_status = result_obj.status
                    result_obj.finish("RERUN PASS", "")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> RERUN PASS (log shows TEST CASE PASSED)")
                else:
                    old_status = result_obj.status
                    result_obj.finish("PASS", "")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> PASS (log shows TEST CASE PASSED)")
                refreshed_count += 1
            else:
                # Test failed - update status based on retry context
                if result_obj.is_retry:
                    old_status = result_obj.status
                    result_obj.finish("RERUN FAIL", "Final refresh: log indicates failure")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> RERUN FAIL (log indicates failure)")
                else:
                    old_status = result_obj.status
                    result_obj.finish("FAIL", "Final refresh: log indicates failure")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> FAIL (log indicates failure)")
                refreshed_count += 1
        except Exception as e:
            print(f"Warning: Could not refresh status for {result_obj.name}: {e}")
    if refreshed_count > 0:
        print(f"INFO: {timestamp} Final status refresh completed: {refreshed_count} tests updated")
    else:
        print(f"INFO: {timestamp} Final status refresh completed: no tests needed updating")

def _check_for_runtime_errors(self, log_file: str) -> bool:
    """Check for runtime errors in log file that indicate test should be retried.

    Scans the WHOLE log (no summary-boundary logic, unlike
    check_test_result) against a broad pattern list; returns True on the
    first case-insensitive match, False otherwise or on read failure.
    """
    if not os.path.exists(log_file):
        return False
    try:
        with open(log_file, 'r') as f:
            content = f.read()
        # Check for runtime error patterns that indicate test should be retried
        runtime_error_patterns = [
            r'UVM_ERROR',
            r'UVM_FATAL',
            r'SCOREBOARD_MISMATCH',
            r'Solver failed',
            r'\bError\b',
            r'Offending',
            r'ERROR.*runtime',
            r'FATAL.*runtime',
            r'Exception.*occurred',
            r'Assertion.*failed',
            r'Timeout.*occurred',
            r'Memory.*leak',
            r'Resource.*exhausted',
            r'Connection.*failed',
            r'Protocol.*violation',
            r'Deadlock.*detected',
            r'Livelock.*detected'
        ]
        # Search for runtime errors in the entire log
        for pattern in runtime_error_patterns:
            if re.search(pattern, content, re.IGNORECASE):
                print(f"DEBUG: Runtime error detected in {log_file}: {pattern}")
                return True
        return False
    except Exception as e:
        print(f"{Colors.RED}Error checking for runtime errors in log file: {e}{Colors.END}")
        return False

def retry_failed_tests(self):
    """Retry failed tests with parallel retry mechanism.

    Collects every result currently in FAIL/ERROR/TIMEOUT, launches
    ``self.args.retry`` parallel LSF reruns for each, and folds the
    outcomes back into the original result objects. A test only flips
    to PASS when ALL of its reruns pass.
    """
    failed_tests = [(name, result) for name, result in self.results.items() if result.status in ["FAIL", "ERROR", "TIMEOUT"]]
    if not failed_tests:
        print(f"{Colors.GREEN}No tests need retry{Colors.END}")
        return
    print(f"\n{Colors.YELLOW}=== Retry Failed Tests ==={Colors.END}")
    print(f"Failed test count: {len(failed_tests)}")
    print(f"Max retry count: {self.args.retry}")
    print(f"Retry strategy: Parallel retry - ALL retries must pass for test to pass")
    for test_key, result in failed_tests:
        # Extract test name from the key (test_key format is "test_name:config:seed")
        test_name = result.name # Use the actual test name from TestResult object
        original_seed = getattr(result, 'seed', None)
        print(f"\n{Colors.CYAN}Starting parallel retry for {test_name} (original seed: {original_seed}){Colors.END}")
        # LSF mode parallel retry
        retry_results = self._run_parallel_lsf_retry(test_name, result, original_seed)
        # Process retry results
        self._process_parallel_retry_results(test_name, result, retry_results)
def _run_parallel_lsf_retry(self, test_name: str, result, original_seed: str) -> List[Dict]:
    """Run parallel LSF retry tests.

    Regenerates the test case ``self.args.retry`` times (first retry
    reuses the original seed when known, the rest use random seeds),
    submits them all to LSF, then blocks until every retry job finishes.

    Returns the list of completed submission dicts (empty when the test
    case cannot be located or nothing was submitted).
    """
    retry_results = []
    # Find the original test case data
    print(f" Searching for original test case: {test_name}")
    original_case = self.find_original_test_case(test_name)
    if not original_case:
        print(f" Test case not found in JSON files, checking self.tests...")
        # Fall back to the in-memory test list; synthesize a minimal case dict.
        for test_tuple in self.tests:
            if test_tuple[0] == test_name:
                print(f" Found test case '{test_name}' in self.tests")
                original_case = {
                    'name': test_name,
                    'config': test_tuple[1],
                    'repeat': 1,
                    'timeout': 60,
                    'opts': []
                }
                break
    if not original_case:
        print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, skipping retry{Colors.END}")
        return []
    # Generate retry opcodes
    output_dir = self.args.output_dir
    sim_output_dir = getattr(self.args, 'dir', output_dir)
    retry_opcodes = []
    for retry in range(1, self.args.retry + 1):
        if retry == 1 and original_seed:
            # First retry: use original seed
            print(f" Retry {retry}: Using original seed: {original_seed}")
            opcodes = self.gen_test_case(original_case, output_dir, sim_output_dir, self.args.queue, specified_seed=original_seed)
        else:
            # Other retries: use random seed
            print(f" Retry {retry}: Using random seed")
            opcodes = self.gen_test_case(original_case, output_dir, sim_output_dir, self.args.queue)
        if opcodes:
            opcode = opcodes[0]
            opcode['retry_attempt'] = retry
            opcode['retry_seed'] = original_seed if retry == 1 else 'random'
            retry_opcodes.append(opcode)
    # Submit all retry jobs in parallel
    print(f" Submitting {len(retry_opcodes)} parallel retry jobs...")
    submitted_jobs = []
    for opcode in retry_opcodes:
        retry_result = self.submit_test_case(opcode)
        if retry_result["status"] == "SUBMITTED":
            retry_result['retry_attempt'] = opcode['retry_attempt']
            retry_result['retry_seed'] = opcode['retry_seed']
            submitted_jobs.append(retry_result)
            print(f" Retry {opcode['retry_attempt']} submitted: jobid {retry_result['job_id']}")
        else:
            print(f" Retry {opcode['retry_attempt']} submission failed: {retry_result.get('error', 'Unknown error')}")
    # Wait for all jobs to complete
    if submitted_jobs:
        print(f" Waiting for {len(submitted_jobs)} retry jobs to complete...")
        retry_results = self._wait_for_parallel_jobs(submitted_jobs)
    return retry_results

def _wait_for_parallel_jobs(self, submitted_jobs: List[Dict]) -> List[Dict]:
    """Wait for multiple jobs to complete in parallel.

    Polls LSF every 10 seconds; DONE maps to "PASS", EXIT/TERM/KILL map
    to "FAIL". Returns the job dicts with their ``status`` field set.
    """
    completed_jobs = []
    job_ids = [job['job_id'] for job in submitted_jobs]
    while job_ids:
        completed_jobs_batch = []
        jobs_to_remove = []
        for job_id in job_ids:
            status = self.check_lsf_job_status(int(job_id))
            if status in ["DONE", "EXIT", "TERM", "KILL"]:
                # Find the corresponding job info
                job_info = next((job for job in submitted_jobs if job['job_id'] == job_id), None)
                if job_info:
                    if status == "DONE":
                        job_info["status"] = "PASS"
                    else:
                        job_info["status"] = "FAIL"
                    completed_jobs_batch.append(job_info)
                    jobs_to_remove.append(job_id)
        # Remove completed jobs from monitoring list
        for job_id in jobs_to_remove:
            job_ids.remove(job_id)
        if completed_jobs_batch:
            completed_jobs.extend(completed_jobs_batch)
            for job in completed_jobs_batch:
                if job["status"] == "PASS":
                    status_icon = f"{Colors.GREEN}✓{Colors.END}"
                elif job["status"] == "RERUN PASS":
                    # NOTE(review): status is only ever set to PASS/FAIL above,
                    # so this branch looks unreachable here — confirm.
                    status_icon = f"{Colors.CYAN}✓{Colors.END}"
                else:
                    status_icon = f"{Colors.RED}✗{Colors.END}"
                print(f" Retry {job['retry_attempt']} completed: {status_icon} {job['retry_seed']}")
        if job_ids:
            time.sleep(10) # Wait before next check
    return completed_jobs

def _start_immediate_retry(self, result):
    """Start immediate retry for a failed test.

    Rate-limits to 20 concurrent retry threads, guards against double
    retries via ``result.retry_started``, then spawns a daemon thread
    running :meth:`_run_immediate_retry`.
    """
    test_name = result.name
    original_seed = getattr(result, 'seed', None)
    # Rate limit concurrent retry threads
    # NOTE(review): active_retry_threads is a plain int updated from several
    # threads without a lock — increments/decrements may race; confirm.
    if not hasattr(self, 'active_retry_threads'):
        self.active_retry_threads = 0
    if self.active_retry_threads >= 20:
        print(f" Retry queue full (20). Delaying retry for {test_name}...")
        # Busy-wait with sleep until slot available
        while self.active_retry_threads >= 20:
            time.sleep(5)
    # Check if we've already started retries for this test
    if hasattr(result, 'retry_started') and result.retry_started:
        print(f" Retry already started for {test_name}, skipping")
        return
    # Mark that retry has been started
    if not hasattr(result, 'retry_started'):
        result.retry_started = False
    result.retry_started = True
    result.retry_count = 0
    print(f"\n{Colors.CYAN}🚀 Starting immediate retry for {test_name} (original seed: {original_seed}){Colors.END}")
    # Start retry in a separate thread to avoid blocking the main monitoring loop
    self.active_retry_threads += 1
    retry_thread = threading.Thread(
        target=self._run_immediate_retry,
        args=(test_name, result, original_seed),
        daemon=True
    )
    retry_thread.start()

def _run_immediate_retry(self, test_name: str, result, original_seed: str):
    """Run immediate retry in a separate thread (thread entry point)."""
    try:
        # LSF mode immediate retry
        self._run_immediate_lsf_retry(test_name, result, original_seed)
    except Exception as e:
        print(f"{Colors.RED}Error in immediate retry for {test_name}: {e}{Colors.END}")
    finally:
        # Decrease active retry counter when thread finishes
        if hasattr(self, 'active_retry_threads') and self.active_retry_threads > 0:
            self.active_retry_threads -= 1

def _run_immediate_lsf_retry(self, test_name: str, result, original_seed: str):
    """Run immediate LSF retry - stop on first success.

    Submits exactly ONE retry job (reusing the original seed when known)
    with waveform dumping forced on (``wave=fsdb``) and an ``lmn=rerun``
    tag, then returns immediately; the main monitoring loop tracks the
    submitted job to completion.
    """
    # Find the original test case data with proper opts handling
    original_case = self.find_original_test_case_with_opts(test_name, result)
    if not original_case:
        print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, trying fallback method{Colors.END}")
        # Fallback to original method
        original_case = self.find_original_test_case(test_name)
        if not original_case:
            print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, skipping immediate retry{Colors.END}")
            return
    # Start only one retry attempt - the main loop will handle subsequent retries if needed
    retry = 1
    if original_seed:
        # First retry: use original seed
        print(f" Immediate retry {retry}: Using original seed: {original_seed}")
        opcodes = self.gen_test_case(original_case, self.args.output_dir, str(self.regression_dir), self.args.queue, specified_seed=original_seed)
    else:
        # Use random seed
        print(f" Immediate retry {retry}: Using random seed")
        opcodes = self.gen_test_case(original_case, self.args.output_dir, str(self.regression_dir), self.args.queue)
    # Force wave dump for retry-generated opcodes BEFORE selecting and submitting
    try:
        for oc in opcodes or []:
            cmd_list = oc.get('cmd', [])
            if isinstance(cmd_list, list):
                replaced = False
                for i, token in enumerate(cmd_list):
                    if isinstance(token, str) and token.startswith('wave='):
                        if token != 'wave=fsdb':
                            cmd_list[i] = 'wave=fsdb'
                        replaced = True
                        break
                if not replaced:
                    cmd_list.append('wave=fsdb')
                # Tag immediate retry
                if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list):
                    cmd_list.append('lmn=rerun')
                oc['cmd'] = cmd_list
    except Exception:
        pass
    if not opcodes:
        print(f" Failed to generate retry opcodes for {test_name}")
        return
    opcode = opcodes[0]
    # Ensure the selected opcode itself carries wave=fsdb (explicit)
    try:
        cmd_list = opcode.get('cmd', [])
        if isinstance(cmd_list, list):
            replaced = False
            for i, token in enumerate(cmd_list):
                if isinstance(token, str) and token.startswith('wave='):
                    if token != 'wave=fsdb':
                        cmd_list[i] = 'wave=fsdb'
                    replaced = True
                    break
            if not replaced:
                cmd_list.append('wave=fsdb')
            # Tag immediate retry
            if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list):
                cmd_list.append('lmn=rerun')
            opcode['cmd'] = cmd_list
    except Exception:
        pass
    retry_result = self.submit_test_case(opcode)
    if retry_result["status"] == "SUBMITTED":
        result.retry_count = retry
        print(f" Immediate retry {retry} submitted: jobid {retry_result['job_id']}")
        # CRITICAL FIX: Don't wait here - let the main loop handle retry job completion
        # The retry job is now tracked in the main loop and will be processed there
        print(f" Immediate retry {retry} submitted and tracked in main loop: jobid {retry_result['job_id']}")
        # Return immediately - the main loop will handle completion
        return
    else:
        print(f" {test_name} retry {retry} submission failed: {retry_result.get('error', 'Unknown error')}")
        # If submission failed, we can't retry further
        print(f"{Colors.RED}✗{Colors.END} {test_name} retry submission failed - Original test remains FAIL")

def _wait_for_retry_job_completion(self, job_id: str, test_name: str, retry_num: int) -> bool:
    """Wait for a specific retry job to complete.

    Polls LSF every 10 seconds until the job reaches a terminal state.
    When the job disappears from the queue (UNKNOWN) the log file is
    consulted instead. Returns True on pass, False otherwise.
    """
    print(f" Waiting for retry {retry_num} job {job_id} to complete...")
    while True:
        status = self.check_lsf_job_status(int(job_id))
        if status == "DONE":
            print(f" Retry {retry_num} job {job_id} PASSED")
            return True
        elif status in ["EXIT", "TERM", "KILL"]:
            print(f" Retry {retry_num} job {job_id} FAILED")
            return False
        elif status == "UNKNOWN":
            # Job may have completed and been removed from queue
            print(f" Retry {retry_num} job {job_id} status UNKNOWN, checking log file...")
            # Try to determine result from log file
            test_info = self.get_test_info_by_job_id(job_id)
            if test_info:
                log_file_path = self.get_test_log_path_by_job_id(job_id)
                if log_file_path and os.path.exists(log_file_path):
                    # CRITICAL FIX: Enhanced status determination for retry jobs
                    test_passed = self.check_test_result(log_file_path)
                    has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                    if test_passed and not has_runtime_errors:
                        print(f" Retry {retry_num} job {job_id} PASSED (from log file)")
                        return True
                    elif test_passed and has_runtime_errors:
                        print(f" Retry {retry_num} job {job_id} FAILED (running but had error, from log file)")
                        return False
                    else:
                        print(f" Retry {retry_num} job {job_id} FAILED (from log file)")
                        return False
            # If we can't determine, assume failed
            print(f" Retry {retry_num} job {job_id} assumed FAILED")
            return False
        else:
            # Still running
            time.sleep(10)

def get_test_log_path_by_job_id(self, job_id: str) -> str:
    """Get log file path by job ID.

    Tries, in order: the submission-time ``job_meta`` record, the
    ``submitted_results`` snapshot, then the matching TestResult (its
    recorded log_file or a path derived via get_test_log_path).
    Returns '' when nothing is found.
    """
    # 1) Prefer the opcode we stored at submission time (survives after job DONE)
    try:
        stored = getattr(self, 'job_meta', {}).get(str(job_id))
        if stored:
            log_path = stored.get('log_path') or stored.get('log_file')
            if log_path:
                return log_path
    except Exception:
        pass
    # 2) Fallback: search through submitted_results snapshot
    try:
        for result in getattr(self, 'submitted_results', []):
            if result.get('job_id') == job_id:
                return result.get('log_path', '')
    except Exception:
        pass
    # 3) Fallback: locate TestResult by job_id, then derive its log path
    try:
        for _, res in getattr(self, 'results', {}).items():
            if hasattr(res, 'job_id') and str(getattr(res, 'job_id', '')) == str(job_id):
                # Prefer explicitly recorded path if present
                if getattr(res, 'log_file', ''):
                    return res.log_file
                # Derive from standard layout
                derived = self.get_test_log_path(res)
                if derived:
                    return derived
    except Exception:
        pass
    # 4) Nothing found
    return ''

def _process_parallel_retry_results(self, test_name: str, result, retry_results: List):
    """Process parallel retry results and update original test result.

    The original test flips to PASS only when EVERY retry passed;
    otherwise it keeps its failing status and only retry_count is
    updated from the last attempt.
    """
    if not retry_results:
        print(f"{Colors.RED}No retry results for {test_name}{Colors.END}")
        return
    # Check if ALL retries passed
    # NOTE(review): `r.status` raises AttributeError on a dict and `r.get`
    # raises on a TestResult — one side of this `or` always raises for any
    # given element type; confirm what retry_results actually contains.
    passed_retries = [r for r in retry_results if r.status == "PASS" or r.get("status") == "PASS"]
    total_retries = len(retry_results)
    if len(passed_retries) == total_retries:
        # ALL retries passed - mark original test as PASS
        passed_retry = passed_retries[0] # Use the first passed retry for info
        if hasattr(passed_retry, 'status'):
            # LSF mode
            result.finish("PASS", f"ALL {total_retries} retries successful")
            result.retry_count = total_retries
            result.seed = passed_retry.seed
            result.log_file = passed_retry.log_file
            print(f"{Colors.GREEN}✓{Colors.END} {test_name} ALL {total_retries} retries passed - Original test marked as PASS")
        else:
            # LSF mode
            result.finish("PASS", f"ALL {total_retries} retries successful")
            result.retry_count = total_retries
            result.seed = passed_retry.get('seed', 'unknown')
            result.job_id = passed_retry.get('job_id', 'unknown')
            print(f"{Colors.GREEN}✓{Colors.END} {test_name} ALL {total_retries} retries passed - Original test marked as PASS")
    else:
        # Not all retries passed - test remains FAIL
        failed_count = total_retries - len(passed_retries)
        print(f"{Colors.RED}✗{Colors.END} {test_name} {failed_count}/{total_retries} retries failed - Original test remains FAIL")
        # Update with the last retry attempt info
        last_retry = retry_results[-1]
        if hasattr(last_retry, 'retry_attempt'):
            # LSF mode
            result.retry_count = last_retry.retry_attempt
        else:
            # LSF mode
            result.retry_count = last_retry['retry_attempt']

def merge_coverage(self):
    """Merge coverage databases.

    No-op unless --coverage was given. Merges every existing per-test
    coverage DB with ``urg`` into a timestamped .vdb, then generates an
    HTML report from the merged database.
    """
    if not self.args.coverage:
        return
    print(f"\n{Colors.BLUE}=== Merge Coverage Databases ==={Colors.END}")
    # Find all coverage databases
    cov_dbs = []
    for result in self.results.values():
        if result.coverage_db and os.path.exists(result.coverage_db):
            cov_dbs.append(result.coverage_db)
    if not cov_dbs:
        print(f"{Colors.YELLOW}No coverage databases found{Colors.END}")
        return
    # Merge coverage
    merged_db = self.coverage_dir / f"merged_{datetime.now().strftime('%Y%m%d_%H%M%S')}.vdb"
    try:
        cmd = ["urg", "-dir"] + cov_dbs + ["-dbname", str(merged_db)]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(f"{Colors.GREEN}Coverage merge completed: {merged_db}{Colors.END}")
        # Generate coverage report
        report_dir = self.report_dir / f"coverage_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        cmd = ["urg", "-dir", str(merged_db), "-report", str(report_dir)]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(f"{Colors.GREEN}Coverage report generated: {report_dir}{Colors.END}")
    except subprocess.CalledProcessError as e:
        print(f"{Colors.RED}Coverage merge failed: {e}{Colors.END}")

def generate_report(self):
    """Generate test report.

    Validates and de-duplicates results, counts only final-status tests,
    writes CSV and JSON reports, prints a colored console summary, and
    then triggers the detailed/error/summary/real-time sub-reports.
    """
    print(f"\n{Colors.BLUE}=== Generate Test Report ==={Colors.END}")
    # Final status validation before generating report
    print(f"Performing final status validation before generating report...")
    self.validate_all_test_statuses()
    # Clean up any duplicate test results before counting
    self._clean_duplicate_test_results()
    # Count results - ensure we count only tests with final statuses
    final_status_results = [r for r in self.results.values() if r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]
    print(f"Counting results from {len(final_status_results)} completed test cases...")
    total = len(final_status_results)
    passed = len([r for r in final_status_results if r.status == "PASS"])
    rerun_passed = len([r for r in final_status_results if r.status == "RERUN PASS"])
    failed = len([r for r in final_status_results if r.status in ["FAIL", "RERUN FAIL"]])
    errors = len([r for r in final_status_results if r.status == "ERROR"])
    timeouts = len([r for r in final_status_results if r.status == "TIMEOUT"])
    print(f"Result counts: Total={total}, Passed={passed}, Rerun Passed={rerun_passed}, Failed={failed}, Errors={errors}, Timeouts={timeouts}")
    # Calculate total time
    total_time = time.time() - self.start_time
    # Generate CSV report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_file = self.report_dir / f"regression_{timestamp}.csv"
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Test Name', 'Config', 'Status', 'Duration', 'Estimated Time', 'Retry Count', 'Log File', 'Error Message'])
        for result in final_status_results:
            writer.writerow([
                result.name,
                result.config,
                result.status,
                result.get_duration_str(),
                f"{result.estimated_duration:.1f}s",
                result.retry_count,
                result.log_file,
                result.error_msg
            ])
    # Generate JSON report
    json_file = self.report_dir / f"regression_{timestamp}.json"
    report_data = {
        'summary': {
            'total': total,
            'passed': passed,
            'rerun_passed': rerun_passed,
            'failed': failed,
            'errors': errors,
            'timeouts': timeouts,
            'pass_rate': f"{(passed + rerun_passed)/total*100:.1f}%" if total > 0 else "0%",
            'total_time': f"{total_time:.1f}s",
            'timestamp': timestamp
        },
        'tests': {
            # Keyed by name_seed so repeated runs of the same test stay distinct.
            f"{result.name}_{getattr(result, 'seed', 'unknown')}": {
                'config': result.config,
                'status': result.status,
                'duration': result.duration,
                'estimated_duration': result.estimated_duration,
                'retry_count': result.retry_count,
                'log_file': result.log_file,
                'error_msg': result.error_msg
            } for result in final_status_results
        }
    }
    with open(json_file, 'w') as f:
        json.dump(report_data, f, indent=2, ensure_ascii=False)
    # Save historical test data for future time estimation
    self.save_test_history()
    # Print summary
    print(f"\n{Colors.BOLD}=== Regression Test Summary ==={Colors.END}")
    print(f"Total Tests: {total}")
    print(f"{Colors.GREEN}Passed: {passed}{Colors.END}")
    print(f"{Colors.CYAN}Rerun Passed: {rerun_passed}{Colors.END}")
    print(f"{Colors.RED}Failed: {failed}{Colors.END}")
    print(f"{Colors.RED}Errors: {errors}{Colors.END}")
    print(f"{Colors.YELLOW}Timeouts: {timeouts}{Colors.END}")
    print(f"Pass Rate: {(passed + rerun_passed)/total*100:.1f}%" if total > 0 else "0%")
    print(f"Total Time: {total_time/60:.1f} minutes")
    print(f"\nReport Files:")
    print(f" CSV: {csv_file}")
    print(f" JSON: {json_file}")
    # Show failed tests
    failed_results = [r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]
    if failed_results:
        print(f"\n{Colors.RED}Failed Tests:{Colors.END}")
        for result in failed_results:
            print(f" {result.name}: {result.status} - {result.error_msg}")
    # Generate detailed regression report (like Image 1)
    self.generate_detailed_regression_report()
    # Generate error summary report
    self.generate_error_summary_report()
    # Generate regression summary info (like Image 3)
    self.generate_regression_summary_info()
    # Generate final real-time report
    self.generate_real_time_report()
    # Save error monitor state
    self.save_error_monitor_state()
    # After all standard reports, also collect and persist transaction/cycle statistics
    try:
        self.update_transaction_cycle_statistics()
    except Exception as e:
        print(f"{Colors.YELLOW}Warning: Failed to update transaction/cycle statistics: {e}{Colors.END}")

def extract_job_statistics(self, result):
    """Extract job statistics (CPU time, max memory, processes) from actual data.

    Sources, in priority order: the result's own duration, ``bjobs -l``
    output, patterns scraped from the log file, and finally name-based
    heuristic defaults. Returns a (cpu_time, max_mem, procs) string tuple.
    """
    cpu_time = "0 sec"
    max_mem = "N/A"
    procs = "N/A"
    # Try to get CPU time from result duration
    if result.duration > 0:
        cpu_time = f"{int(result.duration)} sec"
    # Try to get job statistics from LSF if job_id is available
    if hasattr(result, 'job_id') and result.job_id and result.job_id != 'unknown':
        try:
            # Use bjobs to get detailed job information
            cmd = ["bjobs", "-l", str(result.job_id)]
            output = subprocess.check_output(cmd, stderr=subprocess.PIPE, universal_newlines=True, timeout=10)
            # Parse CPU time
            cpu_match = re.search(r'CPU time used is (\d+\.?\d*) seconds', output)
            if cpu_match:
                cpu_seconds = float(cpu_match.group(1))
                cpu_time = f"{int(cpu_seconds)} sec"
            # Parse memory usage
            mem_match = re.search(r'MAX MEM: (\d+\.?\d*) (\w+)', output)
            if mem_match:
                mem_value = mem_match.group(1)
                mem_unit = mem_match.group(2)
                max_mem = f"{mem_value} {mem_unit}"
            # Parse number of processes
            proc_match = re.search(r'Number of processors: (\d+)', output)
            if proc_match:
                procs = proc_match.group(1)
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
            # If bjobs fails, try to extract from log file
            pass
    # If LSF info not available, try to extract from log file
    if max_mem == "N/A" and hasattr(result, 'log_file') and result.log_file:
        try:
            if os.path.exists(result.log_file):
                with open(result.log_file, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                # Look for memory usage patterns in log
                mem_patterns = [
                    r'max_memory[:\s]+(\d+\.?\d*)\s*(\w+)',
                    r'memory_usage[:\s]+(\d+\.?\d*)\s*(\w+)',
                    r'peak_memory[:\s]+(\d+\.?\d*)\s*(\w+)',
                    r'MAX_MEM[:\s]+(\d+\.?\d*)\s*(\w+)'
                ]
                for pattern in mem_patterns:
                    mem_match = re.search(pattern, content, re.IGNORECASE)
                    if mem_match:
                        mem_value = mem_match.group(1)
                        mem_unit = mem_match.group(2)
                        max_mem = f"{mem_value} {mem_unit}"
                        break
                # Look for process count patterns
                proc_patterns = [
                    r'processes[:\s]+(\d+)',
                    r'num_procs[:\s]+(\d+)',
                    r'process_count[:\s]+(\d+)'
                ]
                for pattern in proc_patterns:
                    proc_match = re.search(pattern, content, re.IGNORECASE)
                    if proc_match:
                        procs = proc_match.group(1)
                        break
        except Exception:
            pass
    # If still no data, use reasonable defaults based on test type
    if max_mem == "N/A":
        # Estimate memory based on test name or use default
        if "stress" in result.name.lower() or "full" in result.name.lower():
            max_mem = "16 GB"
        elif "small" in result.name.lower() or "basic" in result.name.lower():
            max_mem = "4 GB"
        else:
            max_mem = "8 GB"
    if procs == "N/A":
        # Estimate process count based on test type or use default
        if "stress" in result.name.lower() or "full" in result.name.lower():
            procs = "12"
        elif "small" in result.name.lower() or "basic" in result.name.lower():
            procs = "4"
        else:
            procs = "8"
    return cpu_time, max_mem, procs

def get_test_log_path(self, result):
    """Get the actual log file path for a test result.

    Prefers the recorded log_file, then probes several known directory
    layouts (including wildcard patterns). Returns an absolute path, or
    None when nothing matches.
    """
    # 1. Use result.log_file if it exists and is a valid path
    if result.log_file and os.path.exists(result.log_file):
        return os.path.abspath(result.log_file)
    # 2. Try to construct log file path based on test case structure
    sim_output_dir = str(self.regression_dir)
    seed = getattr(result, 'seed', 'unknown')
    test_name = result.name
    # Try different log file naming patterns
    possible_log_paths = [
        f"{sim_output_dir}/logs/{test_name}/{test_name}_{seed}_*.log", # Primary path with opts: logs/test_name/test_name_seed_opts_*.log
        f"{sim_output_dir}/logs/{test_name}/{test_name}_{seed}.log", # Fallback: logs/test_name/test_name_seed.log
        f"{sim_output_dir}/logs/{test_name}/{test_name}_*.log", # Wildcard pattern for logs/test_name/
        f"{sim_output_dir}/logs/{test_name}.log", # Fallback: logs/test_name.log
        f"{sim_output_dir}/{test_name}/report.log", # Legacy path: test_name/report.log
        f"{sim_output_dir}/{test_name}_{seed}/report.log", # Legacy path: test_name_seed/report.log
    ]
    for log_path in possible_log_paths:
        if '*' in log_path:
            # Handle wildcard patterns
            import glob
            matching_files = glob.glob(log_path)
            if matching_files:
                return os.path.abspath(matching_files[0]) # Return first matching file
        elif os.path.exists(log_path):
            return os.path.abspath(log_path)
    return None

def update_real_time_report(self):
    """Update real-time regression report.

    Throttled: regenerates only when report_update_interval seconds have
    elapsed since the last update.
    """
    current_time = time.time()
    if current_time - self.last_report_update >= self.report_update_interval:
        self.last_report_update = current_time
        self.generate_real_time_report()

def generate_real_time_report(self):
    """Generate real-time regression report"""
    try:
        with open(self.real_time_report_path, 'w', encoding='utf-8') as f:
            # Write header
            f.write("=" * 80 + "\n")
            f.write(f"REAL-TIME REGRESSION REPORT - Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("=" * 80 + "\n\n")
            # Generate test status and log paths section
            f.write("=== TEST STATUS AND LOG PATHS ===\n")
            f.write(self.generate_test_status_and_log_paths_content())
            f.write("\n\n")
            # Generate summary info
            f.write("=== REGRESSION SUMMARY ===\n")
            f.write(self.generate_regression_summary_info_content())
            f.write("\n\n")
            # Write footer
f.write("=" * 80 + "\n") f.write("REPORT WILL BE UPDATED EVERY 30 SECONDS\n") f.write("=" * 80 + "\n") except Exception as e: print(f"Warning: Could not update real-time report: {e}") def generate_progress_bar(self, percentage, width=50): """Generate progress bar""" filled_width = int(width * percentage / 100) bar = '█' * filled_width + '░' * (width - filled_width) return f"[{bar}]" def generate_detailed_regression_report(self): """Generate detailed regression report like Image 1""" print(f"\n{Colors.BLUE}=== Detailed Regression Report ==={Colors.END}") # Validate statuses before generating report self.validate_all_test_statuses() # Print header timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp}: {'+' * 15} REPORT {'+' * 15}") # Print table header print(f"INFO: {timestamp}: | status | test_name | seed | jobid | cpu_time | max_mem | procs |") # Process each test result - now PENDING status should be correctly updated final_status_results = [r for r in self.results.values() if r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]] for result in final_status_results: # Get test info test_name = result.name seed = getattr(result, 'seed', 'unknown') job_id = getattr(result, 'job_id', 'unknown') # Get CPU time and memory info from actual data cpu_time, max_mem, procs = self.extract_job_statistics(result) # If runtime errors were detected, force status to FAIL for reporting and accounting if hasattr(result, 'error_detected') and result.error_detected and result.status in ["PASS", "RERUN PASS"]: result.status = "FAIL"
# Format status with proper colors (after possible override) status = result.status if status == "PASS": status = f"{Colors.GREEN}PASS{Colors.END}" elif status == "RERUN PASS": status = f"{Colors.CYAN}RERUN PASS{Colors.END}" elif status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: status = f"{Colors.RED}FAIL{Colors.END}" else: status = f"{Colors.YELLOW}{status}{Colors.END}" # Add error detection info to status error_info = "" if hasattr(result, 'error_detected') and result.error_detected: error_info = " (running but had error)" # Print test result line print(f"INFO: {timestamp}: | {status} | {test_name} | {seed} | {job_id} | {cpu_time} | {max_mem} | {procs} |{error_info}") print(f"INFO: {timestamp}: {'+' * 15} END REPORT {'+' * 15}") print(f"Total unique tests reported: {len(final_status_results)}") def generate_error_summary_report(self): """Generate error summary report with UVM_ERROR and UVM_FATAL details""" print(f"\n{Colors.BLUE}=== Error Summary Report ==={Colors.END}") # Collect all error information error_info = {} failed_tests = [] for result in self.results.values(): # Treat "running but had error" as FAIL for summary as well if hasattr(result, 'error_detected') and result.error_detected and result.status in ["PASS", "RERUN PASS"]: result.status = "FAIL" if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: failed_tests.append(result) # Try to read log file for UVM_ERROR and UVM_FATAL self.analyze_log_for_errors(result, error_info) if not failed_tests: print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: No failed tests found") return if not error_info: print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: Found {len(failed_tests)} failed tests but no UVM_ERROR or UVM_FATAL found in logs") print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: Failed tests with log paths:") for result in failed_tests: # Get the actual log file path log_path = self.get_test_log_path(result) if log_path: print(f"INFO: 
{datetime.now().strftime('%m-%d %H:%M:%S')}: [{result.status}] {log_path}") else: print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: {result.name}: {result.status} - {result.error_msg} (No log file found)") return # Print error summary timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp}: Total {len(failed_tests)} failure syndromes:") # Group errors by type error_count = 1 for error_type, error_details in error_info.items(): print(f"INFO: {timestamp}: ({error_count}) ERR ID:{error_details['id']}:") print(f"INFO: {timestamp}: MSG: \"{error_details['message']}\"") # Print error count if available (from log content before UVM Report catcher Summary) if 'count' in error_details: print(f"INFO: {timestamp}: Count: {error_details['count']} (from log content before UVM Report catcher Summary)") # Print associated test paths for test_path in error_details['tests']: print(f"INFO: {timestamp}: {test_path}") error_count += 1 def analyze_log_for_errors(self, result, error_info): """Analyze log file for UVM_ERROR and UVM_FATAL messages ONLY in content before UVM Report catcher Summary""" # Use the new get_test_log_path method to get the actual log file path log_file_path = self.get_test_log_path(result) if not log_file_path: print(f"Warning: No log file found for test {result.name}") return # Use the found log file path log_file_paths = [log_file_path] # Analyze each log file for log_file_path in log_file_paths: try: with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() # First, look for UVM Report catcher Summary section uvm_summary_errors = self.analyze_uvm_report_catcher_summary(content, log_file_path) for error_type, error_details in uvm_summary_errors.items(): if error_type not in error_info: error_info[error_type] = error_details else: # Merge tests if error type already exists for test_path in error_details['tests']: if test_path not in error_info[error_type]['tests']: 
error_info[error_type]['tests'].append(test_path) # Only use UVM Report catcher Summary analysis, skip direct pattern matching # to avoid capturing errors from UVM Report catcher Summary section pass except Exception as e: print(f"Warning: Could not analyze log file {log_file_path}: {e}") continue def analyze_uvm_report_catcher_summary(self, content, log_file_path): """Analyze log content before 'UVM Report catcher Summary' line for error information""" error_info = {} # Split content into lines to find the UVM Report catcher Summary line lines = content.split('\n') summary_line_index = -1 # Find the line containing "UVM Report catcher Summary" for i, line in enumerate(lines): if "UVM Report catcher Summary" in line: summary_line_index = i break if summary_line_index == -1: # No UVM Report catcher Summary found, return empty error_info return error_info # Extract content before the UVM Report catcher Summary line content_before_summary = '\n'.join(lines[:summary_line_index]) # Look for UVM_ERROR and UVM_FATAL in the content before summary error_patterns = [ r'UVM_ERROR\s*@\s*[^\n]*\s*:\s*([^\n]+)', r'UVM_FATAL\s*@\s*[^\n]*\s*:\s*([^\n]+)', r'UVM_ERROR\s+([^\n]+)', r'UVM_FATAL\s+([^\n]+)' ] # Collect all error messages found before the summary error_messages = [] fatal_messages = [] for pattern in error_patterns: matches = re.findall(pattern, content_before_summary, re.IGNORECASE) for match in matches: if "UVM_ERROR" in pattern or "UVM_ERROR" in match: error_messages.append(match.strip()) elif "UVM_FATAL" in pattern or "UVM_FATAL" in match: fatal_messages.append(match.strip()) # Create error info structure if error_messages: # Use the first error message as representative error_message = error_messages[0] error_id = str(hash(error_message))[-8:] error_info['UVM_ERROR'] = { 'id': error_id, 'message': f"UVM_ERROR: {error_message}", 'tests': [log_file_path], 'count': len(error_messages) } if fatal_messages: # Use the first fatal message as representative fatal_message 
= fatal_messages[0] fatal_id = str(hash(fatal_message))[-8:] error_info['UVM_FATAL'] = { 'id': fatal_id, 'message': f"UVM_FATAL: {fatal_message}", 'tests': [log_file_path], 'count': len(fatal_messages) } return error_info def generate_regression_summary_info(self): """Generate regression summary info like Image 3""" print(f"\n{Colors.BLUE}=== Regression Summary Info ==={Colors.END}") # Validate statuses before generating summary self.validate_all_test_statuses() timestamp = datetime.now().strftime('%m-%d %H:%M:%S') # Generate regression seed regress_seed = random.randint(1000000000, 9999999999) print(f"INFO: {timestamp}: Regress Seed (rseed): {regress_seed}") # Test list path - use absolute path cur_path = os.getcwd() test_list_path = os.path.abspath(os.path.join(cur_path, "../def/json_list")) print(f"INFO: {timestamp}: Test list: {test_list_path}") # Failure list path - use absolute path failure_list_path = os.path.abspath(os.path.join(self.log_dir, "error.lst")) print(f"INFO: {timestamp}: Failure list: {failure_list_path}") # Regression report path - use absolute path regression_report_path = os.path.abspath(os.path.join(self.report_dir, "zregress_report.log")) print(f"INFO: {timestamp}: Regression report: {regression_report_path}") # End time end_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"INFO: {timestamp}: End Time: {end_time}") # Elapsed CPU time elapsed_time = time.time() - self.start_time hours = int(elapsed_time // 3600) minutes = int((elapsed_time % 3600) // 60) seconds = int(elapsed_time % 60) print(f"INFO: {timestamp}: Elapsed CPU Time: {hours}:{minutes:02d}:{seconds:02d}") # Determine regression result - use validated statuses total_tests = len(self.results) passed_tests = len([r for r in self.results.values() if r.status == "PASS"]) failed_tests = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]) pending_tests = len([r for r in self.results.values() if r.status == "PENDING"]) 
running_tests = len([r for r in self.results.values() if r.status == "RUNNING"]) # If there are still pending or running tests, consider it incomplete if pending_tests > 0 or running_tests > 0: print(f"INFO: {timestamp}: ZREGRESS INCOMPLETE (Pending: {pending_tests}, Running: {running_tests})") elif failed_tests == 0: print(f"{Colors.GREEN}INFO: {timestamp}: ZREGRESS PASS{Colors.END}") else: print(f"{Colors.RED}INFO: {timestamp}: ZREGRESS FAIL{Colors.END}") # Print detailed status print(f"INFO: {timestamp}: Total Tests: {total_tests}") print(f"INFO: {timestamp}: Passed: {passed_tests}") print(f"INFO: {timestamp}: Failed: {failed_tests}") print(f"INFO: {timestamp}: Pending: {pending_tests}") print(f"INFO: {timestamp}: Running: {running_tests}") # Save error list to file and get saved paths saved_paths = self.save_error_list(failure_list_path) # Print saved file paths within regression summary if saved_paths: print(f"INFO: {timestamp}: Error list saved to: {saved_paths['error_lst']}") print(f"INFO: {timestamp}: Error JSON saved to: {saved_paths['error_json']}") print(f"INFO: {timestamp}: Failed regression list saved to: {saved_paths['failed_regression']}") # Print summary of failed tests if any failed_count = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]) if failed_count > 0: print(f"INFO: {timestamp}: Generated failed regression list with {failed_count} failed test cases") print(f"INFO: {timestamp}: You can re-run failed tests using: python3 regress.py --failed-regression {saved_paths['failed_regression']}") # End markers print(f"INFO: {timestamp}: {'+' * 30}") print(f"INFO: {timestamp}: {' ' * 10} ZREGRESS END {' ' * 10}") print(f"INFO: {timestamp}: {'+' * 30}") # Generate comprehensive regression report self.generate_comprehensive_regression_report(regression_report_path) def save_error_list(self, failure_list_path): """Save error list to file and return saved paths""" saved_paths = {} try: # Save error.lst (text 
format) - simple list of failed test names with open(failure_list_path, 'w', encoding='utf-8') as f: for result in self.results.values(): if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: f.write(f"{result.name}\n") saved_paths['error_lst'] = failure_list_path # Save error.json (JSON format with full test case details) error_json_path = failure_list_path.replace('.lst', '.json') error_cases = [] for result in self.results.values(): if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: # Find the original test case JSON data with proper opts handling test_case_data = self.find_original_test_case_with_opts(result.name, result) if not test_case_data: # Fallback to original method test_case_data = self.find_original_test_case(result.name) if test_case_data: error_case = { 'test_name': result.name, 'config': result.config, 'status': result.status, 'error_message': result.error_msg, 'duration': result.duration, 'seed': getattr(result, 'seed', 'unknown'), 'job_id': getattr(result, 'job_id', 'unknown'), 'original_test_case': test_case_data } error_cases.append(error_case) with open(error_json_path, 'w', encoding='utf-8') as f: json.dump(error_cases, f, indent=2, ensure_ascii=False) saved_paths['error_json'] = error_json_path # Save failed_regression.json - regression list format for failed tests only failed_regression_path = failure_list_path.replace('.lst', '_regression.json') failed_regression_cases = [] for result in self.results.values(): if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: # Find the original test case JSON data with proper opts handling test_case_data = self.find_original_test_case_with_opts(result.name, result) if not test_case_data: # Fallback to original method test_case_data = self.find_original_test_case(result.name) if test_case_data: # Create a regression list entry with the same format as original test cases # but with updated information from the failed run failed_case = test_case_data.copy() # Start with 
original test case data # Update with actual run information failed_case.update({ 'actual_status': result.status, 'actual_error_message': result.error_msg, 'actual_duration': result.duration, 'actual_seed': getattr(result, 'seed', 'unknown'), 'actual_job_id': getattr(result, 'job_id', 'unknown'), 'log_file': getattr(result, 'log_file', ''), 'retry_count': getattr(result, 'retry_count', 0), 'failure_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S') }) # Optionally adjust repeat count for retry (if retry was attempted) if hasattr(result, 'retry_count') and result.retry_count > 0: # Reduce repeat count by retry attempts to avoid over-running original_repeat = failed_case.get('repeat', 1) failed_case['repeat'] = max(1, original_repeat - result.retry_count) failed_case['original_repeat'] = original_repeat failed_regression_cases.append(failed_case) with open(failed_regression_path, 'w', encoding='utf-8') as f: json.dump(failed_regression_cases, f, indent=2, ensure_ascii=False) saved_paths['failed_regression'] = failed_regression_path except Exception as e: print(f"Warning: Could not save error list: {e}") return None return saved_paths def find_original_test_case_with_opts(self, test_name: str, result): """Find the original test case JSON data by test name using the exact opts array stored on the result.
Important: Do NOT derive opts by splitting the unique key, because opts values may contain underscores (e.g., "DELAY_SET_OFF") which would be incorrectly split. Prefer `result.opts` when available. """ try: # Prefer opts directly from the provided result object target_opts = [] if hasattr(result, 'opts') and isinstance(result.opts, list): target_opts = result.opts else: # Fallback: search a matching TestResult and read its opts property for result_key, result_obj in self.results.items(): if result_obj.name == test_name and hasattr(result_obj, 'opts'): target_opts = result_obj.opts or [] break
original_target_opts = list(target_opts) if isinstance(target_opts, list) else [] print(f"Looking for test case: {test_name} with opts: {original_target_opts}") original_case = self.find_original_test_case_by_name_and_opts(test_name, original_target_opts) if original_case: print(f"Found original test case with matching opts: {original_target_opts}") return original_case
# Strict mode: fail fast when opts don't match msg = f"Strict opts match failed for '{test_name}'. target_opts={original_target_opts}" print(msg) raise RuntimeError(msg)
except Exception as e: print(f"Warning: Could not find original test case with opts for {test_name}: {e}") return None def find_original_test_case_by_name_and_opts(self, test_name: str, target_opts: list): """Find the original test case JSON data by test name and opts.
STRICT MATCH MODE: Compare opts arrays exactly as listed in JSON. No token splitting or normalization. "DELAY_SET_OFF" only matches the same string. """ try: # Load test cases from the original JSON files cur_path = os.getcwd() test_file_list_name = cur_path + "/../def/json_list" if not os.path.exists(test_file_list_name): print(f"Warning: Test file list not found: {test_file_list_name}") return None print(f"Looking for test case: {test_name} with opts: {target_opts}") print(f"Searching in test file list: {test_file_list_name}") with open(test_file_list_name, 'r') as f: for line in f: if line.strip() and not line.startswith('#'): file_path = cur_path + "/../def" + line.strip() print(f"Checking file: {file_path}") if os.path.exists(file_path): with open(file_path, 'r') as json_file: test_cases = json.load(json_file) for case in test_cases: case_name = case.get('name', '') case_opts = case.get('opts', []) if case_name == test_name: # Check if opts match EXACTLY (strict mode) if case_opts == target_opts: print(f"Found test case '{test_name}' with matching opts {case_opts} in file: {file_path}") return case else: print(f" Found test case '{test_name}' but opts don't match: expected {target_opts}, got {case_opts}") else: print(f" Checking case: '{case_name}' vs '{test_name}'") else: print(f"Warning: Test file not found: {file_path}") print(f"Test case '{test_name}' with opts {target_opts} not found in any JSON files") return None except Exception as e: print(f"Warning: Could not find original test case by name and opts for {test_name}: {e}") return None def find_original_test_case(self, test_name): """Find the original test case JSON data by test name""" try: # Load test cases from the original JSON files cur_path = os.getcwd() test_file_list_name = cur_path + "/../def/json_list" if not os.path.exists(test_file_list_name): print(f"Warning: Test file list not found: {test_file_list_name}") return None print(f"Looking for test case: {test_name}") print(f"Searching in test 
file list: {test_file_list_name}") with open(test_file_list_name, 'r') as f: for line in f: if line.strip() and not line.startswith('#'): file_path = cur_path + "/../def" + line.strip() print(f"Checking file: {file_path}") if os.path.exists(file_path): with open(file_path, 'r') as json_file: test_cases = json.load(json_file) for case in test_cases: case_name = case.get('name', '') if case_name == test_name: print(f"Found test case '{test_name}' in file: {file_path}") return case else: print(f" Checking case: '{case_name}' vs '{test_name}'") else: print(f"Warning: Test file not found: {file_path}") print(f"Test case '{test_name}' not found in any JSON files") return None except Exception as e: print(f"Warning: Could not find original test case for {test_name}: {e}") return None def generate_comprehensive_regression_report(self, report_path): """Generate comprehensive regression report including all sections""" try: with open(report_path, 'w', encoding='utf-8') as f: # Write header f.write("=" * 80 + "\n") f.write("COMPREHENSIVE REGRESSION REPORT\n") f.write("=" * 80 + "\n\n") # Generate detailed regression report f.write("=== DETAILED REGRESSION REPORT ===\n") f.write(self.generate_detailed_regression_report_content()) f.write("\n\n") # Generate error summary report f.write("=== ERROR SUMMARY REPORT ===\n") f.write(self.generate_error_summary_report_content()) f.write("\n\n") # Generate regression summary info f.write("=== REGRESSION SUMMARY INFO ===\n") f.write(self.generate_regression_summary_info_content()) f.write("\n\n") # Generate test status and log paths section f.write("=== TEST STATUS AND LOG PATHS ===\n") f.write(self.generate_test_status_and_log_paths_content()) f.write("\n\n") # Write footer f.write("=" * 80 + "\n") f.write("END OF REPORT\n") f.write("=" * 80 + "\n") print(f"Comprehensive regression report saved to: {report_path}") except Exception as e: print(f"Warning: Could not generate comprehensive regression report: {e}") def 
show_concurrent_status(self, running_jobs: int, total_jobs: int, max_concurrent: int): """Show concurrent job status""" # Commented out concurrent status printing as requested # timestamp = datetime.now().strftime('%m-%d %H:%M:%S') # if max_concurrent > 0: # utilization = (running_jobs / max_concurrent) * 100 # print(f"INFO: {timestamp} Concurrent Status: {running_jobs}/{max_concurrent} jobs running ({utilization:.1f}% utilization)") # else: # print(f"INFO: {timestamp} Concurrent Status: {running_jobs} jobs running (no limit)") # # if total_jobs > 0: # progress = ((total_jobs - len(self.submitted_jobs)) / total_jobs) * 100 # print(f"INFO: {timestamp} Overall Progress: {progress:.1f}%") pass def get_total_test_cases_count(self): """Get the correct total test cases count, avoiding duplicates and transient states""" # Clean up duplicates first self._clean_duplicate_test_results() # Only count tests with final statuses, exclude transient PENDING/RUNNING states final_status_tests = [] for result_key, result_obj in self.results.items(): # Only count tests that have reached a final state if result_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: final_status_tests.append(result_key) return len(final_status_tests) def show_regression_status(self, running_jobs: int, pending_jobs: int, total_test_cases: int): """Show regression status: running/pending/completed test cases""" timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp} Regression Status: {running_jobs} running, {pending_jobs} pending, {total_test_cases} completed") def run(self): """Run regression test""" print(f"{Colors.BOLD}CMN-700 UVM Regression Test{Colors.END}") print(f"Mode: {self.args.mode}") print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") # Display resource configuration if hasattr(self.args, 'memory') and self.args.memory is not None: print(f"Memory Reservation: {self.args.memory}GB per job (command line)") else: print(f"Memory 
Reservation: Not specified (using LSF default or JSON values)") cpu_cores = getattr(self.args, 'cpu_cores', 1) print(f"CPU Cores: {cpu_cores} per job") # Display coverage configuration if hasattr(self.args, 'cov') and self.args.cov: print(f"Coverage Type: {self.args.cov}") elif hasattr(self.args, 'coverage') and self.args.coverage: print(f"Coverage: Enabled (legacy mode)") else: print(f"Coverage: Disabled") print() # Initialize environment cur_path = os.getcwd() test_file_list_name = cur_path + "/../def/json_list" test_file_list = [] # Load test file list with open(test_file_list_name, 'r') as f: for line in f: if line != '\n': file_path = cur_path + "/../def" + line # print(file_path) # Commented out debug print test_file_list.append(file_path.replace('\n', '')) # Load test cases test_cases = self.load_test_cases(test_file_list) # print(test_cases) # Commented out debug print # Filter test cases by group (only if groups are specified) if self.args.groups: selected_cases = self.filter_cases(test_cases, self.args.groups) else: selected_cases = test_cases # Convert test cases to test configs format test_configs = [] for case in selected_cases: # Extract test name and config from the case test_name = case.get('name', 'unknown') config = case.get('config', 'default') test_configs.append((test_name, config)) # Save processed test list self.tests = test_configs print(f"Total loaded tests: {len(self.tests)}") # Set estimated time for each test # Note: TestResult objects will be created later in run_compile_and_regression # with the correct key format (test_name:config:seed:opts) print(f"TestResult objects will be created during job submission with proper key format") # Run tests based on mode if self.args.legacy_mode == "compile_regression": compile_success = self.run_compile_and_regression( str(self.args.dienum), self.args.rtl_ver, self.args.p2_mode, self.args.define ) # If compilation failed, exit without running regression if not compile_success: 
print(f"{Colors.RED}Compilation failed! Exiting without running regression tests.{Colors.END}") return else: print(f"{Colors.YELLOW}Local regression mode is not supported. Please use LSF regression mode.{Colors.END}") return # Note: Retry is now handled immediately when tests fail # No need to run retry_failed_tests() here anymore if self.args.retry > 0: print(f"{Colors.YELLOW}Note: Retry is enabled and will be triggered immediately when tests fail{Colors.END}") # Merge coverage if self.args.coverage: self.merge_coverage() # Generate report self.generate_report() # Handle auto-restart logic self._handle_auto_restart() def _handle_auto_restart(self): """Handle auto-restart logic after regression completion""" # Check if auto-restart is enabled if not self.auto_restart and self.restart_interval_hours is None: return # Check max restarts limit if self.max_restarts is not None and self.restart_count >= self.max_restarts: timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"\n{Colors.YELLOW}Auto-restart limit reached ({self.max_restarts} restarts). 
Stopping.{Colors.END}") print(f"INFO: {timestamp} Total restarts: {self.restart_count}") return # Determine if we should restart should_restart = False restart_reason = "" if self.auto_restart: # Immediate restart after completion should_restart = True restart_reason = "auto-restart enabled" elif self.restart_interval_hours is not None: # Time-based restart current_time = time.time() elapsed_hours = (current_time - self.first_run_start_time) / 3600.0 if elapsed_hours >= self.restart_interval_hours: should_restart = True restart_reason = f"restart interval reached ({self.restart_interval_hours} hours)" else: # Calculate wait time until next restart remaining_hours = self.restart_interval_hours - elapsed_hours remaining_minutes = int((remaining_hours - int(remaining_hours)) * 60) timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"\n{Colors.BLUE}INFO: {timestamp} Next auto-restart in {int(remaining_hours)}h {remaining_minutes}m{Colors.END}") # Wait until restart interval is reached wait_seconds = remaining_hours * 3600 if wait_seconds > 0: print(f"{Colors.BLUE}Waiting {int(remaining_hours)}h {remaining_minutes}m until next restart...{Colors.END}") time.sleep(wait_seconds) should_restart = True restart_reason = f"restart interval reached ({self.restart_interval_hours} hours)" if should_restart: self.restart_count += 1 timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"\n{Colors.CYAN}{'='*80}{Colors.END}") print(f"{Colors.CYAN}Auto-Restart #{self.restart_count} - {restart_reason}{Colors.END}") print(f"{Colors.CYAN}Time: {timestamp}{Colors.END}") print(f"{Colors.CYAN}{'='*80}{Colors.END}\n") # Create new regression directory for restart print(f"{Colors.BLUE}Creating new regression directory for restart #{self.restart_count}...{Colors.END}") self.regression_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') # Extract directory name from current simulation directory current_dir = os.path.basename(os.getcwd()) # Extract xxx part from sim_xxx 
pattern if current_dir.startswith('sim_'): dir_suffix = current_dir[4:] # Remove 'sim_' prefix else: dir_suffix = '' # Add restart count to directory name to make it unique self.regression_dir_name = f"regression_{dir_suffix}_{self.regression_timestamp}_restart{self.restart_count}" # Re-setup and create directories self._setup_directories() self._create_directories() # Update real-time report path for new directory self.real_time_report_path = self.report_dir / "zregress_report.log" print(f"{Colors.GREEN}New regression directory created: {self.regression_dir}{Colors.END}\n") # Reset some state for new run self.start_time = time.time() self.results = {} self.submitted_jobs = [] self.submitted_results = [] self.job_meta = {} self.running_jobs = 0 self.pending_jobs = 0 self.log_read_positions = {} self.log_last_update_times = {} # Reset error monitoring state self.last_error_monitor_time = time.time() # Reset status thread control self._stop_status_thread = False # For time-based restart, reset the first run start time to current time # so the next interval starts from now if self.restart_interval_hours is not None: self.first_run_start_time = time.time() # Small delay before restart time.sleep(2) # Recursively call run() to start new regression try: self.run() except KeyboardInterrupt: print(f"\n{Colors.YELLOW}User interrupted during auto-restart, cleaning up...{Colors.END}") self.cleanup() raise except Exception as e: print(f"{Colors.RED}Error during auto-restart: {e}{Colors.END}") self.cleanup() raise
def collect_transaction_and_cycle_stats(self) -> Tuple[int, int, int]:
    """Scan every simulation log under this regression's log directory and
    total up the transaction and cycle counts.

    Returns:
        A tuple ``(total_transaction_count, total_cycle_count,
        counted_log_files)`` where ``counted_log_files`` is the number of
        logs that yielded a transaction count. All zeros when the log
        directory does not exist.
    """
    root = Path(self.log_dir)
    if not root.exists():
        return 0, 0, 0

    txn_sum = 0
    cycle_sum = 0
    logs_with_txn = 0

    # Walk the log tree recursively; any log the helpers cannot parse is
    # skipped rather than aborting the whole scan.
    for log_path in root.rglob("*.log"):
        try:
            txn_count = stat_transaction_count.extract_transaction_count(log_path)
            cycle_count = stat_transaction_count.extract_cycle_count(log_path)
        except Exception:
            continue

        if txn_count is not None:
            txn_sum += txn_count
            logs_with_txn += 1
        # Cycle totals are accumulated independently of transaction counts.
        if cycle_count is not None:
            cycle_sum += cycle_count

    return txn_sum, cycle_sum, logs_with_txn
def update_transaction_cycle_statistics(self):
    """Append this regression's transaction/cycle totals to a global CSV.

    Uses the stat_transaction_count helpers (via
    ``collect_transaction_and_cycle_stats``) to total the current run, then
    appends one record to ``transaction_cycle_history.csv`` in the current
    working directory while maintaining running cumulative counters.

    CSV columns:
    date,regression_dir,transaction_count,cycle_count,cumulative_transaction_count,cumulative_cycle_count,log_files_count
    """
    # Tally the current regression first.
    total_txn, total_cycles, counted_logs = self.collect_transaction_and_cycle_stats()

    # If not a single log yielded a transaction count, skip the record
    # entirely to avoid writing a meaningless row.
    if counted_logs == 0:
        print(f"{Colors.YELLOW}Warning: No transaction/cycle information found under {self.log_dir}{Colors.END}")
        return

    history_path = Path("transaction_cycle_history.csv").resolve()

    cumulative_txn = 0
    cumulative_cycles = 0

    # Rebuild the cumulative totals from the per-run columns (indices 2/3)
    # of the existing history; the stored cumulative columns are ignored so
    # one corrupted row cannot poison the running totals.
    if history_path.exists():
        try:
            with history_path.open("r", encoding="utf-8") as f:
                reader = csv.reader(f)
                header_read = False
                for row in reader:
                    # Skip the header row.
                    if not header_read:
                        header_read = True
                        continue
                    if len(row) < 5:
                        continue
                    try:
                        run_txn = int(row[2])
                        run_cycles = int(row[3])
                    except ValueError:
                        continue
                    cumulative_txn += run_txn
                    cumulative_cycles += run_cycles
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Failed to read existing history file {history_path}: {e}{Colors.END}")
            cumulative_txn = 0
            cumulative_cycles = 0

    # Fold the current regression into the cumulative totals.
    cumulative_txn += total_txn
    cumulative_cycles += total_cycles

    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    regression_dir_str = str(self.regression_dir.resolve())

    # BUGFIX: also write the header when the file exists but is empty.
    # Previously the header was written only when the file was missing, so
    # an empty pre-created file never got one and the reader above would
    # silently drop the first data row (treated as the header).
    needs_header = (not history_path.exists()) or history_path.stat().st_size == 0
    try:
        with history_path.open("a", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow([
                    "date",
                    "regression_dir",
                    "transaction_count",
                    "cycle_count",
                    "cumulative_transaction_count",
                    "cumulative_cycle_count",
                    "log_files_count",
                ])
            writer.writerow([
                now_str,
                regression_dir_str,
                total_txn,
                total_cycles,
                cumulative_txn,
                cumulative_cycles,
                counted_logs,
            ])
        print(
            f"{Colors.GREEN}Transaction/Cycle statistics updated. "
            f"txn={total_txn}, cycles={total_cycles}, "
            f"cumulative_txn={cumulative_txn}, cumulative_cycles={cumulative_cycles}{Colors.END}"
        )
    except Exception as e:
        print(f"{Colors.YELLOW}Warning: Failed to write transaction/cycle history to {history_path}: {e}{Colors.END}")

def check_compile_files_exist(self, output_dir: str, dienum: str, rtl_ver: str, mode: str) -> bool:
    """Return True if compile outputs already exist under *output_dir*.

    Looks for the usual VCS compile artifacts (compile log, done markers,
    the ``simv`` executable and its support directories). ``dienum``,
    ``rtl_ver`` and ``mode`` are currently unused but kept for caller
    compatibility. Returns False on any unexpected error so the caller
    treats the state as "needs compile".
    """
    try:
        # Common compile output files/directories produced by VCS.
        compile_files = [
            f"{output_dir}/compile.log",
            f"{output_dir}/compile_ok",
            f"{output_dir}/compile.done",
            f"{output_dir}/simv",         # VCS executable
            f"{output_dir}/simv.daidir",  # VCS directory
            f"{output_dir}/csrc",         # VCS source directory
        ]
        # Check if any of these files exist
        existing_files = [f for f in compile_files if os.path.exists(f)]
        if existing_files:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} Found existing compile files:")
            for f in existing_files:
                print(f" - {f}")
            return True
        else:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} No existing compile files found")
            return False
    except Exception as e:
        print(f"{Colors.YELLOW}Warning: Error checking compile files: {e}{Colors.END}")
        return False

def should_skip_compile(self) -> bool:
    """Return True when the ``bypass`` argument says to skip compilation.

    Accepts both scalar and list (argparse ``nargs``) forms. Truthy
    spellings are "1", "true", "yes", "skip" and "bypass"
    (case-insensitive, whitespace-stripped); the default "0" means compile.
    """
    if not hasattr(self.args, 'bypass') or self.args.bypass is None:
        return False
    # argparse may hand us a list (nargs) or a plain scalar.
    if isinstance(self.args.bypass, list):
        if len(self.args.bypass) == 0:
            return False
        bypass_value = self.args.bypass[0]
    else:
        bypass_value = self.args.bypass
    # Normalize and compare against the accepted "skip" spellings.
    bypass_str = str(bypass_value).lower().strip()
    skip_values = ["1", "true", "yes", "skip", "bypass"]
    return bypass_str in skip_values
def _status_print_thread(self):
    """Background thread loop: periodically refresh LSF state, print the
    status summary, update the real-time report and scan running tests for
    errors, until all tests reach a final status or the stop flag is set.
    """
    while not self._stop_status_thread:
        time.sleep(5)  # Check every 5 seconds if status should be printed
        # Stop the thread once every test reached a final status and no
        # jobs are left running or pending.
        with self.lock:
            all_completed = all(result.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] for result in self.results.values())
            if all_completed and (self.running_jobs == 0 and self.pending_jobs == 0):
                print(f"{Colors.GREEN}All tests completed, stopping status monitoring thread{Colors.END}")
                self._stop_status_thread = True
                break
        # NOTE(review): indentation reconstructed — these refresh calls
        # appear to run outside the lock (they do slow LSF queries); confirm
        # against the original layout.
        # Update LSF job statuses in real-time
        self._update_lsf_job_statuses()
        self.print_status_summary()
        # Update real-time report in status thread
        self.update_real_time_report()
        # Monitor running tests for errors
        self.monitor_running_tests_for_errors()

def _update_lsf_job_statuses(self):
    """Update LSF job statuses in real-time to keep TestResult objects
    synchronized with the scheduler (PENDING <-> RUNNING transitions only;
    final statuses are handled elsewhere). Best-effort: any failure is
    logged and swallowed.
    """
    if not hasattr(self, 'submitted_jobs') or not self.submitted_jobs:
        return
    try:
        # Get current LSF status for all submitted jobs
        job_ids = [int(job_id) for job_id in self.submitted_jobs]
        if not job_ids:
            return
        # Use batch status check for better performance
        status_map = self.batch_check_job_status(job_ids)
        # Update TestResult objects based on current LSF status
        for job_id, lsf_status in status_map.items():
            if lsf_status in ["RUN", "PEND"]:
                # Find the corresponding TestResult object
                test_info = self.get_test_info_by_job_id(str(job_id))
                if test_info:
                    test_name = test_info['name']
                    seed = test_info['seed']
                    # Find the TestResult object by searching through all results
                    found_result = None
                    for result_key, result_obj in self.results.items():
                        if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed:
                            found_result = result_obj
                            break
                    if found_result:
                        # Update status based on LSF status
                        if lsf_status == "RUN" and found_result.status == "PENDING":
                            # Job just started running
                            found_result.status = "RUNNING"
                            if not found_result.start_time:
                                found_result.start()
                            print(f"DEBUG: Status updated: {test_name} seed={seed} PENDING -> RUNNING (job_id: {job_id})")
                        elif lsf_status == "PEND" and found_result.status == "RUNNING":
                            # Job went back to pending (resource preemption, etc.)
                            found_result.status = "PENDING"
                            print(f"DEBUG: Status updated: {test_name} seed={seed} RUNNING -> PENDING (job_id: {job_id})")
                    else:
                        print(f"DEBUG: Could not find TestResult for {test_name} seed={seed}")
                else:
                    print(f"DEBUG: Could not get test info for job_id {job_id}")
    except Exception as e:
        print(f"Warning: Error updating LSF job statuses: {e}")

def estimate_completion_time(self):
    """Estimate the regression completion time.

    Returns:
        ``(completion_time, progress, total_remaining)`` — a formatted
        wall-clock completion timestamp, a progress percentage (completed
        count plus fractional credit for running tests, over total), and
        the estimated remaining seconds.
    """
    now = time.time()
    # Calculate total time of completed tests
    completed_tests = [r for r in self.results.values() if r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]
    completed_time = sum(r.duration for r in completed_tests)
    # Calculate running time of currently running tests
    running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
    running_time = sum(now - r.start_time for r in running_tests if r.start_time)
    # Calculate estimated time of pending tests
    pending_tests = [r for r in self.results.values() if r.status == "PENDING"]
    pending_time = sum(r.estimated_duration for r in pending_tests)
    # Calculate remaining estimated time of running tests (floored at 0 so
    # overrunning tests do not produce negative remainders)
    running_remaining = sum(max(0, r.estimated_duration - (now - r.start_time)) for r in running_tests if r.start_time)
    # Total estimated remaining time
    total_remaining = running_remaining + pending_time
    # Calculate estimated completion time
    estimated_completion = now + total_remaining
    completion_time = datetime.fromtimestamp(estimated_completion).strftime('%Y-%m-%d %H:%M:%S')
    # Calculate progress percentage (count-based, more stable)
    total_tests = len(self.results) if self.results else 0
    completed_count = len(completed_tests)
    if total_tests > 0:
        # Running contribution: fraction of elapsed/estimated for each
        # running test, capped at 1.0 per test
        running_fraction_sum = 0.0
        for r in running_tests:
            if r.start_time and getattr(r, 'estimated_duration', 0) > 0:
                elapsed_r = max(0.0, now - r.start_time)
                est_r = max(1.0, float(r.estimated_duration))  # avoid divide-by-zero
                running_fraction_sum += min(elapsed_r / est_r, 1.0)
        progress = (completed_count + running_fraction_sum) / total_tests * 100.0
    else:
        progress = 0.0
    return completion_time, progress, total_remaining

def print_status_summary(self):
    """Print the current regression status summary to stdout.

    Rate-limited: prints at most once per ``max(status_print_interval, 30
    minutes)``. Shows per-status counts, live LSF RUN/PEND counts, a
    progress bar, time estimates, a sample of running tests and recent
    failures.
    """
    now = time.time()
    # status_print_interval is in seconds; default args.status_interval=5
    # (minutes) - here we want 30 minutes.
    # Force the print interval to 30 minutes (override any smaller value).
    thirty_minutes = 30 * 60
    effective_interval = max(self.status_print_interval, thirty_minutes)
    # Skip if not enough time has passed since last status print
    if now - self.last_status_print < effective_interval:
        return
    self.last_status_print = now
    # Count tests in each status
    total = len(self.results)
    pending = len([r for r in self.results.values() if r.status == "PENDING"])
    running = len([r for r in self.results.values() if r.status == "RUNNING"])
    passed = len([r for r in self.results.values() if r.status == "PASS"])
    rerun_passed = len([r for r in self.results.values() if r.status == "RERUN PASS"])
    failed = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL"]])
    errors = len([r for r in self.results.values() if r.status == "ERROR"])
    timeouts = len([r for r in self.results.values() if r.status == "TIMEOUT"])
    # Debug: Print status distribution
    print(f"DEBUG: Status distribution - PENDING: {pending}, RUNNING: {running}, PASS: {passed}, RERUN PASS: {rerun_passed}, FAIL: {failed}")
    # Debug: Print some test statuses for verification
    if running > 0:
        running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
        print(f"DEBUG: Sample RUNNING tests: {[r.name for r in running_tests[:3]]}")
    if pending > 0:
        pending_tests = [r for r in self.results.values() if r.status == "PENDING"]
        print(f"DEBUG: Sample PENDING tests: {[r.name for r in pending_tests[:3]]}")
    # Calculate elapsed time
    elapsed = now - self.start_time
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    # Estimate completion time and progress
    completion_time, progress, remaining = self.estimate_completion_time()
    remaining_hours = int(remaining // 3600)
    remaining_minutes = int((remaining % 3600) // 60)
    # Print status summary
    print(f"\n{Colors.BOLD}=== Regression Status Summary (Runtime: {hours:02d}:{minutes:02d}:{seconds:02d}) ==={Colors.END}")
    print(f"Total Tests: {total}")
    print(f"{Colors.YELLOW}Pending: {pending}{Colors.END}")
    print(f"{Colors.BLUE}Running: {running}{Colors.END}")
    # CRITICAL FIX: Calculate actual LSF status counts from submitted jobs
    # (the per-result statuses can lag behind the scheduler)
    actual_running_jobs = 0
    actual_pending_jobs = 0
    if hasattr(self, 'submitted_jobs') and self.submitted_jobs:
        try:
            # Get current LSF status for all submitted jobs
            job_ids = [int(job_id) for job_id in self.submitted_jobs]
            if job_ids:
                status_map = self.batch_check_job_status(job_ids)
                for job_id, status in status_map.items():
                    if status == "RUN":
                        actual_running_jobs += 1
                    elif status == "PEND":
                        actual_pending_jobs += 1
        except Exception as e:
            print(f"Warning: Error calculating actual LSF status: {e}")
    print(f"{Colors.CYAN}LSF Status - RUN: {actual_running_jobs}, PEND: {actual_pending_jobs}{Colors.END}")
    print(f"{Colors.GREEN}Passed: {passed}{Colors.END}")
    print(f"{Colors.CYAN}Rerun Passed: {rerun_passed}{Colors.END}")
    print(f"{Colors.RED}Failed: {failed}{Colors.END}")
    print(f"{Colors.RED}Errors: {errors}{Colors.END}")
    print(f"{Colors.YELLOW}Timeouts: {timeouts}{Colors.END}")
    # Calculate progress based on test count (Passed/Total)
    test_progress = ((passed + rerun_passed) / total) * 100.0 if total > 0 else 0.0
    progress_bar = self.generate_progress_bar(test_progress)
    print(f"\nProgress: {test_progress:.1f}% {progress_bar}")
    print(f"Estimated Remaining Time: {remaining_hours} hours {remaining_minutes} minutes")
    print(f"Estimated Completion Time: {completion_time}")
    # Print running tests
    if running > 0:
        print(f"\n{Colors.BLUE}Running Tests:{Colors.END}")
        running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
        for test in running_tests[:5]:  # Show at most 5
            elapsed = now - test.start_time if test.start_time else 0
            minutes = int(elapsed // 60)
            seconds = int(elapsed % 60)
            # Get seed and opts information
            seed = getattr(test, 'seed', 'unknown')
            opts = getattr(test, 'opts', [])
            opts_str = "_".join(opts) if opts else "no_opts"
            status_info = ""
            if hasattr(test, 'error_detected') and test.error_detected:
                status_info = f" {Colors.RED}(running but had error){Colors.END}"
            print(f" {test.name} seed={seed} opts={opts_str} (Runtime: {minutes}m{seconds}s){status_info}")
        if len(running_tests) > 5:
            print(f" ... and {len(running_tests) - 5} other tests")
    # Print recently failed tests
    if failed > 0 or errors > 0 or timeouts > 0:
        print(f"\n{Colors.RED}Recently Failed Tests:{Colors.END}")
        failed_tests = [r for r in self.results.values() if r.status in ["FAIL", "ERROR", "TIMEOUT"]]
        for test in failed_tests[-5:]:  # Show at most 5 recent ones
            log_path = getattr(test, 'log_file', '') or ''
            log_part = f" log={log_path}" if log_path else ""
            print(f" {test.name}: {test.status}{log_part}")
    print()  # Empty line
def validate_all_test_statuses(self):
    """Validate and correct all test statuses before generating a report.

    Rate-limited to once per minute. Cross-checks every RUNNING and
    PENDING TestResult against the live LSF job status (and, when LSF no
    longer knows the job, against the test's log file) and upgrades or
    finalizes statuses accordingly. Conservative: when the status cannot
    be determined, the current status is kept.
    """
    current_time = time.time()
    # Limit validation frequency to avoid excessive calls
    if current_time - self._last_validation_time < 60:  # Only validate once per minute
        return
    self._last_validation_time = current_time
    self._validation_count += 1
    # print(f"Validating all test statuses... (validation #{self._validation_count})")
    # First check all TestResult objects with RUNNING status
    running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
    if running_tests:
        print(f"Found {len(running_tests)} tests with RUNNING status, checking actual job status...")
        for result_obj in running_tests:
            if hasattr(result_obj, 'job_id') and result_obj.job_id:
                try:
                    status = self.check_lsf_job_status(int(result_obj.job_id))
                    if status == "DONE":
                        result_obj.finish("PASS", "")
                        print(f" Status corrected: {result_obj.name} -> PASS")
                    elif status in ["EXIT", "TERM", "KILL"]:
                        result_obj.finish("FAIL", f"Job status: {status}")
                        print(f" Status corrected: {result_obj.name} -> FAIL")
                    elif status in ["RUN", "PEND", "WAIT", "SUSP"]:
                        # Job is still running or pending, keep RUNNING status
                        pass
                    elif status == "UNKNOWN":
                        # Job may have completed and been removed from queue, or still running
                        # Only change status if we can definitively determine the result
                        if hasattr(result_obj, 'log_file') and result_obj.log_file:
                            if os.path.exists(result_obj.log_file):
                                if self.check_test_result(result_obj.log_file):
                                    result_obj.finish("PASS", "")
                                    print(f" Status corrected: {result_obj.name} -> PASS (from log file)")
                                else:
                                    result_obj.finish("FAIL", "Test failed (from log file)")
                                    print(f" Status corrected: {result_obj.name} -> FAIL (from log file)")
                            else:
                                # No log file yet, keep RUNNING status (job might still be running)
                                pass
                        else:
                            # No log file info, keep RUNNING status (job might still be running)
                            pass
                    else:
                        # Unknown LSF status, keep RUNNING status
                        pass
                except Exception as e:
                    print(f" Warning: Could not check status for {result_obj.name}: {e}")
                    # Keep RUNNING status if we can't determine status (job might still be running)
                    pass
            else:
                # No job_id, keep RUNNING status (job not yet submitted)
                pass
    # Now check all TestResult objects with PENDING status to see if they've completed
    pending_tests = [r for r in self.results.values() if r.status == "PENDING"]
    if pending_tests:
        print(f"Found {len(pending_tests)} tests with PENDING status, checking if they've completed...")
        for result_obj in pending_tests:
            if hasattr(result_obj, 'job_id') and result_obj.job_id:
                try:
                    status = self.check_lsf_job_status(int(result_obj.job_id))
                    if status == "DONE":
                        result_obj.finish("PASS", "")
                        print(f" Status corrected: {result_obj.name} -> PASS (was PENDING)")
                    elif status in ["EXIT", "TERM", "KILL"]:
                        result_obj.finish("FAIL", f"Job status: {status}")
                        print(f" Status corrected: {result_obj.name} -> FAIL (was PENDING)")
                    elif status in ["RUN", "PEND", "WAIT", "SUSP"]:
                        # If job is RUN, upgrade PENDING -> RUNNING; otherwise keep PENDING
                        if status == "RUN":
                            result_obj.status = "RUNNING"
                            # Initialize start time if missing
                            if not getattr(result_obj, 'start_time', None):
                                result_obj.start()
                            print(f" Status corrected: {result_obj.name} PENDING -> RUNNING")
                        else:
                            # Still pending/wait/suspend; keep PENDING
                            print(f" {result_obj.name} still {status}")
                    elif status == "UNKNOWN":
                        # Job may have completed and been removed from queue
                        # Try to check if log file exists and determine result
                        if hasattr(result_obj, 'log_file') and result_obj.log_file:
                            if os.path.exists(result_obj.log_file):
                                if self.check_test_result(result_obj.log_file):
                                    result_obj.finish("PASS", "")
                                    print(f" Status corrected: {result_obj.name} -> PASS (from log file, was PENDING)")
                                else:
                                    result_obj.finish("FAIL", "Test failed (from log file)")
                                    print(f" Status corrected: {result_obj.name} -> FAIL (from log file, was PENDING)")
                            else:
                                # No log file, keep PENDING status
                                print(f" {result_obj.name} no log file - keeping PENDING status")
                        else:
                            # No log file info, keep PENDING status
                            print(f" {result_obj.name} no log file info - keeping PENDING status")
                    else:
                        # Unknown status, keep PENDING
                        print(f" {result_obj.name} unknown status {status} - keeping PENDING status")
                except Exception as e:
                    print(f" Warning: Could not check status for {result_obj.name}: {e}")
                    # Keep PENDING status if we can't determine status
                    print(f" {result_obj.name} keeping PENDING status due to error")
            else:
                # No job_id, keep PENDING status
                print(f" {result_obj.name} no job_id - keeping PENDING status")
    # Final count
    # NOTE(review): these final counts are computed but never used or
    # printed — apparent dead code left from a removed summary.
    final_running = len([r for r in self.results.values() if r.status == "RUNNING"])
    final_pending = len([r for r in self.results.values() if r.status == "PENDING"])
    final_passed = len([r for r in self.results.values() if r.status == "PASS"])
    final_failed = len([r for r in self.results.values() if r.status in ["FAIL", "ERROR", "TIMEOUT"]])

def generate_detailed_regression_report_content(self):
    """Generate the detailed per-test regression report as a string.

    One pipe-delimited line per test (status, name, seed, jobid, cpu_time,
    max_mem, procs), bracketed by REPORT/END REPORT markers. Statuses are
    collapsed to PASS/FAIL/RUNNING/PENDING for file output.
    """
    output = []
    # Validate statuses before generating report
    self.validate_all_test_statuses()
    # Print header
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    output.append(f"INFO: {timestamp}: {'+' * 15} REPORT {'+' * 15}")
    # Print table header
    output.append(f"INFO: {timestamp}: | status | test_name | seed | jobid | cpu_time | max_mem | procs |")
    # Process each test result
    for result in self.results.values():
        # Get test info
        test_name = result.name
        seed = getattr(result, 'seed', 'unknown')
        job_id = getattr(result, 'job_id', 'unknown')
        # Get CPU time and memory info from actual data
        cpu_time, max_mem, procs = self.extract_job_statistics(result)
        # Handle PENDING status specially for file output
        if result.status == "PENDING":
            cpu_time = "-1|unknown"
        # Format status with proper colors (remove color codes for file output)
        # (all RERUN FAIL/ERROR/TIMEOUT variants are folded into FAIL)
        status = result.status
        if status == "PASS":
            status = "PASS"
        elif status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
            status = "FAIL"
        elif status == "RUNNING":
            status = "RUNNING"
        elif status == "PENDING":
            status = "PENDING"
        else:
            status = status
        # Print test result line
        output.append(f"INFO: {timestamp}: | {status} | {test_name} | {seed} | {job_id} | {cpu_time} | {max_mem} | {procs} |")
    output.append(f"INFO: {timestamp}: {'+' * 15} END REPORT {'+' * 15}")
    output.append(f"Total unique tests reported: {len(self.results)}")
    return "\n".join(output)

def generate_error_summary_report_content(self):
    """Generate the error summary report as a string.

    Collects failed tests and groups their UVM_ERROR/UVM_FATAL details
    (gathered by analyze_log_for_errors from log content before the UVM
    Report catcher Summary) into numbered entries with message, count and
    affected test paths.
    """
    output = []
    # Collect all error information from log content before UVM Report catcher Summary and direct patterns
    error_info = {}
    failed_tests = []
    for result in self.results.values():
        if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
            failed_tests.append(result)
            # Analyze log file for errors in content before UVM Report catcher Summary
            self.analyze_log_for_errors(result, error_info)
    if not failed_tests:
        output.append("No UVM_ERROR or UVM_FATAL found in logs")
    else:
        if not error_info:
            output.append(f"Found {len(failed_tests)} failed tests but no UVM_ERROR or UVM_FATAL found in logs")
            for result in failed_tests:
                output.append(f" {result.name}: {result.status} - {result.error_msg}")
        else:
            output.append(f"Found {len(failed_tests)} failed tests with error details:")
            # Group errors by type
            error_count = 1
            for error_type, error_details in error_info.items():
                output.append(f"({error_count}) ERR ID:{error_details['id']}:")
                output.append(f"MSG: \"{error_details['message']}\"")
                # Print error count if available (from log content before UVM Report catcher Summary)
                if 'count' in error_details:
                    output.append(f"Count: {error_details['count']} (from log content before UVM Report catcher Summary)")
                # Print associated test paths
                for test_path in error_details['tests']:
                    output.append(f"{test_path}")
                error_count += 1
    return "\n".join(output)

def generate_regression_summary_info_content(self):
    """Generate the regression summary (counts, pass rate, overall status)
    as a string. Overall status is INCOMPLETE while anything is still
    pending/running, otherwise PASS/FAIL by failure count.
    """
    output = []
    # Validate statuses before generating report
    self.validate_all_test_statuses()
    # Count results
    total_tests = len(self.results)
    passed_tests = len([r for r in self.results.values() if r.status == "PASS"])
    rerun_passed_tests = len([r for r in self.results.values() if r.status == "RERUN PASS"])
    failed_tests = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]])
    pending_tests = len([r for r in self.results.values() if r.status == "PENDING"])
    running_tests = len([r for r in self.results.values() if r.status == "RUNNING"])
    # Print summary
    output.append(f"Running: {running_tests}")
    output.append(f"Pending: {pending_tests}")
    output.append(f"Passed: {passed_tests}")
    output.append(f"Rerun Passed: {rerun_passed_tests}")
    output.append(f"Failed: {failed_tests}")
    output.append(f"Total tests: {total_tests}")
    # Print detailed status
    output.append(f"Total Tests: {total_tests}")
    output.append(f"Passed: {passed_tests}")
    output.append(f"Rerun Passed: {rerun_passed_tests}")
    output.append(f"Failed: {failed_tests}")
    output.append(f"Pending: {pending_tests}")
    output.append(f"Running: {running_tests}")
    # Calculate pass rate
    if total_tests > 0:
        pass_rate = ((passed_tests + rerun_passed_tests) / total_tests) * 100
        output.append(f"Pass Rate: {pass_rate:.1f}%")
    else:
        output.append("Pass Rate: 0.0%")
    # Determine overall status
    if pending_tests > 0 or running_tests > 0:
        output.append("Overall Status: INCOMPLETE")
    elif failed_tests == 0:
        output.append("Overall Status: PASS")
    else:
        output.append("Overall Status: FAIL")
    return "\n".join(output)

def generate_test_status_and_log_paths_content(self):
    """Generate test status and log paths content for zregress_report.log.

    Lists every test (not only finished ones) as ``[STATUS] log_path``,
    sorted by status priority: RUNNING, PENDING, failures, passes.
    """
    output = []
    # Validate statuses before generating report
    self.validate_all_test_statuses()
    # Get ALL test results (not just final statuses)
    all_results = list(self.results.values())
    # Sort results by status priority: RUNNING, PENDING, FAIL/RERUN FAIL/ERROR/TIMEOUT, PASS, RERUN PASS
    def sort_key(result):
        status_priority = {
            "RUNNING": 0,
            "PENDING": 1,
            "FAIL": 2,
            "RERUN FAIL": 2,
            "ERROR": 2,
            "TIMEOUT": 2,
            "PASS": 3,
            "RERUN PASS": 3
        }
        return status_priority.get(result.status, 4)
    all_results.sort(key=sort_key)
    # Print all test results with their log paths
    for result in all_results:
        log_path = self.get_test_log_path(result)
        error_info = ""
        if hasattr(result, 'error_detected') and result.error_detected:
            error_info = " (running but had error)"
        if log_path:
            output.append(f"[{result.status}]{error_info} {log_path}")
        else:
            # If no log file found, still show the test but indicate no log
            output.append(f"[{result.status}]{error_info} {result.name}: No log file found")
    return "\n".join(output)

def _update_job_status_counts(self, status_changes: Dict[int, str]):
    """Update running_jobs/pending_jobs counts from the latest LSF status
    snapshot and mirror PENDING <-> RUNNING transitions onto the matching
    TestResult objects.

    Args:
        status_changes: mapping of LSF job id to its current status string
            (e.g. "RUN", "PEND").
    """
    # Reset counts
    new_running_count = 0
    new_pending_count = 0
    # Count jobs by status
    for job_id, status in status_changes.items():
        if status == "RUN":
            new_running_count += 1
        elif status == "PEND":
            new_pending_count += 1
    # CRITICAL FIX: Also count retry jobs if they exist, since they are not
    # part of the status_changes snapshot
    if hasattr(self, 'retry_results'):
        for retry_job_id, retry_result in self.retry_results.items():
            if int(retry_job_id) not in status_changes:
                # Check status of retry job
                retry_status = self.check_lsf_job_status(int(retry_job_id))
                if retry_status == "RUN":
                    new_running_count += 1
                elif retry_status == "PEND":
                    new_pending_count += 1
    # Update counts and TestResult objects
    for job_id, status in status_changes.items():
        test_info = self.get_test_info_by_job_id(job_id)
        if test_info:
            test_name = test_info['name']
            seed = test_info['seed']
            # Find the TestResult object
            found_result = None
            for result_key, result_obj in self.results.items():
                if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed:
                    found_result = result_obj
                    break
            if found_result:
                if status == "RUN" and found_result.status == "PENDING":
                    # Job just started running
                    found_result.status = "RUNNING"
                    found_result.start()  # Set start time
                    self._reset_log_read_position(test_name, seed)
                    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                    print(f"INFO: {timestamp} [jobid {job_id}] {test_name} seed={seed} RUNNING")
                elif status == "PEND" and found_result.status == "RUNNING":
                    # Job went back to pending (resource preemption, etc.)
                    found_result.status = "PENDING"
                    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                    print(f"INFO: {timestamp} [jobid {job_id}] {test_name} seed={seed} PENDING")
    # Update global counts
    old_running = self.running_jobs
    old_pending = self.pending_jobs
    self.running_jobs = new_running_count
    self.pending_jobs = new_pending_count
    # Log count changes if significant
    if old_running != new_running_count or old_pending != new_pending_count:
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        print(f"INFO: {timestamp} Status Count Update: RUNNING {old_running} -> {new_running_count}, PENDING {old_pending} -> {new_pending_count}")

def _clean_submitted_jobs(self):
    """Remove duplicate job IDs from submitted_jobs, preserving order."""
    if len(self.submitted_jobs) != len(set(self.submitted_jobs)):
        original_count = len(self.submitted_jobs)
        self.submitted_jobs = list(dict.fromkeys(self.submitted_jobs))  # Remove duplicates while preserving order
        cleaned_count = len(self.submitted_jobs)
        if original_count != cleaned_count:
            print(f"Cleaned duplicate job IDs: {original_count} -> {cleaned_count}")
            print(f"Removed {original_count - cleaned_count} duplicate job IDs")

def _clean_duplicate_test_results(self):
    """Deduplicate TestResult objects keyed on name:config:seed.

    When duplicates exist, preference order for the survivor is:
    PASS over anything, any final status over PENDING/RUNNING, and at equal
    rank the entry that carries a job_id.
    """
    print(f"Cleaning up duplicate test results...")
    original_count = len(self.results)
    # Create a mapping to track unique tests
    unique_tests = {}
    duplicates_to_remove = []
    for result_key, result_obj in self.results.items():
        # Create a unique identifier for each test
        test_identifier = f"{result_obj.name}:{result_obj.config}:{getattr(result_obj, 'seed', 'unknown')}"
        if test_identifier not in unique_tests:
            # First occurrence of this test
            unique_tests[test_identifier] = result_key
        else:
            # Duplicate found - keep the one with more complete information
            existing_key = unique_tests[test_identifier]
            existing_obj = self.results[existing_key]
            # Determine which one to keep (prefer PASS over FAIL/ERROR/TIMEOUT, then prefer final status over PENDING/RUNNING)
            if result_obj.status == "PASS" and existing_obj.status != "PASS":
                # Always keep PASS over any other status
                duplicates_to_remove.append(existing_key)
                unique_tests[test_identifier] = result_key
            elif existing_obj.status == "PASS" and result_obj.status != "PASS":
                # Keep existing PASS, remove new non-PASS
                duplicates_to_remove.append(result_key)
            elif result_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] and existing_obj.status in ["PENDING", "RUNNING"]:
                # Keep the new one with final status, remove the old one with transient status
                duplicates_to_remove.append(existing_key)
                unique_tests[test_identifier] = result_key
            elif existing_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] and result_obj.status in ["PENDING", "RUNNING"]:
                # Keep the existing one with final status, remove the new one with transient status
                duplicates_to_remove.append(result_key)
            else:
                # Both have same status level, keep the one with more info
                if hasattr(result_obj, 'job_id') and result_obj.job_id and not (hasattr(existing_obj, 'job_id') and existing_obj.job_id):
                    # New one has job_id, existing one doesn't
                    duplicates_to_remove.append(existing_key)
                    unique_tests[test_identifier] = result_key
                else:
                    # Keep existing one
                    duplicates_to_remove.append(result_key)
    # Remove duplicates
    for key in duplicates_to_remove:
        if key in self.results:
            del self.results[key]
    cleaned_count = len(self.results)
def monitor_running_tests_for_errors(self): """Monitor running tests for errors in their log files""" current_time = time.time() if current_time - self.last_error_monitor_time < self.error_monitor_interval: return self.last_error_monitor_time = current_time timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"\n{Colors.YELLOW}=== 30-Minute Error Monitoring Check ({timestamp}) ==={Colors.END}") # Get all running tests and filter out PEND jobs running_tests = [] for result in self.results.values(): if result.status == "RUNNING": # Check if job is actually running (not pending) job_id = getattr(result, 'job_id', None) if job_id: try: lsf_status = self.check_lsf_job_status(int(job_id)) if lsf_status == "RUN": running_tests.append(result) # Skip PEND jobs - they're waiting for resources, not actually running except Exception: # If we can't check LSF status, include it in monitoring running_tests.append(result) else: # No job_id, include in monitoring running_tests.append(result) if not running_tests: print(f" No actually running tests to monitor") # Check if all tests are completed and stop monitoring if so all_completed = all(result.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] for result in self.results.values()) if all_completed and (self.running_jobs == 0 and self.pending_jobs == 0): print(f" {Colors.GREEN}All tests completed, stopping error monitoring{Colors.END}") self._stop_status_thread = True return print(f" Scanning {len(running_tests)} running tests for first-time errors...") errors_found_count = 0 for result in running_tests: if self._check_test_log_for_errors(result): errors_found_count += 1 if errors_found_count == 0: print(f" {Colors.GREEN}✓ No new errors detected in running tests{Colors.END}") else: print(f" {Colors.RED}⚠ Found {errors_found_count} new error(s) in running tests{Colors.END}") def _check_test_log_for_errors(self, result): """Check a specific test's log file for errors""" test_name = result.name seed = 
getattr(result, 'seed', 'unknown') # Get the log file path log_file_path = self.get_test_log_path(result) if not log_file_path or not os.path.exists(log_file_path): return False # Get or initialize the last read position for this log file log_key = f"{test_name}_{seed}" last_position = self.log_read_positions.get(log_key, 0) # Check if we've already reported errors for this test if hasattr(result, 'error_reported') and result.error_reported: return False # CRITICAL FIX: Only check for timeout if job is actually RUNNING # PEND jobs should not be subject to timeout detection as they're waiting for resources job_id = getattr(result, 'job_id', None) if job_id: try: lsf_status = self.check_lsf_job_status(int(job_id)) if lsf_status == "PEND": # Job is pending - check for PEND timeout if configured if self.pend_timeout_seconds is not None: now_ts = time.time() last_update_ts = self.log_last_update_times.get(log_key, now_ts) if now_ts - last_update_ts >= self.pend_timeout_seconds: mins = int(self.pend_timeout_seconds // 60) result.finish("TIMEOUT", f"Job pending for {mins} minutes (resource timeout)") print(f"\n{Colors.YELLOW}⏱{Colors.END} {test_name} seed={seed} TIMEOUT - pending for {mins} minutes") self.log_last_update_times[log_key] = now_ts return False # No PEND timeout configured or not yet reached - don't check for hang timeout return False except Exception: # If we can't check LSF status, assume it's running and proceed with timeout check pass try: with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f: # Seek to the last read position f.seek(last_position) # Read new content new_content = f.read() current_position = f.tell()
# Update last update time if file advanced, else check for hang now_ts = time.time() last_update_ts = self.log_last_update_times.get(log_key, now_ts) if current_position > last_position: # File advanced; update last update time self.log_last_update_times[log_key] = now_ts else: # No progress; if configured threshold without new lines, mark TIMEOUT (hung) # Only apply timeout to RUNNING jobs, not PEND jobs if now_ts - last_update_ts >= self.hang_timeout_seconds: mins = int(self.hang_timeout_seconds // 60) result.finish("TIMEOUT", f"No new log lines for {mins} minutes (assumed hang)") print(f"\n{Colors.YELLOW}⏱{Colors.END} {test_name} seed={seed} TIMEOUT - no log updates for {mins} minutes") # Reset tracking to avoid repeated triggers self.log_last_update_times[log_key] = now_ts # CRITICAL FIX: Trigger retry for TIMEOUT cases if getattr(self.args, 'retry', 0) > 0 and hasattr(result, 'job_id') and result.job_id: print(f"{Colors.CYAN}🚀 Triggering retry for TIMEOUT case {test_name} seed={seed}{Colors.END}") self._resubmit_from_stored_opcode(result.job_id) return False # Update the last read position self.log_read_positions[log_key] = current_position # Only scan content BEFORE 'UVM Report catcher Summary' to avoid false positives summary_idx = new_content.find('UVM Report catcher Summary') scan_text = new_content[:summary_idx] if summary_idx != -1 else new_content
# Check for error keywords in the scan_text first_error_found = None for keyword in self.error_keywords: if keyword in scan_text: # Find the first occurrence of this error keyword lines = scan_text.split('\n') for i, line in enumerate(lines): if keyword in line: # Skip report-summary style lines that are not real errors if 'UVM_' in keyword or 'UVM_' in line: if 'Number of' in line and 'reports' in line: continue # Get some context around the error start_line = max(0, i - 2) end_line = min(len(lines), i + 3) context = '\n'.join(lines[start_line:end_line]) first_error_found = { 'keyword': keyword, 'line': line.strip(), 'context': context } break if first_error_found: break if first_error_found: timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"\n{Colors.RED}🚨 FIRST ERROR DETECTED 🚨{Colors.END}") print(f"{Colors.RED}[{timestamp}] Test: {test_name} seed={seed} - running but had error{Colors.END}") print(f"{Colors.RED}Log File: {log_file_path}{Colors.END}") print(f"{Colors.RED}Error Type: {first_error_found['keyword']}{Colors.END}") print(f"{Colors.RED}Error Line: {first_error_found['line']}{Colors.END}") print(f"{Colors.RED}Error Context:{Colors.END}") for context_line in first_error_found['context'].split('\n'): print(f"{Colors.RED} {context_line}{Colors.END}") print(f"{Colors.RED}{'='*80}{Colors.END}") # Mark that we've reported the first error for this test result.error_reported = True result.error_detected = True result.first_error_details = first_error_found return True except Exception as e: print(f" Warning: Could not read log file {log_file_path}: {e}") return False def _reset_log_read_position(self, test_name, seed): """Reset the log read position for a test (when it starts running)""" log_key = f"{test_name}_{seed}" self.log_read_positions[log_key] = 0 def _cleanup_log_read_positions(self): """Clean up log read positions for completed tests""" completed_tests = [result for result in self.results.values() if result.status in ["PASS", "RERUN PASS", 
"FAIL", "ERROR", "TIMEOUT"]] for result in completed_tests: test_name = result.name seed = getattr(result, 'seed', 'unknown') log_key = f"{test_name}_{seed}" if log_key in self.log_read_positions: del self.log_read_positions[log_key] if hasattr(self, 'log_last_update_times') and log_key in self.log_last_update_times: del self.log_last_update_times[log_key] def save_error_monitor_state(self): """Save error monitoring state to file""" try: state_file = self.report_dir / "error_monitor_state.json" state_data = { 'log_read_positions': self.log_read_positions, 'last_error_monitor_time': self.last_error_monitor_time, 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } with open(state_file, 'w') as f: json.dump(state_data, f, indent=2) except Exception as e: print(f"Warning: Could not save error monitor state: {e}") def load_error_monitor_state(self): """Load error monitoring state from file""" try: state_file = self.report_dir / "error_monitor_state.json" if state_file.exists(): with open(state_file, 'r') as f: state_data = json.load(f) self.log_read_positions = state_data.get('log_read_positions', {}) self.last_error_monitor_time = state_data.get('last_error_monitor_time', time.time()) print(f"Loaded error monitor state from {state_file}") except Exception as e: print(f"Warning: Could not load error monitor state: {e}")
def parse_it_regress_alias(alias_file: str) -> List[Tuple[str, str]]:
    """Parse an ``it_regress.alias`` file into (sim_dir, command) pairs.

    Expected format (example):
        sim_1d:
            cmd: ./hregress.py -g full_1die --auto_restart ...

    Blank lines and '#' comments are skipped.  A line ending in ':' (other
    than a 'cmd:' line) opens a new sim section; each 'cmd:' line (matched
    case-insensitively) is attached to the most recently opened section.
    Any other line is ignored.

    Returns:
        List of (sim_dir_name, cmd_string) in file order.

    Raises:
        ValueError: if a 'cmd:' line precedes any sim section, or if a
            'cmd:' line carries an empty command.
    """
    entries: List[Tuple[str, str]] = []
    active_section: Optional[str] = None
    with open(alias_file, "r", encoding="utf-8", errors="ignore") as fh:
        for raw in fh:
            text = raw.strip()
            # Skip blanks and comment lines.
            if not text or text.startswith("#"):
                continue
            if text.lower().startswith("cmd:"):
                # Command line: cmd: ...
                if not active_section:
                    raise ValueError(f"Found 'cmd:' before any sim section in {alias_file}: {raw!r}")
                command = text.split(":", 1)[1].strip()
                if not command:
                    raise ValueError(f"Empty cmd for section {active_section} in {alias_file}")
                entries.append((active_section, command))
            elif text.endswith(":"):
                # Section header: sim_xxx:
                active_section = text[:-1].strip()
    return entries
def _maybe_prefix_python(cmd_tokens: List[str], sim_dir: str) -> List[str]:
    """Best-effort: if entry is a .py under sim_dir but not executable, run via current python."""
    if not cmd_tokens:
        return cmd_tokens
    entry = cmd_tokens[0]
    if not entry.endswith(".py"):
        return cmd_tokens
    script_path = Path(sim_dir) / entry
    try:
        # Prefix only when the script exists yet lacks the execute bit.
        needs_interpreter = script_path.exists() and not os.access(str(script_path), os.X_OK)
    except Exception:
        # Filesystem probing failed; leave the command untouched.
        needs_interpreter = False
    if needs_interpreter:
        return [sys.executable, entry, *cmd_tokens[1:]]
    return cmd_tokens
def submit_all_regress_from_alias(args) -> int:
    """When args.all_regress_en == 1: read alias file and submit regressions in each sim dir.

    For every (sim_name, cmd) entry in the alias file, the sim directory is
    created from the template directory if missing (``cp -rf <template>
    <sim_xxx>``), the command is tokenized with shlex, and all prepared
    commands are launched in parallel via subprocess.Popen, one per sim dir.
    The function then waits for all of them and prints a summary.

    Returns:
        0 when every launched regression exits with return code 0;
        1 when anything failed (missing alias/template, parse error, launch
        failure, or a non-zero child return code).
    """
    alias_file = getattr(args, "alias_file", "it_regress.alias")
    sim_root = getattr(args, "all_regress_sim_root", ".")
    template_sim_dir = getattr(args, "template_sim_dir", "sim")

    alias_path = os.path.abspath(alias_file) if not os.path.isabs(alias_file) else alias_file

    # Base directory resolution:
    # - If user provides --all-regress-sim-root and it exists, use it.
    # - Otherwise, use alias file's directory (most common in flow: cd sim_xxx and run).
    # - Fallback to current working directory.
    sim_root_abs = os.path.abspath(sim_root) if sim_root else ""
    if sim_root_abs and os.path.isdir(sim_root_abs):
        base_dir = sim_root_abs
    else:
        base_dir = os.path.abspath(os.path.dirname(alias_path)) if alias_path else os.getcwd()

    if not os.path.exists(alias_path):
        print(f"{Colors.RED}Error: alias file not found: {alias_path}{Colors.END}")
        return 1

    try:
        items = parse_it_regress_alias(alias_path)
    except Exception as e:
        print(f"{Colors.RED}Error: failed to parse alias file {alias_path}: {e}{Colors.END}")
        return 1

    if not items:
        print(f"{Colors.YELLOW}Warning: no cmd entries found in {alias_path}{Colors.END}")
        return 0

    print(f"{Colors.BOLD}=== ALL REGRESS MODE (from alias) ==={Colors.END}")
    print(f"Alias file: {alias_path}")
    print(f"Base dir: {base_dir}")
    if sim_root_abs and os.path.isdir(sim_root_abs):
        print(f"Sim root: {sim_root_abs} (enabled)")
    else:
        print(f"Sim root: {sim_root_abs or '(not set)'} (ignored; using base dir)")
    template_abs = template_sim_dir if os.path.isabs(template_sim_dir) else os.path.join(base_dir, template_sim_dir)
    print(f"Template: {template_abs}")
    print(f"Found {len(items)} cmd entries")

    # Preparation phase: ensure each sim dir exists and tokenize its command.
    failed: List[Tuple[str, int]] = []
    jobs: List[Dict[str, str]] = []
    for sim_name, cmd_str in items:
        sim_dir = sim_name if os.path.isabs(sim_name) else os.path.join(base_dir, sim_name)
        if not os.path.isdir(sim_dir):
            # Auto-create sim_xxx from template 'sim' directory (cp -rf sim sim_xxx)
            template_path = template_sim_dir if os.path.isabs(template_sim_dir) else os.path.join(base_dir, template_sim_dir)
            if not os.path.isdir(template_path):
                print(f"{Colors.RED}Error: sim dir not found: {sim_dir}{Colors.END}")
                print(f"{Colors.RED}Error: template sim dir not found: {template_path}{Colors.END}")
                failed.append((sim_name, 127))
                continue
            ts = datetime.now().strftime("%m-%d %H:%M:%S")
            print(f"{Colors.YELLOW}INFO: {ts} [{sim_name}] sim dir missing, creating via: cp -rf {template_path} {sim_dir}{Colors.END}")
            try:
                # List-form argv (shell=False): no shell-injection risk from names.
                r_cp = subprocess.run(["cp", "-rf", template_path, sim_dir], cwd=base_dir)
                if r_cp.returncode != 0 or not os.path.isdir(sim_dir):
                    print(f"{Colors.RED}INFO: {ts} [{sim_name}] create FAIL rc={r_cp.returncode}{Colors.END}")
                    failed.append((sim_name, int(r_cp.returncode) if r_cp.returncode is not None else 1))
                    continue
                print(f"{Colors.GREEN}INFO: {ts} [{sim_name}] created OK{Colors.END}")
            except Exception as e:
                print(f"{Colors.RED}INFO: {ts} [{sim_name}] create FAIL: {e}{Colors.END}")
                failed.append((sim_name, 1))
                continue

        try:
            tokens = shlex.split(cmd_str)
        except Exception as e:
            print(f"{Colors.RED}Error: shlex split failed for {sim_name} cmd={cmd_str!r}: {e}{Colors.END}")
            failed.append((sim_name, 2))
            continue

        tokens = _maybe_prefix_python(tokens, sim_dir)
        jobs.append({"name": sim_name, "sim_dir": sim_dir, "cmd_str": cmd_str, "tokens": tokens})

    if not jobs and failed:
        # All entries failed during the preparation stage
        print(f"\n{Colors.RED}=== ALL REGRESS MODE SUMMARY: FAIL (no job started) ==={Colors.END}")
        for name, rc in failed:
            print(f" - {name}: rc={rc}")
        return 1

    # Launch the hregress command of every sim_xxx in parallel
    procs: List[Tuple[Dict[str, str], subprocess.Popen]] = []
    for job in jobs:
        ts = datetime.now().strftime("%m-%d %H:%M:%S")
        print(f"INFO: {ts} [{job['name']}] START cwd={job['sim_dir']} cmd: {job['cmd_str']}")
        try:
            p = subprocess.Popen(
                job["tokens"],
                cwd=job["sim_dir"],
            )
            procs.append((job, p))
        except FileNotFoundError as e:
            # Executable missing: conventional rc 127 ("command not found").
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] start FAIL: {e}{Colors.END}")
            failed.append((job["name"], 127))
        except Exception as e:
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] start FAIL: {e}{Colors.END}")
            failed.append((job["name"], 1))

    # Wait for all parallel regressions to finish and collect return codes
    for job, p in procs:
        rc = p.wait()
        ts = datetime.now().strftime("%m-%d %H:%M:%S")
        if rc == 0:
            print(f"{Colors.GREEN}INFO: {ts} [{job['name']}] FINISH OK{Colors.END}")
        else:
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] FINISH FAIL rc={rc}{Colors.END}")
            failed.append((job["name"], int(rc)))

    if failed:
        print(f"\n{Colors.RED}=== ALL REGRESS MODE SUMMARY: FAIL ==={Colors.END}")
        for name, rc in failed:
            print(f" - {name}: rc={rc}")
        return 1

    print(f"\n{Colors.GREEN}=== ALL REGRESS MODE SUMMARY: OK ==={Colors.END}")
    return 0
def parse_arguments():
    """Parse command-line arguments for the regression runner.

    Returns:
        argparse.Namespace with all options below; -g/--groups is declared
        optional here and enforced conditionally in main() (not required when
        -lst/--list or --failed-regression is used).
    """
    parser = argparse.ArgumentParser(description="regress")
    # Required arguments (conditionally required based on other parameters)
    parser.add_argument("-g", "--groups", nargs="+", required=False,
                        help="group tag (required unless using -lst/--list)")
    # Optional arguments
    parser.add_argument("-d", "--dienum", type=int, nargs="?", default=2,
                        help="die num :1 to 4")
    parser.add_argument("-v", "--rtl_ver", nargs="?", default="STUB NOC_TOP",
                        help="rtl vision")
    # parser.add_argument("-v", "--rtl_ver", nargs="?", default="FULL",
    #                     help="rtl vision")
    parser.add_argument("-m", "--mode", nargs="?", default="",
                        help="mode")
    parser.add_argument("-def", "--define", type=str, nargs="?",
                        help="rtl define marco")
    parser.add_argument("-q", "--queue", nargs="?", default="pron_normal",
                        help="queue")
    # NOTE(review): with nargs="*" the parsed value is a list when the flag is
    # given, but --timestamp has no default and --bypass defaults to the plain
    # string "1" — downstream code sees inconsistent types; confirm intended.
    parser.add_argument("--timestamp", nargs="*",
                        help="add timestamp or not, use True or False")
    parser.add_argument("--bypass", nargs="*", default="1",
                        help="bypass the pre_full_run: 0=compile, 1=skip compile if files exist")
    parser.add_argument("--wait-timeout", type=int, nargs="?", default=100,
                        help="waiting timeout (h)")
    parser.add_argument('--max_concurrent', type=int, default=50, help='max concurrent job count')
    # Legacy arguments for backward compatibility
    parser.add_argument('--legacy-mode', choices=['compile_regression'], default='compile_regression',
                        help='Legacy run mode: compile_regression(compile then run regression)')
    parser.add_argument('--timeout', type=int, default=60,
                        help='Single test timeout (minutes) (default: 60)')
    parser.add_argument('--output-dir', default='.',
                        help='Output directory for compile and regression (default: ./output)')
    parser.add_argument('--dir', default='.',
                        help='Simulation output directory path (default: ./output)')
    parser.add_argument('--p2-mode', default='normal',
                        help='P2 mode for compilation (default: normal)')
    parser.add_argument('--seed', default='random',
                        help='Random seed (default: random)')
    # Wave / coverage controls
    parser.add_argument('--wave', action='store_true',
                        help='Enable FSDB wave format (default: no wave)')
    parser.add_argument('--wave-on-fail', action='store_true',
                        help='Generate wave file only when test fails')
    parser.add_argument('--coverage', action='store_true',
                        help='Enable coverage collection')
    parser.add_argument('--cov', type=str, default=None,
                        choices=['all', 'tgl', 'line', 'cond', 'fsm', 'branch', 'assert'],
                        help='Coverage type: all, tgl, line, cond, fsm, branch, assert (default: None)')
    # VCS tuning
    parser.add_argument('--vcs-optimize', action='store_true', default=False,
                        help='Enable VCS optimization (parallel compilation and simulation) (default: disabled)')
    parser.add_argument('--vcs-cores', type=int, default=1,
                        help='Number of cores for VCS parallel compilation/simulation (default: 1, single-threaded)')
    parser.add_argument('--vcs-xa', action='store_true', default=False,
                        help='Enable VCS-XA acceleration (default: disabled)')
    parser.add_argument('--verbosity', default='UVM_MEDIUM',
                        choices=['UVM_NONE', 'UVM_LOW', 'UVM_MEDIUM', 'UVM_HIGH', 'UVM_FULL'],
                        help='UVM verbosity level (default: UVM_MEDIUM)')
    parser.add_argument('--plusargs', default='',
                        help='Additional plusargs parameters')
    parser.add_argument('--retry', type=int, default=1,
                        help='Failed test retry count (default: 1)')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug mode')
    parser.add_argument('--keep-logs', action='store_true',
                        help='Keep all log files')
    # Monitoring / timeout intervals (minutes)
    parser.add_argument('--status-interval', type=int, default=5,
                        help='Status print interval (minutes) (default: 5)')
    parser.add_argument('--error-monitor-interval', type=int, default=30,
                        help='Error monitoring interval (minutes) (default: 30)')
    parser.add_argument('--hang-timeout-minutes', type=int, default=30,
                        help='Timeout for no new log lines (minutes) (default: 30)')
    parser.add_argument('--pend-timeout-minutes', type=int, default=None,
                        help='Timeout for PEND jobs (minutes). If not set, PEND jobs will wait indefinitely for resources (default: None)')
    # LSF resource requests
    parser.add_argument('--memory', type=int, default=None,
                        help='Memory reservation in GB for LSF jobs (default: not specified, use LSF default)')
    parser.add_argument('--cpu-cores', type=int, default=1,
                        help='CPU cores to request for LSF jobs (default: 1)')
    # Re-run / list-driven modes
    parser.add_argument('--failed-regression', type=str, default=None,
                        help='Path to failed regression JSON file to re-run failed tests only')
    parser.add_argument('-lst', '--list', type=str, default=None,
                        help='Path to JSON regression list file to run all test cases in the list')
    # Auto-restart controls
    parser.add_argument('--auto-restart', action='store_true', default=False,
                        help='Automatically restart regression after completion (default: False)')
    parser.add_argument('--restart-interval-hours', type=float, default=None,
                        help='Auto-restart interval in hours (e.g., 12.0 for 12 hours). If set, regression will restart after this interval (default: None)')
    parser.add_argument('--max-restarts', type=int, default=None,
                        help='Maximum number of auto-restarts (default: None, unlimited)')
    # Multi-topology one-click mode
    parser.add_argument('--all-regress-en', type=int, default=0,
                        help='Enable all-regress mode: when set to 1, read alias file and submit regress in each sim_xxx dir (default: 0)')
    parser.add_argument('--alias-file', type=str, default='it_regress.alias',
                        help='Alias file path (it_regress.alias format) used when --all-regress-en=1')
    parser.add_argument('--all-regress-sim-root', type=str, default='.',
                        help='Sim root dir that contains sim_xxx subdirs, used when --all-regress-en=1')
    parser.add_argument('--template-sim-dir', type=str, default='sim',
                        help="Template sim directory name/path under sim-root. If sim_xxx doesn't exist, create it by running: cp -rf <template> <sim_xxx>")
    return parser.parse_args()
def main(): args = parse_arguments()
# Short-circuit: all-regress mode (submit commands from alias file in each sim_xxx dir) try: if int(getattr(args, 'all_regress_en', 0)) == 1: rc = submit_all_regress_from_alias(args) sys.exit(int(rc)) except Exception as e: print(f"{Colors.RED}Error in all-regress mode: {e}{Colors.END}") sys.exit(1) # Validate arguments based on mode if hasattr(args, 'failed_regression') and args.failed_regression: # Validate failed regression file if not os.path.exists(args.failed_regression): print(f"{Colors.RED}Error: Failed regression file not found {args.failed_regression}{Colors.END}") sys.exit(1) print(f"Running failed tests from: {args.failed_regression}") elif hasattr(args, 'list') and args.list: # Validate regression list file - check in ../def/case_def/ directory regression_list_path = os.path.join(os.getcwd(), "..", "def", "case_def", args.list) if not os.path.exists(regression_list_path): print(f"{Colors.RED}Error: Regression list file not found {regression_list_path}{Colors.END}") sys.exit(1) print(f"Running tests from regression list: {regression_list_path}") # For regression list mode, groups are not required else: # For normal regression mode, groups are required if not args.groups: print(f"{Colors.RED}Error: -g/--groups is required when not using -lst/--list or --failed-regression{Colors.END}") sys.exit(1) # Check if json_list file exists (only for normal regression) json_list_path = os.path.join(os.getcwd(), "../def/json_list") if not os.path.exists(json_list_path): print(f"{Colors.RED}Error: Test list file not found {json_list_path}{Colors.END}") sys.exit(1) # Run regression test runner = RegressionRunner(args) try: runner.run() except KeyboardInterrupt: print(f"\n{Colors.YELLOW}User interrupted, cleaning up...{Colors.END}") runner.cleanup() sys.exit(1) except Exception as e: print(f"{Colors.RED}Regression test exception: {e}{Colors.END}") runner.cleanup() sys.exit(1)
# NOTE(review): this guard sits mid-file — the class definitions that follow
# it only execute after main() returns, and they duplicate definitions that
# appear earlier in the file.  Consider moving the guard to the end of the
# file once the duplicates are reconciled.
if __name__ == "__main__":
    main()
class Colors:
    """Terminal color definitions (ANSI escape sequences).

    NOTE(review): this is a byte-for-byte duplicate of the Colors class
    defined earlier in this file; it rebinds the name with identical values
    and is a candidate for removal.
    """
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    CYAN = '\033[96m'
    WHITE = '\033[97m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
class TestResult:
    """Holds the lifecycle state and timing of a single test run.

    NOTE(review): this duplicates the TestResult class defined earlier in
    this file; it rebinds the name after main() returns and is a candidate
    for removal.
    """
    def __init__(self, name: str, config: str = "default"):
        self.name = name
        self.config = config
        self.status = "PENDING"  # PENDING, RUNNING, PASS, FAIL, TIMEOUT, ERROR, RERUN PASS, RERUN FAIL
        self.start_time = None   # epoch seconds, set by start()
        self.end_time = None     # epoch seconds, set by finish()
        self.duration = 0        # seconds; stays 0 until finish()
        self.log_file = ""
        self.job_id = None       # LSF job id once submitted
        self.retry_count = 0
        self.retry_started = False  # Track if retry has been started
        self.error_msg = ""
        self.coverage_db = ""
        self.estimated_duration = 0  # Estimated duration (seconds)
        self.seed = None
        self.opts = []
        self.is_retry = False  # Track if this is a retry case

    def start(self):
        """Mark the test as RUNNING and record its start timestamp."""
        self.start_time = time.time()
        self.status = "RUNNING"

    def finish(self, status: str, error_msg: str = ""):
        """Record the final status, duration and error message.

        Retried tests (is_retry or retry_count > 0) map PASS/FAIL to
        "RERUN PASS"/"RERUN FAIL"; any other status is stored unchanged.
        """
        self.end_time = time.time()
        self.duration = self.end_time - self.start_time if self.start_time else 0
        # Handle retry statuses
        if self.is_retry or self.retry_count > 0:
            if status == "PASS":
                self.status = "RERUN PASS"
            elif status == "FAIL":
                self.status = "RERUN FAIL"
            else:
                self.status = status
        else:
            self.status = status
        self.error_msg = error_msg

    def get_duration_str(self) -> str:
        """Return the duration formatted as e.g. "1h2m3s", "2m3s" or "3s".

        Returns "N/A" while duration is 0 (i.e. the test has not finished —
        this also reports "N/A" for a genuinely sub-second run).
        """
        if self.duration == 0:
            return "N/A"
        hours = int(self.duration // 3600)
        minutes = int((self.duration % 3600) // 60)
        seconds = int(self.duration % 60)
        if hours > 0:
            return f"{hours}h{minutes}m{seconds}s"
        elif minutes > 0:
            return f"{minutes}m{seconds}s"
        else:
            return f"{seconds}s"
class RegressionRunner:
    """Regression test runner.

    Drives a full regression: builds LSF ``bsub`` command lines for compile
    and test jobs, submits them, tracks per-test ``TestResult`` objects keyed
    by ``name:config:seed:opts``, and resubmits failed jobs (with wave dump
    forced on) up to a configurable retry limit.

    NOTE(review): this class continues past this region of the file; only the
    methods up to ``_resubmit_from_stored_opcode`` are documented here.
    """

    def __init__(self, args):
        """Initialize runner state from parsed CLI ``args`` and create run directories."""
        self.args = args
        self.tests = []
        self.results = {}
        self.start_time = time.time()
        # Create timestamp for regression run
        self.regression_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Extract directory name from current simulation directory
        current_dir = os.path.basename(os.getcwd())
        # Extract xxx part from sim_xxx pattern
        if current_dir.startswith('sim_'):
            dir_suffix = current_dir[4:]  # Remove 'sim_' prefix
        else:
            dir_suffix = ''
        self.regression_dir_name = f"regression_{dir_suffix}_{self.regression_timestamp}"
        # Set up directories based on output path
        self._setup_directories()
        self.history_db_file = Path("test_history.json")  # Historical data file
        self.test_history = self.load_test_history()  # Load historical test data
        self.job_ids = []
        self.submitted_jobs = []  # Track submitted job IDs
        self.submitted_results = []  # Track submitted test results
        # Map job_id -> full opcode used for submission, to allow direct resubmission without lookups
        self.job_meta: Dict[str, Dict] = {}
        self.running_jobs = 0  # Track number of running jobs (RUN status)
        self.pending_jobs = 0  # Track number of pending jobs (PEND status)
        self.lock = threading.Lock()
        # Delay first status summary until the print interval elapses (default: 30 minutes)
        self.last_status_print = time.time()
        self.status_print_interval = args.status_interval * 60  # Status print interval (seconds)
        self._stop_status_thread = False  # Control status print thread stop
        # Create necessary directories immediately
        self._create_directories()
        # Initialize real-time report generation
        self.report_update_interval = 30  # Update report every 30 seconds
        self.last_report_update = time.time()
        self.real_time_report_path = self.report_dir / "zregress_report.log"
        # Load error monitor state if exists
        # NOTE(review): load_error_monitor_state() is defined later in this file.
        self.load_error_monitor_state()
        # Initialize error monitoring
        self.error_monitor_interval = args.error_monitor_interval * 60  # Convert minutes to seconds
        self.last_error_monitor_time = time.time()
        self.log_read_positions = {}  # Track last read position for each log file
        # Track last time each log file produced new content; used to detect hung simulations
        self.log_last_update_times: Dict[str, float] = {}
        # Configurable hang timeout in seconds (no new log lines)
        self.hang_timeout_seconds = (
            getattr(args, 'hang_timeout_minutes', 30) * 60
            if hasattr(args, 'hang_timeout_minutes') and args.hang_timeout_minutes is not None
            else 30 * 60
        )
        # Configurable PEND timeout in seconds (for jobs waiting for resources)
        self.pend_timeout_seconds = (
            getattr(args, 'pend_timeout_minutes', None) * 60
            if hasattr(args, 'pend_timeout_minutes') and args.pend_timeout_minutes is not None
            else None
        )
        # Keywords scanned for in simulation logs by the error monitor.
        self.error_keywords = ['UVM_ERROR', 'UVM_FATAL', 'Solver failed', 'FATAL', 'Error', 'Offending']
        # Set up signal handling
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        # Initialize random seed for better randomization
        random.seed(time.time())
        # Add flag to control validation frequency
        self._validation_count = 0
        self._last_validation_time = 0
        # Auto-restart configuration
        self.auto_restart = getattr(args, 'auto_restart', False)
        self.restart_interval_hours = getattr(args, 'restart_interval_hours', None)
        self.restart_count = 0  # Track number of restarts
        self.max_restarts = getattr(args, 'max_restarts', None)  # Maximum number of restarts (None = unlimited)
        self.first_run_start_time = time.time()  # Track first run start time for interval-based restart

    def _setup_directories(self):
        """Set up regression directories (paths only; creation is done in _create_directories)."""
        # Set up regression directory
        if hasattr(self.args, 'output_dir') and self.args.output_dir:
            base_output_dir = Path(self.args.output_dir)
            self.regression_dir = base_output_dir / self.regression_dir_name
        else:
            self.regression_dir = Path(self.regression_dir_name)
        # Set up subdirectories
        self.log_dir = self.regression_dir / "logs"
        self.report_dir = self.regression_dir / "report_log"
        self.coverage_dir = self.regression_dir / "coverage"
        self.wave_dir = self.regression_dir / "waves"

    def _create_directories(self):
        """Create all necessary directories (idempotent; parents created as needed)."""
        self.regression_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.report_dir.mkdir(parents=True, exist_ok=True)
        self.coverage_dir.mkdir(parents=True, exist_ok=True)
        self.wave_dir.mkdir(parents=True, exist_ok=True)
        # Directory setup completed

    def _get_output_dir_path(self):
        """Get the output directory path; '.' when args has no output_dir attribute.

        NOTE(review): when args.output_dir exists but is None, this returns None,
        not '.' — confirm that is the intended behavior for callers.
        """
        if hasattr(self.args, 'output_dir'):
            return self.args.output_dir
        return "."

    def load_test_history(self) -> Dict:
        """Load historical test data from test_history.json; {} if missing or unreadable."""
        if not self.history_db_file.exists():
            return {}
        try:
            with open(self.history_db_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Unable to load historical test data: {e}{Colors.END}")
            return {}

    def save_test_history(self):
        """Fold passing results into the history db and persist it to disk.

        Only PASS / RERUN PASS results with a positive duration contribute;
        each name:config entry keeps a rolling window of the last 10 durations.
        """
        # Update historical data
        for result_key, result in self.results.items():
            if result.status in ["PASS", "RERUN PASS"] and result.duration > 0:
                test_config_key = f"{result.name}:{result.config}"
                # If it's a new test, initialize history record
                if test_config_key not in self.test_history:
                    self.test_history[test_config_key] = {
                        'durations': [],
                        'avg_duration': 0,
                        'last_duration': 0,
                        'count': 0
                    }
                # Update historical data
                history = self.test_history[test_config_key]
                history['durations'].append(result.duration)
                # Only keep the last 10 runs
                if len(history['durations']) > 10:
                    history['durations'] = history['durations'][-10:]
                history['avg_duration'] = sum(history['durations']) / len(history['durations'])
                history['last_duration'] = result.duration
                history['count'] += 1
        # Save to file
        try:
            with open(self.history_db_file, 'w') as f:
                json.dump(self.test_history, f, indent=2)
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Unable to save historical test data: {e}{Colors.END}")

    def estimate_test_duration(self, test_name: str,
                               config: str) -> float:
        """Estimate test duration (seconds).

        Preference order: exact name:config history, then the mean over all
        history entries with the same config, then a 5-minute default.
        """
        test_config_key = f"{test_name}:{config}"
        # If historical data exists, use average duration
        if test_config_key in self.test_history:
            return self.test_history[test_config_key]['avg_duration']
        # If no specific test history, try to use average of tests with same config
        config_tests = [k for k in self.test_history.keys() if k.endswith(f":{config}")]
        if config_tests:
            avg_duration = sum(self.test_history[k]['avg_duration'] for k in config_tests) / len(config_tests)
            return avg_duration
        # If no historical data at all, use default value (5 minutes)
        return 300

    def signal_handler(self, signum, frame):
        """SIGINT/SIGTERM handler: clean up submitted jobs and exit non-zero."""
        print(f"\n{Colors.YELLOW}Received signal {signum}, cleaning up...{Colors.END}")
        self.cleanup()
        sys.exit(1)

    def cleanup(self):
        """Clean up resources: persist monitor state and bkill outstanding LSF jobs."""
        # Save error monitor state
        self.save_error_monitor_state()
        if self.args.mode == "lsf" and self.job_ids:
            print(f"{Colors.YELLOW}Cancelling LSF jobs...{Colors.END}")
            for job_id in self.job_ids:
                try:
                    subprocess.run(["bkill", str(job_id)],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   timeout=10)
                # NOTE(review): bare except deliberately swallows bkill failures
                # (best-effort cleanup), but it also hides KeyboardInterrupt /
                # SystemExit; 'except Exception' would be safer.
                except:
                    pass

    def load_test_list(self, test_file: str, default_config: str = None) -> List[Tuple[str, str]]:
        """Load test list, returns list of (test_name, config).

        Blank lines and '#' comment lines are skipped; a 'name:config' line
        splits on the first ':', otherwise default_config is used.
        Exits the process if the file does not exist.
        """
        tests = []
        try:
            with open(test_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        # Check if line contains config info (format: test_name:config)
                        if ':' in line:
                            test_name, config = line.split(':', 1)
                            tests.append((test_name.strip(), config.strip()))
                        else:
                            # Use default config
                            tests.append((line, default_config))
        except FileNotFoundError:
            print(f"{Colors.RED}Error: Test list file not found {test_file}{Colors.END}")
            sys.exit(1)
        return tests

    def load_test_cases(self, test_files: List[str]) -> List[Dict]:
        """Load the test definitions: concatenate the JSON arrays from each file."""
        cases = []
        for file in test_files:
            with open(file, "r") as f:
                cases.extend(json.load(f))
        return cases

    def load_failed_regression_cases(self, failed_regression_file: str) -> List[Dict]:
        """Load failed test cases from failed regression JSON file.

        Strips failure-specific fields ('actual_*', log_file, retry_count,
        failure_timestamp) and restores the original repeat count, returning
        cases in the standard test-case dict format. Returns [] on any error.
        """
        try:
            with open(failed_regression_file, "r") as f:
                failed_cases = json.load(f)
            print(f"Loaded {len(failed_cases)} failed test cases from {failed_regression_file}")
            # Convert failed regression format back to standard test case format
            standard_cases = []
            for failed_case in failed_cases:
                # Extract the original test case data, removing the failure-specific fields
                standard_case = {}
                for key, value in failed_case.items():
                    if not key.startswith('actual_') and key not in ['log_file', 'retry_count', 'failure_timestamp', 'original_repeat']:
                        standard_case[key] = value
                # Restore original repeat count if it was modified
                if 'original_repeat' in failed_case:
                    standard_case['repeat'] = failed_case['original_repeat']
                standard_cases.append(standard_case)
            print(f"Converted {len(standard_cases)} failed test cases to standard format")
            return standard_cases
        except Exception as e:
            print(f"Error loading failed regression file {failed_regression_file}: {e}")
            return []

    def load_regression_list_cases(self, regression_list_file: str) -> List[Dict]:
        """Load test cases from regression list JSON file.

        Non-dict entries and entries without 'name' are skipped with a warning;
        missing optional fields (config/repeat/timeout/opts/group) get defaults.
        Returns [] on any error.
        """
        try:
            with open(regression_list_file, "r") as f:
                test_cases = json.load(f)
            print(f"Loaded {len(test_cases)} test cases from regression list: {regression_list_file}")
            # Validate that each test case has required fields
            valid_cases = []
            for i, case in enumerate(test_cases):
                if not isinstance(case, dict):
                    print(f"Warning: Test case {i} is not a dictionary, skipping")
                    continue
                if 'name' not in case:
                    print(f"Warning: Test case {i} missing 'name' field, skipping")
                    continue
                # Set default values for optional fields
                if 'config' not in case:
                    case['config'] = 'default'
                if 'repeat' not in case:
                    case['repeat'] = 1
                if 'timeout' not in case:
                    case['timeout'] = 60
                if 'opts' not in case:
                    case['opts'] = []
                if 'group' not in case:
                    case['group'] = ['default']
                valid_cases.append(case)
            print(f"Validated {len(valid_cases)} test cases from regression list")
            return valid_cases
        except Exception as e:
            print(f"Error loading regression list file {regression_list_file}: {e}")
            return []

    def filter_cases(self, cases: List[Dict], groups: List[str]) -> List[Dict]:
        """Select the group by tag: keep cases whose 'group' list contains ALL requested groups."""
        return [case for case in cases if set(groups).issubset(set(case["group"]))]  # get test cases by group

    def submit_compile(self, que: str, dienum: str, rtl_ver: str, mode: str, define: str = None) -> Dict:
        """Submit the elab and compile job via bsub; returns a status dict.

        NOTE(review): dienum and rtl_ver are currently unused (their make
        variables are commented out below) — confirm whether they should be
        re-enabled or dropped from the signature.
        """
        result = {
            "name": "compile",
            "status": "PENDING",
        }
        try:
            # Build output directory path: self.args.output_dir
            output_dir = self.args.output_dir
            # Using output directory for compile
            # Construct bsub command
            cmd = ["bsub"]
            # Add LSF parameters
            cmd.extend(["-q", que])
            # Add resource reservation for compile job
            resource_requests = []
            # Add memory reservation if specified
            if hasattr(self.args, 'memory') and self.args.memory is not None:
                memory_mb = self.args.memory * 1024  # Convert GB to MB
                resource_requests.append(f"rusage[mem={memory_mb}]")
                # Memory reservation configured for compile job
            # Add CPU selection for compile job (always request at least 1 CPU)
            cpu_cores = getattr(self.args, 'cpu_cores', 1)
            # Build resource request string with select and rusage
            resource_string = f"select[ncpus>={cpu_cores}]"
            # NOTE(review): resource_requests entries already contain
            # "rusage[mem=...]", so wrapping them in another "rusage[...]"
            # yields "rusage[rusage[mem=...]]" — likely malformed for bsub -R;
            # verify against the LSF resource requirement syntax.
            if resource_requests:
                resource_string += f" rusage[{','.join(resource_requests)}]"
            cmd.extend(["-R", resource_string])
            # Resource request configured for compile job
            # Set job name and output
            cmd.extend([
                "-J", f"pre_jobs",
                "make",
                f'pre_full_run',
                #f'DUT_VER={rtl_ver}',
                #f'die_num={dienum}',
                #f'WORK_DIR={output_dir}',
                f'mode={mode}'
                #f'p2_mode={mode}'
            ])
            if define is not None:
                cmd.extend([f'def+={define}'])
            # Compile command prepared
            # Submit job
            output = subprocess.check_output(cmd, shell=False)
            job_id = self.parse_job_id(output)
            result["job_id"] = job_id
            result["status"] = "SUBMITTED"
        except subprocess.CalledProcessError as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = str(e)
        except Exception as e:
            result["status"] = f"ERROR: {str(e)}"
        return result

    def gen_test_case(self, case: Dict, w_dir: str, log_dir: str, que: str, specified_seed: str = None) -> List[Dict]:
        """Generate test case commands: one bsub opcode dict per repeat.

        Each opcode carries the full command list, the case dict, the repeat
        index, the computed log path and the seed (as a string).
        NOTE(review): the log_dir parameter is unused — self.log_dir is used
        instead; confirm and remove or honor the parameter.
        """
        opcodes = []
        for repeat in range(case["repeat"]):
            # Construct bsub command
            cmd = ["bsub"]
            # Add LSF parameters
            cmd.extend(["-q", que])
            # Memory reservation handling
            memory_gb = None
            # Priority 1: Check if memory is specified in the test case JSON
            if "memory" in case and case["memory"]:
                try:
                    memory_gb = int(case["memory"])
                    # Using memory from JSON configuration
                except (ValueError, TypeError):
                    print(f" Warning: Invalid memory value in JSON: {case['memory']}")
            # Priority 2: Use command line argument if JSON doesn't have memory
            if memory_gb is None and hasattr(self.args, 'memory') and self.args.memory is not None:
                memory_gb = self.args.memory
                # Using memory from command line
            # Add resource reservation to bsub command
            resource_requests = []
            # Add memory reservation if specified
            if memory_gb:
                memory_mb = memory_gb * 1024  # Convert GB to MB
                resource_requests.append(f"rusage[mem={memory_mb}]")
                # Memory reservation configured
            # Add CPU selection (always request at least 1 CPU)
            cpu_cores = getattr(self.args, 'cpu_cores', 1)
            # Build resource request string with select and rusage
            resource_string = f"select[ncpus>={cpu_cores}]"
            # NOTE(review): same double-nesting as submit_compile — this builds
            # "rusage[rusage[mem=...]]"; verify against bsub -R syntax.
            if resource_requests:
                resource_string += f" rusage[{','.join(resource_requests)}]"
            cmd.extend(["-R", resource_string])
            # Resource request configured
            # Legacy memory handling (commented out in original)
            # if "memory" in case:
            #     if case["memory"] != "":
            #         cmd.extend(["-M", str(self.parse_memory(case["memory"]))])
            # Use specified seed if provided, otherwise generate unique seed
            if specified_seed is not None:
                seed = int(specified_seed)
                # Using specified seed for repeat
            else:
                # Generate unique seed for each test case, opts, and repeat
                # Include opts in seed generation to ensure different opts get different seeds
                # NOTE(review): hash() of a str is salted per process
                # (PYTHONHASHSEED), so these seeds are not reproducible across
                # runs — acceptable for randomization, but not for replay.
                opts_str = "_".join(case["opts"]) if case["opts"] else "no_opts"
                unique_seed_base = hash(case["name"] + opts_str + str(repeat) + str(int(time.time() * 1000)))
                seed = abs(unique_seed_base) % 10000
                # Generated seed for repeat with opts
            if "lmn" in case:
                lmn = case["lmn"]
            else:
                lmn = ""
            # Set job name and output
            cmd.extend([
                "-J", f"TEST_{case['name']}_{repeat}",
                "make",
                f'batch_run',
                f'tc={case["name"]}',
                f'pl=UVM_LOW',
                f'timestamp=N',
                f'timeout={case["timeout"]}',
                f'WORK_DIR={w_dir}',
                f'LOGDIR={str(self.log_dir)}',  # Point to the logs directory
                f'WAVEDIR={str(self.wave_dir)}',  # Add wave directory parameter
                f'wave={"fsdb" if self.args.wave else "null"}',
                f'seed={seed}',
                f'lmn={lmn}'
            ])
            # Add coverage parameter if specified
            if hasattr(self.args, 'cov') and self.args.cov:
                cmd.extend([f'cov={self.args.cov}'])
                # Coverage parameter configured
            # Debug: print timeout value
            # Test case timeout configured
            # Add VCS optimization options
            if hasattr(self.args, 'vcs_optimize') and self.args.vcs_optimize:
                vcs_cores = getattr(self.args, 'vcs_cores', 4)
                cmd.extend([f'opts+=+VCS_PARALLEL={vcs_cores}'])
                cmd.extend([f'opts+=+VCS_OPTIMIZE=1'])
                # VCS optimization configured
            if hasattr(self.args, 'vcs_xa') and self.args.vcs_xa:
                cmd.extend([f'opts+=+VCS_XA=1'])
                # VCS-XA acceleration configured
            # Add optional parameters
            # Adding opts configuration
            for opt in case["opts"]:
                if opt:  # Only add non-empty opts
                    cmd.extend([f'opts+=+{opt}'])
                    # Added opt configuration
            # Submit job - use the regression-specific log directory
            # Create test-specific log directory under logs/
            # Build log file name robustly to avoid extra underscores when fields are empty
            opts_str = "_".join([o for o in (case.get("opts") or []) if o])
            test_log_dir = self.log_dir / case['name']
            test_log_dir.mkdir(parents=True, exist_ok=True)
            name_parts = [case['name'], str(seed)]
            if opts_str:
                name_parts.append(opts_str)
            else:
                name_parts.append("no_opts")
            if lmn:
                name_parts.append(lmn)
            safe_name = "_".join(name_parts)
            log_file = str(test_log_dir / f"{safe_name}.log")
            # Log file configured
            opcodes.append({
                "cmd": cmd,
                "case": case,
                "id": repeat,
                "log_path": log_file,
                "seed": str(seed)
            })
        return opcodes

    def submit_test_case(self, opcode: Dict) -> Dict:
        """Submit one test to LSF.

        Runs the prepared bsub command, parses the job id, records the opcode
        in self.job_meta (keyed by job id) for later direct resubmission, and
        updates the matching TestResult via the name:config:seed:opts key.
        Returns a result dict with status SUBMITTED / SUBMIT_FAIL / ERROR:*.
        """
        result = {
            "name": opcode["case"].get("name", "unknown"),
            "status": "PENDING",
            "seed": opcode["seed"],
            "id": opcode["id"]
        }
        try:
            # Submit cmd with detailed error capture
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} Submitting job: {opcode['case'].get('name', 'unknown')} seed={opcode['seed']}")
            print(f"INFO: {timestamp} Command: {' '.join(opcode['cmd'])}")
            # Use subprocess.run to capture both stdout and stderr
            process = subprocess.run(opcode["cmd"], shell=False,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True,
                                     timeout=120)
            # Check return code
            if process.returncode != 0:
                result["status"] = "SUBMIT_FAIL"
                result["error"] = f"Command failed with return code {process.returncode}"
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL")
                print(f"INFO: {timestamp} Return code: {process.returncode}")
                print(f"INFO: {timestamp} STDOUT: {process.stdout}")
                print(f"INFO: {timestamp} STDERR: {process.stderr}")
                return result
            # Parse job ID from output
            job_id = self.parse_job_id(process.stdout.encode())
            if job_id == "UNKNOWN":
                result["status"] = "SUBMIT_FAIL"
                result["error"] = "Failed to parse job ID from LSF output"
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {job_id}] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL")
                print(f"INFO: {timestamp} LSF Output: {process.stdout}")
                print(f"INFO: {timestamp} LSF Error: {process.stderr}")
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                opts_str = "_".join(opcode['case'].get('opts', [])) if opcode['case'].get('opts') else "no_opts"
                print(f"INFO: {timestamp} [jobid {job_id}] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMITTED")
                result["job_id"] = job_id
                result["status"] = "SUBMITTED"
                result["error"] = ''
                result["stdout"] = process.stdout
                result["stderr"] = process.stderr
                if job_id not in self.submitted_jobs:
                    self.submitted_jobs.append(job_id)
                # Store additional test case info for later reference
                result["case_name"] = opcode['case'].get('name', 'unknown')
                result["case_seed"] = opcode['seed']
                # Persist the full opcode for this job so FAIL handling can resubmit directly
                try:
                    self.job_meta[str(job_id)] = copy.deepcopy(opcode)
                    result["opcode"] = self.job_meta[str(job_id)]
                except Exception:
                    # Best-effort; do not block on deepcopy issues
                    self.job_meta[str(job_id)] = opcode
                # Also update the corresponding TestResult object
                # Use the full unique key to find the correct TestResult
                test_name = opcode['case'].get('name', 'unknown')
                config = opcode['case'].get('config', 'default')
                seed = opcode['seed']
                opts_str = "_".join(opcode['case'].get('opts', [])) if opcode['case'].get('opts') else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                # Store unique_key alongside opcode for direct updates later
                try:
                    self.job_meta[str(job_id)]["unique_key"] = unique_key
                except Exception:
                    pass
                result["unique_key"] = unique_key
                if unique_key in self.results:
                    self.results[unique_key].job_id = job_id
                    self.results[unique_key].seed = seed
                    if opcode.get('log_path'):
                        self.results[unique_key].log_file = opcode['log_path']
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {job_id}")
                else:
                    print(f"Warning: TestResult not found for key: {unique_key}")
        except subprocess.TimeoutExpired as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = f"Command timeout: {str(e)}"
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL: Timeout")
        except subprocess.CalledProcessError as e:
            result["status"] = "SUBMIT_FAIL"
            result["error"] = str(e)
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} SUBMIT_FAIL: {str(e)}")
        except Exception as e:
            result["status"] = f"ERROR: {str(e)}"
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] {opcode['case'].get('name', 'unknown')} seed={opcode['seed']} ERROR: {str(e)}")
        return result

    def get_test_info_by_job_id(self, job_id: str) -> Optional[Dict]:
        """Get test case information by job ID.

        Searches submitted_results first, then self.results; returns a dict
        with 'name'/'seed'/'id' or None when the job id is unknown.
        """
        # First search through submitted results to find matching job_id
        for result in getattr(self, 'submitted_results', []):
            if result.get('job_id') == job_id:
                # Try multiple field names to find the test name
                test_name = result.get('case_name') or result.get('name')
                test_seed = result.get('case_seed') or result.get('seed')
                if test_name and test_seed:
                    return {
                        'name': test_name,
                        'seed': test_seed,
                        'id': result.get('id', 'unknown')
                    }
        # If not found in submitted_results, search through self.results
        for result_key, result in self.results.items():
            if hasattr(result, 'job_id') and result.job_id == job_id:
                return {
                    'name': result.name,
                    'seed': getattr(result, 'seed', 'unknown'),
                    'id': getattr(result, 'id', 'unknown')
                }
        # Debug info (muted)
        # print(f"DEBUG: Could not find test info for job_id {job_id}")
        # print(f"DEBUG: submitted_results count: {len(getattr(self, 'submitted_results', []))}")
        # print(f"DEBUG: self.results count: {len(self.results)}")
        # for i, result in enumerate(getattr(self, 'submitted_results', [])[:3]):
        #     print(f"DEBUG: submitted_results[{i}]: {result}")
        # for i, (key, result) in enumerate(list(self.results.items())[:3]):
        #     print(f"DEBUG: results[{i}] {key}: job_id={getattr(result, 'job_id', 'None')}")
        return None

    def _resubmit_from_stored_opcode(self, job_id: str):
        """Directly resubmit a failed job using the stored opcode, avoiding any name/seed lookup.

        Honors the --retry limit, backs up the original log, forces wave=fsdb
        on the retry, and tracks the retry under a '<unique_key>_retry_<n>' key.
        """
        stored = self.job_meta.get(str(job_id))
        if not stored:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] No stored opcode found; cannot direct-resubmit")
            return
        # CRITICAL FIX: Check retry limit before proceeding
        current_retry_attempt = stored.get('retry_attempt', 0)
        max_retries = getattr(self.args, 'retry', 0)
        if max_retries <= 0:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Retry disabled (max_retries={max_retries})")
            return
        if current_retry_attempt >= max_retries:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Max retries reached ({current_retry_attempt}/{max_retries}), stopping retry")
            return
        try:
            # Clone and mark as retry while keeping same seed/opts/config
            opcode = copy.deepcopy(stored)
            opcode['retry_attempt'] = current_retry_attempt + 1
            opcode['retry_seed'] = opcode.get('seed', 'unknown')
        except Exception:
            # Fall back to mutating the stored opcode in place if deepcopy fails.
            opcode = stored
            opcode['retry_attempt'] = current_retry_attempt + 1
            opcode['retry_seed'] = opcode.get('seed', 'unknown')
        # Backup the original log before resubmitting to avoid it being overwritten by the retry
        try:
            original_log_path = self.get_test_log_path_by_job_id(job_id)
            if original_log_path and os.path.exists(original_log_path):
                p = Path(original_log_path)
                backup_path = p.with_name(p.stem + '_bak.log') if p.suffix == '.log' else Path(str(p) + '_bak.log')
                if not backup_path.exists():
                    os.rename(str(p), str(backup_path))
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Backed up log: {p} -> {backup_path}")
        except Exception as _e:
            print(f"WARNING: Failed to backup log for job {job_id}: {_e}")
        # Force wave dump on retry: ensure wave=fsdb in command
        try:
            cmd_list = opcode.get('cmd', [])
            if isinstance(cmd_list, list):
                replaced = False
                for i, token in enumerate(cmd_list):
                    if isinstance(token, str) and token.startswith('wave='):
                        if token != 'wave=fsdb':
                            cmd_list[i] = 'wave=fsdb'
                        replaced = True
                        break
                if not replaced:
                    cmd_list.append('wave=fsdb')
                # Keep rerun tag consistent where applicable for standard resubmits (no harm if already present)
                if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list):
                    cmd_list.append('lmn=rerun')
                opcode['cmd'] = cmd_list
        except Exception:
            pass
        # Ensure corresponding TestResult exists/updated based on unique_key (bypass name/seed mapping)
        unique_key = stored.get('unique_key')
        if not unique_key:
            # Fallback: compute from opcode
            test_name = opcode.get('case', {}).get('name', 'unknown')
            config = opcode.get('case', {}).get('config', 'default')
            seed_val = opcode.get('seed', 'unknown')
            opts_str = "_".join(opcode.get('case', {}).get('opts', [])) if opcode.get('case', {}).get('opts') else "no_opts"
            unique_key = f"{test_name}:{config}:{seed_val}:{opts_str}"
        else:
            # Parse parts just for constructing missing TestResult if needed
            try:
                name_part, config_part, seed_part, _ = unique_key.split(':', 3)
            except ValueError:
                name_part = opcode.get('case', {}).get('name', 'unknown')
                config_part = opcode.get('case', {}).get('config', 'default')
                seed_part = opcode.get('seed', 'unknown')
        # Create or update TestResult entry directly by unique_key
        try:
            if unique_key not in self.results:
                # Build a new TestResult with available meta
                new_name = locals().get('name_part', opcode.get('case', {}).get('name', 'unknown'))
                new_cfg = locals().get('config_part', opcode.get('case', {}).get('config', 'default'))
                self.results[unique_key] = TestResult(new_name, new_cfg)
                self.results[unique_key].seed = locals().get('seed_part', opcode.get('seed', 'unknown'))
                self.results[unique_key].opts = opcode.get('case', {}).get('opts', [])
                try:
                    self.results[unique_key].estimated_duration = self.estimate_test_duration(new_name, new_cfg)
                except Exception:
                    pass
            else:
                # Update existing TestResult for retry
                existing_result = self.results[unique_key]
                existing_result.retry_count = opcode.get('retry_attempt', 1)
                existing_result.is_retry = True
                existing_result.retry_started = True
                # Create a new unique key for retry cases to track them separately
                retry_unique_key = f"{unique_key}_retry_{opcode.get('retry_attempt', 1)}"
                if retry_unique_key not in self.results:
                    retry_name = locals().get('name_part', opcode.get('case', {}).get('name', 'unknown'))
                    retry_cfg = locals().get('config_part', opcode.get('case', {}).get('config', 'default'))
                    retry_result = TestResult(retry_name, retry_cfg)
                    retry_result.seed = locals().get('seed_part', opcode.get('seed', 'unknown'))
                    retry_result.opts = opcode.get('case', {}).get('opts', [])
                    retry_result.retry_count = opcode.get('retry_attempt', 1)
                    retry_result.is_retry = True
                    retry_result.retry_started = True
                    try:
                        retry_result.estimated_duration = self.estimate_test_duration(retry_name, retry_cfg)
                    except Exception:
                        pass
                    self.results[retry_unique_key] = retry_result
        except Exception:
            pass
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        case_name = opcode.get('case', {}).get('name', 'unknown')
        case_seed = opcode.get('seed', 'unknown')
        print(f"INFO: {timestamp} [jobid {job_id}] Directly resubmitting FAIL case {case_name} seed={case_seed} (retry {opcode.get('retry_attempt', 1)}/{max_retries})")
        # Submit the retry job
        result = self.submit_test_case(opcode)
        if result["status"] == "SUBMITTED":
            # Store the retry job metadata
            retry_job_id = result["job_id"]
            # NOTE(review): retry_unique_key is only assigned in the
            # 'unique_key in self.results' branch above; if the key was newly
            # created (or that branch raised), this line raises NameError —
            # bug to fix.
            opcode['unique_key'] = retry_unique_key
            self.job_meta[str(retry_job_id)] = opcode
            # Update the retry TestResult with the new job ID
            if retry_unique_key in self.results:
                self.results[retry_unique_key].job_id = retry_job_id
                self.results[retry_unique_key].status = "PENDING"
            # Add to submitted jobs list
            if retry_job_id not in self.submitted_jobs:
                self.submitted_jobs.append(retry_job_id)
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {retry_job_id}] {case_name} seed={case_seed} Submitted (retry {opcode.get('retry_attempt', 1)}/{max_retries})")
        else:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] Retry submission failed: {result.get('error', 'Unknown error')}")
def update_test_result_with_job_info(self, test_name: str, job_id: str, seed: str, log_file: str = None):
    """Update TestResult object with job information"""
    # Matches on test name only, so the first result with this name wins;
    # NOTE(review): with repeated cases this may update a different seed's entry — confirm.
    for result_key, result in self.results.items():
        if result.name == test_name:
            result.job_id = job_id
            # Store seed information (only if not already present)
            if not hasattr(result, 'seed'):
                result.seed = seed
            # Store log file path
            if log_file:
                result.log_file = log_file
            # Update status to RUNNING when job is submitted
            if result.status == "PENDING":
                result.start()  # This will set status to RUNNING and start_time
            break

def update_test_result_status(self, job_id: str, status: str):
    """Update test result status in self.results"""
    # Find the test result by job_id
    for result_key, result in self.results.items():
        if hasattr(result, 'job_id') and result.job_id == job_id:
            result.finish(status, "")
            break
    # Also check in submitted_results (list of submit dicts, keyed by 'job_id')
    for result in getattr(self, 'submitted_results', []):
        if result.get('job_id') == job_id:
            result['status'] = status
            break

def _update_test_result_status(self, unique_key: str, job_id: str, seed: str, status: str = "PENDING"):
    """Helper method to update TestResult status consistently.

    Returns True when the key exists in self.results, False otherwise.
    """
    if unique_key in self.results:
        self.results[unique_key].job_id = job_id
        self.results[unique_key].seed = seed
        self.results[unique_key].status = status
        return True
    else:
        print(f"Warning: TestResult not found for key: {unique_key}")
        return False

def get_test_status_display(self, job_id: str, status: str) -> str:
    """Get formatted test status display string"""
    test_info = self.get_test_info_by_job_id(job_id)
    if test_info:
        return f"[jobid {job_id}] {test_info['name']} seed={test_info['seed']}"
    else:
        # Unknown mapping happens when TestResult hasn't been recorded yet; mute noisy label
        return f"[jobid {job_id}]"

def parse_job_id(self, output: bytes) -> str:
    """Parse job ID from LSF output; returns "UNKNOWN" when no ID is found."""
    try:
        output_str = output.decode('utf-8')
        # Look for pattern like "Job <12345> is submitted to queue <queue_name>"
        match = re.search(r'Job <(\d+)>', output_str)
        if match:
            return match.group(1)
        else:
            return "UNKNOWN"
    except Exception:
        return "UNKNOWN"

def parse_memory(self, memory_str: str) -> int:
    """Parse memory string ("4GB"/"512MB"/plain number) to MB."""
    try:
        if memory_str.endswith('GB'):
            return int(float(memory_str[:-2]) * 1024)
        elif memory_str.endswith('MB'):
            return int(float(memory_str[:-2]))
        else:
            return int(memory_str)
    except Exception:
        return 4000  # Default to 4GB

def run_compile_and_regression(self, dienum: str, rtl_ver: str, mode: str, define: str = None) -> bool:
    """Run complete compile and regression flow, returns True if successful, False if failed"""
    print(f"{Colors.BLUE}=== Starting Compile and Regression Flow ==={Colors.END}")
    # Build output directory path: self.args.output_dir
    output_dir = self.args.output_dir
    print(f"Using compile output directory: {output_dir}")
    print(f"Using regression directory: {self.regression_dir}")
    # Check if compile should be skipped
    skip_compile = self.should_skip_compile()
    if skip_compile:
        print(f"{Colors.YELLOW}Compile step is set to be bypassed{Colors.END}")
        # Check if compile files already exist
        if self.check_compile_files_exist(output_dir, dienum, rtl_ver, mode):
            print(f"{Colors.GREEN}Existing compile files found, skipping compile step{Colors.END}")
            compile_required = False
        else:
            print(f"{Colors.YELLOW}No existing compile files found, compile step is required{Colors.END}")
            compile_required = True
    else:
        compile_required = True
    # Step 1: Submit compile job (if required)
    if compile_required:
        print(f"Step 1: Submitting compile job...")
        compile_result = self.submit_compile(self.args.queue, dienum, rtl_ver, mode, define)
        if compile_result["status"] == "SUBMITTED":
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {compile_result['job_id']}] compile_job SUBMITTED")
            # Wait for compile job to complete
            print(f"Waiting for compile job to complete...")
            compile_success = self.wait_for_job_completion(compile_result["job_id"])
            # If compile failed, exit immediately
            if not compile_success:
                print(f"{Colors.RED}Compilation failed! Exiting without running regression tests.{Colors.END}")
                return False
            # Verify compile was successful by checking for output files
            if not self.check_compile_files_exist(output_dir, dienum, rtl_ver, mode):
                print(f"{Colors.RED}Error: Compile job completed but no output files found{Colors.END}")
                print(f"{Colors.RED}Compilation verification failed! Exiting without running regression tests.{Colors.END}")
                return False
            print(f"{Colors.GREEN}Compile job completed successfully{Colors.END}")
        else:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid UNKNOWN] compile_job SUBMIT_FAIL: {compile_result.get('error', 'Unknown error')}")
            return False
    else:
        print(f"Step 1: Compile step skipped (bypass enabled and files exist)")
    # Step 2: Generate and submit test cases
    print(f"Step 2: Generating and submitting test cases...")
    # Load test cases from one of three sources: failed-regression file, named list, or default json_list
    if hasattr(self.args, 'failed_regression') and self.args.failed_regression:
        # Load from failed regression file
        print(f"Loading failed test cases from: {self.args.failed_regression}")
        test_cases = self.load_failed_regression_cases(self.args.failed_regression)
        if not test_cases:
            print(f"{Colors.RED}Error: No test cases loaded from failed regression file{Colors.END}")
            return False
        print(f"Loaded {len(test_cases)} failed test cases for re-run")
    elif hasattr(self.args, 'list') and self.args.list:
        # Load from regression list file in ../def/case_def/ directory
        regression_list_path = os.path.join(os.getcwd(), "..", "def", "case_def", self.args.list)
        print(f"Loading test cases from regression list: {regression_list_path}")
        test_cases = self.load_regression_list_cases(regression_list_path)
        if not test_cases:
            print(f"{Colors.RED}Error: No test cases loaded from regression list file{Colors.END}")
            return False
        print(f"Loaded {len(test_cases)} test cases from regression list")
    else:
        # Load from normal test files
        cur_path = os.getcwd()
        test_file_list_name = cur_path + "/../def/json_list"
        test_file_list = []
        with open(test_file_list_name, 'r') as f:
            for line in f:
                if line != '\n':
                    file_path = cur_path + "/../def" + line
                    test_file_list.append(file_path.replace('\n', ''))
        test_cases = self.load_test_cases(test_file_list)
        print(f"Loaded {len(test_cases)} test cases from files")
    # Filter test cases by group (only if groups are specified)
    if self.args.groups:
        test_cases = self.filter_cases(test_cases, self.args.groups)
        print(f"Filtered to {len(test_cases)} test cases for groups: {self.args.groups}")
    else:
        print(f"No group filter applied, using all {len(test_cases)} test cases")
    # Generate test case commands
    all_opcodes = []
    for case in test_cases:
        print(f"Processing test case: {case['name']} with repeat={case.get('repeat', 1)}")
        # Use regression-specific log directory for simulation output
        sim_output_dir = str(self.regression_dir)
        print(f" Using simulation output directory: {sim_output_dir}")
        opcodes = self.gen_test_case(case, output_dir, sim_output_dir, self.args.queue)
        all_opcodes.extend(opcodes)
    # Remove duplicate opcodes based on unique identifier
    print(f"Generated {len(all_opcodes)} test case commands")
    print("Removing duplicate opcodes...")
    # Create a set to track unique identifiers
    seen_identifiers = set()
    unique_opcodes = []
    for opcode in all_opcodes:
        # Create a unique identifier for each opcode
        # Combine test name, seed, and repeat ID to ensure uniqueness
        unique_id = f"{opcode['case']['name']}_{opcode['seed']}_{opcode['id']}"
        if unique_id not in seen_identifiers:
            seen_identifiers.add(unique_id)
            unique_opcodes.append(opcode)
        else:
            # print(f" Skipping duplicate: {unique_id}")
            pass
    all_opcodes = unique_opcodes
    print(f"After removing duplicates: {len(all_opcodes)} unique test case commands")
    # Initialize test results for all test cases - ensure no duplicates
    print(f"Initializing test results for {len(all_opcodes)} opcodes...")
    unique_test_keys = set()
    for opcode in all_opcodes:
        test_name = opcode["case"].get("name", "unknown")
        config = opcode["case"].get("config", "default")
        seed = opcode["seed"]
        opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
        # Create a unique key that combines test name, config, seed, and opts
        unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
        if unique_key not in unique_test_keys:
            unique_test_keys.add(unique_key)
            if unique_key not in self.results:
                self.results[unique_key] = TestResult(test_name, config)
                self.results[unique_key].seed = seed
                self.results[unique_key].opts = opcode["case"].get("opts", [])
                self.results[unique_key].estimated_duration = self.estimate_test_duration(test_name, config)
                # print(f" Created TestResult for: {unique_key}")
            else:
                # print(f" TestResult already exists for: {unique_key}")
                pass
        else:
            # print(f" Skipping duplicate opcode for: {unique_key}")
            pass
    print(f"Initialized {len(unique_test_keys)} unique test results")
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    print(f"INFO: {timestamp} Submitting test cases with max concurrent limit: {self.args.max_concurrent}")
    # Initial job submission up to max_concurrent
    initial_submit_count = min(self.args.max_concurrent, len(all_opcodes))
    sim_cases_num = len(all_opcodes)
    print(f"Initial submission: will submit {initial_submit_count} jobs")
    for _ in range(initial_submit_count):
        if all_opcodes:
            opcode = all_opcodes.pop(0)
            result = self.submit_test_case(opcode)
            self.submitted_results.append(result)
            if result["status"] == "SUBMITTED":
                # Don't increment running_jobs yet - wait for actual RUN status
                if result["job_id"] not in self.submitted_jobs:
                    self.submitted_jobs.append(result["job_id"])
                # Update corresponding TestResult object
                test_name = opcode["case"].get("name", "unknown")
                config = opcode["case"].get("config", "default")
                seed = opcode["seed"]
                opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                if self._update_test_result_status(unique_key, result["job_id"], seed, "PENDING"):
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {result['job_id']} in initial submission")
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {result['job_id']}] {result['name']} seed={result.get('seed', 'unknown')} PENDING")
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {result['name']} seed={result.get('seed', 'unknown')} SUBMIT_FAIL: {result.get('error', 'Unknown error')}")
            # Add delay between submissions to prevent server overload
            time.sleep(2)
    print(f"Initial submission complete. Submitted jobs: {len(self.submitted_jobs)}, Pending opcodes: {len(all_opcodes)}")
    # Clean any duplicate job IDs that may have been added
    self._clean_submitted_jobs()
    # CRITICAL FIX: Initialize job status counts after initial submission
    if self.submitted_jobs:
        print(f"Initializing job status counts for {len(self.submitted_jobs)} submitted jobs...")
        # Check current status of all submitted jobs
        initial_status_changes = {}
        for job_id in self.submitted_jobs:
            status = self.check_lsf_job_status(int(job_id))
            initial_status_changes[int(job_id)] = status
            print(f"DEBUG: Job {job_id} initial status: {status}")
        # Update counts based on initial status
        self._update_job_status_counts(initial_status_changes)
        print(f"DEBUG: After initialization - running_jobs: {self.running_jobs}, pending_jobs: {self.pending_jobs}")
    # Main loop: monitor jobs and submit new ones as slots become available
    last_status_print_time = time.time()  # Track last status print time
    last_status_log_time = time.time()  # Track last RERUN status log time (60s)
    # Start status monitoring thread for LSF regression
    self.status_thread = threading.Thread(target=self._status_print_thread, daemon=True)
    self.status_thread.start()
    print(f"{Colors.BLUE}Started status monitoring thread for real-time report updates{Colors.END}")
    # Start the job-monitor thread early, so a blocked main loop cannot starve
    # the later monitor phase of timely coverage
    try:
        if not hasattr(self, 'monitor_thread') or not getattr(self, 'monitor_thread', None) or not self.monitor_thread.is_alive():
            # Do not pass the shared list reference; let the function pick up the live list
            self.monitor_thread = threading.Thread(target=self.monitor_all_jobs, daemon=True)
            self.monitor_thread.start()
            print(f"{Colors.BLUE}Started early monitor_all_jobs thread{Colors.END}")
    except Exception:
        pass
    # DEBUG: Print loop condition values
    print(f"DEBUG: Loop condition check - all_opcodes: {len(all_opcodes)}, running_jobs: {self.running_jobs}, pending_jobs: {self.pending_jobs}")
    print(f"DEBUG: Loop condition result: {bool(all_opcodes or (self.running_jobs > 0 or self.pending_jobs > 0))}")
    while all_opcodes or (self.running_jobs > 0 or self.pending_jobs > 0):
        # Check for completed jobs and update status counts
        monitor_alive = False
        try:
            monitor_alive = hasattr(self, 'monitor_thread') and self.monitor_thread and self.monitor_thread.is_alive()
        except Exception:
            monitor_alive = False
        if self.submitted_jobs and not monitor_alive:
            # Lightweight accounting only; lifecycle handled by monitor thread
            status_changes = {}
            for job_id in self.submitted_jobs[:]:
                status_changes[job_id] = self.check_lsf_job_status(int(job_id))
            self._update_job_status_counts(status_changes)
            try:
                self.update_real_time_report()
            except Exception:
                pass
            # Print status summary similar to the image format
            try:
                total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                total_test_cases = self.get_total_test_cases_count()
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} Status: Running={self.running_jobs}, Pending={self.pending_jobs}, Had Rerun={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}, Total={total_test_cases}")
            except Exception:
                pass
        elif monitor_alive:
            # When monitor thread owns job lifecycle, perform a lightweight status update only
            try:
                self._update_lsf_job_statuses()
                self.update_real_time_report()
            except Exception:
                pass
        # Submit new jobs if we have capacity and pending opcodes
        while all_opcodes and self.running_jobs < self.args.max_concurrent:
            opcode = all_opcodes.pop(0)
            result = self.submit_test_case(opcode)
            self.submitted_results.append(result)
            if result["status"] == "SUBMITTED":
                # Don't increment running_jobs yet - wait for actual RUN status
                if result["job_id"] not in self.submitted_jobs:
                    self.submitted_jobs.append(result["job_id"])
                # Update corresponding TestResult object
                test_name = opcode["case"].get("name", "unknown")
                config = opcode["case"].get("config", "default")
                seed = opcode["seed"]
                opts_str = "_".join(opcode["case"].get("opts", [])) if opcode["case"].get("opts") else "no_opts"
                unique_key = f"{test_name}:{config}:{seed}:{opts_str}"
                if self._update_test_result_status(unique_key, result["job_id"], seed, "PENDING"):
                    print(f"DEBUG: Updated TestResult {unique_key} with job_id {result['job_id']} in main loop")
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {result['job_id']}] {result['name']} seed={result.get('seed', 'unknown')} PENDING")
                # Show regression status after each submission
                total_test_cases = self.get_total_test_cases_count()
                self.show_regression_status(self.running_jobs, self.pending_jobs, total_test_cases)
            else:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid UNKNOWN] {result['name']} seed={result.get('seed', 'unknown')} SUBMIT_FAIL: {result.get('error', 'Unknown error')}")
            # Add delay between submissions to prevent server overload
            time.sleep(2)
        # If we're waiting for jobs to complete, add a small delay and show status every 30 seconds
        if (self.running_jobs > 0 or self.pending_jobs > 0) and not all_opcodes:
            current_time = time.time()
            # Show regression status every 30 seconds
            if current_time - last_status_print_time >= 30:
                # Update LSF job statuses before showing status
                self._update_lsf_job_statuses()
                total_test_cases = self.get_total_test_cases_count()
                self.show_regression_status(self.running_jobs, self.pending_jobs, total_test_cases)
                last_status_print_time = current_time
                # Update real-time report every 30 seconds
                self.update_real_time_report()
            # Log detailed status with RERUNS every 60 seconds
            if current_time - last_status_log_time >= 60:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                try:
                    total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                except Exception:
                    total_reruns = 0
                try:
                    pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                    fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                    rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                    rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                except Exception:
                    pass_count = fail_count = rerun_pass_count = rerun_fail_count = 0
                print(f"INFO: {timestamp} Status: RUNNING={self.running_jobs}, PENDING={self.pending_jobs}, Total={sim_cases_num}, RERUNS={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}")
                last_status_log_time = current_time
            time.sleep(3)
    # Step 3: Monitor all jobs
    print(f"Step 3: Monitoring all jobs...")
    # Show final submission summary
    successful_submissions = len([r for r in self.submitted_results if r["status"] == "SUBMITTED"])
    failed_submissions = len([r for r in self.submitted_results if r["status"] != "SUBMITTED"])
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    print(f"INFO: {timestamp} Submission Summary: {successful_submissions} successful, {failed_submissions} failed")
    # Clean any duplicate job IDs before final status check
    self._clean_submitted_jobs()
    # Final status check: ensure all submitted jobs have their final status
    print(f"Performing final status check for all submitted jobs...")
    for job_id in self.submitted_jobs[:]:  # Use slice copy
        status = self.check_lsf_job_status(int(job_id))
        if status in ["DONE", "EXIT", "TERM", "KILL"]:
            # Update corresponding TestResult object
            test_info = self.get_test_info_by_job_id(job_id)
            if test_info:
                test_name = test_info['name']
                seed = test_info['seed']
                # Find the correct TestResult object by searching through all results
                found_result = None
                for result_key, result_obj in self.results.items():
                    if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed:
                        found_result = result_obj
                        break
                if found_result:
                    if status == "DONE":
                        # CRITICAL FIX: Even for DONE status, check log file for errors
                        log_file_path = self.get_test_log_path_by_job_id(job_id)
                        if log_file_path and os.path.exists(log_file_path):
                            test_passed = self.check_test_result(log_file_path)
                            has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                            if test_passed and not has_runtime_errors:
                                found_result.finish("PASS", "")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} PASS")
                            elif test_passed and has_runtime_errors:
                                # Test passed but had runtime errors - mark as FAIL with error info
                                error_msg = "Test passed but had runtime errors (running but had error)"
                                found_result.finish("FAIL", error_msg)
                                found_result.error_detected = True
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} FAIL (running but had error)")
                            else:
                                found_result.finish("FAIL", "Test failed (from log file)")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} FAIL")
                        else:
                            # No log file available, assume PASS for DONE status
                            found_result.finish("PASS", "")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} PASS (no log file)")
                    else:
                        # When LSF reports EXIT/TERM/KILL, first check whether the simulation
                        # log already shows a PASS before declaring failure
                        log_file_path = self.get_test_log_path_by_job_id(job_id)
                        if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
                            found_result.finish("PASS", "")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} PASS")
                        else:
                            found_result.finish("FAIL", f"Job status: {status}")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} [jobid {job_id}] {test_name} seed={seed} FAIL")
                else:
                    print(f"Warning: TestResult not found for {test_name} seed={seed}")
            self.submitted_jobs.remove(job_id)
    # Additional check: ensure all TestResult objects have correct status
    # print(f"Performing additional status validation for all test results...")
    for result_key, result_obj in self.results.items():
        if result_obj.status == "RUNNING" and hasattr(result_obj, 'job_id') and result_obj.job_id:
            # Check if this job is actually completed
            try:
                status = self.check_lsf_job_status(int(result_obj.job_id))
                if status in ["DONE", "EXIT", "TERM", "KILL"]:
                    if status == "DONE":
                        # CRITICAL FIX: Even for DONE status, check log file for errors
                        log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
                        if log_file_path and os.path.exists(log_file_path):
                            test_passed = self.check_test_result(log_file_path)
                            has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                            if test_passed and not has_runtime_errors:
                                result_obj.finish("PASS", "")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> PASS")
                            elif test_passed and has_runtime_errors:
                                # Test passed but had runtime errors - mark as FAIL
                                error_msg = "Test passed but had runtime errors (running but had error)"
                                result_obj.finish("FAIL", error_msg)
                                result_obj.error_detected = True
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> FAIL (running but had error)")
                            else:
                                result_obj.finish("FAIL", "Test failed (from log file)")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> FAIL")
                        else:
                            # No log file available, assume PASS for DONE status
                            result_obj.finish("PASS", "")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> PASS (no log file)")
                    else:
                        # Again, trust the simulation log as the source of truth
                        log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
                        if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
                            result_obj.finish("PASS", "")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> PASS (from log)")
                        else:
                            result_obj.finish("FAIL", f"Job status: {status}")
                            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Status corrected: {result_obj.name} -> FAIL")
            except Exception as e:
                print(f"Warning: Could not check status for job {result_obj.job_id}: {e}")
    # Final validation: ensure no RUNNING status remains if all jobs are done
    if len(self.submitted_jobs) == 0:
        print(f"All jobs completed, ensuring no RUNNING status remains...")
        for result_key, result_obj in self.results.items():
            if result_obj.status == "RUNNING":
                # If job is not in submitted_jobs but status is RUNNING,
                # it means the job completed but status wasn't updated
                if hasattr(result_obj, 'job_id') and result_obj.job_id:
                    try:
                        status = self.check_lsf_job_status(int(result_obj.job_id))
                        if status == "DONE":
                            # CRITICAL FIX: Even for DONE status, check log file for errors
                            log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
                            if log_file_path and os.path.exists(log_file_path):
                                test_passed = self.check_test_result(log_file_path)
                                has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                                if test_passed and not has_runtime_errors:
                                    result_obj.finish("PASS", "")
                                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> PASS")
                                elif test_passed and has_runtime_errors:
                                    # Test passed but had runtime errors - mark as FAIL
                                    error_msg = "Test passed but had runtime errors (running but had error)"
                                    result_obj.finish("FAIL", error_msg)
                                    result_obj.error_detected = True
                                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> FAIL (running but had error)")
                                else:
                                    result_obj.finish("FAIL", "Test failed (from log file)")
                                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> FAIL")
                            else:
                                # No log file available, assume PASS for DONE status
                                result_obj.finish("PASS", "")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> PASS (no log file)")
                        elif status in ["EXIT", "TERM", "KILL"]:
                            log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
                            if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
                                result_obj.finish("PASS", "")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> PASS (from log)")
                            else:
                                result_obj.finish("FAIL", f"Job status: {status}")
                                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> FAIL")
                    except Exception as e:
                        print(f"Warning: Could not check final status for job {result_obj.job_id}: {e}")
                else:
                    # No job_id, mark as PENDING
                    result_obj.status = "PENDING"
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} Final status correction: {result_obj.name} -> PENDING (no job_id)")
    # ENHANCED: Final status refresh - recheck all test results based on log files
    print(f"{Colors.BLUE}Performing final status refresh based on log files...{Colors.END}")
    self._final_status_refresh()
    self.monitor_all_jobs(self.submitted_jobs)
    # Stop all monitoring threads after all jobs are completed
    print(f"{Colors.BLUE}Stopping all monitoring threads...{Colors.END}")
    self._stop_status_thread = True
    if hasattr(self, 'status_thread') and self.status_thread.is_alive():
        self.status_thread.join(timeout=5)
        print(f"{Colors.BLUE}Status monitoring thread stopped{Colors.END}")
    # Return True to indicate successful completion
    return True

def wait_for_job_completion(self, job_id: str) -> bool:
    """Wait for a specific job to complete, returns True if successful, False if failed"""
    print(f"Waiting for job {job_id} to complete...")
    unknown_count = 0
    max_unknown_threshold = 20  # Allow more UNKNOWN status for compile jobs
    while True:
        status = self.check_lsf_job_status(int(job_id))
        if status == "DONE":
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] compile_job PASS :)")
            return True  # Compilation successful
        elif status in ["EXIT", "TERM", "KILL"]:
            timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
            print(f"INFO: {timestamp} [jobid {job_id}] compile_job FAIL :(")
            return False  # Compilation failed
        elif status == "UNKNOWN":
            unknown_count += 1
            if unknown_count >= max_unknown_threshold:
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} [jobid {job_id}] compile_job FAIL: unknown_lsf_status")
                return False  # Compilation failed due to unknown status
            print(f"Job {job_id} status: {status} (unknown count: {unknown_count})")
        else:
            # Reset unknown counter for other statuses
            unknown_count = 0
            print(f"Job {job_id} status: {status}")
        time.sleep(10)
    # NOTE(review): the lines below are unreachable — every exit from the
    # "while True" loop above is a return statement. Kept as-is; confirm intent.
    # Add a small delay after job completion to ensure all files are written
    print(f"Job {job_id} completed with status: {status}, waiting 5 seconds for file system sync...")
    time.sleep(5)

def monitor_all_jobs(self, job_list=None):
    """Poll LSF for all submitted jobs until none remain, deciding PASS/FAIL
    from job status and simulation logs, and triggering retries on failure.
    Defaults to monitoring the live self.submitted_jobs list."""
    if job_list is None:
        job_list = self.submitted_jobs
    self._clean_submitted_jobs()
    print(f"Monitoring {len(job_list)} submitted jobs...")
    max_unknown_count = {}       # per-job streak of consecutive UNKNOWN statuses
    max_unknown_threshold = 10   # give up on a job after this many UNKNOWNs
    while job_list:
        completed_jobs = []
        job_ids = [int(job_id) for job_id in job_list]
        status_map = self.batch_check_job_status(job_ids)
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        # Show lightweight progress for RUN/PEND
        for job_id, status in status_map.items():
            if status in ["RUN", "PEND"]:
                print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} {status}")
        for job_id in list(job_list):
            int_job_id = int(job_id)
            status = status_map.get(int_job_id, "UNKNOWN")
            if status == "DONE":
                # Decide by log contents
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                test_passed = False
                has_runtime_errors = False
                if log_file_path and os.path.exists(log_file_path):
                    test_passed = self.check_test_result(log_file_path)
                    has_runtime_errors = self._check_for_runtime_errors(log_file_path)
                test_info = self.get_test_info_by_job_id(job_id)
                found_result = None
                if test_info:
                    test_name = test_info['name']
                    seed = test_info['seed']
                    for _, result_obj in self.results.items():
                        if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed:
                            found_result = result_obj
                            break
                # NOTE(review): found_result can still be None here; the finish()
                # calls below would then raise AttributeError — confirm.
                if test_passed and not has_runtime_errors:
                    # Update status considering retry context
                    if found_result and found_result.is_retry:
                        found_result.finish("RERUN PASS", "")
                    else:
                        found_result.finish("PASS", "")
                    self.update_test_result_status(job_id, "PASS")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} PASS")
                    completed_jobs.append(job_id)
                elif test_passed and has_runtime_errors:
                    # Test passed but had runtime errors - mark as FAIL
                    error_msg = "Test passed but had runtime errors (running but had error)"
                    if found_result and found_result.is_retry:
                        found_result.finish("RERUN FAIL", error_msg)
                    else:
                        found_result.finish("FAIL", error_msg)
                        found_result.error_detected = True
                    self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} FAIL (running but had error)")
                    # Centralized retry on FAIL - let _resubmit_from_stored_opcode handle retry limits
                    if getattr(self.args, 'retry', 0) > 0:
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
                else:
                    # Test failed - mark as FAIL
                    if found_result and found_result.is_retry:
                        found_result.finish("RERUN FAIL", "DONE but log indicates failure")
                    else:
                        found_result.finish("FAIL", "DONE but log indicates failure")
                    self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, 'DONE')} FAIL")
                    # Centralized retry on FAIL - let _resubmit_from_stored_opcode handle retry limits
                    if getattr(self.args, 'retry', 0) > 0:
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
            elif status in ["EXIT", "TERM", "KILL"]:
                # Prefer log PASS override if available
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                # NOTE(review): found_result is only assigned in the DONE branch —
                # if this job never hit DONE in this iteration, the references below
                # may raise NameError or reuse a previous job's result. Confirm.
                if log_file_path and os.path.exists(log_file_path) and self.check_test_result(log_file_path):
                    # Update status considering retry context
                    if found_result and found_result.is_retry:
                        found_result.finish("PASS", "")
                    else:
                        self.update_test_result_status(job_id, "PASS")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} PASS")
                    completed_jobs.append(job_id)
                else:
                    # Update status considering retry context
                    if found_result and found_result.is_retry:
                        found_result.finish("FAIL", f"Job status: {status}")
                    else:
                        self.update_test_result_status(job_id, "FAIL")
                    print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL")
                    if getattr(self.args, 'retry', 0) > 0:
                        # Let _resubmit_from_stored_opcode handle retry limits
                        self._resubmit_from_stored_opcode(job_id)
                    completed_jobs.append(job_id)
            elif status == "UNKNOWN":
                # Use log to decide if possible; else threshold-based retry
                try:
                    log_file_path = self.get_test_log_path_by_job_id(job_id)
                except Exception:
                    log_file_path = None
                if log_file_path and os.path.exists(log_file_path):
                    if self.check_test_result(log_file_path):
                        self.update_test_result_status(job_id, "PASS")
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} PASS (from log)")
                        completed_jobs.append(job_id)
                    else:
                        self.update_test_result_status(job_id, "FAIL")
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL (from log)")
                        if getattr(self.args, 'retry', 0) > 0:
                            # Retry directly from stored opcode
                            self._resubmit_from_stored_opcode(job_id)
                        completed_jobs.append(job_id)
                else:
                    # Track UNKNOWN streak
                    if job_id not in max_unknown_count:
                        max_unknown_count[job_id] = 0
                    max_unknown_count[job_id] += 1
                    if max_unknown_count[job_id] >= max_unknown_threshold:
                        print(f"INFO: {timestamp} {self.get_test_status_display(job_id, status)} FAIL: unknown_lsf_status")
                        if getattr(self.args, 'retry', 0) > 0:
                            self._resubmit_from_stored_opcode(job_id)
                        completed_jobs.append(job_id)
            else:
                # Reset UNKNOWN counter for stable statuses
                if job_id in max_unknown_count:
                    max_unknown_count[job_id] = 0
        # Remove completed jobs from monitoring list
        for job_id in completed_jobs:
            if job_id in job_list:
                job_list.remove(job_id)
            if job_id in max_unknown_count:
                del max_unknown_count[job_id]
        # Recalculate running/pending counts from current statuses
        try:
            remaining_ids = [int(j) for j in job_list]
            # Build a fresh status map limited to remaining jobs
            remaining_statuses = {jid: status_map.get(jid, "UNKNOWN") for jid in remaining_ids}
            self.running_jobs = sum(1 for s in remaining_statuses.values() if s == "RUN")
            self.pending_jobs = sum(1 for s in remaining_statuses.values() if s == "PEND")
        except Exception:
            pass
        # Print status summary after job completion
        if completed_jobs:
            try:
                total_reruns = sum(getattr(res, 'retry_count', 0) for _, res in self.results.items())
                pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("PASS", "RERUN PASS"))
                fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') in ("FAIL", "RERUN FAIL"))
                rerun_pass_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN PASS")
                rerun_fail_count = sum(1 for _, res in self.results.items() if getattr(res, 'status', '') == "RERUN FAIL")
                total_test_cases = self.get_total_test_cases_count()
                timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
                print(f"INFO: {timestamp} Completed {len(completed_jobs)} jobs. Status: Running={self.running_jobs}, Pending={self.pending_jobs}, Had Rerun={total_reruns}, Pass={pass_count}, Fail={fail_count}, RerunPass={rerun_pass_count}, RerunFail={rerun_fail_count}, Total={total_test_cases}")
            except Exception:
                pass
        if job_list:
            time.sleep(5)
    # Grace period: if new jobs appear (e.g., retries) after list became empty, resume monitoring
    try:
        for _ in range(3):
            if len(self.submitted_jobs) > 0:
                print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')} New jobs detected after completion, resuming monitoring...")
                return self.monitor_all_jobs(self.submitted_jobs)
            time.sleep(5)
    except Exception:
        pass
    try:
        self.running_jobs = 0
        self.pending_jobs = 0
    except Exception:
        pass
    print("All jobs completed!")
    self._stop_status_thread = True
    if hasattr(self, 'status_thread') and self.status_thread.is_alive():
        self.status_thread.join(timeout=5)
        print(f"{Colors.BLUE}Stopped status monitoring thread{Colors.END}")
    # Also stop monitor thread if it exists
    if hasattr(self, 'monitor_thread') and self.monitor_thread.is_alive():
        print(f"{Colors.BLUE}Stopping monitor thread{Colors.END}")
        # Note: monitor_thread is a daemon thread, it will stop when main thread exits
def check_lsf_job_status(self, job_id: int) -> str:
    """Return the LSF status string for a single job via ``bjobs``.

    Returns the raw status column verbatim (e.g. RUN, PEND, WAIT, SUSP,
    DONE, EXIT, TERM, KILL) or "UNKNOWN" when the job is no longer visible
    in the queue, bjobs fails/times out, or LSF is not installed.
    """
    try:
        result = subprocess.run(
            ["bjobs", "-noheader", str(job_id)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,  # use universal_newlines instead of text= for older Pythons
            timeout=30
        )
        if result.returncode == 0:
            lines = result.stdout.strip().split('\n')
            if lines and lines[0]:
                parts = lines[0].split()
                if len(parts) >= 3:
                    # Status column. The original code mapped RUN/PEND/... and
                    # DONE/EXIT/... through separate branches that all returned
                    # the same value, so it is returned directly here.
                    return parts[2]
            else:
                # Job not found in queue, might have completed already
                print(f"INFO: Job {job_id} not found in queue, checking if completed")
                return "UNKNOWN"
        else:
            # Command failed, print error details for triage
            print(f"Warning: bjobs command failed for job {job_id}")
            print(f"Return code: {result.returncode}")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
    except subprocess.TimeoutExpired:
        print(f"Warning: bjobs command timeout for job {job_id}")
    except FileNotFoundError:
        print(f"Warning: bjobs command not found, LSF may not be available")
    except Exception as e:
        print(f"Warning: Error checking job {job_id} status: {e}")
    return "UNKNOWN"

def get_job_details(self, job_id: int) -> Dict:
    """Get detailed job information including failure reasons.

    Parses the free-form ``bjobs -l`` output line by line; every field
    defaults to None/"UNKNOWN" and is only filled when its label appears.
    On any failure a minimal dict carrying an ``error`` key is returned.
    """
    try:
        result = subprocess.run(
            ["bjobs", "-l", str(job_id)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            timeout=60
        )
        details = {
            "job_id": job_id,
            "status": "UNKNOWN",
            "exit_code": None,
            "exit_reason": None,
            "submission_time": None,
            "start_time": None,
            "finish_time": None,
            "cpu_time": None,
            "max_memory": None,
            "output_file": None,
            "error_file": None
        }
        if result.returncode == 0:
            content = result.stdout
            lines = content.split('\n')
            for line in lines:
                line = line.strip()
                # NOTE(review): substring matching against bjobs -l labels;
                # assumes one label per line — confirm against site LSF output.
                if "Job Status" in line:
                    details["status"] = line.split()[-1]
                elif "Exit Status" in line:
                    details["exit_code"] = line.split()[-1]
                elif "Exit Reason" in line:
                    details["exit_reason"] = line.split(":", 1)[-1].strip()
                elif "Submitted" in line:
                    details["submission_time"] = line.split(":", 1)[-1].strip()
                elif "Started" in line:
                    details["start_time"] = line.split(":", 1)[-1].strip()
                elif "Finished" in line:
                    details["finish_time"] = line.split(":", 1)[-1].strip()
                elif "CPU time used" in line:
                    details["cpu_time"] = line.split(":", 1)[-1].strip()
                elif "MAX MEM" in line:
                    details["max_memory"] = line.split(":", 1)[-1].strip()
                elif "Output file" in line:
                    details["output_file"] = line.split(":", 1)[-1].strip()
                elif "Error file" in line:
                    details["error_file"] = line.split(":", 1)[-1].strip()
        return details
    except Exception as e:
        print(f"Warning: Error getting job details for {job_id}: {e}")
        return {"job_id": job_id, "status": "UNKNOWN", "error": str(e)}

def batch_check_job_status(self, job_ids: List[int]) -> Dict[int, str]:
    """Batch check multiple job statuses to reduce LSF calls.

    Issues one ``bjobs`` invocation for all ids. Jobs absent from the
    output are reported as "UNKNOWN"; on any error, falls back to
    per-job queries via check_lsf_job_status.
    """
    if not job_ids:
        return {}
    try:
        # One bjobs call covering every id (the unused joined-string local
        # from the original version has been removed).
        result = subprocess.run(
            ["bjobs", "-noheader"] + list(map(str, job_ids)),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,  # use universal_newlines instead of text= for older Pythons
            timeout=45
        )
        status_map = {}
        if result.returncode == 0:
            for line in result.stdout.strip().split('\n'):
                if line.strip():
                    parts = line.split()
                    if len(parts) >= 3:
                        status_map[int(parts[0])] = parts[2]
        # Fill in UNKNOWN for jobs not found in the bjobs output
        for job_id in job_ids:
            if job_id not in status_map:
                status_map[job_id] = "UNKNOWN"
        return status_map
    except Exception as e:
        print(f"Warning: Batch job status check failed: {e}")
        # Fall back to individual checks
        return {job_id: self.check_lsf_job_status(job_id) for job_id in job_ids}
def check_test_result(self, log_file: str) -> bool:
    """Check test result - Enhanced version that prioritizes UVM Report Summary.

    Decision order:
      1. If a "Report counts by severity" block exists and shows zero
         UVM_ERROR/UVM_FATAL, require an explicit "TEST CASE PASSED" line.
      2. Otherwise scan the region BEFORE the UVM summary for error tokens
         and only accept a PASS marker that appears AFTER the last error
         (i.e. the testbench recovered); errors after the summary (stitched
         logs) always force FAIL.
    Returns True only for a confirmed pass; missing file or any exception
    while reading returns False.
    """
    if not os.path.exists(log_file):
        return False
    try:
        with open(log_file, 'r') as f:
            content = f.read()
        # PRIORITY 1: Check UVM Report Summary first - this is the most reliable indicator
        try:
            # Capture the severity-count block up to the next "**" banner or EOF
            severity_block_match = re.search(r"\*\*\s*Report counts by severity[\s\S]*?(?:\n\*\*|\Z)", content, re.IGNORECASE)
            if severity_block_match:
                severity_block = severity_block_match.group(0)
                def _extract_count(label: str) -> int:
                    # Parse "LABEL : <n>" inside the severity block; 0 if absent
                    m = re.search(rf"{label}\s*:\s*(\d+)", severity_block, re.IGNORECASE)
                    return int(m.group(1)) if m else 0
                summary_error_count = _extract_count('UVM_ERROR')
                summary_fatal_count = _extract_count('UVM_FATAL')
                # If UVM Report Summary shows 0 errors and 0 fatals, check for TEST CASE PASSED
                if summary_error_count == 0 and summary_fatal_count == 0:
                    # Check for TEST CASE PASSED - this is the definitive indicator
                    if re.search(r'TEST CASE PASSED', content, re.IGNORECASE):
                        print(f"DEBUG: UVM Report Summary shows 0 errors/0 fatals + TEST CASE PASSED found -> PASS")
                        return True
                    else:
                        print(f"DEBUG: UVM Report Summary shows 0 errors/0 fatals but no TEST CASE PASSED -> FAIL")
                        return False
                else:
                    # UVM Report Summary shows errors/fatals - definitely FAIL
                    print(f"DEBUG: UVM Report Summary indicates FAIL (UVM_ERROR={summary_error_count}, UVM_FATAL={summary_fatal_count})")
                    return False
        except Exception:
            # Non-fatal; fall back to detailed checks below
            pass
        # Also check for "TEST CASE FAILED" pattern
        if re.search(r'TEST CASE FAILED', content, re.IGNORECASE):
            print(f"DEBUG: 'TEST CASE FAILED' found in log -> FAIL")
            return False
        # CRITICAL FIX: First check for errors before summary section
        # Locate the boundary where summary starts (errors after this should be ignored)
        summary_markers = [
            r'UVM Report catcher Summary',
            r'UVM Report Summary'
        ]
        summary_idx = len(content)
        for marker in summary_markers:
            m = re.search(marker, content)
            if m:
                summary_idx = min(summary_idx, m.start())
        # Region to search for real errors (before summary)
        error_region = content[:summary_idx]
        # Error detection (only before summary)
        error_patterns = [r'UVM_ERROR', r'UVM_FATAL', r'SCOREBOARD_MISMATCH', r'Solver failed', r'Error', r'Offending']
        # NOTE: first_error_match is the first match of the first pattern that
        # matched (pattern order, not position order); last_error_idx is the
        # true maximum position across all patterns.
        first_error_match = None
        last_error_idx = -1
        for pat in error_patterns:
            for m in re.finditer(pat, error_region):
                if first_error_match is None:
                    first_error_match = m
                last_error_idx = max(last_error_idx, m.start())
        # Non-recoverable rule: any UVM_FATAL before summary is immediate FAIL
        if re.search(r'UVM_FATAL', error_region):
            print("DEBUG: UVM_FATAL found before summary -> immediate FAIL")
            return False
        # PASS detection (stricter to avoid accidental matches)
        # Accept common canonical PASS lines only, anchored to line start
        pass_match = None
        pass_patterns = [
            r'^\s*TEST CASE PASSED\b',
            r'^\s*UVM_.*?TEST PASSED\b',
            r'^\s*SIMULATION PASSED\b',
        ]
        for _pat in pass_patterns:
            _m = re.search(_pat, content, re.MULTILINE)
            if _m:
                pass_match = _m
                break
        # CRITICAL FIX: New logic to handle "running but had error" cases
        # If there are errors before summary, check if PASS comes after the last error
        if first_error_match is not None:
            if pass_match:
                pass_idx = pass_match.start()
                # Only PASS if PASS comes AFTER the last error (indicating recovery)
                if pass_idx > last_error_idx:
                    # Before returning PASS, ensure no tail errors after summary
                    tail_region = content[summary_idx:]
                    if re.search(r'(UVM_ERROR|UVM_FATAL|SCOREBOARD_MISMATCH|Solver failed|Error|Offending)', tail_region, re.IGNORECASE):
                        print("DEBUG: Errors found after summary (tail region) -> treat as FAIL")
                        return False
                    print(f"DEBUG: Test PASSED after errors - PASS at {pass_idx}, last error at {last_error_idx}")
                    return True
                else:
                    # PASS came before or at the same time as error - this is "running but had error"
                    # Persist a .error.log snippet next to the original log for triage
                    error_pos = first_error_match.start()
                    start_pos = max(0, error_pos - 100)
                    end_pos = min(len(error_region), error_pos + 200)
                    error_context = error_region[start_pos:end_pos].strip()
                    error_log_file = Path(log_file).with_suffix('.error.log')
                    with open(error_log_file, 'w') as ef:
                        ef.write(f"Original Log File: {log_file}\n")
                        ef.write(f"Error Type: {first_error_match.group(0)}\n")
                        ef.write(f"Error Context:\n{error_context}\n")
                        ef.write(f"PASS position: {pass_idx}, Last error position: {last_error_idx}\n")
                        ef.write(f"Decision: FAIL - PASS came before/at same time as error\n")
                    print(f"DEBUG: Test FAILED - PASS at {pass_idx}, last error at {last_error_idx} (running but had error)")
                    return False
            else:
                # No PASS found, but there are errors - definitely FAIL
                error_pos = first_error_match.start()
                start_pos = max(0, error_pos - 100)
                end_pos = min(len(error_region), error_pos + 200)
                error_context = error_region[start_pos:end_pos].strip()
                error_log_file = Path(log_file).with_suffix('.error.log')
                with open(error_log_file, 'w') as ef:
                    ef.write(f"Original Log File: {log_file}\n")
                    ef.write(f"Error Type: {first_error_match.group(0)}\n")
                    ef.write(f"Error Context:\n{error_context}\n")
                    ef.write(f"Decision: FAIL - No PASS found, but errors exist\n")
                print(f"DEBUG: Test FAILED - No PASS found, but errors exist")
                return False
        # No errors found before summary
        if pass_match:
            # Additional guard: if tail (after summary) contains errors due to log stitching, FAIL
            tail_region = content[summary_idx:]
            if re.search(r'(UVM_ERROR|UVM_FATAL|SCOREBOARD_MISMATCH|Solver failed|Error|Offending)', tail_region, re.IGNORECASE):
                print("DEBUG: Tail errors detected after summary despite PASS -> FAIL")
                return False
            # No errors and PASS found - definitely PASS
            print(f"DEBUG: Test PASSED - No errors found, PASS exists")
            return True
        else:
            # No errors and no PASS - treat as not passed
            print(f"DEBUG: Test not passed - No errors found, but no PASS either")
            return False
    except Exception as e:
        print(f"{Colors.RED}Error checking log file: {e}{Colors.END}")
        return False

def _final_status_refresh(self):
    """Final status refresh - recheck all test results based on log files.

    Only touches results that have a job_id and are NOT yet in a terminal
    state; each is re-evaluated from its log via check_test_result and
    finished as (RERUN) PASS/FAIL according to its retry context.
    """
    timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
    refreshed_count = 0
    for result_key, result_obj in self.results.items():
        # Skip if no job_id or already in final state
        if not hasattr(result_obj, 'job_id') or not result_obj.job_id:
            continue
        # Skip if already in final PASS/FAIL state
        if result_obj.status in ["PASS", "FAIL", "RERUN PASS", "RERUN FAIL", "ERROR", "TIMEOUT"]:
            continue
        try:
            # Get log file path
            log_file_path = self.get_test_log_path_by_job_id(result_obj.job_id)
            if not log_file_path or not os.path.exists(log_file_path):
                continue
            # Check log file for TEST CASE PASSED
            test_passed = self.check_test_result(log_file_path)
            if test_passed:
                # Test passed - update status based on retry context
                if result_obj.is_retry:
                    old_status = result_obj.status
                    result_obj.finish("RERUN PASS", "")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> RERUN PASS (log shows TEST CASE PASSED)")
                else:
                    old_status = result_obj.status
                    result_obj.finish("PASS", "")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> PASS (log shows TEST CASE PASSED)")
                refreshed_count += 1
            else:
                # Test failed - update status based on retry context
                if result_obj.is_retry:
                    old_status = result_obj.status
                    result_obj.finish("RERUN FAIL", "Final refresh: log indicates failure")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> RERUN FAIL (log indicates failure)")
                else:
                    old_status = result_obj.status
                    result_obj.finish("FAIL", "Final refresh: log indicates failure")
                    print(f"INFO: {timestamp} Final refresh: {result_obj.name} {old_status} -> FAIL (log indicates failure)")
                refreshed_count += 1
        except Exception as e:
            print(f"Warning: Could not refresh status for {result_obj.name}: {e}")
    if refreshed_count > 0:
        print(f"INFO: {timestamp} Final status refresh completed: {refreshed_count} tests updated")
    else:
        print(f"INFO: {timestamp} Final status refresh completed: no tests needed updating")

def _check_for_runtime_errors(self, log_file: str) -> bool:
    """Check for runtime errors in log file that indicate test should be retried.

    Unlike check_test_result, this scans the WHOLE log (no summary boundary)
    and returns True on the first matching pattern. Missing file or read
    failure returns False (treated as "no runtime errors detected").
    """
    if not os.path.exists(log_file):
        return False
    try:
        with open(log_file, 'r') as f:
            content = f.read()
        # Check for runtime error patterns that indicate test should be retried.
        # NOTE(review): broad case-insensitive patterns such as \bError\b can
        # match benign text (e.g. "0 Errors"); tune if false retries occur.
        runtime_error_patterns = [
            r'UVM_ERROR',
            r'UVM_FATAL',
            r'SCOREBOARD_MISMATCH',
            r'Solver failed',
            r'\bError\b',
            r'Offending',
            r'ERROR.*runtime',
            r'FATAL.*runtime',
            r'Exception.*occurred',
            r'Assertion.*failed',
            r'Timeout.*occurred',
            r'Memory.*leak',
            r'Resource.*exhausted',
            r'Connection.*failed',
            r'Protocol.*violation',
            r'Deadlock.*detected',
            r'Livelock.*detected'
        ]
        # Search for runtime errors in the entire log
        for pattern in runtime_error_patterns:
            if re.search(pattern, content, re.IGNORECASE):
                print(f"DEBUG: Runtime error detected in {log_file}: {pattern}")
                return True
        return False
    except Exception as e:
        print(f"{Colors.RED}Error checking for runtime errors in log file: {e}{Colors.END}")
        return False

def retry_failed_tests(self):
    """Retry failed tests with parallel retry mechanism.

    For every result currently in FAIL/ERROR/TIMEOUT, launches
    self.args.retry parallel LSF reruns and folds the outcome back into
    the original TestResult (ALL reruns must pass for the test to pass).
    """
    failed_tests = [(name, result) for name, result in self.results.items() if result.status in ["FAIL", "ERROR", "TIMEOUT"]]
    if not failed_tests:
        print(f"{Colors.GREEN}No tests need retry{Colors.END}")
        return
    print(f"\n{Colors.YELLOW}=== Retry Failed Tests ==={Colors.END}")
    print(f"Failed test count: {len(failed_tests)}")
    print(f"Max retry count: {self.args.retry}")
    print(f"Retry strategy: Parallel retry - ALL retries must pass for test to pass")
    for test_key, result in failed_tests:
        # Extract test name from the key (test_key format is "test_name:config:seed")
        test_name = result.name  # Use the actual test name from TestResult object
        original_seed = getattr(result, 'seed', None)
        print(f"\n{Colors.CYAN}Starting parallel retry for {test_name} (original seed: {original_seed}){Colors.END}")
        # LSF mode parallel retry
        retry_results = self._run_parallel_lsf_retry(test_name, result, original_seed)
        # Process retry results
        self._process_parallel_retry_results(test_name, result, retry_results)
def _run_parallel_lsf_retry(self, test_name: str, result, original_seed: str) -> List[Dict]: """Run parallel LSF retry tests""" retry_results = [] # Find the original test case data print(f" Searching for original test case: {test_name}") original_case = self.find_original_test_case(test_name) if not original_case: print(f" Test case not found in JSON files, checking self.tests...") for test_tuple in self.tests: if test_tuple[0] == test_name: print(f" Found test case '{test_name}' in self.tests") original_case = { 'name': test_name, 'config': test_tuple[1], 'repeat': 1, 'timeout': 60, 'opts': [] } break if not original_case: print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, skipping retry{Colors.END}") return [] # Generate retry opcodes output_dir = self.args.output_dir sim_output_dir = getattr(self.args, 'dir', output_dir) retry_opcodes = [] for retry in range(1, self.args.retry + 1): if retry == 1 and original_seed: # First retry: use original seed print(f" Retry {retry}: Using original seed: {original_seed}") opcodes = self.gen_test_case(original_case, output_dir, sim_output_dir, self.args.queue, specified_seed=original_seed) else: # Other retries: use random seed print(f" Retry {retry}: Using random seed") opcodes = self.gen_test_case(original_case, output_dir, sim_output_dir, self.args.queue) if opcodes: opcode = opcodes[0] opcode['retry_attempt'] = retry opcode['retry_seed'] = original_seed if retry == 1 else 'random' retry_opcodes.append(opcode) # Submit all retry jobs in parallel print(f" Submitting {len(retry_opcodes)} parallel retry jobs...") submitted_jobs = [] for opcode in retry_opcodes: retry_result = self.submit_test_case(opcode) if retry_result["status"] == "SUBMITTED": retry_result['retry_attempt'] = opcode['retry_attempt'] retry_result['retry_seed'] = opcode['retry_seed'] submitted_jobs.append(retry_result) print(f" Retry {opcode['retry_attempt']} submitted: jobid {retry_result['job_id']}") else: print(f" Retry 
{opcode['retry_attempt']} submission failed: {retry_result.get('error', 'Unknown error')}") # Wait for all jobs to complete if submitted_jobs: print(f" Waiting for {len(submitted_jobs)} retry jobs to complete...") retry_results = self._wait_for_parallel_jobs(submitted_jobs) return retry_results def _wait_for_parallel_jobs(self, submitted_jobs: List[Dict]) -> List[Dict]: """Wait for multiple jobs to complete in parallel""" completed_jobs = [] job_ids = [job['job_id'] for job in submitted_jobs] while job_ids: completed_jobs_batch = [] jobs_to_remove = [] for job_id in job_ids: status = self.check_lsf_job_status(int(job_id)) if status in ["DONE", "EXIT", "TERM", "KILL"]: # Find the corresponding job info job_info = next((job for job in submitted_jobs if job['job_id'] == job_id), None) if job_info: if status == "DONE": job_info["status"] = "PASS" else: job_info["status"] = "FAIL" completed_jobs_batch.append(job_info) jobs_to_remove.append(job_id) # Remove completed jobs from monitoring list for job_id in jobs_to_remove: job_ids.remove(job_id) if completed_jobs_batch: completed_jobs.extend(completed_jobs_batch) for job in completed_jobs_batch: if job["status"] == "PASS": status_icon = f"{Colors.GREEN}✓{Colors.END}" elif job["status"] == "RERUN PASS": status_icon = f"{Colors.CYAN}✓{Colors.END}" else: status_icon = f"{Colors.RED}✗{Colors.END}" print(f" Retry {job['retry_attempt']} completed: {status_icon} {job['retry_seed']}") if job_ids: time.sleep(10) # Wait before next check return completed_jobs def _start_immediate_retry(self, result): """Start immediate retry for a failed test""" test_name = result.name original_seed = getattr(result, 'seed', None) # Rate limit concurrent retry threads if not hasattr(self, 'active_retry_threads'): self.active_retry_threads = 0 if self.active_retry_threads >= 20: print(f" Retry queue full (20). 
Delaying retry for {test_name}...") # Busy-wait with sleep until slot available while self.active_retry_threads >= 20: time.sleep(5) # Check if we've already started retries for this test if hasattr(result, 'retry_started') and result.retry_started: print(f" Retry already started for {test_name}, skipping") return # Mark that retry has been started if not hasattr(result, 'retry_started'): result.retry_started = False result.retry_started = True result.retry_count = 0 print(f"\n{Colors.CYAN}🚀 Starting immediate retry for {test_name} (original seed: {original_seed}){Colors.END}") # Start retry in a separate thread to avoid blocking the main monitoring loop self.active_retry_threads += 1 retry_thread = threading.Thread( target=self._run_immediate_retry, args=(test_name, result, original_seed), daemon=True ) retry_thread.start() def _run_immediate_retry(self, test_name: str, result, original_seed: str): """Run immediate retry in a separate thread""" try: # LSF mode immediate retry self._run_immediate_lsf_retry(test_name, result, original_seed) except Exception as e: print(f"{Colors.RED}Error in immediate retry for {test_name}: {e}{Colors.END}") finally: # Decrease active retry counter when thread finishes if hasattr(self, 'active_retry_threads') and self.active_retry_threads > 0: self.active_retry_threads -= 1 def _run_immediate_lsf_retry(self, test_name: str, result, original_seed: str): """Run immediate LSF retry - stop on first success""" # Find the original test case data with proper opts handling original_case = self.find_original_test_case_with_opts(test_name, result) if not original_case: print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, trying fallback method{Colors.END}") # Fallback to original method original_case = self.find_original_test_case(test_name) if not original_case: print(f"{Colors.RED}Warning: Could not find original test case for {test_name}, skipping immediate retry{Colors.END}") return # Start only one retry 
attempt - the main loop will handle subsequent retries if needed retry = 1 if original_seed: # First retry: use original seed print(f" Immediate retry {retry}: Using original seed: {original_seed}") opcodes = self.gen_test_case(original_case, self.args.output_dir, str(self.regression_dir), self.args.queue, specified_seed=original_seed) else: # Use random seed print(f" Immediate retry {retry}: Using random seed") opcodes = self.gen_test_case(original_case, self.args.output_dir, str(self.regression_dir), self.args.queue)
# Force wave dump for retry-generated opcodes BEFORE selecting and submitting try: for oc in opcodes or []: cmd_list = oc.get('cmd', []) if isinstance(cmd_list, list): replaced = False for i, token in enumerate(cmd_list): if isinstance(token, str) and token.startswith('wave='): if token != 'wave=fsdb': cmd_list[i] = 'wave=fsdb' replaced = True break if not replaced: cmd_list.append('wave=fsdb') # Tag immediate retry if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list): cmd_list.append('lmn=rerun') oc['cmd'] = cmd_list except Exception: pass if not opcodes: print(f" Failed to generate retry opcodes for {test_name}") return opcode = opcodes[0] # Ensure the selected opcode itself carries wave=fsdb (explicit) try: cmd_list = opcode.get('cmd', []) if isinstance(cmd_list, list): replaced = False for i, token in enumerate(cmd_list): if isinstance(token, str) and token.startswith('wave='): if token != 'wave=fsdb': cmd_list[i] = 'wave=fsdb' replaced = True break if not replaced: cmd_list.append('wave=fsdb') # Tag immediate retry if not any(isinstance(t, str) and t.startswith('lmn=') for t in cmd_list): cmd_list.append('lmn=rerun') opcode['cmd'] = cmd_list except Exception: pass retry_result = self.submit_test_case(opcode) if retry_result["status"] == "SUBMITTED": result.retry_count = retry print(f" Immediate retry {retry} submitted: jobid {retry_result['job_id']}") # CRITICAL FIX: Don't wait here - let the main loop handle retry job completion # The retry job is now tracked in the main loop and will be processed there print(f" Immediate retry {retry} submitted and tracked in main loop: jobid {retry_result['job_id']}") # Return immediately - the main loop will handle completion return else: print(f" {test_name} retry {retry} submission failed: {retry_result.get('error', 'Unknown error')}") # If submission failed, we can't retry further print(f"{Colors.RED}✗{Colors.END} {test_name} retry submission failed - Original test remains FAIL") def 
_wait_for_retry_job_completion(self, job_id: str, test_name: str, retry_num: int) -> bool: """Wait for a specific retry job to complete""" print(f" Waiting for retry {retry_num} job {job_id} to complete...") while True: status = self.check_lsf_job_status(int(job_id)) if status == "DONE": print(f" Retry {retry_num} job {job_id} PASSED") return True elif status in ["EXIT", "TERM", "KILL"]: print(f" Retry {retry_num} job {job_id} FAILED") return False elif status == "UNKNOWN": # Job may have completed and been removed from queue print(f" Retry {retry_num} job {job_id} status UNKNOWN, checking log file...") # Try to determine result from log file test_info = self.get_test_info_by_job_id(job_id) if test_info: log_file_path = self.get_test_log_path_by_job_id(job_id) if log_file_path and os.path.exists(log_file_path): # CRITICAL FIX: Enhanced status determination for retry jobs test_passed = self.check_test_result(log_file_path) has_runtime_errors = self._check_for_runtime_errors(log_file_path) if test_passed and not has_runtime_errors: print(f" Retry {retry_num} job {job_id} PASSED (from log file)") return True elif test_passed and has_runtime_errors: print(f" Retry {retry_num} job {job_id} FAILED (running but had error, from log file)") return False else: print(f" Retry {retry_num} job {job_id} FAILED (from log file)") return False # If we can't determine, assume failed print(f" Retry {retry_num} job {job_id} assumed FAILED") return False else: # Still running time.sleep(10) def get_test_log_path_by_job_id(self, job_id: str) -> str: """Get log file path by job ID""" # 1) Prefer the opcode we stored at submission time (survives after job DONE) try: stored = getattr(self, 'job_meta', {}).get(str(job_id)) if stored: log_path = stored.get('log_path') or stored.get('log_file') if log_path: return log_path except Exception: pass
# 2) Fallback: search through submitted_results snapshot try: for result in getattr(self, 'submitted_results', []): if result.get('job_id') == job_id: return result.get('log_path', '') except Exception: pass
# 3) Fallback: locate TestResult by job_id, then derive its log path try: for _, res in getattr(self, 'results', {}).items(): if hasattr(res, 'job_id') and str(getattr(res, 'job_id', '')) == str(job_id): # Prefer explicitly recorded path if present if getattr(res, 'log_file', ''): return res.log_file # Derive from standard layout derived = self.get_test_log_path(res) if derived: return derived except Exception: pass
# 4) Nothing found return '' def _process_parallel_retry_results(self, test_name: str, result, retry_results: List): """Process parallel retry results and update original test result""" if not retry_results: print(f"{Colors.RED}No retry results for {test_name}{Colors.END}") return # Check if ALL retries passed passed_retries = [r for r in retry_results if r.status == "PASS" or r.get("status") == "PASS"] total_retries = len(retry_results) if len(passed_retries) == total_retries: # ALL retries passed - mark original test as PASS passed_retry = passed_retries[0] # Use the first passed retry for info if hasattr(passed_retry, 'status'): # LSF mode result.finish("PASS", f"ALL {total_retries} retries successful") result.retry_count = total_retries result.seed = passed_retry.seed result.log_file = passed_retry.log_file print(f"{Colors.GREEN}✓{Colors.END} {test_name} ALL {total_retries} retries passed - Original test marked as PASS") else: # LSF mode result.finish("PASS", f"ALL {total_retries} retries successful") result.retry_count = total_retries result.seed = passed_retry.get('seed', 'unknown') result.job_id = passed_retry.get('job_id', 'unknown') print(f"{Colors.GREEN}✓{Colors.END} {test_name} ALL {total_retries} retries passed - Original test marked as PASS") else: # Not all retries passed - test remains FAIL failed_count = total_retries - len(passed_retries) print(f"{Colors.RED}✗{Colors.END} {test_name} {failed_count}/{total_retries} retries failed - Original test remains FAIL") # Update with the last retry attempt info last_retry = retry_results[-1] if hasattr(last_retry, 'retry_attempt'): # LSF mode result.retry_count = last_retry.retry_attempt else: # LSF mode result.retry_count = last_retry['retry_attempt'] def merge_coverage(self): """Merge coverage databases""" if not self.args.coverage: return print(f"\n{Colors.BLUE}=== Merge Coverage Databases ==={Colors.END}") # Find all coverage databases cov_dbs = [] for result in self.results.values(): if 
result.coverage_db and os.path.exists(result.coverage_db): cov_dbs.append(result.coverage_db) if not cov_dbs: print(f"{Colors.YELLOW}No coverage databases found{Colors.END}") return # Merge coverage merged_db = self.coverage_dir / f"merged_{datetime.now().strftime('%Y%m%d_%H%M%S')}.vdb" try: cmd = ["urg", "-dir"] + cov_dbs + ["-dbname", str(merged_db)] subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) print(f"{Colors.GREEN}Coverage merge completed: {merged_db}{Colors.END}") # Generate coverage report report_dir = self.report_dir / f"coverage_{datetime.now().strftime('%Y%m%d_%H%M%S')}" cmd = ["urg", "-dir", str(merged_db), "-report", str(report_dir)] subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) print(f"{Colors.GREEN}Coverage report generated: {report_dir}{Colors.END}") except subprocess.CalledProcessError as e: print(f"{Colors.RED}Coverage merge failed: {e}{Colors.END}") def generate_report(self): """Generate test report""" print(f"\n{Colors.BLUE}=== Generate Test Report ==={Colors.END}") # Final status validation before generating report print(f"Performing final status validation before generating report...") self.validate_all_test_statuses() # Clean up any duplicate test results before counting self._clean_duplicate_test_results() # Count results - ensure we count only tests with final statuses final_status_results = [r for r in self.results.values() if r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]] print(f"Counting results from {len(final_status_results)} completed test cases...") total = len(final_status_results) passed = len([r for r in final_status_results if r.status == "PASS"]) rerun_passed = len([r for r in final_status_results if r.status == "RERUN PASS"]) failed = len([r for r in final_status_results if r.status in ["FAIL", "RERUN FAIL"]]) errors = len([r for r in final_status_results if r.status == "ERROR"]) timeouts = len([r for r in final_status_results if 
r.status == "TIMEOUT"]) print(f"Result counts: Total={total}, Passed={passed}, Rerun Passed={rerun_passed}, Failed={failed}, Errors={errors}, Timeouts={timeouts}") # Calculate total time total_time = time.time() - self.start_time # Generate CSV report timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') csv_file = self.report_dir / f"regression_{timestamp}.csv" with open(csv_file, 'w', newline='') as f: writer = csv.writer(f) writer.writerow(['Test Name', 'Config', 'Status', 'Duration', 'Estimated Time', 'Retry Count', 'Log File', 'Error Message']) for result in final_status_results: writer.writerow([ result.name, result.config, result.status, result.get_duration_str(), f"{result.estimated_duration:.1f}s", result.retry_count, result.log_file, result.error_msg ]) # Generate JSON report json_file = self.report_dir / f"regression_{timestamp}.json" report_data = { 'summary': { 'total': total, 'passed': passed, 'rerun_passed': rerun_passed, 'failed': failed, 'errors': errors, 'timeouts': timeouts, 'pass_rate': f"{(passed + rerun_passed)/total*100:.1f}%" if total > 0 else "0%", 'total_time': f"{total_time:.1f}s", 'timestamp': timestamp }, 'tests': { f"{result.name}_{getattr(result, 'seed', 'unknown')}": { 'config': result.config, 'status': result.status, 'duration': result.duration, 'estimated_duration': result.estimated_duration, 'retry_count': result.retry_count, 'log_file': result.log_file, 'error_msg': result.error_msg } for result in final_status_results } } with open(json_file, 'w') as f: json.dump(report_data, f, indent=2, ensure_ascii=False) # Save historical test data for future time estimation self.save_test_history() # Print summary print(f"\n{Colors.BOLD}=== Regression Test Summary ==={Colors.END}") print(f"Total Tests: {total}") print(f"{Colors.GREEN}Passed: {passed}{Colors.END}") print(f"{Colors.CYAN}Rerun Passed: {rerun_passed}{Colors.END}") print(f"{Colors.RED}Failed: {failed}{Colors.END}") print(f"{Colors.RED}Errors: {errors}{Colors.END}") 
print(f"{Colors.YELLOW}Timeouts: {timeouts}{Colors.END}") print(f"Pass Rate: {(passed + rerun_passed)/total*100:.1f}%" if total > 0 else "0%") print(f"Total Time: {total_time/60:.1f} minutes") print(f"\nReport Files:") print(f" CSV: {csv_file}") print(f" JSON: {json_file}") # Show failed tests failed_results = [r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]] if failed_results: print(f"\n{Colors.RED}Failed Tests:{Colors.END}") for result in failed_results: print(f" {result.name}: {result.status} - {result.error_msg}") # Generate detailed regression report (like Image 1) self.generate_detailed_regression_report() # Generate error summary report self.generate_error_summary_report() # Generate regression summary info (like Image 3) self.generate_regression_summary_info() # Generate final real-time report self.generate_real_time_report() # Save error monitor state self.save_error_monitor_state()
        # After all standard reports, also collect and persist transaction/cycle statistics.
        # Best-effort: statistics failures must never fail the regression run itself.
        try:
            self.update_transaction_cycle_statistics()
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Failed to update transaction/cycle statistics: {e}{Colors.END}")

    def extract_job_statistics(self, result):
        """Extract job statistics (CPU time, max memory, processes) from actual data.

        Resolution order: LSF `bjobs -l` output, then the test's log file, then
        heuristic defaults keyed off the test name. Returns a
        (cpu_time, max_mem, procs) tuple of display strings.
        """
        cpu_time = "0 sec"
        max_mem = "N/A"
        procs = "N/A"
        # Try to get CPU time from result duration
        if result.duration > 0:
            cpu_time = f"{int(result.duration)} sec"
        # Try to get job statistics from LSF if job_id is available
        if hasattr(result, 'job_id') and result.job_id and result.job_id != 'unknown':
            try:
                # Use bjobs to get detailed job information
                cmd = ["bjobs", "-l", str(result.job_id)]
                output = subprocess.check_output(cmd, stderr=subprocess.PIPE, universal_newlines=True, timeout=10)
                # Parse CPU time
                cpu_match = re.search(r'CPU time used is (\d+\.?\d*) seconds', output)
                if cpu_match:
                    cpu_seconds = float(cpu_match.group(1))
                    cpu_time = f"{int(cpu_seconds)} sec"
                # Parse memory usage
                mem_match = re.search(r'MAX MEM: (\d+\.?\d*) (\w+)', output)
                if mem_match:
                    mem_value = mem_match.group(1)
                    mem_unit = mem_match.group(2)
                    max_mem = f"{mem_value} {mem_unit}"
                # Parse number of processes
                proc_match = re.search(r'Number of processors: (\d+)', output)
                if proc_match:
                    procs = proc_match.group(1)
            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
                # If bjobs fails, try to extract from log file
                pass
        # If LSF info not available, try to extract from log file
        if max_mem == "N/A" and hasattr(result, 'log_file') and result.log_file:
            try:
                # NOTE(review): nesting reconstructed from flattened source — pattern
                # scanning is assumed to run only when the log file exists; confirm.
                if os.path.exists(result.log_file):
                    with open(result.log_file, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    # Look for memory usage patterns in log
                    mem_patterns = [
                        r'max_memory[:\s]+(\d+\.?\d*)\s*(\w+)',
                        r'memory_usage[:\s]+(\d+\.?\d*)\s*(\w+)',
                        r'peak_memory[:\s]+(\d+\.?\d*)\s*(\w+)',
                        r'MAX_MEM[:\s]+(\d+\.?\d*)\s*(\w+)'
                    ]
                    for pattern in mem_patterns:
                        mem_match = re.search(pattern, content, re.IGNORECASE)
                        if mem_match:
                            mem_value = mem_match.group(1)
                            mem_unit = mem_match.group(2)
                            max_mem = f"{mem_value} {mem_unit}"
                            break
                    # Look for process count patterns
                    proc_patterns = [
                        r'processes[:\s]+(\d+)',
                        r'num_procs[:\s]+(\d+)',
                        r'process_count[:\s]+(\d+)'
                    ]
                    for pattern in proc_patterns:
                        proc_match = re.search(pattern, content, re.IGNORECASE)
                        if proc_match:
                            procs = proc_match.group(1)
                            break
            except Exception:
                pass
        # If still no data, use reasonable defaults based on test type
        if max_mem == "N/A":
            # Estimate memory based on test name or use default
            if "stress" in result.name.lower() or "full" in result.name.lower():
                max_mem = "16 GB"
            elif "small" in result.name.lower() or "basic" in result.name.lower():
                max_mem = "4 GB"
            else:
                max_mem = "8 GB"
        if procs == "N/A":
            # Estimate process count based on test type or use default
            if "stress" in result.name.lower() or "full" in result.name.lower():
                procs = "12"
            elif "small" in result.name.lower() or "basic" in result.name.lower():
                procs = "4"
            else:
                procs = "8"
        return cpu_time, max_mem, procs

    def get_test_log_path(self, result):
        """Get the actual log file path for a test result.

        Returns an absolute path string, or None when no candidate exists.
        """
        # 1. Use result.log_file if it exists and is a valid path
        if result.log_file and os.path.exists(result.log_file):
            return os.path.abspath(result.log_file)
        # 2. Try to construct log file path based on test case structure
        sim_output_dir = str(self.regression_dir)
        seed = getattr(result, 'seed', 'unknown')
        test_name = result.name
        # Try different log file naming patterns, most specific first.
        possible_log_paths = [
            f"{sim_output_dir}/logs/{test_name}/{test_name}_{seed}_*.log",  # Primary path with opts: logs/test_name/test_name_seed_opts_*.log
            f"{sim_output_dir}/logs/{test_name}/{test_name}_{seed}.log",  # Fallback: logs/test_name/test_name_seed.log
            f"{sim_output_dir}/logs/{test_name}/{test_name}_*.log",  # Wildcard pattern for logs/test_name/
            f"{sim_output_dir}/logs/{test_name}.log",  # Fallback: logs/test_name.log
            f"{sim_output_dir}/{test_name}/report.log",  # Legacy path: test_name/report.log
            f"{sim_output_dir}/{test_name}_{seed}/report.log",  # Legacy path: test_name_seed/report.log
        ]
        for log_path in possible_log_paths:
            if '*' in log_path:
                # Handle wildcard patterns
                import glob
                matching_files = glob.glob(log_path)
                if matching_files:
                    return os.path.abspath(matching_files[0])  # Return first matching file
            elif os.path.exists(log_path):
                return os.path.abspath(log_path)
        return None

    def update_real_time_report(self):
        """Update real-time regression report (throttled by report_update_interval)."""
        current_time = time.time()
        if current_time - self.last_report_update >= self.report_update_interval:
            self.last_report_update = current_time
            self.generate_real_time_report()

    def generate_real_time_report(self):
        """Generate real-time regression report file at self.real_time_report_path."""
        try:
            with open(self.real_time_report_path, 'w', encoding='utf-8') as f:
                # Write header
                f.write("=" * 80 + "\n")
                f.write(f"REAL-TIME REGRESSION REPORT - Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write("=" * 80 + "\n\n")
                # Generate test status and log paths section
                f.write("=== TEST STATUS AND LOG PATHS ===\n")
                f.write(self.generate_test_status_and_log_paths_content())
                f.write("\n\n")
                # Generate summary info
                f.write("=== REGRESSION SUMMARY ===\n")
                f.write(self.generate_regression_summary_info_content())
                f.write("\n\n")
                # Write footer
                f.write("=" * 80 + "\n")
                f.write("REPORT WILL BE UPDATED EVERY 30 SECONDS\n")
                f.write("=" * 80 + "\n")
        except Exception as e:
            print(f"Warning: Could not update real-time report: {e}")
    def generate_progress_bar(self, percentage, width=50):
        """Generate progress bar string like [████░░░░] for the given percentage."""
        filled_width = int(width * percentage / 100)
        bar = '█' * filled_width + '░' * (width - filled_width)
        return f"[{bar}]"

    def generate_detailed_regression_report(self):
        """Generate detailed regression report like Image 1.

        Prints one table row per test that reached a final status; tests whose
        logs showed runtime errors are downgraded to FAIL before printing.
        """
        print(f"\n{Colors.BLUE}=== Detailed Regression Report ==={Colors.END}")
        # Validate statuses before generating report
        self.validate_all_test_statuses()
        # Print header
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        print(f"INFO: {timestamp}: {'+' * 15} REPORT {'+' * 15}")
        # Print table header
        print(f"INFO: {timestamp}: | status | test_name | seed | jobid | cpu_time | max_mem | procs |")
        # Process each test result - now PENDING status should be correctly updated
        final_status_results = [r for r in self.results.values() if r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]
        for result in final_status_results:
            # Get test info
            test_name = result.name
            seed = getattr(result, 'seed', 'unknown')
            job_id = getattr(result, 'job_id', 'unknown')
            # Get CPU time and memory info from actual data
            cpu_time, max_mem, procs = self.extract_job_statistics(result)
            # If runtime errors were detected, force status to FAIL for reporting and accounting
            if hasattr(result, 'error_detected') and result.error_detected and result.status in ["PASS", "RERUN PASS"]:
                result.status = "FAIL"
            # Format status with proper colors (after possible override)
            status = result.status
            if status == "PASS":
                status = f"{Colors.GREEN}PASS{Colors.END}"
            elif status == "RERUN PASS":
                status = f"{Colors.CYAN}RERUN PASS{Colors.END}"
            elif status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                status = f"{Colors.RED}FAIL{Colors.END}"
            else:
                status = f"{Colors.YELLOW}{status}{Colors.END}"
            # Add error detection info to status
            error_info = ""
            if hasattr(result, 'error_detected') and result.error_detected:
                error_info = " (running but had error)"
            # Print test result line
            print(f"INFO: {timestamp}: | {status} | {test_name} | {seed} | {job_id} | {cpu_time} | {max_mem} | {procs} |{error_info}")
        print(f"INFO: {timestamp}: {'+' * 15} END REPORT {'+' * 15}")
        print(f"Total unique tests reported: {len(final_status_results)}")

    def generate_error_summary_report(self):
        """Generate error summary report with UVM_ERROR and UVM_FATAL details.

        Failed tests are grouped by error syndrome extracted from their logs;
        `error_info` maps error type to {id, message, tests, count}.
        """
        print(f"\n{Colors.BLUE}=== Error Summary Report ==={Colors.END}")
        # Collect all error information
        error_info = {}
        failed_tests = []
        for result in self.results.values():
            # Treat "running but had error" as FAIL for summary as well
            if hasattr(result, 'error_detected') and result.error_detected and result.status in ["PASS", "RERUN PASS"]:
                result.status = "FAIL"
            if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                failed_tests.append(result)
                # Try to read log file for UVM_ERROR and UVM_FATAL
                self.analyze_log_for_errors(result, error_info)
        if not failed_tests:
            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: No failed tests found")
            return
        if not error_info:
            # Failures exist but no UVM syndrome was extracted — list raw log paths instead.
            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: Found {len(failed_tests)} failed tests but no UVM_ERROR or UVM_FATAL found in logs")
            print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: Failed tests with log paths:")
            for result in failed_tests:
                # Get the actual log file path
                log_path = self.get_test_log_path(result)
                if log_path:
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: [{result.status}] {log_path}")
                else:
                    print(f"INFO: {datetime.now().strftime('%m-%d %H:%M:%S')}: {result.name}: {result.status} - {result.error_msg} (No log file found)")
            return
        # Print error summary
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        print(f"INFO: {timestamp}: Total {len(failed_tests)} failure syndromes:")
        # Group errors by type
        error_count = 1
        for error_type, error_details in error_info.items():
            print(f"INFO: {timestamp}: ({error_count}) ERR ID:{error_details['id']}:")
            print(f"INFO: {timestamp}: MSG: \"{error_details['message']}\"")
            # Print error count if available (from log content before UVM Report catcher Summary)
            if 'count' in error_details:
                print(f"INFO: {timestamp}: Count: {error_details['count']} (from log content before UVM Report catcher Summary)")
            # Print associated test paths
            for test_path in error_details['tests']:
                print(f"INFO: {timestamp}: {test_path}")
            error_count += 1
    def analyze_log_for_errors(self, result, error_info):
        """Analyze log file for UVM_ERROR and UVM_FATAL messages ONLY in content
        before UVM Report catcher Summary.

        Mutates `error_info` in place, merging test paths for already-seen
        error types.
        """
        # Use the new get_test_log_path method to get the actual log file path
        log_file_path = self.get_test_log_path(result)
        if not log_file_path:
            print(f"Warning: No log file found for test {result.name}")
            return
        # Use the found log file path
        log_file_paths = [log_file_path]
        # Analyze each log file
        for log_file_path in log_file_paths:
            try:
                with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                # First, look for UVM Report catcher Summary section
                uvm_summary_errors = self.analyze_uvm_report_catcher_summary(content, log_file_path)
                for error_type, error_details in uvm_summary_errors.items():
                    if error_type not in error_info:
                        error_info[error_type] = error_details
                    else:
                        # Merge tests if error type already exists
                        for test_path in error_details['tests']:
                            if test_path not in error_info[error_type]['tests']:
                                error_info[error_type]['tests'].append(test_path)
                # Only use UVM Report catcher Summary analysis, skip direct pattern matching
                # to avoid capturing errors from UVM Report catcher Summary section
                pass
            except Exception as e:
                print(f"Warning: Could not analyze log file {log_file_path}: {e}")
                continue

    def analyze_uvm_report_catcher_summary(self, content, log_file_path):
        """Analyze log content before 'UVM Report catcher Summary' line for error information.

        Returns a dict keyed by 'UVM_ERROR'/'UVM_FATAL' with id/message/tests/count,
        or an empty dict when no summary marker is present.
        """
        error_info = {}
        # Split content into lines to find the UVM Report catcher Summary line
        lines = content.split('\n')
        summary_line_index = -1
        # Find the line containing "UVM Report catcher Summary"
        for i, line in enumerate(lines):
            if "UVM Report catcher Summary" in line:
                summary_line_index = i
                break
        if summary_line_index == -1:
            # No UVM Report catcher Summary found, return empty error_info
            return error_info
        # Extract content before the UVM Report catcher Summary line
        content_before_summary = '\n'.join(lines[:summary_line_index])
        # Look for UVM_ERROR and UVM_FATAL in the content before summary
        error_patterns = [
            r'UVM_ERROR\s*@\s*[^\n]*\s*:\s*([^\n]+)',
            r'UVM_FATAL\s*@\s*[^\n]*\s*:\s*([^\n]+)',
            r'UVM_ERROR\s+([^\n]+)',
            r'UVM_FATAL\s+([^\n]+)'
        ]
        # Collect all error messages found before the summary
        error_messages = []
        fatal_messages = []
        for pattern in error_patterns:
            matches = re.findall(pattern, content_before_summary, re.IGNORECASE)
            for match in matches:
                # Classify by the pattern that fired (or the text itself, since the
                # captured group excludes the UVM_* keyword for the first two patterns).
                if "UVM_ERROR" in pattern or "UVM_ERROR" in match:
                    error_messages.append(match.strip())
                elif "UVM_FATAL" in pattern or "UVM_FATAL" in match:
                    fatal_messages.append(match.strip())
        # Create error info structure
        if error_messages:
            # Use the first error message as representative
            error_message = error_messages[0]
            error_id = str(hash(error_message))[-8:]
            error_info['UVM_ERROR'] = {
                'id': error_id,
                'message': f"UVM_ERROR: {error_message}",
                'tests': [log_file_path],
                'count': len(error_messages)
            }
        if fatal_messages:
            # Use the first fatal message as representative
            fatal_message = fatal_messages[0]
            fatal_id = str(hash(fatal_message))[-8:]
            error_info['UVM_FATAL'] = {
                'id': fatal_id,
                'message': f"UVM_FATAL: {fatal_message}",
                'tests': [log_file_path],
                'count': len(fatal_messages)
            }
        return error_info

    def generate_regression_summary_info(self):
        """Generate regression summary info like Image 3.

        Prints paths, timing, pass/fail verdict, persists the error lists, and
        finally emits the comprehensive regression report.
        """
        print(f"\n{Colors.BLUE}=== Regression Summary Info ==={Colors.END}")
        # Validate statuses before generating summary
        self.validate_all_test_statuses()
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        # Generate regression seed
        regress_seed = random.randint(1000000000, 9999999999)
        print(f"INFO: {timestamp}: Regress Seed (rseed): {regress_seed}")
        # Test list path - use absolute path
        cur_path = os.getcwd()
        test_list_path = os.path.abspath(os.path.join(cur_path, "../def/json_list"))
        print(f"INFO: {timestamp}: Test list: {test_list_path}")
        # Failure list path - use absolute path
        failure_list_path = os.path.abspath(os.path.join(self.log_dir, "error.lst"))
        print(f"INFO: {timestamp}: Failure list: {failure_list_path}")
        # Regression report path - use absolute path
        regression_report_path = os.path.abspath(os.path.join(self.report_dir, "zregress_report.log"))
        print(f"INFO: {timestamp}: Regression report: {regression_report_path}")
        # End time
        end_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(f"INFO: {timestamp}: End Time: {end_time}")
        # Elapsed CPU time
        elapsed_time = time.time() - self.start_time
        hours = int(elapsed_time // 3600)
        minutes = int((elapsed_time % 3600) // 60)
        seconds = int(elapsed_time % 60)
        print(f"INFO: {timestamp}: Elapsed CPU Time: {hours}:{minutes:02d}:{seconds:02d}")
        # Determine regression result - use validated statuses
        total_tests = len(self.results)
        passed_tests = len([r for r in self.results.values() if r.status == "PASS"])
        failed_tests = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]])
        pending_tests = len([r for r in self.results.values() if r.status == "PENDING"])
        running_tests = len([r for r in self.results.values() if r.status == "RUNNING"])
        # If there are still pending or running tests, consider it incomplete
        if pending_tests > 0 or running_tests > 0:
            print(f"INFO: {timestamp}: ZREGRESS INCOMPLETE (Pending: {pending_tests}, Running: {running_tests})")
        elif failed_tests == 0:
            print(f"{Colors.GREEN}INFO: {timestamp}: ZREGRESS PASS{Colors.END}")
        else:
            print(f"{Colors.RED}INFO: {timestamp}: ZREGRESS FAIL{Colors.END}")
        # Print detailed status
        print(f"INFO: {timestamp}: Total Tests: {total_tests}")
        print(f"INFO: {timestamp}: Passed: {passed_tests}")
        print(f"INFO: {timestamp}: Failed: {failed_tests}")
        print(f"INFO: {timestamp}: Pending: {pending_tests}")
        print(f"INFO: {timestamp}: Running: {running_tests}")
        # Save error list to file and get saved paths
        saved_paths = self.save_error_list(failure_list_path)
        # Print saved file paths within regression summary
        if saved_paths:
            print(f"INFO: {timestamp}: Error list saved to: {saved_paths['error_lst']}")
            print(f"INFO: {timestamp}: Error JSON saved to: {saved_paths['error_json']}")
            print(f"INFO: {timestamp}: Failed regression list saved to: {saved_paths['failed_regression']}")
            # Print summary of failed tests if any
            failed_count = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]])
            if failed_count > 0:
                print(f"INFO: {timestamp}: Generated failed regression list with {failed_count} failed test cases")
                print(f"INFO: {timestamp}: You can re-run failed tests using: python3 regress.py --failed-regression {saved_paths['failed_regression']}")
        # End markers
        print(f"INFO: {timestamp}: {'+' * 30}")
        print(f"INFO: {timestamp}: {' ' * 10} ZREGRESS END {' ' * 10}")
        print(f"INFO: {timestamp}: {'+' * 30}")
        # Generate comprehensive regression report
        self.generate_comprehensive_regression_report(regression_report_path)
    def save_error_list(self, failure_list_path):
        """Save error list to file and return saved paths.

        Writes three artifacts next to `failure_list_path`: a plain-text name
        list (.lst), a detailed JSON (.json), and a re-runnable regression list
        (_regression.json). Returns {'error_lst', 'error_json',
        'failed_regression'} → path, or None on any write failure.
        """
        saved_paths = {}
        try:
            # Save error.lst (text format) - simple list of failed test names
            with open(failure_list_path, 'w', encoding='utf-8') as f:
                for result in self.results.values():
                    if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                        f.write(f"{result.name}\n")
            saved_paths['error_lst'] = failure_list_path
            # Save error.json (JSON format with full test case details)
            error_json_path = failure_list_path.replace('.lst', '.json')
            error_cases = []
            for result in self.results.values():
                if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                    # Find the original test case JSON data with proper opts handling
                    test_case_data = self.find_original_test_case_with_opts(result.name, result)
                    if not test_case_data:
                        # Fallback to original method
                        test_case_data = self.find_original_test_case(result.name)
                    if test_case_data:
                        error_case = {
                            'test_name': result.name,
                            'config': result.config,
                            'status': result.status,
                            'error_message': result.error_msg,
                            'duration': result.duration,
                            'seed': getattr(result, 'seed', 'unknown'),
                            'job_id': getattr(result, 'job_id', 'unknown'),
                            'original_test_case': test_case_data
                        }
                        error_cases.append(error_case)
            with open(error_json_path, 'w', encoding='utf-8') as f:
                json.dump(error_cases, f, indent=2, ensure_ascii=False)
            saved_paths['error_json'] = error_json_path
            # Save failed_regression.json - regression list format for failed tests only
            failed_regression_path = failure_list_path.replace('.lst', '_regression.json')
            failed_regression_cases = []
            for result in self.results.values():
                if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                    # Find the original test case JSON data with proper opts handling
                    test_case_data = self.find_original_test_case_with_opts(result.name, result)
                    if not test_case_data:
                        # Fallback to original method
                        test_case_data = self.find_original_test_case(result.name)
                    if test_case_data:
                        # Create a regression list entry with the same format as original test cases
                        # but with updated information from the failed run
                        failed_case = test_case_data.copy()  # Start with original test case data
                        # Update with actual run information
                        failed_case.update({
                            'actual_status': result.status,
                            'actual_error_message': result.error_msg,
                            'actual_duration': result.duration,
                            'actual_seed': getattr(result, 'seed', 'unknown'),
                            'actual_job_id': getattr(result, 'job_id', 'unknown'),
                            'log_file': getattr(result, 'log_file', ''),
                            'retry_count': getattr(result, 'retry_count', 0),
                            'failure_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        })
                        # Optionally adjust repeat count for retry (if retry was attempted)
                        if hasattr(result, 'retry_count') and result.retry_count > 0:
                            # Reduce repeat count by retry attempts to avoid over-running
                            original_repeat = failed_case.get('repeat', 1)
                            failed_case['repeat'] = max(1, original_repeat - result.retry_count)
                            failed_case['original_repeat'] = original_repeat
                        failed_regression_cases.append(failed_case)
            with open(failed_regression_path, 'w', encoding='utf-8') as f:
                json.dump(failed_regression_cases, f, indent=2, ensure_ascii=False)
            saved_paths['failed_regression'] = failed_regression_path
        except Exception as e:
            print(f"Warning: Could not save error list: {e}")
            return None
        return saved_paths

    def find_original_test_case_with_opts(self, test_name: str, result):
        """Find the original test case JSON data by test name using the exact opts array stored on the result.

        Important: Do NOT derive opts by splitting the unique key, because opts values
        may contain underscores (e.g., "DELAY_SET_OFF") which would be incorrectly split.
        Prefer `result.opts` when available.
        """
        try:
            # Prefer opts directly from the provided result object
            target_opts = []
            if hasattr(result, 'opts') and isinstance(result.opts, list):
                target_opts = result.opts
            else:
                # Fallback: search a matching TestResult and read its opts property
                for result_key, result_obj in self.results.items():
                    if result_obj.name == test_name and hasattr(result_obj, 'opts'):
                        target_opts = result_obj.opts or []
                        break
            original_target_opts = list(target_opts) if isinstance(target_opts, list) else []
            print(f"Looking for test case: {test_name} with opts: {original_target_opts}")
            original_case = self.find_original_test_case_by_name_and_opts(test_name, original_target_opts)
            if original_case:
                print(f"Found original test case with matching opts: {original_target_opts}")
                return original_case
            # Strict mode: fail fast when opts don't match
            # (the raise is caught below and converted to a None return).
            msg = f"Strict opts match failed for '{test_name}'. target_opts={original_target_opts}"
            print(msg)
            raise RuntimeError(msg)
        except Exception as e:
            print(f"Warning: Could not find original test case with opts for {test_name}: {e}")
            return None

    def find_original_test_case_by_name_and_opts(self, test_name: str, target_opts: list):
        """Find the original test case JSON data by test name and opts.

        STRICT MATCH MODE: Compare opts arrays exactly as listed in JSON.
        No token splitting or normalization. "DELAY_SET_OFF" only matches the same string.
        """
        try:
            # Load test cases from the original JSON files
            cur_path = os.getcwd()
            test_file_list_name = cur_path + "/../def/json_list"
            if not os.path.exists(test_file_list_name):
                print(f"Warning: Test file list not found: {test_file_list_name}")
                return None
            print(f"Looking for test case: {test_name} with opts: {target_opts}")
            print(f"Searching in test file list: {test_file_list_name}")
            with open(test_file_list_name, 'r') as f:
                for line in f:
                    if line.strip() and not line.startswith('#'):
                        file_path = cur_path + "/../def" + line.strip()
                        print(f"Checking file: {file_path}")
                        if os.path.exists(file_path):
                            with open(file_path, 'r') as json_file:
                                test_cases = json.load(json_file)
                                for case in test_cases:
                                    case_name = case.get('name', '')
                                    case_opts = case.get('opts', [])
                                    if case_name == test_name:
                                        # Check if opts match EXACTLY (strict mode)
                                        if case_opts == target_opts:
                                            print(f"Found test case '{test_name}' with matching opts {case_opts} in file: {file_path}")
                                            return case
                                        else:
                                            print(f"  Found test case '{test_name}' but opts don't match: expected {target_opts}, got {case_opts}")
                                    else:
                                        print(f"  Checking case: '{case_name}' vs '{test_name}'")
                        else:
                            print(f"Warning: Test file not found: {file_path}")
            print(f"Test case '{test_name}' with opts {target_opts} not found in any JSON files")
            return None
        except Exception as e:
            print(f"Warning: Could not find original test case by name and opts for {test_name}: {e}")
            return None

    def find_original_test_case(self, test_name):
        """Find the original test case JSON data by test name (first match wins)."""
        try:
            # Load test cases from the original JSON files
            cur_path = os.getcwd()
            test_file_list_name = cur_path + "/../def/json_list"
            if not os.path.exists(test_file_list_name):
                print(f"Warning: Test file list not found: {test_file_list_name}")
                return None
            print(f"Looking for test case: {test_name}")
            print(f"Searching in test file list: {test_file_list_name}")
            with open(test_file_list_name, 'r') as f:
                for line in f:
                    if line.strip() and not line.startswith('#'):
                        file_path = cur_path + "/../def" + line.strip()
                        print(f"Checking file: {file_path}")
                        if os.path.exists(file_path):
                            with open(file_path, 'r') as json_file:
                                test_cases = json.load(json_file)
                                for case in test_cases:
                                    case_name = case.get('name', '')
                                    if case_name == test_name:
                                        print(f"Found test case '{test_name}' in file: {file_path}")
                                        return case
                                    else:
                                        print(f"  Checking case: '{case_name}' vs '{test_name}'")
                        else:
                            print(f"Warning: Test file not found: {file_path}")
            print(f"Test case '{test_name}' not found in any JSON files")
            return None
        except Exception as e:
            print(f"Warning: Could not find original test case for {test_name}: {e}")
            return None

    def generate_comprehensive_regression_report(self, report_path):
        """Generate comprehensive regression report including all sections."""
        try:
            with open(report_path, 'w', encoding='utf-8') as f:
                # Write header
                f.write("=" * 80 + "\n")
                f.write("COMPREHENSIVE REGRESSION REPORT\n")
                f.write("=" * 80 + "\n\n")
                # Generate detailed regression report
                f.write("=== DETAILED REGRESSION REPORT ===\n")
                f.write(self.generate_detailed_regression_report_content())
                f.write("\n\n")
                # Generate error summary report
                f.write("=== ERROR SUMMARY REPORT ===\n")
                f.write(self.generate_error_summary_report_content())
                f.write("\n\n")
                # Generate regression summary info
                f.write("=== REGRESSION SUMMARY INFO ===\n")
                f.write(self.generate_regression_summary_info_content())
                f.write("\n\n")
                # Generate test status and log paths section
                f.write("=== TEST STATUS AND LOG PATHS ===\n")
                f.write(self.generate_test_status_and_log_paths_content())
                f.write("\n\n")
                # Write footer
                f.write("=" * 80 + "\n")
                f.write("END OF REPORT\n")
                f.write("=" * 80 + "\n")
            print(f"Comprehensive regression report saved to: {report_path}")
        except Exception as e:
            print(f"Warning: Could not generate comprehensive regression report: {e}")
    def show_concurrent_status(self, running_jobs: int, total_jobs: int, max_concurrent: int):
        """Show concurrent job status (currently disabled; kept for reference)."""
        # Commented out concurrent status printing as requested
        # timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        # if max_concurrent > 0:
        #     utilization = (running_jobs / max_concurrent) * 100
        #     print(f"INFO: {timestamp} Concurrent Status: {running_jobs}/{max_concurrent} jobs running ({utilization:.1f}% utilization)")
        # else:
        #     print(f"INFO: {timestamp} Concurrent Status: {running_jobs} jobs running (no limit)")
        #
        # if total_jobs > 0:
        #     progress = ((total_jobs - len(self.submitted_jobs)) / total_jobs) * 100
        #     print(f"INFO: {timestamp} Overall Progress: {progress:.1f}%")
        pass

    def get_total_test_cases_count(self):
        """Get the correct total test cases count, avoiding duplicates and transient states."""
        # Clean up duplicates first
        self._clean_duplicate_test_results()
        # Only count tests with final statuses, exclude transient PENDING/RUNNING states
        final_status_tests = []
        for result_key, result_obj in self.results.items():
            # Only count tests that have reached a final state
            if result_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]:
                final_status_tests.append(result_key)
        return len(final_status_tests)

    def show_regression_status(self, running_jobs: int, pending_jobs: int, total_test_cases: int):
        """Show regression status: running/pending/completed test cases."""
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        print(f"INFO: {timestamp} Regression Status: {running_jobs} running, {pending_jobs} pending, {total_test_cases} completed")

    def run(self):
        """Run regression test.

        Loads the JSON test lists, filters/convert cases, launches the
        compile+regression flow (LSF only), then reports and optionally
        auto-restarts.
        """
        print(f"{Colors.BOLD}CMN-700 UVM Regression Test{Colors.END}")
        print(f"Mode: {self.args.mode}")
        print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        # Display resource configuration
        if hasattr(self.args, 'memory') and self.args.memory is not None:
            print(f"Memory Reservation: {self.args.memory}GB per job (command line)")
        else:
            print(f"Memory Reservation: Not specified (using LSF default or JSON values)")
        cpu_cores = getattr(self.args, 'cpu_cores', 1)
        print(f"CPU Cores: {cpu_cores} per job")
        # Display coverage configuration
        if hasattr(self.args, 'cov') and self.args.cov:
            print(f"Coverage Type: {self.args.cov}")
        elif hasattr(self.args, 'coverage') and self.args.coverage:
            print(f"Coverage: Enabled (legacy mode)")
        else:
            print(f"Coverage: Disabled")
        print()
        # Initialize environment
        cur_path = os.getcwd()
        test_file_list_name = cur_path + "/../def/json_list"
        test_file_list = []
        # Load test file list
        with open(test_file_list_name, 'r') as f:
            for line in f:
                if line != '\n':
                    file_path = cur_path + "/../def" + line
                    # print(file_path)  # Commented out debug print
                    test_file_list.append(file_path.replace('\n', ''))
        # Load test cases
        test_cases = self.load_test_cases(test_file_list)
        # print(test_cases)  # Commented out debug print
        # Filter test cases by group (only if groups are specified)
        if self.args.groups:
            selected_cases = self.filter_cases(test_cases, self.args.groups)
        else:
            selected_cases = test_cases
        # Convert test cases to test configs format
        test_configs = []
        for case in selected_cases:
            # Extract test name and config from the case
            test_name = case.get('name', 'unknown')
            config = case.get('config', 'default')
            test_configs.append((test_name, config))
        # Save processed test list
        self.tests = test_configs
        print(f"Total loaded tests: {len(self.tests)}")
        # Set estimated time for each test
        # Note: TestResult objects will be created later in run_compile_and_regression
        # with the correct key format (test_name:config:seed:opts)
        print(f"TestResult objects will be created during job submission with proper key format")
        # Run tests based on mode
        if self.args.legacy_mode == "compile_regression":
            compile_success = self.run_compile_and_regression(
                str(self.args.dienum), self.args.rtl_ver, self.args.p2_mode, self.args.define
            )
            # If compilation failed, exit without running regression
            if not compile_success:
                print(f"{Colors.RED}Compilation failed! Exiting without running regression tests.{Colors.END}")
                return
        else:
            print(f"{Colors.YELLOW}Local regression mode is not supported. Please use LSF regression mode.{Colors.END}")
            return
        # Note: Retry is now handled immediately when tests fail
        # No need to run retry_failed_tests() here anymore
        if self.args.retry > 0:
            print(f"{Colors.YELLOW}Note: Retry is enabled and will be triggered immediately when tests fail{Colors.END}")
        # Merge coverage
        if self.args.coverage:
            self.merge_coverage()
        # Generate report
        self.generate_report()
        # Handle auto-restart logic
        self._handle_auto_restart()

    def _handle_auto_restart(self):
        """Handle auto-restart logic after regression completion.

        Supports immediate restart (auto_restart) and interval-based restart
        (restart_interval_hours), bounded by max_restarts. Restart re-enters
        run() recursively after resetting per-run state.
        """
        # Check if auto-restart is enabled
        if not self.auto_restart and self.restart_interval_hours is None:
            return
        # Check max restarts limit
        if self.max_restarts is not None and self.restart_count >= self.max_restarts:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"\n{Colors.YELLOW}Auto-restart limit reached ({self.max_restarts} restarts). Stopping.{Colors.END}")
            print(f"INFO: {timestamp} Total restarts: {self.restart_count}")
            return
        # Determine if we should restart
        should_restart = False
        restart_reason = ""
        if self.auto_restart:
            # Immediate restart after completion
            should_restart = True
            restart_reason = "auto-restart enabled"
        elif self.restart_interval_hours is not None:
            # Time-based restart
            current_time = time.time()
            elapsed_hours = (current_time - self.first_run_start_time) / 3600.0
            if elapsed_hours >= self.restart_interval_hours:
                should_restart = True
                restart_reason = f"restart interval reached ({self.restart_interval_hours} hours)"
            else:
                # Calculate wait time until next restart
                remaining_hours = self.restart_interval_hours - elapsed_hours
                remaining_minutes = int((remaining_hours - int(remaining_hours)) * 60)
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(f"\n{Colors.BLUE}INFO: {timestamp} Next auto-restart in {int(remaining_hours)}h {remaining_minutes}m{Colors.END}")
                # Wait until restart interval is reached
                wait_seconds = remaining_hours * 3600
                if wait_seconds > 0:
                    print(f"{Colors.BLUE}Waiting {int(remaining_hours)}h {remaining_minutes}m until next restart...{Colors.END}")
                    time.sleep(wait_seconds)
                should_restart = True
                restart_reason = f"restart interval reached ({self.restart_interval_hours} hours)"
        if should_restart:
            self.restart_count += 1
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"\n{Colors.CYAN}{'='*80}{Colors.END}")
            print(f"{Colors.CYAN}Auto-Restart #{self.restart_count} - {restart_reason}{Colors.END}")
            print(f"{Colors.CYAN}Time: {timestamp}{Colors.END}")
            print(f"{Colors.CYAN}{'='*80}{Colors.END}\n")
            # Create new regression directory for restart
            print(f"{Colors.BLUE}Creating new regression directory for restart #{self.restart_count}...{Colors.END}")
            self.regression_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            # Extract directory name from current simulation directory
            current_dir = os.path.basename(os.getcwd())
            # Extract xxx part from sim_xxx pattern
            if current_dir.startswith('sim_'):
                dir_suffix = current_dir[4:]  # Remove 'sim_' prefix
            else:
                dir_suffix = ''
            # Add restart count to directory name to make it unique
            self.regression_dir_name = f"regression_{dir_suffix}_{self.regression_timestamp}_restart{self.restart_count}"
            # Re-setup and create directories
            self._setup_directories()
            self._create_directories()
            # Update real-time report path for new directory
            self.real_time_report_path = self.report_dir / "zregress_report.log"
            print(f"{Colors.GREEN}New regression directory created: {self.regression_dir}{Colors.END}\n")
            # Reset some state for new run
            self.start_time = time.time()
            self.results = {}
            self.submitted_jobs = []
            self.submitted_results = []
            self.job_meta = {}
            self.running_jobs = 0
            self.pending_jobs = 0
            self.log_read_positions = {}
            self.log_last_update_times = {}
            # Reset error monitoring state
            self.last_error_monitor_time = time.time()
            # Reset status thread control
            self._stop_status_thread = False
            # For time-based restart, reset the first run start time to current time
            # so the next interval starts from now
            if self.restart_interval_hours is not None:
                self.first_run_start_time = time.time()
            # Small delay before restart
            time.sleep(2)
            # Recursively call run() to start new regression
            try:
                self.run()
            except KeyboardInterrupt:
                print(f"\n{Colors.YELLOW}User interrupted during auto-restart, cleaning up...{Colors.END}")
                self.cleanup()
                raise
            except Exception as e:
                print(f"{Colors.RED}Error during auto-restart: {e}{Colors.END}")
                self.cleanup()
                raise
def collect_transaction_and_cycle_stats(self) -> Tuple[int, int, int]:
    """Scan all simulation log files under the current regression log
    directory and sum their transaction and cycle counts.

    Returns:
        (total_transaction_count, total_cycle_count, counted_log_files)
        where counted_log_files is the number of logs that reported a
        transaction count.
    """
    root = Path(self.log_dir)
    if not root.exists():
        # Nothing to scan yet (e.g. regression has not produced logs).
        return 0, 0, 0

    txn_sum = 0
    cycle_sum = 0
    logs_with_txn = 0

    # Recursively walk every *.log file below the regression logs directory.
    for log_path in root.rglob("*.log"):
        try:
            txn = stat_transaction_count.extract_transaction_count(log_path)
            cyc = stat_transaction_count.extract_cycle_count(log_path)
        except Exception:
            # Unreadable/corrupt log: skip it rather than abort the scan.
            continue
        if txn is not None:
            txn_sum += txn
            logs_with_txn += 1
        if cyc is not None:
            cycle_sum += cyc

    return txn_sum, cycle_sum, logs_with_txn
def update_transaction_cycle_statistics(self):
    """Append this regression's transaction/cycle totals to a global CSV.

    Re-uses the stat_transaction_count helpers (via
    collect_transaction_and_cycle_stats) and maintains running cumulative
    totals across regressions.

    CSV columns:
        date,regression_dir,transaction_count,cycle_count,cumulative_transaction_count,cumulative_cycle_count,log_files_count
    """
    # Measure the current regression first.
    total_txn, total_cycles, counted_logs = self.collect_transaction_and_cycle_stats()

    # No log reported a transaction count: skip the write so the history
    # file does not accumulate meaningless rows.
    if counted_logs == 0:
        print(f"{Colors.YELLOW}Warning: No transaction/cycle information found under {self.log_dir}{Colors.END}")
        return

    history_path = Path("transaction_cycle_history.csv").resolve()

    cumulative_txn = 0
    cumulative_cycles = 0

    # Rebuild the cumulative totals by re-summing the per-run columns
    # (columns 2/3 hold each run's own counts) of the existing history.
    if history_path.exists():
        try:
            with history_path.open("r", encoding="utf-8") as f:
                reader = csv.reader(f)
                saw_header = False
                for row in reader:
                    if not saw_header:
                        # First row is the header line.
                        saw_header = True
                        continue
                    if len(row) < 5:
                        continue
                    try:
                        run_txn = int(row[2])
                        run_cycles = int(row[3])
                    except ValueError:
                        continue
                    cumulative_txn += run_txn
                    cumulative_cycles += run_cycles
        except Exception as e:
            print(f"{Colors.YELLOW}Warning: Failed to read existing history file {history_path}: {e}{Colors.END}")
            cumulative_txn = 0
            cumulative_cycles = 0

    # Fold the current regression into the cumulative totals.
    cumulative_txn += total_txn
    cumulative_cycles += total_cycles

    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    regression_dir_str = str(self.regression_dir.resolve())

    # Write the header first when the file does not exist yet.
    file_exists = history_path.exists()
    try:
        with history_path.open("a", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow([
                    "date",
                    "regression_dir",
                    "transaction_count",
                    "cycle_count",
                    "cumulative_transaction_count",
                    "cumulative_cycle_count",
                    "log_files_count",
                ])
            writer.writerow([
                now_str,
                regression_dir_str,
                total_txn,
                total_cycles,
                cumulative_txn,
                cumulative_cycles,
                counted_logs,
            ])
        print(
            f"{Colors.GREEN}Transaction/Cycle statistics updated. "
            f"txn={total_txn}, cycles={total_cycles}, "
            f"cumulative_txn={cumulative_txn}, cumulative_cycles={cumulative_cycles}{Colors.END}"
        )
    except Exception as e:
        print(f"{Colors.YELLOW}Warning: Failed to write transaction/cycle history to {history_path}: {e}{Colors.END}")

def check_compile_files_exist(self, output_dir: str, dienum: str, rtl_ver: str, mode: str) -> bool:
    """Return True when compile artifacts already exist under output_dir.

    dienum / rtl_ver / mode are accepted for interface compatibility but
    are not consulted here; the decision is purely filesystem-based.
    """
    try:
        # Typical VCS compile outputs / completion marker files.
        candidates = [
            f"{output_dir}/compile.log",
            f"{output_dir}/compile_ok",
            f"{output_dir}/compile.done",
            f"{output_dir}/simv",         # VCS executable
            f"{output_dir}/simv.daidir",  # VCS directory
            f"{output_dir}/csrc",         # VCS source directory
        ]
        existing_files = [f for f in candidates if os.path.exists(f)]
        timestamp = datetime.now().strftime('%m-%d %H:%M:%S')
        if existing_files:
            print(f"INFO: {timestamp} Found existing compile files:")
            for f in existing_files:
                print(f" - {f}")
            return True
        print(f"INFO: {timestamp} No existing compile files found")
        return False
    except Exception as e:
        print(f"{Colors.YELLOW}Warning: Error checking compile files: {e}{Colors.END}")
        return False

def should_skip_compile(self) -> bool:
    """Return True when the bypass argument requests skipping compilation."""
    if not hasattr(self.args, 'bypass') or self.args.bypass is None:
        return False

    # argparse may deliver the value bare or as a one-element list.
    bypass_value = self.args.bypass
    if isinstance(bypass_value, list):
        if not bypass_value:
            return False
        bypass_value = bypass_value[0]

    # Spellings that request a compile bypass; the default "0" means compile.
    return str(bypass_value).lower().strip() in ("1", "true", "yes", "skip", "bypass")
def _status_print_thread(self):
    """Background loop: every 5 s refresh LSF state, print the status
    summary, update the real-time report, and scan running tests for errors.
    Stops itself once every test has reached a final status."""
    while not self._stop_status_thread:
        time.sleep(5)  # Check every 5 seconds if status should be printed
        with self.lock:
            all_done = all(
                r.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]
                for r in self.results.values()
            )
            if all_done and (self.running_jobs == 0 and self.pending_jobs == 0):
                print(f"{Colors.GREEN}All tests completed, stopping status monitoring thread{Colors.END}")
                self._stop_status_thread = True
                break
        # Keep TestResult objects in sync with LSF, then report.
        self._update_lsf_job_statuses()
        self.print_status_summary()
        # Update real-time report in status thread
        self.update_real_time_report()
        # Monitor running tests for errors
        self.monitor_running_tests_for_errors()

def _update_lsf_job_statuses(self):
    """Synchronize TestResult objects with live LSF job states, handling
    PENDING <-> RUNNING transitions (including resource preemption)."""
    if not hasattr(self, 'submitted_jobs') or not self.submitted_jobs:
        return
    try:
        job_ids = [int(jid) for jid in self.submitted_jobs]
        if not job_ids:
            return
        # One batched query instead of per-job polling, for performance.
        status_map = self.batch_check_job_status(job_ids)
        for job_id, lsf_status in status_map.items():
            if lsf_status not in ("RUN", "PEND"):
                continue
            test_info = self.get_test_info_by_job_id(str(job_id))
            if not test_info:
                print(f"DEBUG: Could not get test info for job_id {job_id}")
                continue
            test_name = test_info['name']
            seed = test_info['seed']
            # Locate the matching TestResult by (name, seed).
            found_result = None
            for result_obj in self.results.values():
                if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed:
                    found_result = result_obj
                    break
            if found_result is None:
                print(f"DEBUG: Could not find TestResult for {test_name} seed={seed}")
                continue
            if lsf_status == "RUN" and found_result.status == "PENDING":
                # Job just started running.
                found_result.status = "RUNNING"
                if not found_result.start_time:
                    found_result.start()
                print(f"DEBUG: Status updated: {test_name} seed={seed} PENDING -> RUNNING (job_id: {job_id})")
            elif lsf_status == "PEND" and found_result.status == "RUNNING":
                # Job went back to pending (resource preemption, etc.).
                found_result.status = "PENDING"
                print(f"DEBUG: Status updated: {test_name} seed={seed} RUNNING -> PENDING (job_id: {job_id})")
    except Exception as e:
        print(f"Warning: Error updating LSF job statuses: {e}")

def estimate_completion_time(self):
    """Estimate regression completion.

    Returns:
        (completion_time_str, progress_percent, remaining_seconds)
    """
    now = time.time()
    final_states = ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]
    completed_tests = [r for r in self.results.values() if r.status in final_states]
    # NOTE(review): completed_time / running_time are computed but not used
    # below — kept for parity with the original behavior.
    completed_time = sum(r.duration for r in completed_tests)
    running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
    running_time = sum(now - r.start_time for r in running_tests if r.start_time)
    pending_tests = [r for r in self.results.values() if r.status == "PENDING"]
    pending_time = sum(r.estimated_duration for r in pending_tests)
    # Remaining = unfinished part of running tests + all pending estimates.
    running_remaining = sum(
        max(0, r.estimated_duration - (now - r.start_time))
        for r in running_tests if r.start_time
    )
    total_remaining = running_remaining + pending_time
    completion_time = datetime.fromtimestamp(now + total_remaining).strftime('%Y-%m-%d %H:%M:%S')
    # Count-based progress (more stable than time-based): completed tests
    # plus fractional credit for each running test.
    total_tests = len(self.results) if self.results else 0
    completed_count = len(completed_tests)
    if total_tests > 0:
        running_fraction_sum = 0.0
        for r in running_tests:
            if r.start_time and getattr(r, 'estimated_duration', 0) > 0:
                elapsed_r = max(0.0, now - r.start_time)
                est_r = max(1.0, float(r.estimated_duration))
                running_fraction_sum += min(elapsed_r / est_r, 1.0)
        progress = (completed_count + running_fraction_sum) / total_tests * 100.0
    else:
        progress = 0.0
    return completion_time, progress, total_remaining

def print_status_summary(self):
    """Print the periodic regression status summary (rate-limited)."""
    now = time.time()
    # status_print_interval is in seconds; force at least a 30-minute gap
    # between summaries (override any smaller configured value).
    thirty_minutes = 30 * 60
    effective_interval = max(self.status_print_interval, thirty_minutes)
    if now - self.last_status_print < effective_interval:
        return
    self.last_status_print = now
    # Tally every status bucket.
    total = len(self.results)
    pending = len([r for r in self.results.values() if r.status == "PENDING"])
    running = len([r for r in self.results.values() if r.status == "RUNNING"])
    passed = len([r for r in self.results.values() if r.status == "PASS"])
    rerun_passed = len([r for r in self.results.values() if r.status == "RERUN PASS"])
    failed = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL"]])
    errors = len([r for r in self.results.values() if r.status == "ERROR"])
    timeouts = len([r for r in self.results.values() if r.status == "TIMEOUT"])
    # Debug: status distribution and a few sample tests per transient bucket.
    print(f"DEBUG: Status distribution - PENDING: {pending}, RUNNING: {running}, PASS: {passed}, RERUN PASS: {rerun_passed}, FAIL: {failed}")
    if running > 0:
        running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
        print(f"DEBUG: Sample RUNNING tests: {[r.name for r in running_tests[:3]]}")
    if pending > 0:
        pending_tests = [r for r in self.results.values() if r.status == "PENDING"]
        print(f"DEBUG: Sample PENDING tests: {[r.name for r in pending_tests[:3]]}")
    # Wall-clock runtime so far.
    elapsed = now - self.start_time
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    completion_time, progress, remaining = self.estimate_completion_time()
    remaining_hours = int(remaining // 3600)
    remaining_minutes = int((remaining % 3600) // 60)
    print(f"\n{Colors.BOLD}=== Regression Status Summary (Runtime: {hours:02d}:{minutes:02d}:{seconds:02d}) ==={Colors.END}")
    print(f"Total Tests: {total}")
    print(f"{Colors.YELLOW}Pending: {pending}{Colors.END}")
    print(f"{Colors.BLUE}Running: {running}{Colors.END}")
    # CRITICAL FIX: report real LSF RUN/PEND counts from the queue itself,
    # not just our local bookkeeping.
    actual_running_jobs = 0
    actual_pending_jobs = 0
    if hasattr(self, 'submitted_jobs') and self.submitted_jobs:
        try:
            job_ids = [int(job_id) for job_id in self.submitted_jobs]
            if job_ids:
                status_map = self.batch_check_job_status(job_ids)
                for job_id, status in status_map.items():
                    if status == "RUN":
                        actual_running_jobs += 1
                    elif status == "PEND":
                        actual_pending_jobs += 1
        except Exception as e:
            print(f"Warning: Error calculating actual LSF status: {e}")
    print(f"{Colors.CYAN}LSF Status - RUN: {actual_running_jobs}, PEND: {actual_pending_jobs}{Colors.END}")
    print(f"{Colors.GREEN}Passed: {passed}{Colors.END}")
    print(f"{Colors.CYAN}Rerun Passed: {rerun_passed}{Colors.END}")
    print(f"{Colors.RED}Failed: {failed}{Colors.END}")
    print(f"{Colors.RED}Errors: {errors}{Colors.END}")
    print(f"{Colors.YELLOW}Timeouts: {timeouts}{Colors.END}")
    # Progress bar is count-based (passed / total).
    test_progress = ((passed + rerun_passed) / total) * 100.0 if total > 0 else 0.0
    progress_bar = self.generate_progress_bar(test_progress)
    print(f"\nProgress: {test_progress:.1f}% {progress_bar}")
    print(f"Estimated Remaining Time: {remaining_hours} hours {remaining_minutes} minutes")
    print(f"Estimated Completion Time: {completion_time}")
    # Show up to 5 currently running tests with their runtime.
    if running > 0:
        print(f"\n{Colors.BLUE}Running Tests:{Colors.END}")
        running_tests = [r for r in self.results.values() if r.status == "RUNNING"]
        for test in running_tests[:5]:
            elapsed = now - test.start_time if test.start_time else 0
            minutes = int(elapsed // 60)
            seconds = int(elapsed % 60)
            seed = getattr(test, 'seed', 'unknown')
            opts = getattr(test, 'opts', [])
            opts_str = "_".join(opts) if opts else "no_opts"
            status_info = ""
            if hasattr(test, 'error_detected') and test.error_detected:
                status_info = f" {Colors.RED}(running but had error){Colors.END}"
            print(f" {test.name} seed={seed} opts={opts_str} (Runtime: {minutes}m{seconds}s){status_info}")
        if len(running_tests) > 5:
            print(f" ... and {len(running_tests) - 5} other tests")
    # Show up to 5 recently failed tests with their log paths.
    if failed > 0 or errors > 0 or timeouts > 0:
        print(f"\n{Colors.RED}Recently Failed Tests:{Colors.END}")
        failed_tests = [r for r in self.results.values() if r.status in ["FAIL", "ERROR", "TIMEOUT"]]
        for test in failed_tests[-5:]:
            log_path = getattr(test, 'log_file', '') or ''
            log_part = f" log={log_path}" if log_path else ""
            print(f" {test.name}: {test.status}{log_part}")
    print()  # Empty line
def validate_all_test_statuses(self): """Validate and correct all test statuses before generating report""" current_time = time.time() # Limit validation frequency to avoid excessive calls if current_time - self._last_validation_time < 60: # Only validate once per minute return self._last_validation_time = current_time self._validation_count += 1 # print(f"Validating all test statuses... (validation #{self._validation_count})") # First check all TestResult objects with RUNNING status running_tests = [r for r in self.results.values() if r.status == "RUNNING"] if running_tests: print(f"Found {len(running_tests)} tests with RUNNING status, checking actual job status...") for result_obj in running_tests: if hasattr(result_obj, 'job_id') and result_obj.job_id: try: status = self.check_lsf_job_status(int(result_obj.job_id)) if status == "DONE": result_obj.finish("PASS", "") print(f" Status corrected: {result_obj.name} -> PASS") elif status in ["EXIT", "TERM", "KILL"]: result_obj.finish("FAIL", f"Job status: {status}") print(f" Status corrected: {result_obj.name} -> FAIL") elif status in ["RUN", "PEND", "WAIT", "SUSP"]: # Job is still running or pending, keep RUNNING status pass elif status == "UNKNOWN": # Job may have completed and been removed from queue, or still running # Only change status if we can definitively determine the result if hasattr(result_obj, 'log_file') and result_obj.log_file: if os.path.exists(result_obj.log_file): if self.check_test_result(result_obj.log_file): result_obj.finish("PASS", "") print(f" Status corrected: {result_obj.name} -> PASS (from log file)") else: result_obj.finish("FAIL", "Test failed (from log file)") print(f" Status corrected: {result_obj.name} -> FAIL (from log file)") else: # No log file yet, keep RUNNING status (job might still be running) pass else: # No log file info, keep RUNNING status (job might still be running) pass else: # Unknown LSF status, keep RUNNING status pass except Exception as e: print(f" Warning: Could not 
check status for {result_obj.name}: {e}") # Keep RUNNING status if we can't determine status (job might still be running) pass else: # No job_id, keep RUNNING status (job not yet submitted) pass # Now check all TestResult objects with PENDING status to see if they've completed pending_tests = [r for r in self.results.values() if r.status == "PENDING"] if pending_tests: print(f"Found {len(pending_tests)} tests with PENDING status, checking if they've completed...") for result_obj in pending_tests: if hasattr(result_obj, 'job_id') and result_obj.job_id: try: status = self.check_lsf_job_status(int(result_obj.job_id)) if status == "DONE": result_obj.finish("PASS", "") print(f" Status corrected: {result_obj.name} -> PASS (was PENDING)") elif status in ["EXIT", "TERM", "KILL"]: result_obj.finish("FAIL", f"Job status: {status}") print(f" Status corrected: {result_obj.name} -> FAIL (was PENDING)") elif status in ["RUN", "PEND", "WAIT", "SUSP"]: # If job is RUN, upgrade PENDING -> RUNNING; otherwise keep PENDING if status == "RUN": result_obj.status = "RUNNING" # Initialize start time if missing if not getattr(result_obj, 'start_time', None): result_obj.start() print(f" Status corrected: {result_obj.name} PENDING -> RUNNING") else: # Still pending/wait/suspend; keep PENDING print(f" {result_obj.name} still {status}") elif status == "UNKNOWN": # Job may have completed and been removed from queue # Try to check if log file exists and determine result if hasattr(result_obj, 'log_file') and result_obj.log_file: if os.path.exists(result_obj.log_file): if self.check_test_result(result_obj.log_file): result_obj.finish("PASS", "") print(f" Status corrected: {result_obj.name} -> PASS (from log file, was PENDING)") else: result_obj.finish("FAIL", "Test failed (from log file)") print(f" Status corrected: {result_obj.name} -> FAIL (from log file, was PENDING)") else: # No log file, keep PENDING status print(f" {result_obj.name} no log file - keeping PENDING status") else: # No log file 
info, keep PENDING status print(f" {result_obj.name} no log file info - keeping PENDING status") else: # Unknown status, keep PENDING print(f" {result_obj.name} unknown status {status} - keeping PENDING status") except Exception as e: print(f" Warning: Could not check status for {result_obj.name}: {e}") # Keep PENDING status if we can't determine status print(f" {result_obj.name} keeping PENDING status due to error") else: # No job_id, keep PENDING status print(f" {result_obj.name} no job_id - keeping PENDING status") # Final count final_running = len([r for r in self.results.values() if r.status == "RUNNING"]) final_pending = len([r for r in self.results.values() if r.status == "PENDING"]) final_passed = len([r for r in self.results.values() if r.status == "PASS"]) final_failed = len([r for r in self.results.values() if r.status in ["FAIL", "ERROR", "TIMEOUT"]]) def generate_detailed_regression_report_content(self): """Generate detailed regression report content as string""" output = [] # Validate statuses before generating report self.validate_all_test_statuses() # Print header timestamp = datetime.now().strftime('%m-%d %H:%M:%S') output.append(f"INFO: {timestamp}: {'+' * 15} REPORT {'+' * 15}") # Print table header output.append(f"INFO: {timestamp}: | status | test_name | seed | jobid | cpu_time | max_mem | procs |") # Process each test result for result in self.results.values(): # Get test info test_name = result.name seed = getattr(result, 'seed', 'unknown') job_id = getattr(result, 'job_id', 'unknown') # Get CPU time and memory info from actual data cpu_time, max_mem, procs = self.extract_job_statistics(result) # Handle PENDING status specially for file output if result.status == "PENDING": cpu_time = "-1|unknown" # Format status with proper colors (remove color codes for file output) status = result.status if status == "PASS": status = "PASS" elif status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: status = "FAIL" elif status == "RUNNING": status = 
"RUNNING" elif status == "PENDING": status = "PENDING" else: status = status # Print test result line output.append(f"INFO: {timestamp}: | {status} | {test_name} | {seed} | {job_id} | {cpu_time} | {max_mem} | {procs} |") output.append(f"INFO: {timestamp}: {'+' * 15} END REPORT {'+' * 15}") output.append(f"Total unique tests reported: {len(self.results)}") return "\n".join(output) def generate_error_summary_report_content(self): """Generate error summary report content as string""" output = [] # Collect all error information from log content before UVM Report catcher Summary and direct patterns error_info = {} failed_tests = [] for result in self.results.values(): if result.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]: failed_tests.append(result) # Analyze log file for errors in content before UVM Report catcher Summary self.analyze_log_for_errors(result, error_info) if not failed_tests: output.append("No UVM_ERROR or UVM_FATAL found in logs") else: if not error_info: output.append(f"Found {len(failed_tests)} failed tests but no UVM_ERROR or UVM_FATAL found in logs") for result in failed_tests: output.append(f" {result.name}: {result.status} - {result.error_msg}") else: output.append(f"Found {len(failed_tests)} failed tests with error details:") # Group errors by type error_count = 1 for error_type, error_details in error_info.items(): output.append(f"({error_count}) ERR ID:{error_details['id']}:") output.append(f"MSG: \"{error_details['message']}\"") # Print error count if available (from log content before UVM Report catcher Summary) if 'count' in error_details: output.append(f"Count: {error_details['count']} (from log content before UVM Report catcher Summary)") # Print associated test paths for test_path in error_details['tests']: output.append(f"{test_path}") error_count += 1 return "\n".join(output) def generate_regression_summary_info_content(self): """Generate regression summary info content as string""" output = [] # Validate statuses before 
generating report self.validate_all_test_statuses() # Count results total_tests = len(self.results) passed_tests = len([r for r in self.results.values() if r.status == "PASS"]) rerun_passed_tests = len([r for r in self.results.values() if r.status == "RERUN PASS"]) failed_tests = len([r for r in self.results.values() if r.status in ["FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"]]) pending_tests = len([r for r in self.results.values() if r.status == "PENDING"]) running_tests = len([r for r in self.results.values() if r.status == "RUNNING"]) # Print summary output.append(f"Running: {running_tests}") output.append(f"Pending: {pending_tests}") output.append(f"Passed: {passed_tests}") output.append(f"Rerun Passed: {rerun_passed_tests}") output.append(f"Failed: {failed_tests}") output.append(f"Total tests: {total_tests}") # Print detailed status output.append(f"Total Tests: {total_tests}") output.append(f"Passed: {passed_tests}") output.append(f"Rerun Passed: {rerun_passed_tests}") output.append(f"Failed: {failed_tests}") output.append(f"Pending: {pending_tests}") output.append(f"Running: {running_tests}") # Calculate pass rate if total_tests > 0: pass_rate = ((passed_tests + rerun_passed_tests) / total_tests) * 100 output.append(f"Pass Rate: {pass_rate:.1f}%") else: output.append("Pass Rate: 0.0%") # Determine overall status if pending_tests > 0 or running_tests > 0: output.append("Overall Status: INCOMPLETE") elif failed_tests == 0: output.append("Overall Status: PASS") else: output.append("Overall Status: FAIL") return "\n".join(output) def generate_test_status_and_log_paths_content(self): """Generate test status and log paths content for zregress_report.log""" output = [] # Validate statuses before generating report self.validate_all_test_statuses() # Get ALL test results (not just final statuses) all_results = list(self.results.values()) # Sort results by status priority: RUNNING, PENDING, FAIL/RERUN FAIL/ERROR/TIMEOUT, PASS, RERUN PASS def sort_key(result): 
status_priority = { "RUNNING": 0, "PENDING": 1, "FAIL": 2, "RERUN FAIL": 2, "ERROR": 2, "TIMEOUT": 2, "PASS": 3, "RERUN PASS": 3 } return status_priority.get(result.status, 4) all_results.sort(key=sort_key) # Print all test results with their log paths for result in all_results: log_path = self.get_test_log_path(result) error_info = "" if hasattr(result, 'error_detected') and result.error_detected: error_info = " (running but had error)" if log_path: output.append(f"[{result.status}]{error_info} {log_path}") else: # If no log file found, still show the test but indicate no log output.append(f"[{result.status}]{error_info} {result.name}: No log file found") return "\n".join(output) def _update_job_status_counts(self, status_changes: Dict[int, str]): """Update running_jobs and pending_jobs counts based on current LSF status""" # Reset counts new_running_count = 0 new_pending_count = 0 # Count jobs by status for job_id, status in status_changes.items(): if status == "RUN": new_running_count += 1 elif status == "PEND": new_pending_count += 1 # CRITICAL FIX: Also count retry jobs if they exist if hasattr(self, 'retry_results'): for retry_job_id, retry_result in self.retry_results.items(): if int(retry_job_id) not in status_changes: # Check status of retry job retry_status = self.check_lsf_job_status(int(retry_job_id)) if retry_status == "RUN": new_running_count += 1 elif retry_status == "PEND": new_pending_count += 1 # Update counts and TestResult objects for job_id, status in status_changes.items(): test_info = self.get_test_info_by_job_id(job_id) if test_info: test_name = test_info['name'] seed = test_info['seed'] # Find the TestResult object found_result = None for result_key, result_obj in self.results.items(): if result_obj.name == test_name and getattr(result_obj, 'seed', '') == seed: found_result = result_obj break if found_result: if status == "RUN" and found_result.status == "PENDING": # Job just started running found_result.status = "RUNNING" 
found_result.start() # Set start time self._reset_log_read_position(test_name, seed) timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp} [jobid {job_id}] {test_name} seed={seed} RUNNING") elif status == "PEND" and found_result.status == "RUNNING": # Job went back to pending (resource preemption, etc.) found_result.status = "PENDING" timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp} [jobid {job_id}] {test_name} seed={seed} PENDING") # Update global counts old_running = self.running_jobs old_pending = self.pending_jobs self.running_jobs = new_running_count self.pending_jobs = new_pending_count # Log count changes if significant if old_running != new_running_count or old_pending != new_pending_count: timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"INFO: {timestamp} Status Count Update: RUNNING {old_running} -> {new_running_count}, PENDING {old_pending} -> {new_pending_count}") def _clean_submitted_jobs(self): """Clean duplicate job IDs from submitted_jobs list""" if len(self.submitted_jobs) != len(set(self.submitted_jobs)): original_count = len(self.submitted_jobs) self.submitted_jobs = list(dict.fromkeys(self.submitted_jobs)) # Remove duplicates while preserving order cleaned_count = len(self.submitted_jobs) if original_count != cleaned_count: print(f"Cleaned duplicate job IDs: {original_count} -> {cleaned_count}") print(f"Removed {original_count - cleaned_count} duplicate job IDs") def _clean_duplicate_test_results(self): """Clean up duplicate TestResult objects based on name:config:seed combination""" print(f"Cleaning up duplicate test results...") original_count = len(self.results) # Create a mapping to track unique tests unique_tests = {} duplicates_to_remove = [] for result_key, result_obj in self.results.items(): # Create a unique identifier for each test test_identifier = f"{result_obj.name}:{result_obj.config}:{getattr(result_obj, 'seed', 'unknown')}" if test_identifier not in 
unique_tests: # First occurrence of this test unique_tests[test_identifier] = result_key else: # Duplicate found - keep the one with more complete information existing_key = unique_tests[test_identifier] existing_obj = self.results[existing_key] # Determine which one to keep (prefer PASS over FAIL/ERROR/TIMEOUT, then prefer final status over PENDING/RUNNING) if result_obj.status == "PASS" and existing_obj.status != "PASS": # Always keep PASS over any other status duplicates_to_remove.append(existing_key) unique_tests[test_identifier] = result_key elif existing_obj.status == "PASS" and result_obj.status != "PASS": # Keep existing PASS, remove new non-PASS duplicates_to_remove.append(result_key) elif result_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] and existing_obj.status in ["PENDING", "RUNNING"]: # Keep the new one with final status, remove the old one with transient status duplicates_to_remove.append(existing_key) unique_tests[test_identifier] = result_key elif existing_obj.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] and result_obj.status in ["PENDING", "RUNNING"]: # Keep the existing one with final status, remove the new one with transient status duplicates_to_remove.append(result_key) else: # Both have same status level, keep the one with more info if hasattr(result_obj, 'job_id') and result_obj.job_id and not (hasattr(existing_obj, 'job_id') and existing_obj.job_id): # New one has job_id, existing one doesn't duplicates_to_remove.append(existing_key) unique_tests[test_identifier] = result_key else: # Keep existing one duplicates_to_remove.append(result_key) # Remove duplicates for key in duplicates_to_remove: if key in self.results: del self.results[key] cleaned_count = len(self.results)
def monitor_running_tests_for_errors(self): """Monitor running tests for errors in their log files""" current_time = time.time() if current_time - self.last_error_monitor_time < self.error_monitor_interval: return self.last_error_monitor_time = current_time timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"\n{Colors.YELLOW}=== 30-Minute Error Monitoring Check ({timestamp}) ==={Colors.END}") # Get all running tests and filter out PEND jobs running_tests = [] for result in self.results.values(): if result.status == "RUNNING": # Check if job is actually running (not pending) job_id = getattr(result, 'job_id', None) if job_id: try: lsf_status = self.check_lsf_job_status(int(job_id)) if lsf_status == "RUN": running_tests.append(result) # Skip PEND jobs - they're waiting for resources, not actually running except Exception: # If we can't check LSF status, include it in monitoring running_tests.append(result) else: # No job_id, include in monitoring running_tests.append(result) if not running_tests: print(f" No actually running tests to monitor") # Check if all tests are completed and stop monitoring if so all_completed = all(result.status in ["PASS", "RERUN PASS", "FAIL", "RERUN FAIL", "ERROR", "TIMEOUT"] for result in self.results.values()) if all_completed and (self.running_jobs == 0 and self.pending_jobs == 0): print(f" {Colors.GREEN}All tests completed, stopping error monitoring{Colors.END}") self._stop_status_thread = True return print(f" Scanning {len(running_tests)} running tests for first-time errors...") errors_found_count = 0 for result in running_tests: if self._check_test_log_for_errors(result): errors_found_count += 1 if errors_found_count == 0: print(f" {Colors.GREEN}✓ No new errors detected in running tests{Colors.END}") else: print(f" {Colors.RED}⚠ Found {errors_found_count} new error(s) in running tests{Colors.END}") def _check_test_log_for_errors(self, result): """Check a specific test's log file for errors""" test_name = result.name seed = 
getattr(result, 'seed', 'unknown') # Get the log file path log_file_path = self.get_test_log_path(result) if not log_file_path or not os.path.exists(log_file_path): return False # Get or initialize the last read position for this log file log_key = f"{test_name}_{seed}" last_position = self.log_read_positions.get(log_key, 0) # Check if we've already reported errors for this test if hasattr(result, 'error_reported') and result.error_reported: return False # CRITICAL FIX: Only check for timeout if job is actually RUNNING # PEND jobs should not be subject to timeout detection as they're waiting for resources job_id = getattr(result, 'job_id', None) if job_id: try: lsf_status = self.check_lsf_job_status(int(job_id)) if lsf_status == "PEND": # Job is pending - check for PEND timeout if configured if self.pend_timeout_seconds is not None: now_ts = time.time() last_update_ts = self.log_last_update_times.get(log_key, now_ts) if now_ts - last_update_ts >= self.pend_timeout_seconds: mins = int(self.pend_timeout_seconds // 60) result.finish("TIMEOUT", f"Job pending for {mins} minutes (resource timeout)") print(f"\n{Colors.YELLOW}⏱{Colors.END} {test_name} seed={seed} TIMEOUT - pending for {mins} minutes") self.log_last_update_times[log_key] = now_ts return False # No PEND timeout configured or not yet reached - don't check for hang timeout return False except Exception: # If we can't check LSF status, assume it's running and proceed with timeout check pass try: with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f: # Seek to the last read position f.seek(last_position) # Read new content new_content = f.read() current_position = f.tell()
# Update last update time if file advanced, else check for hang now_ts = time.time() last_update_ts = self.log_last_update_times.get(log_key, now_ts) if current_position > last_position: # File advanced; update last update time self.log_last_update_times[log_key] = now_ts else: # No progress; if configured threshold without new lines, mark TIMEOUT (hung) # Only apply timeout to RUNNING jobs, not PEND jobs if now_ts - last_update_ts >= self.hang_timeout_seconds: mins = int(self.hang_timeout_seconds // 60) result.finish("TIMEOUT", f"No new log lines for {mins} minutes (assumed hang)") print(f"\n{Colors.YELLOW}⏱{Colors.END} {test_name} seed={seed} TIMEOUT - no log updates for {mins} minutes") # Reset tracking to avoid repeated triggers self.log_last_update_times[log_key] = now_ts # CRITICAL FIX: Trigger retry for TIMEOUT cases if getattr(self.args, 'retry', 0) > 0 and hasattr(result, 'job_id') and result.job_id: print(f"{Colors.CYAN}🚀 Triggering retry for TIMEOUT case {test_name} seed={seed}{Colors.END}") self._resubmit_from_stored_opcode(result.job_id) return False # Update the last read position self.log_read_positions[log_key] = current_position # Only scan content BEFORE 'UVM Report catcher Summary' to avoid false positives summary_idx = new_content.find('UVM Report catcher Summary') scan_text = new_content[:summary_idx] if summary_idx != -1 else new_content
# Check for error keywords in the scan_text first_error_found = None for keyword in self.error_keywords: if keyword in scan_text: # Find the first occurrence of this error keyword lines = scan_text.split('\n') for i, line in enumerate(lines): if keyword in line: # Skip report-summary style lines that are not real errors if 'UVM_' in keyword or 'UVM_' in line: if 'Number of' in line and 'reports' in line: continue # Get some context around the error start_line = max(0, i - 2) end_line = min(len(lines), i + 3) context = '\n'.join(lines[start_line:end_line]) first_error_found = { 'keyword': keyword, 'line': line.strip(), 'context': context } break if first_error_found: break if first_error_found: timestamp = datetime.now().strftime('%m-%d %H:%M:%S') print(f"\n{Colors.RED}🚨 FIRST ERROR DETECTED 🚨{Colors.END}") print(f"{Colors.RED}[{timestamp}] Test: {test_name} seed={seed} - running but had error{Colors.END}") print(f"{Colors.RED}Log File: {log_file_path}{Colors.END}") print(f"{Colors.RED}Error Type: {first_error_found['keyword']}{Colors.END}") print(f"{Colors.RED}Error Line: {first_error_found['line']}{Colors.END}") print(f"{Colors.RED}Error Context:{Colors.END}") for context_line in first_error_found['context'].split('\n'): print(f"{Colors.RED} {context_line}{Colors.END}") print(f"{Colors.RED}{'='*80}{Colors.END}") # Mark that we've reported the first error for this test result.error_reported = True result.error_detected = True result.first_error_details = first_error_found return True except Exception as e: print(f" Warning: Could not read log file {log_file_path}: {e}") return False def _reset_log_read_position(self, test_name, seed): """Reset the log read position for a test (when it starts running)""" log_key = f"{test_name}_{seed}" self.log_read_positions[log_key] = 0 def _cleanup_log_read_positions(self): """Clean up log read positions for completed tests""" completed_tests = [result for result in self.results.values() if result.status in ["PASS", "RERUN PASS", 
"FAIL", "ERROR", "TIMEOUT"]] for result in completed_tests: test_name = result.name seed = getattr(result, 'seed', 'unknown') log_key = f"{test_name}_{seed}" if log_key in self.log_read_positions: del self.log_read_positions[log_key] if hasattr(self, 'log_last_update_times') and log_key in self.log_last_update_times: del self.log_last_update_times[log_key] def save_error_monitor_state(self): """Save error monitoring state to file""" try: state_file = self.report_dir / "error_monitor_state.json" state_data = { 'log_read_positions': self.log_read_positions, 'last_error_monitor_time': self.last_error_monitor_time, 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } with open(state_file, 'w') as f: json.dump(state_data, f, indent=2) except Exception as e: print(f"Warning: Could not save error monitor state: {e}") def load_error_monitor_state(self): """Load error monitoring state from file""" try: state_file = self.report_dir / "error_monitor_state.json" if state_file.exists(): with open(state_file, 'r') as f: state_data = json.load(f) self.log_read_positions = state_data.get('log_read_positions', {}) self.last_error_monitor_time = state_data.get('last_error_monitor_time', time.time()) print(f"Loaded error monitor state from {state_file}") except Exception as e: print(f"Warning: Could not load error monitor state: {e}")
def parse_it_regress_alias(alias_file: str) -> List[Tuple[str, str]]:
    """Parse an it_regress.alias file into (sim_dir_name, cmd_string) pairs.

    Expected format (example):
        sim_1d:
            cmd: ./hregress.py -g full_1die --auto_restart ...

    Blank lines and '#' comments are ignored. A 'cmd:' line is attached to
    the most recent section header; a 'cmd:' with no preceding section, or
    an empty command, raises ValueError.
    """
    entries: List[Tuple[str, str]] = []
    section: Optional[str] = None
    with open(alias_file, "r", encoding="utf-8", errors="ignore") as fh:
        for raw in fh:
            stripped = raw.strip()
            # Skip blanks and comments.
            if not stripped or stripped.startswith("#"):
                continue
            if stripped.lower().startswith("cmd:"):
                # Command line: must follow a section header.
                if not section:
                    raise ValueError(f"Found 'cmd:' before any sim section in {alias_file}: {raw!r}")
                command = stripped.split(":", 1)[1].strip()
                if not command:
                    raise ValueError(f"Empty cmd for section {section} in {alias_file}")
                entries.append((section, command))
            elif stripped.endswith(":"):
                # Section header: sim_xxx:
                section = stripped[:-1].strip()
            # Anything else is silently ignored (matches original behavior).
    return entries
def _maybe_prefix_python(cmd_tokens: List[str], sim_dir: str) -> List[str]: """Best-effort: if entry is a .py under sim_dir but not executable, run via current python.""" if not cmd_tokens: return cmd_tokens first = cmd_tokens[0] if first.endswith(".py"): p = Path(sim_dir) / first try: if p.exists() and not os.access(str(p), os.X_OK): return [sys.executable, first] + cmd_tokens[1:] except Exception: pass return cmd_tokens
def submit_all_regress_from_alias(args) -> int:
    """When args.all_regress_en == 1: read alias file and submit regressions in each sim dir.

    Returns 0 when every regression finished OK (or the alias file had no
    entries), 1 otherwise (missing alias file, parse failure, missing/uncreatable
    sim dirs, launch failures, or any non-zero child return code).
    """
    alias_file = getattr(args, "alias_file", "it_regress.alias")
    sim_root = getattr(args, "all_regress_sim_root", ".")
    template_sim_dir = getattr(args, "template_sim_dir", "sim")

    # Normalize the alias file to an absolute path.
    alias_path = os.path.abspath(alias_file) if not os.path.isabs(alias_file) else alias_file

    # Base directory resolution:
    # - If user provides --all-regress-sim-root and it exists, use it.
    # - Otherwise, use alias file's directory (most common in flow: cd sim_xxx and run).
    # - Fallback to current working directory.
    sim_root_abs = os.path.abspath(sim_root) if sim_root else ""
    if sim_root_abs and os.path.isdir(sim_root_abs):
        base_dir = sim_root_abs
    else:
        base_dir = os.path.abspath(os.path.dirname(alias_path)) if alias_path else os.getcwd()

    if not os.path.exists(alias_path):
        print(f"{Colors.RED}Error: alias file not found: {alias_path}{Colors.END}")
        return 1

    try:
        items = parse_it_regress_alias(alias_path)
    except Exception as e:
        print(f"{Colors.RED}Error: failed to parse alias file {alias_path}: {e}{Colors.END}")
        return 1

    # An empty alias file is not an error — nothing to run.
    if not items:
        print(f"{Colors.YELLOW}Warning: no cmd entries found in {alias_path}{Colors.END}")
        return 0

    print(f"{Colors.BOLD}=== ALL REGRESS MODE (from alias) ==={Colors.END}")
    print(f"Alias file: {alias_path}")
    print(f"Base dir: {base_dir}")
    if sim_root_abs and os.path.isdir(sim_root_abs):
        print(f"Sim root: {sim_root_abs} (enabled)")
    else:
        print(f"Sim root: {sim_root_abs or '(not set)'} (ignored; using base dir)")
    template_abs = template_sim_dir if os.path.isabs(template_sim_dir) else os.path.join(base_dir, template_sim_dir)
    print(f"Template: {template_abs}")
    print(f"Found {len(items)} cmd entries")

    # Preparation phase: ensure each sim dir exists (creating it from the
    # template when missing) and tokenize each command.
    failed: List[Tuple[str, int]] = []
    jobs: List[Dict[str, str]] = []
    for sim_name, cmd_str in items:
        sim_dir = sim_name if os.path.isabs(sim_name) else os.path.join(base_dir, sim_name)
        if not os.path.isdir(sim_dir):
            # Auto-create sim_xxx from template 'sim' directory (cp -rf sim sim_xxx)
            template_path = template_sim_dir if os.path.isabs(template_sim_dir) else os.path.join(base_dir, template_sim_dir)
            if not os.path.isdir(template_path):
                print(f"{Colors.RED}Error: sim dir not found: {sim_dir}{Colors.END}")
                print(f"{Colors.RED}Error: template sim dir not found: {template_path}{Colors.END}")
                failed.append((sim_name, 127))
                continue
            ts = datetime.now().strftime("%m-%d %H:%M:%S")
            print(f"{Colors.YELLOW}INFO: {ts} [{sim_name}] sim dir missing, creating via: cp -rf {template_path} {sim_dir}{Colors.END}")
            try:
                # NOTE(review): relies on the external 'cp' binary (POSIX-only).
                r_cp = subprocess.run(["cp", "-rf", template_path, sim_dir], cwd=base_dir)
                if r_cp.returncode != 0 or not os.path.isdir(sim_dir):
                    print(f"{Colors.RED}INFO: {ts} [{sim_name}] create FAIL rc={r_cp.returncode}{Colors.END}")
                    failed.append((sim_name, int(r_cp.returncode) if r_cp.returncode is not None else 1))
                    continue
                print(f"{Colors.GREEN}INFO: {ts} [{sim_name}] created OK{Colors.END}")
            except Exception as e:
                print(f"{Colors.RED}INFO: {ts} [{sim_name}] create FAIL: {e}{Colors.END}")
                failed.append((sim_name, 1))
                continue

        try:
            tokens = shlex.split(cmd_str)
        except Exception as e:
            print(f"{Colors.RED}Error: shlex split failed for {sim_name} cmd={cmd_str!r}: {e}{Colors.END}")
            failed.append((sim_name, 2))
            continue

        # Prepend the current interpreter when the entry is a non-executable .py.
        tokens = _maybe_prefix_python(tokens, sim_dir)
        jobs.append({"name": sim_name, "sim_dir": sim_dir, "cmd_str": cmd_str, "tokens": tokens})

    if not jobs and failed:
        # All entries failed during the preparation phase
        print(f"\n{Colors.RED}=== ALL REGRESS MODE SUMMARY: FAIL (no job started) ==={Colors.END}")
        for name, rc in failed:
            print(f" - {name}: rc={rc}")
        return 1

    # Launch the hregress command of every sim_xxx in parallel
    procs: List[Tuple[Dict[str, str], subprocess.Popen]] = []
    for job in jobs:
        ts = datetime.now().strftime("%m-%d %H:%M:%S")
        print(f"INFO: {ts} [{job['name']}] START cwd={job['sim_dir']} cmd: {job['cmd_str']}")
        try:
            p = subprocess.Popen(
                job["tokens"],
                cwd=job["sim_dir"],
            )
            procs.append((job, p))
        except FileNotFoundError as e:
            # Command binary missing -> conventional "command not found" code.
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] start FAIL: {e}{Colors.END}")
            failed.append((job["name"], 127))
        except Exception as e:
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] start FAIL: {e}{Colors.END}")
            failed.append((job["name"], 1))

    # Wait for all parallel regressions to finish and collect return codes
    for job, p in procs:
        rc = p.wait()
        ts = datetime.now().strftime("%m-%d %H:%M:%S")
        if rc == 0:
            print(f"{Colors.GREEN}INFO: {ts} [{job['name']}] FINISH OK{Colors.END}")
        else:
            print(f"{Colors.RED}INFO: {ts} [{job['name']}] FINISH FAIL rc={rc}{Colors.END}")
            failed.append((job["name"], int(rc)))

    if failed:
        print(f"\n{Colors.RED}=== ALL REGRESS MODE SUMMARY: FAIL ==={Colors.END}")
        for name, rc in failed:
            print(f" - {name}: rc={rc}")
        return 1

    print(f"\n{Colors.GREEN}=== ALL REGRESS MODE SUMMARY: OK ==={Colors.END}")
    return 0
def parse_arguments():
    """Parse command-line arguments for the regression runner.

    Note: -g/--groups is declared optional here but is enforced in main()
    unless -lst/--list or --failed-regression is supplied.
    """
    parser = argparse.ArgumentParser(description="regress")
    # Required arguments (conditionally required based on other parameters)
    parser.add_argument("-g", "--groups", nargs="+", required=False, help="group tag (required unless using -lst/--list)")
    # Optional arguments
    parser.add_argument("-d", "--dienum", type=int, nargs="?", default=2, help="die num :1 to 4")
    parser.add_argument("-v", "--rtl_ver", nargs="?", default="STUB NOC_TOP", help="rtl vision")
    # parser.add_argument("-v", "--rtl_ver", nargs="?", default="FULL",
    #                     help="rtl vision")
    parser.add_argument("-m", "--mode", nargs="?", default="", help="mode")
    parser.add_argument("-def", "--define", type=str, nargs="?", help="rtl define marco")
    parser.add_argument("-q", "--queue", nargs="?", default="pron_normal", help="queue")
    parser.add_argument("--timestamp", nargs="*", help="add timestamp or not, use True or False")
    # NOTE(review): default is the string "1", not an int — downstream code
    # presumably compares against strings; confirm before changing.
    parser.add_argument("--bypass", nargs="*", default="1", help="bypass the pre_full_run: 0=compile, 1=skip compile if files exist")
    parser.add_argument("--wait-timeout", type=int, nargs="?", default=100, help="waiting timeout (h)")
    parser.add_argument('--max_concurrent', type=int, default=50, help='max concurrent job count')
    # Legacy arguments for backward compatibility
    parser.add_argument('--legacy-mode', choices=['compile_regression'], default='compile_regression', help='Legacy run mode: compile_regression(compile then run regression)')
    parser.add_argument('--timeout', type=int, default=60, help='Single test timeout (minutes) (default: 60)')
    parser.add_argument('--output-dir', default='.', help='Output directory for compile and regression (default: ./output)')
    parser.add_argument('--dir', default='.', help='Simulation output directory path (default: ./output)')
    parser.add_argument('--p2-mode', default='normal', help='P2 mode for compilation (default: normal)')
    parser.add_argument('--seed', default='random', help='Random seed (default: random)')
    # Waveform / coverage options
    parser.add_argument('--wave', action='store_true', help='Enable FSDB wave format (default: no wave)')
    parser.add_argument('--wave-on-fail', action='store_true', help='Generate wave file only when test fails')
    parser.add_argument('--coverage', action='store_true', help='Enable coverage collection')
    parser.add_argument('--cov', type=str, default=None, choices=['all', 'tgl', 'line', 'cond', 'fsm', 'branch', 'assert'], help='Coverage type: all, tgl, line, cond, fsm, branch, assert (default: None)')
    # VCS simulator tuning
    parser.add_argument('--vcs-optimize', action='store_true', default=False, help='Enable VCS optimization (parallel compilation and simulation) (default: disabled)')
    parser.add_argument('--vcs-cores', type=int, default=1, help='Number of cores for VCS parallel compilation/simulation (default: 1, single-threaded)')
    parser.add_argument('--vcs-xa', action='store_true', default=False, help='Enable VCS-XA acceleration (default: disabled)')
    # UVM / runtime options
    parser.add_argument('--verbosity', default='UVM_MEDIUM', choices=['UVM_NONE', 'UVM_LOW', 'UVM_MEDIUM', 'UVM_HIGH', 'UVM_FULL'], help='UVM verbosity level (default: UVM_MEDIUM)')
    parser.add_argument('--plusargs', default='', help='Additional plusargs parameters')
    parser.add_argument('--retry', type=int, default=1, help='Failed test retry count (default: 1)')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode')
    parser.add_argument('--keep-logs', action='store_true', help='Keep all log files')
    # Monitoring / timeout options (all intervals in minutes)
    parser.add_argument('--status-interval', type=int, default=5, help='Status print interval (minutes) (default: 5)')
    parser.add_argument('--error-monitor-interval', type=int, default=30, help='Error monitoring interval (minutes) (default: 30)')
    parser.add_argument('--hang-timeout-minutes', type=int, default=30, help='Timeout for no new log lines (minutes) (default: 30)')
    parser.add_argument('--pend-timeout-minutes', type=int, default=None, help='Timeout for PEND jobs (minutes). If not set, PEND jobs will wait indefinitely for resources (default: None)')
    # LSF resource requests
    parser.add_argument('--memory', type=int, default=None, help='Memory reservation in GB for LSF jobs (default: not specified, use LSF default)')
    parser.add_argument('--cpu-cores', type=int, default=1, help='CPU cores to request for LSF jobs (default: 1)')
    # Alternate run modes
    parser.add_argument('--failed-regression', type=str, default=None, help='Path to failed regression JSON file to re-run failed tests only')
    parser.add_argument('-lst', '--list', type=str, default=None, help='Path to JSON regression list file to run all test cases in the list')
    # Auto-restart controls
    parser.add_argument('--auto-restart', action='store_true', default=False, help='Automatically restart regression after completion (default: False)')
    parser.add_argument('--restart-interval-hours', type=float, default=None, help='Auto-restart interval in hours (e.g., 12.0 for 12 hours). If set, regression will restart after this interval (default: None)')
    parser.add_argument('--max-restarts', type=int, default=None, help='Maximum number of auto-restarts (default: None, unlimited)')
    # Multi-topology one-click mode
    parser.add_argument('--all-regress-en', type=int, default=0, help='Enable all-regress mode: when set to 1, read alias file and submit regress in each sim_xxx dir (default: 0)')
    parser.add_argument('--alias-file', type=str, default='it_regress.alias', help='Alias file path (it_regress.alias format) used when --all-regress-en=1')
    parser.add_argument('--all-regress-sim-root', type=str, default='.', help='Sim root dir that contains sim_xxx subdirs, used when --all-regress-en=1')
    parser.add_argument('--template-sim-dir', type=str, default='sim', help="Template sim directory name/path under sim-root. If sim_xxx doesn't exist, create it by running: cp -rf <template> <sim_xxx>")
    return parser.parse_args()
def main():
    """Top-level entry point: parse arguments, dispatch the run mode, and
    drive the regression runner with cleanup on interrupt/error."""
    args = parse_arguments()

    # All-regress mode short-circuits everything else: fan the alias file's
    # commands out into each sim_xxx directory and exit with its status.
    # (sys.exit raises SystemExit, which 'except Exception' does not catch.)
    try:
        if int(getattr(args, 'all_regress_en', 0)) == 1:
            sys.exit(int(submit_all_regress_from_alias(args)))
    except Exception as e:
        print(f"{Colors.RED}Error in all-regress mode: {e}{Colors.END}")
        sys.exit(1)

    # Mode dispatch: failed-regression replay, regression-list, or normal groups.
    if hasattr(args, 'failed_regression') and args.failed_regression:
        if not os.path.exists(args.failed_regression):
            print(f"{Colors.RED}Error: Failed regression file not found {args.failed_regression}{Colors.END}")
            sys.exit(1)
        print(f"Running failed tests from: {args.failed_regression}")
    elif hasattr(args, 'list') and args.list:
        # List files are resolved relative to ../def/case_def/; groups not required.
        regression_list_path = os.path.join(os.getcwd(), "..", "def", "case_def", args.list)
        if not os.path.exists(regression_list_path):
            print(f"{Colors.RED}Error: Regression list file not found {regression_list_path}{Colors.END}")
            sys.exit(1)
        print(f"Running tests from regression list: {regression_list_path}")
    else:
        # Normal mode requires -g/--groups and the json_list file.
        if not args.groups:
            print(f"{Colors.RED}Error: -g/--groups is required when not using -lst/--list or --failed-regression{Colors.END}")
            sys.exit(1)
        json_list_path = os.path.join(os.getcwd(), "../def/json_list")
        if not os.path.exists(json_list_path):
            print(f"{Colors.RED}Error: Test list file not found {json_list_path}{Colors.END}")
            sys.exit(1)

    # Run the regression, always cleaning up on interrupt or failure.
    runner = RegressionRunner(args)
    try:
        runner.run()
    except KeyboardInterrupt:
        print(f"\n{Colors.YELLOW}User interrupted, cleaning up...{Colors.END}")
        runner.cleanup()
        sys.exit(1)
    except Exception as e:
        print(f"{Colors.RED}Regression test exception: {e}{Colors.END}")
        runner.cleanup()
        sys.exit(1)
if __name__ == "__main__": main()
# NOTE(review): stray web-page footer text ("浙公网安备 33010602011771号")
# accidentally appended to the file — commented out because it is not valid Python.