# FRCMFD DATA ANALYSIS — Complete Data Extraction
import os
# Quick look at the entries sitting directly in the working directory.
for entry_name in os.listdir("/content"):
    print(entry_name)
#!/usr/bin/env python3
"""
FRCMFD DATA ANALYSIS — Complete Data Extraction & Summary
============================================================
This script:
1. Scans /content for all directories and files
2. Extracts all TAR archives recursively
3. Parses all JSON data files
4. Analyzes all NPY spectra files
5. Generates a comprehensive summary with tables and plots
Run this to see exactly what data you have collected.
============================================================
"""
import os
import sys
import json
import tarfile
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from collections import defaultdict
# ==============================================================================
# SCAN & EXTRACT FUNCTIONS
# ==============================================================================
def scan_directories(base_path="/content"):
    """Build a one-level inventory of the entries directly under ``base_path``.

    Returns:
        dict mapping each entry name to a description:
        directories -> ``{'type': 'directory', 'file_count': int, 'path': str}``
        where ``file_count`` counts regular files at any depth below it;
        everything else -> ``{'type': 'file', 'size_kb': float, 'path': str}``.
    """
    inventory = {}
    for entry in Path(base_path).iterdir():
        if not entry.is_dir():
            inventory[entry.name] = {
                'type': 'file',
                'size_kb': entry.stat().st_size / 1024,
                'path': str(entry),
            }
            continue

        # Recursive count of regular files below this directory.
        descendants = list(entry.rglob('*'))
        n_files = len([p for p in descendants if p.is_file()])
        inventory[entry.name] = {
            'type': 'directory',
            'file_count': n_files,
            'path': str(entry),
        }
    return inventory
def _ensure_members_inside(tar, dest_dir):
    """Raise ``tarfile.TarError`` if any member path resolves outside ``dest_dir``.

    Basic path-traversal guard used only on Python versions whose ``tarfile``
    has no extraction filters. It checks member names, not link targets.
    """
    root = os.path.realpath(dest_dir)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise tarfile.TarError(f"unsafe member path {member.name!r}")


def extract_tar_files(base_path="/content"):
    """Extract every ``*.tar`` archive found at any depth under ``base_path``.

    Each archive is unpacked into a sibling directory named after the archive
    without its suffix (``runs.tar`` -> ``runs/``). Archives that appear inside
    a freshly extracted directory are queued and extracted as well.

    Archives that cannot be opened, or that contain members which would be
    written outside their extraction directory, are reported on stdout and
    skipped; they never abort the scan.

    Returns:
        list of dicts ``{'file': archive name, 'extracted_to': str,
        'files': number of members}``, one per successfully extracted archive,
        in processing order.
    """
    base = Path(base_path)
    extracted = []

    # Extraction filters (PEP 706) exist only on newer / patched interpreters.
    has_filters = hasattr(tarfile, 'data_filter')

    # Snapshot the tree up front: extraction creates files under ``base``, and
    # mutating a directory tree while lazily walking it gives unpredictable
    # results. Newly extracted archives are appended to this queue explicitly.
    queue = sorted(base.rglob('*.tar'))
    seen = set(queue)

    position = 0
    while position < len(queue):
        tar_file = queue[position]
        position += 1
        try:
            print(f" Extracting: {tar_file.name}")
            extract_dir = tar_file.parent / tar_file.stem
            extract_dir.mkdir(exist_ok=True)
            with tarfile.open(tar_file, 'r') as tar:
                if has_filters:
                    # 'data' rejects absolute paths, ``..`` escapes, links
                    # pointing outside the destination, and device files.
                    tar.extractall(path=extract_dir, filter='data')
                else:
                    _ensure_members_inside(tar, extract_dir)
                    tar.extractall(path=extract_dir)
                # Must be read while the archive is still open.
                member_count = len(tar.getnames())
        except Exception as e:
            print(f" ⚠️ Could not extract {tar_file.name}: {e}")
            continue

        extracted.append({
            'file': tar_file.name,
            'extracted_to': str(extract_dir),
            'files': member_count
        })

        # Queue archives that only became visible through this extraction.
        for nested in sorted(extract_dir.rglob('*.tar')):
            if nested not in seen:
                seen.add(nested)
                queue.append(nested)
    return extracted
def parse_json_files(base_path="/content"):
    """Parse every ``*.json`` file under ``base_path`` and collect key metrics.

    Two layouts are recognised:

    * a non-empty list of per-step records ("run file"): metrics are taken
      from the first and last records (``step``, ``H_total``, ``kappa``/``κ``,
      and the ``H_kinetic``/``H_gradient``/``H_potential`` components);
    * a dict with a ``runs`` key ("summary file"): the run count plus the
      optional ``kappa_star`` and ``verdict`` fields.

    Any other content yields only the base ``file``/``path``/``type`` entry.
    Files that fail to load or process are reported on stdout and skipped.

    Returns:
        list of metric dicts, one per successfully processed file.
    """
    base = Path(base_path)
    data = []
    for json_file in base.rglob('*.json'):
        try:
            # Explicit UTF-8: keys such as 'κ' must not depend on the locale.
            with open(json_file, 'r', encoding='utf-8') as f:
                content = json.load(f)

            metrics = {
                'file': json_file.name,
                'path': str(json_file),
                'type': 'json'
            }

            # NOTE: for scalar top-level values (int, float, null) the ``in``
            # test raises TypeError, which the handler below reports.
            if 'step' in content or isinstance(content, list):
                if isinstance(content, list) and len(content) > 0:
                    first = content[0]
                    last = content[-1]
                    h_first = first.get('H_total')
                    h_last = last.get('H_total')

                    metrics['steps'] = last.get('step', 0)
                    metrics['H0'] = h_first
                    metrics['Hf'] = h_last

                    # Compare against None, not truthiness: an energy of
                    # exactly 0.0 is a valid measurement, not a missing one.
                    if h_first is not None and h_last is not None:
                        metrics['dH'] = h_last - h_first
                    else:
                        metrics['dH'] = None

                    # Relative drift is undefined for H0 == 0, so it stays None
                    # there; the 1e-30 floor guards against tiny non-zero H0.
                    if metrics['dH'] is not None and h_first != 0:
                        metrics['drift_pct'] = (metrics['dH'] / max(abs(h_first), 1e-30)) * 100
                    else:
                        metrics['drift_pct'] = None

                    # The coupling may be stored under an ASCII or Greek key.
                    if 'kappa' in first:
                        metrics['kappa'] = first['kappa']
                    elif 'κ' in first:
                        metrics['kappa'] = first['κ']

                    # Energy components are taken from the final record.
                    metrics['H_kinetic'] = last.get('H_kinetic', None)
                    metrics['H_gradient'] = last.get('H_gradient', None)
                    metrics['H_potential'] = last.get('H_potential', None)
            elif 'runs' in content:
                metrics['type'] = 'summary'
                metrics['runs'] = len(content.get('runs', []))
                if 'kappa_star' in content:
                    metrics['kappa_star'] = content['kappa_star']
                if 'verdict' in content:
                    metrics['verdict'] = content['verdict']

            data.append(metrics)
        except Exception as e:
            print(f" ⚠️ Could not parse {json_file.name}: {e}")
    return data
def analyze_npy_files(base_path="/content"):
    """Collect shape, dtype, on-disk size and basic statistics of ``*.npy`` files.

    Every ``*.npy`` file at any depth under ``base_path`` is loaded with
    ``np.load``. Files that cannot be loaded or reduced (for example empty
    arrays) are reported on stdout and left out of the result.

    Returns:
        list of dicts with keys ``file``, ``path``, ``shape``, ``dtype``,
        ``size_kb``, ``min``, ``max``, ``mean`` and ``std``.
    """
    reducers = (('min', np.min), ('max', np.max), ('mean', np.mean), ('std', np.std))
    records = []
    for array_path in Path(base_path).rglob('*.npy'):
        try:
            arr = np.load(array_path)
            record = {
                'file': array_path.name,
                'path': str(array_path),
                'shape': arr.shape,
                'dtype': str(arr.dtype),
                'size_kb': array_path.stat().st_size / 1024,
            }
            # Store each statistic as a plain Python float.
            for label, reduce_fn in reducers:
                record[label] = float(reduce_fn(arr))
            records.append(record)
        except Exception as e:
            print(f" ⚠️ Could not analyze {array_path.name}: {e}")
    return records
# ==============================================================================
# ANALYSIS & SUMMARY FUNCTIONS
# ==============================================================================
def summarize_runs(json_data):
    """Reduce parsed JSON metrics to the entries usable as run data points.

    An entry qualifies when its ``type`` is ``'json'``, it has a ``kappa`` key
    and it has a ``drift_pct`` value that is not ``None``.

    Returns:
        list of dicts with keys ``file``, ``kappa``, ``steps``, ``drift_pct``,
        ``dH``, ``H0``, ``Hf``, ``H_kinetic``, ``H_gradient``, ``H_potential``.
        Missing optional fields default to ``0`` (steps, dH, H0, Hf) or
        ``None`` (energy components).
    """
    def _is_run(entry):
        if entry.get('type') != 'json':
            return False
        if 'kappa' not in entry or 'drift_pct' not in entry:
            return False
        return entry['drift_pct'] is not None

    return [
        {
            'file': entry['file'],
            'kappa': entry['kappa'],
            'steps': entry.get('steps', 0),
            'drift_pct': entry['drift_pct'],
            'dH': entry.get('dH', 0),
            'H0': entry.get('H0', 0),
            'Hf': entry.get('Hf', 0),
            'H_kinetic': entry.get('H_kinetic', None),
            'H_gradient': entry.get('H_gradient', None),
            'H_potential': entry.get('H_potential', None),
        }
        for entry in json_data
        if _is_run(entry)
    ]
def print_summary(structure, extracted, json_data, npy_data, runs):
    """Print the full human-readable report to stdout.

    Args:
        structure: mapping of entry name -> info dict as produced by
            ``scan_directories`` (``type`` plus ``file_count`` or ``size_kb``).
        extracted: list of dicts with ``file``, ``extracted_to``, ``files``.
        json_data: list of parsed JSON metric dicts; entries with
            ``type == 'summary'`` may carry ``kappa_star``.
        npy_data: list of dicts with ``file``, ``shape``, ``size_kb``,
            ``min``, ``max``.
        runs: list of run dicts with ``file``, ``kappa``, ``steps``,
            ``drift_pct``, ``dH``.

    Missing (``None``) kappa, drift or dH values are rendered as ``N/A``
    rather than raising. Best and worst drift are both ranked by magnitude.
    """
    # Local import: only ``datetime`` is imported at file level.
    from datetime import timezone

    def _fmt(value, spec, width=0):
        """Format ``value`` with ``spec``, or right-align 'N/A' when it is None."""
        if value is None:
            return format("N/A", f">{width}") if width else "N/A"
        return format(value, spec)

    print("\n" + "="*80)
    print("📊 FRCMFD DATA ANALYSIS — COMPLETE SUMMARY")
    print("="*80)
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    print(f"Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*80)

    # 1. Directory Structure
    print("\n📁 DIRECTORY STRUCTURE")
    print("-"*60)
    for name, info in sorted(structure.items()):
        if info['type'] == 'directory':
            print(f" 📁 {name}/ ({info['file_count']} files)")
        else:
            print(f" 📄 {name} ({info['size_kb']:.1f} KB)")

    # 2. Extracted TARs
    print("\n📦 EXTRACTED TAR ARCHIVES")
    print("-"*60)
    if extracted:
        for archive in extracted:
            print(f" {archive['file']} → {archive['extracted_to']} ({archive['files']} files)")
    else:
        print(" No TAR files extracted")

    # 3. JSON Data Summary
    print("\n📄 JSON DATA FILES")
    print("-"*60)
    print(f" Total JSON files: {len(json_data)}")

    # Runs that actually carry a drift value; used for statistics and ranking.
    rated_runs = [r for r in runs if r['drift_pct'] is not None]

    # 4. Run Data Summary
    print("\n🏃 RUN DATA SUMMARY")
    print("-"*60)
    if runs:
        # Sort by kappa; runs without a kappa sort as if kappa were 0.
        runs_sorted = sorted(runs, key=lambda x: x['kappa'] if x['kappa'] is not None else 0)
        print(f"{'File':<35} | {'κ':>8} | {'Steps':>8} | {'Drift(%)':>12} | {'ΔH':>12}")
        print("-"*85)
        for r in runs_sorted[:20]:  # Show first 20
            file_short = r['file'][:35]
            kappa_str = _fmt(r['kappa'], '.4f')
            drift_str = _fmt(r['drift_pct'], '12.5f', 12)
            dh_str = _fmt(r['dH'], '12.6e', 12)
            print(f"{file_short:<35} | {kappa_str:>8} | {r['steps']:>8} | {drift_str} | {dh_str}")
        if len(runs_sorted) > 20:
            print(f" ... and {len(runs_sorted)-20} more runs")

        # Statistics
        drift_values = [r['drift_pct'] for r in rated_runs]
        if drift_values:
            print(f"\n Drift Statistics:")
            print(f" Min: {min(drift_values):.5f}%")
            print(f" Max: {max(drift_values):.5f}%")
            print(f" Mean: {np.mean(drift_values):.5f}%")
            print(f" Std: {np.std(drift_values):.5f}%")
    else:
        print(" No run data found in JSON files")

    # 5. NPY Spectra Summary
    print("\n📊 NPY SPECTRA FILES")
    print("-"*60)
    if npy_data:
        print(f" Total NPY files: {len(npy_data)}")
        print(f"{'File':<35} | {'Shape':<15} | {'Size(KB)':>10} | {'Min':>10} | {'Max':>10}")
        print("-"*85)
        for n in npy_data[:10]:
            file_short = n['file'][:35]
            shape_str = str(n['shape'])[:15]
            print(f"{file_short:<35} | {shape_str:<15} | {n['size_kb']:10.1f} | {n['min']:10.3e} | {n['max']:10.3e}")
        if len(npy_data) > 10:
            print(f" ... and {len(npy_data)-10} more NPY files")
    else:
        print(" No NPY files found")

    # 6. Key Findings
    print("\n🔑 KEY FINDINGS")
    print("-"*60)

    # κ* comes from the first summary entry that provides it.
    kappa_star = None
    for item in json_data:
        if item.get('type') == 'summary' and 'kappa_star' in item:
            kappa_star = item['kappa_star']
            break
    if kappa_star is not None:
        print(f" ✅ κ* (zero-drift coupling) ≈ {kappa_star:.6f}")

    if rated_runs:
        # Both extremes are ranked by |drift| so that a large negative drift
        # counts as bad, symmetric with how the best run is chosen.
        best_run = min(rated_runs, key=lambda x: abs(x['drift_pct']))
        worst_run = max(rated_runs, key=lambda x: abs(x['drift_pct']))
        print(f" ✅ Best drift: {best_run['drift_pct']:.5f}% at κ={_fmt(best_run['kappa'], '.4f')}")
        print(f" ⚠️ Worst drift: {worst_run['drift_pct']:.5f}% at κ={_fmt(worst_run['kappa'], '.4f')}")

    # 7. Summary Statistics
    print("\n📊 SUMMARY STATISTICS")
    print("-"*60)
    print(f" Directories: {len([s for s in structure.values() if s['type'] == 'directory'])}")
    print(f" Files: {len([s for s in structure.values() if s['type'] == 'file'])}")
    print(f" JSON files: {len(json_data)}")
    print(f" NPY files: {len(npy_data)}")
    print(f" Run data points: {len(runs)}")
    print(f" TAR archives extracted: {len(extracted)}")
    print("\n" + "="*80)
    print("✅ ANALYSIS COMPLETE")
    print("="*80)
# ==============================================================================
# PLOTTING FUNCTIONS
# ==============================================================================
def plot_drift_vs_kappa(runs, save_path=None):
    """Scatter-plot drift percentage against κ for the given runs.

    Runs whose ``kappa`` or ``drift_pct`` is ``None`` are ignored. Nothing is
    drawn when fewer than two usable points remain. When ``save_path`` is
    given the figure is written there before being shown.
    """
    if not runs:
        return

    points = [
        (run['kappa'], run['drift_pct'])
        for run in runs
        if run['kappa'] is not None and run['drift_pct'] is not None
    ]
    if len(points) < 2:
        return
    kappas = [kappa for kappa, _ in points]
    drifts = [drift for _, drift in points]

    plt.figure(figsize=(10, 6))
    plt.scatter(kappas, drifts, alpha=0.7, s=50)
    plt.xlabel('κ')
    plt.ylabel('Drift (%)')
    plt.title('Hamiltonian Drift vs Coupling Parameter κ')
    plt.grid(True, alpha=0.3)
    # Dashed reference line at zero drift.
    plt.axhline(y=0, color='r', linestyle='--', alpha=0.5)

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f" Plot saved to: {save_path}")
    plt.show()
def plot_energy_components(runs, save_path=None):
    """Plot mean kinetic, gradient and potential energy against κ.

    Only runs with all three components present are used. Values are grouped
    by ``kappa`` (runs without one are skipped) and averaged per group.
    Nothing is drawn unless at least two complete runs and at least two
    distinct κ values are available. When ``save_path`` is given the figure
    is written there before being shown.
    """
    component_keys = ('H_kinetic', 'H_gradient', 'H_potential')
    complete_runs = [
        run for run in runs
        if all(run.get(key) is not None for key in component_keys)
    ]
    if len(complete_runs) < 2:
        return

    # Collect the component values of every run sharing the same kappa.
    by_kappa = defaultdict(lambda: {'K': [], 'G': [], 'V': []})
    for run in complete_runs:
        kappa = run['kappa']
        if kappa is None:
            continue
        bucket = by_kappa[kappa]
        bucket['K'].append(run['H_kinetic'])
        bucket['G'].append(run['H_gradient'])
        bucket['V'].append(run['H_potential'])

    kappas = sorted(by_kappa.keys())
    K_avg = [np.mean(by_kappa[kappa]['K']) for kappa in kappas]
    G_avg = [np.mean(by_kappa[kappa]['G']) for kappa in kappas]
    V_avg = [np.mean(by_kappa[kappa]['V']) for kappa in kappas]
    if len(kappas) < 2:
        return

    plt.figure(figsize=(12, 6))
    series = (
        (K_avg, 'o-', 'Kinetic'),
        (G_avg, 's-', 'Gradient'),
        (V_avg, '^-', 'Potential'),
    )
    for averages, marker, label in series:
        plt.plot(kappas, averages, marker, label=label, linewidth=2)
    plt.xlabel('κ')
    plt.ylabel('Energy Component')
    plt.title('Energy Components vs Coupling Parameter κ')
    plt.legend()
    plt.grid(True, alpha=0.3)

    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f" Plot saved to: {save_path}")
    plt.show()
# ==============================================================================
# MAIN
# ==============================================================================
def main(base_path="/content"):
    """Run the complete pipeline: scan, extract, parse, analyze, report, plot, save.

    Args:
        base_path: directory to analyze. Plots and the JSON summary are
            written into this same directory. Defaults to ``/content`` so
            existing ``main()`` calls behave as before.

    Side effects:
        Prints the report to stdout, extracts TAR archives in place, shows
        and saves up to two PNG plots, and writes
        ``data_summary_<timestamp>.json`` under ``base_path``.
    """
    # Local import: only ``datetime`` is imported at file level.
    from datetime import timezone

    root = Path(base_path)
    rule = "="*80

    print(rule)
    print("🚀 FRCMFD DATA ANALYSIS — Complete Extraction & Summary")
    print(rule)
    print(f"Working directory: {base_path}")
    print(rule)

    # 1. Scan directories
    print("\n📁 Scanning directories...")
    structure = scan_directories(base_path)

    # 2. Extract TAR files
    print("\n📦 Extracting TAR archives...")
    extracted = extract_tar_files(base_path)

    # 3. Parse JSON files
    print("\n📄 Parsing JSON data...")
    json_data = parse_json_files(base_path)

    # 4. Analyze NPY files
    print("\n📊 Analyzing NPY spectra...")
    npy_data = analyze_npy_files(base_path)

    # 5. Summarize runs
    runs = summarize_runs(json_data)

    # 6. Print summary
    print_summary(structure, extracted, json_data, npy_data, runs)

    # 7. Generate plots
    print("\n📈 Generating plots...")
    # One UTC timestamp shared by every artefact of this run; this is the
    # timezone-aware replacement for the deprecated datetime.utcnow().
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    plot_drift_vs_kappa(runs, save_path=str(root / f"drift_vs_kappa_{timestamp}.png"))
    plot_energy_components(runs, save_path=str(root / f"energy_vs_kappa_{timestamp}.png"))

    # 8. Save summary to JSON
    summary_data = {
        'timestamp': timestamp,
        'directories': dict(structure),
        'extracted_tars': extracted,
        'json_files': json_data,
        'npy_files': npy_data,
        'runs': runs
    }
    summary_file = str(root / f"data_summary_{timestamp}.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        # default=float converts non-JSON-native numeric values to floats.
        json.dump(summary_data, f, indent=2, default=float)
    print(f"\n📁 Summary saved to: {summary_file}")

    print("\n" + rule)
    print("✅ ANALYSIS COMPLETE!")
    print(rule)
# Run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()