diff --git a/schnellerereETL.py b/schnellerereETL.py index b5d88c7..e527142 100644 --- a/schnellerereETL.py +++ b/schnellerereETL.py @@ -3,9 +3,9 @@ import pandas as pd from pathlib import Path from collections import defaultdict -# ============================== + # KONFIGURATION -# ============================== + INPUT_JSON = "hetionet-v1.0.json" OUTPUT_DIR = Path("neo4j_csv") OUTPUT_DIR.mkdir(exist_ok=True) @@ -14,9 +14,8 @@ print("="*60) print("HETIONET ETL PIPELINE") print("="*60) -# ============================== # EXTRACT -# ============================== + print("\nPHASE 1: EXTRACTION") print("-"*60) print("Loading JSON data...") @@ -30,9 +29,8 @@ edges_raw = data["edges"] print(f"Nodes loaded: {len(nodes_raw):,}") print(f"Edges loaded: {len(edges_raw):,}") -# ============================== # TRANSFORM – NODES -# ============================== + print("\n PHASE 2: TRANSFORM NODES") print("-"*60) @@ -91,9 +89,8 @@ for kind in nodes_df["kind"].unique(): df_kind.to_csv(filename, index=False) print(f" {filename.name} ({len(df_kind):,} rows)") -# ============================== # TRANSFORM – EDGES -# ============================== + print("\nPHASE 3: TRANSFORM EDGES") print("-"*60) @@ -127,9 +124,9 @@ for edge_type in ['associates', 'treats', 'presents', 'causes', 'regulates', 'up if len(edges_by_type[edge_type]) > 0: print(f" - {edge_type}: {len(edges_by_type[edge_type]):,}") -# ============================== + # ANALYSES -# ============================== + print("\n" + "="*60) print("PHASE 4: ANALYSES") print("="*60) @@ -438,9 +435,7 @@ network_edges_df.to_csv(OUTPUT_DIR / "network_edges.csv", index=False) print(f"Network: {len(network_nodes_df)} nodes, {len(network_edges_df)} edges") -# ============================== -# SUMMARY -# ============================== + print("\n" + "="*60) print("ETL PIPELINE COMPLETED SUCCESSFULLY") print("="*60)