adjust comments
This commit is contained in:
@@ -3,9 +3,9 @@ import pandas as pd
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# KONFIGURATION
|
# KONFIGURATION
|
||||||
# ==============================
|
|
||||||
INPUT_JSON = "hetionet-v1.0.json"
|
INPUT_JSON = "hetionet-v1.0.json"
|
||||||
OUTPUT_DIR = Path("neo4j_csv")
|
OUTPUT_DIR = Path("neo4j_csv")
|
||||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||||
@@ -14,9 +14,8 @@ print("="*60)
|
|||||||
print("HETIONET ETL PIPELINE")
|
print("HETIONET ETL PIPELINE")
|
||||||
print("="*60)
|
print("="*60)
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# EXTRACT
|
# EXTRACT
|
||||||
# ==============================
|
|
||||||
print("\nPHASE 1: EXTRACTION")
|
print("\nPHASE 1: EXTRACTION")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
print("Loading JSON data...")
|
print("Loading JSON data...")
|
||||||
@@ -30,9 +29,8 @@ edges_raw = data["edges"]
|
|||||||
print(f"Nodes loaded: {len(nodes_raw):,}")
|
print(f"Nodes loaded: {len(nodes_raw):,}")
|
||||||
print(f"Edges loaded: {len(edges_raw):,}")
|
print(f"Edges loaded: {len(edges_raw):,}")
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# TRANSFORM – NODES
|
# TRANSFORM – NODES
|
||||||
# ==============================
|
|
||||||
print("\n PHASE 2: TRANSFORM NODES")
|
print("\n PHASE 2: TRANSFORM NODES")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -91,9 +89,8 @@ for kind in nodes_df["kind"].unique():
|
|||||||
df_kind.to_csv(filename, index=False)
|
df_kind.to_csv(filename, index=False)
|
||||||
print(f" {filename.name} ({len(df_kind):,} rows)")
|
print(f" {filename.name} ({len(df_kind):,} rows)")
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# TRANSFORM – EDGES
|
# TRANSFORM – EDGES
|
||||||
# ==============================
|
|
||||||
print("\nPHASE 3: TRANSFORM EDGES")
|
print("\nPHASE 3: TRANSFORM EDGES")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -127,9 +124,9 @@ for edge_type in ['associates', 'treats', 'presents', 'causes', 'regulates', 'up
|
|||||||
if len(edges_by_type[edge_type]) > 0:
|
if len(edges_by_type[edge_type]) > 0:
|
||||||
print(f" - {edge_type}: {len(edges_by_type[edge_type]):,}")
|
print(f" - {edge_type}: {len(edges_by_type[edge_type]):,}")
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# ANALYSES
|
# ANALYSES
|
||||||
# ==============================
|
|
||||||
print("\n" + "="*60)
|
print("\n" + "="*60)
|
||||||
print("PHASE 4: ANALYSES")
|
print("PHASE 4: ANALYSES")
|
||||||
print("="*60)
|
print("="*60)
|
||||||
@@ -438,9 +435,7 @@ network_edges_df.to_csv(OUTPUT_DIR / "network_edges.csv", index=False)
|
|||||||
|
|
||||||
print(f"Network: {len(network_nodes_df)} nodes, {len(network_edges_df)} edges")
|
print(f"Network: {len(network_nodes_df)} nodes, {len(network_edges_df)} edges")
|
||||||
|
|
||||||
# ==============================
|
|
||||||
# SUMMARY
|
|
||||||
# ==============================
|
|
||||||
print("\n" + "="*60)
|
print("\n" + "="*60)
|
||||||
print("ETL PIPELINE COMPLETED SUCCESSFULLY")
|
print("ETL PIPELINE COMPLETED SUCCESSFULLY")
|
||||||
print("="*60)
|
print("="*60)
|
||||||
|
|||||||
Reference in New Issue
Block a user