fix comments etc.
This commit is contained in:
50
etl.py
50
etl.py
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# KONFIGURATION
|
||||
# config
|
||||
|
||||
INPUT_JSON = "hetionet-v1.0.json"
|
||||
OUTPUT_DIR = Path("neo4j_csv")
|
||||
@@ -14,7 +14,7 @@ print("="*60)
|
||||
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
|
||||
print("="*60)
|
||||
|
||||
# EXTRACT
|
||||
# extract
|
||||
|
||||
print("\nPHASE 1: EXTRACTION")
|
||||
print("-"*60)
|
||||
@@ -29,7 +29,7 @@ edges_raw = data["edges"]
|
||||
print(f"Nodes loaded: {len(nodes_raw):,}")
|
||||
print(f"Edges loaded: {len(edges_raw):,}")
|
||||
|
||||
# TRANSFORM – NODES
|
||||
# transform – nodes
|
||||
|
||||
print("\nPHASE 2: TRANSFORM NODES")
|
||||
print("-"*60)
|
||||
@@ -48,7 +48,7 @@ for node in nodes_raw:
|
||||
|
||||
nodes_df = pd.DataFrame(nodes_flat)
|
||||
|
||||
# Spaltennamen Neo4j-sicher machen
|
||||
# make column names neo4j safe
|
||||
nodes_df.columns = (
|
||||
nodes_df.columns
|
||||
.str.replace(" ", "_")
|
||||
@@ -59,11 +59,11 @@ nodes_df.columns = (
|
||||
print(f"Processed {len(nodes_df):,} nodes")
|
||||
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
|
||||
|
||||
# Create lookup dictionaries
|
||||
# create lookup dictionaries
|
||||
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
|
||||
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
|
||||
|
||||
# Create sets for fast membership testing
|
||||
# create sets for fast membership testing
|
||||
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
|
||||
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
|
||||
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
|
||||
@@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}")
|
||||
print(f" - Compounds: {len(compound_ids):,}")
|
||||
print(f" - Side Effects: {len(sideeffect_ids):,}")
|
||||
|
||||
# Export nodes by type
|
||||
# export nodes by type
|
||||
print("\nExporting node files...")
|
||||
for kind in nodes_df["kind"].unique():
|
||||
df_kind = (
|
||||
@@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique():
|
||||
df_kind.to_csv(filename, index=False)
|
||||
print(f" {filename.name} ({len(df_kind):,} rows)")
|
||||
|
||||
# TRANSFORM – EDGES
|
||||
# transform edges
|
||||
|
||||
print("\nPHASE 3: TRANSFORM EDGES")
|
||||
print("-"*60)
|
||||
@@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw):
|
||||
|
||||
edges_df = pd.DataFrame(edges)
|
||||
|
||||
# Relationship-Typen Neo4j-sicher machen
|
||||
# make relationship types neo4j safe
|
||||
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
|
||||
|
||||
# split edges into separate files
|
||||
@@ -120,13 +120,13 @@ for edge_type in sorted(edge_types):
|
||||
edges_subset = edges_df[edges_df['type'] == edge_type]
|
||||
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
|
||||
|
||||
# Only export source and target (type is in filename)
|
||||
# only export source and target (type is in filename)
|
||||
edges_subset[['source', 'target']].to_csv(filename, index=False)
|
||||
|
||||
size_mb = filename.stat().st_size / (1024*1024)
|
||||
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
|
||||
|
||||
# Also keep the combined file for backward compatibility
|
||||
# also keep the combined file for backward compatibility
|
||||
edges_file = OUTPUT_DIR / "edges_all.csv"
|
||||
edges_df.to_csv(edges_file, index=False)
|
||||
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
|
||||
@@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}")
|
||||
print(f" Split into {len(edge_types)} separate CSV files")
|
||||
print(f" Each file can be loaded independently!")
|
||||
|
||||
# Pre-filter edges by type for analysis
|
||||
# pre-filter edges by type for analysis
|
||||
print("\nEdge type distribution:")
|
||||
edges_by_type = {}
|
||||
for edge_type in sorted(edge_types):
|
||||
@@ -145,14 +145,12 @@ for edge_type in sorted(edge_types):
|
||||
pct = 100 * count / len(edges_df)
|
||||
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
|
||||
|
||||
# [ANALYSES - keeping all the existing analysis code...]
|
||||
# (Keeping the same analysis code as before)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("PHASE 4: ANALYSES")
|
||||
print("="*60)
|
||||
|
||||
# ANALYSIS 1: HOTSPOT GENES
|
||||
# analysis 1: hotspot genes
|
||||
print("\nAnalysis 1: Hotspot Genes")
|
||||
print("-"*60)
|
||||
|
||||
@@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False)
|
||||
|
||||
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
|
||||
|
||||
# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY
|
||||
# analysis 2: disease symptom diversity
|
||||
print("\nAnalysis 2: Disease Symptom Diversity")
|
||||
print("-"*60)
|
||||
|
||||
@@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False)
|
||||
|
||||
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
|
||||
|
||||
# Build indices for drug analyses
|
||||
# build indices for drug analyses
|
||||
print("\nBuilding indices for drug analyses...")
|
||||
disease_to_genes = defaultdict(set)
|
||||
gene_to_diseases = defaultdict(set)
|
||||
@@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:")
|
||||
print(f" edges_associates.csv, edges_treats.csv, etc.")
|
||||
print(f" Instead of the combined edges_all.csv")
|
||||
|
||||
# ANALYSIS 3: DRUG REPURPOSING
|
||||
# analysis 3: drug repurposing
|
||||
print("\nAnalysis 3: Drug Repurposing Opportunities")
|
||||
print("-"*60)
|
||||
|
||||
@@ -273,7 +271,7 @@ if len(repurposing_df) > 0:
|
||||
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
|
||||
print(f"Found {len(repurposing_df):,} repurposing opportunities")
|
||||
|
||||
# ANALYSIS 4: POLYPHARMACY RISK
|
||||
# analysis 4: polypharmacy risk
|
||||
print("\nAnalysis 4: Polypharmacy Risk")
|
||||
print("-"*60)
|
||||
|
||||
@@ -294,7 +292,7 @@ if len(drug_sideeffects) > 0:
|
||||
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
|
||||
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
|
||||
|
||||
# ANALYSIS 5: SYMPTOM TRIANGLE
|
||||
# analysis 5: symptom triangle
|
||||
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
|
||||
print("-"*60)
|
||||
|
||||
@@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0:
|
||||
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
|
||||
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
|
||||
|
||||
# ANALYSIS 6: SUPER DRUGS
|
||||
# analysis 6: super drugs
|
||||
print("\nAnalysis 6: Super-Drug Score")
|
||||
print("-"*60)
|
||||
|
||||
@@ -352,7 +350,7 @@ if len(super_drugs_df) > 0:
|
||||
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
|
||||
print(f"Analyzed {len(super_drugs_df):,} drugs")
|
||||
|
||||
# ANALYSIS 7: DRUG CONFLICTS
|
||||
# analysis 7: drug conflicts
|
||||
print("\nAnalysis 7: Drug Conflicts")
|
||||
print("-"*60)
|
||||
|
||||
@@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0:
|
||||
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
|
||||
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
|
||||
|
||||
# ANALYSIS 8: NETWORK DATA
|
||||
# analysis 8: network data
|
||||
print("\nAnalysis 8: Network Visualization Data")
|
||||
print("-"*60)
|
||||
|
||||
@@ -401,7 +399,7 @@ network_edges = []
|
||||
node_id_counter = 0
|
||||
id_mapping = {}
|
||||
|
||||
# Add disease nodes
|
||||
# add disease nodes
|
||||
for disease_id in top_diseases:
|
||||
node_id = f"d_{node_id_counter}"
|
||||
id_mapping[disease_id] = node_id
|
||||
@@ -413,7 +411,7 @@ for disease_id in top_diseases:
|
||||
})
|
||||
node_id_counter += 1
|
||||
|
||||
# Add genes
|
||||
# add genes
|
||||
disease_genes = gene_disease_edges[
|
||||
gene_disease_edges['source'].isin(top_diseases)
|
||||
].head(150)
|
||||
@@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows():
|
||||
'type': 'associates'
|
||||
})
|
||||
|
||||
# Add drugs
|
||||
# add drugs
|
||||
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
|
||||
|
||||
for _, row in drug_treatments.iterrows():
|
||||
|
||||
Reference in New Issue
Block a user