fix comments etc.

This commit is contained in:
Philipp Jacoby
2026-02-10 17:57:43 +01:00
parent 3003310be0
commit 8965b04a61
5 changed files with 59 additions and 60 deletions

50
etl.py
View File

@@ -4,7 +4,7 @@ from pathlib import Path
from collections import defaultdict
# KONFIGURATION
# config
INPUT_JSON = "hetionet-v1.0.json"
OUTPUT_DIR = Path("neo4j_csv")
@@ -14,7 +14,7 @@ print("="*60)
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
print("="*60)
# EXTRACT
# extract
print("\nPHASE 1: EXTRACTION")
print("-"*60)
@@ -29,7 +29,7 @@ edges_raw = data["edges"]
print(f"Nodes loaded: {len(nodes_raw):,}")
print(f"Edges loaded: {len(edges_raw):,}")
# TRANSFORM NODES
# transform nodes
print("\nPHASE 2: TRANSFORM NODES")
print("-"*60)
@@ -48,7 +48,7 @@ for node in nodes_raw:
nodes_df = pd.DataFrame(nodes_flat)
# Spaltennamen Neo4j-sicher machen
# make column names neo4j safe
nodes_df.columns = (
nodes_df.columns
.str.replace(" ", "_")
@@ -59,11 +59,11 @@ nodes_df.columns = (
print(f"Processed {len(nodes_df):,} nodes")
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
# Create lookup dictionaries
# create lookup dictionaries
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
# Create sets for fast membership testing
# create sets for fast membership testing
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
@@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}")
print(f" - Compounds: {len(compound_ids):,}")
print(f" - Side Effects: {len(sideeffect_ids):,}")
# Export nodes by type
# export nodes by type
print("\nExporting node files...")
for kind in nodes_df["kind"].unique():
df_kind = (
@@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique():
df_kind.to_csv(filename, index=False)
print(f" {filename.name} ({len(df_kind):,} rows)")
# TRANSFORM EDGES
# transform edges
print("\nPHASE 3: TRANSFORM EDGES")
print("-"*60)
@@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw):
edges_df = pd.DataFrame(edges)
# Relationship-Typen Neo4j-sicher machen
# make relationship types neo4j safe
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
# split edges into separate files
@@ -120,13 +120,13 @@ for edge_type in sorted(edge_types):
edges_subset = edges_df[edges_df['type'] == edge_type]
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
# Only export source and target (type is in filename)
# only export source and target (type is in filename)
edges_subset[['source', 'target']].to_csv(filename, index=False)
size_mb = filename.stat().st_size / (1024*1024)
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
# Also keep the combined file for backward compatibility
# also keep the combined file for backward compatibility
edges_file = OUTPUT_DIR / "edges_all.csv"
edges_df.to_csv(edges_file, index=False)
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
@@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}")
print(f" Split into {len(edge_types)} separate CSV files")
print(f" Each file can be loaded independently!")
# Pre-filter edges by type for analysis
# pre-filter edges by type for analysis
print("\nEdge type distribution:")
edges_by_type = {}
for edge_type in sorted(edge_types):
@@ -145,14 +145,12 @@ for edge_type in sorted(edge_types):
pct = 100 * count / len(edges_df)
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
# [ANALYSES - keeping all the existing analysis code...]
# (Keeping the same analysis code as before)
print("\n" + "="*60)
print("PHASE 4: ANALYSES")
print("="*60)
# ANALYSIS 1: HOTSPOT GENES
# analysis 1: hotspot genes
print("\nAnalysis 1: Hotspot Genes")
print("-"*60)
@@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False)
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY
# analysis 2: disease symptom diversity
print("\nAnalysis 2: Disease Symptom Diversity")
print("-"*60)
@@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False)
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
# Build indices for drug analyses
# build indices for drug analyses
print("\nBuilding indices for drug analyses...")
disease_to_genes = defaultdict(set)
gene_to_diseases = defaultdict(set)
@@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:")
print(f" edges_associates.csv, edges_treats.csv, etc.")
print(f" Instead of the combined edges_all.csv")
# ANALYSIS 3: DRUG REPURPOSING
# analysis 3: drug repurposing
print("\nAnalysis 3: Drug Repurposing Opportunities")
print("-"*60)
@@ -273,7 +271,7 @@ if len(repurposing_df) > 0:
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
print(f"Found {len(repurposing_df):,} repurposing opportunities")
# ANALYSIS 4: POLYPHARMACY RISK
# analysis 4: polypharmacy risk
print("\nAnalysis 4: Polypharmacy Risk")
print("-"*60)
@@ -294,7 +292,7 @@ if len(drug_sideeffects) > 0:
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
# ANALYSIS 5: SYMPTOM TRIANGLE
# analysis 5: symptom triangle
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
print("-"*60)
@@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0:
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
# ANALYSIS 6: SUPER DRUGS
# analysis 6: super drugs
print("\nAnalysis 6: Super-Drug Score")
print("-"*60)
@@ -352,7 +350,7 @@ if len(super_drugs_df) > 0:
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
print(f"Analyzed {len(super_drugs_df):,} drugs")
# ANALYSIS 7: DRUG CONFLICTS
# analysis 7: drug conflicts
print("\nAnalysis 7: Drug Conflicts")
print("-"*60)
@@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0:
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
# ANALYSIS 8: NETWORK DATA
# analysis 8: network data
print("\nAnalysis 8: Network Visualization Data")
print("-"*60)
@@ -401,7 +399,7 @@ network_edges = []
node_id_counter = 0
id_mapping = {}
# Add disease nodes
# add disease nodes
for disease_id in top_diseases:
node_id = f"d_{node_id_counter}"
id_mapping[disease_id] = node_id
@@ -413,7 +411,7 @@ for disease_id in top_diseases:
})
node_id_counter += 1
# Add genes
# add genes
disease_genes = gene_disease_edges[
gene_disease_edges['source'].isin(top_diseases)
].head(150)
@@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows():
'type': 'associates'
})
# Add drugs
# add drugs
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
for _, row in drug_treatments.iterrows():