fix comments etc.

This commit is contained in:
Philipp Jacoby
2026-02-10 17:57:43 +01:00
parent 3003310be0
commit 8965b04a61
5 changed files with 59 additions and 60 deletions

View File

@@ -473,19 +473,20 @@ Follow these steps exactly in the order provided:
python neo4j_etl.py
```
**Eingabe:** Das Skript wird Sie nacheinander nach Ihrem **Datenbank-Usernamen** (Standard: `neo4j`) und Ihrem **Passwort** fragen.
**Verarbeitung:** Das Skript liest automatisch alle Abfragen aus dem Verzeichnis `neo4jqueries/analysis_queries` aus.
**Ausgabe:** Die Ergebnisse der Analyse-Queries werden direkt in der Konsole ausgegeben.
**Input:** The script will ask you for your **database username** (default: `neo4j`) and your **password**.
**Processing:** The script automatically reads and executes Cypher queries in the following directory `neo4jqueries/analysis_queries`.
**Output:** Results of the analysis will be displayed on the terminal.
---
### Projektstruktur
### Structure
| Verzeichnis / Datei | Funktion |
| :---------------------------------- | :--------------------------------------------------------- |
| `neo4j_etl.py` | Das Python-Skript zur Ausführung der Analyse-Queries. |
| `neo4jqueries/loadingQueriesNeo4j/` | Enthält alle Cypher-Dateien für den initialen Datenimport. |
| `neo4jqueries/analysis_queries/` | Enthält Cypher-Dateien für die statistische Auswertung. |
| Directory / file | Functionality |
| :---------------------------------- | :----------------------------------------------------- |
| `neo4j_etl.py` | Python script for executing analysis queries. |
| `neo4jqueries/loadingQueriesNeo4j/` | Contains all Cypher files for the initial data import. |
| `neo4jqueries/analysis_queries/` | Includes Cypher files for analysis. |
| | |
---

View File

@@ -104,7 +104,7 @@ try:
else:
st.sidebar.warning("No drugs found")
# OVERVIEW PAGE
# overview page
if page == "Overview":
st.header("Dataset Overview")
@@ -114,7 +114,7 @@ try:
col3.metric("Repurposing Opportunities", f"{len(repurposing):,}")
col4.metric("Analyzed Drugs", f"{len(super_drugs):,}")
# STATISTICS BOXES
# statistics boxes
st.markdown("---")
st.subheader("Key Statistics")
@@ -163,7 +163,7 @@ try:
xaxis_title="Gene",
yaxis_title="Number of Diseases"
)
# Enable chart export
# enable chart export
config = {'displayModeBar': True, 'displaylogo': False}
st.plotly_chart(fig, use_container_width=True, config=config)
@@ -185,7 +185,7 @@ try:
csv = top_diseases.to_csv(index=False).encode('utf-8')
st.download_button("Download Data", csv, "top_diseases.csv", "text/csv")
# HOTSPOT GENES PAGE
# hotspot genes page
elif page == "Hotspot Genes":
st.header("Hotspot Genes - Most Disease Associations")
@@ -215,7 +215,7 @@ try:
csv = filtered_genes.to_csv(index=False).encode('utf-8')
st.download_button("Download Filtered Data", csv, "hotspot_genes.csv", "text/csv")
# DRUG REPURPOSING PAGE
# drug repurposing page
elif page == "Drug Repurposing":
st.header("Drug Repurposing Opportunities")
@@ -250,7 +250,7 @@ try:
csv = filtered.to_csv(index=False).encode('utf-8')
st.download_button("Download", csv, f"repurposing_{selected}.csv", "text/csv")
# POLYPHARMACY RISK PAGE
# polypharmacy risk page
elif page == "Polypharmacy Risk":
st.header("Polypharmacy Risk Analysis")
@@ -285,7 +285,7 @@ try:
csv = filtered_risk.to_csv(index=False).encode('utf-8')
st.download_button("Download Risk Data", csv, "polypharmacy_risk.csv", "text/csv")
# SYMPTOM TRIANGLE PAGE
# symptom triangle page
elif page == "Symptom Triangle":
st.header("Symptom-Disease-Drug Connections")
@@ -314,7 +314,7 @@ try:
csv = top_symptoms.to_csv(index=False).encode('utf-8')
st.download_button("Download Symptom Data", csv, "symptom_triangle.csv", "text/csv")
# SUPER DRUGS PAGE
# super drugs page
elif page == "Super Drugs":
st.header("Super-Drug Score (Best Benefit/Risk Ratio)")
@@ -352,7 +352,7 @@ try:
csv = filtered_super.to_csv(index=False).encode('utf-8')
st.download_button("Download Super Drugs", csv, "super_drugs.csv", "text/csv")
# DRUG CONFLICTS PAGE
# drug conflicts page
elif page == "Drug Conflicts":
st.header("Drug Conflicts - Overlapping Side Effects")
@@ -394,28 +394,28 @@ try:
else:
st.warning("Drug conflicts data not available. Run the ETL script to generate this analysis.")
# NETWORK GRAPH PAGE
# network graph page
elif page == "Network Graph":
st.header("Disease-Gene-Drug Network")
if network_nodes is not None and network_edges is not None:
st.info("Interactive network visualization showing connections between diseases, genes, and drugs")
# Create networkx graph
# create networkx graph
G = nx.Graph()
# Add nodes
# add nodes
for _, row in network_nodes.iterrows():
G.add_node(row['id'], label=row['label'], type=row['type'])
# Add edges
# add edges
for _, row in network_edges.iterrows():
G.add_edge(row['source'], row['target'])
# Create layout
# create layout
pos = nx.spring_layout(G, k=0.5, iterations=50)
# Create edge trace
# create edge trace
edge_x = []
edge_y = []
for edge in G.edges():
@@ -431,7 +431,7 @@ try:
mode='lines'
)
# Create node traces (separate by type for legend)
# create node traces (separate by type for legend)
node_traces = []
color_map = {
'Disease': '#ff4444',
@@ -466,7 +466,7 @@ try:
)
node_traces.append(node_trace)
# Create figure
# create figure
fig = go.Figure(data=[edge_trace] + node_traces,
layout=go.Layout(
title='Disease-Gene-Drug Network',
@@ -488,7 +488,7 @@ try:
else:
st.warning("Network data not available. Run the ETL script to generate this visualization.")
# COMPARE DRUGS PAGE
# compare drugs page
elif page == "Compare Drugs":
st.header("⚖️ Compare Drugs Side-by-Side")
@@ -522,7 +522,7 @@ try:
st.metric("Side Effects", int(drug2_data['num_side_effects']))
st.metric("Super Score", f"{drug2_data['super_score']:.2f}")
# Comparison chart
# comparison chart
comparison_df = pd.DataFrame({
'Metric': ['Diseases Treated', 'Side Effects', 'Super Score'],
drug1: [drug1_data['num_diseases_treated'], drug1_data['num_side_effects'], drug1_data['super_score']],
@@ -539,7 +539,7 @@ try:
config = {'displayModeBar': True, 'displaylogo': False}
st.plotly_chart(fig, use_container_width=True, config=config)
# Winner determination
# winner determination
st.markdown("---")
st.subheader("Recommendation")

50
etl.py
View File

@@ -4,7 +4,7 @@ from pathlib import Path
from collections import defaultdict
# KONFIGURATION
# config
INPUT_JSON = "hetionet-v1.0.json"
OUTPUT_DIR = Path("neo4j_csv")
@@ -14,7 +14,7 @@ print("="*60)
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
print("="*60)
# EXTRACT
# extract
print("\nPHASE 1: EXTRACTION")
print("-"*60)
@@ -29,7 +29,7 @@ edges_raw = data["edges"]
print(f"Nodes loaded: {len(nodes_raw):,}")
print(f"Edges loaded: {len(edges_raw):,}")
# TRANSFORM NODES
# transform nodes
print("\nPHASE 2: TRANSFORM NODES")
print("-"*60)
@@ -48,7 +48,7 @@ for node in nodes_raw:
nodes_df = pd.DataFrame(nodes_flat)
# Spaltennamen Neo4j-sicher machen
# make column names neo4j safe
nodes_df.columns = (
nodes_df.columns
.str.replace(" ", "_")
@@ -59,11 +59,11 @@ nodes_df.columns = (
print(f"Processed {len(nodes_df):,} nodes")
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
# Create lookup dictionaries
# create lookup dictionaries
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
# Create sets for fast membership testing
# create sets for fast membership testing
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
@@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}")
print(f" - Compounds: {len(compound_ids):,}")
print(f" - Side Effects: {len(sideeffect_ids):,}")
# Export nodes by type
# export nodes by type
print("\nExporting node files...")
for kind in nodes_df["kind"].unique():
df_kind = (
@@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique():
df_kind.to_csv(filename, index=False)
print(f" {filename.name} ({len(df_kind):,} rows)")
# TRANSFORM EDGES
# transform edges
print("\nPHASE 3: TRANSFORM EDGES")
print("-"*60)
@@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw):
edges_df = pd.DataFrame(edges)
# Relationship-Typen Neo4j-sicher machen
# make relationship types neo4j safe
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
# split edges into separate files
@@ -120,13 +120,13 @@ for edge_type in sorted(edge_types):
edges_subset = edges_df[edges_df['type'] == edge_type]
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
# Only export source and target (type is in filename)
# only export source and target (type is in filename)
edges_subset[['source', 'target']].to_csv(filename, index=False)
size_mb = filename.stat().st_size / (1024*1024)
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
# Also keep the combined file for backward compatibility
# also keep the combined file for backward compatibility
edges_file = OUTPUT_DIR / "edges_all.csv"
edges_df.to_csv(edges_file, index=False)
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
@@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}")
print(f" Split into {len(edge_types)} separate CSV files")
print(f" Each file can be loaded independently!")
# Pre-filter edges by type for analysis
# pre-filter edges by type for analysis
print("\nEdge type distribution:")
edges_by_type = {}
for edge_type in sorted(edge_types):
@@ -145,14 +145,12 @@ for edge_type in sorted(edge_types):
pct = 100 * count / len(edges_df)
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
# [ANALYSES - keeping all the existing analysis code...]
# (Keeping the same analysis code as before)
print("\n" + "="*60)
print("PHASE 4: ANALYSES")
print("="*60)
# ANALYSIS 1: HOTSPOT GENES
# analysis 1: hotspot genes
print("\nAnalysis 1: Hotspot Genes")
print("-"*60)
@@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False)
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY
# analysis 2: disease symptom diversity
print("\nAnalysis 2: Disease Symptom Diversity")
print("-"*60)
@@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False)
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
# Build indices for drug analyses
# build indices for drug analyses
print("\nBuilding indices for drug analyses...")
disease_to_genes = defaultdict(set)
gene_to_diseases = defaultdict(set)
@@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:")
print(f" edges_associates.csv, edges_treats.csv, etc.")
print(f" Instead of the combined edges_all.csv")
# ANALYSIS 3: DRUG REPURPOSING
# analysis 3: drug repurposing
print("\nAnalysis 3: Drug Repurposing Opportunities")
print("-"*60)
@@ -273,7 +271,7 @@ if len(repurposing_df) > 0:
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
print(f"Found {len(repurposing_df):,} repurposing opportunities")
# ANALYSIS 4: POLYPHARMACY RISK
# analysis 4: polypharmacy risk
print("\nAnalysis 4: Polypharmacy Risk")
print("-"*60)
@@ -294,7 +292,7 @@ if len(drug_sideeffects) > 0:
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
# ANALYSIS 5: SYMPTOM TRIANGLE
# analysis 5: symptom triangle
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
print("-"*60)
@@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0:
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
# ANALYSIS 6: SUPER DRUGS
# analysis 6: super drugs
print("\nAnalysis 6: Super-Drug Score")
print("-"*60)
@@ -352,7 +350,7 @@ if len(super_drugs_df) > 0:
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
print(f"Analyzed {len(super_drugs_df):,} drugs")
# ANALYSIS 7: DRUG CONFLICTS
# analysis 7: drug conflicts
print("\nAnalysis 7: Drug Conflicts")
print("-"*60)
@@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0:
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
# ANALYSIS 8: NETWORK DATA
# analysis 8: network data
print("\nAnalysis 8: Network Visualization Data")
print("-"*60)
@@ -401,7 +399,7 @@ network_edges = []
node_id_counter = 0
id_mapping = {}
# Add disease nodes
# add disease nodes
for disease_id in top_diseases:
node_id = f"d_{node_id_counter}"
id_mapping[disease_id] = node_id
@@ -413,7 +411,7 @@ for disease_id in top_diseases:
})
node_id_counter += 1
# Add genes
# add genes
disease_genes = gene_disease_edges[
gene_disease_edges['source'].isin(top_diseases)
].head(150)
@@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows():
'type': 'associates'
})
# Add drugs
# add drugs
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
for _, row in drug_treatments.iterrows():

View File

@@ -1,4 +1,4 @@
LOAD CSV WITH HEADERS FROM 'file:///edges_treats.csv' AS row
LOAD CSV WITH HEADERS FROM 'file:///edges_associates.csv' AS row
MATCH (source {id: row.source})
MATCH (target {id: row.target})
CREATE (source)-[:TREATS]->(target);
CREATE (source)-[:ASSOCIATES]->(target);

View File

@@ -1,4 +1,4 @@
LOAD CSV WITH HEADERS FROM 'file:///edges_causes.csv' AS row
LOAD CSV WITH HEADERS FROM 'file:///edges_upregulates.csv' AS row
MATCH (source {id: row.source})
MATCH (target {id: row.target})
CREATE (source)-[:CAUSES]->(target);
CREATE (source)-[:UPREGULATES]->(target);