diff --git a/README.md b/README.md index 56f3968..48d55f0 100644 --- a/README.md +++ b/README.md @@ -473,19 +473,20 @@ Follow these steps exactly in the order provided: python neo4j_etl.py ``` - **Eingabe:** Das Skript wird Sie nacheinander nach Ihrem **Datenbank-Usernamen** (Standard: `neo4j`) und Ihrem **Passwort** fragen. - **Verarbeitung:** Das Skript liest automatisch alle Abfragen aus dem Verzeichnis `neo4jqueries/analysis_queries` aus. - **Ausgabe:** Die Ergebnisse der Analyse-Queries werden direkt in der Konsole ausgegeben. + **Input:** The script will ask you for your **database-username** (default: `neo4j`) and your **password**. + **Processing:** The script automatically reads and executes cypher queries in the following directory `neo4jqueries/analysis_queries`. + **Output:** Results of the analysis will be displayed on the terminal. --- -### Projektstruktur +### Structure -| Verzeichnis / Datei | Funktion | -| :---------------------------------- | :--------------------------------------------------------- | -| `neo4j_etl.py` | Das Python-Skript zur Ausführung der Analyse-Queries. | -| `neo4jqueries/loadingQueriesNeo4j/` | Enthält alle Cypher-Dateien für den initialen Datenimport. | -| `neo4jqueries/analysis_queries/` | Enthält Cypher-Dateien für die statistische Auswertung. | +| Directory / file | Functionality | +| :---------------------------------- | :----------------------------------------------------- | +| `neo4j_etl.py` | Python-Script for executing analysis queries. | +| `neo4jqueries/loadingQueriesNeo4j/` | Contains all Cypher files for the initial data import. | +| `neo4jqueries/analysis_queries/` | Includes Cypher files for analysis. 
| +| | | --- diff --git a/dashboard.py b/dashboard.py index f2b7ce6..c4578c6 100644 --- a/dashboard.py +++ b/dashboard.py @@ -104,7 +104,7 @@ try: else: st.sidebar.warning("No drugs found") - # OVERVIEW PAGE + # overview page if page == "Overview": st.header("Dataset Overview") @@ -114,7 +114,7 @@ try: col3.metric("Repurposing Opportunities", f"{len(repurposing):,}") col4.metric("Analyzed Drugs", f"{len(super_drugs):,}") - # STATISTICS BOXES + # statistics boxes st.markdown("---") st.subheader("Key Statistics") @@ -163,7 +163,7 @@ try: xaxis_title="Gene", yaxis_title="Number of Diseases" ) - # Enable chart export + # enable chart export config = {'displayModeBar': True, 'displaylogo': False} st.plotly_chart(fig, use_container_width=True, config=config) @@ -185,7 +185,7 @@ try: csv = top_diseases.to_csv(index=False).encode('utf-8') st.download_button("Download Data", csv, "top_diseases.csv", "text/csv") - # HOTSPOT GENES PAGE + # hotspot genes page elif page == "Hotspot Genes": st.header("Hotspot Genes - Most Disease Associations") @@ -215,7 +215,7 @@ try: csv = filtered_genes.to_csv(index=False).encode('utf-8') st.download_button("Download Filtered Data", csv, "hotspot_genes.csv", "text/csv") - # DRUG REPURPOSING PAGE + # drug repurposing page elif page == "Drug Repurposing": st.header("Drug Repurposing Opportunities") @@ -250,7 +250,7 @@ try: csv = filtered.to_csv(index=False).encode('utf-8') st.download_button("Download", csv, f"repurposing_{selected}.csv", "text/csv") - # POLYPHARMACY RISK PAGE + # polypharmacy risk page elif page == "Polypharmacy Risk": st.header("Polypharmacy Risk Analysis") @@ -285,7 +285,7 @@ try: csv = filtered_risk.to_csv(index=False).encode('utf-8') st.download_button("Download Risk Data", csv, "polypharmacy_risk.csv", "text/csv") - # SYMPTOM TRIANGLE PAGE + # symptom triangle page elif page == "Symptom Triangle": st.header("Symptom-Disease-Drug Connections") @@ -314,7 +314,7 @@ try: csv = top_symptoms.to_csv(index=False).encode('utf-8') 
st.download_button("Download Symptom Data", csv, "symptom_triangle.csv", "text/csv") - # SUPER DRUGS PAGE + # super drugs page elif page == "Super Drugs": st.header("Super-Drug Score (Best Benefit/Risk Ratio)") @@ -352,7 +352,7 @@ try: csv = filtered_super.to_csv(index=False).encode('utf-8') st.download_button("Download Super Drugs", csv, "super_drugs.csv", "text/csv") - # DRUG CONFLICTS PAGE + # drug conflicts page elif page == "Drug Conflicts": st.header("Drug Conflicts - Overlapping Side Effects") @@ -394,28 +394,28 @@ try: else: st.warning("Drug conflicts data not available. Run the ETL script to generate this analysis.") - # NETWORK GRAPH PAGE + # network graph page elif page == "Network Graph": st.header("Disease-Gene-Drug Network") if network_nodes is not None and network_edges is not None: st.info("Interactive network visualization showing connections between diseases, genes, and drugs") - # Create networkx graph + # create networkx graph G = nx.Graph() - # Add nodes + # add nodes for _, row in network_nodes.iterrows(): G.add_node(row['id'], label=row['label'], type=row['type']) - # Add edges + # add edges for _, row in network_edges.iterrows(): G.add_edge(row['source'], row['target']) - # Create layout + # create layout pos = nx.spring_layout(G, k=0.5, iterations=50) - - # Create edge trace + + # create edge trace edge_x = [] edge_y = [] for edge in G.edges(): @@ -431,7 +431,7 @@ try: mode='lines' ) - # Create node traces (separate by type for legend) + # create node traces (separate by type for legend) node_traces = [] color_map = { 'Disease': '#ff4444', @@ -466,7 +466,7 @@ try: ) node_traces.append(node_trace) - # Create figure + # create figure fig = go.Figure(data=[edge_trace] + node_traces, layout=go.Layout( title='Disease-Gene-Drug Network', @@ -488,7 +488,7 @@ try: else: st.warning("Network data not available. 
Run the ETL script to generate this visualization.") - # COMPARE DRUGS PAGE + # compare drugs page elif page == "Compare Drugs": st.header("⚖️ Compare Drugs Side-by-Side") @@ -522,7 +522,7 @@ try: st.metric("Side Effects", int(drug2_data['num_side_effects'])) st.metric("Super Score", f"{drug2_data['super_score']:.2f}") - # Comparison chart + # comparison chart comparison_df = pd.DataFrame({ 'Metric': ['Diseases Treated', 'Side Effects', 'Super Score'], drug1: [drug1_data['num_diseases_treated'], drug1_data['num_side_effects'], drug1_data['super_score']], @@ -539,7 +539,7 @@ try: config = {'displayModeBar': True, 'displaylogo': False} st.plotly_chart(fig, use_container_width=True, config=config) - # Winner determination + # winner determination st.markdown("---") st.subheader("Recommendation") diff --git a/etl.py b/etl.py index b2acb4a..2345502 100644 --- a/etl.py +++ b/etl.py @@ -4,7 +4,7 @@ from pathlib import Path from collections import defaultdict -# KONFIGURATION +# config INPUT_JSON = "hetionet-v1.0.json" OUTPUT_DIR = Path("neo4j_csv") @@ -14,7 +14,7 @@ print("="*60) print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)") print("="*60) -# EXTRACT +# extract print("\nPHASE 1: EXTRACTION") print("-"*60) @@ -29,7 +29,7 @@ edges_raw = data["edges"] print(f"Nodes loaded: {len(nodes_raw):,}") print(f"Edges loaded: {len(edges_raw):,}") -# TRANSFORM – NODES +# transform – nodes print("\nPHASE 2: TRANSFORM NODES") print("-"*60) @@ -48,7 +48,7 @@ for node in nodes_raw: nodes_df = pd.DataFrame(nodes_flat) -# Spaltennamen Neo4j-sicher machen +# make column names neo4j safe nodes_df.columns = ( nodes_df.columns .str.replace(" ", "_") @@ -59,11 +59,11 @@ nodes_df.columns = ( print(f"Processed {len(nodes_df):,} nodes") print(f" Columns: {', '.join(nodes_df.columns[:5])}...") -# Create lookup dictionaries +# create lookup dictionaries node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind'])) node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name'])) -# Create sets 
for fast membership testing +# create sets for fast membership testing gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id']) disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id']) symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id']) @@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}") print(f" - Compounds: {len(compound_ids):,}") print(f" - Side Effects: {len(sideeffect_ids):,}") -# Export nodes by type +# export nodes by type print("\nExporting node files...") for kind in nodes_df["kind"].unique(): df_kind = ( @@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique(): df_kind.to_csv(filename, index=False) print(f" {filename.name} ({len(df_kind):,} rows)") -# TRANSFORM – EDGES +# transform edges print("\nPHASE 3: TRANSFORM EDGES") print("-"*60) @@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw): edges_df = pd.DataFrame(edges) -# Relationship-Typen Neo4j-sicher machen +# make relationship types neo4j safe edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_") # split edges into seperate files @@ -120,13 +120,13 @@ for edge_type in sorted(edge_types): edges_subset = edges_df[edges_df['type'] == edge_type] filename = OUTPUT_DIR / f"edges_{edge_type}.csv" - # Only export source and target (type is in filename) + # only export source and target (type is in filename) edges_subset[['source', 'target']].to_csv(filename, index=False) size_mb = filename.stat().st_size / (1024*1024) print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)") -# Also keep the combined file for backward compatibility +# also keep the combined file for backward compatibility edges_file = OUTPUT_DIR / "edges_all.csv" edges_df.to_csv(edges_file, index=False) print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)") @@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}") print(f" Split into {len(edge_types)} separate CSV files") print(f" Each file can be loaded independently!") -# Pre-filter 
edges by type for analysis +# pre-filter edges by type for analysis print("\nEdge type distribution:") edges_by_type = {} for edge_type in sorted(edge_types): @@ -145,14 +145,12 @@ for edge_type in sorted(edge_types): pct = 100 * count / len(edges_df) print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)") -# [ANALYSES - keeping all the existing analysis code...] -# (Keeping the same analysis code as before) print("\n" + "="*60) print("PHASE 4: ANALYSES") print("="*60) -# ANALYSIS 1: HOTSPOT GENES +# analysis 1: hotspot genes print("\nAnalysis 1: Hotspot Genes") print("-"*60) @@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False) print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)") -# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY +# analysis 2: disease symptom diversity print("\nAnalysis 2: Disease Symptom Diversity") print("-"*60) @@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False) print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)") -# Build indices for drug analyses +# build indices for drug analyses print("\nBuilding indices for drug analyses...") disease_to_genes = defaultdict(set) gene_to_diseases = defaultdict(set) @@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:") print(f" edges_associates.csv, edges_treats.csv, etc.") print(f" Instead of the combined edges_all.csv") -# ANALYSIS 3: DRUG REPURPOSING +# analysis 3: drug repurposing print("\nAnalysis 3: Drug Repurposing Opportunities") print("-"*60) @@ -273,7 +271,7 @@ if len(repurposing_df) > 0: repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False) print(f"Found {len(repurposing_df):,} repurposing opportunities") -# ANALYSIS 4: POLYPHARMACY RISK +# analysis 4: polypharmacy risk print("\nAnalysis 4: Polypharmacy Risk") print("-"*60) @@ -294,7 +292,7 @@ if 
len(drug_sideeffects) > 0: drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False) print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects") -# ANALYSIS 5: SYMPTOM TRIANGLE +# analysis 5: symptom triangle print("\nAnalysis 5: Symptom-Disease-Drug Triangle") print("-"*60) @@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0: symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False) print(f"Analyzed {len(symptom_triangle_df):,} symptoms") -# ANALYSIS 6: SUPER DRUGS +# analysis 6: super drugs print("\nAnalysis 6: Super-Drug Score") print("-"*60) @@ -352,7 +350,7 @@ if len(super_drugs_df) > 0: super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False) print(f"Analyzed {len(super_drugs_df):,} drugs") -# ANALYSIS 7: DRUG CONFLICTS +# analysis 7: drug conflicts print("\nAnalysis 7: Drug Conflicts") print("-"*60) @@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0: drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False) print(f"Found {len(drug_conflicts_df):,} drug conflict pairs") -# ANALYSIS 8: NETWORK DATA +# analysis 8: network data print("\nAnalysis 8: Network Visualization Data") print("-"*60) @@ -401,7 +399,7 @@ network_edges = [] node_id_counter = 0 id_mapping = {} -# Add disease nodes +# add disease nodes for disease_id in top_diseases: node_id = f"d_{node_id_counter}" id_mapping[disease_id] = node_id @@ -413,7 +411,7 @@ for disease_id in top_diseases: }) node_id_counter += 1 -# Add genes +# add genes disease_genes = gene_disease_edges[ gene_disease_edges['source'].isin(top_diseases) ].head(150) @@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows(): 'type': 'associates' }) -# Add drugs +# add drugs drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50) for _, row in drug_treatments.iterrows(): diff --git a/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Associates copy.cypher 
b/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Associates copy.cypher index 431f213..5a9dec8 100644 --- a/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Associates copy.cypher +++ b/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Associates copy.cypher @@ -1,4 +1,4 @@ -LOAD CSV WITH HEADERS FROM 'file:///edges_treats.csv' AS row +LOAD CSV WITH HEADERS FROM 'file:///edges_associates.csv' AS row MATCH (source {id: row.source}) MATCH (target {id: row.target}) -CREATE (source)-[:TREATS]->(target); \ No newline at end of file +CREATE (source)-[:ASSOCIATES]->(target); \ No newline at end of file diff --git a/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Upregulates.cypher b/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Upregulates.cypher index b4258d7..5b7f380 100644 --- a/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Upregulates.cypher +++ b/neo4jqueries/loadingQueriesNeo4j/LoadRelationshipsEdges_Upregulates.cypher @@ -1,4 +1,4 @@ -LOAD CSV WITH HEADERS FROM 'file:///edges_causes.csv' AS row +LOAD CSV WITH HEADERS FROM 'file:///edges_upregulates.csv' AS row MATCH (source {id: row.source}) MATCH (target {id: row.target}) -CREATE (source)-[:CAUSES]->(target); \ No newline at end of file +CREATE (source)-[:UPREGULATES]->(target); \ No newline at end of file