fix comments etc.

This commit is contained in:
Philipp Jacoby
2026-02-10 17:57:43 +01:00
parent 3003310be0
commit 8965b04a61
5 changed files with 59 additions and 60 deletions

View File

@@ -473,19 +473,20 @@ Follow these steps exactly in the order provided:
python neo4j_etl.py
```
**Eingabe:** Das Skript wird Sie nacheinander nach Ihrem **Datenbank-Usernamen** (Standard: `neo4j`) und Ihrem **Passwort** fragen.
**Verarbeitung:** Das Skript liest automatisch alle Abfragen aus dem Verzeichnis `neo4jqueries/analysis_queries` aus.
**Ausgabe:** Die Ergebnisse der Analyse-Queries werden direkt in der Konsole ausgegeben.
**Input:** The script will ask you for your **database username** (default: `neo4j`) and your **password**.
**Processing:** The script automatically reads and executes Cypher queries in the following directory `neo4jqueries/analysis_queries`.
**Output:** Results of the analysis will be displayed on the terminal.
---
### Projektstruktur
### Structure
| Verzeichnis / Datei | Funktion |
| :---------------------------------- | :--------------------------------------------------------- |
| `neo4j_etl.py` | Das Python-Skript zur Ausführung der Analyse-Queries. |
| `neo4jqueries/loadingQueriesNeo4j/` | Enthält alle Cypher-Dateien für den initialen Datenimport. |
| `neo4jqueries/analysis_queries/` | Enthält Cypher-Dateien für die statistische Auswertung. |
| Directory / file | Functionality |
| :---------------------------------- | :----------------------------------------------------- |
| `neo4j_etl.py` | Python script for executing analysis queries. |
| `neo4jqueries/loadingQueriesNeo4j/` | Contains all Cypher files for the initial data import. |
| `neo4jqueries/analysis_queries/` | Includes Cypher files for analysis. |
| | |
---

View File

@@ -104,7 +104,7 @@ try:
else:
st.sidebar.warning("No drugs found")
# OVERVIEW PAGE
# overview page
if page == "Overview":
st.header("Dataset Overview")
@@ -114,7 +114,7 @@ try:
col3.metric("Repurposing Opportunities", f"{len(repurposing):,}")
col4.metric("Analyzed Drugs", f"{len(super_drugs):,}")
# STATISTICS BOXES
# statistics boxes
st.markdown("---")
st.subheader("Key Statistics")
@@ -163,7 +163,7 @@ try:
xaxis_title="Gene",
yaxis_title="Number of Diseases"
)
# Enable chart export
# enable chart export
config = {'displayModeBar': True, 'displaylogo': False}
st.plotly_chart(fig, use_container_width=True, config=config)
@@ -185,7 +185,7 @@ try:
csv = top_diseases.to_csv(index=False).encode('utf-8')
st.download_button("Download Data", csv, "top_diseases.csv", "text/csv")
# HOTSPOT GENES PAGE
# hotspot genes page
elif page == "Hotspot Genes":
st.header("Hotspot Genes - Most Disease Associations")
@@ -215,7 +215,7 @@ try:
csv = filtered_genes.to_csv(index=False).encode('utf-8')
st.download_button("Download Filtered Data", csv, "hotspot_genes.csv", "text/csv")
# DRUG REPURPOSING PAGE
# drug repurposing page
elif page == "Drug Repurposing":
st.header("Drug Repurposing Opportunities")
@@ -250,7 +250,7 @@ try:
csv = filtered.to_csv(index=False).encode('utf-8')
st.download_button("Download", csv, f"repurposing_{selected}.csv", "text/csv")
# POLYPHARMACY RISK PAGE
# polypharmacy risk page
elif page == "Polypharmacy Risk":
st.header("Polypharmacy Risk Analysis")
@@ -285,7 +285,7 @@ try:
csv = filtered_risk.to_csv(index=False).encode('utf-8')
st.download_button("Download Risk Data", csv, "polypharmacy_risk.csv", "text/csv")
# SYMPTOM TRIANGLE PAGE
# symptom triangle page
elif page == "Symptom Triangle":
st.header("Symptom-Disease-Drug Connections")
@@ -314,7 +314,7 @@ try:
csv = top_symptoms.to_csv(index=False).encode('utf-8')
st.download_button("Download Symptom Data", csv, "symptom_triangle.csv", "text/csv")
# SUPER DRUGS PAGE
# super drugs page
elif page == "Super Drugs":
st.header("Super-Drug Score (Best Benefit/Risk Ratio)")
@@ -352,7 +352,7 @@ try:
csv = filtered_super.to_csv(index=False).encode('utf-8')
st.download_button("Download Super Drugs", csv, "super_drugs.csv", "text/csv")
# DRUG CONFLICTS PAGE
# drug conflicts page
elif page == "Drug Conflicts":
st.header("Drug Conflicts - Overlapping Side Effects")
@@ -394,28 +394,28 @@ try:
else:
st.warning("Drug conflicts data not available. Run the ETL script to generate this analysis.")
# NETWORK GRAPH PAGE
# network graph page
elif page == "Network Graph":
st.header("Disease-Gene-Drug Network")
if network_nodes is not None and network_edges is not None:
st.info("Interactive network visualization showing connections between diseases, genes, and drugs")
# Create networkx graph
# create networkx graph
G = nx.Graph()
# Add nodes
# add nodes
for _, row in network_nodes.iterrows():
G.add_node(row['id'], label=row['label'], type=row['type'])
# Add edges
# add edges
for _, row in network_edges.iterrows():
G.add_edge(row['source'], row['target'])
# Create layout
# create layout
pos = nx.spring_layout(G, k=0.5, iterations=50)
# Create edge trace
# create edge trace
edge_x = []
edge_y = []
for edge in G.edges():
@@ -431,7 +431,7 @@ try:
mode='lines'
)
# Create node traces (separate by type for legend)
# create node traces (separate by type for legend)
node_traces = []
color_map = {
'Disease': '#ff4444',
@@ -466,7 +466,7 @@ try:
)
node_traces.append(node_trace)
# Create figure
# create figure
fig = go.Figure(data=[edge_trace] + node_traces,
layout=go.Layout(
title='Disease-Gene-Drug Network',
@@ -488,7 +488,7 @@ try:
else:
st.warning("Network data not available. Run the ETL script to generate this visualization.")
# COMPARE DRUGS PAGE
# compare drugs page
elif page == "Compare Drugs":
st.header("⚖️ Compare Drugs Side-by-Side")
@@ -522,7 +522,7 @@ try:
st.metric("Side Effects", int(drug2_data['num_side_effects']))
st.metric("Super Score", f"{drug2_data['super_score']:.2f}")
# Comparison chart
# comparison chart
comparison_df = pd.DataFrame({
'Metric': ['Diseases Treated', 'Side Effects', 'Super Score'],
drug1: [drug1_data['num_diseases_treated'], drug1_data['num_side_effects'], drug1_data['super_score']],
@@ -539,7 +539,7 @@ try:
config = {'displayModeBar': True, 'displaylogo': False}
st.plotly_chart(fig, use_container_width=True, config=config)
# Winner determination
# winner determination
st.markdown("---")
st.subheader("Recommendation")

50
etl.py
View File

@@ -4,7 +4,7 @@ from pathlib import Path
from collections import defaultdict
# KONFIGURATION
# config
INPUT_JSON = "hetionet-v1.0.json"
OUTPUT_DIR = Path("neo4j_csv")
@@ -14,7 +14,7 @@ print("="*60)
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
print("="*60)
# EXTRACT
# extract
print("\nPHASE 1: EXTRACTION")
print("-"*60)
@@ -29,7 +29,7 @@ edges_raw = data["edges"]
print(f"Nodes loaded: {len(nodes_raw):,}")
print(f"Edges loaded: {len(edges_raw):,}")
# TRANSFORM NODES
# transform nodes
print("\nPHASE 2: TRANSFORM NODES")
print("-"*60)
@@ -48,7 +48,7 @@ for node in nodes_raw:
nodes_df = pd.DataFrame(nodes_flat)
# Spaltennamen Neo4j-sicher machen
# make column names neo4j safe
nodes_df.columns = (
nodes_df.columns
.str.replace(" ", "_")
@@ -59,11 +59,11 @@ nodes_df.columns = (
print(f"Processed {len(nodes_df):,} nodes")
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
# Create lookup dictionaries
# create lookup dictionaries
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
# Create sets for fast membership testing
# create sets for fast membership testing
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
@@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}")
print(f" - Compounds: {len(compound_ids):,}")
print(f" - Side Effects: {len(sideeffect_ids):,}")
# Export nodes by type
# export nodes by type
print("\nExporting node files...")
for kind in nodes_df["kind"].unique():
df_kind = (
@@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique():
df_kind.to_csv(filename, index=False)
print(f" {filename.name} ({len(df_kind):,} rows)")
# TRANSFORM EDGES
# transform edges
print("\nPHASE 3: TRANSFORM EDGES")
print("-"*60)
@@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw):
edges_df = pd.DataFrame(edges)
# Relationship-Typen Neo4j-sicher machen
# make relationship types neo4j safe
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
# split edges into separate files
@@ -120,13 +120,13 @@ for edge_type in sorted(edge_types):
edges_subset = edges_df[edges_df['type'] == edge_type]
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
# Only export source and target (type is in filename)
# only export source and target (type is in filename)
edges_subset[['source', 'target']].to_csv(filename, index=False)
size_mb = filename.stat().st_size / (1024*1024)
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
# Also keep the combined file for backward compatibility
# also keep the combined file for backward compatibility
edges_file = OUTPUT_DIR / "edges_all.csv"
edges_df.to_csv(edges_file, index=False)
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
@@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}")
print(f" Split into {len(edge_types)} separate CSV files")
print(f" Each file can be loaded independently!")
# Pre-filter edges by type for analysis
# pre-filter edges by type for analysis
print("\nEdge type distribution:")
edges_by_type = {}
for edge_type in sorted(edge_types):
@@ -145,14 +145,12 @@ for edge_type in sorted(edge_types):
pct = 100 * count / len(edges_df)
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
# [ANALYSES - keeping all the existing analysis code...]
# (Keeping the same analysis code as before)
print("\n" + "="*60)
print("PHASE 4: ANALYSES")
print("="*60)
# ANALYSIS 1: HOTSPOT GENES
# analysis 1: hotspot genes
print("\nAnalysis 1: Hotspot Genes")
print("-"*60)
@@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False)
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY
# analysis 2: disease symptom diversity
print("\nAnalysis 2: Disease Symptom Diversity")
print("-"*60)
@@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False)
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
# Build indices for drug analyses
# build indices for drug analyses
print("\nBuilding indices for drug analyses...")
disease_to_genes = defaultdict(set)
gene_to_diseases = defaultdict(set)
@@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:")
print(f" edges_associates.csv, edges_treats.csv, etc.")
print(f" Instead of the combined edges_all.csv")
# ANALYSIS 3: DRUG REPURPOSING
# analysis 3: drug repurposing
print("\nAnalysis 3: Drug Repurposing Opportunities")
print("-"*60)
@@ -273,7 +271,7 @@ if len(repurposing_df) > 0:
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
print(f"Found {len(repurposing_df):,} repurposing opportunities")
# ANALYSIS 4: POLYPHARMACY RISK
# analysis 4: polypharmacy risk
print("\nAnalysis 4: Polypharmacy Risk")
print("-"*60)
@@ -294,7 +292,7 @@ if len(drug_sideeffects) > 0:
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
# ANALYSIS 5: SYMPTOM TRIANGLE
# analysis 5: symptom triangle
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
print("-"*60)
@@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0:
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
# ANALYSIS 6: SUPER DRUGS
# analysis 6: super drugs
print("\nAnalysis 6: Super-Drug Score")
print("-"*60)
@@ -352,7 +350,7 @@ if len(super_drugs_df) > 0:
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
print(f"Analyzed {len(super_drugs_df):,} drugs")
# ANALYSIS 7: DRUG CONFLICTS
# analysis 7: drug conflicts
print("\nAnalysis 7: Drug Conflicts")
print("-"*60)
@@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0:
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
# ANALYSIS 8: NETWORK DATA
# analysis 8: network data
print("\nAnalysis 8: Network Visualization Data")
print("-"*60)
@@ -401,7 +399,7 @@ network_edges = []
node_id_counter = 0
id_mapping = {}
# Add disease nodes
# add disease nodes
for disease_id in top_diseases:
node_id = f"d_{node_id_counter}"
id_mapping[disease_id] = node_id
@@ -413,7 +411,7 @@ for disease_id in top_diseases:
})
node_id_counter += 1
# Add genes
# add genes
disease_genes = gene_disease_edges[
gene_disease_edges['source'].isin(top_diseases)
].head(150)
@@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows():
'type': 'associates'
})
# Add drugs
# add drugs
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
for _, row in drug_treatments.iterrows():

View File

@@ -1,4 +1,4 @@
LOAD CSV WITH HEADERS FROM 'file:///edges_treats.csv' AS row
LOAD CSV WITH HEADERS FROM 'file:///edges_associates.csv' AS row
MATCH (source {id: row.source})
MATCH (target {id: row.target})
CREATE (source)-[:TREATS]->(target);
CREATE (source)-[:ASSOCIATES]->(target);

View File

@@ -1,4 +1,4 @@
LOAD CSV WITH HEADERS FROM 'file:///edges_causes.csv' AS row
LOAD CSV WITH HEADERS FROM 'file:///edges_upregulates.csv' AS row
MATCH (source {id: row.source})
MATCH (target {id: row.target})
CREATE (source)-[:CAUSES]->(target);
CREATE (source)-[:UPREGULATES]->(target);