fix comments etc.
This commit is contained in:
19
README.md
19
README.md
@@ -473,19 +473,20 @@ Follow these steps exactly in the order provided:
|
|||||||
python neo4j_etl.py
|
python neo4j_etl.py
|
||||||
```
|
```
|
||||||
|
|
||||||
**Eingabe:** Das Skript wird Sie nacheinander nach Ihrem **Datenbank-Usernamen** (Standard: `neo4j`) und Ihrem **Passwort** fragen.
|
**Input:** The script will ask you for your **database-username** (default: `neo4j`) and your **password**.
|
||||||
**Verarbeitung:** Das Skript liest automatisch alle Abfragen aus dem Verzeichnis `neo4jqueries/analysis_queries` aus.
|
**Processing:** The script automatically reads and executes cypher queries in the following directory `neo4jqueries/analysis_queries`.
|
||||||
**Ausgabe:** Die Ergebnisse der Analyse-Queries werden direkt in der Konsole ausgegeben.
|
**Output:** Results of the analysis will be displayed on the terminal.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### Projektstruktur
|
### Structure
|
||||||
|
|
||||||
| Verzeichnis / Datei | Funktion |
|
| Directory / file | Functionality |
|
||||||
| :---------------------------------- | :--------------------------------------------------------- |
|
| :---------------------------------- | :----------------------------------------------------- |
|
||||||
| `neo4j_etl.py` | Das Python-Skript zur Ausführung der Analyse-Queries. |
|
| `neo4j_etl.py` | Python-Script for executing analysis queries. |
|
||||||
| `neo4jqueries/loadingQueriesNeo4j/` | Enthält alle Cypher-Dateien für den initialen Datenimport. |
|
| `neo4jqueries/loadingQueriesNeo4j/` | Contains all Cypher files for the initial data import. |
|
||||||
| `neo4jqueries/analysis_queries/` | Enthält Cypher-Dateien für die statistische Auswertung. |
|
| `neo4jqueries/analysis_queries/` | Includes Cypher files for analysis. |
|
||||||
|
| | |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
42
dashboard.py
42
dashboard.py
@@ -104,7 +104,7 @@ try:
|
|||||||
else:
|
else:
|
||||||
st.sidebar.warning("No drugs found")
|
st.sidebar.warning("No drugs found")
|
||||||
|
|
||||||
# OVERVIEW PAGE
|
# overview page
|
||||||
if page == "Overview":
|
if page == "Overview":
|
||||||
st.header("Dataset Overview")
|
st.header("Dataset Overview")
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ try:
|
|||||||
col3.metric("Repurposing Opportunities", f"{len(repurposing):,}")
|
col3.metric("Repurposing Opportunities", f"{len(repurposing):,}")
|
||||||
col4.metric("Analyzed Drugs", f"{len(super_drugs):,}")
|
col4.metric("Analyzed Drugs", f"{len(super_drugs):,}")
|
||||||
|
|
||||||
# STATISTICS BOXES
|
# statistics boxes
|
||||||
st.markdown("---")
|
st.markdown("---")
|
||||||
st.subheader("Key Statistics")
|
st.subheader("Key Statistics")
|
||||||
|
|
||||||
@@ -163,7 +163,7 @@ try:
|
|||||||
xaxis_title="Gene",
|
xaxis_title="Gene",
|
||||||
yaxis_title="Number of Diseases"
|
yaxis_title="Number of Diseases"
|
||||||
)
|
)
|
||||||
# Enable chart export
|
# enable chart export
|
||||||
config = {'displayModeBar': True, 'displaylogo': False}
|
config = {'displayModeBar': True, 'displaylogo': False}
|
||||||
st.plotly_chart(fig, use_container_width=True, config=config)
|
st.plotly_chart(fig, use_container_width=True, config=config)
|
||||||
|
|
||||||
@@ -185,7 +185,7 @@ try:
|
|||||||
csv = top_diseases.to_csv(index=False).encode('utf-8')
|
csv = top_diseases.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download Data", csv, "top_diseases.csv", "text/csv")
|
st.download_button("Download Data", csv, "top_diseases.csv", "text/csv")
|
||||||
|
|
||||||
# HOTSPOT GENES PAGE
|
# hotspot genes page
|
||||||
elif page == "Hotspot Genes":
|
elif page == "Hotspot Genes":
|
||||||
st.header("Hotspot Genes - Most Disease Associations")
|
st.header("Hotspot Genes - Most Disease Associations")
|
||||||
|
|
||||||
@@ -215,7 +215,7 @@ try:
|
|||||||
csv = filtered_genes.to_csv(index=False).encode('utf-8')
|
csv = filtered_genes.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download Filtered Data", csv, "hotspot_genes.csv", "text/csv")
|
st.download_button("Download Filtered Data", csv, "hotspot_genes.csv", "text/csv")
|
||||||
|
|
||||||
# DRUG REPURPOSING PAGE
|
# drug repurposing page
|
||||||
elif page == "Drug Repurposing":
|
elif page == "Drug Repurposing":
|
||||||
st.header("Drug Repurposing Opportunities")
|
st.header("Drug Repurposing Opportunities")
|
||||||
|
|
||||||
@@ -250,7 +250,7 @@ try:
|
|||||||
csv = filtered.to_csv(index=False).encode('utf-8')
|
csv = filtered.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download", csv, f"repurposing_{selected}.csv", "text/csv")
|
st.download_button("Download", csv, f"repurposing_{selected}.csv", "text/csv")
|
||||||
|
|
||||||
# POLYPHARMACY RISK PAGE
|
# polypharmacy risk page
|
||||||
elif page == "Polypharmacy Risk":
|
elif page == "Polypharmacy Risk":
|
||||||
st.header("Polypharmacy Risk Analysis")
|
st.header("Polypharmacy Risk Analysis")
|
||||||
|
|
||||||
@@ -285,7 +285,7 @@ try:
|
|||||||
csv = filtered_risk.to_csv(index=False).encode('utf-8')
|
csv = filtered_risk.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download Risk Data", csv, "polypharmacy_risk.csv", "text/csv")
|
st.download_button("Download Risk Data", csv, "polypharmacy_risk.csv", "text/csv")
|
||||||
|
|
||||||
# SYMPTOM TRIANGLE PAGE
|
# symptom triangle page
|
||||||
elif page == "Symptom Triangle":
|
elif page == "Symptom Triangle":
|
||||||
st.header("Symptom-Disease-Drug Connections")
|
st.header("Symptom-Disease-Drug Connections")
|
||||||
|
|
||||||
@@ -314,7 +314,7 @@ try:
|
|||||||
csv = top_symptoms.to_csv(index=False).encode('utf-8')
|
csv = top_symptoms.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download Symptom Data", csv, "symptom_triangle.csv", "text/csv")
|
st.download_button("Download Symptom Data", csv, "symptom_triangle.csv", "text/csv")
|
||||||
|
|
||||||
# SUPER DRUGS PAGE
|
# super drugs page
|
||||||
elif page == "Super Drugs":
|
elif page == "Super Drugs":
|
||||||
st.header("Super-Drug Score (Best Benefit/Risk Ratio)")
|
st.header("Super-Drug Score (Best Benefit/Risk Ratio)")
|
||||||
|
|
||||||
@@ -352,7 +352,7 @@ try:
|
|||||||
csv = filtered_super.to_csv(index=False).encode('utf-8')
|
csv = filtered_super.to_csv(index=False).encode('utf-8')
|
||||||
st.download_button("Download Super Drugs", csv, "super_drugs.csv", "text/csv")
|
st.download_button("Download Super Drugs", csv, "super_drugs.csv", "text/csv")
|
||||||
|
|
||||||
# DRUG CONFLICTS PAGE
|
# drug conflicts page
|
||||||
elif page == "Drug Conflicts":
|
elif page == "Drug Conflicts":
|
||||||
st.header("Drug Conflicts - Overlapping Side Effects")
|
st.header("Drug Conflicts - Overlapping Side Effects")
|
||||||
|
|
||||||
@@ -394,28 +394,28 @@ try:
|
|||||||
else:
|
else:
|
||||||
st.warning("Drug conflicts data not available. Run the ETL script to generate this analysis.")
|
st.warning("Drug conflicts data not available. Run the ETL script to generate this analysis.")
|
||||||
|
|
||||||
# NETWORK GRAPH PAGE
|
# network graph page
|
||||||
elif page == "Network Graph":
|
elif page == "Network Graph":
|
||||||
st.header("Disease-Gene-Drug Network")
|
st.header("Disease-Gene-Drug Network")
|
||||||
|
|
||||||
if network_nodes is not None and network_edges is not None:
|
if network_nodes is not None and network_edges is not None:
|
||||||
st.info("Interactive network visualization showing connections between diseases, genes, and drugs")
|
st.info("Interactive network visualization showing connections between diseases, genes, and drugs")
|
||||||
|
|
||||||
# Create networkx graph
|
# create networkx graph
|
||||||
G = nx.Graph()
|
G = nx.Graph()
|
||||||
|
|
||||||
# Add nodes
|
# add nodes
|
||||||
for _, row in network_nodes.iterrows():
|
for _, row in network_nodes.iterrows():
|
||||||
G.add_node(row['id'], label=row['label'], type=row['type'])
|
G.add_node(row['id'], label=row['label'], type=row['type'])
|
||||||
|
|
||||||
# Add edges
|
# add edges
|
||||||
for _, row in network_edges.iterrows():
|
for _, row in network_edges.iterrows():
|
||||||
G.add_edge(row['source'], row['target'])
|
G.add_edge(row['source'], row['target'])
|
||||||
|
|
||||||
# Create layout
|
# create layout
|
||||||
pos = nx.spring_layout(G, k=0.5, iterations=50)
|
pos = nx.spring_layout(G, k=0.5, iterations=50)
|
||||||
|
|
||||||
# Create edge trace
|
# create edge trace
|
||||||
edge_x = []
|
edge_x = []
|
||||||
edge_y = []
|
edge_y = []
|
||||||
for edge in G.edges():
|
for edge in G.edges():
|
||||||
@@ -431,7 +431,7 @@ try:
|
|||||||
mode='lines'
|
mode='lines'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create node traces (separate by type for legend)
|
# create node traces (separate by type for legend)
|
||||||
node_traces = []
|
node_traces = []
|
||||||
color_map = {
|
color_map = {
|
||||||
'Disease': '#ff4444',
|
'Disease': '#ff4444',
|
||||||
@@ -466,7 +466,7 @@ try:
|
|||||||
)
|
)
|
||||||
node_traces.append(node_trace)
|
node_traces.append(node_trace)
|
||||||
|
|
||||||
# Create figure
|
# create figure
|
||||||
fig = go.Figure(data=[edge_trace] + node_traces,
|
fig = go.Figure(data=[edge_trace] + node_traces,
|
||||||
layout=go.Layout(
|
layout=go.Layout(
|
||||||
title='Disease-Gene-Drug Network',
|
title='Disease-Gene-Drug Network',
|
||||||
@@ -488,7 +488,7 @@ try:
|
|||||||
else:
|
else:
|
||||||
st.warning("Network data not available. Run the ETL script to generate this visualization.")
|
st.warning("Network data not available. Run the ETL script to generate this visualization.")
|
||||||
|
|
||||||
# COMPARE DRUGS PAGE
|
# compare drugs page
|
||||||
elif page == "Compare Drugs":
|
elif page == "Compare Drugs":
|
||||||
st.header("⚖️ Compare Drugs Side-by-Side")
|
st.header("⚖️ Compare Drugs Side-by-Side")
|
||||||
|
|
||||||
@@ -522,7 +522,7 @@ try:
|
|||||||
st.metric("Side Effects", int(drug2_data['num_side_effects']))
|
st.metric("Side Effects", int(drug2_data['num_side_effects']))
|
||||||
st.metric("Super Score", f"{drug2_data['super_score']:.2f}")
|
st.metric("Super Score", f"{drug2_data['super_score']:.2f}")
|
||||||
|
|
||||||
# Comparison chart
|
# comparison chart
|
||||||
comparison_df = pd.DataFrame({
|
comparison_df = pd.DataFrame({
|
||||||
'Metric': ['Diseases Treated', 'Side Effects', 'Super Score'],
|
'Metric': ['Diseases Treated', 'Side Effects', 'Super Score'],
|
||||||
drug1: [drug1_data['num_diseases_treated'], drug1_data['num_side_effects'], drug1_data['super_score']],
|
drug1: [drug1_data['num_diseases_treated'], drug1_data['num_side_effects'], drug1_data['super_score']],
|
||||||
@@ -539,7 +539,7 @@ try:
|
|||||||
config = {'displayModeBar': True, 'displaylogo': False}
|
config = {'displayModeBar': True, 'displaylogo': False}
|
||||||
st.plotly_chart(fig, use_container_width=True, config=config)
|
st.plotly_chart(fig, use_container_width=True, config=config)
|
||||||
|
|
||||||
# Winner determination
|
# winner determination
|
||||||
st.markdown("---")
|
st.markdown("---")
|
||||||
st.subheader("Recommendation")
|
st.subheader("Recommendation")
|
||||||
|
|
||||||
|
|||||||
50
etl.py
50
etl.py
@@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
# KONFIGURATION
|
# config
|
||||||
|
|
||||||
INPUT_JSON = "hetionet-v1.0.json"
|
INPUT_JSON = "hetionet-v1.0.json"
|
||||||
OUTPUT_DIR = Path("neo4j_csv")
|
OUTPUT_DIR = Path("neo4j_csv")
|
||||||
@@ -14,7 +14,7 @@ print("="*60)
|
|||||||
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
|
print("HETIONET ETL PIPELINE (OPTIMIZED + SPLIT EDGES)")
|
||||||
print("="*60)
|
print("="*60)
|
||||||
|
|
||||||
# EXTRACT
|
# extract
|
||||||
|
|
||||||
print("\nPHASE 1: EXTRACTION")
|
print("\nPHASE 1: EXTRACTION")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
@@ -29,7 +29,7 @@ edges_raw = data["edges"]
|
|||||||
print(f"Nodes loaded: {len(nodes_raw):,}")
|
print(f"Nodes loaded: {len(nodes_raw):,}")
|
||||||
print(f"Edges loaded: {len(edges_raw):,}")
|
print(f"Edges loaded: {len(edges_raw):,}")
|
||||||
|
|
||||||
# TRANSFORM – NODES
|
# transform – nodes
|
||||||
|
|
||||||
print("\nPHASE 2: TRANSFORM NODES")
|
print("\nPHASE 2: TRANSFORM NODES")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
@@ -48,7 +48,7 @@ for node in nodes_raw:
|
|||||||
|
|
||||||
nodes_df = pd.DataFrame(nodes_flat)
|
nodes_df = pd.DataFrame(nodes_flat)
|
||||||
|
|
||||||
# Spaltennamen Neo4j-sicher machen
|
# make column names neo4j safe
|
||||||
nodes_df.columns = (
|
nodes_df.columns = (
|
||||||
nodes_df.columns
|
nodes_df.columns
|
||||||
.str.replace(" ", "_")
|
.str.replace(" ", "_")
|
||||||
@@ -59,11 +59,11 @@ nodes_df.columns = (
|
|||||||
print(f"Processed {len(nodes_df):,} nodes")
|
print(f"Processed {len(nodes_df):,} nodes")
|
||||||
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
|
print(f" Columns: {', '.join(nodes_df.columns[:5])}...")
|
||||||
|
|
||||||
# Create lookup dictionaries
|
# create lookup dictionaries
|
||||||
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
|
node_id_to_kind = dict(zip(nodes_df['id'], nodes_df['kind']))
|
||||||
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
|
node_id_to_name = dict(zip(nodes_df['id'], nodes_df['name']))
|
||||||
|
|
||||||
# Create sets for fast membership testing
|
# create sets for fast membership testing
|
||||||
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
|
gene_ids = set(nodes_df[nodes_df['kind'] == 'Gene']['id'])
|
||||||
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
|
disease_ids = set(nodes_df[nodes_df['kind'] == 'Disease']['id'])
|
||||||
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
|
symptom_ids = set(nodes_df[nodes_df['kind'] == 'Symptom']['id'])
|
||||||
@@ -77,7 +77,7 @@ print(f" - Symptoms: {len(symptom_ids):,}")
|
|||||||
print(f" - Compounds: {len(compound_ids):,}")
|
print(f" - Compounds: {len(compound_ids):,}")
|
||||||
print(f" - Side Effects: {len(sideeffect_ids):,}")
|
print(f" - Side Effects: {len(sideeffect_ids):,}")
|
||||||
|
|
||||||
# Export nodes by type
|
# export nodes by type
|
||||||
print("\nExporting node files...")
|
print("\nExporting node files...")
|
||||||
for kind in nodes_df["kind"].unique():
|
for kind in nodes_df["kind"].unique():
|
||||||
df_kind = (
|
df_kind = (
|
||||||
@@ -89,7 +89,7 @@ for kind in nodes_df["kind"].unique():
|
|||||||
df_kind.to_csv(filename, index=False)
|
df_kind.to_csv(filename, index=False)
|
||||||
print(f" {filename.name} ({len(df_kind):,} rows)")
|
print(f" {filename.name} ({len(df_kind):,} rows)")
|
||||||
|
|
||||||
# TRANSFORM – EDGES
|
# transform edges
|
||||||
|
|
||||||
print("\nPHASE 3: TRANSFORM EDGES")
|
print("\nPHASE 3: TRANSFORM EDGES")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
@@ -107,7 +107,7 @@ for i, edge in enumerate(edges_raw):
|
|||||||
|
|
||||||
edges_df = pd.DataFrame(edges)
|
edges_df = pd.DataFrame(edges)
|
||||||
|
|
||||||
# Relationship-Typen Neo4j-sicher machen
|
# make relationship types neo4j safe
|
||||||
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
|
edges_df["type"] = edges_df["type"].str.replace(" ", "_").str.replace("-", "_")
|
||||||
|
|
||||||
# split edges into separate files
|
# split edges into separate files
|
||||||
@@ -120,13 +120,13 @@ for edge_type in sorted(edge_types):
|
|||||||
edges_subset = edges_df[edges_df['type'] == edge_type]
|
edges_subset = edges_df[edges_df['type'] == edge_type]
|
||||||
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
|
filename = OUTPUT_DIR / f"edges_{edge_type}.csv"
|
||||||
|
|
||||||
# Only export source and target (type is in filename)
|
# only export source and target (type is in filename)
|
||||||
edges_subset[['source', 'target']].to_csv(filename, index=False)
|
edges_subset[['source', 'target']].to_csv(filename, index=False)
|
||||||
|
|
||||||
size_mb = filename.stat().st_size / (1024*1024)
|
size_mb = filename.stat().st_size / (1024*1024)
|
||||||
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
|
print(f" ✓ edges_{edge_type:20s}.csv ({len(edges_subset):>10,} rows, {size_mb:>6.2f} MB)")
|
||||||
|
|
||||||
# Also keep the combined file for backward compatibility
|
# also keep the combined file for backward compatibility
|
||||||
edges_file = OUTPUT_DIR / "edges_all.csv"
|
edges_file = OUTPUT_DIR / "edges_all.csv"
|
||||||
edges_df.to_csv(edges_file, index=False)
|
edges_df.to_csv(edges_file, index=False)
|
||||||
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
|
print(f"\n ✓ edges_all.csv (combined) ({len(edges_df):,} rows)")
|
||||||
@@ -136,7 +136,7 @@ print(f" Total edges: {len(edges_df):,}")
|
|||||||
print(f" Split into {len(edge_types)} separate CSV files")
|
print(f" Split into {len(edge_types)} separate CSV files")
|
||||||
print(f" Each file can be loaded independently!")
|
print(f" Each file can be loaded independently!")
|
||||||
|
|
||||||
# Pre-filter edges by type for analysis
|
# pre-filter edges by type for analysis
|
||||||
print("\nEdge type distribution:")
|
print("\nEdge type distribution:")
|
||||||
edges_by_type = {}
|
edges_by_type = {}
|
||||||
for edge_type in sorted(edge_types):
|
for edge_type in sorted(edge_types):
|
||||||
@@ -145,14 +145,12 @@ for edge_type in sorted(edge_types):
|
|||||||
pct = 100 * count / len(edges_df)
|
pct = 100 * count / len(edges_df)
|
||||||
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
|
print(f" - {edge_type:20s}: {count:>10,} ({pct:>5.1f}%)")
|
||||||
|
|
||||||
# [ANALYSES - keeping all the existing analysis code...]
|
|
||||||
# (Keeping the same analysis code as before)
|
|
||||||
|
|
||||||
print("\n" + "="*60)
|
print("\n" + "="*60)
|
||||||
print("PHASE 4: ANALYSES")
|
print("PHASE 4: ANALYSES")
|
||||||
print("="*60)
|
print("="*60)
|
||||||
|
|
||||||
# ANALYSIS 1: HOTSPOT GENES
|
# analysis 1: hotspot genes
|
||||||
print("\nAnalysis 1: Hotspot Genes")
|
print("\nAnalysis 1: Hotspot Genes")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -183,7 +181,7 @@ genes_df_sorted.to_csv(OUTPUT_DIR / "nodes_Gene.csv", index=False)
|
|||||||
|
|
||||||
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
|
print(f"Top gene: {genes_df_sorted.iloc[0]['name']} ({int(genes_df_sorted.iloc[0]['num_diseases'])} diseases)")
|
||||||
|
|
||||||
# ANALYSIS 2: DISEASE SYMPTOM DIVERSITY
|
# analysis 2: disease symptom diversity
|
||||||
print("\nAnalysis 2: Disease Symptom Diversity")
|
print("\nAnalysis 2: Disease Symptom Diversity")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -208,7 +206,7 @@ disease_df_sorted.to_csv(OUTPUT_DIR / "nodes_Disease.csv", index=False)
|
|||||||
|
|
||||||
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
|
print(f"Top disease: {disease_df_sorted.iloc[0]['name']} ({int(disease_df_sorted.iloc[0]['num_symptoms'])} symptoms)")
|
||||||
|
|
||||||
# Build indices for drug analyses
|
# build indices for drug analyses
|
||||||
print("\nBuilding indices for drug analyses...")
|
print("\nBuilding indices for drug analyses...")
|
||||||
disease_to_genes = defaultdict(set)
|
disease_to_genes = defaultdict(set)
|
||||||
gene_to_diseases = defaultdict(set)
|
gene_to_diseases = defaultdict(set)
|
||||||
@@ -238,7 +236,7 @@ print(f"\n💡 For faster Neo4j loading, use the split edge files:")
|
|||||||
print(f" edges_associates.csv, edges_treats.csv, etc.")
|
print(f" edges_associates.csv, edges_treats.csv, etc.")
|
||||||
print(f" Instead of the combined edges_all.csv")
|
print(f" Instead of the combined edges_all.csv")
|
||||||
|
|
||||||
# ANALYSIS 3: DRUG REPURPOSING
|
# analysis 3: drug repurposing
|
||||||
print("\nAnalysis 3: Drug Repurposing Opportunities")
|
print("\nAnalysis 3: Drug Repurposing Opportunities")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -273,7 +271,7 @@ if len(repurposing_df) > 0:
|
|||||||
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
|
repurposing_df.to_csv(OUTPUT_DIR / "analysis_drug_repurposing.csv", index=False)
|
||||||
print(f"Found {len(repurposing_df):,} repurposing opportunities")
|
print(f"Found {len(repurposing_df):,} repurposing opportunities")
|
||||||
|
|
||||||
# ANALYSIS 4: POLYPHARMACY RISK
|
# analysis 4: polypharmacy risk
|
||||||
print("\nAnalysis 4: Polypharmacy Risk")
|
print("\nAnalysis 4: Polypharmacy Risk")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -294,7 +292,7 @@ if len(drug_sideeffects) > 0:
|
|||||||
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
|
drug_risk_sorted.to_csv(OUTPUT_DIR / "analysis_polypharmacy_risk.csv", index=False)
|
||||||
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
|
print(f"Analyzed {len(drug_risk_sorted):,} drugs for side effects")
|
||||||
|
|
||||||
# ANALYSIS 5: SYMPTOM TRIANGLE
|
# analysis 5: symptom triangle
|
||||||
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
|
print("\nAnalysis 5: Symptom-Disease-Drug Triangle")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -326,7 +324,7 @@ if len(symptom_triangle_df) > 0:
|
|||||||
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
|
symptom_triangle_df.to_csv(OUTPUT_DIR / "analysis_symptom_triangle.csv", index=False)
|
||||||
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
|
print(f"Analyzed {len(symptom_triangle_df):,} symptoms")
|
||||||
|
|
||||||
# ANALYSIS 6: SUPER DRUGS
|
# analysis 6: super drugs
|
||||||
print("\nAnalysis 6: Super-Drug Score")
|
print("\nAnalysis 6: Super-Drug Score")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -352,7 +350,7 @@ if len(super_drugs_df) > 0:
|
|||||||
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
|
super_drugs_df.to_csv(OUTPUT_DIR / "analysis_super_drugs.csv", index=False)
|
||||||
print(f"Analyzed {len(super_drugs_df):,} drugs")
|
print(f"Analyzed {len(super_drugs_df):,} drugs")
|
||||||
|
|
||||||
# ANALYSIS 7: DRUG CONFLICTS
|
# analysis 7: drug conflicts
|
||||||
print("\nAnalysis 7: Drug Conflicts")
|
print("\nAnalysis 7: Drug Conflicts")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -390,7 +388,7 @@ if len(drug_conflicts_df) > 0:
|
|||||||
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
|
drug_conflicts_df.to_csv(OUTPUT_DIR / "analysis_drug_conflicts.csv", index=False)
|
||||||
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
|
print(f"Found {len(drug_conflicts_df):,} drug conflict pairs")
|
||||||
|
|
||||||
# ANALYSIS 8: NETWORK DATA
|
# analysis 8: network data
|
||||||
print("\nAnalysis 8: Network Visualization Data")
|
print("\nAnalysis 8: Network Visualization Data")
|
||||||
print("-"*60)
|
print("-"*60)
|
||||||
|
|
||||||
@@ -401,7 +399,7 @@ network_edges = []
|
|||||||
node_id_counter = 0
|
node_id_counter = 0
|
||||||
id_mapping = {}
|
id_mapping = {}
|
||||||
|
|
||||||
# Add disease nodes
|
# add disease nodes
|
||||||
for disease_id in top_diseases:
|
for disease_id in top_diseases:
|
||||||
node_id = f"d_{node_id_counter}"
|
node_id = f"d_{node_id_counter}"
|
||||||
id_mapping[disease_id] = node_id
|
id_mapping[disease_id] = node_id
|
||||||
@@ -413,7 +411,7 @@ for disease_id in top_diseases:
|
|||||||
})
|
})
|
||||||
node_id_counter += 1
|
node_id_counter += 1
|
||||||
|
|
||||||
# Add genes
|
# add genes
|
||||||
disease_genes = gene_disease_edges[
|
disease_genes = gene_disease_edges[
|
||||||
gene_disease_edges['source'].isin(top_diseases)
|
gene_disease_edges['source'].isin(top_diseases)
|
||||||
].head(150)
|
].head(150)
|
||||||
@@ -437,7 +435,7 @@ for _, row in disease_genes.iterrows():
|
|||||||
'type': 'associates'
|
'type': 'associates'
|
||||||
})
|
})
|
||||||
|
|
||||||
# Add drugs
|
# add drugs
|
||||||
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
|
drug_treatments = treats_edges[treats_edges['target'].isin(top_diseases)].head(50)
|
||||||
|
|
||||||
for _, row in drug_treatments.iterrows():
|
for _, row in drug_treatments.iterrows():
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
LOAD CSV WITH HEADERS FROM 'file:///edges_treats.csv' AS row
|
LOAD CSV WITH HEADERS FROM 'file:///edges_associates.csv' AS row
|
||||||
MATCH (source {id: row.source})
|
MATCH (source {id: row.source})
|
||||||
MATCH (target {id: row.target})
|
MATCH (target {id: row.target})
|
||||||
CREATE (source)-[:TREATS]->(target);
|
CREATE (source)-[:ASSOCIATES]->(target);
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
LOAD CSV WITH HEADERS FROM 'file:///edges_causes.csv' AS row
|
LOAD CSV WITH HEADERS FROM 'file:///edges_upregulates.csv' AS row
|
||||||
MATCH (source {id: row.source})
|
MATCH (source {id: row.source})
|
||||||
MATCH (target {id: row.target})
|
MATCH (target {id: row.target})
|
||||||
CREATE (source)-[:CAUSES]->(target);
|
CREATE (source)-[:UPREGULATES]->(target);
|
||||||
Reference in New Issue
Block a user