Files
ETL_Datamanagement/neo4j_etl.py
Philipp Jacoby 3003310be0 finishes setup
2026-02-10 17:43:26 +01:00

82 lines
2.0 KiB
Python

from neo4j import GraphDatabase
import pandas as pd
import os
import glob
import getpass
# Neo4j connection settings.
# The endpoint is fixed; the credentials are requested interactively at
# start-up (the password via getpass, so it is not echoed to the terminal).
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = input("Neo4j username: ")
NEO4J_PASSWORD = getpass.getpass("Neo4j password: ")

# Single driver instance shared by every helper and by the script body below.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
# Helper Functions
def test_connection():
    """Check that the shared driver can execute a trivial query.

    Runs ``RETURN 1`` in a fresh session and prints a status line describing
    the outcome.

    Returns:
        bool: True when the query yields a (truthy) record; False when it does
        not, or when any exception is raised while connecting or querying.
    """
    try:
        with driver.session() as db_session:
            record = db_session.run("RETURN 1").single()
            connected = bool(record)
            # Report inside the session/try so that a failure while printing
            # or closing is handled by the same except clause as a failed query.
            print("✓ Connection successful" if connected else "✗ Error connecting")
            return connected
    except Exception as exc:
        print(f"✗ Error with the connection: {exc}")
        return False
def run_query(query, parameters=None):
    """Execute a Cypher query and collect its records into a DataFrame.

    Args:
        query: Cypher statement text passed to ``session.run``.
        parameters: Optional query parameters forwarded unchanged to
            ``session.run``.

    Returns:
        pandas.DataFrame: one row per returned record, built from each
        record's ``data()`` mapping; empty when the query returns no records.
    """
    with driver.session() as db_session:
        rows = []
        # Records must be consumed while the session is still open.
        for record in db_session.run(query, parameters):
            rows.append(record.data())
        return pd.DataFrame(rows)
# Check Neo4j connection; abort early (releasing the driver) when unreachable.
if not test_connection():
    print("Cannot connect to Neo4j")
    driver.close()
    # SystemExit instead of the `exit` helper: `exit` is injected by the `site`
    # module for interactive use and is not guaranteed to exist in every run mode.
    raise SystemExit(1)

# Folder for results
output_dir = "query_results"
os.makedirs(output_dir, exist_ok=True)

# Run all .cypher files in the 'analysis_queries/' folder, in sorted name order
cypher_files = sorted(glob.glob("analysis_queries/*.cypher"))
if not cypher_files:
    print("⚠ No .cypher files found in 'analysis_queries/'")

try:
    for file in cypher_files:
        print(f"\nRunning {file}")

        try:
            # Reading the file is inside the try so that one unreadable file
            # is reported and skipped instead of aborting the whole run.
            with open(file, "r", encoding="utf-8") as f:
                query = f.read()

            df = run_query(query)

            if df.empty:
                print("⚠ No results returned")
            else:
                print(df.head(5))  # show top 5 rows

            # One CSV per query file, named after the file without extension.
            # It is written even when the result is empty.
            safe_name = os.path.splitext(os.path.basename(file))[0]
            csv_path = os.path.join(output_dir, f"{safe_name}.csv")

            # utf-8-sig adds a BOM to the output file.
            df.to_csv(csv_path, index=False, encoding="utf-8-sig")
            print(f"✓ Saved to {csv_path}")

        except Exception as e:
            # Best-effort batch: report the failure and continue with the next file.
            print(f"✗ Error running query '{file}': {e}")
finally:
    # Always release the driver, including on Ctrl+C or an unexpected error.
    driver.close()

print("\nAll queries executed.")