Commit 176731d6 authored by Helene Coullon

tp7 Neo4J
Dockerfile-jupyter:

FROM quay.io/jupyter/datascience-notebook:2024-11-19

USER root

ARG openjdk_version="17"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip

COPY requirements_docker.txt /home/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /home/requirements.txt && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

USER ${NB_UID}
# Graph data engineering with Neo4j, Spark, and GraphFrames
## Prerequisites
For this session, both setups (Windows and Mac) use the `docker-compose-jupyter.yml` file to deploy the dependencies. As a reminder, this file deploys a Jupyter server alongside the other modules instead of using the host system's.
Start by building the Docker images the setup needs:
```bash
docker compose -f ./docker-compose-jupyter.yml build
```
You can then deploy all the modules (and the Jupyter server) with the following command:
```bash
docker compose -f ./docker-compose-jupyter.yml up -d
```
Students on Windows can tell VS Code to use this Jupyter server to run the code, which is more convenient (the Jupyter container can talk directly to the Spark and Neo4j containers):
- in a notebook, click "Select Kernel" (top right).
- click "Existing Jupyter Server"
- click "Enter the URL of the running Jupyter server"
- enter the server address: "http://localhost:8888"
- keep the default name
- select the Python 3 kernel
Students on Mac still have to use the browser.
## Lab assignment
This assignment introduces graph-oriented NoSQL databases (here Neo4j), their query languages (here Cypher), and their integration with the big data ecosystem (Spark).
We will use the notebook [neo4j_data_engineering.ipynb](http://localhost:8888).
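As a quick orientation before opening the notebook, the sketch below shows the kind of interaction the lab builds on: sending a Cypher query from Python. It is a minimal sketch, assuming the official `neo4j` driver is installed (it is not pinned in the requirements files of this repository) and reusing the credentials and Bolt port from the compose file below (`neo4j/neo4jtp9`, port 7687):

```python
# Minimal sketch: running a Cypher query from Python.
# Assumes `pip install neo4j` (the driver is not pinned in this
# repo's requirements) and the credentials from the compose file.
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687",
                              auth=("neo4j", "neo4jtp9"))

with driver.session() as session:
    # Create a node if absent, then count all nodes in the graph.
    session.run("MERGE (:Person {name: $name})", name="Ada")
    record = session.run("MATCH (n) RETURN count(n) AS n").single()
    print("nodes in graph:", record["n"])

driver.close()
```

From inside the `notebook` container, the host would be `bolt://neo4j:7687` instead of `localhost`, since both containers share the compose network.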
docker-compose-jupyter.yml:

services:
  spark:
    image: docker.io/bitnami/spark:3.5
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    ports:
      - '8080:8080'
      - '7077:7077'
      - '18888:8888'
  spark-worker:
    image: docker.io/bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G # can be changed
      - SPARK_WORKER_CORES=1 # can be changed
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    ports:
      - '8081:8081'
    volumes:
      - ./graphframes_cps:/opt/bitnami/spark/work/graphframes_cps
  minio:
    image: minio/minio
    container_name: minio
    environment:
      MINIO_ROOT_USER: root
      MINIO_ROOT_PASSWORD: password
    command: server /data --console-address ":9001"
    ports:
      - "19000:9000"
      - "19001:9001"
  zookeeper1:
    image: bitnami/zookeeper:latest
    environment:
      ALLOW_ANONYMOUS_LOGIN: "yes"
  neo4j:
    image: neo4j:5.25.1
    #restart: always
    container_name: neo4j
    volumes:
      - neodata:/var/lib/neo4j/data
      - neoimport:/var/lib/neo4j/import
      - neoplugins:/var/lib/neo4j/plugins
    ports:
      - "0.0.0.0:7474:7474"
      - "0.0.0.0:7687:7687"
    environment:
      NEO4J_dbms_security_procedures_unrestricted: "example.*, apoc.*"
      NEO4J_dbms_security_procedures_allowlist: "example.*, apoc.*"
      NEO4J_dbms_memory_pagecache_size: "200M"
      NEO4J_dbms_memory_heap_initial__size: "512M"
      NEO4J_dbms_memory_heap_max__size: "512M"
      NEO4J_AUTH: neo4j/neo4jtp9
      NEO4J_dbms_security_auth__minimum__password__length: 7
      NEO4J_PLUGINS: '["apoc", "graph-data-science"]'
      NEO4J_apoc_export_file_enabled: true
      NEO4J_apoc_import_file_enabled: true
      NEO4J_apoc_import_file_use__neo4j__config: true
  notebook:
    image: jupyter-tp9
    build:
      context: .
      dockerfile: Dockerfile-jupyter
    container_name: jupyter
    volumes:
      - ./:/home/jovyan/work
    ports:
      - "8888:8888"
      - "4040:4040"
    environment:
      JUPYTER_ENABLE_LAB: yes
    command: start-notebook.py --NotebookApp.token=''
    extra_hosts:
      - "host.docker.internal:host-gateway"
volumes:
  neodata:
  neoimport:
  neoplugins:
networks:
  default:
    name: tp9
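Once this stack is up, the notebook container can reach the Spark master over the compose network. A minimal sketch with the pinned `pyspark`, assuming the code runs inside the `notebook` container (where the service name `spark` resolves on the `tp9` network):

```python
# Minimal sketch: attaching a PySpark session to the compose cluster.
# Assumes execution inside the `notebook` container, where the
# hostname `spark` resolves to the master service on port 7077.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("spark://spark:7077")
         .appName("tp-neo4j")
         .getOrCreate())

# A tiny DataFrame whose count() is executed on the worker.
df = spark.createDataFrame([("a", 1), ("b", 2)], ["id", "value"])
print(df.count())  # prints 2

spark.stop()
```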

A second compose file deploys the same stack without the Jupyter service and adds Kafka and its UI:

services:
  spark:
    image: docker.io/bitnami/spark:3.5
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    ports:
      - '8080:8080'
      - '7077:7077'
      - '18888:8888'
  spark-worker:
    image: docker.io/bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G # can be changed
      - SPARK_WORKER_CORES=1 # can be changed
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    ports:
      - '8081:8081'
    volumes:
      - ./graphframes_cps:/opt/bitnami/spark/work/graphframes_cps
  minio:
    image: minio/minio
    container_name: minio
    environment:
      MINIO_ROOT_USER: root
      MINIO_ROOT_PASSWORD: password
    command: server /data --console-address ":9001"
    ports:
      - "19000:9000"
      - "19001:9001"
  zookeeper1:
    image: bitnami/zookeeper:latest
    environment:
      ALLOW_ANONYMOUS_LOGIN: "yes"
  kafka1:
    image: bitnami/kafka:3.0.2
    depends_on:
      - zookeeper1
    environment: # https://rmoff.net/2018/08/02/kafka-listeners-explained/
      KAFKA_BROKER_ID: 1
      KAFKA_CFG_ZOOKEEPER_CONNECT: zookeeper1:2181
      #KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENERS: INTERNAL://:9092,PROXY://0.0.0.0:9093,OUTSIDE://0.0.0.0:9095,PROXY_PASSTHROUGH://0.0.0.0:9096
      KAFKA_ADVERTISED_LISTENERS: INTERNAL://:9092,PROXY://envoy1:9093,OUTSIDE://localhost:9095,PROXY_PASSTHROUGH://envoy1:9096
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL:PLAINTEXT,PROXY:PLAINTEXT,OUTSIDE:PLAINTEXT,PROXY_PASSTHROUGH:PLAINTEXT
      KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL
      ALLOW_PLAINTEXT_LISTENER: "yes"
      KAFKA_NUM_PARTITIONS: ${NUM_PARTITIONS:-2}
  kafka-ui:
    image: provectuslabs/kafka-ui
    container_name: kafka-ui
    ports:
      - "8082:8080"
    restart: always
    environment:
      - KAFKA_CLUSTERS_0_NAME=kafka1
      - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=kafka1:9092
      - KAFKA_CLUSTERS_0_ZOOKEEPER=zookeeper1:2181
  neo4j:
    image: neo4j:5.25.1
    #restart: always
    container_name: neo4j
    volumes:
      - neodata:/var/lib/neo4j/data
      - neoimport:/var/lib/neo4j/import
      - neoplugins:/var/lib/neo4j/plugins
    ports:
      - "0.0.0.0:7474:7474"
      - "0.0.0.0:7687:7687"
    environment:
      NEO4J_dbms_security_procedures_unrestricted: "example.*, apoc.*"
      NEO4J_dbms_security_procedures_allowlist: "example.*, apoc.*"
      NEO4J_dbms_memory_pagecache_size: "200M"
      NEO4J_dbms_memory_heap_initial__size: "512M"
      NEO4J_dbms_memory_heap_max__size: "512M"
      NEO4J_AUTH: neo4j/neo4jtp9
      NEO4J_dbms_security_auth__minimum__password__length: 7
      NEO4J_PLUGINS: '["apoc", "graph-data-science"]'
      NEO4J_apoc_export_file_enabled: true
      NEO4J_apoc_import_file_enabled: true
      NEO4J_apoc_import_file_use__neo4j__config: true
volumes:
  neodata:
  neoimport:
  neoplugins:
networks:
  default:
    name: tp9
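The advertised listeners above determine which address a client must use: from the host, the `OUTSIDE` listener advertised as `localhost:9095`. A minimal sketch with the pinned `kafka-python`, assuming port 9095 is actually published to the host (no `ports:` mapping appears for `kafka1` in the file shown) and using a hypothetical `test` topic:

```python
# Minimal sketch: producing and consuming through the OUTSIDE
# listener advertised as localhost:9095 in the compose file.
# Assumes port 9095 is published to the host; the topic name
# `test` is a hypothetical example.
from kafka import KafkaProducer, KafkaConsumer

producer = KafkaProducer(bootstrap_servers="localhost:9095")
producer.send("test", b"hello from the host")
producer.flush()

consumer = KafkaConsumer("test",
                         bootstrap_servers="localhost:9095",
                         auto_offset_reset="earliest",
                         consumer_timeout_ms=5000)
for message in consumer:
    print(message.value)
    break
```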
requirements.txt:

httpx<0.28.0
pandas==2.2.3
mysqlclient==2.2.4
jupysql==0.10.14
seaborn==0.13.2
pymongo==4.10.1
pyspark==3.5.3
hdfs==2.7.3
minio==7.2.10
docker==7.1.0
kafka-python==2.0.2; python_version < '3.12'
kafka-python @ git+https://github.com/dpkp/kafka-python.git ; python_version >= '3.12'
findspark
cy2py
ipycytoscape

requirements_docker.txt (copied into the Jupyter image by Dockerfile-jupyter; packages the base notebook image already provides are commented out):

httpx<0.28.0
#pandas==2.2.3
#mysqlclient==2.2.4
jupysql==0.10.14
#seaborn==0.13.2
pymongo==4.10.1
pyspark==3.5.3
hdfs==2.7.3
minio==7.2.10
docker==7.1.0
kafka-python==2.0.2; python_version < '3.12'
kafka-python @ git+https://github.com/dpkp/kafka-python.git ; python_version >= '3.12'
findspark
cy2py
ipycytoscape
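The pinned `minio` client matches the MinIO service declared in both compose files (S3 API published on host port 19000, console on 19001). A minimal sketch from the host, reusing the root credentials from the compose files; the bucket name `demo` is a hypothetical example:

```python
# Minimal sketch: talking to the compose MinIO service from the host.
# Uses the S3 API port published as 19000 and the root credentials
# from the compose file. The bucket name `demo` is hypothetical.
import io
from minio import Minio

client = Minio("localhost:19000",
               access_key="root",
               secret_key="password",
               secure=False)  # plain HTTP, as deployed here

if not client.bucket_exists("demo"):
    client.make_bucket("demo")

data = b"hello minio"
client.put_object("demo", "hello.txt", io.BytesIO(data), length=len(data))
print([obj.object_name for obj in client.list_objects("demo")])
```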