Commit 80deb151, authored 1 month ago by Helene Coullon

tp6 base + Naolib

Parent: 0c97d4a1
Changes: 1 changed file

tp4-5-6/tp6/corrige_tp6.ipynb (+9 additions, −22 deletions)
```diff
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +69,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,14 +86,13 @@
    "file_path = \"s3a://tp6/160109-histoire.txt\"\n",
    "#lines = sql_context.read.text(file_path).collect()\n",
    "#print(lines)\n",
-    "lines = sql_context.read.text(file_path).rdd.map(lambda r: r[0]) # pourquoi le map ??? pourquoi le read et \n",
-    "print(lines.collect())\n",
-    "#pas textfile comme TP avant ?"
+    "lines = sql_context.read.text(file_path).rdd.map(lambda r: r[0])\n",
+    "print(lines.collect())"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -144,7 +143,7 @@
    "\n",
    "# Étape 3 : Transformer les données\n",
    "# Diviser les lignes en mots\n",
-    "words = lines.select(explode(split(col(\"value\"), \" \")).alias(\"word\")) # pas clair pour moi\n",
+    "words = lines.select(explode(split(col(\"value\"), \" \")).alias(\"word\"))\n",
    "\n",
    "# Compter les occurrences des mots\n",
    "word_counts = words.groupBy(\"word\").count()\n",
@@ -169,7 +168,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -237,7 +236,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -250,18 +249,6 @@
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
- "language_info": {
-  "codemirror_mode": {
-   "name": "ipython",
-   "version": 3
-  },
-  "file_extension": ".py",
-  "mimetype": "text/x-python",
-  "name": "python",
-  "nbconvert_exporter": "python",
-  "pygments_lexer": "ipython3",
-  "version": "3.12.9"
- }
 },
 "nbformat": 4,
```
%% Cell type:markdown id: tags:
# TP6 Solutions
%% Cell type:code id: tags:
```python
### Mac configuration: using a Jupyter notebook
from pyspark import SparkContext, SparkConf

conf = SparkConf() \
    .setAppName('SparkApp') \
    .setMaster('spark://spark:7077') \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .set("spark.sql.shuffle.partitions", "10")

sc = SparkContext.getOrCreate(conf=conf)

from pyspark.sql import SQLContext

# Create an SQLContext for SQL operations
sql_context = SQLContext(sc)

minio_ip_address = "minio"
```
%% Cell type:code id: tags:
```python
# Point the S3A connector at the MinIO server
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", f"http://{minio_ip_address}:9000")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", "root")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "password")
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "false")

from minio import Minio

client_minio = Minio(
    f"{minio_ip_address}:9000",
    access_key="root",
    secret_key="password",
    secure=False
)

# Create the tp6 bucket if it does not exist yet
if not client_minio.bucket_exists("tp6"):
    client_minio.make_bucket("tp6")
```
%% Cell type:markdown id: tags:
## Exercise 1
%% Cell type:code id: tags:
```python
# Upload the text file into the tp6 bucket
client_minio.fput_object("tp6", "160109-histoire.txt", "allData/160109-histoire.txt")
```
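To confirm that the upload actually landed in the bucket before moving on, a quick sanity check is to list the bucket contents with the same client. This is a minimal sketch using `list_objects` from the MinIO Python SDK; printing the object name and size is purely illustrative.

```python
# Sanity check: list what is currently stored in the tp6 bucket.
for obj in client_minio.list_objects("tp6", recursive=True):
    print(obj.object_name, obj.size)
```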
%% Cell type:code id: tags:
```python
# Load the data
file_path = "s3a://tp6/160109-histoire.txt"
#lines = sql_context.read.text(file_path).collect()
#print(lines)
lines = sql_context.read.text(file_path).rdd.map(lambda r: r[0])
print(lines.collect())
```
%% Cell type:code id: tags:
```python
# Transformation: split the lines into words
words = lines.flatMap(lambda line: line.split(" "))

# Aggregation: count the words
word_counts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Sort by decreasing frequency
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

# Display the results
print("Words and their counts:")
for word, count in sorted_word_counts.take(10):
    print(f"{word} : {count}")

# Keep only the words of length >= 6
filtered_word_counts = sorted_word_counts.filter(lambda x: len(x[0]) >= 6)

# Display the filtered results
print("Words of length >= 6 and their counts:")
for word, count in filtered_word_counts.take(10):
    print(f"{word} : {count}")
```
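If you also want to keep the Exercise 1 result in MinIO rather than only printing it, a minimal sketch is to write the RDD back through the S3A connector configured above. The output prefix `s3a://tp6/wordcount` is a hypothetical choice, and `saveAsTextFile` fails if that prefix already exists.

```python
# Hypothetical sketch: persist the sorted (word, count) pairs to MinIO as text files.
# Assumes the s3a://tp6/wordcount prefix does not exist yet.
sorted_word_counts.saveAsTextFile("s3a://tp6/wordcount")
```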
%% Cell type:markdown id: tags:
## Exercise 2: Stream
%% Cell type:code id: tags:
```python
from pyspark.sql.functions import explode, split, col

# Step 2: define the source (the directory containing the text files)
input_dir = "s3a://tp6/allData"  # watched directory
checkpoint_dir = "checkpoint"    # directory used for recovery

lines = sql_context.readStream \
    .format("text") \
    .load(input_dir)

# Step 3: transform the data
# Split the lines into words
words = lines.select(explode(split(col("value"), " ")).alias("word"))

# Count the word occurrences
word_counts = words.groupBy("word").count()

# Sort word counts in descending order
sorted_word_counts = word_counts.orderBy(col("count").desc())

# Step 4: define the sink
# Console output, 20 rows displayed per micro-batch
query = sorted_word_counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("numRows", 20) \
    .option("truncate", False) \
    .start()

# Step 5: start the query and wait for its termination
query.awaitTermination()

# https://stackoverflow.com/questions/61463554/structured-streaming-output-is-not-showing-on-jupyter-notebook
```
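As the Stack Overflow link above notes, the console sink prints into the Spark driver logs rather than into the notebook output, so it can help to poll the query from the notebook itself. A minimal sketch using the standard StreamingQuery attributes, assuming `awaitTermination()` has been interrupted or omitted so the kernel is free:

```python
# Inspect the running streaming query: current status and the metrics of the
# last completed micro-batch (lastProgress is None until a batch has run).
print(query.status)
print(query.lastProgress)
```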
%% Cell type:code id: tags:
```python
query.stop()
```
%% Cell type:code id: tags:
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, regexp_replace, length

# Define the paths
input_dir = "data"             # watched directory
checkpoint_dir = "checkpoint"  # checkpoint directory

# Define the words to exclude
excluded_words = {"quelques", "toujours", "ceci", "cela", "mais", "donc", "or", "ni", "car"}

input_dir = "s3a://tp6/allData"  # watched directory

lines = sql_context.readStream \
    .format("text") \
    .load(input_dir)

# Transformation: clean up the words and filter them
words = lines.select(
    explode(
        split(
            regexp_replace(col("value"), r"[^\w\s]", ""),  # remove punctuation
            " "
        )
    ).alias("word")
)

# Filter the words
filtered_words = words.filter(
    (length(col("word")) > 7) &             # keep words with more than 7 letters
    (~col("word").isin(*excluded_words)) &  # drop the words listed in excluded_words
    (col("word") != "")                     # drop empty words
)

# Count the occurrences of the filtered words
word_counts = filtered_words.groupBy("word").count()

# Sort the words by decreasing frequency
sorted_word_counts = word_counts.orderBy(col("count").desc())

# Write the result to the console, showing the 20 most frequent words
query = sorted_word_counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("numRows", 20) \
    .option("truncate", False) \
    .start()

# Wait for the query to terminate
query.awaitTermination()
```
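Both streaming queries only produce new micro-batches when new objects appear under the watched prefix `s3a://tp6/allData`. One way to trigger another batch is to upload a second text file with the MinIO client from the setup cells; the object name and local path below are hypothetical.

```python
# Hypothetical example: push another text file into the watched prefix so the
# running streaming query processes it in its next micro-batch.
client_minio.fput_object("tp6", "allData/extrait2.txt", "allData/extrait2.txt")
```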
%% Cell type:code id: tags:
```python
query.stop()
```