Project: BigData-radio (ANDRE Quentin)
Commit: 7c86926e, authored 2 years ago by Quentin ANDRE
Commit message: spark
Parent: 9eb6eb1a

Showing 1 changed file: spark_streaming.py (31 additions, 11 deletions)
@@ -3,7 +3,7 @@ from geopy.geocoders import Nominatim
 import sys
 from pyspark.sql import SparkSession
 from pyspark.sql.types import *
-from pyspark.sql.functions import split
+from pyspark.sql.functions import split, from_json, col
 
 geolocator = Nominatim(user_agent="quentin.andre@imt-atlantique.net")
@@ -17,27 +17,47 @@ def find_country(lat, long):
     return None
 
 spark = SparkSession.builder.appName("Spark Structured Streaming from Kafka").getOrCreate()
 
-planes = spark.readStream.format("kafka") \
+sdfPlanes = spark.readStream.format("kafka") \
     .option("kafka.bootstrap.servers", "localhost:9092") \
     .option("subscribe", "air-traffic") \
     .option("startingOffsets", "latest") \
-    .load().selectExpr("CAST(value AS JSON)")
+    .load().selectExpr("CAST(value AS STRING)")
 
-taxiFaresSchema = StructType([StructField("on_ground", BoolType()),
+planesSchema = StructType([StructField("on_ground", BooleanType()),
     StructField("icao24", LongType()),
     StructField("sensors", LongType()),
-    StructField("vertical_rate", TimestampType()),
+    StructField("vertical_rate", FloatType()),
     StructField("origin_country", StringType()),
     StructField("squawk", LongType()),
     StructField("geo_altitude", FloatType()),
     StructField("baro_altitude", FloatType()),
     StructField("velocity", FloatType()),
     StructField("latitude", FloatType()),
     StructField("spi", BooleanType()),
     StructField("position_source", IntegerType()),
     StructField("last_contact", LongType()),
     StructField("time_position", LongType()),
     StructField("heading", FloatType()),
     StructField("time", LongType()),
     StructField("longitude", FloatType()),
     StructField("callsign", StringType()),
 ])
 '''
 {"on_ground": false, "icao24": "407182", "sensors": null, "vertical_rate": 0, "origin_country": "United Kingdom", "squawk": "7755", "geo_altitude": 8214.36, "baro_altitude": 8229.6, "velocity": 176.26, "latitude": 54.1107, "spi": false, "position_source": 0, "last_contact": 1646922076, "time_position": 1646922076, "heading": 158.6, "time": 1646922077, "longitude": -2.8725, "callsign": "EXS1LY"}
 '''
 
 def parse_data_from_kafka_message(sdf, schema):
     assert sdf.isStreaming == True, "DataFrame doesn't receive streaming data"
-    col = split(sdf['value'], ',') #split attributes to nested array in one Column
+    '''
+    col = split(sdf['value'], ',') #split attributes to nested array in one Column
     #now expand col to multiple top-level columns
     for idx, field in enumerate(schema):
-        sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType))
-    return sdf.select([field.name for field in schema])
-
-sdfRides = parse_data_from_kafka_message(sdfRides, taxiRidesSchema)
\ No newline at end of file
+    sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType))
+    '''
+    return sdf.withColumn("value", from_json("value", schema)).select([field.name for field in schema])
+
+sdfPlanes = parse_data_from_kafka_message(sdfPlanes, planesSchema)
+sdfPlanes.show()
\ No newline at end of file
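
Editor's note on the new code: calling .show() on a streaming DataFrame raises an AnalysisException, because Structured Streaming queries must be started through writeStream, and from_json leaves the parsed fields nested inside the value struct, so selecting the field names directly is likely to fail to resolve. The sample message also carries icao24 and squawk as JSON strings while the schema declares them LongType, so those fields may not parse as intended. Below is a minimal sketch of how the raw Kafka stream could be flattened and inspected on the console instead; it assumes the sdfPlanes stream and planesSchema defined above, and the parsed and query names are illustrative, not part of the commit.

# Sketch only, not part of the commit: flatten the parsed struct and
# print micro-batches to the console instead of calling .show().
from pyspark.sql.functions import from_json, col

# assumes sdfPlanes is the raw stream read from Kafka (value cast to STRING)
parsed = sdfPlanes.select(from_json(col("value"), planesSchema).alias("data")) \
                  .select("data.*")   # expand the struct into top-level columns

query = parsed.writeStream \
    .format("console") \
    .outputMode("append") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()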