# Last login time for each employee, per calendar day
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
# Build a local SparkSession for this example job.
spark = SparkSession.builder.master('local')\
    .appName('scdType2')\
    .getOrCreate()

# Sample login events: (employee id, login timestamp as an ISO-8601 string).
data1 = [(1, '2021-02-22 00:00:00'),
         (1, '2021-02-22 09:00:00'),
         (1, '2021-02-22 11:00:00'),
         (2, '2021-02-23 11:00:00'),
         (2, '2021-02-24 23:00:00'),
         (2, '2021-02-24 23:15:00')]
columns1 = ['emp_id', 'date_time_col']
dataDF = spark.createDataFrame(data=data1, schema=columns1)
dataDF.show()

# Last login per employee per calendar day.
# Cast the string to a real timestamp first: to_date('...', 'yyyy-MM-dd') on a
# full 'yyyy-MM-dd HH:mm:ss' string fails under Spark 3's strict datetime
# parser, and max() on the raw string column compares lexicographically rather
# than temporally. to_date()/max() on the timestamp column are correct by type.
loginsDF = dataDF.withColumn('login_ts', to_timestamp('date_time_col'))
loginsDF.groupBy('emp_id', to_date('login_ts').alias('calendar_day'))\
    .agg(max('login_ts').alias('last_login_time_by_day')).show()
# Output: one row per (emp_id, calendar_day) with the latest login timestamp
# for that day in last_login_time_by_day.