#Pyspark - Find sum of login time for each employee per each day
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
# Local SparkSession for this example job.
spark = SparkSession.builder.master('local')\
    .appName('scdType2')\
    .getOrCreate()

# Sample login sessions: (emp_id, date, login minute 'in', logout minute 'out').
# Employee 1 has three sessions on the same day, so grouping must sum them.
data1 = [(1, '2021-02-22', 5, 20),
         (1, '2021-02-22', 50, 445),
         (1, '2021-02-22', 500, 575),
         (2, '2021-02-23', 15, 70),
         (3, '2021-02-24', 45, 95),
         (4, '2021-02-24', 100, 300)]
columns1 = ['emp_id', 'date', 'in', 'out']
dataDF = spark.createDataFrame(data=data1, schema=columns1)
dataDF.show()

# Total login time per employee per day.
# BUG FIX: the original chained .alias('diff') onto the DataFrame returned
# by .agg(...), which sets a DataFrame alias (used for joins) rather than
# naming the column — the output column stayed auto-named 'sum((out - in))'.
# Alias the aggregate expression itself, inside agg().
dataDF.groupBy('emp_id', 'date')\
    .agg(sum(col('out') - col('in')).alias('diff'))\
    .orderBy('emp_id', 'date')\
    .show()
# Expected output: one row per (emp_id, date) with column 'diff' holding the
# summed login duration, e.g. emp_id 1 on 2021-02-22 -> (20-5)+(445-50)+(575-500) = 485.