from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
# Local SparkSession for this small demo.
spark = SparkSession.builder.master('local') \
    .appName('scdType2') \
    .getOrCreate()

# Sample (id, num) rows: num=100 occurs three times in a row, num=200 twice.
data1 = [
    (1, 100),
    (2, 100),
    (3, 100),
    (4, 200),
    (5, 200),
]
columns1 = ['id', 'num']

dataDF = spark.createDataFrame(data=data1, schema=columns1)
dataDF.show()

# Window ordered by id. NOTE: no partitionBy, so Spark will warn that all
# rows move to a single partition — fine for a tiny demo like this one.
win_s = Window.orderBy('id')

# For each row, compute the difference between its num and the num of the
# previous row (lag_val) and the next row (lead_val); null at the edges.
with_lag = dataDF.withColumn('lag_val', lag('num').over(win_s) - col('num'))
df = with_lag.withColumn('lead_val', lead('num').over(win_s) - col('num'))

# Keep only rows whose num equals BOTH neighbours' num (i.e. the value
# appears at least three times consecutively by id) and show that num.
df.filter((df.lag_val == 0) & (df.lead_val == 0)).select('num').show()
# Output: the final show() prints a single row — num=100 (id=2 is the only
# row whose num matches both its previous and next neighbour).