Apr 15, 2023

Pyspark Joins

data1_cols = ["id","name"]

data1 = [(1, "Sugreeva"), (2, "Ravan"), (4, "Hanuman"), (4, "Hanuman"), (2, "Ravan")]

data1_df = spark.createDataFrame(data = data1, schema = data1_cols)

data1_df.show(truncate=False)

+---+--------+
| id| name|
+---+--------+
| 1|Sugreeva|
| 2| Ravan|
| 4| Hanuman|
| 4| Hanuman|
| 2| Ravan|
+---+--------+

data2_cols = ["id","name"]

data2 = [(2, "Ravan"), (2, "Ravan"), (1, "Sugreeva"), (6, "Ram"), (7, "Sita"), (7, "Sita")]

data2_df = spark.createDataFrame(data = data2, schema = data2_cols)

data2_df.show(truncate=False)

+---+--------+
| id| name|
+---+--------+
| 2| Ravan|
| 2| Ravan|
| 1|Sugreeva|
| 6| Ram|
| 7| Sita|
| 7| Sita|
+---+--------+

#inner (M*N if any duplicates)

data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="inner").show(truncate=False)

+---+--------+---+--------+
|id |name |id |name |
+---+--------+---+--------+
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
+---+--------+---+--------+
#left join (M*N if any duplicates)

data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left").show(truncate=False)

+---+--------+----+--------+
|id |name |id |name |
+---+--------+----+--------+
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|4 |Hanuman |null|null |
|4 |Hanuman |null|null |
+---+--------+----+--------+

#right join (M*N if any duplicates)

data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="right").show(truncate=False)

+----+--------+---+--------+
|id |name |id |name |
+----+--------+---+--------+
|null|null |7 |Sita |
|null|null |7 |Sita |
|null|null |6 |Ram |
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
+----+--------+---+--------+

#left_anti

data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left_anti").show(truncate=False)

+---+-------+
|id |name |
+---+-------+
|4 |Hanuman|
|4 |Hanuman|
+---+-------+

#left_semi

data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left_semi").show(truncate=False)

+---+--------+
|id |name |
+---+--------+
|1 |Sugreeva|
|2 |Ravan |
|2 |Ravan |
+---+--------+

#right_anti (not by default, reverse join dataframes & use left_anti)

data2_df.join(data1_df, on=[data1_df.id == data2_df.id], how="left_anti").show(truncate=False)

+---+----+
|id |name|
+---+----+
|7 |Sita|
|7 |Sita|
|6 |Ram |
+---+----+



No comments:

Post a Comment