data1_cols = ["id","name"]
data1 = [(1, "Sugreeva"), (2, "Ravan"), (4, "Hanuman"), (4, "Hanuman"), (2, "Ravan")]
data1_df = spark.createDataFrame(data = data1, schema = data1_cols)
data1_df.show(truncate=False)
+---+--------+
| id| name|
+---+--------+
| 1|Sugreeva|
| 2| Ravan|
| 4| Hanuman|
| 4| Hanuman|
| 2| Ravan|
+---+--------+
data2_cols = ["id","name"]
data2 = [(2, "Ravan"), (2, "Ravan"), (1, "Sugreeva"), (6, "Ram"), (7, "Sita"), (7, "Sita")]
data2_df = spark.createDataFrame(data = data2, schema = data2_cols)
data2_df.show(truncate=False)
+---+--------+
| id| name|
+---+--------+
| 2| Ravan|
| 2| Ravan|
| 1|Sugreeva|
| 6| Ram|
| 7| Sita|
| 7| Sita|
+---+--------+
#inner (M*N if any duplicates)
data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="inner").show(truncate=False)
+---+--------+---+--------+
|id |name |id |name |
+---+--------+---+--------+
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
+---+--------+---+--------+
data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left").show(truncate=False)
+---+--------+----+--------+
|id |name |id |name |
+---+--------+----+--------+
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|4 |Hanuman |null|null |
|4 |Hanuman |null|null |
+---+--------+----+--------+
#right join (M*N if any duplicates)
data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="right").show(truncate=False)
+----+--------+---+--------+
|id |name |id |name |
+----+--------+---+--------+
|null|null |7 |Sita |
|null|null |7 |Sita |
|null|null |6 |Ram |
|1 |Sugreeva|1 |Sugreeva|
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
|2 |Ravan |2 |Ravan |
+----+--------+---+--------+
#left_anti
data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left_anti").show(truncate=False)
+---+-------+
|id |name |
+---+-------+
|4 |Hanuman|
|4 |Hanuman|
+---+-------+
data1_df.join(data2_df, on=[data1_df.id == data2_df.id], how="left_semi").show(truncate=False)
+---+--------+
|id |name |
+---+--------+
|1 |Sugreeva|
|2 |Ravan |
|2 |Ravan |
+---+--------+
#right_anti (not by default, reverse join dataframes & use left_anti)
data2_df.join(data1_df, on=[data1_df.id == data2_df.id], how="left_anti").show(truncate=False)
+---+----+
|id |name|
+---+----+
|7 |Sita|
|7 |Sita|
|6 |Ram |
+---+----+
No comments:
Post a Comment