Apr 5, 2023

PySpark join conditions — two ways to specify the join key (column name vs. column expression)

# Demonstrates the two ways of expressing a join condition in PySpark and
# how each affects the columns of the joined result.
# NOTE: `spark` is the SparkSession supplied by the pyspark shell / notebook.

# Column names shared by both example DataFrames.
columns1 = ["emp_id", "emp_name", "emp_city", "emp_salary"]

# Base employee data.
data1 = [
    (1, "John", "Sydney", 35000.00),
    (2, "Peter", "Melbourne", 45000.00),
    (3, "Sam", "Sydney", 55000.00),
]
emp_df = spark.createDataFrame(data=data1, schema=columns1)
emp_df.show(truncate=False)
emp_df.printSchema()

# Delta (changed/new) employee data; emp_id 2 overlaps with the base data.
data2 = [
    (2, "Peter", "Melbourne", 55000.00),
    (5, "Jessie", "Brisbane", 42000.00),
]
emp_delta_df = spark.createDataFrame(data=data2, schema=columns1)
emp_delta_df.show(truncate=False)
emp_delta_df.printSchema()

# Form 1 — join on a column NAME (string). Spark coalesces the join key,
# so emp_id appears only once in the output.
print('### emp_id from emp_delta_df not shown')
emp_df.join(emp_delta_df, "emp_id", "inner").show()

# Form 2 — join on a column EXPRESSION. Both sides keep their emp_id
# column, so the key appears twice in the output.
print('### emp_id is repeated from both dataframes')
emp_df.join(emp_delta_df, emp_df.emp_id == emp_delta_df.emp_id, "inner").show()
### emp_id from emp_delta_df not shown
+------+--------+---------+----------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+--------+---------+----------+
|     2|   Peter|Melbourne|   45000.0|   Peter|Melbourne|   55000.0|
+------+--------+---------+----------+--------+---------+----------+

### emp_id is repeated from both dataframes
+------+--------+---------+----------+------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|emp_id|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+------+--------+---------+----------+
|     2|   Peter|Melbourne|   45000.0|     2|   Peter|Melbourne|   55000.0|
+------+--------+---------+----------+------+--------+---------+----------+

No comments:

Post a Comment