# Demo: joining two DataFrames on a column *name* versus on a column
# *expression*. Both joins match the same rows; they differ in whether the
# join key column appears once or twice in the result.
# NOTE: `spark` is expected to be an existing SparkSession defined elsewhere.

# Shared column layout for both employee DataFrames.
columns1 = ["emp_id", "emp_name", "emp_city", "emp_salary"]

# Base employee records.
data1 = [
    (1, "John", "Sydney", 35000.00),
    (2, "Peter", "Melbourne", 45000.00),
    (3, "Sam", "Sydney", 55000.00),
]
emp_df = spark.createDataFrame(data1, schema=columns1)
emp_df.show(truncate=False)
emp_df.printSchema()

# Delta records: one emp_id (2) that also exists in data1, and one that
# does not (5).
data2 = [
    (2, "Peter", "Melbourne", 55000.00),
    (5, "Jessie", "Brisbane", 42000.00),
]
emp_delta_df = spark.createDataFrame(data2, schema=columns1)
emp_delta_df.show(truncate=False)
emp_delta_df.printSchema()

# Join key given as a column name: the result carries a single emp_id column.
print("### emp_id from emp_delta_df not shown")
emp_df.join(emp_delta_df, on="emp_id", how="inner").show()

# Join key given as an equality expression: emp_id is kept from both sides,
# so the result has two columns with that name.
print("### emp_id is repeated from both dataframes")
emp_df.join(
    emp_delta_df,
    on=emp_df["emp_id"] == emp_delta_df["emp_id"],
    how="inner",
).show()
### emp_id from emp_delta_df not shown
+------+--------+---------+----------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+--------+---------+----------+
|     2|   Peter|Melbourne|   45000.0|   Peter|Melbourne|   55000.0|
+------+--------+---------+----------+--------+---------+----------+
### emp_id is repeated from both dataframes
+------+--------+---------+----------+------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|emp_id|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+------+--------+---------+----------+
|     2|   Peter|Melbourne|   45000.0|     2|   Peter|Melbourne|   55000.0|
+------+--------+---------+----------+------+--------+---------+----------+
No comments:
Post a Comment