Here are two ways to use PySpark to get employees from the emp Table whose salary is higher than their manager’s.
Table of contents
Approach#1
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Initialize SparkSession
spark = SparkSession.builder \
.appName("EmployeeSalaryComparison") \
.getOrCreate()
# Sample data
data = [
(7369, "SMITH", "CLERK", 7902, "1981-04-02", 2150, None, 20),
(7654, "MARTIN", "SALESMAN", 7698, "1981-09-28", 3100, 1000, 30),
(7698, "BLAKE", "MANAGER", 7839, "1981-05-01", 2850, None, 30),
(7902, "CAVIN", "MANAGER", 7300, "1981-06-02", 1200, None, 20),
]
columns = ["empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno"]
# Create DataFrame
emp_df = spark.createDataFrame(data, columns)
df1=emp_df.alias("df1")
df2=emp_df.alias("df2")
df2=df2.select(col("empno").alias("mempno"), \
col("ename").alias("mename"), \
col("job").alias("mjob"), \
col("mgr").alias("mmgr"), \
col("hiredate").alias("mhiredate"), \
col("sal").alias("msal"), \
col("comm").alias("mcomm"), \
col("deptno").alias("mdeptno"),
)
joined_df=df1.join(df2, df1["mgr"] == df2["mempno"], "left").select("empno", "sal","msal")
result=joined_df.filter(col("sal") > col("msal"))
result.show()
Output
+-----+----+----+
|empno| sal|msal|
+-----+----+----+
| 7654|3100|2850|
| 7369|2150|1200|
+-----+----+----+
Approach#2
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Initialize SparkSession
spark = SparkSession.builder \
.appName("EmployeeSalaryComparison") \
.getOrCreate()
# Sample data
data = [
(7369, "SMITH", "CLERK", 7902, "1981-04-02", 2150, None, 20),
(7654, "MARTIN", "SALESMAN", 7698, "1981-09-28", 3100, 1000, 30),
(7698, "BLAKE", "MANAGER", 7839, "1981-05-01", 2850, None, 30),
(7902, "CAVIN", "MANAGER", 7300, "1981-06-02", 1200, None, 20),
]
columns = ["empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno"]
# Create DataFrame
emp_df = spark.createDataFrame(data, columns)
df2=emp_df.select( *[col(c).alias("m" +c) for c in columns])
joined_df=emp_df.join(df2, emp_df.mgr == df2.mempno, "left").select("empno", "sal", "msal")
result=joined_df.filter(col("sal") > col("msal"))
result.show()
Output
+-----+----+----+
|empno| sal|msal|
+-----+----+----+
| 7654|3100|2850|
| 7369|2150|1200|
+-----+----+----+