How to Write PySpark Code: Employees Whose Salary Exceeds Their Manager's

Here are two ways to use PySpark to find the employees in the emp table whose salary is higher than their manager's.

Table of contents

  1. Approach#1
  2. Approach#2

Approach#1

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) a SparkSession for this example.
spark = (
    SparkSession.builder
    .appName("EmployeeSalaryComparison")
    .getOrCreate()
)

# Sample rows: (empno, ename, job, mgr, hiredate, sal, comm, deptno)
data = [
    (7369, "SMITH", "CLERK", 7902, "1981-04-02", 2150, None, 20),
    (7654, "MARTIN", "SALESMAN", 7698, "1981-09-28", 3100, 1000, 30),
    (7698, "BLAKE", "MANAGER", 7839, "1981-05-01", 2850, None, 30),
    (7902, "CAVIN", "MANAGER", 7300, "1981-06-02", 1200, None, 20),
]

columns = ["empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno"]

# Build the employee DataFrame.
emp_df = spark.createDataFrame(data, columns)

# Two views of the same table: one for employees, one for managers.
employees = emp_df.alias("df1")

# Manager-side view: every column renamed with an "m" prefix so the
# self-join produces unambiguous column names.
managers = emp_df.alias("df2").select(
    col("empno").alias("mempno"),
    col("ename").alias("mename"),
    col("job").alias("mjob"),
    col("mgr").alias("mmgr"),
    col("hiredate").alias("mhiredate"),
    col("sal").alias("msal"),
    col("comm").alias("mcomm"),
    col("deptno").alias("mdeptno"),
)

# Self-join each employee to their manager's row (mgr -> mempno),
# keeping only the columns needed for the comparison.
joined_df = employees.join(
    managers, employees["mgr"] == managers["mempno"], "left"
).select("empno", "sal", "msal")

# Keep only employees earning more than their manager.
result = joined_df.filter(col("sal") > col("msal"))
result.show()

Output

+-----+----+----+
|empno| sal|msal|
+-----+----+----+
| 7654|3100|2850|
| 7369|2150|1200|
+-----+----+----+

Approach#2

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) a SparkSession for this example.
spark = (
    SparkSession.builder
    .appName("EmployeeSalaryComparison")
    .getOrCreate()
)

# Sample rows: (empno, ename, job, mgr, hiredate, sal, comm, deptno)
data = [
    (7369, "SMITH", "CLERK", 7902, "1981-04-02", 2150, None, 20),
    (7654, "MARTIN", "SALESMAN", 7698, "1981-09-28", 3100, 1000, 30),
    (7698, "BLAKE", "MANAGER", 7839, "1981-05-01", 2850, None, 30),
    (7902, "CAVIN", "MANAGER", 7300, "1981-06-02", 1200, None, 20),
]

columns = ["empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno"]

# Build the employee DataFrame.
emp_df = spark.createDataFrame(data, columns)

# Manager-side copy: rename every column with an "m" prefix in one
# comprehension instead of listing each alias by hand.
mgr_df = emp_df.select(*[col(name).alias("m" + name) for name in columns])

# Self-join each employee to their manager's row (mgr -> mempno).
joined_df = emp_df.join(
    mgr_df, emp_df.mgr == mgr_df.mempno, "left"
).select("empno", "sal", "msal")

# Keep only employees earning more than their manager.
result = joined_df.filter(col("sal") > col("msal"))

result.show()

Output

+-----+----+----+
|empno| sal|msal|
+-----+----+----+
| 7654|3100|2850|
| 7369|2150|1200|
+-----+----+----+

Author: Srini

Experienced Data Engineer, having skills in PySpark, Databricks, Python SQL, AWS, Linux, and Mainframe