Here is an SQL Query that extracts employees whose salary is greater than the average salary.
SQL Query in PySpark & Pandas
Table of contents
SQL Query
SELECT
employee_id,
last_name,
first_name,
salary
FROM
employee
WHERE
salary > (SELECT AVG(salary) FROM employee);
Pandas Code
import pandas as pd
# Assuming you have a DataFrame named 'employee_df' containing employee data
# Calculate the average salary
average_salary = employee_df['salary'].mean()
# Extract employees with salary greater than the average salary
high_salary_employees = employee_df[employee_df['salary'] > average_salary]
# Display the result
print(high_salary_employees)
PySpark Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
# Initialize SparkSession
spark = SparkSession.builder \
.appName("HighSalaryEmployees") \
.getOrCreate()
# Assuming you have loaded your data into a DataFrame named 'employee_df'
# If not, replace 'employee_df' with your actual DataFrame
# Calculate the average salary
average_salary = employee_df.select(avg("salary")).collect()[0][0]
# Extract employees with salary greater than the average salary
high_salary_employees = employee_df.filter(employee_df["salary"] > average_salary)
# Display the result
high_salary_employees.show()
# Stop SparkSession
spark.stop()
You must be logged in to post a comment.