Here are some interview questions for the Data engineer roles: Python, PySpark, and SQL. These are useful for practice.

Python Questions
Q.1) state=[{“Bob”,”87″}, {“Mike”, “35”},{“Bob”, “52”}, {“Jason”,”35″}, {“Mike”, “55”}, {“Jessica”, “99”}]
Find out from Max marks from the above data
# Given data
state = [{"Bob", "87"}, {"Mike", "35"}, {"Bob", "52"}, {"Jason", "35"}, {"Mike", "55"}, {"Jessica", "99"}]
# Convert sets to a list of tuples
state_list = []
for s in state:
for item in s:
# Identify the name and the marks by checking if it's numeric
if item.isdigit():
marks = int(item)
else:
name = item
state_list.append((name, marks))
# Find the person with the maximum marks
max_record = max(state_list, key=lambda x: x[1])
print(f"{max_record[0]} has the maximum marks: {max_record[1]}")
Q.2. Write Python code?
Question
1
2 2
3 3 3
4 4 4 4
Solution
# Number of rows
n = 4
for i in range(1, n + 1):
# Print 'i' i times in each row
print((str(i) + ' ') * i)
PySpark Questions
Q.3 Pyspark script to find an employee with the highest salary department-wise
Method-1: using groupBy()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as max_
# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeHighestSalary").getOrCreate()
# Sample data for employees: [employee_id, employee_name, department, salary]
data = [
(1, "John", "HR", 5000),
(2, "Jane", "HR", 6000),
(3, "Sam", "IT", 7000),
(4, "Mike", "IT", 9000),
(5, "Sara", "Finance", 8000),
(6, "Bob", "Finance", 8500)
]
# Create DataFrame from sample data
columns = ["employee_id", "employee_name", "department", "salary"]
df = spark.createDataFrame(data, columns)
# Find the maximum salary per department
max_salary_df = df.groupBy("department").agg(max_("salary").alias("max_salary"))
# Join the original dataframe with max_salary_df to get employee details with highest salary
result_df = df.join(max_salary_df, (df.department == max_salary_df.department) & (df.salary == max_salary_df.max_salary)) \
.select(df["employee_id"], df["employee_name"], df["department"], df["salary"])
# Show the result
result_df.show()
# Stop the Spark session
spark.stop()
Method:2 Using Windows function
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank
# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeHighestSalaryRank").getOrCreate()
# Sample data for employees: [employee_id, employee_name, department, salary]
data = [
(1, "John", "HR", 5000),
(2, "Jane", "HR", 6000),
(3, "Sam", "IT", 7000),
(4, "Mike", "IT", 9000),
(5, "Sara", "Finance", 8000),
(6, "Bob", "Finance", 8500)
]
# Create DataFrame from sample data
columns = ["employee_id", "employee_name", "department", "salary"]
df = spark.createDataFrame(data, columns)
# Define a window specification: partition by department, order by salary in descending order
window_spec = Window.partitionBy("department").orderBy(col("salary").desc())
# Apply rank() to each employee in their department based on salary
ranked_df = df.withColumn("rank", rank().over(window_spec))
# Filter to get only the top-ranked employees (rank == 1) in each department
highest_salary_df = ranked_df.filter(col("rank") == 1).drop("rank")
# Show the result
highest_salary_df.show()
SQL Questions
Q.4
Input
Name Address Email Floor Resource
A Bangalore a@gmail.com 1 Cpu
A Bangalore A1@gmail.com 1 Cpu
A Bangalore A2@gmail.com 2 desktop
B Bangalore b@gmail.com 2 Desktop
B Bangalore B1@gmail.com 2 Desktop
B Bangalore B2@gmail.com 1 Monitor
Output
Name Total_visit Most_visited_floor Resources_used
A 3 1 Cpu, desktop
B 3 2 Desktop, Monitor
Solution
use sakila;
CREATE TABLE my_floor (
Name VARCHAR(50),
Address VARCHAR(100),
Email VARCHAR(100),
Floor INT,
Resource VARCHAR(50)
);
INSERT INTO my_floor (Name, Address, Email, Floor, Resource)
VALUES
('A', 'Bangalore', 'a@gmail.com', 1, 'Cpu'),
('A', 'Bangalore', 'A1@gmail.com', 1, 'Cpu'),
('A', 'Bangalore', 'A2@gmail.com', 2, 'desktop'),
('B', 'Bangalore', 'b@gmail.com', 2, 'Desktop'),
('B', 'Bangalore', 'B1@gmail.com', 2, 'Desktop'),
('B', 'Bangalore', 'B2@gmail.com', 1, 'Monitor');
SELECT
Name,
COUNT(*) AS Total_visit,
-- Find the most visited floor using a subquery and GROUP BY
(SELECT Floor
FROM (
SELECT Floor, COUNT(*) AS floor_visits
FROM my_floor t2
WHERE t2.Name = t1.Name
GROUP BY Floor
ORDER BY floor_visits DESC
LIMIT 1
) AS most_visited_floor_subquery
) AS Most_visited_floor,
-- Aggregate distinct resources used by each name
GROUP_CONCAT(DISTINCT Resource ORDER BY Resource SEPARATOR ', ') AS Resources_used
FROM my_floor t1
GROUP BY Name;
Output
Name Total_visit Most_visited_floor Resources_used
--- ---- ------ -----
A 3 1 Cpu, desktop
B 3 2 Desktop, Monitor
Q.5 Amounts deposits question
transaction_id ,account_holder, transaction_type, amount
1, John, Deposit, 1000.0
2, Alice, Withdrawal, 500.0
3, Bob, Deposit, 1500.0
4, John, Withdrawal, 500.0
5, Bob, Withdrawal, 1000.0
-------------------------------------
Q-1 : Calculate Total Balance for Each Account Holder ?
Q-2 : Find the Account Holder with the Highest Total Balance ?
Q-3 : Identify the Most Frequent Transaction Type ?
Solution
CREATE TABLE transactions (
transaction_id INT,
account_holder VARCHAR(50),
transaction_type VARCHAR(50),
amount DECIMAL(10, 2)
);
INSERT INTO transactions (transaction_id, account_holder, transaction_type, amount)
VALUES
(1, 'John', 'Deposit', 1000.00),
(2, 'Alice', 'Withdrawal', 500.00),
(3, 'Bob', 'Deposit', 1500.00),
(4, 'John', 'Withdrawal', 500.00),
(5, 'Bob', 'Withdrawal', 1000.00);
Total balance for each account holder
SELECT
account_holder,
SUM(CASE WHEN transaction_type = 'Deposit' THEN amount
WHEN transaction_type = 'Withdrawal' THEN -amount
ELSE 0 END) AS total_balance
FROM
transactions
GROUP BY
account_holder;
Account holder with the highest balance
SELECT
account_holder,
total_balance
FROM (
SELECT
account_holder,
SUM(CASE WHEN transaction_type = 'Deposit' THEN amount
WHEN transaction_type = 'Withdrawal' THEN -amount
ELSE 0 END) AS total_balance
FROM
transactions
GROUP BY
account_holder
) AS balances
ORDER BY
total_balance DESC
LIMIT 1;
Identify the most frequent transaction type
SELECT
transaction_type,
COUNT(*) AS frequency
FROM
transactions
GROUP BY
transaction_type
ORDER BY
frequency DESC
LIMIT 1;







You must be logged in to post a comment.