Here are some interview questions for the Data engineer roles: Python, PySpark, and SQL. These are useful for practice.

Python, PySpark and SQL interview Questions
Photo by Greta Hoffman on Pexels.com

Python Questions

Q.1) state=[{“Bob”,”87″}, {“Mike”, “35”},{“Bob”, “52”}, {“Jason”,”35″}, {“Mike”, “55”}, {“Jessica”, “99”}]
Find out from Max marks from the above data

# Given data
state = [{"Bob", "87"}, {"Mike", "35"}, {"Bob", "52"}, {"Jason", "35"}, {"Mike", "55"}, {"Jessica", "99"}]

# Convert sets to a list of tuples
state_list = []
for s in state:
    for item in s:
        # Identify the name and the marks by checking if it's numeric
        if item.isdigit():
            marks = int(item)
        else:
            name = item
    state_list.append((name, marks))

# Find the person with the maximum marks
max_record = max(state_list, key=lambda x: x[1])

print(f"{max_record[0]} has the maximum marks: {max_record[1]}")

Q.2. Write Python code?

Question

1
2 2
3 3 3
4 4 4 4

Solution

# Number of rows
n = 4

for i in range(1, n + 1):
    # Print 'i' i times in each row
    print((str(i) + ' ') * i)

PySpark Questions

Q.3 Pyspark script to find an employee with the highest salary department-wise

Method-1: using groupBy()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as max_

# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeHighestSalary").getOrCreate()

# Sample data for employees: [employee_id, employee_name, department, salary]
data = [
    (1, "John", "HR", 5000),
    (2, "Jane", "HR", 6000),
    (3, "Sam", "IT", 7000),
    (4, "Mike", "IT", 9000),
    (5, "Sara", "Finance", 8000),
    (6, "Bob", "Finance", 8500)
]

# Create DataFrame from sample data
columns = ["employee_id", "employee_name", "department", "salary"]
df = spark.createDataFrame(data, columns)

# Find the maximum salary per department
max_salary_df = df.groupBy("department").agg(max_("salary").alias("max_salary"))

# Join the original dataframe with max_salary_df to get employee details with highest salary
result_df = df.join(max_salary_df, (df.department == max_salary_df.department) & (df.salary == max_salary_df.max_salary)) \
              .select(df["employee_id"], df["employee_name"], df["department"], df["salary"])

# Show the result
result_df.show()

# Stop the Spark session
spark.stop()

Method:2 Using Windows function

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank

# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeHighestSalaryRank").getOrCreate()

# Sample data for employees: [employee_id, employee_name, department, salary]
data = [
    (1, "John", "HR", 5000),
    (2, "Jane", "HR", 6000),
    (3, "Sam", "IT", 7000),
    (4, "Mike", "IT", 9000),
    (5, "Sara", "Finance", 8000),
    (6, "Bob", "Finance", 8500)
]

# Create DataFrame from sample data
columns = ["employee_id", "employee_name", "department", "salary"]
df = spark.createDataFrame(data, columns)

# Define a window specification: partition by department, order by salary in descending order
window_spec = Window.partitionBy("department").orderBy(col("salary").desc())

# Apply rank() to each employee in their department based on salary
ranked_df = df.withColumn("rank", rank().over(window_spec))

# Filter to get only the top-ranked employees (rank == 1) in each department
highest_salary_df = ranked_df.filter(col("rank") == 1).drop("rank")

# Show the result
highest_salary_df.show()

SQL Questions

Q.4

Input 
 
Name	Address	           Email	Floor	Resource
A	Bangalore	a@gmail.com      1	Cpu
A	Bangalore	A1@gmail.com      1	Cpu
A	Bangalore	A2@gmail.com      2	desktop
B	Bangalore	b@gmail.com      2	Desktop
B	Bangalore	B1@gmail.com     2	Desktop
B	Bangalore	B2@gmail.com	1	Monitor
 
Output
 
Name	Total_visit	Most_visited_floor	Resources_used
A	3	               1	       Cpu, desktop
B	3	               2	      Desktop, Monitor

Solution

use sakila;

CREATE TABLE my_floor (
    Name VARCHAR(50),
    Address VARCHAR(100),
    Email VARCHAR(100),
    Floor INT,
    Resource VARCHAR(50)
);

INSERT INTO my_floor (Name, Address, Email, Floor, Resource)
VALUES 
    ('A', 'Bangalore', 'a@gmail.com', 1, 'Cpu'),
    ('A', 'Bangalore', 'A1@gmail.com', 1, 'Cpu'),
    ('A', 'Bangalore', 'A2@gmail.com', 2, 'desktop'),
    ('B', 'Bangalore', 'b@gmail.com', 2, 'Desktop'),
    ('B', 'Bangalore', 'B1@gmail.com', 2, 'Desktop'),
    ('B', 'Bangalore', 'B2@gmail.com', 1, 'Monitor');
   
   
   SELECT 
    Name,
    COUNT(*) AS Total_visit,
    -- Find the most visited floor using a subquery and GROUP BY
    (SELECT Floor 
     FROM (
         SELECT Floor, COUNT(*) AS floor_visits
         FROM my_floor t2
         WHERE t2.Name = t1.Name
         GROUP BY Floor
         ORDER BY floor_visits DESC
         LIMIT 1
     ) AS most_visited_floor_subquery
    ) AS Most_visited_floor,
    -- Aggregate distinct resources used by each name
    GROUP_CONCAT(DISTINCT Resource ORDER BY Resource SEPARATOR ', ') AS Resources_used
FROM my_floor t1
GROUP BY Name;

Output

Name	Total_visit	Most_visited_floor	Resources_used
---     ----             ------                 -----
A	3	          1	                Cpu, desktop
B	3	          2	                Desktop, Monitor

Q.5 Amounts deposits question

transaction_id  ,account_holder,  transaction_type,  amount
1,  John, Deposit,  1000.0
2,  Alice,  Withdrawal,  500.0
3,  Bob,  Deposit,  1500.0
4,  John,  Withdrawal,  500.0
5,  Bob,  Withdrawal,  1000.0
 
-------------------------------------
 
Q-1 : Calculate Total Balance for Each Account Holder ?
Q-2 : Find the Account Holder with the Highest Total Balance ?
Q-3 : Identify the Most Frequent Transaction Type ?

Solution

CREATE TABLE transactions (
    transaction_id INT,
    account_holder VARCHAR(50),
    transaction_type VARCHAR(50),
    amount DECIMAL(10, 2)
);

INSERT INTO transactions (transaction_id, account_holder, transaction_type, amount)
VALUES
    (1, 'John', 'Deposit', 1000.00),
    (2, 'Alice', 'Withdrawal', 500.00),
    (3, 'Bob', 'Deposit', 1500.00),
    (4, 'John', 'Withdrawal', 500.00),
    (5, 'Bob', 'Withdrawal', 1000.00);

Total balance for each account holder

SELECT 
    account_holder,
    SUM(CASE WHEN transaction_type = 'Deposit' THEN amount 
             WHEN transaction_type = 'Withdrawal' THEN -amount 
             ELSE 0 END) AS total_balance
FROM 
    transactions
GROUP BY 
    account_holder;

Account holder with the highest balance

SELECT 
    account_holder, 
    total_balance
FROM (
    SELECT 
        account_holder,
        SUM(CASE WHEN transaction_type = 'Deposit' THEN amount 
                 WHEN transaction_type = 'Withdrawal' THEN -amount 
                 ELSE 0 END) AS total_balance
    FROM 
        transactions
    GROUP BY 
        account_holder
) AS balances
ORDER BY 
    total_balance DESC
LIMIT 1;

Identify the most frequent transaction type

SELECT 
    transaction_type, 
    COUNT(*) AS frequency
FROM 
    transactions
GROUP BY 
    transaction_type
ORDER BY 
    frequency DESC
LIMIT 1;