Write a PySpark query to identify the top 3 employees with the highest average performance score in each department.
Input Data:
Expected Output:
Input DataFrame Script
employee_data = [
(1001, 2022, 3, "HR"),
(1002, 2022, 5, "HR"),
(1003, 2022, 4, "HR"),
(1004, 2022, 2, "HR"),
(1005, 2022, 1, "HR"),
(2001, 2022, 4, "Engineering"),
(2002, 2022, 3, "Engineering"),
(2003, 2022, 5, "Engineering"),
(2004, 2022, 2, "Engineering"),
(2005, 2022, 1, "Engineering"),
(3001, 2022, 5, "Sales"),
(3002, 2022, 4, "Sales"),
(3003, 2022, 3, "Sales"),
(3004, 2022, 2, "Sales"),
(3005, 2022, 1, "Sales"),
(4001, 2022, 3, "Finance"),
(4002, 2022, 1, "Finance"),
(4003, 2022, 5, "Finance"),
(4004, 2022, 4, "Finance"),
(4005, 2022, 2, "Finance"),
(5001, 2022, 4, "Marketing"),
(5002, 2022, 3, "Marketing"),
(5003, 2022, 5, "Marketing"),
(5004, 2022, 1, "Marketing"),
(5005, 2022, 2, "Marketing"),
(1001, 2023, 5, "HR"),
(1002, 2023, 3, "HR"),
(1003, 2023, 1, "HR"),
(1004, 2023, 4, "HR"),
(1005, 2023, 2, "HR"),
(2001, 2023, 1, "Engineering"),
(2002, 2023, 5, "Engineering"),
(2003, 2023, 4, "Engineering"),
(2004, 2023, 3, "Engineering"),
(2005, 2023, 2, "Engineering"),
(3001, 2023, 4, "Sales"),
(3002, 2023, 2, "Sales"),
(3003, 2023, 1, "Sales"),
(3004, 2023, 5, "Sales"),
(3005, 2023, 3, "Sales"),
(4001, 2023, 5, "Finance"),
(4002, 2023, 2, "Finance"),
(4003, 2023, 3, "Finance"),
(4004, 2023, 1, "Finance"),
(4005, 2023, 4, "Finance"),
(5001, 2023, 3, "Marketing"),
(5002, 2023, 4, "Marketing"),
(5003, 2023, 1, "Marketing"),
(5004, 2023, 5, "Marketing"),
(5005, 2023, 2, "Marketing"),
(1001, 2024, 2, "HR"),
(1002, 2024, 4, "HR"),
(1003, 2024, 1, "HR"),
(1004, 2024, 5, "HR"),
(1005, 2024, 3, "HR"),
(2001, 2024, 5, "Engineering"),
(2002, 2024, 2, "Engineering"),
(2003, 2024, 3, "Engineering"),
(2004, 2024, 4, "Engineering"),
(2005, 2024, 1, "Engineering"),
(3001, 2024, 2, "Sales"),
(3002, 2024, 4, "Sales"),
(3003, 2024, 5, "Sales"),
(3004, 2024, 3, "Sales"),
(3005, 2024, 1, "Sales"),
(4001, 2024, 4, "Finance"),
(4002, 2024, 5, "Finance"),
(4003, 2024, 1, "Finance"),
(4004, 2024, 3, "Finance"),
(4005, 2024, 2, "Finance"),
(5001, 2024, 5, "Marketing"),
(5002, 2024, 2, "Marketing"),
(5003, 2024, 3, "Marketing"),
(5004, 2024, 4, "Marketing"),
(5005, 2024, 1, "Marketing")
]
employee_columns = ["EMP_id", "Assessment_year", "Rating", "Department"]
employee_df = spark.createDataFrame(employee_data, employee_columns)
# Show Employee DataFrame
print("Employee DataFrame:")
display(employee_df)
Try solving the question yourself! If you need help, click below to reveal the solution.