You are given a table of customer transactions that records each customer's purchases over time. Your task is to identify the customers with the highest lifetime spending. Write PySpark code to calculate the total amount spent by each customer and rank the customers from highest to lowest total spend.
Input Data:
Expected Output:
Input DataFrame Script
from datetime import datetime, timedelta
import random
# Generate random date function
def generate_random_date(start_date, end_date):
    """Return a uniformly random datetime on a day between start_date and end_date, inclusive.

    Both bounds are datetime objects; the time-of-day component of start_date
    is preserved in the result (only whole days are added).
    """
    span_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_days))
    return start_date + offset
# Define start and end dates for random date generation:
# all transaction dates fall within calendar year 2025.
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 12, 31)
# Sample data for Customers table: (Customer_id, Customer_name) tuples.
customers_data = [
(1, "John Doe"),
(2, "Jane Smith"),
(3, "Alice Brown"),
(4, "Bob Johnson"),
(5, "Emily Davis"),
(6, "Charlie Wilson"),
(7, "Diana Miller"),
(8, "Ethan Clark"),
(9, "Fiona Taylor"),
(10, "George Martin")
]
customers_columns = ["Customer_id", "Customer_name"]
# NOTE(review): `spark` is not defined in this script — it is assumed to be a
# SparkSession injected by the notebook environment (e.g. Databricks); confirm
# before running standalone.
customers_df = spark.createDataFrame(customers_data, customers_columns)
# Sample data for Transactions table with random dates and values.
# Each customer (ids 1-10) appears exactly three times, so every customer has
# three transactions to aggregate when computing lifetime spend.
transactions_data = [
(1, generate_random_date(start_date, end_date), "Item A", 100.50),
(2, generate_random_date(start_date, end_date), "Item B", 200.75),
(3, generate_random_date(start_date, end_date), "Item C", 50.25),
(4, generate_random_date(start_date, end_date), "Item D", 150.00),
(5, generate_random_date(start_date, end_date), "Item E", 300.00),
(6, generate_random_date(start_date, end_date), "Item F", 75.25),
(7, generate_random_date(start_date, end_date), "Item G", 125.00),
(8, generate_random_date(start_date, end_date), "Item H", 250.00),
(9, generate_random_date(start_date, end_date), "Item I", 180.00),
(10, generate_random_date(start_date, end_date), "Item J", 90.00),
(1, generate_random_date(start_date, end_date), "Item K", 75.00),
(2, generate_random_date(start_date, end_date), "Item L", 120.00),
(3, generate_random_date(start_date, end_date), "Item M", 30.00),
(4, generate_random_date(start_date, end_date), "Item N", 200.00),
(5, generate_random_date(start_date, end_date), "Item O", 50.00),
(6, generate_random_date(start_date, end_date), "Item P", 100.00),
(7, generate_random_date(start_date, end_date), "Item Q", 80.00),
(8, generate_random_date(start_date, end_date), "Item R", 300.00),
(9, generate_random_date(start_date, end_date), "Item S", 150.00),
(10, generate_random_date(start_date, end_date), "Item T", 110.00),
(1, generate_random_date(start_date, end_date), "Item U", 90.00),
(2, generate_random_date(start_date, end_date), "Item V", 75.00),
(3, generate_random_date(start_date, end_date), "Item W", 60.00),
(4, generate_random_date(start_date, end_date), "Item X", 100.00),
(5, generate_random_date(start_date, end_date), "Item Y", 200.00),
(6, generate_random_date(start_date, end_date), "Item Z", 50.00),
(7, generate_random_date(start_date, end_date), "Item AA", 150.00),
(8, generate_random_date(start_date, end_date), "Item BB", 120.00),
(9, generate_random_date(start_date, end_date), "Item CC", 95.00),
(10, generate_random_date(start_date, end_date), "Item DD", 125.00)
]
# Schema for the Transactions table; Customer_id is the join key back to
# customers_df, Item_value is the per-purchase amount to be summed per customer.
transactions_columns = ["Customer_id", "Order_date", "Item_name", "Item_value"]
transactions_df = spark.createDataFrame(transactions_data, transactions_columns)
# NOTE(review): `display` is also environment-provided (Databricks notebook
# helper for rendering DataFrames) — replace with .show() outside that environment.
display(customers_df)
display(transactions_df)
Try solving the question yourself! If you need help, click below to reveal the solution.