Retrieve the order details of each customer where the order amount of each order is within 50% of the highest ordered amount for that customer.
Input Data:
Expected OutPut:
Input DataFrame Script:
from pyspark.sql.functions import col,count,max,round,filter
import random
from datetime import datetime, timedelta
# Generate the sample data
customers = [f"CUST{str(i).zfill(3)}" for i in range(1, 11)]
items = ["Laptop", "Smartphone", "Tablet", "Monitor", "Keyboard", "Mouse", "Printer", "Camera", "Speaker", "Router"]
data = []
order_id_counter = 1000
start_date = datetime(2025, 1, 1)
for _ in range(50):
customer_id = random.choice(customers)
item_name = random.choice(items)
total_amount = random.randint(100, 5000) # price range from 100 to 5000
order_date = start_date + timedelta(days=random.randint(0, 365))
order_id = f"ORD{order_id_counter}"
order_id_counter += 1
data.append([customer_id, order_id, order_date.strftime('%Y-%m-%d'), item_name, total_amount])
# Define schema for the DataFrame
columns = ["Customer_ID", "Order_ID", "Order_Date", "ItemName", "Total_Amount"]
# Create PySpark DataFrame
fabricofdata_df = spark.createDataFrame(data, columns)
display(fabricofdata_df)
Please try to answer the question and submit in comments section, if you still not able to get it click below show button to see the answer.