Python PySpark Puzzle 1

PySpark Puzzle: Retrieve Orders Within 50% of Highest Order Amount

For each customer, retrieve the details of every order whose amount is within 50% of that customer's highest order amount (as implemented in the solutions below, an order qualifies when its amount is at most half of the customer's maximum order amount).



Input Data:


Expected Output:


Input DataFrame Script:
from pyspark.sql.functions import col, max, round
import random
from datetime import datetime, timedelta

# Generate the sample data
customers = [f"CUST{str(i).zfill(3)}" for i in range(1, 11)]
items = ["Laptop", "Smartphone", "Tablet", "Monitor", "Keyboard", "Mouse", "Printer", "Camera", "Speaker", "Router"]

data = []
order_id_counter = 1000
start_date = datetime(2025, 1, 1)

for _ in range(50):
    customer_id = random.choice(customers)
    item_name = random.choice(items)
    total_amount = random.randint(100, 5000)  # price range from 100 to 5000
    order_date = start_date + timedelta(days=random.randint(0, 365))
    order_id = f"ORD{order_id_counter}"
    order_id_counter += 1
    
    data.append([customer_id, order_id, order_date.strftime('%Y-%m-%d'), item_name, total_amount])

# Define schema for the DataFrame
columns = ["Customer_ID", "Order_ID", "Order_Date", "ItemName", "Total_Amount"]

# Create PySpark DataFrame
fabricofdata_df = spark.createDataFrame(data, columns)
display(fabricofdata_df)
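
Note: the script above assumes a notebook environment such as Microsoft Fabric or Databricks, where the spark session and the display() function are already available. To run it as a plain PySpark script instead, a minimal sketch (the app name is just illustrative) is:

# Minimal sketch for running outside a notebook (assumes a local PySpark installation)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkPuzzle1").getOrCreate()  # app name is illustrative
fabricofdata_df = spark.createDataFrame(data, columns)
fabricofdata_df.show(10, truncate=False)  # stand-in for display()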

Please try to answer the question and post your solution in the comments section; if you are still not able to get it, click the Show button below to see the answer.



# Solution 1: using a join and a filter
# Per-customer highest order amount
fabricofdata_df2 = fabricofdata_df.groupBy(col("Customer_ID")).agg(
    max(col("Total_Amount")).alias("Max_Amount")
)
# Join the maximum back to every order and keep half of it as the threshold
fabricofdata_df3 = fabricofdata_df.join(
    fabricofdata_df2,
    on=(fabricofdata_df.Customer_ID == fabricofdata_df2.Customer_ID),
    how="inner",
).select(fabricofdata_df["*"], round(fabricofdata_df2["Max_Amount"] * 0.5, 3).alias("Half_Max"))
display(fabricofdata_df3.sort(col("Customer_ID")))
# Keep only the orders whose amount is at most half of the customer's maximum
display(fabricofdata_df3.filter(col("Total_Amount") <= col("Half_Max")).sort(col("Customer_ID")).drop("Half_Max"))
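
Because fabricofdata_df2 has at most one row per customer, it is typically small enough to broadcast. A variant of the same join using the broadcast hint from pyspark.sql.functions (the intermediate names half_max_df and result_df are just illustrative) could look like this:

# Variant of Solution 1 with a broadcast hint (sketch; same logic as above)
from pyspark.sql.functions import broadcast

half_max_df = broadcast(
    fabricofdata_df2.withColumn("Half_Max", round(col("Max_Amount") * 0.5, 3)).select("Customer_ID", "Half_Max")
)
result_df = (
    fabricofdata_df.join(half_max_df, on="Customer_ID", how="inner")
    .filter(col("Total_Amount") <= col("Half_Max"))
    .drop("Half_Max")
)
display(result_df.sort(col("Customer_ID")))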


# Solution 2: using window partitioning
from pyspark.sql.functions import max
from pyspark.sql.window import Window

# Maximum order amount per customer; no orderBy is needed because the maximum is taken over the whole partition
fabricofdata_df_R = fabricofdata_df.withColumn(
    "MaxAmount",
    max(col("Total_Amount")).over(Window.partitionBy(col("Customer_ID"))) * 0.5,
)
display(fabricofdata_df_R.filter(col("Total_Amount") <= col("MaxAmount")).drop("MaxAmount"))
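
For comparison, the same window logic can also be written in Spark SQL over a temporary view; a minimal sketch (the view name fabricofdata_orders is just illustrative):

# Spark SQL equivalent of Solution 2 (sketch)
fabricofdata_df.createOrReplaceTempView("fabricofdata_orders")  # view name is illustrative
display(spark.sql("""
    SELECT Customer_ID, Order_ID, Order_Date, ItemName, Total_Amount
    FROM (
        SELECT *,
               MAX(Total_Amount) OVER (PARTITION BY Customer_ID) * 0.5 AS Half_Max
        FROM fabricofdata_orders
    ) t
    WHERE Total_Amount <= Half_Max
    ORDER BY Customer_ID
"""))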





