Given a sales table containing transaction records for various products, write a PySpark query to find products that have exactly two transactions. For these products, return only the details of their second transaction (ordered by transaction_date).
Input Data:
+--------------+----------+------------+--------+-----+-------------------+
|transaction_id|product_id|product_name|quantity|price|   transaction_date|
+--------------+----------+------------+--------+-----+-------------------+
|           101|         1|      Laptop|       1|50000|2025-05-01 10:15:00|
|           102|         1|      Laptop|       2|50000|2025-05-02 12:30:00|
|           103|         2|       Phone|       1|20000|2025-05-03 09:45:00|
|           104|         3|      Tablet|       1|15000|2025-05-04 11:00:00|
|           105|         2|       Phone|       1|20000|2025-05-05 14:20:00|
|           106|         4|  Smartwatch|       1| 8000|2025-05-06 16:10:00|
|           107|         5|      Camera|       1|25000|2025-05-07 18:25:00|
|           108|         3|      Tablet|       1|15000|2025-05-08 20:30:00|
|           109|         6|     Monitor|       1|50000|2025-05-09 09:00:00|
|           110|         6|     Monitor|       2|12000|2025-05-10 14:45:00|
|           111|         7|    Keyboard|       1| 2000|2025-05-11 10:20:00|
|           112|         7|    Keyboard|       1| 2000|2025-05-12 11:35:00|
|           113|         7|    Keyboard|       1| 2000|2025-05-13 13:50:00|
|           114|         2|       Phone|       2|20000|2025-05-14 15:05:00|
|           115|         8|  Headphones|       1| 5000|2025-05-15 17:20:00|
+--------------+----------+------------+--------+-----+-------------------+
Expected Output: Laptop, Tablet, and Monitor each have exactly two transactions, so their second transactions are returned.
+--------------+----------+------------+--------+-----+-------------------+
|transaction_id|product_id|product_name|quantity|price|   transaction_date|
+--------------+----------+------------+--------+-----+-------------------+
|           102|         1|      Laptop|       2|50000|2025-05-02 12:30:00|
|           108|         3|      Tablet|       1|15000|2025-05-08 20:30:00|
|           110|         6|     Monitor|       2|12000|2025-05-10 14:45:00|
+--------------+----------+------------+--------+-----+-------------------+
Input DataFrame Script
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col  # cast is a Column method, not an importable function
schema = StructType([
    StructField("transaction_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("transaction_date", StringType(), True)  # loaded as string, cast to timestamp below
])
# Create sample data
data = [
    (101, 1, "Laptop", 1, 50000, "2025-05-01 10:15:00"),
    (102, 1, "Laptop", 2, 50000, "2025-05-02 12:30:00"),
    (103, 2, "Phone", 1, 20000, "2025-05-03 09:45:00"),
    (104, 3, "Tablet", 1, 15000, "2025-05-04 11:00:00"),
    (105, 2, "Phone", 1, 20000, "2025-05-05 14:20:00"),
    (106, 4, "Smartwatch", 1, 8000, "2025-05-06 16:10:00"),
    (107, 5, "Camera", 1, 25000, "2025-05-07 18:25:00"),
    (108, 3, "Tablet", 1, 15000, "2025-05-08 20:30:00"),
    (109, 6, "Monitor", 1, 50000, "2025-05-09 09:00:00"),
    (110, 6, "Monitor", 2, 12000, "2025-05-10 14:45:00"),
    (111, 7, "Keyboard", 1, 2000, "2025-05-11 10:20:00"),
    (112, 7, "Keyboard", 1, 2000, "2025-05-12 11:35:00"),
    (113, 7, "Keyboard", 1, 2000, "2025-05-13 13:50:00"),
    (114, 2, "Phone", 2, 20000, "2025-05-14 15:05:00"),
    (115, 8, "Headphones", 1, 5000, "2025-05-15 17:20:00")
]
# Create DataFrame
fabricofdata_DF = spark.createDataFrame(data, schema=schema)
fabricofdata_DF = fabricofdata_DF.withColumn("transaction_date", col("transaction_date").cast("timestamp"))
# Show sample data
fabricofdata_DF.show()
Try solving the question yourself first! If you need help, a sketch of one possible approach follows below.
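One common way to tackle this (a sketch, not the only valid solution) is with two window functions partitioned by product_id: row_number ordered by transaction_date to rank each product's transactions chronologically, and count to get the total transactions per product, then keep the rank-2 row of products whose total is exactly 2. The DataFrame fabricofdata_DF comes from the script above; the window and result variable names below are my own.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, count, col

# Rank each product's transactions chronologically
order_window = Window.partitionBy("product_id").orderBy("transaction_date")
# Unordered window: count spans the whole partition, i.e. all of a product's transactions
count_window = Window.partitionBy("product_id")

result_DF = (
    fabricofdata_DF
    .withColumn("txn_rank", row_number().over(order_window))
    .withColumn("txn_count", count("*").over(count_window))
    .filter((col("txn_count") == 2) & (col("txn_rank") == 2))  # second of exactly two
    .drop("txn_rank", "txn_count")
)
result_DF.show()

Against the sample data this keeps transactions 102 (Laptop), 108 (Tablet), and 110 (Monitor), matching the expected output above. An equivalent alternative is to aggregate counts with groupBy and join back, but the window version avoids the extra join.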