{"id": "q01", "difficulty": "easy", "question": "How many customers exist in the ERP system of record?", "gold_sql": "SELECT COUNT(*) FROM sales.customers", "expected_shape": "scalar"}
{"id": "q02", "difficulty": "easy", "question": "How many active products are in the current catalog?", "gold_sql": "SELECT COUNT(*) FROM ops.products WHERE active = TRUE", "expected_shape": "scalar"}
{"id": "q03", "difficulty": "easy", "question": "How many warehouses does Vantage Retail Group operate?", "gold_sql": "SELECT COUNT(*) FROM ops.warehouses", "expected_shape": "scalar"}
{"id": "q04", "difficulty": "easy", "question": "List all sales territories with their regions, alphabetically by territory name.", "gold_sql": "SELECT name, region FROM sales.territories ORDER BY name", "expected_shape": "table"}
{"id": "q05", "difficulty": "easy", "question": "How many orders were placed in November 2025?", "gold_sql": "SELECT COUNT(*) FROM orders.orders WHERE order_ts >= TIMESTAMP '2025-11-01 00:00:00' AND order_ts < TIMESTAMP '2025-12-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q06", "difficulty": "medium", "question": "What was total revenue in Q3 2025?", "gold_sql": "SELECT SUM(amount_usd) FROM finance.revenue_recognized WHERE posting_status = 'posted' AND recognized_date BETWEEN DATE '2025-07-01' AND DATE '2025-09-30'", "expected_shape": "scalar"}
{"id": "q07", "difficulty": "medium", "question": "Show monthly revenue for each month of 2025, in calendar order.", "gold_sql": "SELECT date_trunc('month', recognized_date) AS month, SUM(amount_usd) AS revenue_usd FROM finance.revenue_recognized WHERE posting_status = 'posted' AND recognized_date >= DATE '2025-01-01' AND recognized_date < DATE '2026-01-01' GROUP BY 1 ORDER BY 1", "expected_shape": "table"}
{"id": "q08", "difficulty": "hard", "question": "What was the total value of bookings taken in Q3 2025, converted to USD at the daily exchange rate?", "gold_sql": "SELECT SUM(b.amount * r.rate) FROM finance.bookings b JOIN finance.exchange_rates r ON r.rate_date = CAST(b.booked_at AS DATE) AND r.from_currency = b.currency_code AND r.to_currency = 'USD' WHERE b.booked_at >= TIMESTAMP '2025-07-01 00:00:00' AND b.booked_at < TIMESTAMP '2025-10-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q09", "difficulty": "medium", "question": "How much did we bill customers in October 2025, gross of tax and shipping?", "gold_sql": "SELECT SUM(amount_usd) FROM sales.rev_billed WHERE billed_date >= DATE '2025-10-01' AND billed_date < DATE '2025-11-01'", "expected_shape": "scalar"}
{"id": "q10", "difficulty": "easy", "question": "Per the marketing definition (30-day window, fraud excluded), how many active users did we have as of 2025-11-15?", "gold_sql": "SELECT active_user_count FROM marketing.active_users WHERE snapshot_date = DATE '2025-11-15'", "expected_shape": "scalar"}
{"id": "q11", "difficulty": "easy", "question": "How many distinct users were active on the website on 2025-11-15, per the daily web definition?", "gold_sql": "SELECT COUNT(*) FROM web.active_users_daily WHERE activity_date = DATE '2025-11-15'", "expected_shape": "scalar"}
{"id": "q12", "difficulty": "medium", "question": "What was the average order value in dollars for US-dollar orders placed in 2025, excluding cancelled orders?", "gold_sql": "SELECT AVG(order_total / 100.0) FROM orders.orders WHERE currency_code = 'USD' AND status <> 'cancelled' AND order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND order_ts < TIMESTAMP '2026-01-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q13", "difficulty": "easy", "question": "What was the total refund amount issued in October 2025?", "gold_sql": "SELECT SUM(refund_amount) FROM returns.refunds WHERE refunded_at >= TIMESTAMP '2025-10-01 00:00:00' AND refunded_at < TIMESTAMP '2025-11-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q14", "difficulty": "hard", "question": "What percentage of October 2025 US-dollar order value (non-cancelled orders) was refunded in USD during that same month?", "gold_sql": "SELECT 100.0 * (SELECT COALESCE(SUM(refund_amount), 0) FROM returns.refunds WHERE currency_code = 'USD' AND refunded_at >= TIMESTAMP '2025-10-01 00:00:00' AND refunded_at < TIMESTAMP '2025-11-01 00:00:00') / (SELECT SUM(order_total) / 100.0 FROM orders.orders WHERE currency_code = 'USD' AND status <> 'cancelled' AND order_ts >= TIMESTAMP '2025-10-01 00:00:00' AND order_ts < TIMESTAMP '2025-11-01 00:00:00')", "expected_shape": "scalar"}
{"id": "q15", "difficulty": "medium", "question": "How many orders placed in 2025 were guest checkouts, i.e. have no linked customer account?", "gold_sql": "SELECT COUNT(*) FROM orders.orders WHERE customer_id IS NULL AND order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND order_ts < TIMESTAMP '2026-01-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q16", "difficulty": "hard", "question": "Who were the top 10 customers by recognized revenue in 2025? Show customer id, full name, and revenue.", "gold_sql": "SELECT c.customer_id, c.full_name, SUM(r.amount_usd) AS revenue_usd FROM finance.revenue_recognized r JOIN orders.order_lines ol ON ol.order_line_id = r.order_line_id JOIN orders.orders o ON o.order_id = ol.order_id JOIN sales.customers c ON c.customer_id = o.customer_id WHERE r.posting_status = 'posted' AND r.recognized_date >= DATE '2025-01-01' AND r.recognized_date < DATE '2026-01-01' GROUP BY c.customer_id, c.full_name ORDER BY revenue_usd DESC, c.customer_id LIMIT 10", "expected_shape": "table"}
{"id": "q17", "difficulty": "easy", "question": "How many marketing CDP profiles are not matched to an ERP customer record?", "gold_sql": "SELECT COUNT(*) FROM marketing.customers WHERE crm_customer_id IS NULL", "expected_shape": "scalar"}
{"id": "q18", "difficulty": "medium", "question": "How many distinct ERP customers have at least one profile in the marketing CDP?", "gold_sql": "SELECT COUNT(DISTINCT crm_customer_id) FROM marketing.customers WHERE crm_customer_id IS NOT NULL", "expected_shape": "scalar"}
{"id": "q19", "difficulty": "easy", "question": "How many orders are in each order status?", "gold_sql": "SELECT status, COUNT(*) AS n_orders FROM orders.orders GROUP BY status ORDER BY status", "expected_shape": "table"}
{"id": "q20", "difficulty": "medium", "question": "What were the top 5 return reasons by number of returns requested in 2025?", "gold_sql": "SELECT rr.description, COUNT(*) AS n_returns FROM returns.returns r JOIN returns.return_reasons rr ON rr.reason_code = r.reason_code WHERE r.requested_at >= TIMESTAMP '2025-01-01 00:00:00' AND r.requested_at < TIMESTAMP '2026-01-01 00:00:00' GROUP BY rr.description ORDER BY n_returns DESC, rr.description LIMIT 5", "expected_shape": "table"}
{"id": "q21", "difficulty": "easy", "question": "How many marketing campaigns started in 2025?", "gold_sql": "SELECT COUNT(*) FROM marketing.campaigns WHERE starts_on >= DATE '2025-01-01' AND starts_on < DATE '2026-01-01'", "expected_shape": "scalar"}
{"id": "q22", "difficulty": "easy", "question": "What was total ad spend in Q4 2025 up to the snapshot date of 2025-11-30?", "gold_sql": "SELECT SUM(spend_usd) FROM marketing.ad_spend WHERE spend_date BETWEEN DATE '2025-10-01' AND DATE '2025-11-30'", "expected_shape": "scalar"}
{"id": "q23", "difficulty": "medium", "question": "Break down November 2025 ad spend by channel, highest spend first.", "gold_sql": "SELECT channel, SUM(spend_usd) AS spend_usd FROM marketing.ad_spend WHERE spend_date BETWEEN DATE '2025-11-01' AND DATE '2025-11-30' GROUP BY channel ORDER BY spend_usd DESC, channel", "expected_shape": "table"}
{"id": "q24", "difficulty": "hard", "question": "What was the click-through rate, as a percentage of sends that received at least one click, for campaign 42?", "gold_sql": "SELECT 100.0 * COUNT(DISTINCT CASE WHEN ee.event_type = 'click' THEN es.send_id END) / COUNT(DISTINCT es.send_id) AS ctr_pct FROM marketing.email_sends es LEFT JOIN marketing.email_events ee ON ee.send_id = es.send_id WHERE es.campaign_id = 42", "expected_shape": "scalar"}
{"id": "q25", "difficulty": "easy", "question": "How many employees are currently active (not terminated)?", "gold_sql": "SELECT COUNT(*) FROM hr.employees WHERE terminated_on IS NULL", "expected_shape": "scalar"}
{"id": "q26", "difficulty": "medium", "question": "Show active headcount by department, largest first.", "gold_sql": "SELECT d.name AS department, COUNT(*) AS headcount FROM hr.employees e JOIN hr.departments d ON d.department_id = e.department_id WHERE e.terminated_on IS NULL GROUP BY d.name ORDER BY headcount DESC, d.name", "expected_shape": "table"}
{"id": "q27", "difficulty": "easy", "question": "What was the average performance rating in review period 2025-H1?", "gold_sql": "SELECT AVG(rating) FROM hr.performance_reviews WHERE review_period = '2025-H1'", "expected_shape": "scalar"}
{"id": "q28", "difficulty": "medium", "question": "What were the top 10 products by units sold in 2025, excluding cancelled orders?", "gold_sql": "SELECT p.product_name, SUM(ol.qty) AS units_sold FROM orders.order_lines ol JOIN orders.orders o ON o.order_id = ol.order_id JOIN ops.products p ON p.product_id = ol.product_id WHERE o.status <> 'cancelled' AND o.order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND o.order_ts < TIMESTAMP '2026-01-01 00:00:00' GROUP BY p.product_name ORDER BY units_sold DESC, p.product_name LIMIT 10", "expected_shape": "table"}
{"id": "q29", "difficulty": "hard", "question": "What is the 2025 return rate by product category, measured as units returned divided by units sold, highest first?", "gold_sql": "WITH sold AS (SELECT p.category_id, SUM(ol.qty) AS units_sold FROM orders.order_lines ol JOIN orders.orders o ON o.order_id = ol.order_id JOIN ops.products p ON p.product_id = ol.product_id WHERE o.status <> 'cancelled' AND o.order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND o.order_ts < TIMESTAMP '2026-01-01 00:00:00' GROUP BY p.category_id), returned AS (SELECT p.category_id, SUM(ri.qty) AS units_returned FROM returns.return_items ri JOIN returns.returns r ON r.return_id = ri.return_id JOIN orders.order_lines ol ON ol.order_line_id = ri.order_line_id JOIN ops.products p ON p.product_id = ol.product_id WHERE r.requested_at >= TIMESTAMP '2025-01-01 00:00:00' AND r.requested_at < TIMESTAMP '2026-01-01 00:00:00' GROUP BY p.category_id) SELECT pc.name AS category, 100.0 * COALESCE(rt.units_returned, 0) / s.units_sold AS return_rate_pct FROM sold s JOIN ops.product_categories pc ON pc.category_id = s.category_id LEFT JOIN returned rt ON rt.category_id = s.category_id ORDER BY return_rate_pct DESC, pc.name", "expected_shape": "table"}
{"id": "q30", "difficulty": "easy", "question": "What was total on-hand inventory across all warehouses on 2025-11-30?", "gold_sql": "SELECT SUM(qty_on_hand) FROM ops.inventory_levels WHERE as_of_date = DATE '2025-11-30'", "expected_shape": "scalar"}
{"id": "q31", "difficulty": "medium", "question": "How many open purchase orders does each supplier have, most first?", "gold_sql": "SELECT s.name AS supplier, COUNT(*) AS open_pos FROM ops.purchase_orders po JOIN ops.suppliers s ON s.supplier_id = po.supplier_id WHERE po.status = 'open' GROUP BY s.name ORDER BY open_pos DESC, s.name", "expected_shape": "table"}
{"id": "q32", "difficulty": "easy", "question": "How many suppliers are flagged as preferred?", "gold_sql": "SELECT COUNT(*) FROM ops.suppliers WHERE preferred = TRUE", "expected_shape": "scalar"}
{"id": "q33", "difficulty": "medium", "question": "What was the average number of days from shipment to delivery for shipments delivered in November 2025?", "gold_sql": "SELECT AVG(date_diff('day', shipped_at, delivered_at)) FROM orders.shipments WHERE delivered_at >= TIMESTAMP '2025-11-01 00:00:00' AND delivered_at < TIMESTAMP '2025-12-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q34", "difficulty": "medium", "question": "How many web sessions that started in November 2025 were ruled to be bots?", "gold_sql": "SELECT COUNT(*) FROM web.sessions s JOIN web.bot_detections b ON b.session_id = s.session_id WHERE b.verdict = 'bot' AND s.started_at >= TIMESTAMP '2025-11-01 00:00:00' AND s.started_at < TIMESTAMP '2025-12-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q35", "difficulty": "medium", "question": "Show the daily web active-user count for each day of November 2025, in date order.", "gold_sql": "SELECT activity_date, COUNT(*) AS daily_active_users FROM web.active_users_daily WHERE activity_date BETWEEN DATE '2025-11-01' AND DATE '2025-11-30' GROUP BY activity_date ORDER BY activity_date", "expected_shape": "table"}
{"id": "q36", "difficulty": "medium", "question": "How many carts created in November 2025 ended up abandoned?", "gold_sql": "SELECT COUNT(*) FROM web.carts WHERE status = 'abandoned' AND created_at >= TIMESTAMP '2025-11-01 00:00:00' AND created_at < TIMESTAMP '2025-12-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q37", "difficulty": "easy", "question": "How many users are assigned to each variant of experiment 7?", "gold_sql": "SELECT variant, COUNT(*) AS n_users FROM web.ab_assignments WHERE experiment_id = 7 GROUP BY variant ORDER BY variant", "expected_shape": "table"}
{"id": "q38", "difficulty": "easy", "question": "How many gift cards still carry a positive balance?", "gold_sql": "SELECT COUNT(*) FROM orders.gift_cards WHERE balance_cents > 0", "expected_shape": "scalar"}
{"id": "q39", "difficulty": "easy", "question": "What was total gross payroll for pay periods ending in 2025?", "gold_sql": "SELECT SUM(total_gross_usd) FROM hr.payroll_runs WHERE pay_period_end >= DATE '2025-01-01' AND pay_period_end < DATE '2026-01-01'", "expected_shape": "scalar"}
{"id": "q40", "difficulty": "hard", "question": "Break Q3 2025 recognized revenue down by customer region, considering orders with a linked customer account.", "gold_sql": "SELECT c.region, SUM(r.amount_usd) AS revenue_usd FROM finance.revenue_recognized r JOIN orders.order_lines ol ON ol.order_line_id = r.order_line_id JOIN orders.orders o ON o.order_id = ol.order_id JOIN sales.customers c ON c.customer_id = o.customer_id WHERE r.posting_status = 'posted' AND r.recognized_date BETWEEN DATE '2025-07-01' AND DATE '2025-09-30' GROUP BY c.region ORDER BY revenue_usd DESC, c.region", "expected_shape": "table"}
{"id": "q41", "difficulty": "medium", "question": "How many loyalty accounts are not linked to an ERP customer?", "gold_sql": "SELECT COUNT(*) FROM marketing.loyalty_accounts WHERE customer_id IS NULL", "expected_shape": "scalar"}
{"id": "q42", "difficulty": "medium", "question": "How many loyalty points were redeemed in total during 2025?", "gold_sql": "SELECT ABS(SUM(points_delta)) FROM marketing.loyalty_transactions WHERE points_delta < 0 AND txn_ts >= TIMESTAMP '2025-01-01 00:00:00' AND txn_ts < TIMESTAMP '2026-01-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q43", "difficulty": "easy", "question": "List all carriers and their transport mode, alphabetically.", "gold_sql": "SELECT name, mode FROM orders.carriers ORDER BY name", "expected_shape": "table"}
{"id": "q44", "difficulty": "hard", "question": "What share of 2025 orders that have both a promised date and at least one shipment shipped on or before their promised date?", "gold_sql": "SELECT 100.0 * SUM(CASE WHEN f.first_ship <= pd.promised_date THEN 1 ELSE 0 END) / COUNT(*) AS on_time_pct FROM (SELECT o.order_id, CAST(MIN(s.shipped_at) AS DATE) AS first_ship FROM orders.orders o JOIN orders.shipments s ON s.order_id = o.order_id WHERE o.order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND o.order_ts < TIMESTAMP '2026-01-01 00:00:00' GROUP BY o.order_id) f JOIN orders.promised_dates pd ON pd.order_id = f.order_id", "expected_shape": "scalar"}
{"id": "q45", "difficulty": "medium", "question": "How many chargebacks were opened in 2025 and what was their total amount in USD?", "gold_sql": "SELECT COUNT(*) AS n_chargebacks, SUM(amount) AS total_usd FROM returns.chargebacks WHERE currency_code = 'USD' AND opened_on >= DATE '2025-01-01' AND opened_on < DATE '2026-01-01'", "expected_shape": "table"}
{"id": "q46", "difficulty": "hard", "question": "Using the fiscal calendar, what was recognized revenue by fiscal quarter of fiscal year 2025?", "gold_sql": "SELECT fc.fiscal_quarter, SUM(r.amount_usd) AS revenue_usd FROM finance.revenue_recognized r JOIN finance.fiscal_calendar fc ON fc.date_key = r.recognized_date WHERE r.posting_status = 'posted' AND fc.fiscal_year = 2025 GROUP BY fc.fiscal_quarter ORDER BY fc.fiscal_quarter", "expected_shape": "table"}
{"id": "q47", "difficulty": "hard", "question": "For promo codes redeemed in October 2025, what USD order value is attributable to each campaign, highest first?", "gold_sql": "SELECT cm.name AS campaign, SUM(o.order_total) / 100.0 AS usd_order_value FROM marketing.campaigns cm JOIN marketing.promotions p ON p.campaign_id = cm.campaign_id JOIN marketing.promo_redemptions pr ON pr.promo_code = p.promo_code JOIN orders.orders o ON o.order_id = pr.order_id WHERE o.currency_code = 'USD' AND pr.redeemed_at >= TIMESTAMP '2025-10-01 00:00:00' AND pr.redeemed_at < TIMESTAMP '2025-11-01 00:00:00' GROUP BY cm.name ORDER BY usd_order_value DESC, cm.name", "expected_shape": "table"}
{"id": "q48", "difficulty": "medium", "question": "What was the average bot score among sessions ruled to be bots in November 2025?", "gold_sql": "SELECT AVG(bot_score) FROM web.bot_detections WHERE verdict = 'bot' AND detected_at >= TIMESTAMP '2025-11-01 00:00:00' AND detected_at < TIMESTAMP '2025-12-01 00:00:00'", "expected_shape": "scalar"}
{"id": "q49", "difficulty": "hard", "question": "Produce an accounts-receivable aging report as of 2025-11-30: total open AR invoice amount in buckets current, 1-30, 31-60, 61-90, and 90+ days past due.", "gold_sql": "SELECT CASE WHEN date_diff('day', due_date, DATE '2025-11-30') <= 0 THEN 'current' WHEN date_diff('day', due_date, DATE '2025-11-30') <= 30 THEN '1-30' WHEN date_diff('day', due_date, DATE '2025-11-30') <= 60 THEN '31-60' WHEN date_diff('day', due_date, DATE '2025-11-30') <= 90 THEN '61-90' ELSE '90+' END AS bucket, SUM(amount_usd) AS open_amount_usd FROM finance.ar_invoices WHERE status = 'open' GROUP BY 1 ORDER BY 1", "expected_shape": "table"}
{"id": "q50", "difficulty": "medium", "question": "How many distinct registered customers placed at least one order in 2025?", "gold_sql": "SELECT COUNT(DISTINCT customer_id) FROM orders.orders WHERE customer_id IS NOT NULL AND order_ts >= TIMESTAMP '2025-01-01 00:00:00' AND order_ts < TIMESTAMP '2026-01-01 00:00:00'", "expected_shape": "scalar"}
