数据分析SQL实战

# 数据分析SQL实战

# 概述

数据分析是现代企业决策的重要依据。本章节通过实际的数据分析场景,展示如何使用SQL进行数据挖掘、统计分析和业务洞察。

# 数据准备

# 示例数据表结构

# 用户行为表 (user_behavior)

CREATE TABLE user_behavior (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    user_id INT NOT NULL,
    event_type ENUM('page_view', 'click', 'purchase', 'add_to_cart', 'search') NOT NULL,
    page_url VARCHAR(500),
    product_id INT,
    session_id VARCHAR(100),
    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    device_type ENUM('desktop', 'mobile', 'tablet') NOT NULL,
    browser VARCHAR(50),
    ip_address VARCHAR(45),
    INDEX idx_user_timestamp (user_id, timestamp),
    INDEX idx_event_timestamp (event_type, timestamp),
    INDEX idx_session (session_id)
);

# 销售数据表 (sales_data)

CREATE TABLE sales_data (
    sale_id BIGINT PRIMARY KEY AUTO_INCREMENT,
    product_id INT NOT NULL,
    category_id INT NOT NULL,
    user_id INT NOT NULL,
    quantity INT NOT NULL,
    unit_price DECIMAL(10,2) NOT NULL,
    total_amount DECIMAL(12,2) NOT NULL,
    discount_amount DECIMAL(10,2) DEFAULT 0,
    sale_date DATE NOT NULL,
    sale_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    region VARCHAR(50),
    channel ENUM('online', 'offline', 'mobile_app') NOT NULL,
    INDEX idx_date_category (sale_date, category_id),
    INDEX idx_product_date (product_id, sale_date),
    INDEX idx_user_date (user_id, sale_date)
);

# 产品信息表 (product_info)

CREATE TABLE product_info (
    product_id INT PRIMARY KEY,
    product_name VARCHAR(200) NOT NULL,
    category_id INT NOT NULL,
    brand VARCHAR(100),
    cost_price DECIMAL(10,2),
    list_price DECIMAL(10,2),
    launch_date DATE,
    status ENUM('active', 'discontinued') DEFAULT 'active'
);

# 用户行为分析

# 1. 用户活跃度分析

# 日活跃用户(DAU)统计

SELECT 
    DATE(timestamp) as date,
    COUNT(DISTINCT user_id) as daily_active_users,
    COUNT(*) as total_events
FROM user_behavior
WHERE timestamp >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY DATE(timestamp)
ORDER BY date DESC;

# 用户留存率分析

-- 计算7日留存率
WITH first_visit AS (
    SELECT 
        user_id,
        MIN(DATE(timestamp)) as first_date
    FROM user_behavior
    GROUP BY user_id
),
retention_data AS (
    SELECT 
        fv.first_date,
        fv.user_id,
        CASE WHEN ub.user_id IS NOT NULL THEN 1 ELSE 0 END as returned
    FROM first_visit fv
    LEFT JOIN user_behavior ub ON fv.user_id = ub.user_id 
        AND DATE(ub.timestamp) = DATE_ADD(fv.first_date, INTERVAL 7 DAY)
    WHERE fv.first_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
)
SELECT 
    first_date,
    COUNT(*) as new_users,
    SUM(returned) as retained_users,
    ROUND(SUM(returned) * 100.0 / COUNT(*), 2) as retention_rate_7d
FROM retention_data
GROUP BY first_date
ORDER BY first_date DESC;

# 用户行为路径分析

-- 分析用户在网站上的行为路径
WITH user_sessions AS (
    SELECT 
        user_id,
        session_id,
        event_type,
        page_url,
        timestamp,
        ROW_NUMBER() OVER (PARTITION BY session_id ORDER BY timestamp) as step_number
    FROM user_behavior
    WHERE DATE(timestamp) = CURDATE()
),
path_analysis AS (
    SELECT 
        session_id,
        GROUP_CONCAT(event_type ORDER BY step_number SEPARATOR ' -> ') as user_path,
        COUNT(*) as path_length,
        MAX(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as converted
    FROM user_sessions
    GROUP BY session_id
)
SELECT 
    user_path,
    COUNT(*) as session_count,
    SUM(converted) as conversions,
    ROUND(SUM(converted) * 100.0 / COUNT(*), 2) as conversion_rate,
    AVG(path_length) as avg_path_length
FROM path_analysis
GROUP BY user_path
HAVING COUNT(*) >= 10
ORDER BY session_count DESC
LIMIT 20;

# 2. 转化漏斗分析

-- 电商转化漏斗分析
WITH funnel_data AS (
    SELECT 
        user_id,
        session_id,
        MAX(CASE WHEN event_type = 'page_view' THEN 1 ELSE 0 END) as viewed,
        MAX(CASE WHEN event_type = 'click' THEN 1 ELSE 0 END) as clicked,
        MAX(CASE WHEN event_type = 'add_to_cart' THEN 1 ELSE 0 END) as added_to_cart,
        MAX(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as purchased
    FROM user_behavior
    WHERE DATE(timestamp) >= DATE_SUB(CURDATE(), INTERVAL 7 DAY)
    GROUP BY user_id, session_id
)
SELECT 
    'Page View' as step,
    SUM(viewed) as users,
    100.0 as conversion_rate
FROM funnel_data
UNION ALL
SELECT 
    'Click',
    SUM(clicked),
    ROUND(SUM(clicked) * 100.0 / SUM(viewed), 2)
FROM funnel_data
UNION ALL
SELECT 
    'Add to Cart',
    SUM(added_to_cart),
    ROUND(SUM(added_to_cart) * 100.0 / SUM(viewed), 2)
FROM funnel_data
UNION ALL
SELECT 
    'Purchase',
    SUM(purchased),
    ROUND(SUM(purchased) * 100.0 / SUM(viewed), 2)
FROM funnel_data;

# 销售数据分析

# 1. 销售趋势分析

# 月度销售趋势

SELECT 
    DATE_FORMAT(sale_date, '%Y-%m') as month,
    COUNT(*) as order_count,
    SUM(total_amount) as total_revenue,
    SUM(quantity) as total_quantity,
    AVG(total_amount) as avg_order_value,
    COUNT(DISTINCT user_id) as unique_customers
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 12 MONTH)
GROUP BY DATE_FORMAT(sale_date, '%Y-%m')
ORDER BY month;

# 同比增长率分析

WITH monthly_sales AS (
    SELECT 
        YEAR(sale_date) as year,
        MONTH(sale_date) as month,
        SUM(total_amount) as monthly_revenue
    FROM sales_data
    WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 24 MONTH)
    GROUP BY YEAR(sale_date), MONTH(sale_date)
),
growth_analysis AS (
    SELECT 
        year,
        month,
        monthly_revenue,
        LAG(monthly_revenue, 12) OVER (ORDER BY year, month) as prev_year_revenue
    FROM monthly_sales
)
SELECT 
    year,
    month,
    monthly_revenue,
    prev_year_revenue,
    CASE 
        WHEN prev_year_revenue IS NOT NULL THEN
            ROUND((monthly_revenue - prev_year_revenue) * 100.0 / prev_year_revenue, 2)
        ELSE NULL
    END as yoy_growth_rate
FROM growth_analysis
WHERE prev_year_revenue IS NOT NULL
ORDER BY year, month;

# 2. 产品分析

# 产品销售排行

SELECT 
    p.product_name,
    p.brand,
    SUM(s.quantity) as total_sold,
    SUM(s.total_amount) as total_revenue,
    AVG(s.unit_price) as avg_selling_price,
    COUNT(DISTINCT s.user_id) as unique_buyers,
    ROUND(SUM(s.total_amount - s.quantity * p.cost_price), 2) as total_profit,
    ROUND((SUM(s.total_amount - s.quantity * p.cost_price) / SUM(s.total_amount)) * 100, 2) as profit_margin
FROM sales_data s
INNER JOIN product_info p ON s.product_id = p.product_id
WHERE s.sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY s.product_id, p.product_name, p.brand
ORDER BY total_revenue DESC
LIMIT 20;

# ABC分析(帕累托分析)

WITH product_revenue AS (
    SELECT 
        s.product_id,
        p.product_name,
        SUM(s.total_amount) as revenue
    FROM sales_data s
    INNER JOIN product_info p ON s.product_id = p.product_id
    WHERE s.sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
    GROUP BY s.product_id, p.product_name
),
revenue_with_cumulative AS (
    SELECT 
        product_id,
        product_name,
        revenue,
        SUM(revenue) OVER () as total_revenue,
        SUM(revenue) OVER (ORDER BY revenue DESC) as cumulative_revenue,
        ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank_num,
        COUNT(*) OVER () as total_products
    FROM product_revenue
)
SELECT 
    product_id,
    product_name,
    revenue,
    ROUND(revenue * 100.0 / total_revenue, 2) as revenue_percentage,
    ROUND(cumulative_revenue * 100.0 / total_revenue, 2) as cumulative_percentage,
    CASE 
        WHEN cumulative_revenue * 100.0 / total_revenue <= 80 THEN 'A类产品'
        WHEN cumulative_revenue * 100.0 / total_revenue <= 95 THEN 'B类产品'
        ELSE 'C类产品'
    END as abc_category
FROM revenue_with_cumulative
ORDER BY revenue DESC;

# 3. 客户分析

# RFM客户分析

WITH customer_rfm AS (
    SELECT 
        user_id,
        DATEDIFF(CURDATE(), MAX(sale_date)) as recency,
        COUNT(*) as frequency,
        SUM(total_amount) as monetary
    FROM sales_data
    WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 365 DAY)
    GROUP BY user_id
),
rfm_scores AS (
    SELECT 
        user_id,
        recency,
        frequency,
        monetary,
        NTILE(5) OVER (ORDER BY recency DESC) as r_score,
        NTILE(5) OVER (ORDER BY frequency ASC) as f_score,
        NTILE(5) OVER (ORDER BY monetary ASC) as m_score
    FROM customer_rfm
),
customer_segments AS (
    SELECT 
        user_id,
        recency,
        frequency,
        monetary,
        r_score,
        f_score,
        m_score,
        CASE 
            WHEN r_score >= 4 AND f_score >= 4 AND m_score >= 4 THEN '冠军客户'
            WHEN r_score >= 3 AND f_score >= 3 AND m_score >= 3 THEN '忠诚客户'
            WHEN r_score >= 4 AND f_score <= 2 THEN '新客户'
            WHEN r_score <= 2 AND f_score >= 3 THEN '流失客户'
            WHEN r_score >= 3 AND f_score <= 2 AND m_score <= 2 THEN '潜在客户'
            ELSE '一般客户'
        END as customer_segment
    FROM rfm_scores
)
SELECT 
    customer_segment,
    COUNT(*) as customer_count,
    AVG(recency) as avg_recency,
    AVG(frequency) as avg_frequency,
    AVG(monetary) as avg_monetary,
    SUM(monetary) as total_value
FROM customer_segments
GROUP BY customer_segment
ORDER BY total_value DESC;

# 客户生命周期价值(CLV)分析

WITH customer_metrics AS (
    SELECT 
        user_id,
        MIN(sale_date) as first_purchase_date,
        MAX(sale_date) as last_purchase_date,
        COUNT(*) as total_orders,
        SUM(total_amount) as total_spent,
        AVG(total_amount) as avg_order_value,
        DATEDIFF(MAX(sale_date), MIN(sale_date)) + 1 as customer_lifespan_days
    FROM sales_data
    GROUP BY user_id
    HAVING COUNT(*) > 1
),
clv_calculation AS (
    SELECT 
        user_id,
        total_orders,
        total_spent,
        avg_order_value,
        customer_lifespan_days,
        CASE 
            WHEN customer_lifespan_days > 0 THEN 
                total_orders / (customer_lifespan_days / 365.0)
            ELSE 0
        END as purchase_frequency_per_year,
        CASE 
            WHEN customer_lifespan_days > 0 THEN 
                avg_order_value * (total_orders / (customer_lifespan_days / 365.0)) * 2
            ELSE avg_order_value
        END as estimated_clv
    FROM customer_metrics
)
SELECT 
    CASE 
        WHEN estimated_clv >= 10000 THEN '高价值客户'
        WHEN estimated_clv >= 5000 THEN '中价值客户'
        WHEN estimated_clv >= 1000 THEN '低价值客户'
        ELSE '潜在客户'
    END as clv_segment,
    COUNT(*) as customer_count,
    AVG(estimated_clv) as avg_clv,
    SUM(estimated_clv) as total_clv,
    AVG(avg_order_value) as avg_order_value,
    AVG(purchase_frequency_per_year) as avg_frequency
FROM clv_calculation
GROUP BY clv_segment
ORDER BY avg_clv DESC;

# 高级分析技术

# 1. 同期群分析(Cohort Analysis)

-- 按月同期群的留存分析
WITH user_cohorts AS (
    SELECT 
        user_id,
        DATE_FORMAT(MIN(sale_date), '%Y-%m') as cohort_month
    FROM sales_data
    GROUP BY user_id
),
user_activities AS (
    SELECT 
        uc.cohort_month,
        uc.user_id,
        DATE_FORMAT(s.sale_date, '%Y-%m') as activity_month,
        PERIOD_DIFF(
            CAST(DATE_FORMAT(s.sale_date, '%Y%m') AS UNSIGNED),
            CAST(REPLACE(uc.cohort_month, '-', '') AS UNSIGNED)
        ) as period_number
    FROM user_cohorts uc
    INNER JOIN sales_data s ON uc.user_id = s.user_id
),
cohort_data AS (
    SELECT 
        cohort_month,
        period_number,
        COUNT(DISTINCT user_id) as active_users
    FROM user_activities
    GROUP BY cohort_month, period_number
),
cohort_sizes AS (
    SELECT 
        cohort_month,
        COUNT(DISTINCT user_id) as cohort_size
    FROM user_cohorts
    GROUP BY cohort_month
)
SELECT 
    cd.cohort_month,
    cs.cohort_size,
    cd.period_number,
    cd.active_users,
    ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
FROM cohort_data cd
INNER JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month
WHERE cd.cohort_month >= DATE_FORMAT(DATE_SUB(CURDATE(), INTERVAL 12 MONTH), '%Y-%m')
ORDER BY cd.cohort_month, cd.period_number;

# 2. 时间序列分析

-- 移动平均和趋势分析
WITH daily_sales AS (
    SELECT 
        sale_date,
        SUM(total_amount) as daily_revenue,
        COUNT(*) as daily_orders
    FROM sales_data
    WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
    GROUP BY sale_date
),
moving_averages AS (
    SELECT 
        sale_date,
        daily_revenue,
        daily_orders,
        AVG(daily_revenue) OVER (
            ORDER BY sale_date 
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) as ma_7_revenue,
        AVG(daily_revenue) OVER (
            ORDER BY sale_date 
            ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
        ) as ma_30_revenue,
        LAG(daily_revenue, 7) OVER (ORDER BY sale_date) as revenue_7_days_ago
    FROM daily_sales
)
SELECT 
    sale_date,
    daily_revenue,
    ma_7_revenue,
    ma_30_revenue,
    revenue_7_days_ago,
    CASE 
        WHEN revenue_7_days_ago IS NOT NULL THEN
            ROUND((daily_revenue - revenue_7_days_ago) * 100.0 / revenue_7_days_ago, 2)
        ELSE NULL
    END as wow_growth_rate,
    CASE 
        WHEN daily_revenue > ma_7_revenue * 1.2 THEN '异常高'
        WHEN daily_revenue < ma_7_revenue * 0.8 THEN '异常低'
        ELSE '正常'
    END as anomaly_flag
FROM moving_averages
ORDER BY sale_date DESC;

# 3. 关联规则分析

-- 购物篮分析 - 寻找经常一起购买的商品
WITH order_products AS (
    SELECT 
        s1.user_id,
        s1.sale_date,
        s1.product_id as product_a,
        s2.product_id as product_b
    FROM sales_data s1
    INNER JOIN sales_data s2 ON s1.user_id = s2.user_id 
        AND s1.sale_date = s2.sale_date
        AND s1.product_id < s2.product_id
    WHERE s1.sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
),
product_pairs AS (
    SELECT 
        product_a,
        product_b,
        COUNT(*) as pair_count
    FROM order_products
    GROUP BY product_a, product_b
    HAVING COUNT(*) >= 5
),
product_totals AS (
    SELECT 
        product_id,
        COUNT(DISTINCT CONCAT(user_id, '-', sale_date)) as total_orders
    FROM sales_data
    WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
    GROUP BY product_id
)
SELECT 
    pa.product_name as product_a_name,
    pb.product_name as product_b_name,
    pp.pair_count,
    pta.total_orders as product_a_orders,
    ptb.total_orders as product_b_orders,
    ROUND(pp.pair_count * 100.0 / pta.total_orders, 2) as confidence_a_to_b,
    ROUND(pp.pair_count * 100.0 / ptb.total_orders, 2) as confidence_b_to_a,
    ROUND(
        pp.pair_count * 1.0 / 
        (pta.total_orders * ptb.total_orders / 
         (SELECT COUNT(DISTINCT CONCAT(user_id, '-', sale_date)) 
          FROM sales_data 
          WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY))
        ), 2
    ) as lift
FROM product_pairs pp
INNER JOIN product_info pa ON pp.product_a = pa.product_id
INNER JOIN product_info pb ON pp.product_b = pb.product_id
INNER JOIN product_totals pta ON pp.product_a = pta.product_id
INNER JOIN product_totals ptb ON pp.product_b = ptb.product_id
ORDER BY lift DESC, pair_count DESC
LIMIT 20;

# 数据可视化查询

# 1. 热力图数据

-- 按小时和星期的销售热力图数据
SELECT 
    DAYNAME(sale_date) as day_of_week,
    HOUR(sale_time) as hour_of_day,
    COUNT(*) as order_count,
    SUM(total_amount) as total_revenue
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY DAYNAME(sale_date), HOUR(sale_time)
ORDER BY 
    FIELD(day_of_week, 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
    hour_of_day;

# 2. 地理分析数据

-- 按地区的销售分布
SELECT 
    region,
    COUNT(*) as order_count,
    SUM(total_amount) as total_revenue,
    AVG(total_amount) as avg_order_value,
    COUNT(DISTINCT user_id) as unique_customers
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY region
ORDER BY total_revenue DESC;

# 性能优化建议

  1. 分区表:对大数据量的时间序列数据使用分区
  2. 索引优化:为分析查询创建合适的复合索引
  3. 数据预聚合:创建汇总表减少实时计算
  4. 查询缓存:缓存常用的分析结果
  5. 并行处理:利用数据库的并行查询能力

# 最佳实践

  1. 数据质量:确保数据的准确性和完整性
  2. 业务理解:深入理解业务逻辑和指标定义
  3. 可重用性:编写可重用的分析查询模板
  4. 文档化:详细记录分析逻辑和假设
  5. 监控:定期监控数据分析的性能和准确性

# 总结

通过这些数据分析SQL实战案例,我们学习了:

  • 用户行为分析的各种方法
  • 销售数据的深度挖掘技术
  • 客户分析和细分策略
  • 高级分析技术的应用
  • 数据可视化的查询准备

这些技能可以帮助数据分析师和业务人员更好地理解数据,发现业务洞察,支持数据驱动的决策。