数据分析SQL实战
# 数据分析SQL实战
# 概述
数据分析是现代企业决策的重要依据。本章节通过实际的数据分析场景,展示如何使用SQL进行数据挖掘、统计分析和业务洞察。
# 数据准备
# 示例数据表结构
# 用户行为表 (user_behavior)
CREATE TABLE user_behavior (
id BIGINT PRIMARY KEY AUTO_INCREMENT,
user_id INT NOT NULL,
event_type ENUM('page_view', 'click', 'purchase', 'add_to_cart', 'search') NOT NULL,
page_url VARCHAR(500),
product_id INT,
session_id VARCHAR(100),
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
device_type ENUM('desktop', 'mobile', 'tablet') NOT NULL,
browser VARCHAR(50),
ip_address VARCHAR(45),
INDEX idx_user_timestamp (user_id, timestamp),
INDEX idx_event_timestamp (event_type, timestamp),
INDEX idx_session (session_id)
);
# 销售数据表 (sales_data)
CREATE TABLE sales_data (
sale_id BIGINT PRIMARY KEY AUTO_INCREMENT,
product_id INT NOT NULL,
category_id INT NOT NULL,
user_id INT NOT NULL,
quantity INT NOT NULL,
unit_price DECIMAL(10,2) NOT NULL,
total_amount DECIMAL(12,2) NOT NULL,
discount_amount DECIMAL(10,2) DEFAULT 0,
sale_date DATE NOT NULL,
sale_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
region VARCHAR(50),
channel ENUM('online', 'offline', 'mobile_app') NOT NULL,
INDEX idx_date_category (sale_date, category_id),
INDEX idx_product_date (product_id, sale_date),
INDEX idx_user_date (user_id, sale_date)
);
# 产品信息表 (product_info)
CREATE TABLE product_info (
product_id INT PRIMARY KEY,
product_name VARCHAR(200) NOT NULL,
category_id INT NOT NULL,
brand VARCHAR(100),
cost_price DECIMAL(10,2),
list_price DECIMAL(10,2),
launch_date DATE,
status ENUM('active', 'discontinued') DEFAULT 'active'
);
# 用户行为分析
# 1. 用户活跃度分析
# 日活跃用户(DAU)统计
SELECT
DATE(timestamp) as date,
COUNT(DISTINCT user_id) as daily_active_users,
COUNT(*) as total_events
FROM user_behavior
WHERE timestamp >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY DATE(timestamp)
ORDER BY date DESC;
# 用户留存率分析
-- 计算7日留存率
WITH first_visit AS (
SELECT
user_id,
MIN(DATE(timestamp)) as first_date
FROM user_behavior
GROUP BY user_id
),
retention_data AS (
SELECT
fv.first_date,
fv.user_id,
CASE WHEN ub.user_id IS NOT NULL THEN 1 ELSE 0 END as returned
FROM first_visit fv
LEFT JOIN user_behavior ub ON fv.user_id = ub.user_id
AND DATE(ub.timestamp) = DATE_ADD(fv.first_date, INTERVAL 7 DAY)
WHERE fv.first_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
)
SELECT
first_date,
COUNT(*) as new_users,
SUM(returned) as retained_users,
ROUND(SUM(returned) * 100.0 / COUNT(*), 2) as retention_rate_7d
FROM retention_data
GROUP BY first_date
ORDER BY first_date DESC;
# 用户行为路径分析
-- 分析用户在网站上的行为路径
WITH user_sessions AS (
SELECT
user_id,
session_id,
event_type,
page_url,
timestamp,
ROW_NUMBER() OVER (PARTITION BY session_id ORDER BY timestamp) as step_number
FROM user_behavior
WHERE DATE(timestamp) = CURDATE()
),
path_analysis AS (
SELECT
session_id,
GROUP_CONCAT(event_type ORDER BY step_number SEPARATOR ' -> ') as user_path,
COUNT(*) as path_length,
MAX(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as converted
FROM user_sessions
GROUP BY session_id
)
SELECT
user_path,
COUNT(*) as session_count,
SUM(converted) as conversions,
ROUND(SUM(converted) * 100.0 / COUNT(*), 2) as conversion_rate,
AVG(path_length) as avg_path_length
FROM path_analysis
GROUP BY user_path
HAVING COUNT(*) >= 10
ORDER BY session_count DESC
LIMIT 20;
# 2. 转化漏斗分析
-- 电商转化漏斗分析
WITH funnel_data AS (
SELECT
user_id,
session_id,
MAX(CASE WHEN event_type = 'page_view' THEN 1 ELSE 0 END) as viewed,
MAX(CASE WHEN event_type = 'click' THEN 1 ELSE 0 END) as clicked,
MAX(CASE WHEN event_type = 'add_to_cart' THEN 1 ELSE 0 END) as added_to_cart,
MAX(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as purchased
FROM user_behavior
WHERE DATE(timestamp) >= DATE_SUB(CURDATE(), INTERVAL 7 DAY)
GROUP BY user_id, session_id
)
SELECT
'Page View' as step,
SUM(viewed) as users,
100.0 as conversion_rate
FROM funnel_data
UNION ALL
SELECT
'Click',
SUM(clicked),
ROUND(SUM(clicked) * 100.0 / SUM(viewed), 2)
FROM funnel_data
UNION ALL
SELECT
'Add to Cart',
SUM(added_to_cart),
ROUND(SUM(added_to_cart) * 100.0 / SUM(viewed), 2)
FROM funnel_data
UNION ALL
SELECT
'Purchase',
SUM(purchased),
ROUND(SUM(purchased) * 100.0 / SUM(viewed), 2)
FROM funnel_data;
# 销售数据分析
# 1. 销售趋势分析
# 月度销售趋势
SELECT
DATE_FORMAT(sale_date, '%Y-%m') as month,
COUNT(*) as order_count,
SUM(total_amount) as total_revenue,
SUM(quantity) as total_quantity,
AVG(total_amount) as avg_order_value,
COUNT(DISTINCT user_id) as unique_customers
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 12 MONTH)
GROUP BY DATE_FORMAT(sale_date, '%Y-%m')
ORDER BY month;
# 同比增长率分析
WITH monthly_sales AS (
SELECT
YEAR(sale_date) as year,
MONTH(sale_date) as month,
SUM(total_amount) as monthly_revenue
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 24 MONTH)
GROUP BY YEAR(sale_date), MONTH(sale_date)
),
growth_analysis AS (
SELECT
year,
month,
monthly_revenue,
LAG(monthly_revenue, 12) OVER (ORDER BY year, month) as prev_year_revenue
FROM monthly_sales
)
SELECT
year,
month,
monthly_revenue,
prev_year_revenue,
CASE
WHEN prev_year_revenue IS NOT NULL THEN
ROUND((monthly_revenue - prev_year_revenue) * 100.0 / prev_year_revenue, 2)
ELSE NULL
END as yoy_growth_rate
FROM growth_analysis
WHERE prev_year_revenue IS NOT NULL
ORDER BY year, month;
# 2. 产品分析
# 产品销售排行
SELECT
p.product_name,
p.brand,
SUM(s.quantity) as total_sold,
SUM(s.total_amount) as total_revenue,
AVG(s.unit_price) as avg_selling_price,
COUNT(DISTINCT s.user_id) as unique_buyers,
ROUND(SUM(s.total_amount - s.quantity * p.cost_price), 2) as total_profit,
ROUND((SUM(s.total_amount - s.quantity * p.cost_price) / SUM(s.total_amount)) * 100, 2) as profit_margin
FROM sales_data s
INNER JOIN product_info p ON s.product_id = p.product_id
WHERE s.sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY s.product_id, p.product_name, p.brand
ORDER BY total_revenue DESC
LIMIT 20;
# ABC分析(帕累托分析)
WITH product_revenue AS (
SELECT
s.product_id,
p.product_name,
SUM(s.total_amount) as revenue
FROM sales_data s
INNER JOIN product_info p ON s.product_id = p.product_id
WHERE s.sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
GROUP BY s.product_id, p.product_name
),
revenue_with_cumulative AS (
SELECT
product_id,
product_name,
revenue,
SUM(revenue) OVER () as total_revenue,
SUM(revenue) OVER (ORDER BY revenue DESC) as cumulative_revenue,
ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank_num,
COUNT(*) OVER () as total_products
FROM product_revenue
)
SELECT
product_id,
product_name,
revenue,
ROUND(revenue * 100.0 / total_revenue, 2) as revenue_percentage,
ROUND(cumulative_revenue * 100.0 / total_revenue, 2) as cumulative_percentage,
CASE
WHEN cumulative_revenue * 100.0 / total_revenue <= 80 THEN 'A类产品'
WHEN cumulative_revenue * 100.0 / total_revenue <= 95 THEN 'B类产品'
ELSE 'C类产品'
END as abc_category
FROM revenue_with_cumulative
ORDER BY revenue DESC;
# 3. 客户分析
# RFM客户分析
WITH customer_rfm AS (
SELECT
user_id,
DATEDIFF(CURDATE(), MAX(sale_date)) as recency,
COUNT(*) as frequency,
SUM(total_amount) as monetary
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 365 DAY)
GROUP BY user_id
),
rfm_scores AS (
SELECT
user_id,
recency,
frequency,
monetary,
NTILE(5) OVER (ORDER BY recency DESC) as r_score,
NTILE(5) OVER (ORDER BY frequency ASC) as f_score,
NTILE(5) OVER (ORDER BY monetary ASC) as m_score
FROM customer_rfm
),
customer_segments AS (
SELECT
user_id,
recency,
frequency,
monetary,
r_score,
f_score,
m_score,
CASE
WHEN r_score >= 4 AND f_score >= 4 AND m_score >= 4 THEN '冠军客户'
WHEN r_score >= 3 AND f_score >= 3 AND m_score >= 3 THEN '忠诚客户'
WHEN r_score >= 4 AND f_score <= 2 THEN '新客户'
WHEN r_score <= 2 AND f_score >= 3 THEN '流失客户'
WHEN r_score >= 3 AND f_score <= 2 AND m_score <= 2 THEN '潜在客户'
ELSE '一般客户'
END as customer_segment
FROM rfm_scores
)
SELECT
customer_segment,
COUNT(*) as customer_count,
AVG(recency) as avg_recency,
AVG(frequency) as avg_frequency,
AVG(monetary) as avg_monetary,
SUM(monetary) as total_value
FROM customer_segments
GROUP BY customer_segment
ORDER BY total_value DESC;
# 客户生命周期价值(CLV)分析
WITH customer_metrics AS (
SELECT
user_id,
MIN(sale_date) as first_purchase_date,
MAX(sale_date) as last_purchase_date,
COUNT(*) as total_orders,
SUM(total_amount) as total_spent,
AVG(total_amount) as avg_order_value,
DATEDIFF(MAX(sale_date), MIN(sale_date)) + 1 as customer_lifespan_days
FROM sales_data
GROUP BY user_id
HAVING COUNT(*) > 1
),
clv_calculation AS (
SELECT
user_id,
total_orders,
total_spent,
avg_order_value,
customer_lifespan_days,
CASE
WHEN customer_lifespan_days > 0 THEN
total_orders / (customer_lifespan_days / 365.0)
ELSE 0
END as purchase_frequency_per_year,
CASE
WHEN customer_lifespan_days > 0 THEN
avg_order_value * (total_orders / (customer_lifespan_days / 365.0)) * 2
ELSE avg_order_value
END as estimated_clv
FROM customer_metrics
)
SELECT
CASE
WHEN estimated_clv >= 10000 THEN '高价值客户'
WHEN estimated_clv >= 5000 THEN '中价值客户'
WHEN estimated_clv >= 1000 THEN '低价值客户'
ELSE '潜在客户'
END as clv_segment,
COUNT(*) as customer_count,
AVG(estimated_clv) as avg_clv,
SUM(estimated_clv) as total_clv,
AVG(avg_order_value) as avg_order_value,
AVG(purchase_frequency_per_year) as avg_frequency
FROM clv_calculation
GROUP BY clv_segment
ORDER BY avg_clv DESC;
# 高级分析技术
# 1. 同期群分析(Cohort Analysis)
-- 按月同期群的留存分析
WITH user_cohorts AS (
SELECT
user_id,
DATE_FORMAT(MIN(sale_date), '%Y-%m') as cohort_month
FROM sales_data
GROUP BY user_id
),
user_activities AS (
SELECT
uc.cohort_month,
uc.user_id,
DATE_FORMAT(s.sale_date, '%Y-%m') as activity_month,
PERIOD_DIFF(
CAST(DATE_FORMAT(s.sale_date, '%Y%m') AS UNSIGNED),
CAST(REPLACE(uc.cohort_month, '-', '') AS UNSIGNED)
) as period_number
FROM user_cohorts uc
INNER JOIN sales_data s ON uc.user_id = s.user_id
),
cohort_data AS (
SELECT
cohort_month,
period_number,
COUNT(DISTINCT user_id) as active_users
FROM user_activities
GROUP BY cohort_month, period_number
),
cohort_sizes AS (
SELECT
cohort_month,
COUNT(DISTINCT user_id) as cohort_size
FROM user_cohorts
GROUP BY cohort_month
)
SELECT
cd.cohort_month,
cs.cohort_size,
cd.period_number,
cd.active_users,
ROUND(cd.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
FROM cohort_data cd
INNER JOIN cohort_sizes cs ON cd.cohort_month = cs.cohort_month
WHERE cd.cohort_month >= DATE_FORMAT(DATE_SUB(CURDATE(), INTERVAL 12 MONTH), '%Y-%m')
ORDER BY cd.cohort_month, cd.period_number;
# 2. 时间序列分析
-- 移动平均和趋势分析
WITH daily_sales AS (
SELECT
sale_date,
SUM(total_amount) as daily_revenue,
COUNT(*) as daily_orders
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
GROUP BY sale_date
),
moving_averages AS (
SELECT
sale_date,
daily_revenue,
daily_orders,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as ma_7_revenue,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as ma_30_revenue,
LAG(daily_revenue, 7) OVER (ORDER BY sale_date) as revenue_7_days_ago
FROM daily_sales
)
SELECT
sale_date,
daily_revenue,
ma_7_revenue,
ma_30_revenue,
revenue_7_days_ago,
CASE
WHEN revenue_7_days_ago IS NOT NULL THEN
ROUND((daily_revenue - revenue_7_days_ago) * 100.0 / revenue_7_days_ago, 2)
ELSE NULL
END as wow_growth_rate,
CASE
WHEN daily_revenue > ma_7_revenue * 1.2 THEN '异常高'
WHEN daily_revenue < ma_7_revenue * 0.8 THEN '异常低'
ELSE '正常'
END as anomaly_flag
FROM moving_averages
ORDER BY sale_date DESC;
# 3. 关联规则分析
-- 购物篮分析 - 寻找经常一起购买的商品
WITH order_products AS (
SELECT
s1.user_id,
s1.sale_date,
s1.product_id as product_a,
s2.product_id as product_b
FROM sales_data s1
INNER JOIN sales_data s2 ON s1.user_id = s2.user_id
AND s1.sale_date = s2.sale_date
AND s1.product_id < s2.product_id
WHERE s1.sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
),
product_pairs AS (
SELECT
product_a,
product_b,
COUNT(*) as pair_count
FROM order_products
GROUP BY product_a, product_b
HAVING COUNT(*) >= 5
),
product_totals AS (
SELECT
product_id,
COUNT(DISTINCT CONCAT(user_id, '-', sale_date)) as total_orders
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY)
GROUP BY product_id
)
SELECT
pa.product_name as product_a_name,
pb.product_name as product_b_name,
pp.pair_count,
pta.total_orders as product_a_orders,
ptb.total_orders as product_b_orders,
ROUND(pp.pair_count * 100.0 / pta.total_orders, 2) as confidence_a_to_b,
ROUND(pp.pair_count * 100.0 / ptb.total_orders, 2) as confidence_b_to_a,
ROUND(
pp.pair_count * 1.0 /
(pta.total_orders * ptb.total_orders /
(SELECT COUNT(DISTINCT CONCAT(user_id, '-', sale_date))
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 90 DAY))
), 2
) as lift
FROM product_pairs pp
INNER JOIN product_info pa ON pp.product_a = pa.product_id
INNER JOIN product_info pb ON pp.product_b = pb.product_id
INNER JOIN product_totals pta ON pp.product_a = pta.product_id
INNER JOIN product_totals ptb ON pp.product_b = ptb.product_id
ORDER BY lift DESC, pair_count DESC
LIMIT 20;
# 数据可视化查询
# 1. 热力图数据
-- 按小时和星期的销售热力图数据
SELECT
DAYNAME(sale_date) as day_of_week,
HOUR(sale_time) as hour_of_day,
COUNT(*) as order_count,
SUM(total_amount) as total_revenue
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY DAYNAME(sale_date), HOUR(sale_time)
ORDER BY
FIELD(day_of_week, 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
hour_of_day;
# 2. 地理分析数据
-- 按地区的销售分布
SELECT
region,
COUNT(*) as order_count,
SUM(total_amount) as total_revenue,
AVG(total_amount) as avg_order_value,
COUNT(DISTINCT user_id) as unique_customers
FROM sales_data
WHERE sale_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
GROUP BY region
ORDER BY total_revenue DESC;
# 性能优化建议
- 分区表:对大数据量的时间序列数据使用分区
- 索引优化:为分析查询创建合适的复合索引
- 数据预聚合:创建汇总表减少实时计算
- 查询缓存:缓存常用的分析结果
- 并行处理:利用数据库的并行查询能力
# 最佳实践
- 数据质量:确保数据的准确性和完整性
- 业务理解:深入理解业务逻辑和指标定义
- 可重用性:编写可重用的分析查询模板
- 文档化:详细记录分析逻辑和假设
- 监控:定期监控数据分析的性能和准确性
# 总结
通过这些数据分析SQL实战案例,我们学习了:
- 用户行为分析的各种方法
- 销售数据的深度挖掘技术
- 客户分析和细分策略
- 高级分析技术的应用
- 数据可视化的查询准备
这些技能可以帮助数据分析师和业务人员更好地理解数据,发现业务洞察,支持数据驱动的决策。