SQL窗口函数
# SQL窗口函数
作者:哪吒
时间:2024年12月
# 概述
SQL窗口函数(Window Functions)是SQL中的高级功能,允许在不使用GROUP BY的情况下对数据集进行分组计算。窗口函数可以访问与当前行相关的一组行,并对这组行执行计算,同时保留原始行的详细信息。
# 基本语法
function_name([arguments]) OVER (
[PARTITION BY column1, column2, ...]
[ORDER BY column1 [ASC|DESC], column2 [ASC|DESC], ...]
[ROWS|RANGE BETWEEN start AND end]
)
# 语法组成部分
- function_name: 窗口函数名称
- OVER: 窗口函数的关键字
- PARTITION BY: 将结果集分组(可选)
- ORDER BY: 在窗口内排序(可选)
- ROWS/RANGE: 定义窗口框架(可选)
# 排名函数
# ROW_NUMBER() - 行号
为结果集中的每一行分配唯一的序号。
-- 基本用法
SELECT
employee_name,
salary,
ROW_NUMBER() OVER (ORDER BY salary DESC) AS row_num
FROM employees;
-- 分组排序
SELECT
department,
employee_name,
salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank
FROM employees;
# RANK() - 排名(有并列)
为行分配排名,相同值获得相同排名,后续排名会跳跃。
-- 示例数据
CREATE TABLE students (
id INT,
name VARCHAR(50),
class VARCHAR(10),
score INT
);
INSERT INTO students VALUES
(1, '张三', 'A班', 95),
(2, '李四', 'A班', 92),
(3, '王五', 'A班', 92),
(4, '赵六', 'A班', 88),
(5, '钱七', 'B班', 96),
(6, '孙八', 'B班', 94),
(7, '周九', 'B班', 90);
-- 全校排名
SELECT
name,
class,
score,
RANK() OVER (ORDER BY score DESC) AS school_rank,
ROW_NUMBER() OVER (ORDER BY score DESC) AS row_num
FROM students;
-- 班级内排名
SELECT
name,
class,
score,
RANK() OVER (PARTITION BY class ORDER BY score DESC) AS class_rank
FROM students;
# DENSE_RANK() - 密集排名
类似RANK(),但后续排名不跳跃。
SELECT
name,
score,
RANK() OVER (ORDER BY score DESC) AS rank_with_gaps,
DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank,
ROW_NUMBER() OVER (ORDER BY score DESC) AS row_number
FROM students
ORDER BY score DESC;
# NTILE() - 分组排名
将结果集分成N个大致相等的组。
-- 将学生分成4个等级组
SELECT
name,
score,
NTILE(4) OVER (ORDER BY score DESC) AS quartile,
CASE
WHEN NTILE(4) OVER (ORDER BY score DESC) = 1 THEN '优秀'
WHEN NTILE(4) OVER (ORDER BY score DESC) = 2 THEN '良好'
WHEN NTILE(4) OVER (ORDER BY score DESC) = 3 THEN '中等'
ELSE '需提高'
END AS performance_level
FROM students;
-- 按部门分组,每个部门内分成3个薪资等级
SELECT
department,
employee_name,
salary,
NTILE(3) OVER (PARTITION BY department ORDER BY salary DESC) AS salary_tier
FROM employees;
# 聚合窗口函数
# SUM() OVER - 累计求和
-- 累计销售额
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (ORDER BY order_date) AS cumulative_sales,
SUM(daily_sales) OVER (ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM daily_sales
ORDER BY order_date;
-- 移动平均(最近3天)
SELECT
order_date,
daily_sales,
AVG(daily_sales) OVER (
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS moving_avg_3days
FROM daily_sales;
# COUNT() OVER - 累计计数
-- 累计订单数
SELECT
order_date,
order_id,
COUNT(*) OVER (ORDER BY order_date) AS cumulative_orders,
COUNT(*) OVER (PARTITION BY YEAR(order_date) ORDER BY order_date) AS yearly_cumulative
FROM orders;
# MAX() 和 MIN() OVER
-- 历史最高和最低价格
SELECT
trade_date,
stock_price,
MAX(stock_price) OVER (ORDER BY trade_date ROWS UNBOUNDED PRECEDING) AS historical_high,
MIN(stock_price) OVER (ORDER BY trade_date ROWS UNBOUNDED PRECEDING) AS historical_low
FROM stock_prices;
# 偏移函数
# LAG() - 获取前面的值
-- 基本语法
LAG(column, offset, default_value) OVER ([PARTITION BY ...] ORDER BY ...)
-- 计算销售额环比增长
SELECT
month,
sales_amount,
LAG(sales_amount, 1) OVER (ORDER BY month) AS prev_month_sales,
sales_amount - LAG(sales_amount, 1) OVER (ORDER BY month) AS month_over_month_change,
ROUND(
(sales_amount - LAG(sales_amount, 1) OVER (ORDER BY month)) * 100.0 /
LAG(sales_amount, 1) OVER (ORDER BY month), 2
) AS growth_rate_percent
FROM monthly_sales;
-- 股价变化分析
SELECT
trade_date,
closing_price,
LAG(closing_price, 1) OVER (ORDER BY trade_date) AS prev_day_close,
LAG(closing_price, 7) OVER (ORDER BY trade_date) AS week_ago_close,
closing_price - LAG(closing_price, 1) OVER (ORDER BY trade_date) AS daily_change
FROM stock_data;
# LEAD() - 获取后面的值
-- 预测分析
SELECT
month,
actual_sales,
LEAD(actual_sales, 1) OVER (ORDER BY month) AS next_month_actual,
LEAD(actual_sales, 3) OVER (ORDER BY month) AS quarter_ahead_actual
FROM sales_data;
-- 员工离职分析
SELECT
employee_id,
hire_date,
LEAD(hire_date, 1) OVER (PARTITION BY department ORDER BY hire_date) AS next_hire_date,
DATEDIFF(
LEAD(hire_date, 1) OVER (PARTITION BY department ORDER BY hire_date),
hire_date
) AS days_to_next_hire
FROM employees;
# FIRST_VALUE() 和 LAST_VALUE()
-- 获取窗口内的第一个和最后一个值
SELECT
employee_name,
department,
salary,
FIRST_VALUE(salary) OVER (
PARTITION BY department
ORDER BY salary DESC
ROWS UNBOUNDED PRECEDING
) AS highest_salary_in_dept,
LAST_VALUE(salary) OVER (
PARTITION BY department
ORDER BY salary DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS lowest_salary_in_dept
FROM employees;
-- 时间序列分析
SELECT
trade_date,
stock_price,
FIRST_VALUE(stock_price) OVER (
ORDER BY trade_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS week_start_price,
LAST_VALUE(stock_price) OVER (
ORDER BY trade_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS current_price
FROM stock_prices;
# 窗口框架
# ROWS vs RANGE
-- ROWS:基于行数
SELECT
order_date,
amount,
SUM(amount) OVER (
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) AS sum_5_rows
FROM orders;
-- RANGE:基于值范围
SELECT
order_date,
amount,
SUM(amount) OVER (
ORDER BY order_date
RANGE BETWEEN INTERVAL 7 DAY PRECEDING AND CURRENT ROW
) AS sum_last_week
FROM orders;
# 框架边界选项
-- 常用框架定义
SELECT
date_col,
value_col,
-- 从开始到当前行
SUM(value_col) OVER (
ORDER BY date_col
ROWS UNBOUNDED PRECEDING
) AS cumulative_sum,
-- 当前行到结束
SUM(value_col) OVER (
ORDER BY date_col
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
) AS remaining_sum,
-- 前后各3行
AVG(value_col) OVER (
ORDER BY date_col
ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING
) AS moving_avg_7,
-- 整个分区
SUM(value_col) OVER (
PARTITION BY category
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS total_by_category
FROM sample_data;
# 实战示例
# 示例1:销售业绩分析
-- 创建销售数据表
CREATE TABLE sales_performance (
salesperson VARCHAR(50),
region VARCHAR(30),
month DATE,
sales_amount DECIMAL(10,2)
);
INSERT INTO sales_performance VALUES
('张三', '华北', '2024-01-01', 50000),
('张三', '华北', '2024-02-01', 55000),
('张三', '华北', '2024-03-01', 48000),
('李四', '华北', '2024-01-01', 45000),
('李四', '华北', '2024-02-01', 52000),
('李四', '华北', '2024-03-01', 58000),
('王五', '华南', '2024-01-01', 60000),
('王五', '华南', '2024-02-01', 62000),
('王五', '华南', '2024-03-01', 65000);
-- 综合业绩分析
SELECT
salesperson,
region,
month,
sales_amount,
-- 个人累计销售额
SUM(sales_amount) OVER (
PARTITION BY salesperson
ORDER BY month
) AS personal_cumulative,
-- 区域内排名
RANK() OVER (
PARTITION BY region, month
ORDER BY sales_amount DESC
) AS regional_rank,
-- 全公司排名
RANK() OVER (
PARTITION BY month
ORDER BY sales_amount DESC
) AS company_rank,
-- 环比增长
LAG(sales_amount) OVER (
PARTITION BY salesperson
ORDER BY month
) AS prev_month_sales,
ROUND(
(sales_amount - LAG(sales_amount) OVER (
PARTITION BY salesperson ORDER BY month
)) * 100.0 / LAG(sales_amount) OVER (
PARTITION BY salesperson ORDER BY month
), 2
) AS growth_rate,
-- 移动平均(最近3个月)
ROUND(
AVG(sales_amount) OVER (
PARTITION BY salesperson
ORDER BY month
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
), 2
) AS moving_avg_3months,
-- 区域市场份额
ROUND(
sales_amount * 100.0 / SUM(sales_amount) OVER (
PARTITION BY region, month
), 2
) AS market_share_percent,
-- 业绩等级
NTILE(4) OVER (
PARTITION BY month
ORDER BY sales_amount DESC
) AS performance_quartile
FROM sales_performance
ORDER BY month, region, sales_amount DESC;
# 示例2:用户行为分析
-- 创建用户访问日志表
CREATE TABLE user_sessions (
user_id INT,
session_date DATE,
page_views INT,
session_duration INT, -- 分钟
purchases INT
);
INSERT INTO user_sessions VALUES
(1001, '2024-12-01', 5, 25, 0),
(1001, '2024-12-02', 8, 35, 1),
(1001, '2024-12-03', 3, 15, 0),
(1001, '2024-12-04', 12, 45, 2),
(1002, '2024-12-01', 2, 10, 0),
(1002, '2024-12-02', 6, 30, 1),
(1002, '2024-12-03', 4, 20, 0),
(1003, '2024-12-01', 15, 60, 3),
(1003, '2024-12-02', 10, 40, 1),
(1003, '2024-12-03', 8, 35, 1);
-- 用户行为分析
SELECT
user_id,
session_date,
page_views,
session_duration,
purchases,
-- 用户活跃度趋势
AVG(page_views) OVER (
PARTITION BY user_id
ORDER BY session_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS avg_pageviews_3days,
-- 会话时长变化
session_duration - LAG(session_duration) OVER (
PARTITION BY user_id
ORDER BY session_date
) AS duration_change,
-- 累计购买次数
SUM(purchases) OVER (
PARTITION BY user_id
ORDER BY session_date
) AS total_purchases,
-- 用户价值排名(基于总购买次数)
DENSE_RANK() OVER (
ORDER BY SUM(purchases) OVER (
PARTITION BY user_id
) DESC
) AS user_value_rank,
-- 活跃度等级
CASE
WHEN NTILE(4) OVER (
ORDER BY AVG(page_views) OVER (PARTITION BY user_id)
) = 1 THEN '高活跃'
WHEN NTILE(4) OVER (
ORDER BY AVG(page_views) OVER (PARTITION BY user_id)
) = 2 THEN '中活跃'
WHEN NTILE(4) OVER (
ORDER BY AVG(page_views) OVER (PARTITION BY user_id)
) = 3 THEN '低活跃'
ELSE '不活跃'
END AS activity_level,
-- 连续访问天数
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY session_date
) AS consecutive_days
FROM user_sessions
ORDER BY user_id, session_date;
# 示例3:库存管理分析
-- 创建库存变动表
CREATE TABLE inventory_movements (
product_id INT,
movement_date DATE,
movement_type VARCHAR(10), -- 'IN' 或 'OUT'
quantity INT,
unit_cost DECIMAL(8,2)
);
INSERT INTO inventory_movements VALUES
(1001, '2024-12-01', 'IN', 100, 10.00),
(1001, '2024-12-02', 'OUT', 30, 10.00),
(1001, '2024-12-03', 'OUT', 25, 10.00),
(1001, '2024-12-04', 'IN', 50, 10.50),
(1001, '2024-12-05', 'OUT', 40, 10.50),
(1002, '2024-12-01', 'IN', 200, 5.00),
(1002, '2024-12-02', 'OUT', 80, 5.00),
(1002, '2024-12-03', 'OUT', 60, 5.00),
(1002, '2024-12-04', 'IN', 100, 5.20);
-- 库存分析
SELECT
product_id,
movement_date,
movement_type,
quantity,
unit_cost,
-- 库存余额计算
SUM(
CASE
WHEN movement_type = 'IN' THEN quantity
ELSE -quantity
END
) OVER (
PARTITION BY product_id
ORDER BY movement_date, movement_type
) AS running_balance,
-- 累计入库量
SUM(
CASE WHEN movement_type = 'IN' THEN quantity ELSE 0 END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
) AS cumulative_in,
-- 累计出库量
SUM(
CASE WHEN movement_type = 'OUT' THEN quantity ELSE 0 END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
) AS cumulative_out,
-- 平均成本(移动加权平均)
SUM(
CASE WHEN movement_type = 'IN' THEN quantity * unit_cost ELSE 0 END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
) / NULLIF(
SUM(
CASE WHEN movement_type = 'IN' THEN quantity ELSE 0 END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
), 0
) AS weighted_avg_cost,
-- 库存周转分析
LAG(movement_date) OVER (
PARTITION BY product_id
ORDER BY movement_date
) AS prev_movement_date,
DATEDIFF(
movement_date,
LAG(movement_date) OVER (
PARTITION BY product_id
ORDER BY movement_date
)
) AS days_since_last_movement,
-- 最大最小库存
MAX(
SUM(
CASE
WHEN movement_type = 'IN' THEN quantity
ELSE -quantity
END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
ROWS UNBOUNDED PRECEDING
)
) OVER (
PARTITION BY product_id
) AS max_inventory_level,
MIN(
SUM(
CASE
WHEN movement_type = 'IN' THEN quantity
ELSE -quantity
END
) OVER (
PARTITION BY product_id
ORDER BY movement_date
ROWS UNBOUNDED PRECEDING
)
) OVER (
PARTITION BY product_id
) AS min_inventory_level
FROM inventory_movements
ORDER BY product_id, movement_date, movement_type;
# 性能优化
# 1. 索引优化
-- 为PARTITION BY和ORDER BY的列创建索引
CREATE INDEX idx_sales_region_date ON sales_data(region, sale_date);
CREATE INDEX idx_employee_dept_salary ON employees(department, salary);
# 2. 避免不必要的窗口函数
-- 不推荐:重复计算相同的窗口函数
SELECT
employee_name,
salary,
RANK() OVER (ORDER BY salary DESC) AS rank1,
RANK() OVER (ORDER BY salary DESC) AS rank2 -- 重复计算
FROM employees;
-- 推荐:使用子查询或CTE
WITH ranked_employees AS (
SELECT
employee_name,
salary,
RANK() OVER (ORDER BY salary DESC) AS salary_rank
FROM employees
)
SELECT
employee_name,
salary,
salary_rank,
salary_rank AS another_rank_reference
FROM ranked_employees;
# 3. 合理使用窗口框架
-- 明确指定窗口框架以提高性能
SELECT
date_col,
value_col,
SUM(value_col) OVER (
ORDER BY date_col
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_sum
FROM data_table;
# 常见错误和解决方案
# 1. 忘记ORDER BY
-- 错误:LAG/LEAD没有ORDER BY
SELECT
LAG(sales) OVER (PARTITION BY region) -- 错误!
FROM sales_data;
-- 正确:必须指定ORDER BY
SELECT
LAG(sales) OVER (PARTITION BY region ORDER BY date)
FROM sales_data;
# 2. 窗口框架理解错误
-- 错误:LAST_VALUE没有正确的窗口框架
SELECT
LAST_VALUE(salary) OVER (ORDER BY salary) -- 只返回当前行
FROM employees;
-- 正确:指定完整的窗口框架
SELECT
LAST_VALUE(salary) OVER (
ORDER BY salary
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
)
FROM employees;
# 3. 性能问题
-- 避免在大数据集上使用复杂的窗口函数
-- 考虑先过滤数据再应用窗口函数
SELECT
employee_name,
salary,
RANK() OVER (ORDER BY salary DESC) AS rank
FROM employees
WHERE department = 'IT' -- 先过滤
AND hire_date >= '2020-01-01';
# 最佳实践
- 合理使用分区:根据业务逻辑选择合适的PARTITION BY字段
- 明确窗口框架:显式指定ROWS或RANGE边界
- 优化查询性能:为窗口函数相关字段创建索引
- 避免重复计算:使用CTE或子查询复用窗口函数结果
- 注意数据类型:确保窗口函数的参数类型正确
# 总结
SQL窗口函数是处理复杂分析查询的强大工具,能够:
- 在保留行详细信息的同时进行聚合计算
- 实现排名、分组和趋势分析
- 处理时间序列数据和移动计算
- 简化复杂的业务逻辑实现
掌握窗口函数的使用,能够大大提升SQL查询的分析能力和效率。