聚合函数
哪吒 2024/1/1
# 聚合函数
点击勘误issues (opens new window),哪吒感谢大家的阅读
# 聚合函数概述
聚合函数是对一组值执行计算并返回单个值的函数。它们通常与GROUP BY子句一起使用,对数据进行分组统计。聚合函数忽略NULL值(COUNT(*)除外)。
# 常用聚合函数
COUNT()
- 计数SUM()
- 求和AVG()
- 平均值MAX()
- 最大值MIN()
- 最小值GROUP_CONCAT()
- 字符串聚合(MySQL)STDDEV()
- 标准差VARIANCE()
- 方差
# 准备测试数据
-- 创建销售表
CREATE TABLE sales (
id INT PRIMARY KEY AUTO_INCREMENT,
product_name VARCHAR(100),
category VARCHAR(50),
price DECIMAL(10,2),
quantity INT,
sale_date DATE,
salesperson VARCHAR(50),
region VARCHAR(50)
);
-- 插入测试数据
INSERT INTO sales (product_name, category, price, quantity, sale_date, salesperson, region) VALUES
('iPhone 14', '电子产品', 5999.00, 2, '2023-01-15', '张三', '北京'),
('MacBook Pro', '电子产品', 12999.00, 1, '2023-01-16', '张三', '北京'),
('iPad Air', '电子产品', 4599.00, 3, '2023-01-17', '李四', '上海'),
('AirPods', '电子产品', 1299.00, 5, '2023-01-18', '王五', '广州'),
('Nike鞋', '服装', 899.00, 2, '2023-01-19', '张三', '北京'),
('Adidas鞋', '服装', 799.00, 3, '2023-01-20', '李四', '上海'),
('T恤', '服装', 199.00, 10, '2023-01-21', '王五', '广州'),
('牛仔裤', '服装', 399.00, 4, '2023-01-22', '赵六', '深圳'),
('咖啡机', '家电', 2999.00, 1, '2023-01-23', '张三', '北京'),
('微波炉', '家电', 899.00, 2, '2023-01-24', '李四', '上海');
# COUNT函数
COUNT函数用于计算行数或非NULL值的数量。
# COUNT(*):计算总行数
-- 计算总销售记录数
SELECT COUNT(*) as total_records FROM sales;
-- 结果:10
-- 计算每个类别的销售记录数
SELECT category, COUNT(*) as record_count
FROM sales
GROUP BY category;
-- 结果:
-- 电子产品, 4
-- 服装, 4
-- 家电, 2
# COUNT(列名):计算非NULL值数量
-- 计算有销售人员记录的数量
SELECT COUNT(salesperson) as salesperson_count FROM sales;
-- 计算每个地区的销售记录数
SELECT region, COUNT(product_name) as product_count
FROM sales
GROUP BY region
ORDER BY product_count DESC;
# COUNT(DISTINCT):计算唯一值数量
-- 计算不同产品的数量
SELECT COUNT(DISTINCT product_name) as unique_products FROM sales;
-- 计算每个类别中不同产品的数量
SELECT category, COUNT(DISTINCT product_name) as unique_products
FROM sales
GROUP BY category;
-- 计算不同销售人员的数量
SELECT COUNT(DISTINCT salesperson) as unique_salesperson FROM sales;
# SUM函数
SUM函数用于计算数值列的总和。
# 基本求和
-- 计算总销售额
SELECT SUM(price * quantity) as total_revenue FROM sales;
-- 计算总销售数量
SELECT SUM(quantity) as total_quantity FROM sales;
-- 按类别计算销售额
SELECT
category,
SUM(price * quantity) as category_revenue,
SUM(quantity) as category_quantity
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;
# 条件求和
-- 计算电子产品的总销售额
SELECT SUM(price * quantity) as electronics_revenue
FROM sales
WHERE category = '电子产品';
-- 使用CASE进行条件求和
SELECT
SUM(CASE WHEN category = '电子产品' THEN price * quantity ELSE 0 END) as electronics_revenue,
SUM(CASE WHEN category = '服装' THEN price * quantity ELSE 0 END) as clothing_revenue,
SUM(CASE WHEN category = '家电' THEN price * quantity ELSE 0 END) as appliance_revenue
FROM sales;
# 按时间求和
-- 按月计算销售额
SELECT
DATE_FORMAT(sale_date, '%Y-%m') as sale_month,
SUM(price * quantity) as monthly_revenue
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m')
ORDER BY sale_month;
-- 按销售人员计算销售额
SELECT
salesperson,
SUM(price * quantity) as personal_revenue,
COUNT(*) as sales_count
FROM sales
GROUP BY salesperson
ORDER BY personal_revenue DESC;
# AVG函数
AVG函数用于计算数值列的平均值。
# 基本平均值
-- 计算平均销售价格
SELECT AVG(price) as avg_price FROM sales;
-- 计算平均销售数量
SELECT AVG(quantity) as avg_quantity FROM sales;
-- 按类别计算平均价格
SELECT
category,
AVG(price) as avg_price,
AVG(quantity) as avg_quantity,
AVG(price * quantity) as avg_revenue
FROM sales
GROUP BY category;
# 加权平均
-- 计算加权平均价格(按数量加权)
SELECT
category,
SUM(price * quantity) / SUM(quantity) as weighted_avg_price
FROM sales
GROUP BY category;
-- 与普通平均值对比
SELECT
category,
AVG(price) as simple_avg_price,
SUM(price * quantity) / SUM(quantity) as weighted_avg_price
FROM sales
GROUP BY category;
# 条件平均值
-- 计算高价商品的平均价格(价格>1000)
SELECT AVG(price) as high_price_avg
FROM sales
WHERE price > 1000;
-- 使用CASE计算条件平均值
SELECT
AVG(CASE WHEN price > 1000 THEN price END) as high_price_avg,
AVG(CASE WHEN price <= 1000 THEN price END) as low_price_avg
FROM sales;
# MAX和MIN函数
MAX和MIN函数分别用于查找最大值和最小值。
# 基本最值查询
-- 查找最高和最低价格
SELECT
MAX(price) as max_price,
MIN(price) as min_price,
MAX(quantity) as max_quantity,
MIN(quantity) as min_quantity
FROM sales;
-- 按类别查找最值
SELECT
category,
MAX(price) as max_price,
MIN(price) as min_price,
MAX(price * quantity) as max_revenue
FROM sales
GROUP BY category;
# 查找最值对应的记录
-- 查找最贵的商品信息
SELECT *
FROM sales
WHERE price = (SELECT MAX(price) FROM sales);
-- 查找每个类别中最贵的商品
SELECT s1.*
FROM sales s1
INNER JOIN (
SELECT category, MAX(price) as max_price
FROM sales
GROUP BY category
) s2 ON s1.category = s2.category AND s1.price = s2.max_price;
-- 使用窗口函数(MySQL 8.0+)
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY category ORDER BY price DESC) as rn
FROM sales
) ranked
WHERE rn = 1;
# 日期最值
-- 查找最早和最晚的销售日期
SELECT
MIN(sale_date) as earliest_sale,
MAX(sale_date) as latest_sale,
DATEDIFF(MAX(sale_date), MIN(sale_date)) as date_range_days
FROM sales;
-- 按销售人员查找销售日期范围
SELECT
salesperson,
MIN(sale_date) as first_sale,
MAX(sale_date) as last_sale,
COUNT(*) as total_sales
FROM sales
GROUP BY salesperson;
# GROUP_CONCAT函数(MySQL)
GROUP_CONCAT函数用于将分组中的字符串值连接起来。
# 基本字符串聚合
-- 按类别聚合产品名称
SELECT
category,
GROUP_CONCAT(product_name) as products
FROM sales
GROUP BY category;
-- 使用分隔符
SELECT
category,
GROUP_CONCAT(product_name SEPARATOR ' | ') as products
FROM sales
GROUP BY category;
# 排序和去重
-- 按价格排序聚合
SELECT
category,
GROUP_CONCAT(product_name ORDER BY price DESC SEPARATOR ', ') as products_by_price
FROM sales
GROUP BY category;
-- 去重聚合
SELECT
region,
GROUP_CONCAT(DISTINCT category SEPARATOR ', ') as categories
FROM sales
GROUP BY region;
-- 限制长度
SELECT
category,
GROUP_CONCAT(product_name SEPARATOR ', ') as products
FROM sales
GROUP BY category;
-- 注意:可以设置 group_concat_max_len 系统变量来控制最大长度
# 复杂聚合
-- 聚合产品信息(名称和价格)
SELECT
category,
GROUP_CONCAT(
CONCAT(product_name, '(¥', price, ')')
ORDER BY price DESC
SEPARATOR ', '
) as product_details
FROM sales
GROUP BY category;
-- 按销售人员聚合销售信息
SELECT
salesperson,
GROUP_CONCAT(
CONCAT(product_name, ' x', quantity)
ORDER BY sale_date
SEPARATOR ' | '
) as sales_history
FROM sales
GROUP BY salesperson;
# 统计函数
# 标准差和方差
-- 计算价格的标准差和方差
SELECT
category,
AVG(price) as avg_price,
STDDEV(price) as price_stddev,
VARIANCE(price) as price_variance
FROM sales
GROUP BY category;
-- 计算销售额的统计信息
SELECT
COUNT(*) as count,
SUM(price * quantity) as total_revenue,
AVG(price * quantity) as avg_revenue,
STDDEV(price * quantity) as revenue_stddev,
MIN(price * quantity) as min_revenue,
MAX(price * quantity) as max_revenue
FROM sales;
# 多级聚合
# ROLLUP:分层汇总
-- 使用ROLLUP进行分层汇总
SELECT
region,
category,
COUNT(*) as sales_count,
SUM(price * quantity) as total_revenue
FROM sales
GROUP BY region, category WITH ROLLUP;
-- 使用GROUPING函数识别汇总行(MySQL 8.0+)
SELECT
CASE WHEN GROUPING(region) = 1 THEN '所有地区' ELSE region END as region,
CASE WHEN GROUPING(category) = 1 THEN '所有类别' ELSE category END as category,
COUNT(*) as sales_count,
SUM(price * quantity) as total_revenue
FROM sales
GROUP BY region, category WITH ROLLUP;
# 手动实现多级汇总
-- 使用UNION实现多级汇总
SELECT region, category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales
GROUP BY region, category
UNION ALL
SELECT region, '小计' as category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales
GROUP BY region
UNION ALL
SELECT '总计' as region, '总计' as category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales
ORDER BY
CASE WHEN region = '总计' THEN 'zzz' ELSE region END,
CASE WHEN category = '总计' THEN 'zzz' WHEN category = '小计' THEN 'yyy' ELSE category END;
# 条件聚合
# 使用CASE WHEN进行条件聚合
-- 按价格区间统计
SELECT
COUNT(CASE WHEN price < 500 THEN 1 END) as low_price_count,
COUNT(CASE WHEN price BETWEEN 500 AND 2000 THEN 1 END) as mid_price_count,
COUNT(CASE WHEN price > 2000 THEN 1 END) as high_price_count,
SUM(CASE WHEN price < 500 THEN price * quantity ELSE 0 END) as low_price_revenue,
SUM(CASE WHEN price BETWEEN 500 AND 2000 THEN price * quantity ELSE 0 END) as mid_price_revenue,
SUM(CASE WHEN price > 2000 THEN price * quantity ELSE 0 END) as high_price_revenue
FROM sales;
-- 按月份统计各类别销售情况
SELECT
DATE_FORMAT(sale_date, '%Y-%m') as month,
SUM(CASE WHEN category = '电子产品' THEN price * quantity ELSE 0 END) as electronics,
SUM(CASE WHEN category = '服装' THEN price * quantity ELSE 0 END) as clothing,
SUM(CASE WHEN category = '家电' THEN price * quantity ELSE 0 END) as appliances
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m');
# 百分比计算
-- 计算各类别销售额占比
SELECT
category,
SUM(price * quantity) as category_revenue,
ROUND(
SUM(price * quantity) * 100.0 / (
SELECT SUM(price * quantity) FROM sales
), 2
) as revenue_percentage
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;
-- 使用窗口函数计算占比(MySQL 8.0+)
SELECT
category,
SUM(price * quantity) as category_revenue,
ROUND(
SUM(price * quantity) * 100.0 / SUM(SUM(price * quantity)) OVER(), 2
) as revenue_percentage
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;
# 聚合函数与窗口函数对比
# 传统聚合函数
-- 传统聚合:每个类别一行
SELECT
category,
COUNT(*) as count,
AVG(price) as avg_price
FROM sales
GROUP BY category;
# 窗口函数(MySQL 8.0+)
-- 窗口函数:保留所有行,添加聚合信息
SELECT
product_name,
category,
price,
COUNT(*) OVER (PARTITION BY category) as category_count,
AVG(price) OVER (PARTITION BY category) as category_avg_price,
price - AVG(price) OVER (PARTITION BY category) as price_diff_from_avg
FROM sales
ORDER BY category, price;
# 性能优化
# 索引优化
-- 为聚合查询创建合适的索引
CREATE INDEX idx_category_date ON sales(category, sale_date);
CREATE INDEX idx_salesperson_date ON sales(salesperson, sale_date);
CREATE INDEX idx_region_category ON sales(region, category);
# 查询优化技巧
-- 优化前:多次扫描表
SELECT
(SELECT COUNT(*) FROM sales WHERE category = '电子产品') as electronics_count,
(SELECT COUNT(*) FROM sales WHERE category = '服装') as clothing_count,
(SELECT COUNT(*) FROM sales WHERE category = '家电') as appliance_count;
-- 优化后:一次扫描,条件聚合
SELECT
COUNT(CASE WHEN category = '电子产品' THEN 1 END) as electronics_count,
COUNT(CASE WHEN category = '服装' THEN 1 END) as clothing_count,
COUNT(CASE WHEN category = '家电' THEN 1 END) as appliance_count
FROM sales;
# 实战案例
# 销售报表生成
-- 综合销售报表
SELECT
'总体情况' as report_type,
COUNT(*) as total_orders,
COUNT(DISTINCT product_name) as unique_products,
COUNT(DISTINCT salesperson) as active_salespeople,
SUM(quantity) as total_quantity,
SUM(price * quantity) as total_revenue,
AVG(price * quantity) as avg_order_value,
MIN(sale_date) as first_sale_date,
MAX(sale_date) as last_sale_date
FROM sales
UNION ALL
SELECT
CONCAT('类别: ', category) as report_type,
COUNT(*) as total_orders,
COUNT(DISTINCT product_name) as unique_products,
COUNT(DISTINCT salesperson) as active_salespeople,
SUM(quantity) as total_quantity,
SUM(price * quantity) as total_revenue,
AVG(price * quantity) as avg_order_value,
MIN(sale_date) as first_sale_date,
MAX(sale_date) as last_sale_date
FROM sales
GROUP BY category
ORDER BY report_type;
# 销售人员绩效分析
-- 销售人员绩效排名
SELECT
salesperson,
COUNT(*) as order_count,
SUM(quantity) as total_quantity,
SUM(price * quantity) as total_revenue,
AVG(price * quantity) as avg_order_value,
MAX(price * quantity) as max_single_sale,
COUNT(DISTINCT category) as categories_sold,
GROUP_CONCAT(DISTINCT category ORDER BY category) as category_list,
DATEDIFF(MAX(sale_date), MIN(sale_date)) + 1 as active_days
FROM sales
GROUP BY salesperson
ORDER BY total_revenue DESC;
# 常见错误
# 1. 在SELECT中混用聚合函数和非聚合列
-- 错误:没有GROUP BY的情况下混用
SELECT product_name, COUNT(*) FROM sales; -- 错误
-- 正确:使用GROUP BY
SELECT product_name, COUNT(*) FROM sales GROUP BY product_name;
-- 或者只使用聚合函数
SELECT COUNT(*) FROM sales;
# 2. 忽略NULL值的影响
-- COUNT(*)包含NULL值,COUNT(列名)不包含NULL值
SELECT
COUNT(*) as total_rows,
COUNT(price) as non_null_prices
FROM sales;
-- AVG函数忽略NULL值
SELECT
AVG(price) as avg_price, -- 忽略NULL
SUM(price) / COUNT(*) as manual_avg -- 包含NULL,结果可能不同
FROM sales;
# 3. GROUP_CONCAT长度限制
-- 检查和设置GROUP_CONCAT最大长度
SHOW VARIABLES LIKE 'group_concat_max_len';
SET SESSION group_concat_max_len = 10000;
-- 或者在查询中处理长度限制
SELECT
category,
CASE
WHEN LENGTH(GROUP_CONCAT(product_name)) > 100
THEN CONCAT(LEFT(GROUP_CONCAT(product_name), 97), '...')
ELSE GROUP_CONCAT(product_name)
END as products
FROM sales
GROUP BY category;
# 总结
聚合函数是SQL数据分析的核心工具,掌握它们的使用方法对于数据统计和报表生成至关重要:
# 核心聚合函数
- COUNT() - 计数统计
- SUM() - 数值求和
- AVG() - 平均值计算
- MAX()/MIN() - 最值查找
- GROUP_CONCAT() - 字符串聚合
# 最佳实践
- 理解NULL值对聚合函数的影响
- 合理使用GROUP BY进行分组
- 利用CASE WHEN进行条件聚合
- 注意GROUP_CONCAT的长度限制
- 为聚合查询创建合适的索引
- 使用HAVING过滤聚合结果
# 性能考虑
- 在经常用于GROUP BY的列上创建索引
- 避免在大表上进行复杂的聚合计算
- 考虑使用物化视图存储预计算结果
- 合理使用LIMIT限制结果集大小