聚合函数

2024/1/1

# 聚合函数

点击勘误issues (opens new window),哪吒感谢大家的阅读

# 聚合函数概述

聚合函数是对一组值执行计算并返回单个值的函数。它们通常与GROUP BY子句一起使用,对数据进行分组统计。聚合函数忽略NULL值(COUNT(*)除外)。

# 常用聚合函数

  • COUNT() - 计数
  • SUM() - 求和
  • AVG() - 平均值
  • MAX() - 最大值
  • MIN() - 最小值
  • GROUP_CONCAT() - 字符串聚合(MySQL)
  • STDDEV() - 标准差
  • VARIANCE() - 方差

# 准备测试数据

-- 创建销售表
CREATE TABLE sales (
    id INT PRIMARY KEY AUTO_INCREMENT,
    product_name VARCHAR(100),
    category VARCHAR(50),
    price DECIMAL(10,2),
    quantity INT,
    sale_date DATE,
    salesperson VARCHAR(50),
    region VARCHAR(50)
);

-- 插入测试数据
INSERT INTO sales (product_name, category, price, quantity, sale_date, salesperson, region) VALUES
('iPhone 14', '电子产品', 5999.00, 2, '2023-01-15', '张三', '北京'),
('MacBook Pro', '电子产品', 12999.00, 1, '2023-01-16', '张三', '北京'),
('iPad Air', '电子产品', 4599.00, 3, '2023-01-17', '李四', '上海'),
('AirPods', '电子产品', 1299.00, 5, '2023-01-18', '王五', '广州'),
('Nike鞋', '服装', 899.00, 2, '2023-01-19', '张三', '北京'),
('Adidas鞋', '服装', 799.00, 3, '2023-01-20', '李四', '上海'),
('T恤', '服装', 199.00, 10, '2023-01-21', '王五', '广州'),
('牛仔裤', '服装', 399.00, 4, '2023-01-22', '赵六', '深圳'),
('咖啡机', '家电', 2999.00, 1, '2023-01-23', '张三', '北京'),
('微波炉', '家电', 899.00, 2, '2023-01-24', '李四', '上海');

# COUNT函数

COUNT函数用于计算行数或非NULL值的数量。

# COUNT(*):计算总行数

-- 计算总销售记录数
SELECT COUNT(*) as total_records FROM sales;
-- 结果:10

-- 计算每个类别的销售记录数
SELECT category, COUNT(*) as record_count
FROM sales
GROUP BY category;
-- 结果:
-- 电子产品, 4
-- 服装, 4
-- 家电, 2

# COUNT(列名):计算非NULL值数量

-- 计算有销售人员记录的数量
SELECT COUNT(salesperson) as salesperson_count FROM sales;

-- 计算每个地区的销售记录数
SELECT region, COUNT(product_name) as product_count
FROM sales
GROUP BY region
ORDER BY product_count DESC;

# COUNT(DISTINCT):计算唯一值数量

-- 计算不同产品的数量
SELECT COUNT(DISTINCT product_name) as unique_products FROM sales;

-- 计算每个类别中不同产品的数量
SELECT category, COUNT(DISTINCT product_name) as unique_products
FROM sales
GROUP BY category;

-- 计算不同销售人员的数量
SELECT COUNT(DISTINCT salesperson) as unique_salesperson FROM sales;

# SUM函数

SUM函数用于计算数值列的总和。

# 基本求和

-- 计算总销售额
SELECT SUM(price * quantity) as total_revenue FROM sales;

-- 计算总销售数量
SELECT SUM(quantity) as total_quantity FROM sales;

-- 按类别计算销售额
SELECT 
    category,
    SUM(price * quantity) as category_revenue,
    SUM(quantity) as category_quantity
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;

# 条件求和

-- 计算电子产品的总销售额
SELECT SUM(price * quantity) as electronics_revenue
FROM sales
WHERE category = '电子产品';

-- 使用CASE进行条件求和
SELECT 
    SUM(CASE WHEN category = '电子产品' THEN price * quantity ELSE 0 END) as electronics_revenue,
    SUM(CASE WHEN category = '服装' THEN price * quantity ELSE 0 END) as clothing_revenue,
    SUM(CASE WHEN category = '家电' THEN price * quantity ELSE 0 END) as appliance_revenue
FROM sales;

# 按时间求和

-- 按月计算销售额
SELECT 
    DATE_FORMAT(sale_date, '%Y-%m') as sale_month,
    SUM(price * quantity) as monthly_revenue
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m')
ORDER BY sale_month;

-- 按销售人员计算销售额
SELECT 
    salesperson,
    SUM(price * quantity) as personal_revenue,
    COUNT(*) as sales_count
FROM sales
GROUP BY salesperson
ORDER BY personal_revenue DESC;

# AVG函数

AVG函数用于计算数值列的平均值。

# 基本平均值

-- 计算平均销售价格
SELECT AVG(price) as avg_price FROM sales;

-- 计算平均销售数量
SELECT AVG(quantity) as avg_quantity FROM sales;

-- 按类别计算平均价格
SELECT 
    category,
    AVG(price) as avg_price,
    AVG(quantity) as avg_quantity,
    AVG(price * quantity) as avg_revenue
FROM sales
GROUP BY category;

# 加权平均

-- 计算加权平均价格(按数量加权)
SELECT 
    category,
    SUM(price * quantity) / SUM(quantity) as weighted_avg_price
FROM sales
GROUP BY category;

-- 与普通平均值对比
SELECT 
    category,
    AVG(price) as simple_avg_price,
    SUM(price * quantity) / SUM(quantity) as weighted_avg_price
FROM sales
GROUP BY category;

# 条件平均值

-- 计算高价商品的平均价格(价格>1000)
SELECT AVG(price) as high_price_avg
FROM sales
WHERE price > 1000;

-- 使用CASE计算条件平均值
SELECT 
    AVG(CASE WHEN price > 1000 THEN price END) as high_price_avg,
    AVG(CASE WHEN price <= 1000 THEN price END) as low_price_avg
FROM sales;

# MAX和MIN函数

MAX和MIN函数分别用于查找最大值和最小值。

# 基本最值查询

-- 查找最高和最低价格
SELECT 
    MAX(price) as max_price,
    MIN(price) as min_price,
    MAX(quantity) as max_quantity,
    MIN(quantity) as min_quantity
FROM sales;

-- 按类别查找最值
SELECT 
    category,
    MAX(price) as max_price,
    MIN(price) as min_price,
    MAX(price * quantity) as max_revenue
FROM sales
GROUP BY category;

# 查找最值对应的记录

-- 查找最贵的商品信息
SELECT *
FROM sales
WHERE price = (SELECT MAX(price) FROM sales);

-- 查找每个类别中最贵的商品
SELECT s1.*
FROM sales s1
INNER JOIN (
    SELECT category, MAX(price) as max_price
    FROM sales
    GROUP BY category
) s2 ON s1.category = s2.category AND s1.price = s2.max_price;

-- 使用窗口函数(MySQL 8.0+)
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY category ORDER BY price DESC) as rn
    FROM sales
) ranked
WHERE rn = 1;

# 日期最值

-- 查找最早和最晚的销售日期
SELECT 
    MIN(sale_date) as earliest_sale,
    MAX(sale_date) as latest_sale,
    DATEDIFF(MAX(sale_date), MIN(sale_date)) as date_range_days
FROM sales;

-- 按销售人员查找销售日期范围
SELECT 
    salesperson,
    MIN(sale_date) as first_sale,
    MAX(sale_date) as last_sale,
    COUNT(*) as total_sales
FROM sales
GROUP BY salesperson;

# GROUP_CONCAT函数(MySQL)

GROUP_CONCAT函数用于将分组中的字符串值连接起来。

# 基本字符串聚合

-- 按类别聚合产品名称
SELECT 
    category,
    GROUP_CONCAT(product_name) as products
FROM sales
GROUP BY category;

-- 使用分隔符
SELECT 
    category,
    GROUP_CONCAT(product_name SEPARATOR ' | ') as products
FROM sales
GROUP BY category;

# 排序和去重

-- 按价格排序聚合
SELECT 
    category,
    GROUP_CONCAT(product_name ORDER BY price DESC SEPARATOR ', ') as products_by_price
FROM sales
GROUP BY category;

-- 去重聚合
SELECT 
    region,
    GROUP_CONCAT(DISTINCT category SEPARATOR ', ') as categories
FROM sales
GROUP BY region;

-- 限制长度
SELECT 
    category,
    GROUP_CONCAT(product_name SEPARATOR ', ') as products
FROM sales
GROUP BY category;
-- 注意:可以设置 group_concat_max_len 系统变量来控制最大长度

# 复杂聚合

-- 聚合产品信息(名称和价格)
SELECT 
    category,
    GROUP_CONCAT(
        CONCAT(product_name, '(¥', price, ')')
        ORDER BY price DESC 
        SEPARATOR ', '
    ) as product_details
FROM sales
GROUP BY category;

-- 按销售人员聚合销售信息
SELECT 
    salesperson,
    GROUP_CONCAT(
        CONCAT(product_name, ' x', quantity)
        ORDER BY sale_date
        SEPARATOR ' | '
    ) as sales_history
FROM sales
GROUP BY salesperson;

# 统计函数

# 标准差和方差

-- 计算价格的标准差和方差
SELECT 
    category,
    AVG(price) as avg_price,
    STDDEV(price) as price_stddev,
    VARIANCE(price) as price_variance
FROM sales
GROUP BY category;

-- 计算销售额的统计信息
SELECT 
    COUNT(*) as count,
    SUM(price * quantity) as total_revenue,
    AVG(price * quantity) as avg_revenue,
    STDDEV(price * quantity) as revenue_stddev,
    MIN(price * quantity) as min_revenue,
    MAX(price * quantity) as max_revenue
FROM sales;

# 多级聚合

# ROLLUP:分层汇总

-- 使用ROLLUP进行分层汇总
SELECT 
    region,
    category,
    COUNT(*) as sales_count,
    SUM(price * quantity) as total_revenue
FROM sales
GROUP BY region, category WITH ROLLUP;

-- 使用GROUPING函数识别汇总行(MySQL 8.0+)
SELECT 
    CASE WHEN GROUPING(region) = 1 THEN '所有地区' ELSE region END as region,
    CASE WHEN GROUPING(category) = 1 THEN '所有类别' ELSE category END as category,
    COUNT(*) as sales_count,
    SUM(price * quantity) as total_revenue
FROM sales
GROUP BY region, category WITH ROLLUP;

# 手动实现多级汇总

-- 使用UNION实现多级汇总
SELECT region, category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales
GROUP BY region, category

UNION ALL

SELECT region, '小计' as category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales
GROUP BY region

UNION ALL

SELECT '总计' as region, '总计' as category, COUNT(*) as count, SUM(price * quantity) as revenue
FROM sales

ORDER BY 
    CASE WHEN region = '总计' THEN 'zzz' ELSE region END,
    CASE WHEN category = '总计' THEN 'zzz' WHEN category = '小计' THEN 'yyy' ELSE category END;

# 条件聚合

# 使用CASE WHEN进行条件聚合

-- 按价格区间统计
SELECT 
    COUNT(CASE WHEN price < 500 THEN 1 END) as low_price_count,
    COUNT(CASE WHEN price BETWEEN 500 AND 2000 THEN 1 END) as mid_price_count,
    COUNT(CASE WHEN price > 2000 THEN 1 END) as high_price_count,
    SUM(CASE WHEN price < 500 THEN price * quantity ELSE 0 END) as low_price_revenue,
    SUM(CASE WHEN price BETWEEN 500 AND 2000 THEN price * quantity ELSE 0 END) as mid_price_revenue,
    SUM(CASE WHEN price > 2000 THEN price * quantity ELSE 0 END) as high_price_revenue
FROM sales;

-- 按月份统计各类别销售情况
SELECT 
    DATE_FORMAT(sale_date, '%Y-%m') as month,
    SUM(CASE WHEN category = '电子产品' THEN price * quantity ELSE 0 END) as electronics,
    SUM(CASE WHEN category = '服装' THEN price * quantity ELSE 0 END) as clothing,
    SUM(CASE WHEN category = '家电' THEN price * quantity ELSE 0 END) as appliances
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m');

# 百分比计算

-- 计算各类别销售额占比
SELECT 
    category,
    SUM(price * quantity) as category_revenue,
    ROUND(
        SUM(price * quantity) * 100.0 / (
            SELECT SUM(price * quantity) FROM sales
        ), 2
    ) as revenue_percentage
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;

-- 使用窗口函数计算占比(MySQL 8.0+)
SELECT 
    category,
    SUM(price * quantity) as category_revenue,
    ROUND(
        SUM(price * quantity) * 100.0 / SUM(SUM(price * quantity)) OVER(), 2
    ) as revenue_percentage
FROM sales
GROUP BY category
ORDER BY category_revenue DESC;

# 聚合函数与窗口函数对比

# 传统聚合函数

-- 传统聚合:每个类别一行
SELECT 
    category,
    COUNT(*) as count,
    AVG(price) as avg_price
FROM sales
GROUP BY category;

# 窗口函数(MySQL 8.0+)

-- 窗口函数:保留所有行,添加聚合信息
SELECT 
    product_name,
    category,
    price,
    COUNT(*) OVER (PARTITION BY category) as category_count,
    AVG(price) OVER (PARTITION BY category) as category_avg_price,
    price - AVG(price) OVER (PARTITION BY category) as price_diff_from_avg
FROM sales
ORDER BY category, price;

# 性能优化

# 索引优化

-- 为聚合查询创建合适的索引
CREATE INDEX idx_category_date ON sales(category, sale_date);
CREATE INDEX idx_salesperson_date ON sales(salesperson, sale_date);
CREATE INDEX idx_region_category ON sales(region, category);

# 查询优化技巧

-- 优化前:多次扫描表
SELECT 
    (SELECT COUNT(*) FROM sales WHERE category = '电子产品') as electronics_count,
    (SELECT COUNT(*) FROM sales WHERE category = '服装') as clothing_count,
    (SELECT COUNT(*) FROM sales WHERE category = '家电') as appliance_count;

-- 优化后:一次扫描,条件聚合
SELECT 
    COUNT(CASE WHEN category = '电子产品' THEN 1 END) as electronics_count,
    COUNT(CASE WHEN category = '服装' THEN 1 END) as clothing_count,
    COUNT(CASE WHEN category = '家电' THEN 1 END) as appliance_count
FROM sales;

# 实战案例

# 销售报表生成

-- 综合销售报表
SELECT 
    '总体情况' as report_type,
    COUNT(*) as total_orders,
    COUNT(DISTINCT product_name) as unique_products,
    COUNT(DISTINCT salesperson) as active_salespeople,
    SUM(quantity) as total_quantity,
    SUM(price * quantity) as total_revenue,
    AVG(price * quantity) as avg_order_value,
    MIN(sale_date) as first_sale_date,
    MAX(sale_date) as last_sale_date
FROM sales

UNION ALL

SELECT 
    CONCAT('类别: ', category) as report_type,
    COUNT(*) as total_orders,
    COUNT(DISTINCT product_name) as unique_products,
    COUNT(DISTINCT salesperson) as active_salespeople,
    SUM(quantity) as total_quantity,
    SUM(price * quantity) as total_revenue,
    AVG(price * quantity) as avg_order_value,
    MIN(sale_date) as first_sale_date,
    MAX(sale_date) as last_sale_date
FROM sales
GROUP BY category

ORDER BY report_type;

# 销售人员绩效分析

-- 销售人员绩效排名
SELECT 
    salesperson,
    COUNT(*) as order_count,
    SUM(quantity) as total_quantity,
    SUM(price * quantity) as total_revenue,
    AVG(price * quantity) as avg_order_value,
    MAX(price * quantity) as max_single_sale,
    COUNT(DISTINCT category) as categories_sold,
    GROUP_CONCAT(DISTINCT category ORDER BY category) as category_list,
    DATEDIFF(MAX(sale_date), MIN(sale_date)) + 1 as active_days
FROM sales
GROUP BY salesperson
ORDER BY total_revenue DESC;

# 常见错误

# 1. 在SELECT中混用聚合函数和非聚合列

-- 错误:没有GROUP BY的情况下混用
SELECT product_name, COUNT(*) FROM sales;  -- 错误

-- 正确:使用GROUP BY
SELECT product_name, COUNT(*) FROM sales GROUP BY product_name;

-- 或者只使用聚合函数
SELECT COUNT(*) FROM sales;

# 2. 忽略NULL值的影响

-- COUNT(*)包含NULL值,COUNT(列名)不包含NULL值
SELECT 
    COUNT(*) as total_rows,
    COUNT(price) as non_null_prices
FROM sales;

-- AVG函数忽略NULL值
SELECT 
    AVG(price) as avg_price,  -- 忽略NULL
    SUM(price) / COUNT(*) as manual_avg  -- 包含NULL,结果可能不同
FROM sales;

# 3. GROUP_CONCAT长度限制

-- 检查和设置GROUP_CONCAT最大长度
SHOW VARIABLES LIKE 'group_concat_max_len';
SET SESSION group_concat_max_len = 10000;

-- 或者在查询中处理长度限制
SELECT 
    category,
    CASE 
        WHEN LENGTH(GROUP_CONCAT(product_name)) > 100 
        THEN CONCAT(LEFT(GROUP_CONCAT(product_name), 97), '...')
        ELSE GROUP_CONCAT(product_name)
    END as products
FROM sales
GROUP BY category;

# 总结

聚合函数是SQL数据分析的核心工具,掌握它们的使用方法对于数据统计和报表生成至关重要:

# 核心聚合函数

  1. COUNT() - 计数统计
  2. SUM() - 数值求和
  3. AVG() - 平均值计算
  4. MAX()/MIN() - 最值查找
  5. GROUP_CONCAT() - 字符串聚合

# 最佳实践

  • 理解NULL值对聚合函数的影响
  • 合理使用GROUP BY进行分组
  • 利用CASE WHEN进行条件聚合
  • 注意GROUP_CONCAT的长度限制
  • 为聚合查询创建合适的索引
  • 使用HAVING过滤聚合结果

# 性能考虑

  • 在经常用于GROUP BY的列上创建索引
  • 避免在大表上进行复杂的聚合计算
  • 考虑使用物化视图存储预计算结果
  • 合理使用LIMIT限制结果集大小

# 下一步学习