分组查询 GROUP BY

# 分组查询 GROUP BY

# 概述

GROUP BY子句用于将查询结果按照一个或多个列进行分组,通常与聚合函数一起使用,对每个分组进行统计计算。

# 基本语法

SELECT column1, column2, aggregate_function(column3)
FROM table_name
WHERE condition
GROUP BY column1, column2
HAVING group_condition
ORDER BY column1;

# 测试数据准备

-- 创建员工表
CREATE TABLE employees (
    employee_id INT PRIMARY KEY,
    name VARCHAR(50),
    department VARCHAR(30),
    position VARCHAR(30),
    salary DECIMAL(10,2),
    hire_date DATE,
    manager_id INT
);

-- 插入测试数据
INSERT INTO employees VALUES
(1, '张三', '技术部', '高级工程师', 12000, '2020-01-15', NULL),
(2, '李四', '技术部', '工程师', 8000, '2021-03-20', 1),
(3, '王五', '技术部', '初级工程师', 6000, '2022-06-10', 1),
(4, '赵六', '销售部', '销售经理', 10000, '2019-08-05', NULL),
(5, '钱七', '销售部', '销售代表', 7000, '2021-11-12', 4),
(6, '孙八', '销售部', '销售代表', 6500, '2022-02-28', 4),
(7, '周九', '人事部', '人事经理', 9000, '2020-05-18', NULL),
(8, '吴十', '人事部', '人事专员', 5500, '2021-09-30', 7),
(9, '郑一', '财务部', '财务经理', 11000, '2019-12-01', NULL),
(10, '王二', '财务部', '会计', 6800, '2022-01-20', 9);

-- 创建订单表
CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT,
    order_date DATE,
    total_amount DECIMAL(10,2),
    status VARCHAR(20)
);

-- 插入订单数据
INSERT INTO orders VALUES
(1, 101, '2023-01-15', 1500.00, '已完成'),
(2, 102, '2023-01-16', 2300.00, '已完成'),
(3, 101, '2023-02-10', 800.00, '已完成'),
(4, 103, '2023-02-15', 1200.00, '处理中'),
(5, 102, '2023-03-05', 3500.00, '已完成'),
(6, 104, '2023-03-10', 900.00, '已取消'),
(7, 101, '2023-03-20', 2100.00, '已完成'),
(8, 105, '2023-04-01', 1800.00, '已完成');

# 单列分组

# 基本分组统计

-- 按部门统计员工数量
SELECT department, COUNT(*) as employee_count
FROM employees
GROUP BY department;

-- 按部门统计平均工资
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department;

-- 按部门统计工资总和
SELECT department, SUM(salary) as total_salary
FROM employees
GROUP BY department;

# 多种聚合函数组合

-- 按部门统计详细信息
SELECT 
    department,
    COUNT(*) as employee_count,
    AVG(salary) as avg_salary,
    MIN(salary) as min_salary,
    MAX(salary) as max_salary,
    SUM(salary) as total_salary
FROM employees
GROUP BY department;

# 多列分组

-- 按部门和职位分组
SELECT 
    department,
    position,
    COUNT(*) as count,
    AVG(salary) as avg_salary
FROM employees
GROUP BY department, position
ORDER BY department, position;

-- 按年份和月份分组统计订单
SELECT 
    YEAR(order_date) as order_year,
    MONTH(order_date) as order_month,
    COUNT(*) as order_count,
    SUM(total_amount) as total_sales
FROM orders
GROUP BY YEAR(order_date), MONTH(order_date)
ORDER BY order_year, order_month;

# HAVING子句

HAVING子句用于对分组后的结果进行过滤,类似于WHERE子句,但HAVING是对分组后的数据进行筛选。

# 基本HAVING用法

-- 查询员工数量大于1的部门
SELECT department, COUNT(*) as employee_count
FROM employees
GROUP BY department
HAVING COUNT(*) > 1;

-- 查询平均工资超过8000的部门
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 8000;

# WHERE vs HAVING

-- WHERE:分组前过滤
SELECT department, AVG(salary) as avg_salary
FROM employees
WHERE salary > 6000  -- 先过滤工资大于6000的员工
GROUP BY department;

-- HAVING:分组后过滤
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 8000;  -- 过滤平均工资大于8000的部门

-- 组合使用
SELECT department, AVG(salary) as avg_salary
FROM employees
WHERE hire_date >= '2020-01-01'  -- 分组前过滤
GROUP BY department
HAVING COUNT(*) >= 2;  -- 分组后过滤

# 复杂分组查询

# 嵌套分组

-- 查询每个部门工资最高的员工信息
SELECT e1.*
FROM employees e1
WHERE e1.salary = (
    SELECT MAX(e2.salary)
    FROM employees e2
    WHERE e2.department = e1.department
);

-- 使用窗口函数实现
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rn
    FROM employees
) t
WHERE rn = 1;

# 条件分组

-- 按工资等级分组
SELECT 
    CASE 
        WHEN salary < 6000 THEN '低薪'
        WHEN salary BETWEEN 6000 AND 9000 THEN '中薪'
        ELSE '高薪'
    END as salary_level,
    COUNT(*) as employee_count,
    AVG(salary) as avg_salary
FROM employees
GROUP BY 
    CASE 
        WHEN salary < 6000 THEN '低薪'
        WHEN salary BETWEEN 6000 AND 9000 THEN '中薪'
        ELSE '高薪'
    END;

# 时间分组

-- 按年分组统计
SELECT 
    YEAR(hire_date) as hire_year,
    COUNT(*) as hire_count
FROM employees
GROUP BY YEAR(hire_date)
ORDER BY hire_year;

-- 按季度分组统计订单
SELECT 
    YEAR(order_date) as order_year,
    QUARTER(order_date) as order_quarter,
    COUNT(*) as order_count,
    SUM(total_amount) as total_sales
FROM orders
GROUP BY YEAR(order_date), QUARTER(order_date)
ORDER BY order_year, order_quarter;

-- 按星期分组统计
SELECT 
    DAYNAME(order_date) as day_of_week,
    COUNT(*) as order_count,
    AVG(total_amount) as avg_amount
FROM orders
GROUP BY DAYOFWEEK(order_date), DAYNAME(order_date)
ORDER BY DAYOFWEEK(order_date);

# 实战案例

# 案例1:销售数据分析

-- 创建销售数据表
CREATE TABLE sales (
    sale_id INT PRIMARY KEY,
    salesperson VARCHAR(50),
    region VARCHAR(30),
    product_category VARCHAR(30),
    sale_amount DECIMAL(10,2),
    sale_date DATE
);

-- 插入测试数据
INSERT INTO sales VALUES
(1, '张三', '华北', '电子产品', 15000, '2023-01-15'),
(2, '李四', '华南', '服装', 8000, '2023-01-20'),
(3, '王五', '华东', '电子产品', 12000, '2023-02-10'),
(4, '张三', '华北', '家具', 20000, '2023-02-15'),
(5, '赵六', '华西', '电子产品', 9000, '2023-03-05');

-- 按销售员分组统计
SELECT 
    salesperson,
    COUNT(*) as sale_count,
    SUM(sale_amount) as total_sales,
    AVG(sale_amount) as avg_sale,
    MAX(sale_amount) as max_sale
FROM sales
GROUP BY salesperson
ORDER BY total_sales DESC;

-- 按地区和产品类别分组
SELECT 
    region,
    product_category,
    COUNT(*) as sale_count,
    SUM(sale_amount) as total_sales
FROM sales
GROUP BY region, product_category
ORDER BY region, total_sales DESC;

-- 按月份统计各地区销售情况
SELECT 
    DATE_FORMAT(sale_date, '%Y-%m') as sale_month,
    region,
    SUM(sale_amount) as monthly_sales
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m'), region
ORDER BY sale_month, region;

# 案例2:学生成绩分析

-- 创建成绩表
CREATE TABLE student_scores (
    student_id INT,
    student_name VARCHAR(50),
    subject VARCHAR(30),
    score INT,
    exam_date DATE
);

-- 插入测试数据
INSERT INTO student_scores VALUES
(1, '张三', '数学', 85, '2023-06-15'),
(1, '张三', '语文', 78, '2023-06-16'),
(1, '张三', '英语', 92, '2023-06-17'),
(2, '李四', '数学', 76, '2023-06-15'),
(2, '李四', '语文', 88, '2023-06-16'),
(2, '李四', '英语', 84, '2023-06-17');

-- 按学生统计总分和平均分
SELECT 
    student_id,
    student_name,
    COUNT(*) as subject_count,
    SUM(score) as total_score,
    AVG(score) as avg_score,
    MIN(score) as min_score,
    MAX(score) as max_score
FROM student_scores
GROUP BY student_id, student_name;

-- 按科目统计平均分
SELECT 
    subject,
    COUNT(*) as student_count,
    AVG(score) as avg_score,
    MIN(score) as min_score,
    MAX(score) as max_score
FROM student_scores
GROUP BY subject
ORDER BY avg_score DESC;

-- 查询平均分大于80的学生
SELECT 
    student_id,
    student_name,
    AVG(score) as avg_score
FROM student_scores
GROUP BY student_id, student_name
HAVING AVG(score) > 80;

# 高级分组技巧

# 1. ROLLUP - 分层汇总

-- MySQL中使用WITH ROLLUP
SELECT 
    department,
    position,
    COUNT(*) as employee_count,
    SUM(salary) as total_salary
FROM employees
GROUP BY department, position WITH ROLLUP;

-- 手动实现ROLLUP效果
SELECT department, position, COUNT(*), SUM(salary)
FROM employees
GROUP BY department, position
UNION ALL
SELECT department, '小计', COUNT(*), SUM(salary)
FROM employees
GROUP BY department
UNION ALL
SELECT '总计', '', COUNT(*), SUM(salary)
FROM employees;

# 2. 动态分组

-- 根据条件动态分组
SELECT 
    CASE 
        WHEN YEAR(hire_date) < 2020 THEN '老员工'
        WHEN YEAR(hire_date) = 2020 THEN '2020年入职'
        ELSE '新员工'
    END as employee_type,
    COUNT(*) as count,
    AVG(salary) as avg_salary
FROM employees
GROUP BY 
    CASE 
        WHEN YEAR(hire_date) < 2020 THEN '老员工'
        WHEN YEAR(hire_date) = 2020 THEN '2020年入职'
        ELSE '新员工'
    END;

# 3. 条件计数

-- 使用条件聚合
SELECT 
    department,
    COUNT(*) as total_count,
    SUM(CASE WHEN salary > 8000 THEN 1 ELSE 0 END) as high_salary_count,
    SUM(CASE WHEN salary <= 8000 THEN 1 ELSE 0 END) as normal_salary_count,
    AVG(CASE WHEN salary > 8000 THEN salary END) as avg_high_salary
FROM employees
GROUP BY department;

# 性能优化

# 1. 索引优化

-- 为分组列创建索引
CREATE INDEX idx_department ON employees(department);
CREATE INDEX idx_dept_position ON employees(department, position);

-- 复合索引顺序很重要
-- 如果经常按(department, position)分组,创建复合索引

# 2. 避免不必要的分组

-- 低效:不必要的分组
SELECT department, COUNT(*)
FROM employees
WHERE department = '技术部'
GROUP BY department;

-- 高效:直接计数
SELECT COUNT(*)
FROM employees
WHERE department = '技术部';

# 3. 合理使用HAVING

-- 低效:在HAVING中使用非聚合条件
SELECT department, COUNT(*)
FROM employees
GROUP BY department
HAVING department = '技术部';

-- 高效:在WHERE中过滤
SELECT department, COUNT(*)
FROM employees
WHERE department = '技术部'
GROUP BY department;

# 常见错误

# 1. SELECT列表错误

-- 错误:SELECT中包含非分组列
SELECT name, department, COUNT(*)
FROM employees
GROUP BY department;
-- 错误原因:name不在GROUP BY中,也不是聚合函数

-- 正确:只选择分组列和聚合函数
SELECT department, COUNT(*)
FROM employees
GROUP BY department;

# 2. HAVING vs WHERE混淆

-- 错误:在WHERE中使用聚合函数
SELECT department, COUNT(*)
FROM employees
WHERE COUNT(*) > 2
GROUP BY department;

-- 正确:在HAVING中使用聚合函数
SELECT department, COUNT(*)
FROM employees
GROUP BY department
HAVING COUNT(*) > 2;

# 3. NULL值处理

-- 注意NULL值在分组中的行为
SELECT manager_id, COUNT(*)
FROM employees
GROUP BY manager_id;
-- NULL值会被分为一组

-- 排除NULL值
SELECT manager_id, COUNT(*)
FROM employees
WHERE manager_id IS NOT NULL
GROUP BY manager_id;

# 最佳实践

# 1. 查询优化

  • 在WHERE子句中尽早过滤数据
  • 为分组列创建适当的索引
  • 避免在SELECT中使用非必要的列

# 2. 可读性

  • 使用有意义的别名
  • 合理排序结果
  • 复杂分组考虑拆分为多个步骤

# 3. 数据完整性

  • 注意NULL值的处理
  • 验证聚合结果的合理性
  • 考虑数据类型的精度问题

# 4. 性能监控

  • 使用EXPLAIN分析执行计划
  • 监控大数据量分组查询的性能
  • 考虑使用物化视图缓存复杂分组结果

# 总结

GROUP BY是SQL中进行数据分析和统计的重要工具。正确使用GROUP BY可以高效地进行数据汇总和分析,但需要注意语法规则、性能优化和常见陷阱。在实际应用中,GROUP BY经常与聚合函数、HAVING子句、ORDER BY等结合使用,形成强大的数据分析能力。