分组查询 GROUP BY
# 分组查询 GROUP BY
# 概述
GROUP BY子句用于将查询结果按照一个或多个列进行分组,通常与聚合函数一起使用,对每个分组进行统计计算。
# 基本语法
SELECT column1, column2, aggregate_function(column3)
FROM table_name
WHERE condition
GROUP BY column1, column2
HAVING group_condition
ORDER BY column1;
# 测试数据准备
-- 创建员工表
CREATE TABLE employees (
employee_id INT PRIMARY KEY,
name VARCHAR(50),
department VARCHAR(30),
position VARCHAR(30),
salary DECIMAL(10,2),
hire_date DATE,
manager_id INT
);
-- 插入测试数据
INSERT INTO employees VALUES
(1, '张三', '技术部', '高级工程师', 12000, '2020-01-15', NULL),
(2, '李四', '技术部', '工程师', 8000, '2021-03-20', 1),
(3, '王五', '技术部', '初级工程师', 6000, '2022-06-10', 1),
(4, '赵六', '销售部', '销售经理', 10000, '2019-08-05', NULL),
(5, '钱七', '销售部', '销售代表', 7000, '2021-11-12', 4),
(6, '孙八', '销售部', '销售代表', 6500, '2022-02-28', 4),
(7, '周九', '人事部', '人事经理', 9000, '2020-05-18', NULL),
(8, '吴十', '人事部', '人事专员', 5500, '2021-09-30', 7),
(9, '郑一', '财务部', '财务经理', 11000, '2019-12-01', NULL),
(10, '王二', '财务部', '会计', 6800, '2022-01-20', 9);
-- 创建订单表
CREATE TABLE orders (
order_id INT PRIMARY KEY,
customer_id INT,
order_date DATE,
total_amount DECIMAL(10,2),
status VARCHAR(20)
);
-- 插入订单数据
INSERT INTO orders VALUES
(1, 101, '2023-01-15', 1500.00, '已完成'),
(2, 102, '2023-01-16', 2300.00, '已完成'),
(3, 101, '2023-02-10', 800.00, '已完成'),
(4, 103, '2023-02-15', 1200.00, '处理中'),
(5, 102, '2023-03-05', 3500.00, '已完成'),
(6, 104, '2023-03-10', 900.00, '已取消'),
(7, 101, '2023-03-20', 2100.00, '已完成'),
(8, 105, '2023-04-01', 1800.00, '已完成');
# 单列分组
# 基本分组统计
-- 按部门统计员工数量
SELECT department, COUNT(*) as employee_count
FROM employees
GROUP BY department;
-- 按部门统计平均工资
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department;
-- 按部门统计工资总和
SELECT department, SUM(salary) as total_salary
FROM employees
GROUP BY department;
# 多种聚合函数组合
-- 按部门统计详细信息
SELECT
department,
COUNT(*) as employee_count,
AVG(salary) as avg_salary,
MIN(salary) as min_salary,
MAX(salary) as max_salary,
SUM(salary) as total_salary
FROM employees
GROUP BY department;
# 多列分组
-- 按部门和职位分组
SELECT
department,
position,
COUNT(*) as count,
AVG(salary) as avg_salary
FROM employees
GROUP BY department, position
ORDER BY department, position;
-- 按年份和月份分组统计订单
SELECT
YEAR(order_date) as order_year,
MONTH(order_date) as order_month,
COUNT(*) as order_count,
SUM(total_amount) as total_sales
FROM orders
GROUP BY YEAR(order_date), MONTH(order_date)
ORDER BY order_year, order_month;
# HAVING子句
HAVING子句用于对分组后的结果进行过滤,类似于WHERE子句,但HAVING是对分组后的数据进行筛选。
# 基本HAVING用法
-- 查询员工数量大于1的部门
SELECT department, COUNT(*) as employee_count
FROM employees
GROUP BY department
HAVING COUNT(*) > 1;
-- 查询平均工资超过8000的部门
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 8000;
# WHERE vs HAVING
-- WHERE:分组前过滤
SELECT department, AVG(salary) as avg_salary
FROM employees
WHERE salary > 6000 -- 先过滤工资大于6000的员工
GROUP BY department;
-- HAVING:分组后过滤
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 8000; -- 过滤平均工资大于8000的部门
-- 组合使用
SELECT department, AVG(salary) as avg_salary
FROM employees
WHERE hire_date >= '2020-01-01' -- 分组前过滤
GROUP BY department
HAVING COUNT(*) >= 2; -- 分组后过滤
# 复杂分组查询
# 嵌套分组
-- 查询每个部门工资最高的员工信息
SELECT e1.*
FROM employees e1
WHERE e1.salary = (
SELECT MAX(e2.salary)
FROM employees e2
WHERE e2.department = e1.department
);
-- 使用窗口函数实现
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rn
FROM employees
) t
WHERE rn = 1;
# 条件分组
-- 按工资等级分组
SELECT
CASE
WHEN salary < 6000 THEN '低薪'
WHEN salary BETWEEN 6000 AND 9000 THEN '中薪'
ELSE '高薪'
END as salary_level,
COUNT(*) as employee_count,
AVG(salary) as avg_salary
FROM employees
GROUP BY
CASE
WHEN salary < 6000 THEN '低薪'
WHEN salary BETWEEN 6000 AND 9000 THEN '中薪'
ELSE '高薪'
END;
# 时间分组
-- 按年分组统计
SELECT
YEAR(hire_date) as hire_year,
COUNT(*) as hire_count
FROM employees
GROUP BY YEAR(hire_date)
ORDER BY hire_year;
-- 按季度分组统计订单
SELECT
YEAR(order_date) as order_year,
QUARTER(order_date) as order_quarter,
COUNT(*) as order_count,
SUM(total_amount) as total_sales
FROM orders
GROUP BY YEAR(order_date), QUARTER(order_date)
ORDER BY order_year, order_quarter;
-- 按星期分组统计
SELECT
DAYNAME(order_date) as day_of_week,
COUNT(*) as order_count,
AVG(total_amount) as avg_amount
FROM orders
GROUP BY DAYOFWEEK(order_date), DAYNAME(order_date)
ORDER BY DAYOFWEEK(order_date);
# 实战案例
# 案例1:销售数据分析
-- 创建销售数据表
CREATE TABLE sales (
sale_id INT PRIMARY KEY,
salesperson VARCHAR(50),
region VARCHAR(30),
product_category VARCHAR(30),
sale_amount DECIMAL(10,2),
sale_date DATE
);
-- 插入测试数据
INSERT INTO sales VALUES
(1, '张三', '华北', '电子产品', 15000, '2023-01-15'),
(2, '李四', '华南', '服装', 8000, '2023-01-20'),
(3, '王五', '华东', '电子产品', 12000, '2023-02-10'),
(4, '张三', '华北', '家具', 20000, '2023-02-15'),
(5, '赵六', '华西', '电子产品', 9000, '2023-03-05');
-- 按销售员分组统计
SELECT
salesperson,
COUNT(*) as sale_count,
SUM(sale_amount) as total_sales,
AVG(sale_amount) as avg_sale,
MAX(sale_amount) as max_sale
FROM sales
GROUP BY salesperson
ORDER BY total_sales DESC;
-- 按地区和产品类别分组
SELECT
region,
product_category,
COUNT(*) as sale_count,
SUM(sale_amount) as total_sales
FROM sales
GROUP BY region, product_category
ORDER BY region, total_sales DESC;
-- 按月份统计各地区销售情况
SELECT
DATE_FORMAT(sale_date, '%Y-%m') as sale_month,
region,
SUM(sale_amount) as monthly_sales
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m'), region
ORDER BY sale_month, region;
# 案例2:学生成绩分析
-- 创建成绩表
CREATE TABLE student_scores (
student_id INT,
student_name VARCHAR(50),
subject VARCHAR(30),
score INT,
exam_date DATE
);
-- 插入测试数据
INSERT INTO student_scores VALUES
(1, '张三', '数学', 85, '2023-06-15'),
(1, '张三', '语文', 78, '2023-06-16'),
(1, '张三', '英语', 92, '2023-06-17'),
(2, '李四', '数学', 76, '2023-06-15'),
(2, '李四', '语文', 88, '2023-06-16'),
(2, '李四', '英语', 84, '2023-06-17');
-- 按学生统计总分和平均分
SELECT
student_id,
student_name,
COUNT(*) as subject_count,
SUM(score) as total_score,
AVG(score) as avg_score,
MIN(score) as min_score,
MAX(score) as max_score
FROM student_scores
GROUP BY student_id, student_name;
-- 按科目统计平均分
SELECT
subject,
COUNT(*) as student_count,
AVG(score) as avg_score,
MIN(score) as min_score,
MAX(score) as max_score
FROM student_scores
GROUP BY subject
ORDER BY avg_score DESC;
-- 查询平均分大于80的学生
SELECT
student_id,
student_name,
AVG(score) as avg_score
FROM student_scores
GROUP BY student_id, student_name
HAVING AVG(score) > 80;
# 高级分组技巧
# 1. ROLLUP - 分层汇总
-- MySQL中使用WITH ROLLUP
SELECT
department,
position,
COUNT(*) as employee_count,
SUM(salary) as total_salary
FROM employees
GROUP BY department, position WITH ROLLUP;
-- 手动实现ROLLUP效果
SELECT department, position, COUNT(*), SUM(salary)
FROM employees
GROUP BY department, position
UNION ALL
SELECT department, '小计', COUNT(*), SUM(salary)
FROM employees
GROUP BY department
UNION ALL
SELECT '总计', '', COUNT(*), SUM(salary)
FROM employees;
# 2. 动态分组
-- 根据条件动态分组
SELECT
CASE
WHEN YEAR(hire_date) < 2020 THEN '老员工'
WHEN YEAR(hire_date) = 2020 THEN '2020年入职'
ELSE '新员工'
END as employee_type,
COUNT(*) as count,
AVG(salary) as avg_salary
FROM employees
GROUP BY
CASE
WHEN YEAR(hire_date) < 2020 THEN '老员工'
WHEN YEAR(hire_date) = 2020 THEN '2020年入职'
ELSE '新员工'
END;
# 3. 条件计数
-- 使用条件聚合
SELECT
department,
COUNT(*) as total_count,
SUM(CASE WHEN salary > 8000 THEN 1 ELSE 0 END) as high_salary_count,
SUM(CASE WHEN salary <= 8000 THEN 1 ELSE 0 END) as normal_salary_count,
AVG(CASE WHEN salary > 8000 THEN salary END) as avg_high_salary
FROM employees
GROUP BY department;
# 性能优化
# 1. 索引优化
-- 为分组列创建索引
CREATE INDEX idx_department ON employees(department);
CREATE INDEX idx_dept_position ON employees(department, position);
-- 复合索引顺序很重要
-- 如果经常按(department, position)分组,创建复合索引
# 2. 避免不必要的分组
-- 低效:不必要的分组
SELECT department, COUNT(*)
FROM employees
WHERE department = '技术部'
GROUP BY department;
-- 高效:直接计数
SELECT COUNT(*)
FROM employees
WHERE department = '技术部';
# 3. 合理使用HAVING
-- 低效:在HAVING中使用非聚合条件
SELECT department, COUNT(*)
FROM employees
GROUP BY department
HAVING department = '技术部';
-- 高效:在WHERE中过滤
SELECT department, COUNT(*)
FROM employees
WHERE department = '技术部'
GROUP BY department;
# 常见错误
# 1. SELECT列表错误
-- 错误:SELECT中包含非分组列
SELECT name, department, COUNT(*)
FROM employees
GROUP BY department;
-- 错误原因:name不在GROUP BY中,也不是聚合函数
-- 正确:只选择分组列和聚合函数
SELECT department, COUNT(*)
FROM employees
GROUP BY department;
# 2. HAVING vs WHERE混淆
-- 错误:在WHERE中使用聚合函数
SELECT department, COUNT(*)
FROM employees
WHERE COUNT(*) > 2
GROUP BY department;
-- 正确:在HAVING中使用聚合函数
SELECT department, COUNT(*)
FROM employees
GROUP BY department
HAVING COUNT(*) > 2;
# 3. NULL值处理
-- 注意NULL值在分组中的行为
SELECT manager_id, COUNT(*)
FROM employees
GROUP BY manager_id;
-- NULL值会被分为一组
-- 排除NULL值
SELECT manager_id, COUNT(*)
FROM employees
WHERE manager_id IS NOT NULL
GROUP BY manager_id;
# 最佳实践
# 1. 查询优化
- 在WHERE子句中尽早过滤数据
- 为分组列创建适当的索引
- 避免在SELECT中使用非必要的列
# 2. 可读性
- 使用有意义的别名
- 合理排序结果
- 复杂分组考虑拆分为多个步骤
# 3. 数据完整性
- 注意NULL值的处理
- 验证聚合结果的合理性
- 考虑数据类型的精度问题
# 4. 性能监控
- 使用EXPLAIN分析执行计划
- 监控大数据量分组查询的性能
- 考虑使用物化视图缓存复杂分组结果
# 总结
GROUP BY是SQL中进行数据分析和统计的重要工具。正确使用GROUP BY可以高效地进行数据汇总和分析,但需要注意语法规则、性能优化和常见陷阱。在实际应用中,GROUP BY经常与聚合函数、HAVING子句、ORDER BY等结合使用,形成强大的数据分析能力。