SQL查询优化

# SQL查询优化

查询优化是提升数据库性能的关键技术,通过优化SQL语句和查询策略来提高查询效率

# 概述

查询优化是一个系统性工程,涉及SQL语句编写、索引设计、数据库配置等多个方面。本文将从实用角度介绍各种查询优化技术和最佳实践。

# 查询优化基础

# 1. 查询执行过程

1. SQL解析 → 2. 查询优化 → 3. 执行计划生成 → 4. 查询执行 → 5. 结果返回

# 2. 性能指标

  • 响应时间:查询完成所需时间
  • 吞吐量:单位时间内处理的查询数
  • 资源消耗:CPU、内存、I/O使用量
  • 并发性能:多用户同时访问的性能

# SQL语句优化

# 1. SELECT语句优化

# 避免SELECT *

-- 不推荐:查询所有列
SELECT * FROM users WHERE city = 'Beijing';

-- 推荐:只查询需要的列
SELECT id, name, email FROM users WHERE city = 'Beijing';

# 使用LIMIT限制结果集

-- 分页查询
SELECT id, name, email 
FROM users 
WHERE city = 'Beijing'
ORDER BY created_at DESC
LIMIT 20 OFFSET 0;

-- 只需要检查是否存在记录
SELECT 1 FROM orders WHERE user_id = 123 LIMIT 1;

# 合理使用DISTINCT

-- 不推荐:在大表上使用DISTINCT
SELECT DISTINCT city FROM users;

-- 推荐:使用GROUP BY替代
SELECT city FROM users GROUP BY city;

-- 或者使用EXISTS
SELECT city FROM cities c
WHERE EXISTS (SELECT 1 FROM users u WHERE u.city = c.name);

# 2. WHERE子句优化

# 索引友好的条件

-- 推荐:使用索引列作为条件
SELECT * FROM orders WHERE user_id = 123;

-- 不推荐:在索引列上使用函数
SELECT * FROM orders WHERE YEAR(order_date) = 2024;

-- 推荐:使用范围条件
SELECT * FROM orders 
WHERE order_date >= '2024-01-01' 
AND order_date < '2025-01-01';

# 条件顺序优化

-- 将选择性高的条件放在前面
SELECT * FROM users 
WHERE email = 'john@example.com'  -- 高选择性
AND city = 'Beijing'              -- 低选择性
AND status = 'active';            -- 中等选择性

# 避免隐式类型转换

-- 不推荐:字符串和数字比较
SELECT * FROM users WHERE user_id = '123';

-- 推荐:类型匹配
SELECT * FROM users WHERE user_id = 123;

-- 不推荐:日期字符串比较
SELECT * FROM orders WHERE order_date = '2024-01-01';

-- 推荐:使用日期类型
SELECT * FROM orders WHERE order_date = DATE('2024-01-01');

# 3. JOIN优化

# 选择合适的JOIN类型

-- INNER JOIN:只返回匹配的记录
SELECT u.name, o.order_date
FROM users u
INNER JOIN orders o ON u.id = o.user_id;

-- LEFT JOIN:返回左表所有记录
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;

-- EXISTS:检查存在性(通常比IN更高效)
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

# JOIN条件优化

-- 确保JOIN条件使用索引
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_users_id ON users(id);

-- 多表JOIN时的顺序很重要
SELECT u.name, o.order_date, p.product_name
FROM users u
INNER JOIN orders o ON u.id = o.user_id        -- 先连接主要表
INNER JOIN order_items oi ON o.id = oi.order_id -- 再连接详情表
INNER JOIN products p ON oi.product_id = p.id;  -- 最后连接产品表

# 避免笛卡尔积

-- 错误:缺少JOIN条件导致笛卡尔积
SELECT * FROM users, orders;

-- 正确:明确JOIN条件
SELECT * FROM users u
INNER JOIN orders o ON u.id = o.user_id;

# 4. 子查询优化

# 将子查询转换为JOIN

-- 不推荐:相关子查询
SELECT * FROM users u
WHERE u.id IN (
    SELECT user_id FROM orders WHERE total_amount > 1000
);

-- 推荐:转换为JOIN
SELECT DISTINCT u.*
FROM users u
INNER JOIN orders o ON u.id = o.user_id
WHERE o.total_amount > 1000;

# 使用EXISTS替代IN

-- 不推荐:IN子查询(当子查询结果集很大时)
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders);

-- 推荐:EXISTS子查询
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

# 避免在SELECT列表中使用子查询

-- 不推荐:SELECT列表中的子查询
SELECT 
    u.name,
    (SELECT COUNT(*) FROM orders WHERE user_id = u.id) as order_count
FROM users u;

-- 推荐:使用LEFT JOIN
SELECT 
    u.name,
    COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;

# 5. GROUP BY和ORDER BY优化

# GROUP BY优化

-- 确保GROUP BY列有索引
CREATE INDEX idx_orders_user_date ON orders(user_id, order_date);

-- 避免在GROUP BY中使用表达式
-- 不推荐
SELECT YEAR(order_date), COUNT(*)
FROM orders
GROUP BY YEAR(order_date);

-- 推荐:使用计算列或预处理
ALTER TABLE orders ADD COLUMN order_year INT;
UPDATE orders SET order_year = YEAR(order_date);
CREATE INDEX idx_orders_year ON orders(order_year);

SELECT order_year, COUNT(*)
FROM orders
GROUP BY order_year;

# ORDER BY优化

-- 为ORDER BY列创建索引
CREATE INDEX idx_users_created_at ON users(created_at);

-- 利用索引的自然顺序
SELECT * FROM users ORDER BY id;  -- 主键通常已有索引

-- 复合索引的列顺序要匹配ORDER BY
CREATE INDEX idx_orders_user_date ON orders(user_id, order_date);
SELECT * FROM orders 
WHERE user_id = 123 
ORDER BY order_date;  -- 可以利用索引

# 索引优化策略

# 1. 索引设计原则

# 选择性原则

-- 计算列的选择性
SELECT 
    COUNT(DISTINCT email) / COUNT(*) as email_selectivity,
    COUNT(DISTINCT city) / COUNT(*) as city_selectivity,
    COUNT(DISTINCT gender) / COUNT(*) as gender_selectivity
FROM users;

-- 为高选择性列创建索引
CREATE INDEX idx_users_email ON users(email);  -- 选择性高
-- 不为低选择性列单独创建索引
-- CREATE INDEX idx_users_gender ON users(gender);  -- 选择性低,不推荐

# 复合索引设计

-- 最左前缀原则
CREATE INDEX idx_orders_user_date_status ON orders(user_id, order_date, status);

-- 以下查询可以使用该索引:
-- WHERE user_id = 123
-- WHERE user_id = 123 AND order_date = '2024-01-01'
-- WHERE user_id = 123 AND order_date = '2024-01-01' AND status = 'completed'

-- 以下查询无法使用该索引:
-- WHERE order_date = '2024-01-01'  -- 跳过了最左列
-- WHERE status = 'completed'       -- 跳过了最左列

# 2. 覆盖索引

-- 创建覆盖索引,包含查询所需的所有列
CREATE INDEX idx_users_city_name_email ON users(city, name, email);

-- 以下查询可以完全通过索引满足,无需回表
SELECT name, email FROM users WHERE city = 'Beijing';

# 3. 部分索引

-- PostgreSQL:为活跃用户创建部分索引
CREATE INDEX idx_active_users_email ON users(email) 
WHERE status = 'active';

-- MySQL:使用函数索引(8.0+)
CREATE INDEX idx_users_email_domain ON users((SUBSTRING_INDEX(email, '@', -1)));

# 执行计划分析

# 1. 查看执行计划

-- MySQL
EXPLAIN SELECT * FROM users WHERE city = 'Beijing';
EXPLAIN FORMAT=JSON SELECT * FROM users WHERE city = 'Beijing';

-- PostgreSQL
EXPLAIN SELECT * FROM users WHERE city = 'Beijing';
EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM users WHERE city = 'Beijing';

-- SQL Server
SET SHOWPLAN_ALL ON;
SELECT * FROM users WHERE city = 'Beijing';
SET SHOWPLAN_ALL OFF;

-- Oracle
EXPLAIN PLAN FOR SELECT * FROM users WHERE city = 'Beijing';
SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY);

# 2. 执行计划关键指标

# MySQL EXPLAIN输出解读

EXPLAIN SELECT u.name, o.order_date
FROM users u
INNER JOIN orders o ON u.id = o.user_id
WHERE u.city = 'Beijing';

-- 关注以下字段:
-- type: 连接类型(const > eq_ref > ref > range > index > ALL)
-- key: 使用的索引
-- rows: 扫描的行数
-- Extra: 额外信息(Using index, Using filesort等)

# 性能警告信号

-- 全表扫描
type = 'ALL'

-- 文件排序
Extra = 'Using filesort'

-- 临时表
Extra = 'Using temporary'

-- 扫描行数过多
rows > 10000

# 查询重写技巧

# 1. UNION优化

-- 不推荐:UNION(会去重,性能较差)
SELECT name FROM users WHERE city = 'Beijing'
UNION
SELECT name FROM users WHERE city = 'Shanghai';

-- 推荐:UNION ALL(不去重,性能更好)
SELECT name FROM users WHERE city = 'Beijing'
UNION ALL
SELECT name FROM users WHERE city = 'Shanghai';

-- 更好:使用OR条件
SELECT name FROM users WHERE city IN ('Beijing', 'Shanghai');

# 2. 分页查询优化

-- 传统分页(大偏移量时性能差)
SELECT * FROM users ORDER BY id LIMIT 100000, 20;

-- 优化:使用游标分页
SELECT * FROM users WHERE id > 100000 ORDER BY id LIMIT 20;

-- 或使用延迟关联
SELECT u.* FROM users u
INNER JOIN (
    SELECT id FROM users ORDER BY id LIMIT 100000, 20
) t ON u.id = t.id;

# 3. 条件下推

-- 不推荐:在外层过滤
SELECT * FROM (
    SELECT u.*, o.order_date
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id
) t
WHERE t.city = 'Beijing';

-- 推荐:将条件下推到内层
SELECT u.*, o.order_date
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.city = 'Beijing';

# 实战案例

# 案例1:电商订单查询优化

# 原始查询(性能差)

SELECT 
    u.name,
    u.email,
    COUNT(o.id) as order_count,
    SUM(o.total_amount) as total_spent,
    (SELECT AVG(total_amount) FROM orders WHERE user_id = u.id) as avg_order
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE YEAR(u.created_at) = 2024
GROUP BY u.id, u.name, u.email
ORDER BY total_spent DESC;

# 优化后查询

-- 1. 添加索引
CREATE INDEX idx_users_created_at ON users(created_at);
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_user_amount ON orders(user_id, total_amount);

-- 2. 优化查询
SELECT 
    u.name,
    u.email,
    COUNT(o.id) as order_count,
    COALESCE(SUM(o.total_amount), 0) as total_spent,
    COALESCE(AVG(o.total_amount), 0) as avg_order
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at >= '2024-01-01' 
  AND u.created_at < '2025-01-01'
GROUP BY u.id, u.name, u.email
ORDER BY total_spent DESC
LIMIT 100;

# 案例2:复杂报表查询优化

# 原始查询(多重子查询)

SELECT 
    p.name,
    p.category,
    (SELECT COUNT(*) FROM order_items oi 
     JOIN orders o ON oi.order_id = o.id 
     WHERE oi.product_id = p.id AND o.order_date >= '2024-01-01') as sales_count,
    (SELECT SUM(oi.quantity * oi.price) FROM order_items oi 
     JOIN orders o ON oi.order_id = o.id 
     WHERE oi.product_id = p.id AND o.order_date >= '2024-01-01') as revenue
FROM products p
WHERE p.status = 'active';

# 优化后查询(使用CTE和窗口函数)

WITH product_sales AS (
    SELECT 
        oi.product_id,
        COUNT(*) as sales_count,
        SUM(oi.quantity * oi.price) as revenue
    FROM order_items oi
    JOIN orders o ON oi.order_id = o.id
    WHERE o.order_date >= '2024-01-01'
    GROUP BY oi.product_id
)
SELECT 
    p.name,
    p.category,
    COALESCE(ps.sales_count, 0) as sales_count,
    COALESCE(ps.revenue, 0) as revenue
FROM products p
LEFT JOIN product_sales ps ON p.id = ps.product_id
WHERE p.status = 'active'
ORDER BY revenue DESC;

# 案例3:大数据量聚合查询优化

# 使用物化视图

-- 创建物化视图(PostgreSQL)
CREATE MATERIALIZED VIEW daily_sales_summary AS
SELECT 
    DATE(order_date) as sale_date,
    COUNT(*) as order_count,
    SUM(total_amount) as total_revenue,
    AVG(total_amount) as avg_order_value
FROM orders
GROUP BY DATE(order_date);

-- 创建索引
CREATE INDEX idx_daily_sales_date ON daily_sales_summary(sale_date);

-- 定期刷新
REFRESH MATERIALIZED VIEW daily_sales_summary;

-- 查询使用物化视图
SELECT * FROM daily_sales_summary 
WHERE sale_date BETWEEN '2024-01-01' AND '2024-12-31';

# 数据库特定优化

# MySQL优化

-- 查询缓存(MySQL 5.7及以下)
SET GLOBAL query_cache_size = 268435456;  -- 256MB
SET GLOBAL query_cache_type = ON;

-- 优化器提示(MySQL 8.0+)
SELECT /*+ USE_INDEX(users idx_users_city) */ 
       name, email 
FROM users 
WHERE city = 'Beijing';

-- 分区表
CREATE TABLE orders_partitioned (
    id INT AUTO_INCREMENT,
    order_date DATE,
    user_id INT,
    total_amount DECIMAL(10,2),
    PRIMARY KEY (id, order_date)
) PARTITION BY RANGE (YEAR(order_date)) (
    PARTITION p2022 VALUES LESS THAN (2023),
    PARTITION p2023 VALUES LESS THAN (2024),
    PARTITION p2024 VALUES LESS THAN (2025)
);

# PostgreSQL优化

-- 统计信息更新
ANALYZE users;

-- 并行查询
SET max_parallel_workers_per_gather = 4;
SELECT COUNT(*) FROM large_table;

-- 部分索引
CREATE INDEX idx_active_users ON users(email) WHERE status = 'active';

-- 表达式索引
CREATE INDEX idx_users_lower_email ON users(LOWER(email));

# SQL Server优化

-- 更新统计信息
UPDATE STATISTICS users;

-- 查询提示
SELECT name, email
FROM users WITH (INDEX(idx_users_city))
WHERE city = 'Beijing';

-- 列存储索引
CREATE COLUMNSTORE INDEX cci_orders ON orders;

# 监控和诊断

# 1. 慢查询监控

-- MySQL:启用慢查询日志
SET GLOBAL slow_query_log = 'ON';
SET GLOBAL long_query_time = 2;  -- 记录超过2秒的查询
SET GLOBAL log_queries_not_using_indexes = 'ON';

-- 查看慢查询
SELECT * FROM mysql.slow_log ORDER BY start_time DESC LIMIT 10;

# 2. 性能监控查询

-- MySQL:查看当前运行的查询
SHOW PROCESSLIST;

-- PostgreSQL:查看活跃查询
SELECT 
    pid,
    now() - pg_stat_activity.query_start AS duration,
    query
FROM pg_stat_activity
WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes';

-- SQL Server:查看等待统计
SELECT 
    wait_type,
    waiting_tasks_count,
    wait_time_ms,
    max_wait_time_ms
FROM sys.dm_os_wait_stats
ORDER BY wait_time_ms DESC;

# 最佳实践总结

# 1. 查询编写原则

  • 明确需求:只查询必要的列和行
  • 合理使用索引:为WHERE、JOIN、ORDER BY条件创建索引
  • 避免复杂子查询:优先使用JOIN替代子查询
  • 注意数据类型:避免隐式类型转换
  • 使用LIMIT:限制大结果集的返回

# 2. 索引设计原则

  • 选择性优先:为高选择性列创建索引
  • 复合索引:遵循最左前缀原则
  • 覆盖索引:减少回表操作
  • 定期维护:更新统计信息,重建碎片化索引

# 3. 性能监控

  • 建立基线:记录正常情况下的性能指标
  • 持续监控:定期检查慢查询和资源使用
  • 及时优化:发现性能问题及时处理
  • 测试验证:优化后要验证效果

# 4. 开发规范

  • 代码审查:SQL语句要经过性能审查
  • 测试环境:在接近生产的环境中测试性能
  • 文档记录:记录优化过程和效果
  • 知识分享:团队内分享优化经验

通过系统性的查询优化,可以显著提升数据库性能,改善用户体验,降低系统资源消耗。优化是一个持续的过程,需要根据业务发展和数据增长不断调整策略。