JOIN表连接

2024/1/1

# JOIN表连接

点击勘误issues (opens new window),哪吒感谢大家的阅读

# JOIN概述

JOIN是SQL中用于连接两个或多个表的操作,通过指定的连接条件将不同表中的相关数据组合在一起。JOIN是关系数据库查询的核心功能之一。

# 基本语法

SELECT 列名列表
FROM1
JOIN2 ON 连接条件
WHERE 过滤条件;

# 准备测试数据

为了更好地理解JOIN操作,我们先创建几个示例表:

-- 员工表
CREATE TABLE employees (
    emp_id INT PRIMARY KEY,
    name VARCHAR(100),
    department_id INT,
    salary DECIMAL(10,2),
    hire_date DATE
);

-- 部门表
CREATE TABLE departments (
    dept_id INT PRIMARY KEY,
    dept_name VARCHAR(100),
    location VARCHAR(100)
);

-- 项目表
CREATE TABLE projects (
    project_id INT PRIMARY KEY,
    project_name VARCHAR(100),
    start_date DATE,
    end_date DATE
);

-- 员工项目关联表
CREATE TABLE employee_projects (
    emp_id INT,
    project_id INT,
    role VARCHAR(50),
    PRIMARY KEY (emp_id, project_id)
);

-- 插入测试数据
INSERT INTO departments VALUES 
(1, 'IT部', '北京'),
(2, 'HR部', '上海'),
(3, '财务部', '广州'),
(4, '市场部', '深圳');

INSERT INTO employees VALUES 
(1, '张三', 1, 8000, '2020-01-15'),
(2, '李四', 1, 9000, '2019-03-20'),
(3, '王五', 2, 7000, '2021-06-10'),
(4, '赵六', 2, 7500, '2020-11-05'),
(5, '钱七', NULL, 6000, '2022-01-20'),
(6, '孙八', 3, 8500, '2018-09-15');

INSERT INTO projects VALUES 
(1, '电商平台', '2023-01-01', '2023-12-31'),
(2, '移动应用', '2023-03-01', '2023-09-30'),
(3, '数据分析', '2023-06-01', '2024-02-29');

INSERT INTO employee_projects VALUES 
(1, 1, '项目经理'),
(2, 1, '开发工程师'),
(1, 2, '技术顾问'),
(3, 3, '数据分析师');

# INNER JOIN(内连接)

INNER JOIN返回两个表中满足连接条件的记录。

# 基本语法

SELECT 列名列表
FROM1
INNER JOIN2 ON1.列名 =2.列名;

# 实例演示

-- 查询员工及其部门信息
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id;

-- 结果:只返回有部门分配的员工
-- 张三, 8000, IT部, 北京
-- 李四, 9000, IT部, 北京
-- 王五, 7000, HR部, 上海
-- 赵六, 7500, HR部, 上海
-- 孙八, 8500, 财务部, 广州

# 多表INNER JOIN

-- 查询员工、部门和项目信息
SELECT e.name, d.dept_name, p.project_name, ep.role
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id
INNER JOIN employee_projects ep ON e.emp_id = ep.emp_id
INNER JOIN projects p ON ep.project_id = p.project_id;

# 使用WHERE进一步过滤

-- 查询IT部门参与项目的员工
SELECT e.name, d.dept_name, p.project_name, ep.role
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id
INNER JOIN employee_projects ep ON e.emp_id = ep.emp_id
INNER JOIN projects p ON ep.project_id = p.project_id
WHERE d.dept_name = 'IT部';

# LEFT JOIN(左外连接)

LEFT JOIN返回左表的所有记录,以及右表中满足连接条件的记录。如果右表中没有匹配的记录,则用NULL填充。

# 基本语法

SELECT 列名列表
FROM1
LEFT JOIN2 ON1.列名 =2.列名;

# 实例演示

-- 查询所有员工及其部门信息(包括没有部门分配的员工)
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
LEFT JOIN departments d ON e.department_id = d.dept_id;

-- 结果:包含所有员工,没有部门的显示NULL
-- 张三, 8000, IT部, 北京
-- 李四, 9000, IT部, 北京
-- 王五, 7000, HR部, 上海
-- 赵六, 7500, HR部, 上海
-- 钱七, 6000, NULL, NULL
-- 孙八, 8500, 财务部, 广州

# 查找没有匹配的记录

-- 查询没有部门分配的员工
SELECT e.name, e.salary
FROM employees e
LEFT JOIN departments d ON e.department_id = d.dept_id
WHERE d.dept_id IS NULL;

-- 查询没有员工的部门
SELECT d.dept_name, d.location
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.department_id
WHERE e.emp_id IS NULL;

# 统计查询

-- 查询每个部门的员工数量
SELECT d.dept_name, COUNT(e.emp_id) as employee_count
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.department_id
GROUP BY d.dept_id, d.dept_name;

# RIGHT JOIN(右外连接)

RIGHT JOIN返回右表的所有记录,以及左表中满足连接条件的记录。

# 基本语法

SELECT 列名列表
FROM1
RIGHT JOIN2 ON1.列名 =2.列名;

# 实例演示

-- 查询所有部门及其员工信息(包括没有员工的部门)
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
RIGHT JOIN departments d ON e.department_id = d.dept_id;

-- 等价于LEFT JOIN的写法
SELECT e.name, e.salary, d.dept_name, d.location
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.department_id;

# FULL OUTER JOIN(全外连接)

FULL OUTER JOIN返回两个表中的所有记录,无论是否有匹配。MySQL不直接支持FULL OUTER JOIN,但可以通过UNION模拟。

# MySQL中的实现

-- 使用UNION模拟FULL OUTER JOIN
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
LEFT JOIN departments d ON e.department_id = d.dept_id
UNION
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
RIGHT JOIN departments d ON e.department_id = d.dept_id
WHERE e.emp_id IS NULL;

# 其他数据库的语法

-- PostgreSQL, SQL Server, Oracle
SELECT e.name, e.salary, d.dept_name, d.location
FROM employees e
FULL OUTER JOIN departments d ON e.department_id = d.dept_id;

# CROSS JOIN(交叉连接)

CROSS JOIN返回两个表的笛卡尔积,即第一个表的每一行与第二个表的每一行组合。

# 基本语法

SELECT 列名列表
FROM1
CROSS JOIN2;

-- 或者
SELECT 列名列表
FROM1,2;

# 实例演示

-- 生成员工和项目的所有组合
SELECT e.name, p.project_name
FROM employees e
CROSS JOIN projects p;

-- 注意:结果会有 6 * 3 = 18 行记录

# 实际应用场景

-- 生成日期序列和员工的组合(用于考勤统计)
SELECT e.name, dates.date_value
FROM employees e
CROSS JOIN (
    SELECT DATE_ADD('2023-01-01', INTERVAL n DAY) as date_value
    FROM (
        SELECT 0 as n UNION SELECT 1 UNION SELECT 2 UNION SELECT 3
        UNION SELECT 4 UNION SELECT 5 UNION SELECT 6
    ) numbers
    WHERE DATE_ADD('2023-01-01', INTERVAL n DAY) <= '2023-01-07'
) dates;

# SELF JOIN(自连接)

自连接是表与自身的连接,通常用于处理层次结构数据。

# 准备层次结构数据

-- 添加管理者字段
ALTER TABLE employees ADD COLUMN manager_id INT;

-- 更新管理关系
UPDATE employees SET manager_id = 2 WHERE emp_id = 1;
UPDATE employees SET manager_id = 1 WHERE emp_id IN (3, 4);
UPDATE employees SET manager_id = 3 WHERE emp_id = 5;

# 自连接示例

-- 查询员工及其直接上级
SELECT 
    e.name as employee_name,
    m.name as manager_name
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.emp_id;

-- 查询有下属的管理者
SELECT DISTINCT
    m.name as manager_name,
    COUNT(e.emp_id) as subordinate_count
FROM employees e
INNER JOIN employees m ON e.manager_id = m.emp_id
GROUP BY m.emp_id, m.name;

-- 查询同级同事(有相同管理者的员工)
SELECT 
    e1.name as employee1,
    e2.name as employee2,
    m.name as common_manager
FROM employees e1
INNER JOIN employees e2 ON e1.manager_id = e2.manager_id AND e1.emp_id < e2.emp_id
INNER JOIN employees m ON e1.manager_id = m.emp_id;

# 复杂JOIN查询

# 多表连接

-- 查询员工的完整信息:姓名、部门、项目、角色
SELECT 
    e.name as employee_name,
    d.dept_name,
    p.project_name,
    ep.role,
    e.salary
FROM employees e
LEFT JOIN departments d ON e.department_id = d.dept_id
LEFT JOIN employee_projects ep ON e.emp_id = ep.emp_id
LEFT JOIN projects p ON ep.project_id = p.project_id
ORDER BY e.name, p.project_name;

# 条件连接

-- 查询薪资高于部门平均薪资的员工
SELECT 
    e.name,
    e.salary,
    d.dept_name,
    dept_avg.avg_salary
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id
INNER JOIN (
    SELECT 
        department_id,
        AVG(salary) as avg_salary
    FROM employees 
    WHERE department_id IS NOT NULL
    GROUP BY department_id
) dept_avg ON e.department_id = dept_avg.department_id
WHERE e.salary > dept_avg.avg_salary;

# 使用子查询的JOIN

-- 查询参与项目最多的员工
SELECT 
    e.name,
    d.dept_name,
    project_count.count as project_count
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id
INNER JOIN (
    SELECT 
        emp_id,
        COUNT(*) as count
    FROM employee_projects
    GROUP BY emp_id
    HAVING COUNT(*) = (
        SELECT MAX(project_count)
        FROM (
            SELECT COUNT(*) as project_count
            FROM employee_projects
            GROUP BY emp_id
        ) counts
    )
) project_count ON e.emp_id = project_count.emp_id;

# JOIN性能优化

# 1. 索引优化

-- 在连接字段上创建索引
CREATE INDEX idx_emp_dept ON employees(department_id);
CREATE INDEX idx_emp_proj_emp ON employee_projects(emp_id);
CREATE INDEX idx_emp_proj_proj ON employee_projects(project_id);

-- 复合索引
CREATE INDEX idx_emp_dept_salary ON employees(department_id, salary);

# 2. 查询优化技巧

-- 优化前:大表在前
SELECT *
FROM large_table l
JOIN small_table s ON l.id = s.large_id;

-- 优化后:小表在前(某些数据库)
SELECT *
FROM small_table s
JOIN large_table l ON s.large_id = l.id;

-- 使用EXISTS替代IN子查询
-- 优化前
SELECT * FROM employees 
WHERE department_id IN (
    SELECT dept_id FROM departments WHERE location = '北京'
);

-- 优化后
SELECT * FROM employees e
WHERE EXISTS (
    SELECT 1 FROM departments d 
    WHERE d.dept_id = e.department_id AND d.location = '北京'
);

# 3. 避免笛卡尔积

-- 错误:忘记连接条件
SELECT e.name, d.dept_name
FROM employees e, departments d;  -- 产生笛卡尔积

-- 正确:添加连接条件
SELECT e.name, d.dept_name
FROM employees e, departments d
WHERE e.department_id = d.dept_id;

-- 更好:使用显式JOIN
SELECT e.name, d.dept_name
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id;

# 实战案例

# 案例1:员工薪资分析

-- 查询各部门薪资统计
SELECT 
    d.dept_name,
    COUNT(e.emp_id) as employee_count,
    AVG(e.salary) as avg_salary,
    MIN(e.salary) as min_salary,
    MAX(e.salary) as max_salary,
    SUM(e.salary) as total_salary
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.department_id
GROUP BY d.dept_id, d.dept_name
ORDER BY avg_salary DESC;

# 案例2:项目参与情况分析

-- 查询项目参与情况
SELECT 
    p.project_name,
    COUNT(ep.emp_id) as participant_count,
    GROUP_CONCAT(CONCAT(e.name, '(', ep.role, ')') SEPARATOR ', ') as participants,
    DATEDIFF(p.end_date, p.start_date) as duration_days
FROM projects p
LEFT JOIN employee_projects ep ON p.project_id = ep.project_id
LEFT JOIN employees e ON ep.emp_id = e.emp_id
GROUP BY p.project_id, p.project_name, p.start_date, p.end_date
ORDER BY participant_count DESC;

# 案例3:组织架构查询

-- 查询组织架构(递归查询,MySQL 8.0+)
WITH RECURSIVE org_chart AS (
    -- 查找顶级管理者
    SELECT 
        emp_id,
        name,
        manager_id,
        0 as level,
        CAST(name AS CHAR(1000)) as path
    FROM employees 
    WHERE manager_id IS NULL
    
    UNION ALL
    
    -- 递归查找下级
    SELECT 
        e.emp_id,
        e.name,
        e.manager_id,
        oc.level + 1,
        CONCAT(oc.path, ' -> ', e.name)
    FROM employees e
    INNER JOIN org_chart oc ON e.manager_id = oc.emp_id
)
SELECT 
    CONCAT(REPEAT('  ', level), name) as hierarchy,
    level,
    path
FROM org_chart
ORDER BY path;

# 常见错误

# 1. 连接条件错误

-- 错误:连接条件不正确
SELECT e.name, d.dept_name
FROM employees e
JOIN departments d ON e.emp_id = d.dept_id;  -- 字段不匹配

-- 正确
SELECT e.name, d.dept_name
FROM employees e
JOIN departments d ON e.department_id = d.dept_id;

# 2. 忘记处理NULL值

-- 可能遗漏NULL值的情况
SELECT e.name, d.dept_name
FROM employees e
INNER JOIN departments d ON e.department_id = d.dept_id;
-- 这会遗漏department_id为NULL的员工

-- 如果需要包含所有员工
SELECT e.name, COALESCE(d.dept_name, '未分配') as dept_name
FROM employees e
LEFT JOIN departments d ON e.department_id = d.dept_id;

# 3. 重复数据问题

-- 可能产生重复数据
SELECT e.name, d.dept_name
FROM employees e
JOIN departments d ON e.department_id = d.dept_id
JOIN employee_projects ep ON e.emp_id = ep.emp_id;
-- 如果员工参与多个项目,会出现重复行

-- 解决方案:使用DISTINCT或适当的GROUP BY
SELECT DISTINCT e.name, d.dept_name
FROM employees e
JOIN departments d ON e.department_id = d.dept_id
JOIN employee_projects ep ON e.emp_id = ep.emp_id;

# 总结

JOIN是SQL中最重要的功能之一,掌握不同类型的JOIN对于编写复杂查询至关重要:

  1. INNER JOIN:返回两表匹配的记录
  2. LEFT JOIN:返回左表所有记录和右表匹配的记录
  3. RIGHT JOIN:返回右表所有记录和左表匹配的记录
  4. FULL OUTER JOIN:返回两表所有记录
  5. CROSS JOIN:返回两表的笛卡尔积
  6. SELF JOIN:表与自身连接

# 最佳实践

  • 总是使用显式JOIN语法而不是隐式连接
  • 在连接字段上创建适当的索引
  • 注意NULL值的处理
  • 避免不必要的笛卡尔积
  • 合理使用WHERE条件过滤数据
  • 考虑查询性能和执行计划

# 下一步学习