第14天-正则表达式

2023/6/15

# 第14天-正则表达式

# 学习目标

通过本章学习,你将掌握:

  • 理解正则表达式的基本概念和语法
  • 掌握Python中re模块的使用
  • 学会编写常用的正则表达式模式
  • 掌握正则表达式的匹配、搜索、替换和分割操作
  • 理解正则表达式的高级特性(分组、前瞻、后顾等)
  • 学会在实际项目中应用正则表达式
  • 掌握正则表达式的性能优化技巧

# 一、正则表达式基础

# 1.1 什么是正则表达式

import re

def regex_introduction():
    """Print a short introduction to regular expressions with four demos.

    The demos run on hard-coded sample data: extracting numbers, validating
    e-mail addresses, extracting URLs and masking sensitive words. All output
    goes to stdout; the function takes no arguments and returns ``None``.
    """
    print("=== 正则表达式介绍 ===")

    # What a regular expression is
    print("\n1. 正则表达式的定义")
    print("""
    正则表达式(Regular Expression,简称regex或regexp)是一种强大的文本处理工具,
    用于描述字符串的模式。它可以用来:
    
    • 验证输入格式(如邮箱、电话号码)
    • 搜索和提取特定内容
    • 替换文本
    • 分割字符串
    • 数据清洗和预处理
    """)

    # Simple examples
    print("\n2. 简单示例")

    # Example 1: find numbers
    text = "我有3个苹果和5个橙子"
    numbers = re.findall(r'\d+', text)
    print(f"文本: {text}")
    print(f"找到的数字: {numbers}")

    # Example 2: validate e-mail format
    emails = [
        "user@example.com",
        "invalid-email",
        "test.email@domain.org",
        "@invalid.com"
    ]

    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

    print("\n邮箱验证:")
    for email in emails:
        # fullmatch() requires the whole string to match; match() with a
        # trailing `$` would also accept a string ending in a newline.
        is_valid = re.fullmatch(email_pattern, email) is not None
        print(f"{email}: {'有效' if is_valid else '无效'}")

    # Example 3: extract URLs
    text = "访问我们的网站 https://www.example.com 或 http://blog.test.org"
    url_pattern = r'https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    urls = re.findall(url_pattern, text)
    print(f"\n文本: {text}")
    print(f"找到的URL: {urls}")

    # Example 4: mask sensitive words
    text = "这个产品很垃圾,完全是骗子公司"
    sensitive_words = ["垃圾", "骗子"]

    for word in sensitive_words:
        # The words are literal text, not patterns: escape them so that a
        # word containing regex metacharacters is still matched verbatim.
        text = re.sub(re.escape(word), "*" * len(word), text)

    print(f"\n过滤后的文本: {text}")

# Run the introduction demo
regex_introduction()

# 1.2 基本语法和元字符

def regex_basic_syntax():
    """Print a walkthrough of basic regex syntax.

    Covers literal characters, the metacharacter table, one worked example
    per metacharacter, the predefined character classes and a few examples
    of those classes. All output goes to stdout; returns ``None``.
    """
    print("=== 正则表达式基本语法 ===")

    # 1. Literal characters match themselves
    print("\n1. 字面字符")
    sample = "Hello World"
    literal = "Hello"
    found = re.search(literal, sample)
    print(f"文本: {sample}")
    print(f"模式: {literal}")
    print(f"匹配结果: {'未匹配' if found is None else found.group()}")

    # 2. Metacharacter reference table (symbol, meaning)
    print("\n2. 元字符")

    metachar_table = (
        (".", "匹配任意字符(除换行符)"),
        ("^", "匹配字符串开始"),
        ("$", "匹配字符串结束"),
        ("*", "匹配前面的字符0次或多次"),
        ("+", "匹配前面的字符1次或多次"),
        ("?", "匹配前面的字符0次或1次"),
        ("|", "或操作符"),
        ("[]", "字符类,匹配方括号内的任意字符"),
        ("()", "分组"),
        ("{}", "指定匹配次数"),
        ("\\", "转义字符"),
    )

    for symbol, meaning in metachar_table:
        print(f"{symbol:3} - {meaning}")

    # 3. One worked example per metacharacter: (pattern, subject, note)
    print("\n3. 元字符示例")

    demo_cases = (
        ("a.c", "abc", "匹配a和c之间有任意字符"),
        ("^Hello", "Hello World", "匹配以Hello开头的字符串"),
        ("World$", "Hello World", "匹配以World结尾的字符串"),
        ("ab*c", "ac", "匹配a后面跟0个或多个b,然后是c"),
        ("ab+c", "abc", "匹配a后面跟1个或多个b,然后是c"),
        ("ab?c", "ac", "匹配a后面跟0个或1个b,然后是c"),
        ("cat|dog", "I have a cat", "匹配cat或dog"),
        ("[aeiou]", "hello", "匹配任意元音字母"),
        ("[0-9]", "abc123", "匹配任意数字"),
        ("[^0-9]", "abc123", "匹配任意非数字字符"),
    )

    for regex, subject, note in demo_cases:
        hit = re.search(regex, subject)
        if hit is None:
            shown = "未匹配"
        else:
            shown = hit.group()
        print(f"模式: {regex:10} 文本: {subject:15} 结果: {shown:10} - {note}")

    # 4. Predefined character classes (escape, meaning)
    print("\n4. 预定义字符类")

    class_table = (
        (r"\d", "匹配数字 [0-9]"),
        (r"\D", "匹配非数字 [^0-9]"),
        (r"\w", "匹配单词字符 [a-zA-Z0-9_]"),
        (r"\W", "匹配非单词字符 [^a-zA-Z0-9_]"),
        (r"\s", "匹配空白字符(空格、制表符、换行符等)"),
        (r"\S", "匹配非空白字符"),
        (r"\b", "匹配单词边界"),
        (r"\B", "匹配非单词边界"),
    )

    for escape, meaning in class_table:
        print(f"{escape:3} - {meaning}")

    # 5. Predefined classes applied to one sample string
    print("\n5. 预定义字符类示例")

    subject = "Hello123 World_456!"

    class_demos = {
        r"\d+": "匹配连续数字",
        r"\w+": "匹配单词字符",
        r"\s+": "匹配空白字符",
        r"\b\w+\b": "匹配完整单词",
    }

    for regex, note in class_demos.items():
        print(f"模式: {regex:10} 匹配: {re.findall(regex, subject)} - {note}")

# Run the basic syntax demo
regex_basic_syntax()

# 1.3 量词和重复

def regex_quantifiers():
    """Print a walkthrough of regex quantifiers.

    Shows the quantifier table, each quantifier applied to one sample
    string, greedy versus non-greedy matching on HTML, and two practical
    uses (extracting tag contents and checking password strength). All
    output goes to stdout; returns ``None``.
    """
    print("=== 正则表达式量词 ===")

    # 1. Quantifier reference table (quantifier, meaning)
    print("\n1. 基本量词")

    quantifier_table = (
        ("*", "0次或多次(贪婪)"),
        ("+", "1次或多次(贪婪)"),
        ("?", "0次或1次(贪婪)"),
        ("{n}", "恰好n次"),
        ("{n,}", "至少n次"),
        ("{n,m}", "n到m次"),
        ("*?", "0次或多次(非贪婪)"),
        ("+?", "1次或多次(非贪婪)"),
        ("??", "0次或1次(非贪婪)"),
    )

    for symbol, meaning in quantifier_table:
        print(f"{symbol:6} - {meaning}")

    # 2. Each quantifier applied to the same subject string
    print("\n2. 量词示例")

    subject = "aaabbbcccc"

    demo_cases = {
        "a*": "匹配0个或多个a",
        "a+": "匹配1个或多个a",
        "a?": "匹配0个或1个a",
        "a{3}": "匹配恰好3个a",
        "b{2,}": "匹配至少2个b",
        "c{2,4}": "匹配2到4个c",
    }

    for regex, note in demo_cases.items():
        hit = re.search(regex, subject)
        shown = "未匹配" if hit is None else hit.group()
        print(f"模式: {regex:8} 结果: {shown:8} - {note}")

    # 3. Greedy vs. non-greedy
    print("\n3. 贪婪vs非贪婪")

    two_divs = "<div>内容1</div><div>内容2</div>"

    # Greedy: `.*` runs to the last closing tag
    greedy_hit = re.search(r"<div>.*</div>", two_divs)
    if greedy_hit:
        print(f"贪婪匹配: {greedy_hit.group()}")
    else:
        print("贪婪匹配: 未匹配")

    # Non-greedy: `.*?` stops at the first closing tag
    print(f"非贪婪匹配: {re.findall(r'<div>.*?</div>', two_divs)}")

    # 4. Practical examples
    print("\n4. 实际应用示例")

    # Extract the text inside HTML tags
    markup = "<h1>标题</h1><p>段落内容</p><a href='#'>链接</a>"

    # Text of every tag
    print(f"HTML内容: {re.findall(r'<[^>]+>(.*?)</[^>]+>', markup)}")

    # Text of one specific tag
    h1_hit = re.search(r"<h1>(.*?)</h1>", markup)
    h1_text = h1_hit.group(1) if h1_hit else '未找到'
    print(f"H1内容: {h1_text}")

    # Password strength check
    candidates = (
        "123456",
        "password",
        "Password123",
        "P@ssw0rd123",
        "Weak",
    )

    # Requirement: at least 8 characters with lower case, upper case,
    # a digit and one of the listed special characters
    strong_re = re.compile(
        r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$"
    )

    print("\n密码强度验证:")
    for candidate in candidates:
        verdict = '强密码' if strong_re.match(candidate) else '弱密码'
        print(f"{candidate:12} - {verdict}")

# Run the quantifier demo
regex_quantifiers()

# 二、Python re模块详解

# 2.1 re模块基本函数

import re

def re_module_basics():
    """re模块基本函数"""
    print("=== re模块基本函数 ===")
    
    text = "Python是一种编程语言,Python很强大。联系方式:email@example.com,电话:138-1234-5678"
    
    # 1. re.match() - 从字符串开头匹配
    print("\n1. re.match() - 从字符串开头匹配")
    
    match_result = re.match(r"Python", text)
    print(f"匹配结果: {match_result.group() if match_result else '未匹配'}")
    
    # 不从开头匹配的情况
    match_result2 = re.match(r"编程", text)
    print(f"匹配'编程': {match_result2.group() if match_result2 else '未匹配'}")
    
    # 2. re.search() - 搜索整个字符串
    print("\n2. re.search() - 搜索整个字符串")
    
    search_result = re.search(r"编程", text)
    print(f"搜索'编程': {search_result.group() if search_result else '未找到'}")
    
    # 获取匹配位置
    if search_result:
        print(f"匹配位置: {search_result.span()}")
        print(f"开始位置: {search_result.start()}")
        print(f"结束位置: {search_result.end()}")
    
    # 3. re.findall() - 找到所有匹配
    print("\n3. re.findall() - 找到所有匹配")
    
    # 找到所有"Python"
    python_matches = re.findall(r"Python", text)
    print(f"所有'Python': {python_matches}")
    
    # 找到所有数字
    numbers = re.findall(r"\d+", text)
    print(f"所有数字: {numbers}")
    
    # 找到邮箱
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    print(f"邮箱地址: {emails}")
    
    # 4. re.finditer() - 返回匹配对象的迭代器
    print("\n4. re.finditer() - 返回匹配对象的迭代器")
    
    for match in re.finditer(r"\d+", text):
        print(f"数字: {match.group()}, 位置: {match.span()}")
    
    # 5. re.sub() - 替换
    print("\n5. re.sub() - 替换")
    
    # 简单替换
    new_text = re.sub(r"Python", "Java", text)
    print(f"替换后: {new_text}")
    
    # 使用函数进行替换
    def upper_replace(match):
        return match.group().upper()
    
    upper_text = re.sub(r"python", upper_replace, text, flags=re.IGNORECASE)
    print(f"大写替换: {upper_text}")
    
    # 限制替换次数
    limited_replace = re.sub(r"Python", "Java", text, count=1)
    print(f"限制替换: {limited_replace}")
    
    # 6. re.split() - 分割
    print("\n6. re.split() - 分割")
    
    # 按标点符号分割
    parts = re.split(r"[,。:]", text)
    print(f"分割结果: {[part.strip() for part in parts if part.strip()]}")
    
    # 保留分隔符
    parts_with_sep = re.split(r"([,。:])", text)
    print(f"保留分隔符: {[part for part in parts_with_sep if part]}")
    
    # 7. re.compile() - 编译正则表达式
    print("\n7. re.compile() - 编译正则表达式")
    
    # 编译常用模式
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    phone_pattern = re.compile(r"\d{3}-\d{4}-\d{4}")
    
    # 使用编译后的模式
    email_matches = email_pattern.findall(text)
    phone_matches = phone_pattern.findall(text)
    
    print(f"邮箱匹配: {email_matches}")
    print(f"电话匹配: {phone_matches}")
    
    # 8. 标志参数
    print("\n8. 标志参数")
    
    flags_demo_text = "Hello WORLD\nPython Programming"
    
    # re.IGNORECASE - 忽略大小写
    ignore_case = re.findall(r"hello", flags_demo_text, re.IGNORECASE)
    print(f"忽略大小写: {ignore_case}")
    
    # re.MULTILINE - 多行模式
    multiline = re.findall(r"^\w+", flags_demo_text, re.MULTILINE)
    print(f"多行模式: {multiline}")
    
    # re.DOTALL - 点号匹配换行符
    dotall = re.findall(r"Hello.*Programming", flags_demo_text, re.DOTALL)
    print(f"点号匹配换行: {dotall}")
    
    # 组合标志
    combined = re.findall(r"hello.*python", flags_demo_text, re.IGNORECASE | re.DOTALL)
    print(f"组合标志: {combined}")

# 运行re模块基础演示
re_module_basics()

# 2.2 分组和捕获

def regex_groups():
    """正则表达式分组"""
    print("=== 正则表达式分组 ===")
    
    # 1. 基本分组
    print("\n1. 基本分组")
    
    text = "姓名:张三,年龄:25,电话:138-1234-5678"
    
    # 使用分组提取信息
    pattern = r"姓名:(\w+),年龄:(\d+),电话:([\d-]+)"
    match = re.search(pattern, text)
    
    if match:
        print(f"完整匹配: {match.group(0)}")
        print(f"姓名: {match.group(1)}")
        print(f"年龄: {match.group(2)}")
        print(f"电话: {match.group(3)}")
        print(f"所有分组: {match.groups()}")
    
    # 2. 命名分组
    print("\n2. 命名分组")
    
    # 使用命名分组
    named_pattern = r"姓名:(?P<name>\w+),年龄:(?P<age>\d+),电话:(?P<phone>[\d-]+)"
    named_match = re.search(named_pattern, text)
    
    if named_match:
        print(f"姓名: {named_match.group('name')}")
        print(f"年龄: {named_match.group('age')}")
        print(f"电话: {named_match.group('phone')}")
        print(f"分组字典: {named_match.groupdict()}")
    
    # 3. 非捕获分组
    print("\n3. 非捕获分组")
    
    # 普通分组
    normal_pattern = r"(https?)://(\w+\.\w+)"
    # 非捕获分组
    non_capture_pattern = r"(?:https?)://(\w+\.\w+)"
    
    url = "https://www.example.com"
    
    normal_match = re.search(normal_pattern, url)
    non_capture_match = re.search(non_capture_pattern, url)
    
    print(f"普通分组: {normal_match.groups() if normal_match else '未匹配'}")
    print(f"非捕获分组: {non_capture_match.groups() if non_capture_match else '未匹配'}")
    
    # 4. 分组引用
    print("\n4. 分组引用")
    
    # 查找重复的单词
    text_with_duplicates = "这是是一个测试测试文本"
    duplicate_pattern = r"(\w+)\1"
    duplicates = re.findall(duplicate_pattern, text_with_duplicates)
    print(f"重复的字符: {duplicates}")
    
    # 在替换中使用分组引用
    html_text = "<b>粗体</b> <i>斜体</i>"
    # 将HTML标签转换为Markdown
    markdown_text = re.sub(r"<b>(.*?)</b>", r"**\1**", html_text)
    markdown_text = re.sub(r"<i>(.*?)</i>", r"*\1*", markdown_text)
    print(f"转换为Markdown: {markdown_text}")
    
    # 5. 条件分组
    print("\n5. 条件分组")
    
    # 匹配不同格式的日期
    dates = [
        "2023-12-25",
        "2023/12/25",
        "25-12-2023",
        "25/12/2023"
    ]
    
    # 使用选择操作符匹配多种格式
    date_pattern = r"(\d{4}[-/]\d{2}[-/]\d{2})|(\d{2}[-/]\d{2}[-/]\d{4})"
    
    for date in dates:
        match = re.search(date_pattern, date)
        if match:
            if match.group(1):
                print(f"{date} - 年-月-日格式")
            elif match.group(2):
                print(f"{date} - 日-月-年格式")
    
    # 6. 实际应用:解析日志
    print("\n6. 实际应用:解析日志")
    
    log_entries = [
        "2023-12-25 10:30:15 [INFO] 用户登录成功 - 用户ID: 12345",
        "2023-12-25 10:31:22 [ERROR] 数据库连接失败 - 错误代码: 500",
        "2023-12-25 10:32:10 [WARNING] 内存使用率过高 - 使用率: 85%"
    ]
    
    log_pattern = r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<message>.*?) - (?P<details>.*)"
    
    for log in log_entries:
        match = re.search(log_pattern, log)
        if match:
            log_data = match.groupdict()
            print(f"时间: {log_data['timestamp']}")
            print(f"级别: {log_data['level']}")
            print(f"消息: {log_data['message']}")
            print(f"详情: {log_data['details']}")
            print("-" * 40)

# 运行分组演示
regex_groups()

# 2.3 前瞻和后顾断言

def regex_lookahead_lookbehind():
    """Print a walkthrough of lookahead and lookbehind assertions.

    Demonstrates the four zero-width assertions on hard-coded sample text,
    then several practical uses: password validation, HTML attribute
    extraction, Chinese mobile number validation and function-call
    extraction. All output goes to stdout; returns ``None``.
    """
    print("=== 前瞻和后顾断言 ===")

    # 1. Positive lookahead (?=...)
    print("\n1. 正向前瞻 (?=...)")

    text = "password123 username456 email789"

    # Words followed by a digit. The word part must be letters only:
    # `\w` also matches digits, so `\w+(?=\d+)` would swallow part of the
    # number and return e.g. 'password12'.
    positive_lookahead = re.findall(r"[A-Za-z]+(?=\d)", text)
    print(f"后面跟数字的单词: {positive_lookahead}")

    # 2. Negative lookahead (?!...)
    print("\n2. 负向前瞻 (?!...)")

    # Words NOT followed by a digit. Rejecting a following letter as well
    # prevents the engine from backtracking to a shorter prefix such as
    # 'passwor'. A sample containing one bare word makes the result visible.
    mixed_text = "password123 username email789"
    negative_lookahead = re.findall(r"\b[A-Za-z]+(?![A-Za-z\d])", mixed_text)
    print(f"后面不跟数字的单词: {negative_lookahead}")

    # 3. Positive lookbehind (?<=...)
    print("\n3. 正向后顾 (?<=...)")

    # Digits preceded by a word character
    positive_lookbehind = re.findall(r"(?<=\w)\d+", text)
    print(f"前面有字母的数字: {positive_lookbehind}")

    # 4. Negative lookbehind (?<!...)
    print("\n4. 负向后顾 (?<!...)")

    text_with_spaces = "abc 123 def456 ghi 789"

    # Digits NOT preceded by a word character
    negative_lookbehind = re.findall(r"(?<!\w)\d+", text_with_spaces)
    print(f"前面没有字母的数字: {negative_lookbehind}")

    # 5. Practical example
    print("\n5. 实际应用示例")

    # Password rule: at least 8 characters with lower case, upper case
    # and a digit
    passwords = [
        "password",
        "Password",
        "Password123",
        "pass123",
        "PASSWORD123",
        "MySecurePass1"
    ]

    # Each lookahead checks one requirement without consuming input
    password_pattern = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$"

    print("密码验证(至少8位,包含大小写字母和数字):")
    for pwd in passwords:
        # fullmatch() rejects a trailing newline that `$` would tolerate
        is_valid = re.fullmatch(password_pattern, pwd) is not None
        print(f"{pwd:15} - {'有效' if is_valid else '无效'}")

    # 6. Extracting data in a specific format
    print("\n6. 提取特定格式的数据")

    # Attribute values inside HTML tags
    html = '<img src="image.jpg" alt="图片" width="100"> <a href="link.html">链接</a>'

    # Value of the src attribute
    src_values = re.findall(r'(?<=src=")[^"]*(?=")', html)
    print(f"src属性值: {src_values}")

    # Value of the href attribute
    href_values = re.findall(r'(?<=href=")[^"]*(?=")', html)
    print(f"href属性值: {href_values}")

    # 7. A more involved validation example
    print("\n7. 复杂的验证示例")

    # Chinese mobile numbers
    phone_numbers = [
        "13812345678",
        "15987654321",
        "12345678901",
        "1381234567",
        "138123456789"
    ]

    # Rule: starts with 1, second digit is 3-9, 11 digits in total
    china_mobile_pattern = r"^1[3-9]\d{9}$"

    print("中国手机号验证:")
    for phone in phone_numbers:
        is_valid = re.fullmatch(china_mobile_pattern, phone) is not None
        print(f"{phone:12} - {'有效' if is_valid else '无效'}")

    # 8. Extracting call structure
    print("\n8. 提取嵌套结构")

    # Function calls in a line of code
    code = "print('Hello') + len('World') + max(1, 2, 3)"

    # Function names: identifiers followed by an opening parenthesis
    function_names = re.findall(r"\w+(?=\()", code)
    print(f"函数名: {function_names}")

    # Text between parentheses
    parentheses_content = re.findall(r"(?<=\()[^)]*(?=\))", code)
    print(f"括号内容: {parentheses_content}")

# Run the lookahead/lookbehind demo
regex_lookahead_lookbehind()

# 三、常用正则表达式模式

# 3.1 数据验证模式

def common_validation_patterns():
    """Print validation results for common data formats.

    Runs hard-coded samples through patterns for e-mail addresses, Chinese
    mobile numbers, ID card numbers, URLs, IPv4 addresses and password
    strength. All output goes to stdout; returns ``None``.

    Validation uses ``re.fullmatch`` so the whole input must match; with
    ``re.match`` a trailing ``$`` would also accept a string that ends in
    a newline.
    """
    print("=== 常用数据验证模式 ===")

    # 1. E-mail validation
    print("\n1. 邮箱验证")

    emails = [
        "user@example.com",
        "test.email@domain.org",
        "invalid-email",
        "user@",
        "@domain.com",
        "user.name+tag@example.co.uk"
    ]

    # Simple e-mail pattern
    simple_email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

    # Stricter pattern: local part and domain must start and end with an
    # alphanumeric character
    strict_email_pattern = r"^[a-zA-Z0-9]([a-zA-Z0-9._-]*[a-zA-Z0-9])?@[a-zA-Z0-9]([a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}$"

    print("邮箱验证结果:")
    for email in emails:
        simple_valid = re.fullmatch(simple_email_pattern, email) is not None
        strict_valid = re.fullmatch(strict_email_pattern, email) is not None
        print(f"{email:25} 简单: {'✓' if simple_valid else '✗'} 严格: {'✓' if strict_valid else '✗'}")

    # 2. Mobile number validation
    print("\n2. 手机号验证")

    phone_numbers = [
        "13812345678",
        "138-1234-5678",
        "138 1234 5678",
        "+86 138 1234 5678",
        "12345678901",
        "1381234567"
    ]

    # Chinese mobile number layouts
    china_mobile_patterns = {
        "基本格式": r"^1[3-9]\d{9}$",
        "带连字符": r"^1[3-9]\d-\d{4}-\d{4}$",
        "带空格": r"^1[3-9]\d \d{4} \d{4}$",
        "国际格式": r"^\+86 1[3-9]\d \d{4} \d{4}$"
    }

    print("手机号验证结果:")
    for phone in phone_numbers:
        print(f"{phone:20}", end=" ")
        for pattern_name, pattern in china_mobile_patterns.items():
            is_valid = re.fullmatch(pattern, phone) is not None
            print(f"{pattern_name}: {'✓' if is_valid else '✗'}", end="  ")
        print()

    # 3. ID card number validation
    print("\n3. 身份证号验证")

    id_numbers = [
        "110101199001011234",
        "11010119900101123X",
        "110101199013011234",  # invalid month
        "110101199001321234",  # invalid day
        "12345678901234567",   # wrong length
    ]

    # ID card pattern (simplified: checks layout and date ranges only)
    id_pattern = r"^[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dX]$"

    print("身份证号验证:")
    for id_num in id_numbers:
        is_valid = re.fullmatch(id_pattern, id_num) is not None
        print(f"{id_num:20} - {'有效' if is_valid else '无效'}")

    # 4. URL validation
    print("\n4. URL验证")

    urls = [
        "https://www.example.com",
        "http://example.com/path?param=value",
        "ftp://files.example.com",
        "www.example.com",
        "invalid-url",
        "https://sub.domain.example.com:8080/path"
    ]

    # URL pattern: scheme, host, optional port, optional path
    url_pattern = r"^(https?|ftp)://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(:[0-9]+)?(/.*)?$"

    print("URL验证:")
    for url in urls:
        is_valid = re.fullmatch(url_pattern, url) is not None
        print(f"{url:40} - {'有效' if is_valid else '无效'}")

    # 5. IP address validation
    print("\n5. IP地址验证")

    ip_addresses = [
        "192.168.1.1",
        "255.255.255.255",
        "0.0.0.0",
        "256.1.1.1",
        "192.168.1",
        "192.168.1.1.1"
    ]

    # IPv4 pattern: four dot-separated octets, each 0-255
    ipv4_pattern = r"^((25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)$"

    print("IPv4地址验证:")
    for ip in ip_addresses:
        is_valid = re.fullmatch(ipv4_pattern, ip) is not None
        print(f"{ip:15} - {'有效' if is_valid else '无效'}")

    # 6. Password strength validation
    print("\n6. 密码强度验证")

    passwords = [
        "123456",
        "password",
        "Password123",
        "P@ssw0rd",
        "MySecureP@ss123",
        "weak"
    ]

    # Ordered from strongest to weakest. The loop below reports the first
    # level that matches, and every stronger password also satisfies the
    # weaker rules, so the strictest rule has to be tried first.
    password_patterns = {
        "强密码": r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&]).{8,}$",  # 8+ chars, upper, lower, digit, special
        "中等密码": r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$",  # 8+ chars, upper, lower, digit
        "弱密码": r"^.{6,}$",  # at least 6 characters
    }

    print("密码强度验证:")
    for password in passwords:
        print(f"{password:15}", end=" ")
        for strength, pattern in password_patterns.items():
            if re.fullmatch(pattern, password) is not None:
                print(f"- {strength}")
                break
        else:
            # Not even the weakest rule matched
            print("- 太弱")

# Run the validation patterns demo
common_validation_patterns()

# 3.2 文本提取模式

def text_extraction_patterns():
    """Print the results of common text-extraction patterns.

    Runs hard-coded samples through patterns that extract HTML tags and
    their text, dates and times, currency amounts, contact details and
    Markdown code snippets. All output goes to stdout; returns ``None``.
    """
    print("=== 文本提取模式 ===")

    # 1. HTML tags and their content
    print("\n1. 提取HTML标签和内容")

    html_content = """
    <html>
        <head><title>网页标题</title></head>
        <body>
            <h1>主标题</h1>
            <p>这是一个段落。</p>
            <a href="https://example.com">链接文本</a>
            <img src="image.jpg" alt="图片描述">
        </body>
    </html>
    """

    # Every tag
    all_tags = re.findall(r"<[^>]+>", html_content)
    print(f"所有标签: {all_tags[:5]}...")  # show only the first 5

    # Text of leaf elements. The content is `[^<]*` rather than `.*?` so a
    # match cannot span nested tags; otherwise the outer <html>...</html>
    # pair consumes the whole document in a single match and no leaf
    # element is ever reported.
    tag_contents = re.findall(r"<(\w+)[^>]*>([^<]*)</\1>", html_content)
    print("标签内容:")
    for tag, content in tag_contents:
        clean_content = content.strip()
        if clean_content:
            print(f"  {tag}: {clean_content}")

    # Links: (href, link text)
    links = re.findall(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', html_content)
    print(f"链接: {links}")

    # Images: (src, alt); expects src to appear before alt
    images = re.findall(r'<img[^>]+src="([^"]+)"[^>]*alt="([^"]+)"[^>]*>', html_content)
    print(f"图片: {images}")

    # 2. Dates and times
    print("\n2. 提取日期和时间")

    text_with_dates = """
    会议安排:
    - 2023年12月25日 上午10:00 圣诞节庆祝
    - 2023-12-31 23:59 新年倒计时
    - 12/25/2023 下午2:30 PM 项目讨论
    - 25/12/2023 晚上8点 聚餐
    """

    # Value ranges are what tell month/day/year apart from day/month/year;
    # the digit guards stop a match from starting or ending inside a
    # longer number. A date such as 01/02/2023 is valid in both layouts.
    month = r"(?:0?[1-9]|1[0-2])"
    day = r"(?:0?[1-9]|[12]\d|3[01])"

    # Date patterns for the different layouts
    date_patterns = {
        "中文日期": r"\d{4}年\d{1,2}月\d{1,2}日",
        "ISO日期": r"\d{4}-\d{2}-\d{2}",
        "美式日期": rf"(?<!\d){month}/{day}/\d{{4}}(?!\d)",
        "欧式日期": rf"(?<!\d){day}/{month}/\d{{4}}(?!\d)"
    }

    # Time patterns
    time_patterns = {
        "24小时制": r"\d{1,2}:\d{2}",
        "12小时制": r"\d{1,2}:\d{2}\s*[AP]M",
        "中文时间": r"[上下]午\d{1,2}[::]\d{2}|晚上\d{1,2}点"
    }

    print("提取的日期:")
    for pattern_name, pattern in date_patterns.items():
        dates = re.findall(pattern, text_with_dates)
        if dates:
            print(f"  {pattern_name}: {dates}")

    print("提取的时间:")
    for pattern_name, pattern in time_patterns.items():
        times = re.findall(pattern, text_with_dates)
        if times:
            print(f"  {pattern_name}: {times}")

    # 3. Currency amounts and numbers
    print("\n3. 提取货币和数字")

    financial_text = """
    商品价格:
    - iPhone 15: ¥7999
    - MacBook Pro: $2,399.00
    - 咖啡: 25.50元
    - 汽车: €45,000
    - 房价: 1,200,000 RMB
    """

    # Currency patterns: digits with optional thousands separators and an
    # optional two-digit fraction, plus a currency marker
    currency_patterns = {
        "人民币符号": r"¥[\d,]+(?:\.\d{2})?",
        "美元": r"\$[\d,]+(?:\.\d{2})?",
        "欧元": r"€[\d,]+(?:\.\d{2})?",
        "人民币文字": r"[\d,]+(?:\.\d{2})?元",
        "RMB": r"[\d,]+(?:\.\d{2})?\s*RMB"
    }

    print("提取的货币:")
    for currency_name, pattern in currency_patterns.items():
        amounts = re.findall(pattern, financial_text)
        if amounts:
            print(f"  {currency_name}: {amounts}")

    # 4. Contact details
    print("\n4. 提取联系信息")

    contact_text = """
    联系方式:
    电话:138-1234-5678, 010-12345678
    邮箱:contact@example.com, support@company.org
    QQ:123456789
    微信:wechat_id_123
    地址:北京市朝阳区某某街道123号
    """

    # Contact patterns
    contact_patterns = {
        "手机号": r"1[3-9]\d-\d{4}-\d{4}",
        "固定电话": r"\d{3,4}-\d{7,8}",
        "邮箱": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "QQ号": r"QQ[::]\s*(\d{5,12})",
        "微信号": r"微信[::]\s*([a-zA-Z0-9_-]+)"
    }

    print("提取的联系信息:")
    for info_type, pattern in contact_patterns.items():
        contacts = re.findall(pattern, contact_text)
        if contacts:
            print(f"  {info_type}: {contacts}")

    # 5. Code snippets
    print("\n5. 提取代码片段")

    code_text = """
    Python代码示例:
    ```python
    def hello_world():
        print("Hello, World!")
    ```
    
    JavaScript代码:
    ```javascript
    function greet(name) {
        console.log(`Hello, ${name}!`);
    }
    ```
    
    内联代码:使用 `print()` 函数输出内容。
    """

    # Fenced code blocks: (language, body)
    code_block_pattern = r"```(\w+)\n(.*?)```"
    code_blocks = re.findall(code_block_pattern, code_text, re.DOTALL)

    print("代码块:")
    for language, code in code_blocks:
        print(f"  语言: {language}")
        print(f"  代码: {code.strip()[:50]}...")  # show only the first 50 characters

    # Inline code. The fenced blocks are removed first: their backticks
    # (and backticks inside the code itself, such as JavaScript template
    # literals) would otherwise be reported as inline code.
    text_without_blocks = re.sub(code_block_pattern, "", code_text, flags=re.DOTALL)
    inline_code_pattern = r"`([^`]+)`"
    inline_codes = re.findall(inline_code_pattern, text_without_blocks)
    print(f"内联代码: {inline_codes}")

# Run the text extraction demo
text_extraction_patterns()

# 四、正则表达式实战应用

# 4.1 日志分析器

import re
from datetime import datetime
from collections import defaultdict, Counter
from typing import Dict, List, Tuple

class LogAnalyzer:
    """日志分析器"""
    
    def __init__(self):
        # 常见日志格式的正则表达式
        self.log_patterns = {
            'apache_common': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+)',
            'apache_combined': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"',
            'nginx': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"',
            'python_logging': r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (?P<logger>\w+) - (?P<level>\w+) - (?P<message>.*)',
            'syslog': r'(?P<timestamp>\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\w+) (?P<process>\w+)\[(?P<pid>\d+)\]: (?P<message>.*)'
        }
        
        self.compiled_patterns = {name: re.compile(pattern) for name, pattern in self.log_patterns.items()}
    
    def parse_log_line(self, line: str, log_format: str = 'auto') -> Dict:
        """解析单行日志"""
        line = line.strip()
        if not line:
            return None
        
        if log_format == 'auto':
            # 自动检测日志格式
            for format_name, pattern in self.compiled_patterns.items():
                match = pattern.match(line)
                if match:
                    result = match.groupdict()
                    result['format'] = format_name
                    return result
            return {'raw': line, 'format': 'unknown'}
        else:
            pattern = self.compiled_patterns.get(log_format)
            if pattern:
                match = pattern.match(line)
                if match:
                    result = match.groupdict()
                    result['format'] = log_format
                    return result
        
        return None
    
    def analyze_logs(self, log_lines: List[str]) -> Dict:
        """分析日志"""
        stats = {
            'total_lines': 0,
            'parsed_lines': 0,
            'formats': Counter(),
            'status_codes': Counter(),
            'methods': Counter(),
            'ips': Counter(),
            'user_agents': Counter(),
            'error_logs': [],
            'top_urls': Counter(),
            'hourly_traffic': defaultdict(int)
        }
        
        for line in log_lines:
            stats['total_lines'] += 1
            parsed = self.parse_log_line(line)
            
            if parsed:
                stats['parsed_lines'] += 1
                stats['formats'][parsed.get('format', 'unknown')] += 1
                
                # 分析Web服务器日志
                if 'status' in parsed:
                    stats['status_codes'][parsed['status']] += 1
                
                if 'method' in parsed:
                    stats['methods'][parsed['method']] += 1
                
                if 'ip' in parsed:
                    stats['ips'][parsed['ip']] += 1
                
                if 'user_agent' in parsed:
                    stats['user_agents'][parsed['user_agent']] += 1
                
                if 'url' in parsed:
                    stats['top_urls'][parsed['url']] += 1
                
                # 分析时间分布
                if 'timestamp' in parsed:
                    try:
                        # 尝试解析时间戳
                        timestamp = parsed['timestamp']
                        if '/' in timestamp:  # Apache格式
                            dt = datetime.strptime(timestamp.split()[0], '%d/%b/%Y:%H:%M:%S')
                        else:  # Python logging格式
                            dt = datetime.strptime(timestamp.split(',')[0], '%Y-%m-%d %H:%M:%S')
                        
                        hour_key = dt.strftime('%Y-%m-%d %H:00')
                        stats['hourly_traffic'][hour_key] += 1
                    except:
                        pass
                
                # 收集错误日志
                if ('status' in parsed and parsed['status'].startswith(('4', '5'))) or \
                   ('level' in parsed and parsed['level'] in ['ERROR', 'CRITICAL']):
                    stats['error_logs'].append(parsed)
        
        return stats
    
    def generate_report(self, stats: Dict) -> str:
        """生成分析报告"""
        report = []
        report.append("=== 日志分析报告 ===")
        report.append(f"总行数: {stats['total_lines']}")
        report.append(f"成功解析: {stats['parsed_lines']} ({stats['parsed_lines']/stats['total_lines']*100:.1f}%)")
        
        # 日志格式分布
        report.append("\n日志格式分布:")
        for format_name, count in stats['formats'].most_common():
            report.append(f"  {format_name}: {count}")
        
        # HTTP状态码分布
        if stats['status_codes']:
            report.append("\nHTTP状态码分布:")
            for status, count in stats['status_codes'].most_common(10):
                report.append(f"  {status}: {count}")
        
        # 请求方法分布
        if stats['methods']:
            report.append("\n请求方法分布:")
            for method, count in stats['methods'].most_common():
                report.append(f"  {method}: {count}")
        
        # 访问最多的IP
        if stats['ips']:
            report.append("\n访问最多的IP (Top 10):")
            for ip, count in stats['ips'].most_common(10):
                report.append(f"  {ip}: {count}")
        
        # 访问最多的URL
        if stats['top_urls']:
            report.append("\n访问最多的URL (Top 10):")
            for url, count in stats['top_urls'].most_common(10):
                report.append(f"  {url}: {count}")
        
        # 错误日志
        if stats['error_logs']:
            report.append(f"\n错误日志 (显示前5条):")
            for i, error in enumerate(stats['error_logs'][:5]):
                report.append(f"  {i+1}. {error.get('timestamp', 'N/A')} - {error.get('status', error.get('level', 'N/A'))} - {error.get('url', error.get('message', 'N/A'))}")
        
        return "\n".join(report)