第14天-正则表达式
哪吒 2023/6/15
# 第14天-正则表达式
# 学习目标
通过本章学习,你将能够:
- 理解正则表达式的基本概念和语法
- 掌握Python中re模块的使用
- 学会编写常用的正则表达式模式
- 掌握正则表达式的匹配、搜索、替换和分割操作
- 理解正则表达式的高级特性(分组、前瞻、后顾等)
- 学会在实际项目中应用正则表达式
- 掌握正则表达式的性能优化技巧
# 一、正则表达式基础
# 1.1 什么是正则表达式
import re
def regex_introduction():
    """Print an introductory tour of regular expressions.

    Shows the definition text followed by four small demos: extracting
    numbers, validating e-mail addresses, extracting URLs and masking
    sensitive words. All output goes to stdout; nothing is returned.
    """
    print("=== 正则表达式介绍 ===")

    # Definition of a regular expression.
    print("\n1. 正则表达式的定义")
    print("""
正则表达式(Regular Expression,简称regex或regexp)是一种强大的文本处理工具,
用于描述字符串的模式。它可以用来:
• 验证输入格式(如邮箱、电话号码)
• 搜索和提取特定内容
• 替换文本
• 分割字符串
• 数据清洗和预处理
""")

    # Simple examples.
    print("\n2. 简单示例")

    # Example 1: find the numbers in a sentence.
    text = "我有3个苹果和5个橙子"
    numbers = re.findall(r'\d+', text)
    print(f"文本: {text}")
    print(f"找到的数字: {numbers}")

    # Example 2: validate e-mail addresses.
    emails = [
        "user@example.com",
        "invalid-email",
        "test.email@domain.org",
        "@invalid.com",
    ]
    email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    print("\n邮箱验证:")
    for email in emails:
        # fullmatch instead of match: "$" alone would also accept a value
        # that ends with a trailing newline.
        is_valid = email_pattern.fullmatch(email) is not None
        print(f"{email}: {'有效' if is_valid else '无效'}")

    # Example 3: extract URLs.
    text = "访问我们的网站 https://www.example.com 或 http://blog.test.org"
    url_pattern = r'https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    urls = re.findall(url_pattern, text)
    print(f"\n文本: {text}")
    print(f"找到的URL: {urls}")

    # Example 4: mask sensitive words.
    text = "这个产品很垃圾,完全是骗子公司"
    sensitive_words = ["垃圾", "骗子"]
    for word in sensitive_words:
        # The words are literals, not patterns: escape them so that a word
        # containing regex metacharacters cannot change or break the match.
        text = re.sub(re.escape(word), "*" * len(word), text)
    print(f"\n过滤后的文本: {text}")
# Run the introduction demo.
regex_introduction()
# 1.2 基本语法和元字符
def regex_basic_syntax():
    """Print a reference of basic regex syntax together with live examples.

    Covers literal characters, metacharacters and the predefined character
    classes. All output goes to stdout; nothing is returned.
    """
    print("=== 正则表达式基本语法 ===")

    # 1. Literal characters.
    print("\n1. 字面字符")
    text = "Hello World"
    pattern = "Hello"
    match = re.search(pattern, text)
    print(f"文本: {text}")
    print(f"模式: {pattern}")
    print(f"匹配结果: {match.group() if match else '未匹配'}")

    # 2. Metacharacters.
    print("\n2. 元字符")
    metacharacters = {
        ".": "匹配任意字符(除换行符)",
        "^": "匹配字符串开始",
        "$": "匹配字符串结束",
        "*": "匹配前面的字符0次或多次",
        "+": "匹配前面的字符1次或多次",
        "?": "匹配前面的字符0次或1次",
        "|": "或操作符",
        "[]": "字符类,匹配方括号内的任意字符",
        "()": "分组",
        "{}": "指定匹配次数",
        "\\": "转义字符",
    }
    for char, description in metacharacters.items():
        print(f"{char:3} - {description}")

    # 3. Metacharacter examples: (pattern, subject, description).
    print("\n3. 元字符示例")
    examples = [
        ("a.c", "abc", "匹配a和c之间有任意字符"),
        ("^Hello", "Hello World", "匹配以Hello开头的字符串"),
        ("World$", "Hello World", "匹配以World结尾的字符串"),
        ("ab*c", "ac", "匹配a后面跟0个或多个b,然后是c"),
        ("ab+c", "abc", "匹配a后面跟1个或多个b,然后是c"),
        ("ab?c", "ac", "匹配a后面跟0个或1个b,然后是c"),
        ("cat|dog", "I have a cat", "匹配cat或dog"),
        ("[aeiou]", "hello", "匹配任意元音字母"),
        ("[0-9]", "abc123", "匹配任意数字"),
        ("[^0-9]", "abc123", "匹配任意非数字字符"),
    ]
    for pattern, text, description in examples:
        match = re.search(pattern, text)
        result = match.group() if match else "未匹配"
        print(f"模式: {pattern:10} 文本: {text:15} 结果: {result:10} - {description}")

    # 4. Predefined character classes.
    # For str patterns Python matches Unicode by default, so \d and \w are
    # only equal to the ASCII ranges when re.ASCII is set; the descriptions
    # say so explicitly because \w is used on Chinese text elsewhere.
    print("\n4. 预定义字符类")
    predefined_classes = {
        r"\d": "匹配数字(re.ASCII模式下为 [0-9],默认还包括其他Unicode数字)",
        r"\D": "匹配非数字(re.ASCII模式下为 [^0-9])",
        r"\w": "匹配单词字符(re.ASCII模式下为 [a-zA-Z0-9_],默认还包括中文等Unicode字母数字)",
        r"\W": "匹配非单词字符(re.ASCII模式下为 [^a-zA-Z0-9_])",
        r"\s": "匹配空白字符(空格、制表符、换行符等)",
        r"\S": "匹配非空白字符",
        r"\b": "匹配单词边界",
        r"\B": "匹配非单词边界",
    }
    for char_class, description in predefined_classes.items():
        print(f"{char_class:3} - {description}")

    # 5. Predefined character class examples.
    print("\n5. 预定义字符类示例")
    text = "Hello123 World_456!"
    class_examples = [
        (r"\d+", "匹配连续数字"),
        (r"\w+", "匹配单词字符"),
        (r"\s+", "匹配空白字符"),
        (r"\b\w+\b", "匹配完整单词"),
    ]
    for pattern, description in class_examples:
        matches = re.findall(pattern, text)
        print(f"模式: {pattern:10} 匹配: {matches} - {description}")
# Run the basic syntax demo.
regex_basic_syntax()
# 1.3 量词和重复
def regex_quantifiers():
    """Walk through regex quantifiers and print the result of every demo.

    Covers the quantifier reference table, sample matches, greedy versus
    lazy matching, HTML content extraction and a password-strength check.
    All output goes to stdout; nothing is returned.
    """

    def first_hit(regex, subject, fallback):
        # Text of the first match, or the fallback label when nothing matches.
        found = re.search(regex, subject)
        return found.group() if found else fallback

    print("=== 正则表达式量词 ===")

    # 1. Reference table of the basic quantifiers.
    print("\n1. 基本量词")
    reference_table = (
        ("*", "0次或多次(贪婪)"),
        ("+", "1次或多次(贪婪)"),
        ("?", "0次或1次(贪婪)"),
        ("{n}", "恰好n次"),
        ("{n,}", "至少n次"),
        ("{n,m}", "n到m次"),
        ("*?", "0次或多次(非贪婪)"),
        ("+?", "1次或多次(非贪婪)"),
        ("??", "0次或1次(非贪婪)"),
    )
    for symbol, meaning in reference_table:
        print(f"{symbol:6} - {meaning}")

    # 2. Each quantifier applied to the same sample string.
    print("\n2. 量词示例")
    sample = "aaabbbcccc"
    demos = (
        ("a*", "匹配0个或多个a"),
        ("a+", "匹配1个或多个a"),
        ("a?", "匹配0个或1个a"),
        ("a{3}", "匹配恰好3个a"),
        ("b{2,}", "匹配至少2个b"),
        ("c{2,4}", "匹配2到4个c"),
    )
    for regex, meaning in demos:
        shown = first_hit(regex, sample, "未匹配")
        print(f"模式: {regex:8} 结果: {shown:8} - {meaning}")

    # 3. Greedy versus lazy matching on adjacent tags.
    print("\n3. 贪婪vs非贪婪")
    markup = "<div>内容1</div><div>内容2</div>"
    greedy_text = first_hit(r"<div>.*</div>", markup, '未匹配')
    print(f"贪婪匹配: {greedy_text}")
    lazy_hits = re.findall(r"<div>.*?</div>", markup)
    print(f"非贪婪匹配: {lazy_hits}")

    # 4. Practical uses.
    print("\n4. 实际应用示例")
    page = "<h1>标题</h1><p>段落内容</p><a href='#'>链接</a>"

    # Inner text of every element.
    inner_texts = re.findall(r"<[^>]+>(.*?)</[^>]+>", page)
    print(f"HTML内容: {inner_texts}")

    # Inner text of the <h1> element only.
    heading = re.search(r"<h1>(.*?)</h1>", page)
    heading_text = heading.group(1) if heading else '未找到'
    print(f"H1内容: {heading_text}")

    # Password rule: at least 8 characters with a lowercase letter, an
    # uppercase letter, a digit and one of the listed special characters.
    strong_rule = re.compile(
        r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$"
    )
    print("\n密码强度验证:")
    for candidate in ("123456", "password", "Password123", "P@ssw0rd123", "Weak"):
        verdict = '强密码' if strong_rule.match(candidate) else '弱密码'
        print(f"{candidate:12} - {verdict}")
# Run the quantifier demo.
regex_quantifiers()
# 二、Python re模块详解
# 2.1 re模块基本函数
import re
def re_module_basics():
    """Tour the core functions of the re module, printing each result.

    Demonstrates match, search, findall, finditer, sub, split, compile and
    the common flags on fixed sample strings. Nothing is returned.
    """

    def shown(found, fallback):
        # Matched text, or the fallback label when there is no match.
        return found.group() if found else fallback

    print("=== re模块基本函数 ===")
    sample = "Python是一种编程语言,Python很强大。联系方式:email@example.com,电话:138-1234-5678"
    mail_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

    # 1. re.match() only matches at the start of the string.
    print("\n1. re.match() - 从字符串开头匹配")
    print(f"匹配结果: {shown(re.match(r'Python', sample), '未匹配')}")
    # A pattern that does not occur at the start does not match.
    print(f"匹配'编程': {shown(re.match(r'编程', sample), '未匹配')}")

    # 2. re.search() scans the whole string.
    print("\n2. re.search() - 搜索整个字符串")
    hit = re.search(r"编程", sample)
    print(f"搜索'编程': {shown(hit, '未找到')}")
    if hit:
        # Position information of the match.
        begin, finish = hit.span()
        print(f"匹配位置: {(begin, finish)}")
        print(f"开始位置: {begin}")
        print(f"结束位置: {finish}")

    # 3. re.findall() returns every match.
    print("\n3. re.findall() - 找到所有匹配")
    print(f"所有'Python': {re.findall(r'Python', sample)}")
    digit_runs = re.findall(r"\d+", sample)
    print(f"所有数字: {digit_runs}")
    print(f"邮箱地址: {re.findall(mail_regex, sample)}")

    # 4. re.finditer() yields match objects lazily.
    print("\n4. re.finditer() - 返回匹配对象的迭代器")
    for number in re.finditer(r"\d+", sample):
        print(f"数字: {number.group()}, 位置: {number.span()}")

    # 5. re.sub() replaces matches.
    print("\n5. re.sub() - 替换")
    # Plain replacement.
    print(f"替换后: {re.sub(r'Python', 'Java', sample)}")
    # Replacement computed by a callable.
    shouted = re.sub(r"python", lambda m: m.group().upper(), sample, flags=re.IGNORECASE)
    print(f"大写替换: {shouted}")
    # Replacement limited to the first occurrence.
    print(f"限制替换: {re.sub(r'Python', 'Java', sample, count=1)}")

    # 6. re.split() splits on a pattern.
    print("\n6. re.split() - 分割")
    # Split on punctuation and drop empty pieces.
    pieces = [chunk.strip() for chunk in re.split(r"[,。:]", sample) if chunk.strip()]
    print(f"分割结果: {pieces}")
    # A capturing group keeps the separators in the result.
    pieces_and_seps = [chunk for chunk in re.split(r"([,。:])", sample) if chunk]
    print(f"保留分隔符: {pieces_and_seps}")

    # 7. re.compile() builds reusable pattern objects.
    print("\n7. re.compile() - 编译正则表达式")
    mail_finder = re.compile(mail_regex)
    phone_finder = re.compile(r"\d{3}-\d{4}-\d{4}")
    print(f"邮箱匹配: {mail_finder.findall(sample)}")
    print(f"电话匹配: {phone_finder.findall(sample)}")

    # 8. Flags.
    print("\n8. 标志参数")
    two_lines = "Hello WORLD\nPython Programming"
    # re.IGNORECASE ignores letter case.
    print(f"忽略大小写: {re.findall(r'hello', two_lines, re.IGNORECASE)}")
    # re.MULTILINE makes ^ match at every line start.
    line_heads = re.findall(r"^\w+", two_lines, re.MULTILINE)
    print(f"多行模式: {line_heads}")
    # re.DOTALL lets the dot match newlines.
    print(f"点号匹配换行: {re.findall(r'Hello.*Programming', two_lines, re.DOTALL)}")
    # Flags can be combined with |.
    both = re.findall(r"hello.*python", two_lines, re.IGNORECASE | re.DOTALL)
    print(f"组合标志: {both}")
# Run the re module basics demo.
re_module_basics()
# 2.2 分组和捕获
def regex_groups():
    """Demonstrate regex groups and print the outcome of every example.

    Covers numbered groups, named groups, non-capturing groups,
    backreferences, alternation between date layouts and log-line parsing.
    All output goes to stdout; nothing is returned.
    """
    print("=== 正则表达式分组 ===")

    # 1. Numbered groups.
    print("\n1. 基本分组")
    record = "姓名:张三,年龄:25,电话:138-1234-5678"
    numbered = re.search(r"姓名:(\w+),年龄:(\d+),电话:([\d-]+)", record)
    if numbered:
        person, years, phone = numbered.groups()
        print(f"完整匹配: {numbered.group(0)}")
        print(f"姓名: {person}")
        print(f"年龄: {years}")
        print(f"电话: {phone}")
        print(f"所有分组: {numbered.groups()}")

    # 2. Named groups.
    print("\n2. 命名分组")
    named = re.search(
        r"姓名:(?P<name>\w+),年龄:(?P<age>\d+),电话:(?P<phone>[\d-]+)", record
    )
    if named:
        fields = named.groupdict()
        for label, key in (("姓名", "name"), ("年龄", "age"), ("电话", "phone")):
            print(f"{label}: {fields[key]}")
        print(f"分组字典: {fields}")

    # 3. Capturing versus non-capturing groups on the same URL.
    print("\n3. 非捕获分组")
    address = "https://www.example.com"
    capturing = re.search(r"(https?)://(\w+\.\w+)", address)
    non_capturing = re.search(r"(?:https?)://(\w+\.\w+)", address)
    print(f"普通分组: {capturing.groups() if capturing else '未匹配'}")
    print(f"非捕获分组: {non_capturing.groups() if non_capturing else '未匹配'}")

    # 4. Backreferences.
    print("\n4. 分组引用")
    # \1 repeats whatever group 1 captured, which finds doubled text.
    doubled = re.findall(r"(\w+)\1", "这是是一个测试测试文本")
    print(f"重复的字符: {doubled}")
    # Backreferences in the replacement: turn HTML tags into Markdown.
    converted = "<b>粗体</b> <i>斜体</i>"
    for tag_regex, replacement in (
        (r"<b>(.*?)</b>", r"**\1**"),
        (r"<i>(.*?)</i>", r"*\1*"),
    ):
        converted = re.sub(tag_regex, replacement, converted)
    print(f"转换为Markdown: {converted}")

    # 5. Alternation between two date layouts.
    print("\n5. 条件分组")
    layout_regex = r"(\d{4}[-/]\d{2}[-/]\d{2})|(\d{2}[-/]\d{2}[-/]\d{4})"
    # Exactly one of the two groups participates in a match, so the index
    # of the last matched group identifies the layout.
    layout_names = {1: "年-月-日格式", 2: "日-月-年格式"}
    for date in ("2023-12-25", "2023/12/25", "25-12-2023", "25/12/2023"):
        found = re.search(layout_regex, date)
        if found:
            print(f"{date} - {layout_names[found.lastindex]}")

    # 6. Practical use: parsing log entries.
    print("\n6. 实际应用:解析日志")
    entry_regex = re.compile(
        r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<message>.*?) - (?P<details>.*)"
    )
    entries = (
        "2023-12-25 10:30:15 [INFO] 用户登录成功 - 用户ID: 12345",
        "2023-12-25 10:31:22 [ERROR] 数据库连接失败 - 错误代码: 500",
        "2023-12-25 10:32:10 [WARNING] 内存使用率过高 - 使用率: 85%",
    )
    for entry in entries:
        parsed = entry_regex.search(entry)
        if not parsed:
            continue
        print(f"时间: {parsed['timestamp']}")
        print(f"级别: {parsed['level']}")
        print(f"消息: {parsed['message']}")
        print(f"详情: {parsed['details']}")
        print("-" * 40)
# Run the grouping demo.
regex_groups()
# 2.3 前瞻和后顾断言
def regex_lookahead_lookbehind():
    """Demonstrate lookahead and lookbehind assertions with printed examples.

    Covers positive/negative lookahead, positive/negative lookbehind,
    password and phone validation, attribute extraction and function-call
    extraction. All output goes to stdout; nothing is returned.
    """
    print("=== 前瞻和后顾断言 ===")

    # 1. Positive lookahead (?=...)
    print("\n1. 正向前瞻 (?=...)")
    text = "password123 username456 email789"
    # Match letters only. With \w+ the greedy match swallows the digits and
    # then backtracks by one character, yielding "password12" instead of
    # the word "password".
    positive_lookahead = re.findall(r"[A-Za-z]+(?=\d)", text)
    print(f"后面跟数字的单词: {positive_lookahead}")

    # 2. Negative lookahead (?!...)
    print("\n2. 负向前瞻 (?!...)")
    # A sample that mixes words with and without a numeric suffix, so the
    # assertion has something to reject and something to accept.
    mixed_text = "password123 username email789 admin"
    # The lookahead also rejects letters; otherwise backtracking would
    # return a truncated word such as "passwor" (followed by "d").
    negative_lookahead = re.findall(r"[A-Za-z]+(?![A-Za-z\d])", mixed_text)
    print(f"文本: {mixed_text}")
    print(f"后面不跟数字的单词: {negative_lookahead}")

    # 3. Positive lookbehind (?<=...)
    print("\n3. 正向后顾 (?<=...)")
    # Digits preceded by a letter ([A-Za-z] rather than \w, which would
    # also accept a preceding digit or underscore).
    positive_lookbehind = re.findall(r"(?<=[A-Za-z])\d+", text)
    print(f"前面有字母的数字: {positive_lookbehind}")

    # 4. Negative lookbehind (?<!...)
    print("\n4. 负向后顾 (?<!...)")
    text_with_spaces = "abc 123 def456 ghi 789"
    # \w is required here: excluding only letters would let the match start
    # in the middle of "456" (at "56", which is preceded by a digit).
    negative_lookbehind = re.findall(r"(?<!\w)\d+", text_with_spaces)
    print(f"前面没有字母的数字: {negative_lookbehind}")

    # 5. Practical example.
    print("\n5. 实际应用示例")
    # Password rule: at least 8 characters with lowercase, uppercase, digit.
    passwords = [
        "password",
        "Password",
        "Password123",
        "pass123",
        "PASSWORD123",
        "MySecurePass1",
    ]
    # Each lookahead checks one requirement without consuming characters.
    password_pattern = re.compile(r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$")
    print("密码验证(至少8位,包含大小写字母和数字):")
    for pwd in passwords:
        # fullmatch: "$" alone would tolerate a trailing newline.
        is_valid = password_pattern.fullmatch(pwd) is not None
        print(f"{pwd:15} - {'有效' if is_valid else '无效'}")

    # 6. Extracting data in a specific format.
    print("\n6. 提取特定格式的数据")
    html = '<img src="image.jpg" alt="图片" width="100"> <a href="link.html">链接</a>'
    # Value of the src attribute.
    src_values = re.findall(r'(?<=src=")[^"]*(?=")', html)
    print(f"src属性值: {src_values}")
    # Value of the href attribute.
    href_values = re.findall(r'(?<=href=")[^"]*(?=")', html)
    print(f"href属性值: {href_values}")

    # 7. A more involved validation example.
    print("\n7. 复杂的验证示例")
    phone_numbers = [
        "13812345678",
        "15987654321",
        "12345678901",
        "1381234567",
        "138123456789",
    ]
    # Mainland China mobile rule used here: starts with 1, second digit is
    # 3-9, eleven digits in total.
    china_mobile_pattern = re.compile(r"^1[3-9]\d{9}$")
    print("中国手机号验证:")
    for phone in phone_numbers:
        is_valid = china_mobile_pattern.fullmatch(phone) is not None
        print(f"{phone:12} - {'有效' if is_valid else '无效'}")

    # 8. Extracting pieces of call expressions.
    print("\n8. 提取嵌套结构")
    code = "print('Hello') + len('World') + max(1, 2, 3)"
    # Function names: identifiers directly followed by "(".
    function_names = re.findall(r"\w+(?=\()", code)
    print(f"函数名: {function_names}")
    # Text between "(" and the next ")".
    parentheses_content = re.findall(r"(?<=\()[^)]*(?=\))", code)
    print(f"括号内容: {parentheses_content}")
# Run the lookahead/lookbehind demo.
regex_lookahead_lookbehind()
# 三、常用正则表达式模式
# 3.1 数据验证模式
def common_validation_patterns():
    """Print validation results for common data formats.

    Validates sample e-mail addresses, mobile numbers, ID numbers, URLs,
    IPv4 addresses and passwords against fixed patterns. Validation uses
    ``re.fullmatch`` so a value with a trailing newline is never accepted.
    All output goes to stdout; nothing is returned.
    """
    print("=== 常用数据验证模式 ===")

    # 1. E-mail validation.
    print("\n1. 邮箱验证")
    emails = [
        "user@example.com",
        "test.email@domain.org",
        "invalid-email",
        "user@",
        "@domain.com",
        "user.name+tag@example.co.uk",
    ]
    # Simple pattern.
    simple_email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    # Stricter pattern: local part and domain must start and end with an
    # alphanumeric character.
    strict_email_pattern = r"^[a-zA-Z0-9]([a-zA-Z0-9._-]*[a-zA-Z0-9])?@[a-zA-Z0-9]([a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}$"
    print("邮箱验证结果:")
    for email in emails:
        simple_valid = re.fullmatch(simple_email_pattern, email) is not None
        strict_valid = re.fullmatch(strict_email_pattern, email) is not None
        print(f"{email:25} 简单: {'✓' if simple_valid else '✗'} 严格: {'✓' if strict_valid else '✗'}")

    # 2. Mobile number validation.
    print("\n2. 手机号验证")
    phone_numbers = [
        "13812345678",
        "138-1234-5678",
        "138 1234 5678",
        "+86 138 1234 5678",
        "12345678901",
        "1381234567",
    ]
    # One pattern per accepted layout of a mainland China mobile number.
    china_mobile_patterns = {
        "基本格式": r"^1[3-9]\d{9}$",
        "带连字符": r"^1[3-9]\d-\d{4}-\d{4}$",
        "带空格": r"^1[3-9]\d \d{4} \d{4}$",
        "国际格式": r"^\+86 1[3-9]\d \d{4} \d{4}$",
    }
    print("手机号验证结果:")
    for phone in phone_numbers:
        print(f"{phone:20}", end=" ")
        for pattern_name, pattern in china_mobile_patterns.items():
            is_valid = re.fullmatch(pattern, phone) is not None
            print(f"{pattern_name}: {'✓' if is_valid else '✗'}", end=" ")
        print()

    # 3. ID number validation.
    print("\n3. 身份证号验证")
    id_numbers = [
        "110101199001011234",
        "11010119900101123X",
        "110101199013011234",  # invalid month
        "110101199001321234",  # invalid day
        "12345678901234567",  # wrong length
    ]
    # Simplified pattern: checks the layout and the month/day ranges only.
    id_pattern = r"^[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dX]$"
    print("身份证号验证:")
    for id_num in id_numbers:
        is_valid = re.fullmatch(id_pattern, id_num) is not None
        print(f"{id_num:20} - {'有效' if is_valid else '无效'}")

    # 4. URL validation.
    print("\n4. URL验证")
    urls = [
        "https://www.example.com",
        "http://example.com/path?param=value",
        "ftp://files.example.com",
        "www.example.com",
        "invalid-url",
        "https://sub.domain.example.com:8080/path",
    ]
    url_pattern = r"^(https?|ftp)://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(:[0-9]+)?(/.*)?$"
    print("URL验证:")
    for url in urls:
        is_valid = re.fullmatch(url_pattern, url) is not None
        print(f"{url:40} - {'有效' if is_valid else '无效'}")

    # 5. IP address validation.
    print("\n5. IP地址验证")
    ip_addresses = [
        "192.168.1.1",
        "255.255.255.255",
        "0.0.0.0",
        "256.1.1.1",
        "192.168.1",
        "192.168.1.1.1",
    ]
    # IPv4: four dot-separated octets, each in the range 0-255.
    ipv4_pattern = r"^((25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)$"
    print("IPv4地址验证:")
    for ip in ip_addresses:
        is_valid = re.fullmatch(ipv4_pattern, ip) is not None
        print(f"{ip:15} - {'有效' if is_valid else '无效'}")

    # 6. Password strength.
    print("\n6. 密码强度验证")
    passwords = [
        "123456",
        "password",
        "Password123",
        "P@ssw0rd",
        "MySecureP@ss123",
        "weak",
    ]
    # Ordered from strongest to weakest: the first matching level is
    # reported. Testing the weakest rule first would label every password
    # of six or more characters as weak.
    password_patterns = {
        "强密码": r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&]).{8,}$",  # 8+ chars, upper/lower/digit/special
        "中等密码": r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$",  # 8+ chars, upper/lower/digit
        "弱密码": r"^.{6,}$",  # at least 6 chars
    }
    print("密码强度验证:")
    for password in passwords:
        print(f"{password:15}", end=" ")
        for strength, pattern in password_patterns.items():
            if re.fullmatch(pattern, password):
                print(f"- {strength}")
                break
        else:
            # No level matched at all.
            print("- 太弱")
# Run the validation patterns demo.
common_validation_patterns()
# 3.2 文本提取模式
def text_extraction_patterns():
    """Print the results of extracting structured data from sample texts.

    Extracts HTML tags and contents, dates and times, currency amounts,
    contact details and Markdown code snippets from fixed samples. All
    output goes to stdout; nothing is returned.
    """
    print("=== 文本提取模式 ===")

    # 1. HTML tags and contents.
    print("\n1. 提取HTML标签和内容")
    html_content = """
<html>
<head><title>网页标题</title></head>
<body>
<h1>主标题</h1>
<p>这是一个段落。</p>
<a href="https://example.com">链接文本</a>
<img src="image.jpg" alt="图片描述">
</body>
</html>
"""
    # Every tag.
    all_tags = re.findall(r"<[^>]+>", html_content)
    print(f"所有标签: {all_tags[:5]}...")  # show the first five only

    # Text of leaf elements. The content part excludes "<" so the match
    # cannot start at the outermost <html> element and swallow the whole
    # document, which would leave no text-only element to report.
    tag_contents = re.findall(r"<(\w+)[^>]*>([^<]*)</\1>", html_content)
    print("标签内容:")
    for tag, content in tag_contents:
        clean_content = content.strip()
        if clean_content:
            print(f" {tag}: {clean_content}")

    # Links as (href, text) pairs.
    links = re.findall(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', html_content)
    print(f"链接: {links}")

    # Images as (src, alt) pairs.
    images = re.findall(r'<img[^>]+src="([^"]+)"[^>]*alt="([^"]+)"[^>]*>', html_content)
    print(f"图片: {images}")

    # 2. Dates and times.
    print("\n2. 提取日期和时间")
    text_with_dates = """
会议安排:
- 2023年12月25日 上午10:00 圣诞节庆祝
- 2023-12-31 23:59 新年倒计时
- 12/25/2023 下午2:30 PM 项目讨论
- 25/12/2023 晚上8点 聚餐
"""
    # US dates are month/day/year and European dates are day/month/year, so
    # the month position is range-checked (1-12) to tell them apart. A date
    # such as 01/02/2023 is valid in both layouts and matches both.
    month = r"(?:0?[1-9]|1[0-2])"
    day = r"(?:0?[1-9]|[12]\d|3[01])"
    date_patterns = {
        "中文日期": r"\d{4}年\d{1,2}月\d{1,2}日",
        "ISO日期": r"\d{4}-\d{2}-\d{2}",
        "美式日期": rf"\b{month}/{day}/\d{{4}}\b",
        "欧式日期": rf"\b{day}/{month}/\d{{4}}\b",
    }
    # Time patterns. The Chinese pattern accepts both the half-width and
    # the full-width colon.
    time_patterns = {
        "24小时制": r"\d{1,2}:\d{2}",
        "12小时制": r"\d{1,2}:\d{2}\s*[AP]M",
        "中文时间": r"[上下]午\d{1,2}[::]\d{2}|晚上\d{1,2}点",
    }
    print("提取的日期:")
    for pattern_name, pattern in date_patterns.items():
        dates = re.findall(pattern, text_with_dates)
        if dates:
            print(f" {pattern_name}: {dates}")
    print("提取的时间:")
    for pattern_name, pattern in time_patterns.items():
        times = re.findall(pattern, text_with_dates)
        if times:
            print(f" {pattern_name}: {times}")

    # 3. Currency amounts.
    print("\n3. 提取货币和数字")
    financial_text = """
商品价格:
- iPhone 15: ¥7999
- MacBook Pro: $2,399.00
- 咖啡: 25.50元
- 汽车: €45,000
- 房价: 1,200,000 RMB
"""
    currency_patterns = {
        "人民币符号": r"¥[\d,]+(?:\.\d{2})?",
        "美元": r"\$[\d,]+(?:\.\d{2})?",
        "欧元": r"€[\d,]+(?:\.\d{2})?",
        "人民币文字": r"[\d,]+(?:\.\d{2})?元",
        "RMB": r"[\d,]+(?:\.\d{2})?\s*RMB",
    }
    print("提取的货币:")
    for currency_name, pattern in currency_patterns.items():
        amounts = re.findall(pattern, financial_text)
        if amounts:
            print(f" {currency_name}: {amounts}")

    # 4. Contact details.
    print("\n4. 提取联系信息")
    contact_text = """
联系方式:
电话:138-1234-5678, 010-12345678
邮箱:contact@example.com, support@company.org
QQ:123456789
微信:wechat_id_123
地址:北京市朝阳区某某街道123号
"""
    # The label patterns accept both the half-width and full-width colon.
    contact_patterns = {
        "手机号": r"1[3-9]\d-\d{4}-\d{4}",
        "固定电话": r"\d{3,4}-\d{7,8}",
        "邮箱": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "QQ号": r"QQ[::]\s*(\d{5,12})",
        "微信号": r"微信[::]\s*([a-zA-Z0-9_-]+)",
    }
    print("提取的联系信息:")
    for info_type, pattern in contact_patterns.items():
        contacts = re.findall(pattern, contact_text)
        if contacts:
            print(f" {info_type}: {contacts}")

    # 5. Code snippets.
    print("\n5. 提取代码片段")
    code_text = """
Python代码示例:
```python
def hello_world():
print("Hello, World!")
```
JavaScript代码:
```javascript
function greet(name) {
console.log(`Hello, ${name}!`);
}
```
内联代码:使用 `print()` 函数输出内容。
"""
    # Fenced code blocks as (language, code) pairs.
    code_block_pattern = r"```(\w+)\n(.*?)```"
    code_blocks = re.findall(code_block_pattern, code_text, re.DOTALL)
    print("代码块:")
    for language, code in code_blocks:
        print(f" 语言: {language}")
        print(f" 代码: {code.strip()[:50]}...")  # show the first 50 characters only

    # Inline code. Fenced blocks are removed first: otherwise the backticks
    # of the fences (and of the JavaScript template literal) pair up with
    # each other and the real inline snippet is never found.
    prose_only = re.sub(code_block_pattern, "", code_text, flags=re.DOTALL)
    inline_code_pattern = r"`([^`\n]+)`"
    inline_codes = re.findall(inline_code_pattern, prose_only)
    print(f"内联代码: {inline_codes}")
# Run the text extraction demo.
text_extraction_patterns()
# 四、正则表达式实战应用
# 4.1 日志分析器
import re
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Optional, Tuple
class LogAnalyzer:
    """Parse log lines in several common formats and aggregate statistics.

    Typical use: ``analyze_logs(lines)`` to collect statistics, then
    ``generate_report(stats)`` to render them as text.
    """

    def __init__(self):
        # Regex source for each supported format. The order is the
        # auto-detection order and matching is prefix-based, so the more
        # specific "combined" layout must be tried before "common";
        # otherwise every combined line is detected as common and loses its
        # referer and user agent.
        # The nginx pattern is identical to apache_combined, so auto
        # detection reports such lines as apache_combined; the key is kept
        # for callers that pass log_format='nginx' explicitly.
        # The size field accepts "-", which is logged when no bytes were sent.
        self.log_patterns = {
            'apache_combined': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+|-) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"',
            'nginx': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+|-) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"',
            'apache_common': r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>\w+) (?P<url>[^"]+) HTTP/[^"]+" (?P<status>\d+) (?P<size>\d+|-)',
            'python_logging': r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (?P<logger>\w+) - (?P<level>\w+) - (?P<message>.*)',
            'syslog': r'(?P<timestamp>\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\w+) (?P<process>\w+)\[(?P<pid>\d+)\]: (?P<message>.*)'
        }
        # Compiled once so that parsing many lines does not recompile.
        self.compiled_patterns = {name: re.compile(pattern) for name, pattern in self.log_patterns.items()}

    def parse_log_line(self, line: str, log_format: str = 'auto') -> Optional[Dict]:
        """Parse a single log line.

        Args:
            line: Raw log line; surrounding whitespace is ignored.
            log_format: A key of ``log_patterns`` or ``'auto'`` to try every
                known format in turn.

        Returns:
            The named groups of the matching pattern plus a ``'format'`` key.
            In auto mode an unrecognized line yields
            ``{'raw': line, 'format': 'unknown'}``. ``None`` is returned for
            a blank line, or when an explicit ``log_format`` is unknown or
            does not match.
        """
        line = line.strip()
        if not line:
            return None

        if log_format == 'auto':
            for format_name, pattern in self.compiled_patterns.items():
                match = pattern.match(line)
                if match:
                    result = match.groupdict()
                    result['format'] = format_name
                    return result
            return {'raw': line, 'format': 'unknown'}

        pattern = self.compiled_patterns.get(log_format)
        match = pattern.match(line) if pattern else None
        if match is None:
            return None
        result = match.groupdict()
        result['format'] = log_format
        return result

    @staticmethod
    def _hour_bucket(timestamp: str) -> Optional[str]:
        """Return the 'YYYY-MM-DD HH:00' bucket of a timestamp, or None.

        Understands the Apache layout (recognized by a '/') and the Python
        logging layout. Any other layout, e.g. syslog timestamps that carry
        no year, yields None.
        """
        try:
            if '/' in timestamp:  # Apache: 25/Dec/2023:10:30:15 +0800
                dt = datetime.strptime(timestamp.split()[0], '%d/%b/%Y:%H:%M:%S')
            else:  # Python logging: 2023-12-25 10:30:15,123
                dt = datetime.strptime(timestamp.split(',')[0], '%Y-%m-%d %H:%M:%S')
        except (ValueError, IndexError):
            # The time distribution is best-effort: skip what cannot be parsed.
            return None
        return dt.strftime('%Y-%m-%d %H:00')

    def analyze_logs(self, log_lines: List[str]) -> Dict:
        """Aggregate statistics over a sequence of log lines.

        Args:
            log_lines: Raw log lines; blank lines are counted but ignored.

        Returns:
            A dict with the total and successfully parsed line counts,
            counters per format, status code, method, IP, user agent and
            URL, the list of error entries and the per-hour traffic.
            Lines of unknown format are counted under ``formats['unknown']``
            but not as parsed lines.
        """
        stats = {
            'total_lines': 0,
            'parsed_lines': 0,
            'formats': Counter(),
            'status_codes': Counter(),
            'methods': Counter(),
            'ips': Counter(),
            'user_agents': Counter(),
            'error_logs': [],
            'top_urls': Counter(),
            'hourly_traffic': defaultdict(int)
        }
        # Parsed field name -> name of the counter that tallies it.
        counted_fields = (
            ('status', 'status_codes'),
            ('method', 'methods'),
            ('ip', 'ips'),
            ('user_agent', 'user_agents'),
            ('url', 'top_urls'),
        )

        for line in log_lines:
            stats['total_lines'] += 1
            parsed = self.parse_log_line(line)
            if not parsed:
                continue

            detected = parsed.get('format', 'unknown')
            stats['formats'][detected] += 1
            if detected == 'unknown':
                # No pattern matched: nothing to analyze, and the line must
                # not inflate the "successfully parsed" figure.
                continue
            stats['parsed_lines'] += 1

            for field, counter_name in counted_fields:
                if field in parsed:
                    stats[counter_name][parsed[field]] += 1

            # Time distribution.
            if 'timestamp' in parsed:
                hour_key = self._hour_bucket(parsed['timestamp'])
                if hour_key is not None:
                    stats['hourly_traffic'][hour_key] += 1

            # Error entries: HTTP 4xx/5xx or an ERROR/CRITICAL log level.
            is_http_error = 'status' in parsed and parsed['status'].startswith(('4', '5'))
            is_level_error = 'level' in parsed and parsed['level'] in ('ERROR', 'CRITICAL')
            if is_http_error or is_level_error:
                stats['error_logs'].append(parsed)

        return stats

    def generate_report(self, stats: Dict) -> str:
        """Render the statistics produced by ``analyze_logs`` as text.

        Args:
            stats: The dict returned by ``analyze_logs``.

        Returns:
            The report as a single newline-joined string.
        """
        total = stats['total_lines']
        # Guard against empty input, which would otherwise divide by zero.
        parsed_pct = stats['parsed_lines'] / total * 100 if total else 0.0

        report = []
        report.append("=== 日志分析报告 ===")
        report.append(f"总行数: {total}")
        report.append(f"成功解析: {stats['parsed_lines']} ({parsed_pct:.1f}%)")

        # Log format distribution.
        report.append("\n日志格式分布:")
        for format_name, count in stats['formats'].most_common():
            report.append(f" {format_name}: {count}")

        # HTTP status code distribution.
        if stats['status_codes']:
            report.append("\nHTTP状态码分布:")
            for status, count in stats['status_codes'].most_common(10):
                report.append(f" {status}: {count}")

        # Request method distribution.
        if stats['methods']:
            report.append("\n请求方法分布:")
            for method, count in stats['methods'].most_common():
                report.append(f" {method}: {count}")

        # Most active client IPs.
        if stats['ips']:
            report.append("\n访问最多的IP (Top 10):")
            for ip, count in stats['ips'].most_common(10):
                report.append(f" {ip}: {count}")

        # Most requested URLs.
        if stats['top_urls']:
            report.append("\n访问最多的URL (Top 10):")
            for url, count in stats['top_urls'].most_common(10):
                report.append(f" {url}: {count}")

        # Error entries (first five).
        if stats['error_logs']:
            report.append("\n错误日志 (显示前5条):")
            for i, error in enumerate(stats['error_logs'][:5]):
                report.append(f" {i+1}. {error.get('timestamp', 'N/A')} - {error.get('status', error.get('level', 'N/A'))} - {error.get('url', error.get('message', 'N/A'))}")

        return "\n".join(report)