PHP正则表达式实战指南:模式匹配与文本处理
Orion K Lv6

PHP正则表达式实战指南:模式匹配与文本处理

正则表达式是处理文本的强大工具,在数据验证、文本解析和内容提取中发挥着重要作用。作为PHP开发者,掌握正则表达式能大大提高我们处理字符串的效率。本文将分享一些实用的正则表达式技巧和最佳实践。

正则表达式基础

1. 基本语法和函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
<?php
// 正则表达式基础示例
echo "=== 正则表达式基础 ===\n";

// 基本匹配函数
/**
 * Walks through the four core PCRE functions (preg_match, preg_match_all,
 * preg_replace, preg_split) and echoes the result of every step.
 *
 * @return void
 */
function demonstrateBasicFunctions() {
    $text = "Hello World! My email is john@example.com and phone is 13812345678";

    // preg_match: stops at the first match and returns the match count (0 or 1).
    echo "=== preg_match 示例 ===\n";

    // Find an email address.
    $emailPattern = '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/';
    if (preg_match($emailPattern, $text, $matches)) {
        echo "找到邮箱: " . $matches[0] . "\n";
    }

    // Find a mainland-China mobile number.
    $phonePattern = '/1[3-9]\d{9}/';
    if (preg_match($phonePattern, $text, $matches)) {
        echo "找到手机号: " . $matches[0] . "\n";
    }

    // preg_match_all: collects every match in the subject.
    echo "\n=== preg_match_all 示例 ===\n";

    $multiText = "联系方式: john@example.com, jane@test.com, admin@site.org";
    if (preg_match_all($emailPattern, $multiText, $matches)) {
        echo "找到 " . count($matches[0]) . " 个邮箱:\n";
        foreach ($matches[0] as $email) {
            echo "- $email\n";
        }
    }

    // preg_replace: pattern-based substitution.
    echo "\n=== preg_replace 示例 ===\n";

    $sensitiveText = "我的手机号是13812345678,邮箱是john@example.com";

    // Mask the middle four digits of the mobile number.
    $hiddenPhone = preg_replace('/(1[3-9]\d)(\d{4})(\d{4})/', '$1****$3', $sensitiveText);
    echo "隐藏手机号: $hiddenPhone\n";

    // Keep at most the first three characters of the email user name.
    $hiddenEmail = preg_replace('/([a-zA-Z0-9._%+-]{1,3})[a-zA-Z0-9._%+-]*@/', '$1***@', $hiddenPhone);
    echo "隐藏邮箱: $hiddenEmail\n";

    // preg_split: split on a pattern instead of a fixed string.
    echo "\n=== preg_split 示例 ===\n";

    $csvData = "张三,25,北京;李四,30,上海;王五,28,广州";
    $records = preg_split('/[;,]/', $csvData);
    echo "分割结果: " . json_encode($records, JSON_UNESCAPED_UNICODE) . "\n";

    // A more involved split: break a log line into timestamp, level, message and detail.
    // Separators are a square bracket (with any surrounding whitespace) or a
    // whitespace-delimited hyphen, so the hyphens inside the date are left alone
    // and no bracket leaks into the resulting pieces.
    $logEntry = "2023-07-10 16:45:30 [INFO] User login successful - IP: 192.168.1.100";
    $logParts = preg_split('/\s*[\[\]]\s*|\s+-\s+/', $logEntry, -1, PREG_SPLIT_NO_EMPTY);
    echo "日志分割: " . json_encode($logParts, JSON_UNESCAPED_UNICODE) . "\n";
}

demonstrateBasicFunctions();

// 正则表达式工具类
/**
 * Small static facade over the preg_* functions with a library of named patterns.
 *
 * Every method accepts either a key of PATTERNS or a complete delimited
 * regular expression where a pattern is expected (clean/split/highlight take
 * raw expressions only).
 */
class RegexHelper
{
    // Named validation patterns. They are anchored with ^...$ so that they
    // describe a whole value; extract() removes the anchors before searching.
    const PATTERNS = [
        'email' => '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/',
        'phone' => '/^1[3-9]\d{9}$/',
        'idcard' => '/^\d{17}[\dX]$/',
        'url' => '/^https?:\/\/[^\s\/$.?#].[^\s]*$/',
        'ip' => '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/',
        'date' => '/^\d{4}-\d{2}-\d{2}$/',
        'time' => '/^\d{2}:\d{2}:\d{2}$/',
        'chinese' => '/^[\x{4e00}-\x{9fa5}]+$/u',
        'password' => '/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/'
    ];

    /**
     * Tests a whole value against a named pattern or a custom expression.
     *
     * @param string $input   Value to test.
     * @param string $pattern Key of PATTERNS, or a full regular expression.
     * @return int|false      Result of preg_match(): 1 on match, 0 otherwise, false on error.
     */
    public static function validate($input, $pattern)
    {
        $regex = self::PATTERNS[$pattern] ?? $pattern;

        return preg_match($regex, $input);
    }

    /**
     * Finds the first (or every) occurrence of a pattern inside a longer text.
     *
     * Named patterns are anchored for validation and could therefore never
     * match in the middle of a text; their anchors are stripped here so that
     * e.g. extract($text, 'phone') really locates a phone number in $text.
     * Custom expressions are used exactly as given.
     *
     * @param string $text    Text to search.
     * @param string $pattern Key of PATTERNS, or a full regular expression.
     * @param bool   $all     When true return all matches, otherwise only the first.
     * @return string|array|null First match (null if none), or list of matches when $all is true.
     */
    public static function extract($text, $pattern, $all = false)
    {
        if (isset(self::PATTERNS[$pattern])) {
            $pattern = self::unanchor(self::PATTERNS[$pattern]);
        }

        if ($all) {
            preg_match_all($pattern, $text, $matches);
            return $matches[0] ?? [];
        }

        preg_match($pattern, $text, $matches);
        return $matches[0] ?? null;
    }

    /**
     * Replaces everything matching $pattern (by default with nothing).
     *
     * @param string $text
     * @param string $pattern     Full regular expression.
     * @param string $replacement Replacement text; may use $n back-references.
     * @return string|null        Result of preg_replace().
     */
    public static function clean($text, $pattern, $replacement = '')
    {
        return preg_replace($pattern, $replacement, $text);
    }

    /**
     * Splits $text on $pattern and drops empty pieces.
     *
     * @param string $text
     * @param string $pattern Full regular expression.
     * @return array|false    Result of preg_split().
     */
    public static function split($text, $pattern)
    {
        return preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Wraps every match of $pattern in an HTML element.
     *
     * Neither $text nor $highlightTag is escaped here; callers that render the
     * result as HTML must escape untrusted text beforehand.
     *
     * @param string $text
     * @param string $pattern      Full regular expression.
     * @param string $highlightTag Element name used for the wrapper.
     * @return string|null         Result of preg_replace().
     */
    public static function highlight($text, $pattern, $highlightTag = 'mark')
    {
        // "$0" is the whole match in preg_replace() replacement syntax.
        $replacement = '<' . $highlightTag . '>$0</' . $highlightTag . '>';

        return preg_replace($pattern, $replacement, $text);
    }

    /**
     * Turns a "/^...$/flags" pattern into "/.../flags".
     *
     * Patterns that do not have exactly that shape are returned unchanged.
     *
     * @param string $pattern
     * @return string
     */
    private static function unanchor($pattern)
    {
        return preg_replace('/^\/\^(.*)\$\/([a-zA-Z]*)$/s', '/$1/$2', $pattern);
    }
}

// Demo: exercising the RegexHelper utility class.
echo "\n=== 正则工具类示例 ===\n";

// Validation: one sample value per named pattern.
$samples = [
    'email' => 'test@example.com',
    'phone' => '13812345678',
    'url' => 'https://www.example.com',
    'chinese' => '中文测试'
];

foreach ($samples as $patternName => $sample) {
    $verdict = RegexHelper::validate($sample, $patternName) ? '通过' : '失败';
    echo "$patternName 验证 '$sample': " . $verdict . "\n";
}

// Extraction: pull contact details out of free text.
$contactText = "请联系我们:电话13812345678,邮箱support@example.com,网站https://www.example.com";

echo "\n提取信息:\n";
$wanted = ['手机号' => 'phone', '邮箱' => 'email', '网址' => 'url'];
foreach ($wanted as $caption => $patternName) {
    $found = RegexHelper::extract($contactText, $patternName);
    echo "$caption: " . ($found ?: '未找到') . "\n";
}

// Cleaning: strip tags, then keep digits only.
$dirtyText = "这是一段包含<script>alert('xss')</script>的文本和一些数字123456";

$withoutTags = RegexHelper::clean($dirtyText, '/<[^>]*>/', '');
echo "\n清理HTML标签: $withoutTags\n";

$digitsOnly = RegexHelper::clean($dirtyText, '/[^\d]/', '');
echo "只保留数字: $digitsOnly\n";
?>

2. 高级模式匹配

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
<?php
// 高级正则表达式模式
echo "=== 高级模式匹配 ===\n";

/**
 * Collection of parsing and validation helpers built on capture groups.
 */
class AdvancedRegex
{
    /**
     * Splits an http(s) URL into its components.
     *
     * The host stops at the first "/", "?" or "#", and the path stops at "?"
     * or "#", so a URL without a path ("https://example.com?x=1") is still
     * decomposed correctly. A missing path is reported as "/"; missing query
     * and fragment are reported as empty strings.
     *
     * @param string $url
     * @return array|false Map with full_url, protocol, domain, path, query, fragment; false if not a URL.
     */
    public static function parseUrl($url)
    {
        $pattern = '/^(https?):\/\/([^\/\s?#]+)(\/[^\s?#]*)?(\?[^\s#]*)?(#[^\s]*)?$/';

        if (!preg_match($pattern, $url, $m)) {
            return false;
        }

        // An optional group that did not participate is '' when a later group
        // matched and absent when it is trailing, so both cases are checked.
        $path = (isset($m[3]) && $m[3] !== '') ? $m[3] : '/';

        return [
            'full_url' => $m[0],
            'protocol' => $m[1],
            'domain' => $m[2],
            'path' => $path,
            'query' => $m[4] ?? '',
            'fragment' => $m[5] ?? ''
        ];
    }

    /**
     * Splits an email address into user name, domain and top-level domain.
     *
     * @param string $email
     * @return array|false Map with full_email, username, domain, tld; false if the format is not recognised.
     */
    public static function parseEmail($email)
    {
        $pattern = '/^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$/';

        if (!preg_match($pattern, $email, $m)) {
            return false;
        }

        return [
            'full_email' => $m[0],
            'username' => $m[1],
            'domain' => $m[2],
            'tld' => $m[3]
        ];
    }

    /**
     * Collects the attributes of every occurrence of a tag.
     *
     * Values may be double-quoted, single-quoted or unquoted. Attribute names
     * may contain "-" and ":" (data-*, aria-*, xlink:href). Falsy values such
     * as "0" or "" are preserved. Attributes without "=" and tags without any
     * attribute are not reported.
     *
     * @param string $html
     * @param string $tag  Literal tag name, matched case-insensitively.
     * @return array       One name => value map per matched tag.
     */
    public static function extractHtmlAttributes($html, $tag)
    {
        // The tag name is quoted so that it is always treated literally.
        $tagPattern = '/<' . preg_quote($tag, '/') . '\s+([^>]*)>/i';
        // (?| ... ) is a branch-reset group: whichever quoting style matches,
        // the value always lands in group 2.
        $attrPattern = '/([\w:-]+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]*))/';

        $results = [];
        if (!preg_match_all($tagPattern, $html, $tagMatches)) {
            return $results;
        }

        foreach ($tagMatches[1] as $attributeString) {
            $attributes = [];

            if (preg_match_all($attrPattern, $attributeString, $attrMatches, PREG_SET_ORDER)) {
                foreach ($attrMatches as $attr) {
                    $attributes[$attr[1]] = $attr[2];
                }
            }

            $results[] = $attributes;
        }

        return $results;
    }

    /**
     * Scores a password against six independent checks.
     *
     * Length is counted in characters, not bytes. A score of 5 or more is
     * "strong", 3 or 4 is "medium", anything lower is "weak".
     *
     * @param string $password
     * @return array Map with score, strength, checks and suggestions.
     */
    public static function validatePasswordStrength($password)
    {
        $checks = [
            'length' => mb_strlen($password) >= 8,
            'lowercase' => preg_match('/[a-z]/', $password),
            'uppercase' => preg_match('/[A-Z]/', $password),
            'digit' => preg_match('/\d/', $password),
            'special' => preg_match('/[@$!%*?&]/', $password),
            'no_common' => !preg_match('/^(password|123456|qwerty|admin)$/i', $password)
        ];

        // Booleans and 0/1 integers both sum as 0/1.
        $score = array_sum($checks);

        if ($score >= 5) {
            $strength = 'strong';
        } elseif ($score >= 3) {
            $strength = 'medium';
        } else {
            $strength = 'weak';
        }

        return [
            'score' => $score,
            'strength' => $strength,
            'checks' => $checks,
            'suggestions' => self::getPasswordSuggestions($checks)
        ];
    }

    /**
     * Builds one improvement hint per failed check.
     *
     * @param array $checks Check name => truthy/falsy result.
     * @return array        List of hint strings, in check order.
     */
    private static function getPasswordSuggestions($checks)
    {
        $hints = [
            'length' => '密码长度至少8位',
            'lowercase' => '添加小写字母',
            'uppercase' => '添加大写字母',
            'digit' => '添加数字',
            'special' => '添加特殊字符(@$!%*?&)',
            'no_common' => '避免使用常见密码'
        ];

        $suggestions = [];
        foreach ($hints as $check => $hint) {
            if (!$checks[$check]) {
                $suggestions[] = $hint;
            }
        }

        return $suggestions;
    }

    /**
     * Finds dates in several notations inside free text.
     *
     * Results are grouped by notation in the order of the pattern table, not
     * by position in the text.
     *
     * @param string $text
     * @return array List of ['date' => ..., 'format' => ...] entries.
     */
    public static function extractDates($text)
    {
        $patterns = [
            'yyyy-mm-dd' => '/\b\d{4}-\d{2}-\d{2}\b/',
            'dd/mm/yyyy' => '/\b\d{2}\/\d{2}\/\d{4}\b/',
            'mm-dd-yyyy' => '/\b\d{2}-\d{2}-\d{4}\b/',
            // With the u modifier PHP also enables Unicode character
            // properties, so CJK characters count as word characters and \b
            // would fail between "是" and "2023" or after "日". Digit
            // look-arounds delimit the date instead.
            'chinese' => '/(?<!\d)\d{4}年\d{1,2}月\d{1,2}日/u'
        ];

        $dates = [];

        foreach ($patterns as $format => $pattern) {
            if (!preg_match_all($pattern, $text, $found)) {
                continue;
            }
            foreach ($found[0] as $date) {
                $dates[] = [
                    'date' => $date,
                    'format' => $format
                ];
            }
        }

        return $dates;
    }

    /**
     * Normalises a mainland-China mobile number and renders it in a layout.
     *
     * @param string $phone  Number in any punctuation.
     * @param string $format 'xxx-xxxx-xxxx', 'xxx xxxx xxxx' or '(xxx) xxxx-xxxx';
     *                       any other value returns the bare digits.
     * @return string|false  Formatted number, or false if it is not an 11-digit number starting 13-19.
     */
    public static function formatPhoneNumber($phone, $format = 'xxx-xxxx-xxxx')
    {
        // Keep digits only.
        $digits = preg_replace('/\D/', '', $phone);

        if (strlen($digits) !== 11 || !preg_match('/^1[3-9]/', $digits)) {
            return false;
        }

        $layouts = [
            'xxx-xxxx-xxxx' => '$1-$2-$3',
            'xxx xxxx xxxx' => '$1 $2 $3',
            '(xxx) xxxx-xxxx' => '($1) $2-$3'
        ];

        if (!is_string($format) || !isset($layouts[$format])) {
            return $digits;
        }

        return preg_replace('/(\d{3})(\d{4})(\d{4})/', $layouts[$format], $digits);
    }
}

// Demo: exercising the AdvancedRegex helpers.
echo "=== 高级正则功能示例 ===\n";

// URL decomposition.
$sampleUrl = "https://www.example.com/path/to/page?param=value#section";
$urlComponents = AdvancedRegex::parseUrl($sampleUrl);
if ($urlComponents) {
    echo "URL解析结果:\n";
    foreach ($urlComponents as $name => $component) {
        echo "- $name: $component\n";
    }
}

// Email decomposition.
echo "\n邮箱解析:\n";
$sampleEmail = "john.doe@example.com";
$emailComponents = AdvancedRegex::parseEmail($sampleEmail);
if ($emailComponents) {
    foreach ($emailComponents as $name => $component) {
        echo "- $name: $component\n";
    }
}

// HTML attribute extraction.
echo "\nHTML属性提取:\n";
$markup = '<img src="image.jpg" alt="测试图片" width="300" height="200">';
foreach (AdvancedRegex::extractHtmlAttributes($markup, 'img') as $imageAttributes) {
    echo "图片属性: " . json_encode($imageAttributes, JSON_UNESCAPED_UNICODE) . "\n";
}

// Password strength scoring.
echo "\n密码强度验证:\n";
foreach (['123456', 'Password1', 'StrongPass123!'] as $candidate) {
    $verdict = AdvancedRegex::validatePasswordStrength($candidate);
    echo "密码 '$candidate': {$verdict['strength']} (得分: {$verdict['score']}/6)\n";
    if (!empty($verdict['suggestions'])) {
        echo " 建议: " . implode(', ', $verdict['suggestions']) . "\n";
    }
}

// Date extraction.
echo "\n日期提取:\n";
$textWithDates = "项目开始于2023-07-10,截止日期是2023年12月31日,会议安排在15/08/2023。";
foreach (AdvancedRegex::extractDates($textWithDates) as $hit) {
    echo "找到日期: {$hit['date']} (格式: {$hit['format']})\n";
}

// Phone number formatting.
echo "\n电话号码格式化:\n";
foreach (['13812345678', '138-1234-5678', '(138) 1234-5678'] as $rawPhone) {
    $pretty = AdvancedRegex::formatPhoneNumber($rawPhone, 'xxx-xxxx-xxxx');
    echo "原号码: $rawPhone -> 格式化: " . ($pretty ?: '无效号码') . "\n";
}
?>

3. 文本处理和内容提取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
<?php
// 文本处理和内容提取
echo "=== 文本处理和内容提取 ===\n";

/**
 * Text utilities: summaries, keywords, language guessing, contact extraction,
 * whitespace/punctuation clean-up and simple statistics.
 *
 * All methods expect UTF-8 input and rely on the mbstring extension.
 */
class TextProcessor
{
    /**
     * Produces a plain-text summary of at most $maxLength characters.
     *
     * Tags are stripped and whitespace is collapsed first. A text that needs
     * cutting is shortened to the last space inside the limit (when there is
     * one) and gets "..." appended.
     *
     * @param string $content   Text or HTML.
     * @param int    $maxLength Maximum number of characters before the ellipsis.
     * @return string
     */
    public static function extractSummary($content, $maxLength = 200)
    {
        $text = strip_tags($content);
        $text = preg_replace('/\s+/', ' ', trim($text));

        if (mb_strlen($text) <= $maxLength) {
            return $text;
        }

        $summary = mb_substr($text, 0, $maxLength);

        // Prefer cutting at a word boundary; text without spaces (e.g. pure
        // Chinese) is cut at the character limit.
        $lastSpace = mb_strrpos($summary, ' ');
        if ($lastSpace !== false) {
            $summary = mb_substr($summary, 0, $lastSpace);
        }

        return $summary . '...';
    }

    /**
     * Returns the most frequent words of a text.
     *
     * Words are whitespace-delimited, so a run of Chinese characters without
     * spaces counts as a single word; this is not a word segmenter.
     *
     * @param string $text
     * @param int    $minLength Minimum word length in characters.
     * @param int    $maxCount  Maximum number of keywords returned.
     * @return array            Keywords as strings, most frequent first.
     */
    public static function extractKeywords($text, $minLength = 3, $maxCount = 10)
    {
        // Multibyte-aware lower-casing; strtolower() works on bytes and leaves
        // non-ASCII capitals untouched, which splits one word into two keys.
        $text = mb_strtolower($text, 'UTF-8');
        $text = preg_replace('/[^\w\s\x{4e00}-\x{9fa5}]/u', ' ', $text);
        if ($text === null) {
            // preg_replace() fails on malformed UTF-8; there is nothing to count.
            return [];
        }

        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Drop stop words and words shorter than the minimum.
        $stopWords = ['的', '是', '在', '有', '和', '与', '或', '但', '而', '了', '着', '过', 'the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];
        $kept = array_filter($words, function ($word) use ($minLength, $stopWords) {
            return mb_strlen($word) >= $minLength && !in_array($word, $stopWords, true);
        });

        $frequency = array_count_values($kept);
        arsort($frequency);

        // Numeric words become integer array keys; hand them back as strings.
        return array_map('strval', array_slice(array_keys($frequency), 0, $maxCount));
    }

    /**
     * Guesses the dominant script of a text by counting characters per script.
     *
     * Han characters are always counted as Chinese, so kanji in Japanese text
     * raise the Chinese score. When no known script occurs at all the language
     * is reported as 'unknown' with confidence 0.
     *
     * @param string $text
     * @return array Map with language, confidence (top score / text length) and scores (sorted descending).
     */
    public static function detectLanguage($text)
    {
        $patterns = [
            'chinese' => '/[\x{4e00}-\x{9fa5}]/u',
            'english' => '/[a-zA-Z]/',
            'japanese' => '/[\x{3040}-\x{309f}\x{30a0}-\x{30ff}]/u',
            'korean' => '/[\x{ac00}-\x{d7af}]/u',
            'arabic' => '/[\x{0600}-\x{06ff}]/u',
            'russian' => '/[\x{0400}-\x{04ff}]/u'
        ];

        $scores = [];
        foreach ($patterns as $lang => $pattern) {
            // preg_match_all() returns the number of matches, or false on error.
            $scores[$lang] = (int) preg_match_all($pattern, $text);
        }

        arsort($scores);
        reset($scores);
        $topLang = key($scores);
        $topScore = $scores[$topLang];

        $length = mb_strlen($text);

        return [
            'language' => $topScore > 0 ? $topLang : 'unknown',
            // Guard against division by zero for empty input.
            'confidence' => $length > 0 ? $topScore / $length : 0,
            'scores' => $scores
        ];
    }

    /**
     * Extracts email addresses, phone numbers and URLs from free text.
     *
     * Each list is de-duplicated and re-indexed from 0. Phone numbers must not
     * be embedded in a longer digit run. URLs are limited to printable ASCII
     * and lose trailing sentence punctuation, so a URL followed directly by
     * "。" or "," is returned clean.
     *
     * @param string $text
     * @return array Map with the lists 'emails', 'phones' and 'urls'.
     */
    public static function extractContacts($text)
    {
        $contacts = [
            'emails' => [],
            'phones' => [],
            'urls' => []
        ];

        // Emails.
        $emailPattern = '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/';
        if (preg_match_all($emailPattern, $text, $found)) {
            $contacts['emails'] = array_values(array_unique($found[0]));
        }

        // Phones; the digit look-arounds reject matches inside longer numbers.
        $phonePatterns = [
            '/(?<!\d)1[3-9]\d{9}(?!\d)/',           // Chinese mobile number
            '/(?<!\d)\d{3}-\d{4}-\d{4}(?!\d)/',     // hyphenated mobile number
            '/\(\d{3}\)\s*\d{4}-\d{4}(?!\d)/',      // (xxx) xxxx-xxxx
            '/(?<!\d)\d{4}-\d{7}(?!\d)/'            // landline with area code
        ];

        foreach ($phonePatterns as $pattern) {
            if (preg_match_all($pattern, $text, $found)) {
                $contacts['phones'] = array_merge($contacts['phones'], $found[0]);
            }
        }
        $contacts['phones'] = array_values(array_unique($contacts['phones']));

        // URLs: printable ASCII only, starting with a plausible host character.
        $urlPattern = '/https?:\/\/[^\s\/$.?#\x80-\xff][\x21-\x7e]*/';
        if (preg_match_all($urlPattern, $text, $found)) {
            $urls = [];
            foreach ($found[0] as $url) {
                $urls[] = rtrim($url, '.,;:!?');
            }
            $contacts['urls'] = array_values(array_unique($urls));
        }

        return $contacts;
    }

    /**
     * Tidies spacing, punctuation and capitalisation of a text.
     *
     * Line breaks are preserved by the spacing and punctuation steps so that
     * 'remove_empty_lines' has lines to work on. Punctuation directly followed
     * by a digit (3.14, 16:45, 1,000) is left alone. Only ASCII punctuation is
     * handled; a "." inside a word such as a domain name still receives a
     * following space.
     *
     * @param string $text
     * @param array  $options Any of remove_extra_spaces, fix_punctuation,
     *                        capitalize_sentences, remove_empty_lines (all default true).
     * @return string
     */
    public static function formatText($text, $options = [])
    {
        $defaults = [
            'remove_extra_spaces' => true,
            'fix_punctuation' => true,
            'capitalize_sentences' => true,
            'remove_empty_lines' => true
        ];

        $options = array_merge($defaults, $options);

        // [^\S\r\n] is "whitespace except line breaks".
        if ($options['remove_extra_spaces']) {
            $text = preg_replace('/[^\S\r\n]+/', ' ', $text);
        }

        if ($options['fix_punctuation']) {
            // No space before punctuation.
            $text = preg_replace('/[^\S\r\n]+([,.!?;:])/', '$1', $text);
            // Exactly one space after a run of punctuation, unless a digit, a
            // line break or the end of the text follows.
            $text = preg_replace('/([,.!?;:]+)(?![\d\r\n,.!?;:]|$)[^\S\r\n]*/', '$1 ', $text);
        }

        if ($options['capitalize_sentences']) {
            $text = preg_replace_callback('/([.!?]\s+)([a-z])/', function ($m) {
                return $m[1] . strtoupper($m[2]);
            }, $text);

            // First letter of the whole text.
            $text = ucfirst(trim($text));
        }

        if ($options['remove_empty_lines']) {
            $text = preg_replace('/\n\s*\n/', "\n", $text);
        }

        return trim($text);
    }

    /**
     * Computes basic statistics for a text.
     *
     * 'words' is the str_word_count() result plus one per Chinese character,
     * and sentence terminators include the full-width "。!?", so Chinese text
     * no longer reports zero words and sentences. Reading time assumes 200
     * words per minute.
     *
     * @param string $text
     * @return array Map with characters, characters_no_spaces, words, sentences,
     *               paragraphs, lines, reading_time and avg_sentence_length.
     */
    public static function getTextStats($text)
    {
        $plain = strip_tags($text);
        $cjkCharacters = (int) preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $plain);

        $sentences = preg_match_all('/[.!?。!?]+/u', $text);
        if ($sentences === false) {
            // Malformed UTF-8: fall back to the byte-wise ASCII terminators.
            $sentences = (int) preg_match_all('/[.!?]+/', $text);
        }

        $stats = [
            'characters' => mb_strlen($text),
            'characters_no_spaces' => mb_strlen(preg_replace('/\s/', '', $text)),
            'words' => str_word_count($plain) + $cjkCharacters,
            'sentences' => $sentences,
            'paragraphs' => preg_match_all('/\n\s*\n/', $text) + 1,
            'lines' => substr_count($text, "\n") + 1
        ];

        $stats['reading_time'] = ceil($stats['words'] / 200);

        $stats['avg_sentence_length'] = $stats['sentences'] > 0
            ? round($stats['words'] / $stats['sentences'], 1)
            : 0;

        return $stats;
    }
}

// Demo: exercising the TextProcessor helpers.
echo "=== 文本处理器示例 ===\n";

$article = "这是一篇关于PHP正则表达式的文章。正则表达式是处理文本的强大工具,在数据验证、文本解析和内容提取中发挥着重要作用。联系我们:邮箱support@example.com,电话13812345678,网站https://www.example.com。";

// Summary.
echo "文章摘要: " . TextProcessor::extractSummary($article, 50) . "\n";

// Keywords.
echo "关键词: " . implode(', ', TextProcessor::extractKeywords($article)) . "\n";

// Language detection.
$detected = TextProcessor::detectLanguage($article);
echo "检测语言: {$detected['language']} (置信度: " . round($detected['confidence'], 2) . ")\n";

// Contact details.
$found = TextProcessor::extractContacts($article);
echo "联系信息:\n";
echo "- 邮箱: " . implode(', ', $found['emails']) . "\n";
echo "- 电话: " . implode(', ', $found['phones']) . "\n";
echo "- 网址: " . implode(', ', $found['urls']) . "\n";

// Statistics; the display names are looked up in one table built before the loop.
$statLabels = [
    'characters' => '字符数',
    'characters_no_spaces' => '字符数(不含空格)',
    'words' => '单词数',
    'sentences' => '句子数',
    'paragraphs' => '段落数',
    'lines' => '行数',
    'reading_time' => '阅读时间(分钟)',
    'avg_sentence_length' => '平均句长'
];

echo "\n文本统计:\n";
foreach (TextProcessor::getTextStats($article) as $statName => $statValue) {
    $caption = $statLabels[$statName] ?? $statName;

    echo "- $caption: $statValue\n";
}

// Formatting.
$untidy = "这是一段 格式混乱的文本 。有多余的空格 ,标点符号也不规范,还有 。";
$tidy = TextProcessor::formatText($untidy);
echo "\n格式化前: $untidy\n";
echo "格式化后: $tidy\n";
?>

实际应用场景

1. 数据验证和清理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
<?php
// 数据验证和清理应用
echo "=== 数据验证和清理应用 ===\n";

/**
 * Validation and clean-up helpers for user-supplied form data.
 */
class DataValidator
{
    /**
     * Validates a registration form.
     *
     * username, email and password are required; phone and idcard are checked
     * only when present. Patterns end in \z rather than $, because $ also
     * matches before a trailing line break and would accept "john\n". A field
     * counts as missing only when it is absent, null or the empty string, so
     * an optional field such as "0" is still validated.
     *
     * @param array $data Field name => submitted value.
     * @return array      Field name => error message; empty when everything is valid.
     */
    public static function validateRegistration($data)
    {
        // field => [required, pattern, message when missing, message when malformed]
        $rules = [
            'username' => [
                true,
                '/^[a-zA-Z0-9_]{3,20}\z/',
                '用户名不能为空',
                '用户名只能包含字母、数字和下划线,长度3-20位'
            ],
            'email' => [
                true,
                '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\z/',
                '邮箱不能为空',
                '邮箱格式不正确'
            ],
            'password' => [
                true,
                '/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}\z/',
                '密码不能为空',
                '密码必须包含大小写字母、数字和特殊字符,至少8位'
            ],
            'phone' => [false, '/^1[3-9]\d{9}\z/', null, '手机号格式不正确'],
            'idcard' => [false, '/^\d{17}[\dX]\z/', null, '身份证号格式不正确']
        ];

        $errors = [];

        foreach ($rules as $field => $rule) {
            list($required, $pattern, $missingMessage, $formatMessage) = $rule;

            $value = isset($data[$field]) ? $data[$field] : null;

            if ($value === null || $value === '') {
                if ($required) {
                    $errors[$field] = $missingMessage;
                }
                continue;
            }

            // Arrays and objects can never be a valid field value.
            if (!is_scalar($value) || preg_match($pattern, (string) $value) !== 1) {
                $errors[$field] = $formatMessage;
            }
        }

        return $errors;
    }

    /**
     * Cleans every string value of a form: strips tags, collapses whitespace
     * and HTML-escapes the result. Non-string values are copied unchanged and
     * nested arrays are not descended into.
     *
     * The escaped output is meant for HTML display; it is not a substitute for
     * parameterised queries or other context-specific escaping.
     *
     * @param array $data
     * @return array Same keys as $data.
     */
    public static function sanitizeInput($data)
    {
        $sanitized = [];

        foreach ($data as $key => $value) {
            if (!is_string($value)) {
                $sanitized[$key] = $value;
                continue;
            }

            $value = strip_tags($value);
            $value = preg_replace('/\s+/', ' ', trim($value));

            $sanitized[$key] = htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
        }

        return $sanitized;
    }

    /**
     * Validates a payment card number (13-19 digits, Luhn checksum) and
     * formats it in groups of four.
     *
     * @param string $cardNumber Digits, optionally separated by spaces or hyphens.
     * @return array On success: valid=true, original, clean, formatted, type.
     *               On failure: valid=false and a message.
     */
    public static function validateBankCard($cardNumber)
    {
        // Drop spaces and hyphens.
        $digits = preg_replace('/[\s-]/', '', $cardNumber);

        if (!preg_match('/^\d+$/', $digits)) {
            return ['valid' => false, 'message' => '银行卡号只能包含数字'];
        }

        $length = strlen($digits);
        if ($length < 13 || $length > 19) {
            return ['valid' => false, 'message' => '银行卡号长度不正确'];
        }

        if (!self::passesLuhn($digits)) {
            return ['valid' => false, 'message' => '银行卡号校验失败'];
        }

        return [
            'valid' => true,
            'original' => $cardNumber,
            'clean' => $digits,
            'formatted' => implode(' ', str_split($digits, 4)),
            'type' => self::getBankCardType($digits)
        ];
    }

    /**
     * Luhn checksum: walking from the right, every second digit is doubled
     * (subtracting 9 when the result exceeds 9); the total must be a multiple of 10.
     *
     * @param string $digits Digits only.
     * @return bool
     */
    private static function passesLuhn($digits)
    {
        $sum = 0;
        $double = false;

        for ($i = strlen($digits) - 1; $i >= 0; $i--) {
            $digit = (int) $digits[$i];

            if ($double) {
                $digit *= 2;
                if ($digit > 9) {
                    $digit -= 9;
                }
            }

            $sum += $digit;
            $double = !$double;
        }

        return $sum % 10 === 0;
    }

    /**
     * Identifies the card network from the number prefix and length.
     *
     * @param string $cardNumber Digits only.
     * @return string 'visa', 'mastercard', 'amex', 'discover', 'unionpay' or 'unknown'.
     */
    private static function getBankCardType($cardNumber)
    {
        $patterns = [
            'visa' => '/^4\d{12}(\d{3})?$/',
            // 51-55 plus the 2221-2720 range.
            'mastercard' => '/^(?:5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)\d{12}$/',
            'amex' => '/^3[47]\d{13}$/',
            // 6011, 644-649 and 65.
            'discover' => '/^(?:6011|64[4-9]\d|65\d{2})\d{12}$/',
            'unionpay' => '/^62\d{14,17}$/'
        ];

        foreach ($patterns as $type => $pattern) {
            if (preg_match($pattern, $cardNumber)) {
                return $type;
            }
        }

        return 'unknown';
    }
}

// Demo: exercising the DataValidator helpers.
echo "=== 数据验证器示例 ===\n";

// Registration form validation.
$form = [
    'username' => 'john_doe',
    'email' => 'john@example.com',
    'password' => 'StrongPass123!',
    'phone' => '13812345678',
    'idcard' => '12345678901234567X'
];

$problems = DataValidator::validateRegistration($form);

if (!empty($problems)) {
    echo "注册数据验证失败:\n";
    foreach ($problems as $fieldName => $problem) {
        echo "- $fieldName: $problem\n";
    }
} else {
    echo "注册数据验证通过\n";

    // Only valid data is sanitised and shown.
    $sanitizedForm = DataValidator::sanitizeInput($form);
    echo "清理后的数据: " . json_encode($sanitizedForm, JSON_UNESCAPED_UNICODE) . "\n";
}

// Bank card validation.
echo "\n银行卡验证:\n";
$cardNumbers = [
    '4111 1111 1111 1111', // Visa test number
    '5555555555554444',    // MasterCard test number
    '1234567890123456'     // fails the checksum
];

foreach ($cardNumbers as $cardNumber) {
    $outcome = DataValidator::validateBankCard($cardNumber);
    echo "卡号 $cardNumber: ";

    if (!$outcome['valid']) {
        echo "无效 - {$outcome['message']}\n";
        continue;
    }

    echo "有效 ({$outcome['type']}) - 格式化: {$outcome['formatted']}\n";
}
?>

2. 日志分析和监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
<?php
// 日志分析和监控
echo "=== 日志分析和监控 ===\n";

/**
 * Parses web-server and PHP log lines and derives traffic and security reports.
 */
class LogAnalyzer
{
    /**
     * Parses one Apache access-log line (Common Log Format, optionally
     * followed by the quoted referer and user agent of the combined format).
     *
     * @param string $logLine
     * @return array|false Map with ip, timestamp, request, status (int), size (int),
     *                     referer, user_agent; false if the line does not match.
     */
    public static function parseAccessLog($logLine)
    {
        $pattern = '/^(\S+) \S+ \S+ \[([^\]]+)\] "([^"]*)" (\d+) (\S+)(?: "([^"]*)" "([^"]*)")?/';

        if (!preg_match($pattern, $logLine, $m)) {
            return false;
        }

        return [
            'ip' => $m[1],
            'timestamp' => $m[2],
            'request' => $m[3],
            'status' => intval($m[4]),
            // "-" means no body was sent.
            'size' => $m[5] === '-' ? 0 : intval($m[5]),
            'referer' => $m[6] ?? '',
            'user_agent' => $m[7] ?? ''
        ];
    }

    /**
     * Parses one line of a PHP error log.
     *
     * Any amount of whitespace after the level's colon is skipped, so the
     * message carries no leading space.
     *
     * @param string $logLine
     * @return array|false Map with timestamp, level, message, file, line (int); false if not recognised.
     */
    public static function parseErrorLog($logLine)
    {
        $levels = 'Fatal error|Recoverable fatal error|Catchable fatal error|Parse error'
            . '|Warning|Notice|Deprecated|Strict Standards';
        $pattern = '/^\[([^\]]+)\] PHP (' . $levels . '):\s+(.+) in (.+) on line (\d+)/';

        if (!preg_match($pattern, $logLine, $m)) {
            return false;
        }

        return [
            'timestamp' => $m[1],
            'level' => $m[2],
            'message' => $m[3],
            'file' => $m[4],
            'line' => intval($m[5])
        ];
    }

    /**
     * Flags entries that look like attacks or failures.
     *
     * The request is checked both as logged and URL-decoded, so payloads such
     * as "UNION%20SELECT" or "..%2f" are recognised as well. These are simple
     * keyword heuristics intended for triage, not a complete detector.
     *
     * @param array $logEntries Entries as produced by parseAccessLog().
     * @return array            The flagged entries, each with an added 'flags' list.
     */
    public static function detectSuspiciousActivity($logEntries)
    {
        // flag => signature, in reporting order
        $signatures = [
            'SQL注入尝试' => '/union\s+select|drop\s+table|insert\s+into|delete\s+from/i',
            'XSS攻击尝试' => '/<script|javascript:|onload=|onerror=/i',
            '路径遍历尝试' => '/\.\.\/|\.\.\\\\/'
        ];

        $suspicious = [];

        foreach ($logEntries as $entry) {
            $request = isset($entry['request']) ? (string) $entry['request'] : '';
            $decoded = urldecode($request);
            $variants = $decoded === $request ? [$request] : [$request, $decoded];

            $flags = [];

            foreach ($signatures as $flag => $signature) {
                foreach ($variants as $variant) {
                    if (preg_match($signature, $variant)) {
                        $flags[] = $flag;
                        break;
                    }
                }
            }

            // Failed authentication (possible brute force).
            if (isset($entry['status']) && $entry['status'] === 401) {
                $flags[] = '认证失败';
            }

            // Server-side failures.
            if (isset($entry['status']) && in_array($entry['status'], [500, 502, 503, 504])) {
                $flags[] = '服务器错误';
            }

            if (!empty($flags)) {
                $entry['flags'] = $flags;
                $suspicious[] = $entry;
            }
        }

        return $suspicious;
    }

    /**
     * Aggregates request counts per IP, status code, page, user agent and hour.
     *
     * 'unique_ips', 'top_pages' and 'user_agents' are truncated to the busiest
     * 10, 10 and 5 entries; 'unique_ip_count' holds the number of distinct IPs
     * before truncation. The hour is taken in the timestamp's own UTC offset
     * when it has one (otherwise in the default timezone), and entries whose
     * timestamp cannot be parsed are left out of 'hourly_traffic'.
     *
     * @param array $logEntries Entries as produced by parseAccessLog().
     * @return array
     */
    public static function analyzeTraffic($logEntries)
    {
        $stats = [
            'total_requests' => count($logEntries),
            'unique_ips' => [],
            'status_codes' => [],
            'top_pages' => [],
            'user_agents' => [],
            'hourly_traffic' => []
        ];

        foreach ($logEntries as $entry) {
            if (isset($entry['ip'])) {
                self::increment($stats['unique_ips'], $entry['ip']);
            }

            if (isset($entry['status'])) {
                self::increment($stats['status_codes'], $entry['status']);
            }

            if (isset($entry['request'])) {
                // "METHOD /path?query PROTOCOL" -> "/path"
                preg_match('/^\w+ ([^\s\?]+)/', $entry['request'], $m);
                self::increment($stats['top_pages'], $m[1] ?? 'unknown');
            }

            if (isset($entry['user_agent'])) {
                self::increment($stats['user_agents'], $entry['user_agent']);
            }

            if (isset($entry['timestamp'])) {
                // date_create() understands the Common Log Format and keeps
                // the offset written in the log line.
                $moment = date_create((string) $entry['timestamp']);
                if ($moment !== false) {
                    self::increment($stats['hourly_traffic'], $moment->format('H'));
                }
            }
        }

        arsort($stats['unique_ips']);
        arsort($stats['status_codes']);
        arsort($stats['top_pages']);
        arsort($stats['user_agents']);
        ksort($stats['hourly_traffic']);

        // Remember the real number of visitors before the list is cut down.
        $stats['unique_ip_count'] = count($stats['unique_ips']);

        $stats['unique_ips'] = array_slice($stats['unique_ips'], 0, 10, true);
        $stats['top_pages'] = array_slice($stats['top_pages'], 0, 10, true);
        $stats['user_agents'] = array_slice($stats['user_agents'], 0, 5, true);

        return $stats;
    }

    /**
     * Combines the traffic statistics and security findings into one report.
     *
     * 'error_rate' is the percentage of requests answered with any 5xx status.
     *
     * @param array $logEntries Entries as produced by parseAccessLog().
     * @return array Map with summary, traffic_analysis and security_alerts.
     */
    public static function generateReport($logEntries)
    {
        $traffic = self::analyzeTraffic($logEntries);
        $suspicious = self::detectSuspiciousActivity($logEntries);

        $serverErrors = 0;
        foreach ($traffic['status_codes'] as $code => $count) {
            if ($code >= 500 && $code <= 599) {
                $serverErrors += $count;
            }
        }

        $errorRate = ($serverErrors > 0 && $traffic['total_requests'] > 0)
            ? round(($serverErrors / $traffic['total_requests']) * 100, 2)
            : 0;

        return [
            'summary' => [
                'total_requests' => $traffic['total_requests'],
                'unique_visitors' => $traffic['unique_ip_count'],
                'suspicious_activities' => count($suspicious),
                'error_rate' => $errorRate
            ],
            'traffic_analysis' => $traffic,
            'security_alerts' => $suspicious
        ];
    }

    /**
     * Adds one to $counters[$key], creating the counter when needed.
     *
     * @param array      $counters Counter map, modified in place.
     * @param int|string $key
     * @return void
     */
    private static function increment(array &$counters, $key)
    {
        $counters[$key] = ($counters[$key] ?? 0) + 1;
    }
}

// Demo: exercising the LogAnalyzer helpers.
echo "=== 日志分析器示例 ===\n";

// Simulated access-log lines.
$rawLines = [
    '192.168.1.100 - - [10/Jul/2023:16:45:30 +0800] "GET /index.php HTTP/1.1" 200 1234 "https://www.google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"',
    '192.168.1.101 - - [10/Jul/2023:16:46:15 +0800] "POST /login.php HTTP/1.1" 401 567 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"',
    '192.168.1.102 - - [10/Jul/2023:16:47:22 +0800] "GET /admin.php?id=1 UNION SELECT * FROM users HTTP/1.1" 403 0 "-" "sqlmap/1.0"',
    '192.168.1.100 - - [10/Jul/2023:16:48:10 +0800] "GET /contact.php HTTP/1.1" 200 2345 "https://www.example.com" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)"'
];

// Keep only the lines the parser understands.
$entries = [];
foreach ($rawLines as $rawLine) {
    $entry = LogAnalyzer::parseAccessLog($rawLine);
    if (!$entry) {
        continue;
    }
    $entries[] = $entry;
}

echo "解析了 " . count($entries) . " 条日志记录\n";

// Build and print the report.
$analysis = LogAnalyzer::generateReport($entries);
$overview = $analysis['summary'];

echo "\n=== 流量分析报告 ===\n";
echo "总请求数: {$overview['total_requests']}\n";
echo "独立访客: {$overview['unique_visitors']}\n";
echo "可疑活动: {$overview['suspicious_activities']}\n";
echo "错误率: {$overview['error_rate']}%\n";

echo "\n热门页面:\n";
foreach ($analysis['traffic_analysis']['top_pages'] as $path => $hits) {
    echo "- $path: $hits 次访问\n";
}

echo "\n状态码分布:\n";
foreach ($analysis['traffic_analysis']['status_codes'] as $status => $hits) {
    echo "- $status: $hits 次\n";
}

if (!empty($analysis['security_alerts'])) {
    echo "\n=== 安全警报 ===\n";
    foreach ($analysis['security_alerts'] as $incident) {
        echo "IP: {$incident['ip']}, 时间: {$incident['timestamp']}\n";
        echo "请求: {$incident['request']}\n";
        echo "威胁类型: " . implode(', ', $incident['flags']) . "\n\n";
    }
}
?>

性能优化和最佳实践

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
<?php
// 正则表达式性能优化
echo "=== 性能优化和最佳实践 ===\n";

/**
 * Helpers for measuring, reviewing and choosing regular expressions.
 */
class RegexOptimizer
{
    /**
     * Runs preg_match().
     *
     * No cache is kept here: PHP's PCRE extension already caches compiled
     * patterns per process, keyed by the pattern string, so reusing the same
     * pattern string is all that is needed to avoid recompilation.
     *
     * @param string     $pattern
     * @param string     $subject
     * @param array|null $matches Filled with the match groups.
     * @return int|false          Result of preg_match().
     */
    public static function match($pattern, $subject, &$matches = null)
    {
        return preg_match($pattern, $subject, $matches);
    }

    /**
     * Times each pattern against one subject.
     *
     * @param array  $patterns   Name => regular expression.
     * @param string $testString Subject to match.
     * @param int    $iterations Number of preg_match() calls per pattern.
     * @return array Name => ['pattern', 'time' (total ms), 'avg_time' (ms per call)].
     */
    public static function benchmarkPatterns($patterns, $testString, $iterations = 1000)
    {
        $results = [];

        foreach ($patterns as $name => $pattern) {
            $startedAt = microtime(true);

            for ($i = 0; $i < $iterations; $i++) {
                preg_match($pattern, $testString);
            }

            $elapsedMs = (microtime(true) - $startedAt) * 1000;

            $results[$name] = [
                'pattern' => $pattern,
                'time' => round($elapsedMs, 4),
                'avg_time' => round($elapsedMs / $iterations, 6)
            ];
        }

        return $results;
    }

    /**
     * Reviews a pattern and lists possible improvements.
     *
     * The checks look at the expression between the delimiters, so an anchor
     * is recognised in "/^abc$/" and an escaped "\(" is not mistaken for a
     * capturing group. These are textual heuristics, not a regex parser.
     *
     * @param string $pattern Delimited regular expression.
     * @return array Map with pattern, suggestions (list of strings) and complexity.
     */
    public static function analyzePattern($pattern)
    {
        $body = self::patternBody($pattern);
        $suggestions = [];

        // Anchored at the start (^ or \A) or at the end ($, \z, \Z)?
        if (!preg_match('/^(?:\^|\\\\A)|(?<!\\\\)\$$|\\\\[zZ]$/D', $body)) {
            $suggestions[] = '考虑使用锚点(^$)来提高匹配效率';
        }

        // An unescaped "(" that does not start "(?" or "(*" is a capturing group.
        if (preg_match('/(?<!\\\\)\((?![?*])/', $body)) {
            $suggestions[] = '考虑使用非捕获组(?:...)来提高性能';
        }

        // Two greedy wildcards in a row invite heavy backtracking.
        if (preg_match('/\.\*\.\*|\.\+\.\+/', $body)) {
            $suggestions[] = '避免连续使用贪婪量词,可能导致回溯问题';
        }

        // Character classes with a shorter equivalent.
        if (preg_match('/\[a-zA-Z\]/', $body)) {
            $suggestions[] = '可以使用[[:alpha:]]替代[a-zA-Z]';
        }

        if (preg_match('/\[0-9\]/', $body)) {
            $suggestions[] = '可以使用\\d替代[0-9]';
        }

        return [
            'pattern' => $pattern,
            'suggestions' => $suggestions,
            'complexity' => self::calculateComplexity($pattern)
        ];
    }

    /**
     * Returns the expression between a pattern's delimiters.
     *
     * Bracket-style delimiters are honoured. A string that does not start
     * with a usable delimiter, or has no closing delimiter, is returned as is.
     *
     * @param string $pattern
     * @return string
     */
    private static function patternBody($pattern)
    {
        $pattern = ltrim((string) $pattern);
        if ($pattern === '') {
            return '';
        }

        $open = $pattern[0];
        // Letters, digits and the backslash cannot serve as delimiters.
        if (ctype_alnum($open) || $open === '\\') {
            return $pattern;
        }

        $closers = ['(' => ')', '[' => ']', '{' => '}', '<' => '>'];
        $close = $closers[$open] ?? $open;

        $end = strrpos($pattern, $close);
        if ($end === false || $end === 0) {
            return $pattern;
        }

        return substr($pattern, 1, $end - 1);
    }

    /**
     * Rates a pattern as 'low', 'medium' or 'high' complexity by counting
     * quantifier characters, character classes, groups and back-references
     * (the latter weighted double).
     *
     * @param string $pattern
     * @return string
     */
    private static function calculateComplexity($pattern)
    {
        $complexity = 0;

        // Quantifier characters.
        $complexity += preg_match_all('/[*+?{]/', $pattern);

        // Character classes.
        $complexity += preg_match_all('/\[.*?\]/', $pattern);

        // Groups.
        $complexity += preg_match_all('/\(.*?\)/', $pattern);

        // Back-references.
        $complexity += preg_match_all('/\\\\[1-9]/', $pattern) * 2;

        if ($complexity <= 5) {
            return 'low';
        }
        if ($complexity <= 15) {
            return 'medium';
        }

        return 'high';
    }

    // Library of frequently used patterns.
    const COMMON_PATTERNS = [
        'email_simple' => '/^[^@]+@[^@]+\.[^@]+$/',
        'email_strict' => '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/',
        'phone_loose' => '/\d{11}/',
        'phone_strict' => '/^1[3-9]\d{9}$/',
        'url_simple' => '/https?:\/\/\S+/',
        'url_strict' => '/^https?:\/\/[^\s\/$.?#].[^\s]*$/'
    ];

    /**
     * Picks a pattern for a data type at the requested strictness.
     *
     * The strict email pattern is written with lower-case classes only and is
     * therefore applied case-insensitively.
     *
     * @param string $type       'email', 'phone' or 'url'.
     * @param string $strictness 'loose', 'medium' or 'strict'.
     * @return string|null       The pattern, or null for an unknown combination.
     */
    public static function selectOptimalPattern($type, $strictness = 'medium')
    {
        $patterns = [
            'email' => [
                'loose' => self::COMMON_PATTERNS['email_simple'],
                'medium' => self::COMMON_PATTERNS['email_strict'],
                'strict' => '/^(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$/i'
            ],
            'phone' => [
                'loose' => self::COMMON_PATTERNS['phone_loose'],
                'medium' => self::COMMON_PATTERNS['phone_strict'],
                'strict' => '/^(?:\+86)?1[3-9]\d{9}$/'
            ],
            'url' => [
                'loose' => self::COMMON_PATTERNS['url_simple'],
                'medium' => self::COMMON_PATTERNS['url_strict'],
                'strict' => '/^https?:\/\/(?:[-\w.])+(?:\:[0-9]+)?(?:\/(?:[\w\/_.])*(?:\?(?:[\w&=%.])*)?(?:\#(?:[\w.])*)?)?$/'
            ]
        ];

        return $patterns[$type][$strictness] ?? null;
    }
}

// Demo: exercising the RegexOptimizer helpers.
echo "=== 性能优化器示例 ===\n";

// Benchmark three email patterns of increasing strictness on one subject.
$subject = "联系邮箱:test@example.com,电话:13812345678";

$candidates = [
    'simple' => '/\S+@\S+/',
    'medium' => '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
    'complex' => '/^(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$/'
];

$timings = RegexOptimizer::benchmarkPatterns($candidates, $subject, 100);

echo "邮箱模式性能测试结果 (100次迭代):\n";
foreach ($timings as $label => $timing) {
    echo "- $label: {$timing['time']}ms 总时间, {$timing['avg_time']}ms 平均时间\n";
}

// Review each pattern.
echo "\n=== 模式分析 ===\n";
foreach ($candidates as $label => $candidate) {
    $review = RegexOptimizer::analyzePattern($candidate);
    echo "\n$label 模式分析:\n";
    echo "复杂度: {$review['complexity']}\n";

    if (empty($review['suggestions'])) {
        echo "模式已优化\n";
        continue;
    }

    echo "优化建议:\n";
    foreach ($review['suggestions'] as $advice) {
        echo "- $advice\n";
    }
}

// Ask for recommended patterns.
echo "\n=== 最优模式选择 ===\n";
$recommendedEmail = RegexOptimizer::selectOptimalPattern('email', 'medium');
echo "推荐的邮箱验证模式: $recommendedEmail\n";

$recommendedPhone = RegexOptimizer::selectOptimalPattern('phone', 'strict');
echo "推荐的手机号验证模式: $recommendedPhone\n";
?>

总结

通过本文的学习,我们全面掌握了PHP正则表达式的使用技巧:

关键要点

  1. 基础语法: 掌握了preg_match、preg_replace、preg_split等核心函数
  2. 高级模式: 学会了复杂的模式匹配和数据提取技巧
  3. 实际应用: 了解了数据验证、日志分析、文本处理等实用场景
  4. 性能优化: 掌握了正则表达式的性能优化方法

最佳实践

  • 使用锚点提高匹配效率
  • 优先使用非捕获组减少内存消耗
  • 避免过度使用贪婪量词
  • 复用相同的模式字符串:PHP 的 PCRE 扩展会自动缓存已编译的正则表达式,无需手动缓存
  • 根据需求选择合适的严格程度

安全建议

  • 验证用户输入防止注入攻击
  • 使用白名单而不是黑名单
  • 对正则表达式进行性能测试
  • 避免复杂的回溯模式
  • 定期更新验证规则

掌握这些正则表达式技巧,将大大提高你处理文本数据的能力和效率。记住,正则表达式是一把双刃剑,合理使用能事半功倍,过度使用可能影响性能。

本站由 提供部署服务