PHP正则表达式实战指南：模式匹配与文本处理

正则表达式是处理文本的强大工具，在数据验证、文本解析和内容提取中发挥着重要作用。作为PHP开发者，掌握正则表达式能大大提高我们处理字符串的效率。本文将分享一些实用的正则表达式技巧和最佳实践。

正则表达式基础

1. 基本语法和函数

<?php
// Regex basics demo: print the banner that opens this section's output.
echo "=== 正则表达式基础 ===\n";

/**
 * Walks through the four core PCRE functions (preg_match, preg_match_all,
 * preg_replace, preg_split) and echoes what each one produces.
 *
 * Takes no arguments and returns nothing; all results go to standard output.
 */
function demonstrateBasicFunctions() {
    // Unanchored on purpose: both patterns are used to search inside longer text.
    $emailRegex = '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/';
    $mobileRegex = '/1[3-9]\d{9}/';

    // preg_match: reports only the first occurrence.
    echo "=== preg_match 示例 ===\n";
    $sentence = "Hello World! My email is john@example.com and phone is 13812345678";

    $firstEmailFound = preg_match($emailRegex, $sentence, $hit);
    if ($firstEmailFound) {
        echo "找到邮箱: {$hit[0]}\n";
    }

    $firstMobileFound = preg_match($mobileRegex, $sentence, $hit);
    if ($firstMobileFound) {
        echo "找到手机号: {$hit[0]}\n";
    }

    // preg_match_all: collects every occurrence; its return value is the
    // number of full matches, i.e. the size of $hit[0].
    echo "\n=== preg_match_all 示例 ===\n";
    $addressLine = "联系方式: john@example.com, jane@test.com, admin@site.org";
    $total = preg_match_all($emailRegex, $addressLine, $hit);
    if ($total) {
        echo "找到 {$total} 个邮箱:\n";
        foreach ($hit[0] as $address) {
            echo "- {$address}\n";
        }
    }

    // preg_replace: mask sensitive parts using back-references.
    echo "\n=== preg_replace 示例 ===\n";
    $private = "我的手机号是13812345678，邮箱是john@example.com";

    // Keep the first three and last four digits, star out the middle four.
    $phoneMasked = preg_replace('/(1[3-9]\d)(\d{4})(\d{4})/', '$1****$3', $private);
    echo "隐藏手机号: {$phoneMasked}\n";

    // Keep at most three leading characters of the mailbox name.
    $fullyMasked = preg_replace('/([a-zA-Z0-9._%+-]{1,3})[a-zA-Z0-9._%+-]*@/', '$1***@', $phoneMasked);
    echo "隐藏邮箱: {$fullyMasked}\n";

    // preg_split: break a string on a pattern instead of a fixed delimiter.
    echo "\n=== preg_split 示例 ===\n";
    $fields = preg_split('/[;,]/', "张三,25,北京;李四,30,上海;王五,28,广州");
    echo "分割结果: ", json_encode($fields, JSON_UNESCAPED_UNICODE), "\n";

    // Split on "-", "[" or "]" when preceded by whitespace.
    $logLine = "2023-07-10 16:45:30 [INFO] User login successful - IP: 192.168.1.100";
    $segments = preg_split('/\s+[-\[\]]\s*/', $logLine);
    echo "日志分割: ", json_encode($segments, JSON_UNESCAPED_UNICODE), "\n";
}

// Run the walkthrough immediately so this section prints its output when executed.
demonstrateBasicFunctions();

/**
 * Static convenience wrappers around the preg_* functions, plus a registry of
 * named validation patterns.
 *
 * validate() and extract() accept either a key of PATTERNS (e.g. 'email') or
 * a complete PCRE pattern string including delimiters. The other methods take
 * a complete PCRE pattern only.
 */
class RegexHelper {
    /**
     * Named whole-string validation patterns.
     *
     * Every pattern is anchored and carries the D modifier so that the
     * end anchor matches only at the very end of the subject. Without D a
     * value followed by a newline ("13812345678\n") would still validate.
     */
    const PATTERNS = [
        'email' => '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/D',
        'phone' => '/^1[3-9]\d{9}$/D',
        'idcard' => '/^\d{17}[\dX]$/D',
        'url' => '/^https?:\/\/[^\s\/$.?#].[^\s]*$/D',
        'ip' => '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/D',
        'date' => '/^\d{4}-\d{2}-\d{2}$/D',
        'time' => '/^\d{2}:\d{2}:\d{2}$/D',
        'chinese' => '/^[\x{4e00}-\x{9fa5}]+$/uD',
        'password' => '/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/D'
    ];

    /**
     * Tests $input against a named or raw pattern.
     *
     * @param string $input   Value to test.
     * @param string $pattern Key of PATTERNS or a full PCRE pattern.
     * @return int|false 1 on match, 0 on no match, false on a PCRE error
     *                   (the raw preg_match() result).
     */
    public static function validate($input, $pattern) {
        if (isset(self::PATTERNS[$pattern])) {
            return preg_match(self::PATTERNS[$pattern], $input);
        }

        return preg_match($pattern, $input);
    }

    /**
     * Finds the first (or every) occurrence of a pattern inside $text.
     *
     * Named patterns are whole-string validators, so their anchors are
     * stripped before searching; otherwise nothing embedded in a longer text
     * could ever be found. Raw patterns are used exactly as given.
     *
     * @param string $text    Text to search.
     * @param string $pattern Key of PATTERNS or a full PCRE pattern.
     * @param bool   $all     True to return every match instead of the first.
     * @return string|array|null First match (null when none), or a list of
     *                           all matches when $all is true.
     */
    public static function extract($text, $pattern, $all = false) {
        if (isset(self::PATTERNS[$pattern])) {
            $pattern = self::unanchored(self::PATTERNS[$pattern]);
        }

        if ($all) {
            preg_match_all($pattern, $text, $matches);
            return $matches[0] ?? [];
        }

        preg_match($pattern, $text, $matches);
        return $matches[0] ?? null;
    }

    /**
     * Removes a leading ^ and an unescaped trailing $ from a delimited
     * pattern, keeping its delimiters and modifiers.
     *
     * @param string $pattern Delimited PCRE pattern such as '/^\d+$/D'.
     * @return string The same pattern without the outer anchors.
     */
    private static function unanchored($pattern) {
        $delimiter = $pattern[0];
        $closing = strrpos($pattern, $delimiter);
        $body = (string) substr($pattern, 1, $closing - 1);
        $modifiers = (string) substr($pattern, $closing + 1);

        if ($body !== '' && $body[0] === '^') {
            $body = (string) substr($body, 1);
        }
        // Only drop a real anchor, not an escaped literal dollar sign.
        if (substr($body, -1) === '$' && substr($body, -2, 1) !== '\\') {
            $body = (string) substr($body, 0, -1);
        }

        return $delimiter . $body . $delimiter . $modifiers;
    }

    /**
     * Replaces every match of $pattern in $text (removes it by default).
     *
     * @param string $text
     * @param string $pattern     Full PCRE pattern.
     * @param string $replacement Replacement string; may use $1-style references.
     * @return string|null Result of preg_replace() (null on a PCRE error).
     */
    public static function clean($text, $pattern, $replacement = '') {
        return preg_replace($pattern, $replacement, $text);
    }

    /**
     * Splits $text on $pattern and drops empty pieces.
     *
     * @param string $text
     * @param string $pattern Full PCRE pattern.
     * @return array|false List of non-empty pieces (false on a PCRE error).
     */
    public static function split($text, $pattern) {
        return preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Wraps every match of $pattern in an HTML element.
     *
     * Neither $text nor the matches are HTML-escaped; callers rendering
     * untrusted text must escape it themselves.
     *
     * @param string $text
     * @param string $pattern      Full PCRE pattern.
     * @param string $highlightTag Element name used for the wrapper.
     * @return string|null Result of preg_replace() (null on a PCRE error).
     */
    public static function highlight($text, $pattern, $highlightTag = 'mark') {
        // "$0" is not a PHP variable, so it reaches preg_replace() literally
        // and stands for the whole match.
        return preg_replace($pattern, "<$highlightTag>$0</$highlightTag>", $text);
    }
}

// Demo: exercise RegexHelper's validation, extraction and cleaning helpers.
echo "\n=== 正则工具类示例 ===\n";

// Each key doubles as the name of the RegexHelper pattern to validate against.
$testData = [
    'email' => 'test@example.com',
    'phone' => '13812345678',
    'url' => 'https://www.example.com',
    'chinese' => '中文测试'
];

foreach ($testData as $type => $value) {
    $isValid = RegexHelper::validate($value, $type);
    $verdict = $isValid ? '通过' : '失败';
    echo "{$type} 验证 '{$value}': {$verdict}\n";
}

// Pull the first phone number, e-mail address and URL out of free text.
$contactText = "请联系我们：电话13812345678，邮箱support@example.com，网站https://www.example.com";

echo "\n提取信息:\n";
foreach (['手机号' => 'phone', '邮箱' => 'email', '网址' => 'url'] as $caption => $patternName) {
    $hit = RegexHelper::extract($contactText, $patternName);
    echo $caption . ": " . ($hit ?: '未找到') . "\n";
}

// Remove everything that looks like a tag, then keep only the digits.
$dirtyText = "这是一段包含<script>alert('xss')</script>的文本和一些数字123456";
$cleanText = RegexHelper::clean($dirtyText, '/<[^>]*>/', '');
echo "\n清理HTML标签: {$cleanText}\n";

$numbersOnly = RegexHelper::clean($dirtyText, '/[^\d]/', '');
echo "只保留数字: {$numbersOnly}\n";
?>

2. 高级模式匹配

<?php
// Advanced pattern matching: print the banner that opens this section's output.
echo "=== 高级模式匹配 ===\n";

/**
 * Regex-based parsers and validators: URLs, e-mail addresses, HTML tag
 * attributes, password strength, dates in text and 11-digit mobile numbers.
 */
class AdvancedRegex {

    /**
     * Splits an http(s) URL into its components.
     *
     * The host stops at the first "/", "?" or "#", and the path stops at "?"
     * or "#", so "https://example.com?x=1" yields host "example.com" rather
     * than swallowing the query string.
     *
     * @param string $url
     * @return array|false Keys full_url, protocol, domain, path ('/' when
     *                     absent), query (with leading '?', '' when absent)
     *                     and fragment (with leading '#', '' when absent);
     *                     false when $url is not an http(s) URL.
     */
    public static function parseUrl($url) {
        $pattern = '/^(https?):\/\/([^\/\s?#]+)(\/[^\s?#]*)?(\?[^\s#]*)?(#\S*)?$/D';

        if (!preg_match($pattern, $url, $matches)) {
            return false;
        }

        // An optional group that did not participate is '' when a later
        // group matched and missing when it is trailing; treat both as absent.
        $path = (isset($matches[3]) && $matches[3] !== '') ? $matches[3] : '/';

        return [
            'full_url' => $matches[0],
            'protocol' => $matches[1],
            'domain' => $matches[2],
            'path' => $path,
            'query' => $matches[4] ?? '',
            'fragment' => $matches[5] ?? ''
        ];
    }

    /**
     * Splits an e-mail address into mailbox, domain and top-level domain.
     *
     * @param string $email
     * @return array|false Keys full_email, username, domain (without the
     *                     TLD) and tld; false when the format is not accepted.
     */
    public static function parseEmail($email) {
        // D: the end anchor must not match before a trailing newline.
        $pattern = '/^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$/D';

        if (!preg_match($pattern, $email, $matches)) {
            return false;
        }

        return [
            'full_email' => $matches[0],
            'username' => $matches[1],
            'domain' => $matches[2],
            'tld' => $matches[3]
        ];
    }

    /**
     * Collects the attributes of every <$tag ...> occurrence in $html.
     *
     * This is a lightweight scanner, not an HTML parser: a ">" inside an
     * attribute value ends the tag early, and a tag without any attribute is
     * not reported.
     *
     * @param string $html
     * @param string $tag Element name, matched case-insensitively.
     * @return array One name => value map per matching tag, in document order.
     */
    public static function extractHtmlAttributes($html, $tag) {
        // Quote the tag so regex metacharacters in it cannot alter the pattern.
        $pattern = '/<' . preg_quote($tag, '/') . '\s+([^>]*)>/i';
        $results = [];

        if (!preg_match_all($pattern, $html, $matches)) {
            return $results;
        }

        // Branch reset (?|...) puts the value in group 2 whichever quoting
        // style matched, so an empty value ("") is still a set group and a
        // double-quoted value may contain a single quote (and vice versa).
        $attrPattern = '/([\w:.-]+)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]*))/';

        foreach ($matches[1] as $attributeString) {
            $attributes = [];

            if (preg_match_all($attrPattern, $attributeString, $attrMatches, PREG_SET_ORDER)) {
                foreach ($attrMatches as $attrMatch) {
                    $attributes[$attrMatch[1]] = $attrMatch[2];
                }
            }

            $results[] = $attributes;
        }

        return $results;
    }

    /**
     * Scores a password against six independent checks.
     *
     * @param string $password
     * @return array Keys score (0-6), strength ('weak' | 'medium' | 'strong'),
     *               checks (per-rule results) and suggestions (one message
     *               per failed rule).
     */
    public static function validatePasswordStrength($password) {
        $checks = [
            // Count characters, not bytes: three CJK characters are nine
            // bytes but still a three-character password.
            'length' => mb_strlen($password) >= 8,
            'lowercase' => preg_match('/[a-z]/', $password),
            'uppercase' => preg_match('/[A-Z]/', $password),
            'digit' => preg_match('/\d/', $password),
            'special' => preg_match('/[@$!%*?&]/', $password),
            'no_common' => !preg_match('/^(password|123456|qwerty|admin)$/i', $password)
        ];

        // Each passed check contributes 1 (true and 1 both sum as 1).
        $score = array_sum($checks);
        $strength = 'weak';

        if ($score >= 5) {
            $strength = 'strong';
        } elseif ($score >= 3) {
            $strength = 'medium';
        }

        return [
            'score' => $score,
            'strength' => $strength,
            'checks' => $checks,
            'suggestions' => self::getPasswordSuggestions($checks)
        ];
    }

    /**
     * Turns failed checks into user-facing improvement hints.
     *
     * @param array $checks The 'checks' map built by validatePasswordStrength().
     * @return array List of hint strings, empty when every check passed.
     */
    private static function getPasswordSuggestions($checks) {
        $hints = [
            'length' => '密码长度至少8位',
            'lowercase' => '添加小写字母',
            'uppercase' => '添加大写字母',
            'digit' => '添加数字',
            'special' => '添加特殊字符(@$!%*?&)',
            'no_common' => '避免使用常见密码'
        ];

        $suggestions = [];
        foreach ($hints as $check => $hint) {
            if (!$checks[$check]) {
                $suggestions[] = $hint;
            }
        }

        return $suggestions;
    }

    /**
     * Finds dates written in four common notations.
     *
     * Results are grouped by notation (in the order of the pattern table),
     * not by position in the text. Only the shape is checked, so
     * "2023-99-99" is reported too.
     *
     * @param string $text
     * @return array List of ['date' => matched text, 'format' => notation key].
     */
    public static function extractDates($text) {
        // Boundaries are ASCII-only lookarounds instead of \b. With the u
        // modifier \b is Unicode-aware, so a date glued to CJK text
        // ("是2023年12月31日") has no word boundary in front of it.
        $start = '(?<![0-9A-Za-z_])';
        $end = '(?![0-9A-Za-z_])';

        $patterns = [
            'yyyy-mm-dd' => '/' . $start . '\d{4}-\d{2}-\d{2}' . $end . '/',
            'dd/mm/yyyy' => '/' . $start . '\d{2}\/\d{2}\/\d{4}' . $end . '/',
            'mm-dd-yyyy' => '/' . $start . '\d{2}-\d{2}-\d{4}' . $end . '/',
            'chinese' => '/' . $start . '\d{4}年\d{1,2}月\d{1,2}日' . $end . '/u'
        ];

        $dates = [];

        foreach ($patterns as $format => $pattern) {
            if (preg_match_all($pattern, $text, $matches)) {
                foreach ($matches[0] as $match) {
                    $dates[] = [
                        'date' => $match,
                        'format' => $format
                    ];
                }
            }
        }

        return $dates;
    }

    /**
     * Normalises an 11-digit mobile number (1[3-9]xxxxxxxxx) and formats it.
     *
     * @param string $phone  Number in any punctuation; non-digits are ignored.
     * @param string $format 'xxx-xxxx-xxxx', 'xxx xxxx xxxx' or
     *                       '(xxx) xxxx-xxxx'; any other value returns the
     *                       bare digits.
     * @return string|false Formatted number, or false when the digits do not
     *                      form a valid mobile number.
     */
    public static function formatPhoneNumber($phone, $format = 'xxx-xxxx-xxxx') {
        $cleanPhone = preg_replace('/\D/', '', $phone);

        if (strlen($cleanPhone) !== 11 || !preg_match('/^1[3-9]/', $cleanPhone)) {
            return false;
        }

        // Format name => replacement template for the 3-4-4 digit groups.
        $templates = [
            'xxx-xxxx-xxxx' => '$1-$2-$3',
            'xxx xxxx xxxx' => '$1 $2 $3',
            '(xxx) xxxx-xxxx' => '($1) $2-$3'
        ];

        if (!is_string($format) || !isset($templates[$format])) {
            return $cleanPhone;
        }

        return preg_replace('/^(\d{3})(\d{4})(\d{4})$/', $templates[$format], $cleanPhone);
    }
}

// Demo: run each AdvancedRegex helper against a sample input and print the result.
echo "=== 高级正则功能示例 ===\n";

// URL parsing: list every component on its own line.
$testUrl = "https://www.example.com/path/to/page?param=value#section";
$urlParts = AdvancedRegex::parseUrl($testUrl);
if ($urlParts) {
    echo "URL解析结果:\n";
    foreach ($urlParts as $key => $value) {
        echo "- {$key}: {$value}\n";
    }
}

// E-mail parsing: same listing for the address components.
echo "\n邮箱解析:\n";
$testEmail = "john.doe@example.com";
$emailParts = AdvancedRegex::parseEmail($testEmail);
if ($emailParts) {
    foreach ($emailParts as $key => $value) {
        echo "- {$key}: {$value}\n";
    }
}

// HTML attribute extraction: one JSON object per <img> tag found.
echo "\nHTML属性提取:\n";
$html = '<img   src="image.jpg"  alt="测试图片" width="300" height="200">';
$attributes = AdvancedRegex::extractHtmlAttributes($html, 'img');
foreach ($attributes as $attrs) {
    echo "图片属性: ", json_encode($attrs, JSON_UNESCAPED_UNICODE), "\n";
}

// Password strength: rating and score, then hints when there are any.
echo "\n密码强度验证:\n";
$passwords = ['123456', 'Password1', 'StrongPass123!'];
foreach ($passwords as $pwd) {
    $strength = AdvancedRegex::validatePasswordStrength($pwd);
    echo "密码 '{$pwd}': " . $strength['strength'] . " (得分: " . $strength['score'] . "/6)\n";
    if (empty($strength['suggestions'])) {
        continue;
    }
    echo "  建议: " . implode(', ', $strength['suggestions']) . "\n";
}

// Date extraction: every date found together with its notation.
echo "\n日期提取:\n";
$dateText = "项目开始于2023-07-10，截止日期是2023年12月31日，会议安排在15/08/2023。";
$dates = AdvancedRegex::extractDates($dateText);
foreach ($dates as $date) {
    echo "找到日期: " . $date['date'] . " (格式: " . $date['format'] . ")\n";
}

// Phone formatting: the same number written three ways.
echo "\n电话号码格式化:\n";
$phones = ['13812345678', '138-1234-5678', '(138) 1234-5678'];
foreach ($phones as $phone) {
    $formatted = AdvancedRegex::formatPhoneNumber($phone, 'xxx-xxxx-xxxx');
    $shown = $formatted ?: '无效号码';
    echo "原号码: {$phone} -> 格式化: {$shown}\n";
}
?>

3. 文本处理和内容提取

<?php
// Text processing and content extraction: print the banner that opens this section's output.
echo "=== 文本处理和内容提取 ===\n";

/**
 * Text utilities: summaries, keyword frequency, script-based language
 * guessing, contact extraction, whitespace/punctuation clean-up and basic
 * statistics.
 */
class TextProcessor {

    /**
     * Produces a plain-text excerpt of at most $maxLength characters.
     *
     * HTML tags are stripped and whitespace runs collapsed first. A text that
     * is cut is shortened back to the last space (when there is one) and
     * gets "..." appended.
     *
     * @param string $content   Text or HTML.
     * @param int    $maxLength Maximum excerpt length in characters.
     * @return string
     */
    public static function extractSummary($content, $maxLength = 200) {
        $text = strip_tags($content);
        $text = preg_replace('/\s+/', ' ', trim($text));

        if (mb_strlen($text) <= $maxLength) {
            return $text;
        }

        $summary = mb_substr($text, 0, $maxLength);
        $lastSpace = mb_strrpos($summary, ' ');

        // Text without spaces (e.g. Chinese) is cut exactly at $maxLength.
        if ($lastSpace !== false) {
            $summary = mb_substr($summary, 0, $lastSpace);
        }

        return $summary . '...';
    }

    /**
     * Returns the most frequent words of $text.
     *
     * Words are whitespace-separated tokens after punctuation removal, so
     * unsegmented text (e.g. a Chinese sentence) counts as a single token.
     *
     * @param string $text
     * @param int    $minLength Minimum token length in characters.
     * @param int    $maxCount  Maximum number of keywords returned.
     * @return array List of keyword strings, most frequent first.
     */
    public static function extractKeywords($text, $minLength = 3, $maxCount = 10) {
        // Multibyte-aware lowercasing; strtolower() works on single bytes.
        $text = mb_strtolower($text);
        $text = preg_replace('/[^\w\s\x{4e00}-\x{9fa5}]/u', ' ', $text);

        // preg_replace() yields null when the input is not valid UTF-8.
        if (!is_string($text)) {
            return [];
        }

        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        $stopWords = ['的', '是', '在', '有', '和', '与', '或', '但', '而', '了', '着', '过', 'the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];
        $filteredWords = array_filter($words, function ($word) use ($minLength, $stopWords) {
            return mb_strlen($word) >= $minLength && !in_array($word, $stopWords, true);
        });

        $wordCount = array_count_values($filteredWords);
        arsort($wordCount);

        // Numeric tokens become integer array keys; hand them back as strings.
        return array_map('strval', array_slice(array_keys($wordCount), 0, $maxCount));
    }

    /**
     * Guesses the dominant language from the scripts used in $text.
     *
     * The guess is per script, not per language: Han characters always
     * count as 'chinese' and Latin letters as 'english'.
     *
     * @param string $text
     * @return array Keys language (best-scoring key, or 'unknown' when no
     *               character matched any script), confidence (share of
     *               characters in that script, 0.0 to 1.0) and scores
     *               (match count per script).
     */
    public static function detectLanguage($text) {
        $patterns = [
            'chinese' => '/[\x{4e00}-\x{9fa5}]/u',
            'english' => '/[a-zA-Z]/',
            'japanese' => '/[\x{3040}-\x{309f}\x{30a0}-\x{30ff}]/u',
            'korean' => '/[\x{ac00}-\x{d7af}]/u',
            'arabic' => '/[\x{0600}-\x{06ff}]/u',
            'russian' => '/[\x{0400}-\x{04ff}]/u'
        ];

        $scores = [];

        foreach ($patterns as $lang => $pattern) {
            // false (PCRE error, e.g. invalid UTF-8) counts as no match.
            $count = preg_match_all($pattern, $text);
            $scores[$lang] = $count === false ? 0 : $count;
        }

        arsort($scores);
        $topLang = key($scores);
        $topScore = $scores[$topLang];
        $length = mb_strlen($text);

        // No matching character at all (including empty text): report that
        // instead of the first table entry, and never divide by zero.
        if ($topScore === 0 || $length === 0) {
            return [
                'language' => 'unknown',
                'confidence' => 0.0,
                'scores' => $scores
            ];
        }

        return [
            'language' => $topLang,
            'confidence' => $topScore / $length,
            'scores' => $scores
        ];
    }

    /**
     * Extracts e-mail addresses, phone numbers and URLs from free text.
     *
     * @param string $text
     * @return array Keys emails, phones and urls, each a de-duplicated,
     *               sequentially indexed list of strings.
     */
    public static function extractContacts($text) {
        $contacts = [
            'emails' => [],
            'phones' => [],
            'urls' => []
        ];

        $emailPattern = '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/';
        if (preg_match_all($emailPattern, $text, $matches)) {
            $contacts['emails'] = array_values(array_unique($matches[0]));
        }

        // Digit lookarounds keep a pattern from matching inside a longer
        // number such as an ID or order number.
        $phonePatterns = [
            '/(?<!\d)1[3-9]\d{9}(?!\d)/',             // 11-digit mobile number
            '/(?<!\d)\d{3}-\d{4}-\d{4}(?!\d)/',       // hyphenated mobile number
            '/\(\d{3}\)\s*\d{4}-\d{4}(?!\d)/',        // (xxx) xxxx-xxxx
            '/(?<!\d)\d{4}-\d{7}(?!\d)/'              // area code + landline
        ];

        foreach ($phonePatterns as $pattern) {
            if (preg_match_all($pattern, $text, $matches)) {
                $contacts['phones'] = array_merge($contacts['phones'], $matches[0]);
            }
        }
        $contacts['phones'] = array_values(array_unique($contacts['phones']));

        // A URL runs over printable ASCII only, so it stops at CJK text or
        // full-width punctuation directly behind it; sentence punctuation
        // left at the end is trimmed.
        $urlPattern = '/https?:\/\/(?![\/$.?#])[\x21-\x7E]{2,}/';
        if (preg_match_all($urlPattern, $text, $matches)) {
            $urls = array_map(function ($url) {
                return rtrim($url, '.,;:!?');
            }, $matches[0]);
            $contacts['urls'] = array_values(array_unique($urls));
        }

        return $contacts;
    }

    /**
     * Tidies spacing, ASCII punctuation and sentence capitalisation.
     *
     * Line breaks are preserved: only spaces and tabs are collapsed, so the
     * remove_empty_lines option has lines to work on.
     *
     * @param string $text
     * @param array  $options Overrides for the boolean switches
     *                        remove_extra_spaces, fix_punctuation,
     *                        capitalize_sentences and remove_empty_lines
     *                        (all default to true).
     * @return string
     */
    public static function formatText($text, $options = []) {
        $defaults = [
            'remove_extra_spaces' => true,
            'fix_punctuation' => true,
            'capitalize_sentences' => true,
            'remove_empty_lines' => true
        ];

        $options = array_merge($defaults, $options);

        // [^\S\r\n] = any whitespace except CR and LF.
        if ($options['remove_extra_spaces']) {
            $text = preg_replace('/[^\S\r\n]+/', ' ', $text);
        }

        if ($options['fix_punctuation']) {
            // No blank before punctuation ...
            $text = preg_replace('/[^\S\r\n]+([,.!?;:])/', '$1', $text);
            // ... and exactly one after it when more text follows on the
            // same line. Punctuation between two digits (3.14, 16:45, 1,000)
            // is left untouched.
            $text = preg_replace('/([,.!?;:]+)(?!(?<=\d[,.:])\d)[^\S\r\n]*(?=\S)/', '$1 ', $text);
        }

        if ($options['capitalize_sentences']) {
            $text = preg_replace_callback('/([.!?]\s+)([a-z])/', function ($matches) {
                return $matches[1] . strtoupper($matches[2]);
            }, $text);

            $text = ucfirst(trim($text));
        }

        if ($options['remove_empty_lines']) {
            $text = preg_replace('/\n\s*\n/', "\n", $text);
        }

        return trim($text);
    }

    /**
     * Computes basic size statistics for $text.
     *
     * Words are Latin words (str_word_count) plus one per Han character, and
     * sentences end at ASCII or full-width terminators, so Chinese text is
     * measured as well.
     *
     * @param string $text
     * @return array Keys characters, characters_no_spaces, words, sentences,
     *               paragraphs, lines, reading_time (minutes at 200 words
     *               per minute, rounded up) and avg_sentence_length.
     */
    public static function getTextStats($text) {
        $plain = strip_tags($text);
        $hanCharacters = (int) preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $plain);

        $sentences = preg_match_all('/[.!?。！？]+/u', $text);
        if ($sentences === false) {
            // Not valid UTF-8: fall back to the byte-wise ASCII terminators.
            $sentences = (int) preg_match_all('/[.!?]+/', $text);
        }

        $stats = [
            'characters' => mb_strlen($text),
            'characters_no_spaces' => mb_strlen(preg_replace('/\s/', '', $text)),
            'words' => str_word_count($plain) + $hanCharacters,
            'sentences' => $sentences,
            'paragraphs' => preg_match_all('/\n\s*\n/', $text) + 1,
            'lines' => substr_count($text, "\n") + 1
        ];

        $stats['reading_time'] = ceil($stats['words'] / 200);

        $stats['avg_sentence_length'] = $stats['sentences'] > 0 ?
            round($stats['words'] / $stats['sentences'], 1) : 0;

        return $stats;
    }
}

// Demo: feed one sample paragraph through each TextProcessor helper.
echo "=== 文本处理器示例 ===\n";

$sampleText = "这是一篇关于PHP正则表达式的文章。正则表达式是处理文本的强大工具，在数据验证、文本解析和内容提取中发挥着重要作用。联系我们：邮箱support@example.com，电话13812345678，网站https://www.example.com。";

// Summary limited to 50 characters.
$summary = TextProcessor::extractSummary($sampleText, 50);
echo "文章摘要: {$summary}\n";

// Keywords, comma-separated.
$keywords = TextProcessor::extractKeywords($sampleText);
echo "关键词: ", implode(', ', $keywords), "\n";

// Language guess with its confidence rounded to two decimals.
$langInfo = TextProcessor::detectLanguage($sampleText);
echo "检测语言: " . $langInfo['language'] . " (置信度: " . round($langInfo['confidence'], 2) . ")\n";

// Contact details, one line per kind.
$contacts = TextProcessor::extractContacts($sampleText);
echo "联系信息:\n";
echo "- 邮箱: ", implode(', ', $contacts['emails']), "\n";
echo "- 电话: ", implode(', ', $contacts['phones']), "\n";
echo "- 网址: ", implode(', ', $contacts['urls']), "\n";

// Statistics, printed with a display label per key (the key itself when
// no label is defined).
$stats = TextProcessor::getTextStats($sampleText);
echo "\n文本统计:\n";
$statLabels = [
    'characters' => '字符数',
    'characters_no_spaces' => '字符数(不含空格)',
    'words' => '单词数',
    'sentences' => '句子数',
    'paragraphs' => '段落数',
    'lines' => '行数',
    'reading_time' => '阅读时间(分钟)',
    'avg_sentence_length' => '平均句长'
];
foreach ($stats as $key => $value) {
    $label = $statLabels[$key] ?? $key;

    echo "- {$label}: {$value}\n";
}

// Before/after view of the formatter on badly spaced text.
$messyText = "这是一段   格式混乱的文本  。有多余的空格   ，标点符号也不规范,还有   。";
$formattedText = TextProcessor::formatText($messyText);
echo "\n格式化前: {$messyText}\n";
echo "格式化后: {$formattedText}\n";
?>

实际应用场景

1. 数据验证和清理

<?php
// Data validation and sanitising: print the banner that opens this section's output.
echo "=== 数据验证和清理应用 ===\n";

/**
 * Validation and clean-up helpers for user-submitted data: registration
 * forms, generic string sanitising and bank card numbers.
 */
class DataValidator {

    /**
     * Validates a registration form.
     *
     * username, email and password are required; phone and idcard are
     * checked only when present and non-empty. All patterns use the D
     * modifier so a value with a trailing newline ("admin\n") is rejected
     * rather than silently accepted by the end anchor.
     *
     * @param array $data Form fields keyed by username, email, password,
     *                    phone and idcard.
     * @return array Field name => error message; empty when everything is valid.
     */
    public static function validateRegistration($data) {
        $errors = [];

        if (empty($data['username'])) {
            $errors['username'] = '用户名不能为空';
        } elseif (!preg_match('/^[a-zA-Z0-9_]{3,20}$/D', $data['username'])) {
            $errors['username'] = '用户名只能包含字母、数字和下划线，长度3-20位';
        }

        if (empty($data['email'])) {
            $errors['email'] = '邮箱不能为空';
        } elseif (!preg_match('/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/D', $data['email'])) {
            $errors['email'] = '邮箱格式不正确';
        }

        // Lookaheads require one character of each class; the final class
        // restricts the password to exactly those characters.
        if (empty($data['password'])) {
            $errors['password'] = '密码不能为空';
        } elseif (!preg_match('/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/D', $data['password'])) {
            $errors['password'] = '密码必须包含大小写字母、数字和特殊字符，至少8位';
        }

        if (!empty($data['phone']) && !preg_match('/^1[3-9]\d{9}$/D', $data['phone'])) {
            $errors['phone'] = '手机号格式不正确';
        }

        // Shape check only (17 digits plus a digit or X); the check digit
        // itself is not verified.
        if (!empty($data['idcard']) && !preg_match('/^\d{17}[\dX]$/D', $data['idcard'])) {
            $errors['idcard'] = '身份证号格式不正确';
        }

        return $errors;
    }

    /**
     * Cleans every top-level string value for display in HTML.
     *
     * Strings have tags stripped, whitespace runs collapsed and trimmed, and
     * HTML special characters escaped. Non-string values (including nested
     * arrays) are passed through unchanged.
     *
     * @param array $data
     * @return array Same keys with sanitised string values.
     */
    public static function sanitizeInput($data) {
        $sanitized = [];

        foreach ($data as $key => $value) {
            if (!is_string($value)) {
                $sanitized[$key] = $value;
                continue;
            }

            $value = strip_tags($value);
            $value = preg_replace('/\s+/', ' ', trim($value));
            $sanitized[$key] = htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
        }

        return $sanitized;
    }

    /**
     * Validates a bank card number with the Luhn checksum and formats it.
     *
     * @param string $cardNumber Digits, optionally separated by spaces or hyphens.
     * @return array On success: valid => true, original, clean (digits only),
     *               formatted (groups of four) and type. On failure:
     *               valid => false and message.
     */
    public static function validateBankCard($cardNumber) {
        $cleanNumber = preg_replace('/[\s-]/', '', $cardNumber);

        if (!preg_match('/^\d+$/D', $cleanNumber)) {
            return ['valid' => false, 'message' => '银行卡号只能包含数字'];
        }

        $length = strlen($cleanNumber);
        if ($length < 13 || $length > 19) {
            return ['valid' => false, 'message' => '银行卡号长度不正确'];
        }

        // Luhn: walking from the right, double every second digit and add
        // the digit sum of each result.
        $sum = 0;
        $alternate = false;

        for ($i = $length - 1; $i >= 0; $i--) {
            $digit = intval($cleanNumber[$i]);

            if ($alternate) {
                $digit *= 2;
                if ($digit > 9) {
                    $digit -= 9; // digit sum of a two-digit value 10..18
                }
            }

            $sum += $digit;
            $alternate = !$alternate;
        }

        if ($sum % 10 !== 0) {
            return ['valid' => false, 'message' => '银行卡号校验失败'];
        }

        return [
            'valid' => true,
            'original' => $cardNumber,
            'clean' => $cleanNumber,
            // Groups of four separated by a space; the trailing one is trimmed.
            'formatted' => trim(chunk_split($cleanNumber, 4, ' ')),
            'type' => self::getBankCardType($cleanNumber)
        ];
    }

    /**
     * Maps a card number to its network by prefix and length.
     *
     * @param string $cardNumber Digits only.
     * @return string 'visa', 'mastercard', 'amex', 'discover', 'unionpay'
     *                or 'unknown'.
     */
    private static function getBankCardType($cardNumber) {
        $patterns = [
            'visa' => '/^4\d{12}(\d{3})?$/D',
            // 51-55 plus the 2221-2720 range.
            'mastercard' => '/^(?:5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)\d{12}$/D',
            'amex' => '/^3[47]\d{13}$/D',
            // 6011, 644-649 and 65.
            'discover' => '/^(?:6011|64[4-9]\d|65\d{2})\d{12}$/D',
            'unionpay' => '/^62\d{14,17}$/D'
        ];

        foreach ($patterns as $type => $pattern) {
            if (preg_match($pattern, $cardNumber)) {
                return $type;
            }
        }

        return 'unknown';
    }
}

// Demo: validate a registration form, then check three card numbers.
echo "=== 数据验证器示例 ===\n";

// A registration payload that satisfies every rule.
$registrationData = [
    'username' => 'john_doe',
    'email' => 'john@example.com',
    'password' => 'StrongPass123!',
    'phone' => '13812345678',
    'idcard' => '12345678901234567X'
];

$errors = DataValidator::validateRegistration($registrationData);

if (!empty($errors)) {
    echo "注册数据验证失败:\n";
    foreach ($errors as $field => $error) {
        echo "- {$field}: {$error}\n";
    }
} else {
    echo "注册数据验证通过\n";

    // Only data that passed validation is sanitised and shown.
    $cleanData = DataValidator::sanitizeInput($registrationData);
    echo "清理后的数据: ", json_encode($cleanData, JSON_UNESCAPED_UNICODE), "\n";
}

// Bank card validation: two well-known test numbers and one invalid number.
echo "\n银行卡验证:\n";
$testCards = [
    '4111 1111 1111 1111',  // Visa test number
    '5555555555554444',     // MasterCard test number
    '1234567890123456'      // fails the checksum
];

foreach ($testCards as $card) {
    $result = DataValidator::validateBankCard($card);
    echo "卡号 {$card}: ";

    if (!$result['valid']) {
        echo "无效 - " . $result['message'] . "\n";
        continue;
    }

    echo "有效 (" . $result['type'] . ") - 格式化: " . $result['formatted'] . "\n";
}
?>

2. 日志分析和监控

<?php
// Log analysis and monitoring: print the banner that opens this section's output.
echo "=== 日志分析和监控 ===\n";

/**
 * Parses web-server and PHP log lines and derives traffic statistics and
 * simple security alerts from the parsed entries.
 */
class LogAnalyzer {

    /**
     * Parses one access-log line in Common or Combined Log Format.
     *
     * @param string $logLine
     * @return array|false Keys ip, timestamp, request, status (int), size
     *                     (int, 0 for "-"), referer and user_agent ('' when
     *                     the line has no such fields); false when the line
     *                     does not match.
     */
    public static function parseAccessLog($logLine) {
        // host ident user [time] "request" status size ["referer" "agent"]
        $pattern = '/^(\S+) \S+ \S+ \[([^\]]+)\] "([^"]*)" (\d+) (\S+)(?: "([^"]*)" "([^"]*)")?/';

        if (!preg_match($pattern, $logLine, $matches)) {
            return false;
        }

        return [
            'ip' => $matches[1],
            'timestamp' => $matches[2],
            'request' => $matches[3],
            'status' => intval($matches[4]),
            'size' => $matches[5] === '-' ? 0 : intval($matches[5]),
            'referer' => $matches[6] ?? '',
            'user_agent' => $matches[7] ?? ''
        ];
    }

    /**
     * Parses one line of a PHP error log.
     *
     * @param string $logLine E.g. "[10-Jul-2023 16:45:30 UTC] PHP Warning:  ... in /a.php on line 3".
     * @return array|false Keys timestamp, level, message, file and line
     *                     (int); false when the line does not match.
     */
    public static function parseErrorLog($logLine) {
        // \s+ after the colon absorbs the separator blanks so the message
        // does not start with a space.
        $pattern = '/^\[([^\]]+)\] PHP (Fatal error|Recoverable fatal error|Catchable fatal error|Parse error|Warning|Notice|Deprecated|Strict Standards):\s+(.+) in (.+) on line (\d+)/';

        if (!preg_match($pattern, $logLine, $matches)) {
            return false;
        }

        return [
            'timestamp' => $matches[1],
            'level' => $matches[2],
            'message' => $matches[3],
            'file' => $matches[4],
            'line' => intval($matches[5])
        ];
    }

    /**
     * Flags entries that look like attacks or failures.
     *
     * The request is inspected both raw and URL-decoded, so encoded
     * payloads ("union%20select", "..%2f") are detected as well.
     *
     * @param array $logEntries Entries shaped like parseAccessLog() output.
     * @return array The flagged entries, each with an added 'flags' list.
     */
    public static function detectSuspiciousActivity($logEntries) {
        // Flag label => pattern tested against the request.
        $requestRules = [
            'SQL注入尝试' => '/union\s+select|drop\s+table|insert\s+into|delete\s+from/i',
            'XSS攻击尝试' => '/<script|javascript:|onload=|onerror=/i',
            '路径遍历尝试' => '/\.\.\/|\.\.\\\\/'
        ];

        $suspicious = [];

        foreach ($logEntries as $entry) {
            $flags = [];

            $request = (string) ($entry['request'] ?? '');
            $haystack = $request . "\n" . urldecode($request);

            foreach ($requestRules as $flag => $rule) {
                if (preg_match($rule, $haystack)) {
                    $flags[] = $flag;
                }
            }

            // Cast so that a status stored as the string "401" is recognised too.
            $status = isset($entry['status']) ? (int) $entry['status'] : null;

            if ($status === 401) {
                $flags[] = '认证失败';
            }

            if (in_array($status, [500, 502, 503, 504], true)) {
                $flags[] = '服务器错误';
            }

            if (!empty($flags)) {
                $entry['flags'] = $flags;
                $suspicious[] = $entry;
            }
        }

        return $suspicious;
    }

    /**
     * Aggregates request counts per IP, status code, page, user agent and hour.
     *
     * @param array $logEntries Entries shaped like parseAccessLog() output.
     * @return array Keys total_requests, unique_ips (top 10), status_codes,
     *               top_pages (top 10), user_agents (top 5), hourly_traffic
     *               (keyed by hour as formatted by date('H')) and
     *               unique_ip_count (number of distinct IPs before the
     *               top-10 cut).
     */
    public static function analyzeTraffic($logEntries) {
        $stats = [
            'total_requests' => count($logEntries),
            'unique_ips' => [],
            'status_codes' => [],
            'top_pages' => [],
            'user_agents' => [],
            'hourly_traffic' => []
        ];

        foreach ($logEntries as $entry) {
            if (isset($entry['ip'])) {
                $stats['unique_ips'][$entry['ip']] = ($stats['unique_ips'][$entry['ip']] ?? 0) + 1;
            }

            if (isset($entry['status'])) {
                $stats['status_codes'][$entry['status']] = ($stats['status_codes'][$entry['status']] ?? 0) + 1;
            }

            // Page = request target without its query string.
            if (isset($entry['request'])) {
                preg_match('/^\w+ ([^\s\?]+)/', $entry['request'], $matches);
                $page = $matches[1] ?? 'unknown';
                $stats['top_pages'][$page] = ($stats['top_pages'][$page] ?? 0) + 1;
            }

            if (isset($entry['user_agent'])) {
                $stats['user_agents'][$entry['user_agent']] = ($stats['user_agents'][$entry['user_agent']] ?? 0) + 1;
            }

            if (isset($entry['timestamp'])) {
                // Skip timestamps that cannot be parsed instead of counting
                // them under the hour of the Unix epoch.
                $time = strtotime($entry['timestamp']);
                if ($time !== false) {
                    $hour = date('H', $time);
                    $stats['hourly_traffic'][$hour] = ($stats['hourly_traffic'][$hour] ?? 0) + 1;
                }
            }
        }

        arsort($stats['unique_ips']);
        arsort($stats['status_codes']);
        arsort($stats['top_pages']);
        arsort($stats['user_agents']);
        ksort($stats['hourly_traffic']);

        // Record the real number of visitors before the list is cut down.
        $stats['unique_ip_count'] = count($stats['unique_ips']);

        $stats['unique_ips'] = array_slice($stats['unique_ips'], 0, 10, true);
        $stats['top_pages'] = array_slice($stats['top_pages'], 0, 10, true);
        $stats['user_agents'] = array_slice($stats['user_agents'], 0, 5, true);

        return $stats;
    }

    /**
     * Builds a report combining traffic statistics and security alerts.
     *
     * @param array $logEntries Entries shaped like parseAccessLog() output.
     * @return array Keys summary (total_requests, unique_visitors,
     *               suspicious_activities, error_rate as the percentage of
     *               5xx responses), traffic_analysis and security_alerts.
     */
    public static function generateReport($logEntries) {
        $traffic = self::analyzeTraffic($logEntries);
        $suspicious = self::detectSuspiciousActivity($logEntries);

        // Count every 5xx status, matching what the alerting above treats
        // as a server error.
        $serverErrors = 0;
        foreach ($traffic['status_codes'] as $code => $count) {
            if ($code >= 500 && $code <= 599) {
                $serverErrors += $count;
            }
        }

        $total = $traffic['total_requests'];
        $errorRate = ($serverErrors > 0 && $total > 0) ?
            round(($serverErrors / $total) * 100, 2) : 0;

        return [
            'summary' => [
                'total_requests' => $total,
                'unique_visitors' => $traffic['unique_ip_count'],
                'suspicious_activities' => count($suspicious),
                'error_rate' => $errorRate
            ],
            'traffic_analysis' => $traffic,
            'security_alerts' => $suspicious
        ];
    }
}

// Demo: parse four sample access-log lines and print the resulting report.
echo "=== 日志分析器示例 ===\n";

// Sample lines in Combined Log Format; the third carries an SQL injection attempt.
$sampleLogs = [
    '192.168.1.100 - - [10/Jul/2023:16:45:30 +0800] "GET /index.php HTTP/1.1" 200 1234 "https://www.google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"',
    '192.168.1.101 - - [10/Jul/2023:16:46:15 +0800] "POST /login.php HTTP/1.1" 401 567 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"',
    '192.168.1.102 - - [10/Jul/2023:16:47:22 +0800] "GET /admin.php?id=1 UNION SELECT * FROM users HTTP/1.1" 403 0 "-" "sqlmap/1.0"',
    '192.168.1.100 - - [10/Jul/2023:16:48:10 +0800] "GET /contact.php HTTP/1.1" 200 2345 "https://www.example.com" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)"'
];

// Keep only the lines the parser understands.
$parsedLogs = [];
foreach ($sampleLogs as $log) {
    $parsed = LogAnalyzer::parseAccessLog($log);
    if (!$parsed) {
        continue;
    }
    $parsedLogs[] = $parsed;
}

echo "解析了 ", count($parsedLogs), " 条日志记录\n";

// Build the report and print its summary figures.
$report = LogAnalyzer::generateReport($parsedLogs);

echo "\n=== 流量分析报告 ===\n";
echo "总请求数: " . $report['summary']['total_requests'] . "\n";
echo "独立访客: " . $report['summary']['unique_visitors'] . "\n";
echo "可疑活动: " . $report['summary']['suspicious_activities'] . "\n";
echo "错误率: " . $report['summary']['error_rate'] . "%\n";

echo "\n热门页面:\n";
foreach ($report['traffic_analysis']['top_pages'] as $page => $count) {
    echo "- {$page}: {$count} 次访问\n";
}

echo "\n状态码分布:\n";
foreach ($report['traffic_analysis']['status_codes'] as $code => $count) {
    echo "- {$code}: {$count} 次\n";
}

// The alert section is printed only when something was flagged.
if (!empty($report['security_alerts'])) {
    echo "\n=== 安全警报 ===\n";
    foreach ($report['security_alerts'] as $alert) {
        echo "IP: " . $alert['ip'] . ", 时间: " . $alert['timestamp'] . "\n";
        echo "请求: " . $alert['request'] . "\n";
        echo "威胁类型: ", implode(', ', $alert['flags']), "\n\n";
    }
}
?>

性能优化和最佳实践

<?php
// Regex performance optimisation: print the banner that opens this section's output.
echo "=== 性能优化和最佳实践 ===\n";

/**
 * Static helpers for timing, linting and selecting regular expressions.
 *
 * The lint and complexity results are textual heuristics over the pattern string;
 * they do not parse the regex.
 */
class RegexOptimizer {
    
    /**
     * Run preg_match() for the given pattern.
     *
     * No pattern cache is kept here: PHP's PCRE extension already caches compiled
     * patterns internally, so repeated calls with the same pattern string do not
     * recompile it.
     *
     * @param string     $pattern Delimited PCRE pattern.
     * @param string     $subject Text to search.
     * @param array|null $matches Receives the match groups (by reference).
     * @return int|false Whatever preg_match() returns: 1, 0, or false on error.
     */
    public static function match($pattern, $subject, &$matches = null) {
        return preg_match($pattern, $subject, $matches);
    }
    
    /**
     * Time each pattern by running preg_match() against one subject repeatedly.
     *
     * @param array  $patterns   Map of label => delimited pattern.
     * @param string $testString Subject passed to every preg_match() call.
     * @param int    $iterations Number of runs per pattern.
     * @return array Map of label => ['pattern', 'time' (total ms), 'avg_time' (ms per run)].
     */
    public static function benchmarkPatterns($patterns, $testString, $iterations = 1000) {
        $results = [];
        
        foreach ($patterns as $name => $pattern) {
            $startTime = microtime(true);
            
            for ($i = 0; $i < $iterations; $i++) {
                preg_match($pattern, $testString);
            }
            
            $elapsedMs = (microtime(true) - $startTime) * 1000;
            
            $results[$name] = [
                'pattern' => $pattern,
                'time' => round($elapsedMs, 4),
                // With zero (or negative) iterations there is nothing to average;
                // report 0.0 instead of dividing by zero.
                'avg_time' => $iterations > 0 ? round($elapsedMs / $iterations, 6) : 0.0,
            ];
        }
        
        return $results;
    }
    
    /**
     * Produce heuristic optimisation hints for a pattern.
     *
     * @param string $pattern Delimited PCRE pattern (a bare body is tolerated).
     * @return array ['pattern' => input, 'suggestions' => string[], 'complexity' => 'low'|'medium'|'high'].
     */
    public static function analyzePattern($pattern) {
        $suggestions = [];
        
        // The hints look at the regex body, so the delimiters and trailing modifiers
        // are removed first; otherwise a leading "^" or trailing "$" is hidden behind
        // the delimiter and anchors are never recognised.
        $body = self::stripDelimiters($pattern);
        
        // Anchors: hint only when the body neither starts with ^ nor ends with $.
        if (!self::isAnchored($body)) {
            $suggestions[] = '考虑使用锚点(^$)来提高匹配效率';
        }
        
        // Capturing groups: "(" not followed by "?" and not escaped as a literal "\(".
        if (preg_match('/(?<!\\\\)\([^?]/', $body)) {
            $suggestions[] = '考虑使用非捕获组(?:...)来提高性能';
        }
        
        // Back-to-back greedy wildcards (".*.*" or ".+.+").
        if (preg_match('/\.\*\.\*|\.\+\.\+/', $body)) {
            $suggestions[] = '避免连续使用贪婪量词，可能导致回溯问题';
        }
        
        // Character classes that have a shorter equivalent.
        if (preg_match('/\[a-zA-Z\]/', $body)) {
            $suggestions[] = '可以使用[[:alpha:]]替代[a-zA-Z]';
        }
        
        if (preg_match('/\[0-9\]/', $body)) {
            $suggestions[] = '可以使用\\d替代[0-9]';
        }
        
        return [
            'pattern' => $pattern,
            'suggestions' => $suggestions,
            'complexity' => self::calculateComplexity($pattern)
        ];
    }
    
    /**
     * Remove the surrounding delimiters and trailing modifiers from a pattern.
     *
     * Returns the input unchanged (minus leading whitespace) when it does not look
     * delimited: first character alphanumeric or a backslash, or no closing delimiter.
     *
     * @param string $pattern
     * @return string Regex body.
     */
    private static function stripDelimiters($pattern) {
        $pattern = ltrim((string) $pattern);
        if ($pattern === '') {
            return '';
        }
        
        $open = $pattern[0];
        if (ctype_alnum($open) || $open === '\\') {
            return $pattern;
        }
        
        // Bracket-style delimiters close with their counterpart; any other
        // delimiter closes with the same character.
        $pairs = ['(' => ')', '[' => ']', '{' => '}', '<' => '>'];
        $close = isset($pairs[$open]) ? $pairs[$open] : $open;
        
        // Modifiers are letters, so the last occurrence of the closing character
        // is the closing delimiter.
        $end = strrpos($pattern, $close);
        if ($end === false || $end === 0) {
            return $pattern;
        }
        
        return (string) substr($pattern, 1, $end - 1);
    }
    
    /**
     * Tell whether a regex body starts with "^" or ends with an unescaped "$".
     *
     * @param string $body Pattern without delimiters.
     * @return bool
     */
    private static function isAnchored($body) {
        if ($body === '') {
            return false;
        }
        
        if ($body[0] === '^') {
            return true;
        }
        
        if (substr($body, -1) !== '$') {
            return false;
        }
        
        // An odd number of backslashes directly before the "$" makes it a literal.
        $prefix = (string) substr($body, 0, -1);
        $backslashes = strlen($prefix) - strlen(rtrim($prefix, '\\'));
        
        return $backslashes % 2 === 0;
    }
    
    /**
     * Score a pattern by counting constructs and bucket the score.
     *
     * Each quantifier character, bracketed class and parenthesised group adds 1;
     * each numeric back-reference (\1..\9) adds 2. Scores up to 5 are 'low',
     * up to 15 'medium', anything above 'high'.
     *
     * @param string $pattern
     * @return string 'low', 'medium' or 'high'.
     */
    private static function calculateComplexity($pattern) {
        $complexity = 0;
        
        // Quantifier characters.
        $complexity += preg_match_all('/[*+?{]/', $pattern);
        
        // Character classes.
        $complexity += preg_match_all('/\[.*?\]/', $pattern);
        
        // Groups.
        $complexity += preg_match_all('/\(.*?\)/', $pattern);
        
        // Back-references weigh double.
        $complexity += preg_match_all('/\\\\[1-9]/', $pattern) * 2;
        
        if ($complexity <= 5) {
            return 'low';
        } elseif ($complexity <= 15) {
            return 'medium';
        } else {
            return 'high';
        }
    }
    
    // Library of frequently used patterns, in a loose and a stricter variant each.
    const COMMON_PATTERNS = [
        'email_simple' => '/^[^@]+@[^@]+\.[^@]+$/',
        'email_strict' => '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/',
        'phone_loose' => '/\d{11}/',
        'phone_strict' => '/^1[3-9]\d{9}$/',
        'url_simple' => '/https?:\/\/\S+/',
        'url_strict' => '/^https?:\/\/[^\s\/$.?#].[^\s]*$/'
    ];
    
    /**
     * Look up the pattern for a data type at a given strictness level.
     *
     * @param string $type       'email', 'phone' or 'url'.
     * @param string $strictness 'loose', 'medium' or 'strict'.
     * @return string|null The pattern, or null for an unknown type/strictness.
     */
    public static function selectOptimalPattern($type, $strictness = 'medium') {
        $patterns = [
            'email' => [
                'loose' => self::COMMON_PATTERNS['email_simple'],
                'medium' => self::COMMON_PATTERNS['email_strict'],
                // NOTE: this pattern uses lowercase-only classes and carries no "i"
                // modifier, so addresses containing uppercase letters do not match.
                'strict' => '/^(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$/'
            ],
            'phone' => [
                'loose' => self::COMMON_PATTERNS['phone_loose'],
                'medium' => self::COMMON_PATTERNS['phone_strict'],
                'strict' => '/^(?:\+86)?1[3-9]\d{9}$/'
            ],
            'url' => [
                'loose' => self::COMMON_PATTERNS['url_simple'],
                'medium' => self::COMMON_PATTERNS['url_strict'],
                'strict' => '/^https?:\/\/(?:[-\w.])+(?:\:[0-9]+)?(?:\/(?:[\w\/_.])*(?:\?(?:[\w&=%.])*)?(?:\#(?:[\w.])*)?)?$/'
            ]
        ];
        
        return $patterns[$type][$strictness] ?? null;
    }
}

// Demo: time, analyse and select patterns with RegexOptimizer.
echo "=== 性能优化器示例 ===\n";

// Subject string used for every timing run.
$testString = "联系邮箱：test@example.com，电话：13812345678";

// Three e-mail patterns of increasing size to compare against each other.
$emailPatterns = [
    'simple' => '/\S+@\S+/',
    'medium' => '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
    'complex' => '/^(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$/'
];

// 100 runs per pattern keeps the demo quick.
$benchmarkResults = RegexOptimizer::benchmarkPatterns($emailPatterns, $testString, 100);

echo "邮箱模式性能测试结果 (100次迭代):\n";
foreach ($benchmarkResults as $label => $timing) {
    echo '- ' . $label . ': ' . $timing['time'] . 'ms 总时间, ' . $timing['avg_time'] . "ms 平均时间\n";
}

// Print the complexity rating and any hints for each pattern.
echo "\n=== 模式分析 ===\n";
foreach ($emailPatterns as $label => $regex) {
    $analysis = RegexOptimizer::analyzePattern($regex);
    echo "\n" . $label . " 模式分析:\n";
    echo '复杂度: ' . $analysis['complexity'] . "\n";

    // Nothing to suggest: say so and move on to the next pattern.
    if (empty($analysis['suggestions'])) {
        echo "模式已优化\n";
        continue;
    }

    echo "优化建议:\n";
    echo '- ' . implode("\n- ", $analysis['suggestions']) . "\n";
}

// Look up the recommended patterns for two common validation tasks.
echo "\n=== 最优模式选择 ===\n";
$optimalEmail = RegexOptimizer::selectOptimalPattern('email', 'medium');
echo '推荐的邮箱验证模式: ' . $optimalEmail . "\n";

$optimalPhone = RegexOptimizer::selectOptimalPattern('phone', 'strict');
echo '推荐的手机号验证模式: ' . $optimalPhone . "\n";
?>

总结

通过本文的学习，我们全面掌握了PHP正则表达式的使用技巧：

关键要点

基础语法: 掌握了preg_match、preg_replace、preg_split等核心函数
高级模式: 学会了复杂的模式匹配和数据提取技巧
实际应用: 了解了数据验证、日志分析、文本处理等实用场景
性能优化: 掌握了正则表达式的性能优化方法

最佳实践

使用锚点提高匹配效率
优先使用非捕获组减少内存消耗
避免过度使用贪婪量词
复用相同的模式字符串（PHP的PCRE扩展会自动缓存编译后的正则表达式，无需手动缓存）
根据需求选择合适的严格程度

安全建议

验证用户输入防止注入攻击
使用白名单而不是黑名单
对正则表达式进行性能测试
避免复杂的回溯模式
定期更新验证规则

掌握这些正则表达式技巧，将大大提高你处理文本数据的能力和效率。记住，正则表达式是一把双刃剑，合理使用能事半功倍，过度使用可能影响性能。