我一直在尝试使用一系列正则表达式和PHP函数preg_replace,用PHP编写一个内容整理程序。
我的主要目的是整理内容,例如确保句子的开头是大写字母;逗号后面有空格;等
我想要实现整理的一些例子:
// Whitespace and punctuation rules. Each rule appends one pattern and its
// replacement at the same index of the two parallel arrays.

// Remove any spaces around slashes
$content_replacements_from[] = '/\s*\/\s*/';
$content_replacements_to[] = '/';
// Remove any new lines or tabs
$content_replacements_from[] = '/[\r\n\t]/';
$content_replacements_to[] = ' ';
// Remove any extra spaces
$content_replacements_from[] = '/\s{2,}/';
$content_replacements_to[] = ' ';
// Tidy up joined full stops: "end.Next" / "end .next" -> "end. Next".
// A negative lookahead (not a negated character class) keeps file names such
// as "photo.jpg" intact, and the lookarounds consume nothing but the stop
// itself so consecutive stops are all handled.
$content_replacements_from[] = '/(?<=[a-zA-Z])\s*\.\s*(?!(?:jpe?g|png|pdf|gif|docx?|xlsx?|pptx?|html?|php)\b)(?=[^\s.])/i';
$content_replacements_to[] = '. ';
// Tidy up joined commas, exclamation marks, question marks, semi colons and
// colons. The neighbouring characters are only asserted, never consumed, so
// "a,b,c" becomes "a, b, c" instead of stopping after the first comma.
$content_replacements_from[] = '/(?<=[a-zA-Z0-9])\s*([,!?;:])\s*(?=[a-zA-Z0-9])/';
$content_replacements_to[] = '$1 ';
// Product vocabulary rules. Patterns are case-insensitive and anchored so
// they never fire inside a longer word ("separate", "duke", "built into").
// (?<![a-z]) is used instead of \b where a unit may directly follow a digit
// ("3000RPM").

// Tidy up fluid ounces
$content_replacements_from[] = '/(?<![a-z])fl\.?\s?oz\b/i';
$content_replacements_to[] = 'fl oz';
// Tidy up rpm
$content_replacements_from[] = '/(?<![a-z])rpm\b/i';
$content_replacements_to[] = 'rpm';
// Tidy up UK
$content_replacements_from[] = '/\buk\b/i';
$content_replacements_to[] = 'UK';
// Tidy up kW
$content_replacements_from[] = '/(?<![a-z])kw\b/i';
$content_replacements_to[] = 'kW';
// Tidy up D-radius
$content_replacements_from[] = '/(?<![a-z])d[\s\-]?radius\b/i';
$content_replacements_to[] = 'D-radius';
// Tidy up A-rate. A lowercase "a" only counts when hyphenated, so ordinary
// prose such as "at a rate of" is left alone.
$content_replacements_from[] = '/(?<![A-Za-z])(?:A[\s\-]?|a-)[Rr]ate(?=[ds]?\b)/';
$content_replacements_to[] = 'A-rate';
// Tidy up integrated ("semi -integrated" -> "semi-integrated"); must run
// before the semi-/fully-integrated rules below.
$content_replacements_from[] = '/([a-zA-Z0-9])\s-integrated/i';
$content_replacements_to[] = '$1-integrated';

// Terms that are lowercase mid-sentence and capitalised at the start of the
// text or of a sentence. Key = canonical spelling, value = pattern fragment
// matching the accepted variants.
$content_sentence_case_terms = array(
    'maxi-sense' => 'maxi[\s\-]?sense',
    'side-by-side' => 'side[\s\-]?by[\s\-]?side',
    'extra large' => 'xl\b',
    'in-column' => 'in[\s\-]?column?',
    'built-in' => 'built[\s\-]?in\b',
    'built-under' => 'built[\s\-]?under',
    'under-counter' => 'under[\s\-]?counter',
    'under-cabinet' => 'under[\s\-]?cabinet',
    'semi-integrated' => 'semi[\s\-]?integrated',
    'fully-integrated' => 'fully[\s\-]?integrated',
    'semi-automatic' => 'semi[\s\-]?automatic',
    'fully-automatic' => 'fully[\s\-]?automatic',
    'pull-out' => 'pull[\s\-]?out\b',
);
foreach ($content_sentence_case_terms as $content_term => $content_term_pattern) {
    // 1) normalise every variant to the canonical lowercase spelling
    $content_replacements_from[] = '/\b' . $content_term_pattern . '/i';
    $content_replacements_to[] = $content_term;
    // 2) re-capitalise at the start of the text or after sentence
    //    punctuation; ${1} puts the matched punctuation and space back.
    $content_replacements_from[] = '/(^|[.!?]\s)' . preg_quote($content_term, '/') . '/';
    $content_replacements_to[] = '${1}' . ucfirst($content_term);
}
unset($content_sentence_case_terms, $content_term, $content_term_pattern);

// Tidy up including ("inc", "incl", "inc." mid-sentence)
$content_replacements_from[] = '/(?<![.!?]\s)(?<=\s)[Ii]ncl?\.?(?=\s)/';
$content_replacements_to[] = 'including';
// Tidy up use (lowercase it mid-sentence only, never at a sentence start)
$content_replacements_from[] = '/(?<![.!?]\s)(?<=\s)Use(?=\s)/';
$content_replacements_to[] = 'use';
// Tidy up ?-piece ("3 piece", "T piece" -> "3-piece", "T-piece")
$content_replacements_from[] = '/(?<![a-z])([2345ty])[\s\-]?piece/i';
$content_replacements_to[] = '$1-piece';
// Tidy up ?-spout
$content_replacements_from[] = '/(?<![a-z])(c)[\s\-]?spout/i';
$content_replacements_to[] = '$1-spout';
// Tidy up ?-end (the lookahead allows a plural "s" but not e.g. "c ending")
$content_replacements_from[] = '/(?<![a-z])(c)[\s\-]?end(?=s?\b)/i';
$content_replacements_to[] = '$1-end';
// Tidy up Brushed Steel
$content_replacements_from[] = '/(?<![a-z])b[\-\/]steel/i';
$content_replacements_to[] = 'brushed steel';
// Tidy up Stainless Steel
$content_replacements_from[] = '/(?<![a-z])s[\-\/]steel/i';
$content_replacements_to[] = 'stainless steel';
// Tidy up Silk Steel
$content_replacements_from[] = '/\bsilk\s?steel/i';
$content_replacements_to[] = 'silk steel';
// Symbol and unit rules. Unit words are only abbreviated directly after a
// number, so ordinary prose ("a metre rule") is not mangled.

// Remove trade marks
$content_replacements_from[] = '/™/';
$content_replacements_to[] = '';
// Replace long dashes
$content_replacements_from[] = '/–/';
$content_replacements_to[] = '-';
// Replace single quotes
$content_replacements_from[] = '/’/';
$content_replacements_to[] = "'";
$content_replacements_from[] = '/`/';
$content_replacements_to[] = "'";
// Tidy up m ("5 metres" -> "5m")
$content_replacements_from[] = '/(?<=\d)\s?metres?\b/i';
$content_replacements_to[] = 'm';
// Tidy up m3 ("2 M3" -> "2m³"); (?!\d) avoids touching e.g. "5m30"
$content_replacements_from[] = '/(\d)\s?m3(?!\d)/i';
$content_replacements_to[] = '$1m³';
// Decode a literal &sup3; entity found in the content.
// NOTE(review): the pattern previously matched "³;", which looks like an
// entity that was decoded by accident - confirm.
$content_replacements_from[] = '/&sup3;/';
$content_replacements_to[] = html_entity_decode('&sup3;', ENT_QUOTES, 'UTF-8');
// Tidy up to in between numbers ("1 to 3 to 5" -> "1 - 3 - 5"); the digits
// are only asserted so chained ranges are all converted.
$content_replacements_from[] = '/(?<=\d)\s?to\s?(?=\d)/';
$content_replacements_to[] = ' - ';
// Tidy up per hour
// NOTE(review): the heading says "per hour" but the pattern matches a
// trailing " and h" / " and hr" - behaviour kept as is, confirm the intent.
$content_replacements_from[] = '/\s[Aa]nd\s[Hh][Rr]?$/';
$content_replacements_to[] = 'ph';
// Tidy up l ("10 litres" -> "10l")
$content_replacements_from[] = '/(?<=\d)\s?litres?\b/i';
$content_replacements_to[] = 'l';
// Tidy up -in ("Plug-In" -> "Plug-in", but not "X-India")
$content_replacements_from[] = '/-in\b/i';
$content_replacements_to[] = '-in';
// Joining-word and symbol rules.

// Tidy up + and & (including the HTML-escaped form &amp;)
$content_replacements_from[] = '/\s\+\s/';
$content_replacements_to[] = ' and ';
$content_replacements_from[] = '/\s&(?:amp;)?\s/';
$content_replacements_to[] = ' and ';
// Tidy up *
$content_replacements_from[] = '/\*/';
$content_replacements_to[] = '';
// Lowercase capitalised joining words in the middle of a sentence. The
// negative lookbehind skips words that open a sentence (". With ..."), and
// the surrounding whitespace is asserted rather than consumed so adjacent
// words ("In Of") are both handled.
foreach (array('plus', 'including', 'with', 'without', 'in', 'of', 'for', 'or', 'and', 'to', 'too') as $content_word) {
    $content_replacements_from[] = '/(?<![.!?]\s)(?<=\s)' . ucfirst($content_word) . '(?=\s)/';
    $content_replacements_to[] = $content_word;
}
unset($content_word);
// Tidy up including at the start of the text or of a sentence ("Inc ",
// "Incl. " -> "Including "); anchored so "zinc " is not matched.
$content_replacements_from[] = '/(^|[.!?]\s)(?:[Ii]ncl?\.?|including)(?=\s)/';
$content_replacements_to[] = '${1}Including';
// Tidy up Push/pull: lowercase everywhere, then capitalise at the start of
// the text or a sentence while keeping the original punctuation (${1}).
$content_replacements_from[] = '/\bpush\/pull\b/i';
$content_replacements_to[] = 'push/pull';
$content_replacements_from[] = '/(^|[.!?]\s)push\/pull/';
$content_replacements_to[] = '${1}Push/pull';
// Measurement and British-spelling rules.

// Tidy up mm ("600MM" -> "600mm"); only after a number so words containing
// "MM" are left alone.
$content_replacements_from[] = '/(?<=\d)(\s?)mm\b/i';
$content_replacements_to[] = '${1}mm';
// Tidy up ize to ise (also covers -izes/-ized/-izer/-izers). The lookahead
// after "ize" keeps words such as "citizen" intact, and the leading negative
// lookahead skips words that are spelt with "ize" in British English too.
$content_replacements_from[] = '/\b(?!\w*(?i:size|prize|seize|maize))([a-zA-Z]{2,})ize(?=(?:s|d|rs?)?\b)/';
$content_replacements_to[] = '${1}ise';
// Tidy up yze to yse
$content_replacements_from[] = '/\b([a-zA-Z]{2,})yze(?=(?:s|d|rs?)?\b)/';
$content_replacements_to[] = '${1}yse';
// Tidy up ization to isation
$content_replacements_from[] = '/\b([a-zA-Z]{2,})ization/';
$content_replacements_to[] = '${1}isation';
// Tidy up times symbol ("2x3x4" -> "2 × 3 × 4", "2 x AA" -> "2 × AA").
// The "x" must be followed by a digit or by whitespace, so "5 xenon" is not
// treated as a multiplication.
$content_replacements_from[] = '/(?<=\d)\s*x(?:\s+(?=[0-9A-Za-z])|(?=\d))/i';
$content_replacements_to[] = ' × ';
// Decode a literal &times; entity found in the content.
// NOTE(review): the pattern previously matched "×;", which looks like an
// entity that was decoded by accident - confirm.
$content_replacements_from[] = '/&times;/';
$content_replacements_to[] = html_entity_decode('&times;', ENT_QUOTES, 'UTF-8');
// Tidy up inches / inch ("12 inches" -> 12")
$content_replacements_from[] = '/(?<=\d)\s*inch(?:es)?\b/i';
$content_replacements_to[] = '"';
// Make the replacements. preg_replace() applies the rules in array order and
// returns null if PCRE reports an error (invalid pattern, backtrack limit,
// ...); in that case keep the untouched content instead of wiping it.
$content_tidied = preg_replace($content_replacements_from, $content_replacements_to, $content);
if ($content_tidied !== null) {
    $content = $content_tidied;
}
unset($content_tidied);
这显然是复杂而漫长的。
有没有人知道更好的方法,或者知道有什么现成的类可以做到这一点?
如果可能的话,我还想将其应用于HTML中的内容。
答案 0 :(得分:7)
正则表达式非常适合文本搜索和替换。你现有的那些表达式表明还有改进的余地。但我的答案不是去优化它们,而是建议你开始构建自己的StringCleaner
集合,其中每个清理器可以做不同的事情,但都实现相同的接口:
/**
 * Common contract shared by every cleaner: a single clean() method.
 */
interface StringCleaner
{
/**
 * Clean the given string.
 *
 * @param string $string Text to clean.
 * @return string Presumably the cleaned text - the usage example assigns the
 *                result straight back to a DOM text node.
 */
public function clean($string);
}
接下来,对于HTML,我的想法是创建一个FilterIterator
,提供对所有文本节点的访问,因此可以更容易地使用任何标准清理器更改它们。
要一次应用多个StringCleaner
(即把它们组合成一组),我使用了组合模式(Composite Pattern,从SplObjectStorage
扩展而来),这个组合对象本身也是一个StringCleaner
。
下面是省略了类定义的示例:
// Build the individual cleaners first; they are chained together below.
$trimmer = new TrimCleaner();

$basicRules = new RegexCleaner();
// Remove any spaces around slashes
$basicRules->addRule('\s*\/\s*', '/');
// Remove any new lines or tabs
$basicRules->addRule('[\r\n\t]', ' ');
// Tidy up joined full stops
$basicRules->addRule('(\w+)\.(?!jpeg|jpg|png|pdf|gif|doc|xls|docx|xlsx|ppt|pptx|html|php|htm)(\w+)', '$1. $2');
// Remove any extra spaces
$basicRules->addRule('\s{2,}', ' ');
// Remove single spaces
$basicRules->addRule('^\s$', '');

$inchRules = new RegexCleaner();
// Tidy up inches
$inchRules->addRule('([0-9])\s*[Ii]nches', '$1"');

// The composite is attached to in this order: basic rules, inches, trim.
$cleaner = new CleanerComposite();
foreach (array($basicRules, $inchRules, $trimmer) as $step) {
    $cleaner->attach($step);
}

$htmlString = <<<HTML
<html>
<head>
<title>
hello world.hello earth.
</title>
</head>
<body>
<table><tr><td>test.
</td></tr></table>
<h1>Get it 1 more time.</h1>
<p>When 12 inches were not enough; hickup.</p>
</body>
</html>
HTML;

// load HTML
$dom = new DOMDocument();
$dom->preserveWhiteSpace = false;
$dom->loadHTML($htmlString);

// create XPath
$xpath = new DOMXPath($dom);

// first pass: clean the text nodes yielded by the filter
$textNodes = new DOMTextWhiteSpaceFilter($xpath->query('//text()'));
foreach ($textNodes as $textNode) {
    $cleaned = $cleaner->clean($textNode->data);
    $textNode->data = $cleaned;
}

// second pass: remove whitespace only nodes
$blankNodes = new DOMTextWhiteSpaceFilter($xpath->query('//text()'), DOMTextWhiteSpaceFilter::WHITESPACE);
foreach ($blankNodes as $blankNode) {
    $blankNode->parentNode->removeChild($blankNode);
}

$dom->formatOutput = true;
echo $dom->saveHTML();
如示例所示,当您将复杂性隐藏到具体的StringCleaner
对象中时,您可以开始创建更多动态规则。这还可以通过添加更多StringCleaner
类型来扩展,这些类型不一定要基于正则表达式,例如基于trim
的TrimCleaner
就是一个非常简单的示例。
但可以肯定的是,正则表达式也非常强大。正如您在RegexCleaner
中看到的那样,我已将每个正则表达式分隔符移动到类本身中,因此在定义规则时,您无需反复键入它们。这只是另一个简单的例子,当你用一个已定义的动作接口将替换封装到它自己的类中时,你可以简化一些事情。
答案 1 :(得分:2)
也许有比一大堆正则表达式更好的办法,但如果没有人能提出更好的工具,下面是我用PHP正则表达式的做法。
可读性和易维护性几乎总是比速度更重要。 preg_replace确实需要两个单独的字符串或数组来匹配和替换,但我们可以通过在使用时重新排列数据来处理它。所以,我建议使用以下更易读的格式:
// Each rule keeps its pattern ('From') and its replacement ('To') together.
$content_replacements = array(
    array(
        'From' => '/pattern 1/',
        'To' => '$1 $2',
    ),
    array(
        'From' => '/pattern 2/',
        'To' => '$1,$2.',
    ),
);
它有一个很大的优势:即使你漏写了某个'From'或'To',模式和替换内容也不会错位。
然后要运行所有这些,你可以使用循环:
// Apply the rules one after another, in the order they were defined.
foreach ($content_replacements as $replacement)
{
    $replaced = preg_replace($replacement['From'], $replacement['To'], $content);
    // preg_replace() returns null when PCRE reports an error (e.g. an invalid
    // pattern); skip that rule rather than overwriting $content with null and
    // feeding null into every following rule.
    if ($replaced !== null)
    {
        $content = $replaced;
    }
}
答案 2 :(得分:2)
我认为你首先需要找一份好的正则表达式参考资料,温习一些要点,比如哪些字符在字符类(方括号:[])内部和外部需要转义。然后删除多余的{1},并在合适的地方使用不区分大小写的匹配(/.../i)来清理正则表达式。下面我附上一些代码,你可以用它们来整合部分大小写规则,希望对你有帮助。
/**
 * Capitalize the first character of each sentence.
 *
 * Sentences are delimited by ".", "?" or "!" followed by optional whitespace;
 * the delimiters themselves are kept unchanged.
 *
 * @param string $str Text to tidy.
 * @return string The text with every sentence starting in upper case.
 */
function tidy_sentences($str){
    // With PREG_SPLIT_DELIM_CAPTURE (and no other flag) the result alternates:
    // even indexes hold sentence text, odd indexes hold the captured delimiter.
    $parts = preg_split('/([.?!]\s*)/', $str, 0, PREG_SPLIT_DELIM_CAPTURE);
    foreach ($parts as $index => $part) {
        if ($index % 2 === 0) {
            // ucfirst() is safe on '' - text ending in punctuation yields an
            // empty last part, and writing to offset 0 of an empty string is
            // an error on PHP 8.
            $parts[$index] = ucfirst($part);
        }
    }
    return implode('', $parts);
}
/**
 * Apply capitalization rules to individual words.
 *
 * The text is split on every character that is neither a word character nor
 * a hyphen; the separators are kept, so the output differs from the input
 * only in letter case.
 *
 * @param string $str Text to tidy.
 * @return string The text with the known tokens normalised.
 */
function tidy_words($str){
    $tokens = array();
    foreach (preg_split('/([^\w-])/', $str, 0, PREG_SPLIT_DELIM_CAPTURE) as $token) {
        if (preg_match('/^(uk|kw)$/i', $token)) {
            // tokens you want to uppercase go here
            $tokens[] = strtoupper($token);
        } elseif (preg_match('/^(rpm|fl|oz)$/i', $token)) {
            // tokens you want to lowercase go here
            $tokens[] = strtolower($token);
        } elseif (preg_match('/^(maxi-sense|side-by-side|d-radius)$/i', $token)) {
            // tokens you want to capitalize first letter go here; lowercase
            // first so "MAXI-SENSE" becomes "Maxi-sense", not "MAXI-SENSE"
            $tokens[] = ucfirst(strtolower($token));
        } else {
            $tokens[] = $token;
        }
    }
    return implode('', $tokens);
}
/**
 * Run both tidy passes: word-level rules first, then sentence capitalization.
 *
 * @param string $str Text to tidy.
 * @return string The tidied text.
 */
function tidy($str){
    $wordsTidied = tidy_words($str);
    return tidy_sentences($wordsTidied);
}
echo tidy('foo bar rpm maxi-sense uk Fl OZ baz! pull out');
// => Foo bar rpm Maxi-sense UK fl oz baz! Pull out
答案 3 :(得分:1)
我认为最好的方法不一定是纯粹的正则表达式。正则表达式旨在用作适合遵循非常特定模式的字符串的工具。你的用例似乎并不那么具体,所以我认为是时候探索不同的途径了。
我不确定你要清理的字符串是多么复杂,或者内容通常是什么(可能它通常少于100个单词并且属于某个特定主题,所以常见的词汇量很小)。但是,我认为更灵活(从长远来看可能更容易)解决方案将涉及一个更简单的正则表达式(或一组正则表达式)来识别字符串中的标记。一旦确定了特定的令牌,您就可以根据识别令牌采取非常具体的操作,并对字符串执行更高级的算法。
我认为hakre的StringCleaner类走在正确的方向上:把不同类型的字符串处理从核心逻辑中抽象出来,有助于让代码更整洁、更易于管理、也更灵活。
我知道这些说法非常笼统、偏理论,但你所说的这种库的规模会增长得非常快——因为它(本质上)涉及自然语言处理,而这属于人工智能的范畴。