我使用php类从文章中制作标签云,但我想删除只有3个字符或更少字符的单词,也删除数字单词。
示例标签:1111猴鹿猫猪水牛
我想要结果:猴鹿水牛
该类的PHP代码如下(完整代码见原帖链接):
/**
 * Extract candidate keywords from a piece of text.
 *
 * The text is lower-cased, stripped of HTML tags, has common English words
 * removed, has punctuation replaced by spaces, and is then split on spaces.
 * Numeric tokens and empty tokens are dropped.
 *
 * Relies on $this->str_replace_word() and $this->m_bUTF8, both defined
 * outside this snippet.
 * NOTE(review): str_replace_word() presumably replaces whole words only - confirm.
 *
 * @param string $text Raw text, possibly containing HTML.
 * @return array List of keyword strings; an empty array when nothing qualifies.
 */
function keywords_extract($text)
{
    $text = strtolower($text);
    $text = strip_tags($text);

    /*
     * Handle common words first because they have punctuation and we need to remove them
     * before removing punctuation.
     *
     * The list must end with a terminated string: a dangling "." here would swallow the
     * next statement into the concatenation. There is no trailing comma, so explode()
     * does not produce an empty common word.
     */
    $commonWords = "'tis,'twas,a,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't," .
        "as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't," .
        "don't,either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his," .
        "how,how'd,how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like," .
        "likely,may,me,might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,o'clock,of,off," .
        "often,on,only,or,other,our,own,rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've," .
        "shouldn't,since,so,some,than,that,that'll,that's,the,their,them,then,there,there's,these,they,they'd," .
        "they'll,they're,they've,this,tis,to,too,twas,us,wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what," .
        "what'd,what's,when,when,when'd,when'll,when's,where,where'd,where'll,where's,which,while,who,who'd," .
        "who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,would've,wouldn't,yet,you,you'd,you'll";
    $commonWords = strtolower($commonWords);
    $commonWords = explode(",", $commonWords);
    foreach ($commonWords as $commonWord) {
        $text = $this->str_replace_word($commonWord, "", $text);
    }

    /* remove punctuation and newlines */
    /*
     * Changed to handle international characters
     */
    if ($this->m_bUTF8) {
        $text = preg_replace('/[^\p{L}0-9\s]|\n|\r/u', ' ', $text);
    } else {
        $text = preg_replace('/[^a-zA-Z0-9\s]|\n|\r/', ' ', $text);
    }

    /* remove extra spaces created */
    $text = preg_replace('/ +/', ' ', $text);
    $text = trim($text);

    /* Start from an empty list so callers always receive an array. */
    $keywords = [];
    foreach (explode(" ", $text) as $value) {
        $temp = trim($value);
        /* explode() on an empty string yields one empty token; never report it. */
        if ($temp === '' || is_numeric($temp)) {
            continue;
        }
        $keywords[] = $temp;
    }

    return $keywords;
}
我尝试了各种方法,例如使用 if (strlen($words)<3 && is_numeric($words)==true),但都没有效果。
请帮帮我
答案 0 :(得分:1)
您应该将 && 更改为 ||:
来自:
if (strlen($words)<3 && is_numeric($words)==true)
到:
if (strlen($words)<3 || is_numeric($words)==true)
如果您要删除 3 个字符或更少字符的单词,则应使用 <= 代替 <:
if (strlen($words) <= 3 || is_numeric($words)==true)
答案 1 :(得分:1)
你可以用正则表达式来实现。
将以下代码:
/* collapse each run of spaces into a single space, trim both ends, then split the text into words */
$text = preg_replace('/ +/',' ',$text);
$text = trim($text);
$words = explode(" ", $text);
改为:
/*
 * Remove words of 1 to 3 characters (together with the whitespace that follows them)
 * and every digit. preg_replace() already replaces all matches, and PHP's PCRE has no
 * "g" modifier - using it makes preg_replace() emit a warning and return null.
 */
$words = preg_replace('/\b\w{1,3}\s|[0-9]/i', '', $text);
return $words;
并删除其后的 foreach 部分(包括 return 语句)。
以下是正则表达式的解释:
\b = Match a word boundary position (whitespace or the beginning/end of the string).
\w = Match any word character (alphanumeric & underscore).
{1,3} = Matches 1 to 3 of the preceding token.
\s = Match any whitespace character (spaces, tabs, line breaks).
| = or.
[0-9] = Match any numeric character.
这是对这种模式的通俗解释:“找到一个单词——从单词边界开始、由 1 到 3 个单词字符组成并且后面跟着一个空白字符——或者任意一个数字字符,并将其替换为空字符串。”
答案 2 :(得分:1)
我会略微修改你的过程以使其运行得更快(我相信它应该)。
第1步:不要把 $text 中的每个常用词逐一替换为空字符串(替换操作开销很大),而是把每个常用词存入哈希表,供稍后过滤使用。
// Turn the comma-separated list into a lookup table keyed by the word itself,
// so a later membership test is a single isset() call.
$commonWords = explode(",", $commonWords);
foreach ($commonWords as $stopWord) {
    $hashWord[$stopWord] = $stopWord;
}
第2步:同时过滤掉常用词、数字以及少于 4 个字符的单词。
// Split on runs of whitespace; PREG_SPLIT_NO_EMPTY drops the empty tokens that
// consecutive separators (or empty input) would otherwise produce.
$words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
$keywords = [];
foreach ($words as $value) {
    // Skip if it is a common word (checked on the raw token so contractions like "don't" match)
    if (isset($hashWord[$value])) continue;
    // Drop leading punctuation, then cut the token at the first remaining
    // non-alphanumeric character: "(monkey)," becomes "monkey"
    $word = preg_replace('/^[^a-zA-Z0-9]+|[^a-zA-Z0-9].*$/', '', $value);
    // Skip if the cleaned word is a common word, e.g. "the," is now "the"
    if (isset($hashWord[$word])) continue;
    // Skip if it is numeric
    if (is_numeric($word)) continue;
    // Skip if the word has fewer than 4 characters (this also covers empty results)
    if (strlen($word) < 4) continue;
    $keywords[] = $word;
}
以下是此功能的完整源代码(如果您要复制和粘贴)
/**
 * Extract keywords from text, e.g. for building a tag cloud.
 *
 * The text is lower-cased, stripped of HTML tags and split on whitespace.
 * A token is kept only if, after punctuation cleanup, it is not a common
 * English word, is not numeric, and is at least four characters long.
 *
 * Cleanup is ASCII-only: leading punctuation is dropped and the token is cut
 * at its first remaining non-alphanumeric character, so "well-known" yields
 * "well" and "(monkey)," yields "monkey".
 *
 * @param string $text Raw text, possibly containing HTML.
 * @return array Keyword strings in order of appearance (duplicates are kept);
 *               an empty array when nothing qualifies.
 */
function keywords_extract($text) {
    // Built once per request: a set keyed by common word for O(1) lookups.
    static $hashWord = null;
    if ($hashWord === null) {
        $commonWords = "'tis,'twas,a,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't," .
            "as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't," .
            "don't,either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his," .
            "how,how'd,how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like," .
            "likely,may,me,might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,o'clock,of,off," .
            "often,on,only,or,other,our,own,rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've," .
            "shouldn't,since,so,some,than,that,that'll,that's,the,their,them,then,there,there's,these,they,they'd," .
            "they'll,they're,they've,this,tis,to,too,twas,us,wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what," .
            "what'd,what's,when,when,when'd,when'll,when's,where,where'd,where'll,where's,which,while,who,who'd," .
            "who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,would've,wouldn't,yet,you,you'd,you'll";
        $hashWord = array_flip(explode(",", $commonWords));
    }

    $text = strtolower($text);
    $text = strip_tags($text);

    // Always return an array, even when no token survives the filters.
    $keywords = [];

    // PREG_SPLIT_NO_EMPTY drops the empty tokens produced by consecutive
    // whitespace or by empty input.
    foreach (preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) as $value) {
        // Check the raw token first so contractions such as "don't" match the list.
        if (isset($hashWord[$value])) continue;

        // Clean up before measuring: otherwise "cat," (4 bytes) or "1111,"
        // would slip past the length and numeric filters.
        $word = preg_replace('/^[^a-zA-Z0-9]+|[^a-zA-Z0-9].*$/', '', $value);

        // Skip if the cleaned word is a common word, e.g. "the," is now "the"
        if (isset($hashWord[$word])) continue;
        // Skip if it is numeric
        if (is_numeric($word)) continue;
        // Skip if the word has fewer than 4 characters (also covers empty results)
        if (strlen($word) < 4) continue;

        $keywords[] = $word;
    }

    return $keywords;
}
演示:ideone.com/obG6n
答案 3 :(得分:0)
// A word is skipped when EITHER condition holds, hence "||" rather than "&&":
// it has 3 characters or fewer, or it is numeric. Both checks use the same $word.
if (strlen($word) <= 3 || is_numeric($word)) {
    // Don't add it to the list
}
答案 4 :(得分:0)