在 comment to an answer to this question(对本问题某个答案的一条评论)中,有人暗示 PHP 无法反转 Unicode 字符串。
“至于 Unicode,它在 PHP 中之所以‘能用’,是因为大多数应用程序只是把它当作二进制数据处理。是的,PHP 是 8 位干净(8-bit clean)的。试试在 PHP 中写出与下面这条命令等价的代码:perl -Mutf8 -e 'print scalar reverse("ほげほげ")',你会得到乱码,而不是“げほげほ”。” - jrockway
不幸的是,PHP 目前对 Unicode 的支持往好了说也只能算“欠缺”。希望这一点会在 PHP6 中得到大幅改善。
PHP MultiByte functions确实提供了处理unicode所需的基本功能,但它不一致且缺少很多功能。其中一个是反转字符串的函数。
当然,我想反转这段文本并没有别的原因,只是想弄清楚这是否可行。我编写了一个函数来完成反转 Unicode 文本这项“庞大而复杂”的任务,这样在 PHP6 到来之前你可以稍微放松一下。
测试代码:
// Demo: compare byte-wise strrev() with mb_strrev(), first under the default
// mbstring internal encoding and then again after switching it to $enc.
$enc = 'UTF-8';
$text = "ほげほげ";
$defaultEnc = mb_internal_encoding();

echo "Showing results with encoding $defaultEnc.\n\n";

// First pass: whatever internal encoding mbstring started with.
$revNormal = strrev($text);
$revInt = mb_strrev($text);
$revEnc = mb_strrev($text, $enc);

echo "Original text is: $text .\n",
    "Normal strrev output: " . $revNormal . ".\n",
    "mb_strrev without encoding output: $revInt.\n",
    "mb_strrev with encoding $enc output: $revEnc.\n";

// Second pass: only possible when the internal encoding can be switched.
if (!mb_internal_encoding($enc)) {
    echo "\nCould not set internal encoding to $enc!\n";
} else {
    echo "\nSetting internal encoding to $enc from $defaultEnc.\n\n";

    $revNormal = strrev($text);
    $revInt = mb_strrev($text);
    $revEnc = mb_strrev($text, $enc);

    echo "Original text is: $text .\n",
        "Normal strrev output: " . $revNormal . ".\n",
        "mb_strrev without encoding output: $revInt.\n",
        "mb_strrev with encoding $enc output: $revEnc.\n";
}
答案 0 :(得分:9)
这是使用正则表达式的另一种方法:
/**
 * Reverses a UTF-8 string one code point at a time.
 *
 * The pattern uses the `u` modifier (treat subject as UTF-8) and the `s`
 * modifier (let `.` match newlines too), so every code point is captured.
 *
 * @param string $str UTF-8 encoded input
 * @return string the code points of $str in reverse order
 */
function utf8_strrev($str){
    $matches = array();
    preg_match_all('/./us', $str, $matches);

    // $matches[0] holds the full-pattern matches, i.e. one entry per code point.
    $codePoints = $matches[0];

    return implode('', array_reverse($codePoints));
}
答案 1 :(得分:6)
这是另一种方式。这似乎无需指定输出编码(使用几个不同的mb_internal_encoding
进行测试)即可工作:
/**
 * Reverses a UTF-8 string by splitting it on the empty pattern in UTF-8 mode.
 *
 * @param string $text UTF-8 encoded input
 * @return string the characters of $text in reverse order
 */
function mb_strrev($text)
{
    // An empty pattern with the `u` modifier splits between code points;
    // PREG_SPLIT_NO_EMPTY drops the empty pieces at the string edges.
    $characters = preg_split('~~u', $text, -1, PREG_SPLIT_NO_EMPTY);
    $reversed = array_reverse($characters);

    return implode('', $reversed);
}
答案 2 :(得分:4)
答案
/**
 * Reverses a multibyte string character by character using mbstring.
 *
 * @param string      $text     the string to reverse
 * @param string|null $encoding encoding of $text; when null the current
 *                              mbstring internal encoding is used
 * @return string the characters of $text in reverse order
 */
function mb_strrev($text, $encoding = null)
{
    // Omitting the encoding argument makes mbstring use its internal
    // encoding, so resolving it up front gives the same result.
    $charset = ($encoding === null) ? mb_internal_encoding() : $encoding;

    $reversed = '';
    $length = mb_strlen($text, $charset);

    // Walk from the last character down to the first, appending each one.
    for ($index = $length - 1; $index >= 0; $index--) {
        $reversed .= mb_substr($text, $index, 1, $charset);
    }

    return $reversed;
}
答案 3 :(得分:4)
Grapheme函数比mbstring和PCRE函数更正确地处理UTF-8字符串。Mbstring和PCRE可能会破坏字符。您可以通过执行以下代码来查看它们之间的差异。
/**
 * Splits a string into grapheme clusters using the intl grapheme functions.
 *
 * @param string $string UTF-8 encoded input
 * @return array list of grapheme clusters in their original order
 */
function str_to_array($string)
{
    $clusters = [];
    $total = grapheme_strlen($string);

    $index = 0;
    while ($index < $total) {
        // Extract exactly one grapheme cluster at the current cluster offset.
        $clusters[] = grapheme_substr($string, $index, 1);
        $index += 1;
    }

    return $clusters;
}
/**
 * Splits a UTF-8 string into code points using mbstring.
 *
 * @param string $string UTF-8 encoded input
 * @return array list of single-code-point strings in their original order
 */
function str_to_array2($string)
{
    $characters = [];
    $total = mb_strlen($string, "UTF-8");

    $offset = 0;
    while ($offset < $total) {
        // One code point per element; combining marks become separate entries.
        $characters[] = mb_substr($string, $offset, 1, "UTF-8");
        $offset += 1;
    }

    return $characters;
}
/**
 * Splits a UTF-8 string into code points using PCRE.
 *
 * @param string $string UTF-8 encoded input
 * @return array|false list of single-code-point strings, or whatever
 *                     preg_split() returns on failure
 */
function str_to_array3($string)
{
    // Empty pattern in UTF-8 mode splits between code points; empty edge
    // pieces are discarded by PREG_SPLIT_NO_EMPTY.
    $codePoints = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);

    return $codePoints;
}
/**
 * Reverses a string by grapheme cluster (via str_to_array()).
 *
 * @param string $string UTF-8 encoded input
 * @return string the pieces produced by str_to_array() in reverse order
 */
function utf8_strrev($string)
{
    $clusters = str_to_array($string);

    return implode('', array_reverse($clusters));
}
/**
 * Reverses a string using the mbstring-based splitter str_to_array2().
 *
 * @param string $string UTF-8 encoded input
 * @return string the pieces produced by str_to_array2() in reverse order
 */
function utf8_strrev2($string)
{
    $characters = str_to_array2($string);

    return implode('', array_reverse($characters));
}
/**
 * Reverses a string using the PCRE-based splitter str_to_array3().
 *
 * @param string $string UTF-8 encoded input
 * @return string the pieces produced by str_to_array3() in reverse order
 */
function utf8_strrev3($string)
{
    $codePoints = str_to_array3($string);

    return implode('', array_reverse($codePoints));
}
// Sample string from http://www.php.net/manual/en/function.grapheme-strlen.php
// Both letters are written in decomposed form (base letter + combining mark),
// so each visible character consists of two code points.
$string = "a\xCC\x8A"  // 'a' + U+030A COMBINING RING ABOVE (displays like U+00E5)
         ."o\xCC\x88"; // 'o' + U+0308 COMBINING DIAERESIS (displays like U+00F6)

// Render a binary string as upper-case hex so the byte order is visible.
$toHex = function($elem) { return strtoupper(bin2hex($elem)); };

// Expected value first, then the result of each reversal strategy.
$results = [
    'should be' => "o\xCC\x88"."a\xCC\x8A",
    'grapheme' => utf8_strrev($string),
    'mbstring' => utf8_strrev2($string),
    'pcre' => utf8_strrev3($string)
];

var_dump(array_map($toHex, $results));
结果就在这里。
array(4) {
["should be"]=>
string(12) "6FCC8861CC8A"
["grapheme"]=>
string(12) "6FCC8861CC8A"
["mbstring"]=>
string(12) "CC886FCC8A61"
["pcre"]=>
string(12) "CC886FCC8A61"
}
自PHP 5.5(intl 3.0);
/**
 * Reverses a UTF-8 string code point by code point with IntlBreakIterator,
 * which can be used since PHP 5.5 (intl 3.0).
 *
 * Note that the iterator is a code point iterator, so combining sequences
 * are split into their individual code points.
 *
 * @param string $str UTF-8 encoded input
 * @return string the code points of $str in reverse order
 */
function utf8_strrev($str)
{
    $it = IntlBreakIterator::createCodePointInstance();
    $it->setText($str);

    $ret = '';
    $prev = 0;
    foreach ($it as $pos) {
        // $pos is the next boundary offset; the slice [$prev, $pos) is one
        // code point, which is prepended so the result comes out reversed.
        $ret = substr($str, $prev, $pos - $prev) . $ret;
        $prev = $pos;
    }

    return $ret;
}
答案 4 :(得分:1)
另一种方法:
/**
 * Reverses a multibyte string by reversing the bytes of a fixed-width
 * encoding.
 *
 * The text is converted to big-endian UTF-32, the raw bytes are reversed,
 * and the result is read back as little-endian UTF-32. Because every code
 * point occupies exactly four bytes, the byte reversal reverses the order of
 * the code points while the endianness swap restores each one.
 *
 * UTF-32 is used rather than UTF-16 because UTF-16 encodes characters
 * outside the Basic Multilingual Plane as surrogate pairs; reversing those
 * bytes puts the low surrogate before the high one and corrupts the
 * character.
 *
 * @param string      $str the string to reverse
 * @param string|null $enc encoding of $str; when null the current mbstring
 *                         internal encoding is used
 * @return string $str with its code points in reverse order, encoded in $enc
 */
function mb_strrev($str, $enc = null) {
    if ($enc === null) {
        $enc = mb_internal_encoding();
    }

    $fixedWidth = mb_convert_encoding($str, 'UTF-32BE', $enc);

    return mb_convert_encoding(strrev($fixedWidth), $enc, 'UTF-32LE');
}
答案 5 :(得分:0)
使用 utf8_strrev( $str ) 就可以很容易地做到。请参阅下面从我的库中复制的相关源(source)代码:
/**
 * Reverses a UTF-8 string using utf8_split() to obtain its characters.
 *
 * @param string $str UTF-8 encoded input
 * @return string the pieces returned by utf8_split() in reverse order
 */
function utf8_strrev( $str )
{
    $characters = utf8_split( $str );
    $reversed = array_reverse( $characters );

    return implode( '' , $reversed );
}
/**
 * Splits a UTF-8 string into an array of characters, optionally grouped into
 * chunks of $split_length characters.
 *
 * When PCRE has UTF-8 support the string is cleaned with utf8_clean() and
 * split with a regular expression; otherwise a byte-level parser walks the
 * string and collects well-formed 1- to 4-byte sequences.
 *
 * @param mixed $str          value to split; it is cast to string
 * @param int   $split_length number of characters per returned element;
 *                            values of 1 or less yield single characters
 * @return array list of characters/chunks, or an empty array when there is
 *               nothing to return
 */
function utf8_split( $str , $split_length = 1 )
{
    $str = ( string ) $str;
    $ret = array( );

    if( pcre_utf8_support( ) )
    {
        $str = utf8_clean( $str );

        // Split at every position that is neither the very start nor the
        // very end of the string, i.e. between each pair of characters.
        // \X (extended grapheme cluster) is deliberately not used here: it
        // was found to be buggy in many PHP versions.
        // NOTE(review): preg_split() returns false on a PCRE error; that
        // value is not handled specially below - confirm this is acceptable.
        $ret = preg_split( '/(?<!^)(?!$)/u' , $str );
    }
    else
    {
        // Fallback: classify each lead byte by its high bits and consume the
        // matching number of continuation bytes (10xxxxxx). Bytes that match
        // no branch, and lead bytes whose continuation bytes are malformed,
        // add nothing to the result.
        $len = strlen( $str );

        for( $i = 0 ; $i < $len ; $i++ )
        {
            if( ( $str[$i] & "\x80" ) === "\x00" )
            {
                // 0xxxxxxx: single byte
                $ret[] = $str[$i];
            }
            else if( ( ( $str[$i] & "\xE0" ) === "\xC0" ) && ( isset( $str[$i+1] ) ) )
            {
                // 110xxxxx: two-byte sequence
                if( ( $str[$i+1] & "\xC0" ) === "\x80" )
                {
                    $ret[] = $str[$i] . $str[$i+1];
                    $i++;
                }
            }
            else if( ( ( $str[$i] & "\xF0" ) === "\xE0" ) && ( isset( $str[$i+2] ) ) )
            {
                // 1110xxxx: three-byte sequence
                if( ( ( $str[$i+1] & "\xC0" ) === "\x80" ) && ( ( $str[$i+2] & "\xC0" ) === "\x80" ) )
                {
                    $ret[] = $str[$i] . $str[$i+1] . $str[$i+2];
                    $i = $i + 2;
                }
            }
            else if( ( ( $str[$i] & "\xF8" ) === "\xF0" ) && ( isset( $str[$i+3] ) ) )
            {
                // 11110xxx: four-byte sequence
                if( ( ( $str[$i+1] & "\xC0" ) === "\x80" ) && ( ( $str[$i+2] & "\xC0" ) === "\x80" ) && ( ( $str[$i+3] & "\xC0" ) === "\x80" ) )
                {
                    $ret[] = $str[$i] . $str[$i+1] . $str[$i+2] . $str[$i+3];
                    $i = $i + 3;
                }
            }
        }
    }

    if( $split_length > 1 )
    {
        // Regroup single characters into strings of $split_length characters.
        $ret = array_chunk( $ret , $split_length );
        $ret = array_map( 'implode' , $ret );
    }

    // The fallback parser leaves $ret empty for empty or all-invalid input;
    // check for that first so $ret[0] is never read on an empty array.
    // preg_split() on an empty string yields a single empty element instead.
    if( $ret === array( ) || $ret[0] === '' )
    {
        return array( );
    }

    return $ret;
}
/**
 * Removes bytes that do not form a structurally valid UTF-8 sequence.
 *
 * The pattern captures every 1- to 4-byte sequence with a valid lead byte
 * and the right number of continuation bytes; any other single byte is
 * matched by the trailing `.` and replaced with nothing.
 *
 * @param string $str        input to clean
 * @param bool   $remove_bom when true, occurrences of the UTF-8 byte order
 *                           mark are removed as well
 * @return string the cleaned string
 */
function utf8_clean( $str , $remove_bom = false )
{
    $cleaned = preg_replace(
        '/([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3})|./s' ,
        '$1' ,
        $str
    );

    if( !$remove_bom )
    {
        return $cleaned;
    }

    return utf8_str_replace( utf8_bom( ) , '' , $cleaned );
}
/**
 * Thin wrapper around str_replace().
 *
 * @param mixed $search  value(s) to look for
 * @param mixed $replace replacement value(s)
 * @param mixed $subject string or array to search in
 * @param int   $count   receives the number of replacements performed
 * @return mixed $subject with the replacements applied
 */
function utf8_str_replace( $search , $replace , $subject , &$count = 0 )
{
    $result = str_replace( $search , $replace , $subject , $count );

    return $result;
}
/**
 * Returns the UTF-8 byte order mark.
 *
 * @return string the three bytes EF BB BF
 */
function utf8_bom( )
{
    $bom = "\xef\xbb\xbf";

    return $bom;
}
/**
 * Reports whether PCRE accepts the `u` (UTF-8) modifier.
 *
 * The probe runs once; its result is kept in a static variable and returned
 * on every later call.
 *
 * @return int|false the result of the probing preg_match() call
 */
function pcre_utf8_support( )
{
    static $support = null;

    if( $support !== null )
    {
        // Already probed: reuse the cached answer.
        return $support;
    }

    // Errors are suppressed so a build without UTF-8 support fails quietly.
    $support = @preg_match( '//u', '' );

    return $support;
}