Question

我所说的“可读的 UTF-8”，是指任何有效的 UTF-8 字符串；当然，这并不要求用户必须装有能够显示该字符串的字体。可读字符串的示例：

$readable_str0 = "Mary had a little lamb.";
$readable_str1 = "Příšerně žluťoučký kůň úpěl ďábelské ódy.";
$readable_str4 = "صِف خَلقَ خَودِ كَمِثلِ الشَمسِ إِذ بَزَغَت يَحظى الضَجيعُ بِها نَجلاءَ مِعطارِ";
$readable_str5 = "ཨ་ཡིག་དཀར་མཛེས་ལས་འཁྲུངས་ཤེས་བློའི་གཏེར༎"; //(Dzongkha)
$readable_str7 = "とりなくこゑす　ゆめさませ　みよあけわたる";
$readable_str8 = "TWFyeSBoYWQgYSBsaXR0bGUgbGFtYi4=";

不可读的字符串：

$not_readable_str0 = "�M,�T�HLQHT��,)�IU�I�M�";
$not_readable_str1 = "9��Příšerně žluťoučký kůň úpěl ďábelské ódy."
// this has some odd characters at the beginning so should count as unreadable
// it was result of gzdeflate of readable str 1
$not_readable_str4 = "ŹĎ5ůĹńŁV»×~1xâţöÚkkąő«¶’ŢáJ";
//some random selection from gif file

Answer 1

一种取巧的做法（dirty hack），在某些情况下很可能会失败：

$str2 = iconv("UTF-8", "UTF-8//IGNORE", $str);

然后比较 $str 和 $str2 的长度。

Answer 2

正如其他用户所建议的，mb_check_encoding 似乎正是合适的方法——至少是 PHP 中最简单的方法。

我以前在 C++ 中确实做过很多这样的事情！那里没有 mb_check_encoding 函数，我只能自己编写。

请不要在 PHP 中使用这段代码，贴出来只是供好奇的读者参考 ;) 在 PHP 中请使用 mb_check_encoding。

另外，另一位用户所说的“你所谓的二进制乱码仍然是有效的 UTF-8”是完全错误的。您可以高度准确地检查 UTF-8——当然，前提是它不是只有 4 个字节左右的极短字符串，并且其中包含较多“非 ASCII”字符。UTF-8 具有特定的、“很难碰巧符合”的模式。

此代码还会检查“非最短形式”的 UTF-8，这关系到安全问题。“非最短形式”的 UTF-8 可能使一个本应过滤掉恶意命令的程序实际上放行这些命令，从而可能导致 SQL 注入漏洞。

不知道PHP如何处理非最短格式的UTF-8;）如果它让您担心，最好自己检查一下。

// Validates the byte range [source, sourceEnd) as UTF-8.
//
// Returns 0 when every byte belongs to an accepted sequence. Otherwise
// returns the number of bytes from the lead byte of the first rejected
// sequence up to sourceEnd (always > 0), so the caller can locate the
// failure as sourceEnd - result.
long VerifyUTF8(u8* source, u8* sourceEnd) {
    while (source < sourceEnd) {
        u8* seqStart = source;
        u8 lead = *source++;
        if (lead < 0x80) {
            continue; // ASCII byte: nothing further to check
        }

        // LegalUTF8_ takes no end pointer and dereferences the continuation
        // bytes of every lead byte it accepts (0xC2..0xF4). Make sure those
        // bytes lie inside the buffer before calling it, so a sequence cut
        // off by sourceEnd is rejected without reading past the buffer.
        if (lead >= 0xC2 && lead <= 0xF4) {
            long needed = (lead <= 0xDF) ? 1 : (lead <= 0xEF) ? 2 : 3;
            if (sourceEnd - source < needed) {
                return sourceEnd - seqStart;
            }
        }

        source = LegalUTF8_(lead, source);
        // Test for the null "invalid" result first; the range check is kept
        // as a defensive guard.
        if (!source || source > sourceEnd) {
            return sourceEnd - seqStart;
        }
    }

    return 0;
}


// Validates one multi-byte UTF-8 sequence.
//
// FirstChar is the lead byte; source must point at the byte that follows it.
// Returns a pointer just past the last byte of the sequence when it is
// accepted, or 0 when it is rejected.
//
// Lead bytes outside 0xC2..0xF4 are rejected immediately, which also rejects
// ASCII, so only call this for bytes >= 0x80. For any other lead byte the
// byte at source is read unconditionally; each later continuation byte is
// read only after the previous one passed the 0x80..0xBF check. There is no
// end pointer, so the caller must ensure those reads stay in bounds.
u8* LegalUTF8_(u8 FirstChar, u8* source) {
    if (FirstChar < 0xC2 || FirstChar > 0xF4) {
        return 0;
    }

    // Number of continuation bytes, and the constant that removes the lead
    // and continuation marker bits from the accumulated value.
    unsigned remaining;
    u32 bias;
    if (FirstChar <= 0xDF) {
        remaining = 1;
        bias = 0x00003080;
    } else if (FirstChar <= 0xEF) {
        remaining = 2;
        bias = 0x000E2080;
    } else {
        remaining = 3;
        bias = 0x03C82080;
    }

    u8 next = *source++;

    // Tighter range on the second byte for three special lead bytes:
    // 0xE0 / 0xF0 would otherwise allow non-shortest forms, and 0xF4 would
    // allow values above 0x10FFFF.
    if (FirstChar == 0xE0 && next < 0xA0) {
        return 0;
    }
    if (FirstChar == 0xF0 && next < 0x90) {
        return 0;
    }
    if (FirstChar == 0xF4 && next > 0x8F) {
        return 0;
    }

    u32 value = FirstChar;
    for (;;) {
        value = (value << 6) + next;
        if (next < 0x80 || next > 0xBF) {
            return 0; // not a continuation byte
        }
        if (--remaining == 0) {
            break;
        }
        next = *source++;
    }

    // Final check on the decoded value (surrogates etc.) is delegated.
    return UniValid(value - bias) ? source : 0;
}


// Reports whether c is accepted as a decoded code point.
//
// Accepted: everything below 0xD800, and everything above 0xDFFF up to
// 0x10FFFF except 0xFFFE and 0xFFFF. The range 0xD800..0xDFFF is rejected.
// c is unsigned, so a "negative" value computed by the caller shows up as a
// number above 2 billion and is rejected by the upper bound.
bool UniValid( u32 c ) {
    if (c < 0xD800) { // common case first
        return true;
    }
    if (c <= 0xDFFF) {
        return false;
    }
    if (c == 0xFFFE || c == 0xFFFF) {
        return false;
    }
    return c <= 0x0010FFFF;
}

如何区分可读的UTF-8字符串和二进制乱码？

2 个答案: