假设编码为UTF-8,将=C3=B6转换为ö

时间:2019-01-23 18:54:37

标签: c utf-8

因此,我尝试阅读电子邮件,该电子邮件被编码为带引号的可打印内容,因此包含例如:

=C3=B6

应将其转换为

ö

我知道C3 B6是ö的UTF-8十六进制表示形式,但是我实际上不明白如何将char *str = "=C3=B6"转换为char *str = "ö"。

我正在使用linux,但是会将代码移植到Windows,所以我想要一个多平台解决方案。

我该怎么做?

3 个答案:

答案 0 :(得分:1)

下面是一个可以帮助OP入门的方法。

解析字符串"=C3=B6",查找由2个字符组成的十六进制字节值。然后形成一个要打印的字符串(并希望它会被解释为UTF-8)——符合标准的编译器的printf("%s", ...)“对多字节字符没有特殊规定”。

printf("%s", ...)

输出

#include "stdio.h"
/*
 * Demo: print the raw quoted-printable text, then the same character three
 * ways -- as a source literal, as explicit UTF-8 bytes, and as two bytes
 * parsed out of the "=c3=b6" escape with sscanf().
 */
int main(void) {
  const char *encoded = "=C3=B6";
  /* Two decoded bytes plus a terminating zero so the array is a string. */
  unsigned char a[3] = { 0 };
  int fields;

  printf("%s\n", encoded);
  printf("1 %s\n", "ö");
  printf("2 %s\n", "\xC3\xB6");

  /* %hhx stores each parsed hexadecimal value into a single byte. */
  fields = sscanf("=c3=b6", "=%hhx=%hhx", &a[0], &a[1]);
  if (fields == 2) {
    printf("3 %s\n", a);
  }

  return 0;
}

答案 1 :(得分:0)

这应该会让您入门。

我已经对其进行了测试,它似乎适用于您提供的输入。它带有一些错误检查,但并不多。

#include <stdio.h>

// hexnib -- convert ascii hex digit to binary value
//
// Only the low 8 bits of chr are examined.  Both upper-case and lower-case
// digits are accepted.
//
// returns: 0..15 for a valid hex digit, -1 for anything else
int
hexnib(int chr)
{

    chr &= 0xFF;

    if ((chr >= '0') && (chr <= '9'))
        return chr - '0';

    if ((chr >= 'A') && (chr <= 'F'))
        return chr - 'A' + 10;

    if ((chr >= 'a') && (chr <= 'f'))
        return chr - 'a' + 10;

    // not a hex digit
    return -1;
}

// convert -- decode a quoted-printable string
//
// utf8: destination buffer; must have room for strlen(quo) + 1 bytes
//       (the decoded text is never longer than the encoded text)
// quo:  zero-terminated quoted-printable input
//
// Every "=XY" (X and Y hex digits) becomes the single byte 0xXY.
// "=\n" and "=\r\n" are soft line breaks and produce no output.
// A lone '=' at the very end of the input is dropped.
// Any other '=' is not a valid escape and is copied through unchanged.
void
convert(char *utf8,const char *quo)
{
    int hi;
    int lo;

    while (*quo != 0) {
        // handle ordinary char (i.e. _not_ start of =XY)
        if (*quo != '=') {
            *utf8++ = *quo++;
            continue;
        }

        // '=' is the last char of the input -- nothing left to decode
        // (quo[1] is always readable here because quo[0] is not the
        // terminator)
        if (quo[1] == 0)
            break;

        // soft line break: "=\n"
        if (quo[1] == '\n') {
            quo += 2;
            continue;
        }

        // soft line break: "=\r\n"
        if ((quo[1] == '\r') && (quo[2] == '\n')) {
            quo += 3;
            continue;
        }

        // get X and Y values -- quo[2] is only examined when quo[1] is a
        // hex digit, so we never read past the terminator
        hi = hexnib(quo[1]);
        lo = (hi >= 0) ? hexnib(quo[2]) : -1;

        // malformed escape: keep the '=' and decode the rest normally
        if ((hi < 0) || (lo < 0)) {
            *utf8++ = *quo++;
            continue;
        }

        // each hex digit is one nibble (4 bits): X is the high nibble
        *utf8++ = (char) ((hi << 4) | lo);
        quo += 3;
    }

    // store end of string
    *utf8 = 0;
}

// main -- decode quoted-printable text line by line
//
// Reads from the file named by the first command line argument, or from
// stdin when no argument is given, and writes the decoded text to stdout.
//
// returns: 0 on success, 1 if the input file could not be opened
int
main(int argc,char **argv)
{
    char *fname;
    FILE *fi;
    char ibuf[1000];
    // convert() never produces more bytes than it consumes, so an output
    // buffer of the same size as the input buffer is sufficient
    char obuf[1000];

    // skip over the program name
    --argc;
    ++argv;

    fname = *argv;
    if (fname != NULL) {
        fi = fopen(fname,"r");
        // without this check a missing/unreadable file would hand a NULL
        // stream to fgets
        if (fi == NULL) {
            perror(fname);
            return 1;
        }
    }
    else
        fi = stdin;

    while (1) {
        char *cp = fgets(ibuf,sizeof(ibuf),fi);
        if (cp == NULL)
            break;

        convert(obuf,ibuf);

        fputs(obuf,stdout);
    }

    // only close what we opened ourselves
    if (fname != NULL)
        fclose(fi);

    return 0;
}

答案 2 :(得分:0)

解码quoted-printable字符串涉及两件事:

  • 忽略软换行符。软换行符即=后面紧跟一个换行符。

  • =后跟两个十六进制数字转换为其代码与该十六进制值匹配的字符

有三种主要的数据解码方法:

  1. 输入过滤器。而不是例如fgetc(),您可以使用一个函数来读取和解码带引号的可打印输入。

  2. 转换为新缓冲区。请参见Craig Estey对本问题的回答中的convert()函数。

  3. 就地转换。之所以可行,是因为每个有效的带引号的可打印编码字符串至少与其解码后的字符串一样长。


输入过滤器。为简单起见,让我们一次只处理一个字符。(请注意,许多UTF-8字符由多个char,即多个字节组成。)

首先,我们确实需要一个辅助函数来将十六进制数字字符转换为它们各自的算术值:

/*
 * Return the numeric value (0..15) of the hexadecimal digit character c,
 * accepting both upper-case and lower-case letters, or -1 when c is not a
 * hexadecimal digit.
 *
 * The digits are matched against character literals rather than computed
 * from character-code arithmetic.
 */
static inline int hex_digit(const int c)
{
    static const char upper_digits[] = "0123456789ABCDEF";
    static const char lower_digits[] = "0123456789abcdef";
    int value;

    /* Only the 16 digit positions are scanned, never the terminating zero. */
    for (value = 0; value < 16; value++) {
        if (c == upper_digits[value] || c == lower_digits[value])
            return value;
    }

    return -1;
}

在大多数情况下,您也可以将其写为

/*
 * Return the numeric value (0..15) of the hexadecimal digit character c,
 * or -1 when c is not a hexadecimal digit.
 *
 * This variant relies on '0'..'9', 'A'..'F' and 'a'..'f' each being a
 * contiguous run of character codes.
 */
static inline int hex_digit(const int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    else
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    else
    if (c >= 'a' && c <= 'f')   /* upper bound is lower-case 'f' */
        return c - 'a' + 10;
    else
        return -1;
}

甚至是

/*
 * Lookup table indexed by character code: the value of that character as a
 * hexadecimal digit, or -1 if it is not one.  The table only holds those
 * values after init_hex_digit_values() has been called.
 */
static signed char  hex_digit_value[UCHAR_MAX + 1];

/*
 * Look up the hexadecimal digit value of c in hex_digit_value[].
 * c is reduced to an unsigned char so it is always a valid table index.
 */
static inline int hex_digit(const int c)
{
    const unsigned char slot = (unsigned char)c;

    return hex_digit_value[slot];
}

/*
 * Fill hex_digit_value[]: every entry becomes -1, then the entries for
 * '0'..'9', 'A'..'F' and 'a'..'f' are set to 0..15.
 */
static inline void init_hex_digit_values(void)
{
    static const char upper_digits[] = "0123456789ABCDEF";
    static const char lower_digits[] = "0123456789abcdef";
    int  slot, value;

    /* Start with "not a hex digit" everywhere. */
    for (slot = 0; slot <= UCHAR_MAX; slot++)
        hex_digit_value[slot] = -1;

    /* The position of a digit in the strings above is its value. */
    for (value = 0; value < 16; value++) {
        hex_digit_value[(unsigned char)upper_digits[value]] = (signed char)value;
        hex_digit_value[(unsigned char)lower_digits[value]] = (signed char)value;
    }
}

其中init_hex_digit_values()在程序开始时被调用一次。我更喜欢第一种形式,因为它最便携,但是第二种形式是您通常看到的。

使用hex_digit_value[]数组的第三种形式是过早优化的一个例子。在某些情况下,它可能比其他方法稍快一些(但差异非常小,在实践中并不重要);不过,如果想用同一份代码支持差异很大的单字节字符集(例如EBCDIC和ASCII),它可能会很有用。

首先,从包含带引号的可打印数据的流(文件或句柄)中读取已解码的字符:

/*
 * Read one decoded character from a quoted-printable stream.
 *
 *   from    Stream containing quoted-printable data.
 *
 * Returns the next decoded character (as an unsigned char converted to
 * int, like fgetc()), or EOF at end of input, on a read error, or when
 * 'from' is NULL.
 *
 * "=XY" with two hexadecimal digits yields the character with code 0xXY.
 * Soft line breaks ("=\n", and also "=\r\n" or "=\r") are skipped.
 * A '=' that starts neither of those is returned as a literal '=' and the
 * characters following it are pushed back for the next call.
 */
int get_quoted_printable_char(FILE *from)
{
    int  c, c2, hi, lo;

    /* Paranoid check. */
    if (!from || ferror(from) || feof(from))
        return EOF;

    /* Loop only so that consecutive soft line breaks are all skipped. */
    while (1) {

        c = fgetc(from);
        if (c != '=')
            return c;

        /* Soft newline? */
        c = fgetc(from);
        if (c == '\n')
            continue;

        if (c == '\r') {
            /* "=\r\n" or a bare "=\r": consume the optional '\n'. */
            c2 = fgetc(from);
            if (c2 != '\n' && c2 != EOF)
                ungetc(c2, from);
            continue;
        }

        /* '=' at the end of input? */
        if (c == EOF)
            return EOF;

        hi = hex_digit(c);
        if (hi < 0) {
            /* Invalid input; emit '=' instead. */
            ungetc(c, from);
            return '=';
        }

        c2 = fgetc(from);
        if (c2 == EOF) {
            /* Invalid input; emit '=' <c> instead. */
            ungetc(c, from);
            return '=';
        }

        lo = hex_digit(c2);
        if (lo < 0) {
            /* Invalid input; try to emit '=' <c> <c2> instead.
               Only one character of pushback is guaranteed by the
               C standard, so the second ungetc() may fail. */
            ungetc(c2, from);
            ungetc(c, from);
            return '=';
        }

        return lo + 16 * hi;
    }
}

函数中之所以使用循环,是为了处理输入中出现多个连续软换行符的情况。这种情况其实不应该发生,但如果确实发生了,我们也希望忽略它们。

如果您想将带引号的可打印流复制到文件中,则只需要上面的内容,例如

/*
 * Decode the quoted-printable stream 'source' into the file 'filename'.
 *
 * On success 'source' has been read to end of input and closed.  On any
 * failure after the target file was created, the target is closed and
 * removed again.
 *
 * Returns:
 *    0  success
 *   -1  invalid source handle (NULL, or already in error state)
 *   -2  invalid filename (NULL or empty)
 *   -3  cannot open filename for writing
 *   -4  error reading or closing source
 *   -5  error writing or closing the target file
 */
int save(FILE *source, const char *filename)
{
    FILE  *target;
    int    c;
    int    write_failed = 0;

    if (!source || ferror(source))
        return -1;  /* Invalid source handle */

    if (!filename || !*filename)
        return -2;  /* Invalid filename */

    target = fopen(filename, "w");
    if (!target)
        return -3;  /* Cannot open filename for writing */

    while (1) {
        c = get_quoted_printable_char(source);
        if (c == EOF)
            break;

        if (fputc(c, target) == EOF) {
            write_failed = 1;
            break;
        }
    }

    /* A failed write stops the loop before source reaches end of input;
       report it as a write error rather than letting the !feof(source)
       test below misreport it as a read error. */
    if (write_failed) {
        fclose(target);
        remove(filename);
        return -5; /* Write error */
    }

    if (!feof(source) || ferror(source)) {
        fclose(target);
        remove(filename);
        return -4; /* Error reading source. */
    }
    if (fclose(source)) {
        fclose(target);
        remove(filename);
        return -4; /* Error closing source (delayed read error). */
    }

    if (ferror(target) || fflush(target)) {
        fclose(target);
        remove(filename);
        return -5; /* Write error */
    }
    if (fclose(target)) {
        remove(filename);
        return -5; /* Error closing target; delayed write error */
    }

    /* Success. */
    return 0;
}

请特别注意其中对读写错误的检测。它并不是非常快,因为它依赖C库来缓冲输入,但是也不慢。实际上,它不使用任何显式缓冲区(依靠标准C库来决定如何缓冲源和正在写入的文件)这一事实使它在总体上可以接受。

转换为新缓冲区或就地转换非常相似:

/*
 * Decode the quoted-printable string 'src' into 'dst'.
 *
 *   dst    Destination buffer.  May be the same pointer as src
 *          (in-place decoding).
 *   src    Zero-terminated quoted-printable input.
 *
 * Returns the number of characters stored in dst, not counting the
 * terminating '\0'.  errno is set to EINVAL (and 0 is returned) when either
 * pointer is NULL, and is set to 0 otherwise.
 *
 * Decoding rules:
 *   - "=XY" with two hexadecimal digits becomes the character 0xXY;
 *   - '=' followed by a newline ("\n", "\n\r", "\r" or "\r\n") is a soft
 *     newline and produces nothing;
 *   - '=' as the last character of the input ends decoding;
 *   - any other '=' is copied through unchanged;
 *   - a newline in any of the four conventions above becomes a single '\n';
 *   - every other character is copied unchanged.
 */
size_t  decode_quoted_printable(char *dst, const char *src)
{
    char *out = dst;

    /* Neither pointer may be NULL. */
    if (dst == NULL || src == NULL) {
        errno = EINVAL;
        return 0;
    }

    while (*src != '\0') {
        /* src[1] is always readable because src[0] is not the terminator.
           Both are fetched before anything is stored, which keeps in-place
           decoding (src == dst) safe. */
        const char  cur = src[0];
        const char  next = src[1];

        /* Hard newline: swallow the optional partner, emit one '\n'. */
        if (cur == '\n' || cur == '\r') {
            const char  partner = (cur == '\n') ? '\r' : '\n';
            src += (next == partner) ? 2 : 1;
            *(out++) = '\n';
            continue;
        }

        /* Ordinary character. */
        if (cur != '=') {
            *(out++) = cur;
            src++;
            continue;
        }

        /* '=' at the end of string. Skipped. */
        if (next == '\0')
            break;

        /* Soft newline: '=' plus a one- or two-character newline. */
        if (next == '\n' || next == '\r') {
            const char  partner = (next == '\n') ? '\r' : '\n';
            src += (src[2] == partner) ? 3 : 2;
            continue;
        }

        /* "=XY" escape.  src[2] is readable because src[1] is not '\0'. */
        {
            const int  hi = hex_digit((unsigned char)next);
            const int  lo = hex_digit((unsigned char)(src[2]));

            if (hi < 0 || lo < 0) {
                /* Not a valid escape: keep the '=' and carry on, so the
                   erroneous `=XY` is reproduced as-is. */
                *(out++) = cur;
                src++;
            } else {
                *(out++) = lo + 16*hi;
                src += 3;
            }
        }
    }

    /* Terminate result to make it a string. */
    *out = '\0';

    /* errno is always set by this function: zero here means "no error",
       even when the source was an empty string. */
    errno = 0;
    return (size_t)(out - dst);
}

请注意,由于字符串字面量不可修改,因此不能这样写:char *data = "foo"; decode_quoted_printable(data, data);

但是您可以执行char data[] = "foo"; decode_quoted_printable(data, data);,因为它声明了一个以字符串"foo"初始化的字符数组。

请注意,上述函数还会自动进行通用换行符转换。也就是说,它支持全部四种换行约定(\r\n、\n\r、\r 和 \n),并将它们全部转换为标准C的\n换行符。

目标缓冲区必须至少与源缓冲区一样长,并且可以使用与源缓冲区相同的目标缓冲区,只要它是可变的(不是字符串字面量,也不指向字符串字面量)。

与从流中逐个读取已解码字符的方法相比,唯一的区别在于这种缓冲区方法需要将整个内容存放在内存缓冲区中。这是优点还是缺点,取决于具体情况。