解决方法1

Question

我有一个程序，用于显示 UTF-8 编码的字符串，但显示长度有限制（比如 MAX_LEN）。每当遇到长度 > MAX_LEN 的字符串时，我想找出可以在哪里拆分它，以便输出更美观。

例如：

/* Display size limit for one line (30 in this example). */
#define MAX_LEN 30U
/* Example input that exceeds MAX_LEN and therefore has to be split. */
const char big_str[] = "This string cannot be displayed on one single line: it must be splitted";

如果不做任何处理，输出将如下所示：

"This string cannot be displaye" // Truncated because of size limitation
"d on one single line: it must "
"be splitted"

客户端可以为拆分选择合格的分隔符，但是现在，我默认定义了分隔符列表：

#define DEFAULT_DELIMITERS " ;:,)]" // Default set of characters at which a string may be split; clients may supply their own set

所以我正在寻找一种优雅且轻量级的方法来处理这个问题，并且不使用 malloc：我的 API 不应返回子字符串，我只想得到要显示的各个子字符串的位置。

我已经有了一些建议，我将在回答中提出：任何反馈（例如优点和缺点）都会受到赞赏，但最重要的是我对替代解决方案感兴趣。

Answer 1

我只想显示子字符串的位置。

所以你只需要一个函数来分析你的输入，返回找到分隔符的位置。

一种使用 strpbrk() 的可行做法（假定至少使用 C99）：

#include <unistd.h> /* for ssize_t */
#include <string.h>

#define DELIMITERS (" ;.")

/*
 * Record the offset (relative to input) of every character of input that
 * occurs in delimiters.
 *
 * Offsets are written to delimiter_positions[0], [1], ... in ascending
 * order. No terminating entry is written and no bound is checked, so the
 * caller must supply an array large enough for every possible hit and
 * pre-fill it with a sentinel if it needs to find the end of the list.
 */
void find_delimiter_positions(
  const char * input,
  const char * delimiters,
  ssize_t * delimiter_positions)
{
  ssize_t * out = delimiter_positions;

  /* Restart each search one character after the previous hit. */
  for (const char * hit = strpbrk(input, delimiters);
       hit != NULL;
       hit = strpbrk(hit + 1, delimiters))
  {
    *out++ = hit - input;
  }
}

/*
 * Demo: collect the delimiter offsets of a fixed string into a stack array
 * (a C99 variable length array, so no dynamic allocation is needed) and
 * walk the result up to the -1 sentinel.
 */
int main(void)
{
  char input[] = "some randrom data; more.";
  size_t input_length = strlen(input);

  /*
   * A string can contain at most input_length delimiters. One extra slot is
   * reserved so that a -1 sentinel always remains after the last recorded
   * position, even when every character is a delimiter; it also keeps the
   * array from having zero length for an empty input.
   */
  ssize_t delimiter_positions[input_length + 1];
  for (size_t s = 0; s <= input_length; ++s)
  {
    delimiter_positions[s] = -1;
  }

  find_delimiter_positions(input, DELIMITERS, delimiter_positions);

  for (size_t s = 0; -1 != delimiter_positions[s]; ++s)
  {
    /* print out positions */
  }
}

为什么需要 C99：C99 引入了变长数组（Variable Length Array，VLA），要满足“不使用动态内存分配”这一限制，就必须用到它。

如果连 VLA 也不能使用，就只能退而求其次，预先定义每个字符串中可能出现的分隔符的最大数量。不过这应该是可行的，因为待解析字符串的最大长度是已知的，这也就决定了每个字符串中分隔符数量的上限。

对于后一种情况，来自上面例子的那些行

  char input[] = "some randrom data; more.";
  size_t input_length = strlen(input);
  ssize_t delimiter_positions[input_length];

可以替换为

  char input[MAX_INPUT_LEN] = "some randrom data; more.";
  size_t input_length = strlen(input);
  ssize_t delimiter_positions[MAX_INPUT_LEN];

Answer 2

不需要额外存储的方法是使包装函数为每个子字符串调用一个回调函数。在下面的示例中，字符串只使用普通的旧printf打印，但回调可以调用任何其他API函数。

注意事项：

有一个函数 next，它应当前进到下一个 UTF-8 字符。UTF-8 字符的编码宽度可以从它的第一个字节判断出来。
空格和标点符号分隔符的处理方式略有不同：空格既不会附加到行尾，也不会附加到行首（前提是文本中没有连续的空格）；标点符号则保留在行尾。

以下是一个示例实现：

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define DELIMITERS " ;:,)]"

/*
 *      Advance to the next character of a UTF-8 encoded string.
 *
 *      The lead byte tells how many continuation bytes (10xxxxxx) belong
 *      to the character, so the pointer moves forward by one to four
 *      bytes. Only bytes that really are continuation bytes are consumed:
 *      a truncated sequence therefore never carries the pointer past the
 *      terminating NUL, and a stray or invalid byte is skipped on its own.
 *      For plain ASCII this is the same as p + 1.
 */
static const char *next(const char *p)
{
    const unsigned char lead = (unsigned char)*p;
    int extra = 0;  /* expected number of continuation bytes */

    if ((lead & 0xE0) == 0xC0) {
        extra = 1;              /* 110xxxxx: two-byte sequence */
    } else if ((lead & 0xF0) == 0xE0) {
        extra = 2;              /* 1110xxxx: three-byte sequence */
    } else if ((lead & 0xF8) == 0xF0) {
        extra = 3;              /* 11110xxx: four-byte sequence */
    }

    p++;
    while (extra-- > 0 && ((unsigned char)*p & 0xC0) == 0x80) {
        p++;
    }

    return p;
}

/*
 *      A substring of a larger text, described by two pointers into that
 *      text. wrap() stores one of these per output line.
 */
typedef struct {
    const char *begin;  /* first character of the substring */
    const char *end;    /* one past the last character; length is end - begin */
} substr_t;

/*
 *      Word-wraps text to lines of about width characters and stores the
 *      range of every line in lines[] (at most max_num_lines entries).
 *
 *      A line is broken at the last character from DELIMITERS seen before
 *      the width was reached: a space is dropped from the output, any
 *      other delimiter stays at the end of its line. If a line contains no
 *      delimiter at all, it is cut at the current position.
 *
 *      Returns the number of lines stored, or -1 if the text needs more
 *      than max_num_lines lines or if there is nowhere to store a line.
 *
 *      NOTE(review): runs of several consecutive spaces are not handled;
 *      a break inside such a run can produce a line whose end lies before
 *      its begin.
 */
int wrap(const char *text, int width, substr_t *lines, uint32_t max_num_lines)
{
    const char *begin = text;
    const char *split = NULL;
    uint32_t num_lines = 1;
    int l = 0;

    /* Even an empty text yields one line, so one slot is the minimum. */
    if (text == NULL || lines == NULL || max_num_lines == 0) {
        return -1;
    }

    while (*text) {
        if (strchr(DELIMITERS, *text)) {
            split = text;
            if (*text != ' ') split++;  /* keep punctuation on this line */
        }

        if (l++ == width) {
            if (split == NULL) split = text;
            lines[num_lines - 1].begin = begin;
            lines[num_lines - 1].end = split;

            /*
             * If the text ends in a non-space delimiter exactly here, the
             * split point is the terminator: everything has been stored.
             * Continuing would step past the end of the string.
             */
            if (*split == '\0') {
                return (int)num_lines;
            }

            text = begin = split;
            while (*begin == ' ') begin++;
            split = NULL;
            l = 0;
            num_lines++;

            if (num_lines > max_num_lines) {
                return -1;
            }
        }

        text = next(text);
    }

    /* Whatever is left over forms the last line. */
    lines[num_lines - 1].begin = begin;
    lines[num_lines - 1].end = text;

    return (int)num_lines;
}

/*
 *      Demo: wrap a sample text to 48 columns and print one line per
 *      substring found by wrap().
 */
int main(void)
{
    const char *text = "I have a program that displays UTF-8 encoded strings "
        "with a size limitation (say MAX_LEN). Whenever I get a string with a "
        "length > MAX_LEN, I want to find out where I could split it so it "
        "would be printed gracefully.";

    substr_t lines[100];
    const uint32_t max_num_lines = sizeof(lines) / sizeof(lines[0]);

    const int num_lines = wrap(text, 48, lines, max_num_lines);
    if (num_lines < 0) {
        /* max_num_lines is unsigned, so it must not be printed with %d. */
        fprintf(stderr, "error: can't split into %u lines\n",
                (unsigned)max_num_lines);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < num_lines; i++) {
        FILE *stream = stdout;
        const ptrdiff_t line_length = lines[i].end - lines[i].begin;

        /*
         * Text and newline both go through stdio. Mixing an unbuffered
         * write() on the descriptor with a buffered fputc() on the stream
         * reorders the output whenever stdout is not line buffered.
         */
        if (line_length > 0) {
            fwrite(lines[i].begin, 1, (size_t)line_length, stream);
        }
        fputc('\n', stream);
    }

    return EXIT_SUCCESS;
}

附录：下面是另一种方法，它大致仿照 strtok 的使用模式，但不会修改字符串。它需要一个状态结构，使用前必须用待打印的字符串和最大行宽对其进行初始化：

/*
 *      Iteration state for the strtok-like wrap(): initialise src and
 *      width, then call wrap() repeatedly. Each successful call fills in
 *      line and length and moves src forward.
 */
struct wrap_t {
    const char *src;    /* in/out: position in the text where the next call resumes */
    int width;          /* in: maximum line width */
    int length;         /* out: length of the line found by the last call */
    const char *line;   /* out: start of the line found by the last call */
};

/*
 *      Find the next line of the text described by *line.
 *
 *      Leading spaces are skipped, then the text is scanned until
 *      line->width characters have been passed. The line is broken at the
 *      last character from DELIMITERS seen so far (a space is left out of
 *      the line, other delimiters stay at its end), or at the current
 *      position if no delimiter was seen.
 *
 *      On success line->line and line->length describe the line, line->src
 *      points to where the next call resumes, and 0 is returned. Returns -1
 *      when line or line->src is NULL or when only spaces (or nothing) are
 *      left.
 *
 *      line->width is expected to be positive; with a width of 0 a call
 *      can return an empty line without moving line->src.
 */
int wrap(struct wrap_t *line)
{
    const char *begin;
    const char *split = NULL;
    int l = 0;

    if (line == NULL || line->src == NULL) return -1;

    begin = line->src;
    while (*begin == ' ') begin++;
    if (*begin == '\0') return -1;

    /*
     * Scan from the first real character, not from the skipped blanks.
     * Otherwise the blank left over from the previous break is taken as a
     * split point in front of begin, and a word longer than the width gives
     * a negative length without moving src, so the caller never finishes.
     */
    line->src = begin;

    while (*line->src) {
        if (strchr(DELIMITERS, *line->src)) {
            split = line->src;
            if (*line->src != ' ') split++;  /* keep punctuation on this line */
        }

        if (l++ == line->width) {
            if (split == NULL) split = line->src;

            line->line = begin;
            line->length = (int)(split - begin);
            line->src = split;

            return 0;
        }

        line->src = next(line->src);
    }

    /* The rest of the text fits into one last line. */
    line->line = begin;
    line->length = (int)(line->src - begin);

    return 0;
}

这里未列出的定义（DELIMITERS、next）与上文相同，基本算法也没有变化。我认为这种方式对调用方来说很容易使用：

int main()
{
    const char *text = "I have a program that displays UTF-8 encoded strings "
        "with a size limitation (say MAX_LEN). Whenever I get a string with a "
        "length > MAX_LEN, I want to find out where I could split it so it "
        "would be printed gracefully.";

    struct wrap_t line = {text, 60};

    while (wrap(&line) == 0) {
        printf("%.*s\n", line.length, line.line);
    }

    return 0;
}

Answer 3

解决方法1

一个需要被反复调用、直到整个字符串处理完毕的函数：它返回创建子字符串时需要复制的字节数：

API：

/**
 * Return the length between the beginning of the string and the
 * last delimiter (such that returned length <= max_length).
 *
 * Meant to be called repeatedly, each time on the part of the string that
 * has not been consumed yet.
 *
 * @param str         The string to be split.
 * @param delim       String of eligible delimiters for a split.
 * @param max_length  The maximum length of the resulting substring.
 * @return Length of the next substring.
 *         NOTE(review): only the prototype is shown; the calling example
 *         treats 0 as "end of string or error" - confirm against the
 *         implementation.
 */
size_t get_next_substring_length(
    const char * str,    // The string to be split
    const char * delim,  // String of eligible delimiters for a split
    size_t max_length);  // The maximum length of resulting substring

在客户端一侧：

// Walk through big_str one displayable substring at a time.
size_t shift = 0;   // Number of bytes of big_str already consumed
for(;;)
{
    // Where do we start within big_str ?
    const char * tmp = big_str + shift;

    size_t count = get_next_substring_length(tmp, DEFAULT_DELIMITERS, MAX_LEN);
    if(count)
    {
        // Allocate a sub-string and recopy "count" bytes
        // Display the sub-string
        shift += count;
    }
    else // End Of String (or error)
    {
        // Handle potential error
        // Exit the loop: without this the for(;;) would never terminate
        break;
    }
}

解决方法2

定义一个自定义结构来存储子字符串的位置和长度：

// Example text; every entry of sub[] below points into it.
const char * str = "This is a long test string";
// One substring, described by its start inside the original string and its length in bytes.
struct substrings 
{
    const char * str; // Beginning of the substring
    size_t length;    // Length of the substring
} sub[] = { {&str[0], 4},   // "This"
              {&str[5], 2},   // "is"
              {&str[8], 1},   // "a"
              {&str[10], 4},  // "long"
              {&str[15], 4},  // "test"
              {&str[20], 6},  // "string"
              {NULL, 0} };    // terminator: a NULL str marks the end of the list

API：

// Split str at the given delimiters and describe the resulting substrings
// in the array supplied by the caller.
//
// NOTE(review): only the prototype is shown, so the details below are taken
// from the calling example and should be confirmed:
//  - substr:     receives the address of the caller's array; the example
//                passes &substr for an array object, which does not have
//                type struct substrings **.
//  - max_length: the example passes the number of array elements
//                (ARRAY_LENGTH) here.
//  - the example expects the list to end with an entry whose str is NULL.
//  - the meaning of the return value is not shown.
size_t find_substrings(
    struct substrings ** substr, 
    size_t max_length, 
    const char * delimiters, 
    const char * str);

在客户端一侧：

#define ARRAY_LENGTH 20U
struct substrings substr[ARRAY_LENGTH];

// Fill the structure
find_substrings(
    &substr, 
    ARRAY_LENGTH, 
    DEFAULT_DELIMITERS, 
    big_str);

// Browse the structure
for (struct substrings * sub = &substr[0]; substr->str; sub++) 
{
    // Display sub->length bytes of sub->str
}

有些事情困扰着我：

在解决方法1中，我不喜欢无限循环，它往往容易出错
在解决方法2中，我随意固定了 ARRAY_LENGTH 的值，但它本应随输入字符串的长度而变化

如何使用特定分隔符进行自动换行，无需动态分配

3 个答案:

解决方法1

溶液2