PHP中的平衡自动换行(最小粗糙度)

时间:2012-01-30 21:42:33

标签: php algorithm word-wrap text-processing

我将在PHP中制作自动换行算法。我想在最大 m 字符的 n 行中分割小块文本(短语)( n 没有给出,所以会有尽可能多的线条)。特点是线条长度(以字符为单位)必须尽可能多地平衡线条。

输入文字示例:

How to do things

输出错误(这是正常的自动换行行为), m = 6

How to
do
things

所需输出,始终 m = 6

How 
to do 
things

是否有人就如何实施此功能提出建议或指导?基本上,我正在搜索两到三个漂亮的印刷短语(尽可能多)等长线


更新:我似乎正在搜索Minimum raggedness word wrap algorithm。但是我找不到真正的编程语言中的任何实现(任何人,然后我可以用PHP转换它)。


更新2 :我为此开始了赏金。是否有可能在任何程序语言中都不存在最小粗糙度算法的任何公共实现?我需要一些可以翻译成程序指令的方式编写的东西。我现在所能找到的只是一个(通用)方程式,但需要一个最佳的搜索程序。我将非常感谢能够只接近最佳搜索算法的实现。

6 个答案:

答案 0 :(得分:7)

我已经在Alex的相同行上实现了编码维基百科算法,但直接在PHP中实现(对我来说这是一个有趣的练习)。了解如何使用最优成本函数 f(j),即“重现”部分,并不是很容易。感谢Alex为评论很好的代码。

/** 
 * minimumRaggedness
 *
 * @param string $input paragraph. Each word separed by 1 space.
 * @param int $LineWidth the max chars per line.
 * @param string $lineBreak wrapped lines separator.
 * 
 * @return string $output the paragraph wrapped.
 */
function minimumRaggedness($input, $LineWidth, $lineBreak = "\n")
{
    $words = explode(" ", $input);
    $wsnum = count($words);
    $wslen = array_map("strlen", $words);
    $inf = 1000000; //PHP_INT_MAX;

    // keep Costs
    $C = array();

    for ($i = 0; $i < $wsnum; ++$i)
    {
        $C[] = array();
        for ($j = $i; $j < $wsnum; ++$j)
        {
            $l = 0;
            for ($k = $i; $k <= $j; ++$k)
                $l += $wslen[$k];
            $c = $LineWidth - ($j - $i) - $l;
            if ($c < 0)
                $c = $inf;
            else
                $c = $c * $c;
            $C[$i][$j] = $c;
        }
    }

    // apply recurrence
    $F = array();
    $W = array();
    for ($j = 0; $j < $wsnum; ++$j)
    {
        $F[$j] = $C[0][$j];
        $W[$j] = 0;
        if ($F[$j] == $inf)
        {
            for ($k = 0; $k < $j; ++$k)
            {
                $t = $F[$k] + $C[$k + 1][$j];
                if ($t < $F[$j])
                {
                    $F[$j] = $t;
                    $W[$j] = $k + 1;
                }
            }
        }
    }

    // rebuild wrapped paragraph
    $output = "";
    if ($F[$wsnum - 1] < $inf)
    {
        $S = array();
        $j = $wsnum - 1;
        for ( ; ; )
        {
            $S[] = $j;
            $S[] = $W[$j];
            if ($W[$j] == 0)
                break;
            $j = $W[$j] - 1;
        }

        $pS = count($S) - 1;
        do
        {
            $i = $S[$pS--];
            $j = $S[$pS--];
            for ($k = $i; $k < $j; $k++)
                $output .= $words[$k] . " ";
            $output .= $words[$k] . $lineBreak;
        }
        while ($j < $wsnum - 1);
    }
    else
        $output = $input;

    return $output;
}

&GT;

答案 1 :(得分:4)

快速而肮脏,用c ++

#include <sstream>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <memory.h>

using namespace std;

int cac[1000][1000];
string res[1000][1000];
vector<string> words;
int M;

int go(int a, int b){
    if(cac[a][b]>= 0) return cac[a][b];
    if(a == b) return 0;

    int csum = -1;
    for(int i=a; i<b; ++i){
    csum += words[i].size() + 1;
    }
    if(csum <= M || a == b-1){
    string sep = "";
        for(int i=a; i<b; ++i){
            res[a][b].append(sep);
            res[a][b].append(words[i]);
            sep = " ";
    }
    return cac[a][b] = (M-csum)*(M-csum);
    }

    int ret = 1000000000;
    int best_sp = -1;
    for(int sp=a+1; sp<b; ++sp){
    int cur = go(a, sp) + go(sp,b);
    if(cur <= ret){
        ret = cur;
        best_sp = sp;
    }
    }
    res[a][b] = res[a][best_sp] + "\n" + res[best_sp][b];
    return cac[a][b] = ret;
}


int main(int argc, char ** argv){
    memset(cac, -1, sizeof(cac));
    M = atoi(argv[1]);
    string word;
    while(cin >> word) words.push_back(word);
    go(0, words.size());
    cout << res[0][words.size()] << endl;
}

测试:

$ echo "The quick brown fox jumps over a lazy dog" |./a.out 10
The quick
brown fox
jumps over
a lazy dog

编辑:只是查看维基百科页面,了解最小的粗糙字包装。将算法更改为给定的算法(具有平方罚分)

答案 2 :(得分:2)

C版:

// This is a direct implementation of the minimum raggedness word wrapping
// algorithm from http://en.wikipedia.org/wiki/Word_wrap#Minimum_raggedness

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <limits.h>

const char* pText = "How to do things";
int LineWidth = 6;
int WordCnt;
const char** pWords;
int* pWordLengths;

int* pC;
int* pF;
int* pW;
int* pS;

int CountWords(const char* p)
{
  int cnt = 0;

  while (*p != '\0')
  {
    while (*p != '\0' && isspace(*p)) p++;

    if (*p != '\0')
    {
      cnt++;
      while (*p != '\0' && !isspace(*p)) p++;
    }
  }

  return cnt;
}

void FindWords(const char* p, int cnt, const char** pWords, int* pWordLengths)
{
  while (*p != '\0')
  {
    while (*p != '\0' && isspace(*p)) p++;

    if (*p != '\0')
    {
      *pWords++ = p;
      while (*p != '\0' && !isspace(*p)) p++;
      *pWordLengths++ = p - pWords[-1];
    }
  }
}

void PrintWord(const char* p, int l)
{
  int i;

  for (i = 0; i < l; i++)
    printf("%c", p[i]);
}

// 1st program's argument is the text
// 2nd program's argument is the line width
int main(int argc, char* argv[])
{
  int i, j;

  if (argc >= 3)
  {
    pText = argv[1];
    LineWidth = atoi(argv[2]);
  }

  WordCnt = CountWords(pText);

  pWords = malloc(WordCnt * sizeof(*pWords));
  pWordLengths = malloc(WordCnt * sizeof(*pWordLengths));

  FindWords(pText, WordCnt, pWords, pWordLengths);

  printf("Input Text: \"%s\"\n", pText);
  printf("Line Width: %d\n", LineWidth);
  printf("Words     : %d\n", WordCnt);

#if 0
  for (i = 0; i < WordCnt; i++)
  {
    printf("\"");
    PrintWord(pWords[i], pWordLengths[i]);
    printf("\"\n");
  }
#endif

  // Build c(i,j) in pC[]
  pC = malloc(WordCnt * WordCnt * sizeof(int));
  for (i = 0; i < WordCnt; i++)
  {
    for (j = 0; j < WordCnt; j++)
      if (j >= i)
      {
        int k;
        int c = LineWidth - (j - i);
        for (k = i; k <= j; k++) c -= pWordLengths[k];
        c = (c >= 0) ? c * c : INT_MAX;
        pC[j * WordCnt + i] = c;
      }
      else
        pC[j * WordCnt + i] = INT_MAX;
  }

  // Build f(j) in pF[] and store the wrap points in pW[]
  pF = malloc(WordCnt * sizeof(int));
  pW = malloc(WordCnt * sizeof(int));
  for (j = 0; j < WordCnt; j++)
  {
    pW[j] = 0;
    if ((pF[j] = pC[j * WordCnt]) == INT_MAX)
    {
      int k;
      for (k = 0; k < j; k++)
      {
        int s;
        if (pF[k] == INT_MAX || pC[j * WordCnt + k + 1] == INT_MAX)
          s = INT_MAX;
        else
          s = pF[k] + pC[j * WordCnt + k + 1];
        if (pF[j] > s)
        {
          pF[j] = s;
          pW[j] = k + 1;
        }
      }
    }
  }

  // Print the optimal solution cost
  printf("f         : %d\n", pF[WordCnt - 1]);

  // Print the optimal solution, if any
  pS = malloc(2 * WordCnt * sizeof(int));
  if (pF[WordCnt - 1] != INT_MAX)
  {
    // Work out the solution's words by back tracking the
    // wrap points from pW[] and store them on the pS[] stack
    j = WordCnt - 1;
    for (;;)
    {
      *pS++ = j;
      *pS++ = pW[j];
      if (!pW[j]) break;
      j = pW[j] - 1;
    }
    // Print the solution line by line, word by word
    // in direct order
    do
    {
      int k;
      i = *--pS;
      j = *--pS;
      for (k = i; k <= j; k++)
      {
        PrintWord(pWords[k], pWordLengths[k]);
        printf(" ");
      }
      printf("\n");
    } while (j < WordCnt - 1);
  }

  return 0;
}

输出1:

ww.exe
Input Text: "How to do things"
Line Width: 6
Words     : 4
f         : 10
How
to do
things

输出2:

ww.exe "aaa bb cc ddddd" 6
Input Text: "aaa bb cc ddddd"
Line Width: 6
Words     : 4
f         : 11
aaa
bb cc
ddddd

输出3:

ww.exe "I started a bounty for this. Is it possible that do not exist any public implementation of Minimum raggedness algorithm in any procedural language? I need something written in a way that can be translated into procedural instructions. All I can find now is just a bounch of (generic) equation that however need a optimal searhing procedure. I will be grateful also for an implementation that can only approximate that optimal searching algorithm." 60
Input Text: "I started a bounty for this. Is it possible that do not exist any public implementation of Minimum raggedness algorithm in any procedural language? I need something written in a way that can be translated into procedural instructions. All I can find now is just a bounch of (generic) equation that however need a optimal searhing procedure. I will be grateful also for an implementation that can only approximate that optimal searching algorithm."
Line Width: 60
Words     : 73
f         : 241
I started a bounty for this. Is it possible that do not
exist any public implementation of Minimum raggedness
algorithm in any procedural language? I need something
written in a way that can be translated into procedural
instructions. All I can find now is just a bounch of
(generic) equation that however need a optimal searhing
procedure. I will be grateful also for an implementation
that can only approximate that optimal searching algorithm.

答案 3 :(得分:2)

我认为最简单的方法是在极限之间进行迭代

E.g。

/** 
 * balancedWordWrap
 *
 * @param string $input
 * @param int $maxWidth the max chars per line
 */
function balancedWordWrap($input, $maxWidth = null) {
    $length = strlen($input);
    if (!$maxWidth) {
        $maxWidth = min(ceil($length / 2), 75);
    }   
    $minWidth = min(ceil($length / 2), $maxWidth / 2); 

    $permutations = array();
    $scores = array();
    $lowestScore = 999;
    $lowest = $minWidth;

    foreach(range($minWidth, $maxWidth) as $width) {
        $permutations[$width] = wordwrap($input, $width);
        $lines = explode("\n", $permutations[$width]);

        $max = 0;
        foreach($lines as $line) {
            $lineLength = strlen($line);
            if ($lineLength > $max) {
                $max = $lineLength;
            }   
        }   

        $score = 0;
        foreach($lines as $line) {
            $lineLength = strlen($line);
            $score += pow($max - $lineLength, 2); 
        }   

        $scores[$width] = $score;
        if ($score < $lowestScore) {
            $lowestScore = $score;
            $lowest = $width;
        }   
    }   

    return $permutations[$lowest];
} 

鉴于输入“如何做事”

输出

How
to do
things

鉴于输入“玛丽有一只小羊羔”

输出

Mary had a
little lamb

考虑到输入“这个超长段落用于演示fmt(1)程序如何处理更长的输入。在测试输入时,你不希望它们太短,也不太长,因为只有通过检查复杂的内容才能确定程序的质量。快速的棕色狐狸跳过懒狗。国会不制定任何法律来尊重宗教的建立,或禁止自由行使;或者剥夺言论自由,或者新闻界;或者人民和平集会的权利,以及向政府请求纠正不满的权利。“,并且最多限制为75个字符,它输出:

This extra-long paragraph was writtin to demonstrate how the `fmt(1)`
program handles longer inputs. When testing inputs, you don't want them 
be too short, nor too long, because the quality of the program can only be
determined upon inspection of complex content. The quick brown fox jumps
over the lazy dog. Congress shall make no law respecting an establishment
of religion, or prohibiting the free exercise thereof; or abridging the
freedom of speech, or of the press; or the right of the people peaceably
to assemble, and to petition the Government for a redress of grievances.

答案 4 :(得分:0)

Justin's linkKnuth's Breaking Paragraphs Into Lines是历史上最佳的答案。 (较新的系统也应用microtypography技术,如摆弄字符宽度,字距调整等,但如果你只是寻找等宽的纯文本,这些额外的方法将无济于事。)

如果您只想解决问题,自由软件基金会在许多Linux系统上提供的fmt(1)实用程序实现了Knuth算法的变体,该算法也试图避免句子末尾的换行符。我编写了您的输入和一个更大的示例,并通过fmt -w 20运行它们以强制使用20个字符的行:

$ fmt -w 20 input 
Lorem ipsum dolor
sit amet

Supercalifragilisticexpialidocious
and some other
small words

One long
extra-long-word

This extra-long
paragraph
was writtin to
demonstrate how the
`fmt(1)` program
handles longer
inputs. When
testing inputs,
you don't want them
to be too short,
nor too long,
because the quality
of the program can
only be determined
upon inspection
of complex
content. The quick
brown fox jumps
over the lazy
dog. Congress
shall make no
law respecting
an establishment
of religion, or
prohibiting the
free exercise
thereof; or
abridging the
freedom of speech,
or of the press;
or the right of the
people peaceably
to assemble,
and to petition
the Government
for a redress of
grievances.

如果允许默认的75个字符的宽度用于非平凡输入,则输出看起来要好得多:

$ fmt input 
Lorem ipsum dolor sit amet

Supercalifragilisticexpialidocious and some other small words

One long extra-long-word

This extra-long paragraph was writtin to demonstrate how the `fmt(1)`
program handles longer inputs. When testing inputs, you don't want them
to be too short, nor too long, because the quality of the program can
only be determined upon inspection of complex content. The quick brown
fox jumps over the lazy dog. Congress shall make no law respecting an
establishment of religion, or prohibiting the free exercise thereof;
or abridging the freedom of speech, or of the press; or the right of
the people peaceably to assemble, and to petition the Government for a
redress of grievances.

答案 5 :(得分:0)

这是一个bash版本:

#! /bin/sh
if ! [[ "$1" =~ ^[0-9]+$ ]] ; then
    echo "Usage: balance <width> [ <string> ]"
    echo " "
    echo "  if string is not passed as parameter it will be read from STDIN\n"
    exit 2
elif [ $# -le 1 ] ; then
    LINE=`cat`
else
    LINE="$2"
fi

LINES=`echo "$LINE" | fold -s -w $1 | wc -l`
MAX=$1
MIN=0

while [ $MAX -gt $(($MIN+1)) ]
do
    TRY=$(( $MAX + $MIN >> 1 ))
    NUM=`echo "$LINE" | fold -s -w $TRY | wc -l`
    if [ $NUM -le $LINES ] ; then
        MAX=$TRY
    else
        MIN=$TRY
    fi
done

echo "$LINE" | fold -s -w $MAX

示例:

$ balance 50 "Now is the time for all good men to come to the aid of the party."
Now is the time for all good men 
to come to the aid of the party.

需要'fold'和'wc',它们通常在安装bash时可用。