DOM标记和属性规则和过滤

时间:2014-04-22 04:14:30

标签: php html dom

我使用以下代码将doc和docx转换为剥离的html

<?php
/**
 * Converts a document to HTML through the LiveDocx SOAP service and strips
 * every attribute that is not explicitly whitelisted.
 *
 * All of the work is done by the constructor: it logs in, uploads the file
 * found under "Original/", retrieves the HTML rendition, removes unwanted
 * attributes and writes the result under "Recode/".
 */
class docxhtml
{
    /** @var string Node name of the element currently being inspected. */
    private $tag;
    /** @var string Name of the attribute currently being inspected. */
    private $attribute;

    public $connectname;
    public $connectpass;

    // These three were previously created on the fly inside the constructor.
    // They are declared (and kept public, as dynamic properties are) so the
    // class does not rely on dynamic properties, which PHP 8.2 deprecates.

    /** @var SoapClient SOAP client used for the conversion. */
    public $soap;
    /** @var string Raw template bytes first, then the base64-encoded HTML. */
    public $data;
    /** @var mixed Response object returned by RetrieveDocument. */
    public $result;

    /**
     * Runs the whole conversion for one file.
     *
     * @param string $format_res File name of the source document inside "Original/".
     * @param string $flname     Base name (without extension) of the HTML file
     *                           written to "Recode/".
     *
     * @throws RuntimeException When the source document cannot be read or the
     *                          resulting HTML cannot be written.
     */
    public function __construct($format_res, $flname)
    {
        require_once('config.php');
        // Turn up error reporting
        error_reporting(E_ALL | E_STRICT);

        // Turn off WSDL caching
        ini_set('soap.wsdl_cache_enabled', 0);

        // Define credentials for LD.
        // NOTE: $connectname / $connectpass are read here, inside the
        // constructor, so they still hold their declared defaults unless a
        // subclass overrides them. The defined() guards keep a second
        // instantiation from raising "constant already defined" warnings;
        // the values from the first definition stay in effect.
        if (!defined('USERNAME')) {
            define('USERNAME', $this->connectname);
        }
        if (!defined('PASSWORD')) {
            define('PASSWORD', $this->connectpass);
        }

        // SOAP WSDL endpoint
        if (!defined('ENDPOINT')) {
            define('ENDPOINT', 'https://api.livedocx.com/2.1/mailmerge.asmx?wsdl');
        }

        // Define timezone
        date_default_timezone_set('Europe/Berlin');

        // Instantiate SOAP object and log into LiveDocx
        $this->soap = new SoapClient(ENDPOINT);

        $this->soap->LogIn(
            array('username' => USERNAME, 'password' => PASSWORD)
        );

        // Upload template
        $sourcePath = 'Original/' . $format_res;
        $contents   = file_get_contents($sourcePath);
        if ($contents === false) {
            // Without this check an empty template would be uploaded.
            throw new RuntimeException('Unable to read source document: ' . $sourcePath);
        }
        $this->data = $contents;

        $this->soap->SetLocalTemplate(
            array('template' => base64_encode($this->data), 'format' => 'docx')
        );

        $this->result = $this->soap->RetrieveDocument(
            array('format' => 'html')
        );

        // stripAttributes() base64-decodes this value before parsing it.
        $this->data = $this->result->RetrieveDocumentResult;

        // Attributes that survive the clean-up, keyed by tag name.
        $exceptions = array(
            'a'   => array('href'),
            'img' => array('src')
        );

        $this->stripAttributes($exceptions);

        $targetPath = 'Recode/' . $flname . '.html';
        if (file_put_contents($targetPath, base64_decode($this->data)) === false) {
            throw new RuntimeException('Unable to write HTML output: ' . $targetPath);
        }
    }

    /**
     * Removes every attribute from every element of the HTML held in
     * $this->data, except the tag/attribute pairs listed in $exceptions.
     *
     * $this->data is read as base64-encoded HTML and is overwritten with the
     * base64-encoded, cleaned document.
     *
     * @param array $exceptions Map of tag name => list of attribute names to keep.
     *
     * @return void
     */
    public function stripAttributes(array $exceptions)
    {
        $dom = new DOMDocument();
        $dom->strictErrorChecking = false;
        $dom->formatOutput = true;

        // Collect parser complaints internally instead of emitting one PHP
        // warning per markup quirk, then restore the caller's setting.
        $previousSetting = libxml_use_internal_errors(true);
        $dom->loadHTML(base64_decode($this->data));
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new DOMXPath($dom);
        if (false === ($elements = $xpath->query("//*"))) die('Xpath error!');

        /** @var $element DOMElement */
        foreach ($elements as $element) {
            // Walk the attribute map backwards so that removing an entry does
            // not shift the indexes that are still to be visited.
            for ($i = $element->attributes->length; --$i >= 0;) {
                $this->tag       = $element->nodeName;
                $this->attribute = $element->attributes->item($i)->nodeName;

                if ($this->checkAttrExceptions($exceptions)) continue;

                $element->removeAttribute($this->attribute);
            }
        }

        $this->data = base64_encode($dom->saveHTML());
    }

    /**
     * Tells whether the tag/attribute pair currently stored in $this->tag and
     * $this->attribute is whitelisted.
     *
     * Terminates the script with "Attributes not set!" when an entry of
     * $exceptions visited before a match has no attribute list.
     *
     * @param array $exceptions Map of tag name => list of attribute names to keep.
     *
     * @return bool True when the attribute must be kept, false otherwise.
     */
    public function checkAttrExceptions(array $exceptions)
    {
        foreach ($exceptions as $tag => $attributes) {
            if (empty($attributes) || !is_array($attributes)) {
                die('Attributes not set!');
            }

            if ($tag === $this->tag && in_array($this->attribute, $attributes, true)) {
                return true;
            }
        }

        return false;
    }
}

现在我想要的是为生成的html输出添加如下所示的规则和过滤器

1)所有居中的标题现在应为Div标记,目前为P标记

Sample : http://oi58.tinypic.com/2b6v0k.jpg

2)居中标题下方的作者姓名也应该放在单独的Div标签中,目前它同样位于P标签中("就像上面示例图片中的 Francis Bacon 一词")

3)所有标题应该在P标记中,其属性为class ="h1 or h2 or..h",其中应使用字体大小检测h

4)如果<li>标记中只有一行,那么应为<li> content </li>,而目前是<li><p>content </p></li>

5)假设<li>标记包含多行,则应该有p标记

6)删除所有Span标签

请向我提供有关这样做的建议和指导......

编辑:要转换的Html文件的链接http://www64.zippyshare.com/v/80261796/file.html

1 个答案:

答案 0 :(得分:1)

**6. 删除Span标签**

要删除标签,您可以使用getElementsByTagName,因为您已经知道要删除的标签的“名称”。在操作DOM时,重要的是要理解它返回的DOMNodeList对象是动态的,会随着您对文档的修改而变化。这就是为什么在下面的例子中,我们把某些内容(即节点列表和节点数)存储在不会改变的变量中。如果不这样做,最终会出现奇怪的行为。由于我们正在修改DOM,$nodeList->item(0)将始终返回下一个尚未处理的匹配标签。

// Replaces every <span> in english.html with a text node holding the span's
// text content, then prints the resulting document.
$dom = new DOMDocument();
$dom->loadHTMLFile('english.html');

// getElementsByTagName() returns a live list that shrinks as spans are
// removed, so the initial count is captured once and item(0) is used to
// fetch the next span that is still in the document.
$nodeList  = $dom->getElementsByTagName('span');
$nodeCount = $nodeList->length;

for ($i = 0; $i < $nodeCount; $i++) {
    $span = $nodeList->item(0);

    // Replacing an outer <span> also discards any <span> nested inside it,
    // so the live list can run dry before $nodeCount iterations have passed.
    if ($span === null) {
        break;
    }

    $span->parentNode->replaceChild(new DOMText($span->textContent), $span);
}

echo $dom->saveHTML();

如果您的HTML输出包含具有子HTML标记的<span>元素,则上述内容可能有点过于激进。要删除所有<span>标记,但保留子元素,可以使用以下方法:

// Removes every <span> in english.html while keeping its child nodes in
// place, then prints the resulting document.
$dom = new DOMDocument();
$dom->loadHTMLFile('english.html');

// The list is live: each removed span drops out of it, so item(0) always
// yields the next span still present. The count is captured up front.
$nodeList  = $dom->getElementsByTagName('span');
$nodeCount = $nodeList->length;

for ($i = 0; $i < $nodeCount; $i++) {

    $span    = $nodeList->item(0);
    $cleaned = $dom->createDocumentFragment();

    // childNodes is live as well: appending a child to the fragment moves it
    // out of the span, hence item(0) and a pre-computed count here too.
    $childList  = $span->childNodes;
    $childCount = $childList->length;

    for ($j = 0; $j < $childCount; $j++) {
        $child = $childList->item(0);
        $cleaned->appendChild($child);
    }

    if ($cleaned->hasChildNodes()) {
        $span->parentNode->replaceChild($cleaned, $span);
    } else {
        // A childless <span> produces an empty fragment, which replaceChild()
        // may refuse ("Document Fragment is empty"); the span would then stay
        // at item(0) and block the spans after it. Remove it directly instead.
        $span->parentNode->removeChild($span);
    }
}

echo $dom->saveHTML();