PHP从docx获取文本和图像

时间:2017-08-14 13:22:28

标签: php image parsing docx

我正在使用 PHP 解析 docx 文件,并使用以下代码按顺序提取图像和文本:

    # Extracts the main document XML and the embedded images from the DOCX
    # (ZIP) archive named by $filename.
    #
    # Results:
    #   $imageAssets     - map of image file name => metadata + base64 data
    #   $content         - raw XML of word/document.xml, with table-cell
    #                      boundaries turned into spaces and paragraph ends
    #                      turned into "\n"
    #   $striped_content - $content with all tags stripped
    #
    # Bails out with `return false` when the archive cannot be opened.
    #
    # NOTE: the procedural zip_*() functions are deprecated as of PHP 8.0;
    # ZipArchive is the supported replacement.
    $zip = zip_open($filename);
    if (!$zip || is_numeric($zip)) return false;

    # Start from empty accumulators unless the surrounding code already set them
    if (!isset($content)) $content = '';
    if (!isset($imageAssets)) $imageAssets = array();

    while ($zip_entry = zip_read($zip)) {

        if (zip_entry_open($zip, $zip_entry) == FALSE) continue;

        $zipEntryName = zip_entry_name($zip_entry);

        # Only entries that really live under word/media/ are image assets;
        # the prefix must be at position 0 because it is cut off by length below
        if (strpos($zipEntryName, 'word/media/') === 0)
        {
            # Removes the 11-character 'word/media/' prefix
            $imageName = substr($zipEntryName, 11);

            # Prevent EMF file extensions passing, as they are used by word rather than being manually placed
            if (substr($imageName, -3) != 'emf')
            {
                # Place the image assets into an array for future reference
                $imageAssets[$imageName] = array(
                    'h' => 'auto',
                    'w' => 'auto',
                    'title' => $imageName,
                    'id' => null,
                    'data' => base64_encode(zip_entry_read($zip_entry, zip_entry_filesize($zip_entry))));
            }
        }
        elseif ($zipEntryName == "word/document.xml")
        {
            $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
        }

        # Every entry that was opened is closed here, whichever branch ran
        zip_entry_close($zip_entry);
    }
    zip_close($zip);

    # Adjacent table cells are separated by a space, paragraphs by a newline
    $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
    $content = str_replace('</w:r></w:p>', "\r\n", $content);
    # Normalise every CRLF (inserted above or already present) to LF
    $content = str_replace("\r\n", "\n", $content);
    $striped_content = strip_tags($content);

我将图像文件存储在 $imageAssets 数组中。去除标签后的内容($striped_content)包含全部文本,而图像则被转换成了随机数字。如何将这些数字映射到对应的图像?

1 个答案:

答案 0 :(得分:0)

**试试这段代码**

# Walks the DOCX (ZIP) archive named by $filename once, using a single
# ZipArchive handle, and:
#   - echoes an <img> tag pointing at display.php for every jpg/jpeg/png/
#     gif/bmp entry, identified by its index inside the archive;
#   - collects the images stored under word/media/ into $imageAssets;
#   - appends the raw XML of word/document.xml to $content.
# Afterwards $content has table-cell boundaries turned into spaces and
# paragraph ends turned into "\n"; $striped_content is $content without tags.
#
# Bails out with `return false` when the archive cannot be opened.
$zip = new ZipArchive;
if ($zip->open($filename) !== true) return false;

# Start from empty accumulators unless the surrounding code already set them
if (!isset($content)) $content = '';
if (!isset($imageAssets)) $imageAssets = array();

# The loop counter IS the archive index, so the index written into the link
# always refers to the entry whose name was just inspected
for ($index = 0; $index < $zip->numFiles; $index++) {

    $zipEntryName = $zip->getNameIndex($index);
    if ($zipEntryName === false) continue;

    # Image entry: a non-whitespace character followed by a known image extension
    if (preg_match('/\S\.(?:jpe?g|png|gif|bmp)$/i', $zipEntryName))
    {
        # URL-encode the file name for the query string, then HTML-escape the
        # whole URL because it is placed inside a quoted attribute
        $src = 'display.php?filename=' . rawurlencode($filename) . '&index=' . $index;
        echo "<img src='" . htmlspecialchars($src, ENT_QUOTES, 'UTF-8') . "' ><br />";
    }

    # Only entries that really live under word/media/ are image assets;
    # the prefix must be at position 0 because it is cut off by length below
    if (strpos($zipEntryName, 'word/media/') === 0)
    {
        # Removes the 11-character 'word/media/' prefix
        $imageName = substr($zipEntryName, 11);

        # Prevent EMF file extensions passing, as they are used by word rather than being manually placed
        if (substr($imageName, -3) == 'emf') continue;

        $imageData = $zip->getFromIndex($index);
        if ($imageData === false) continue;

        # Place the image assets into an array for future reference
        $imageAssets[$imageName] = array(
            'h' => 'auto',
            'w' => 'auto',
            'title' => $imageName,
            'id' => null,
            'data' => base64_encode($imageData));
    }
    elseif ($zipEntryName == "word/document.xml")
    {
        $documentXml = $zip->getFromIndex($index);
        if ($documentXml !== false) $content .= $documentXml;
    }
}

$zip->close();

# Adjacent table cells are separated by a space, paragraphs by a newline
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
# Normalise every CRLF (inserted above or already present) to LF
$content = str_replace("\r\n", "\n", $content);
$striped_content = strip_tags($content);

并在同一文件夹中添加一个用于显示图片的新文件(display.php):

<?php

    /*
     * Streams a single image entry out of a ZIP/DOCX archive.
     *
     * Query parameters:
     *   filename - path of the archive to open
     *   index    - zero-based index of the entry inside the archive
     *
     * Responds with the entry's bytes and a Content-Type matching its
     * extension, or with HTTP 404 when the parameters are invalid, the
     * archive cannot be opened, or the entry is not a supported image.
     *
     * SECURITY NOTE(review): `filename` comes straight from the request, so
     * this script will open any archive the web server can read. Before
     * deploying, resolve it against a known upload directory (or replace it
     * with a server-side identifier).
     */

    /*Only these entry types are served; anything else in the archive is refused*/
    $mimeTypes = array(
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'gif'  => 'image/gif',
        'bmp'  => 'image/bmp',
    );

    $filename = (isset($_GET['filename']) && is_string($_GET['filename'])) ? $_GET['filename'] : '';

    /*The index must be a non-negative integer; anything else yields false*/
    $index = isset($_GET['index'])
        ? filter_var($_GET['index'], FILTER_VALIDATE_INT, array('options' => array('min_range' => 0)))
        : false;

    /*Create a new ZIP archive object and open the received archive file*/
    $zip = new ZipArchive;
    if ($filename === '' || $index === false || $zip->open($filename) !== true) {
        http_response_code(404);
        exit;
    }

    /*Look up the entry name first so its type can be checked before reading it*/
    $entryName = $zip->getNameIndex($index);
    $extension = ($entryName === false) ? '' : strtolower(pathinfo($entryName, PATHINFO_EXTENSION));

    /*Get the content of the specified index of ZIP archive (supported images only)*/
    $data = isset($mimeTypes[$extension]) ? $zip->getFromIndex($index) : false;

    /*The archive was opened successfully above, so it is safe to close it here*/
    $zip->close();

    if ($data === false) {
        http_response_code(404);
        exit;
    }

    /*Tell the browser which kind of image it is about to receive*/
    header('Content-Type: ' . $mimeTypes[$extension]);
    echo $data;