我正在使用 PHP 解析 docx 文件,并使用以下代码按顺序提取其中的图像和文本:
# Extracts the embedded images and the body text of a .docx (ZIP) archive.
#
# Reads:  $filename         - path of the .docx file.
# Writes: $imageAssets      - map of image name => array(h, w, title, id, data),
#                             where 'data' is the base64-encoded file content.
#         $content          - word/document.xml with paragraph/cell ends turned
#                             into "\n" / " ".
#         $striped_content  - $content with all XML tags stripped.
# Returns false from the enclosing scope when the archive cannot be opened.
#
# NOTE: the procedural zip_*() API used here is deprecated as of PHP 8.0;
# ZipArchive is its replacement.
$zip = zip_open($filename);
# zip_open() yields false or an integer error code on failure
if (!$zip || is_numeric($zip)) return false;

# Keep whatever was collected before this point; otherwise start empty so the
# `.=` below and later readers never touch an undefined variable.
$content = isset($content) ? $content : '';
$imageAssets = isset($imageAssets) ? $imageAssets : array();

while ($zip_entry = zip_read($zip)) {
    if (zip_entry_open($zip, $zip_entry) == FALSE) continue;
    $zipEntryName = zip_entry_name($zip_entry);

    # Anchor the match at position 0: the fixed-length prefix strip below is
    # only valid when the name really starts with 'word/media/'.
    if (strpos($zipEntryName, 'word/media/') === 0) {
        # Removes 'word/media/' prefix
        $imageName = (string) substr($zipEntryName, 11);
        # Skip the bare directory entry and EMF files (case-insensitively), as
        # EMFs are used by word rather than being manually placed
        if ($imageName !== '' && strcasecmp(substr($imageName, -3), 'emf') !== 0) {
            # Place the image assets into an array for future reference
            $imageAssets[$imageName] = array(
                'h' => 'auto',
                'w' => 'auto',
                'title' => $imageName,
                'id' => null,
                'data' => base64_encode(zip_entry_read($zip_entry, zip_entry_filesize($zip_entry))));
        }
    }

    if ($zipEntryName === 'word/document.xml') {
        $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
    }

    # Every successfully opened entry is closed, whatever branch handled it
    zip_entry_close($zip_entry);
}
zip_close($zip);

# A cell boundary becomes a space, a paragraph end becomes a line break
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
# Normalise all CRLF pairs (inserted above or already present) to LF
$content = str_replace("\r\n", "\n", $content);
$striped_content = strip_tags($content);
我将图像文件存储在 $imageAssets 数组中。去除标签后的内容($striped_content)包含全部文本,但图像所在的位置变成了一串随机数字。如何将这些数字映射到正确的图像?
答案 0 :(得分:0)
**试试这段代码**
# Lists the images of a .docx (ZIP) archive as <img> tags served by display.php
# and extracts the embedded image data and body text.
#
# Reads:  $filename         - path of the .docx file.
# Writes: $imageAssets      - map of image name => array(h, w, title, id, data),
#                             where 'data' is the base64-encoded file content.
#         $content          - word/document.xml with paragraph/cell ends turned
#                             into "\n" / " ".
#         $striped_content  - $content with all XML tags stripped.
# Output: one "<img ...><br />" per jpg/jpeg/png/gif/bmp entry, in archive order.
# Returns false from the enclosing scope when the archive cannot be opened.
#
# The archive is walked through a single ZipArchive handle, so the index put
# into each link is the real entry index that display.php passes to
# getFromIndex(); no separately maintained counter can drift out of sync.
$zip = new ZipArchive;
if ($zip->open($filename) !== true) return false;

# Keep whatever was collected before this point; otherwise start empty so the
# `.=` below and later readers never touch an undefined variable.
$content = isset($content) ? $content : '';
$imageAssets = isset($imageAssets) ? $imageAssets : array();

for ($index = 0; $index < $zip->numFiles; $index++) {
    $zipEntryName = $zip->getNameIndex($index);
    if ($zipEntryName === false) continue;

    if (preg_match('/\S\.(jpe?g|png|gif|bmp)$/i', $zipEntryName)) {
        # URL-encode the parameters, then HTML-escape the whole query string,
        # so a file name containing quotes, spaces or '&' cannot break the tag
        $query = http_build_query(array('filename' => $filename, 'index' => $index));
        echo "<img src='display.php?" . htmlspecialchars($query, ENT_QUOTES, 'UTF-8') . "' ><br />";
    }

    # Anchor the match at position 0: the fixed-length prefix strip below is
    # only valid when the name really starts with 'word/media/'.
    if (strpos($zipEntryName, 'word/media/') === 0) {
        # Removes 'word/media/' prefix
        $imageName = (string) substr($zipEntryName, 11);
        # Skip the bare directory entry and EMF files (case-insensitively), as
        # EMFs are used by word rather than being manually placed
        if ($imageName !== '' && strcasecmp(substr($imageName, -3), 'emf') !== 0) {
            $imageData = $zip->getFromIndex($index);
            if ($imageData !== false) {
                # Place the image assets into an array for future reference
                $imageAssets[$imageName] = array(
                    'h' => 'auto',
                    'w' => 'auto',
                    'title' => $imageName,
                    'id' => null,
                    'data' => base64_encode($imageData));
            }
        }
    }

    if ($zipEntryName === 'word/document.xml') {
        $documentXml = $zip->getFromIndex($index);
        if ($documentXml !== false) {
            $content .= $documentXml;
        }
    }
}
$zip->close();

# A cell boundary becomes a space, a paragraph end becomes a line break
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
# Normalise all CRLF pairs (inserted above or already present) to LF
$content = str_replace("\r\n", "\n", $content);
$striped_content = strip_tags($content);
然后在同一文件夹中新建一个用于显示图片的文件(display.php):
<?php
/*
 * Streams one image stored inside a ZIP (.docx) archive to the browser.
 *
 * Query parameters:
 *   filename - path of the archive on the server
 *   index    - zero-based index of the archive entry to send
 *
 * Responds with 400 on missing/invalid parameters and 404 when the archive or
 * entry cannot be read or the entry is not a supported image type.
 *
 * SECURITY NOTE: 'filename' comes straight from the request. Only entries with
 * a whitelisted image extension are ever sent, but the path itself should
 * additionally be restricted to the directory that holds the uploaded
 * documents before this script is exposed publicly.
 *
 * The closing PHP tag is omitted on purpose: any whitespace after it would be
 * appended to the binary image data.
 */

/* Extension => MIME type of the entries this script is willing to serve */
$mimeTypes = array(
    'jpg'  => 'image/jpeg',
    'jpeg' => 'image/jpeg',
    'png'  => 'image/png',
    'gif'  => 'image/gif',
    'bmp'  => 'image/bmp',
);

$filename = isset($_GET['filename']) ? $_GET['filename'] : null;
/* filter_var() yields false for anything that is not a non-negative integer */
$index = filter_var(
    isset($_GET['index']) ? $_GET['index'] : null,
    FILTER_VALIDATE_INT,
    array('options' => array('min_range' => 0))
);

if (!is_string($filename) || $filename === '' || $index === false) {
    http_response_code(400);
    exit;
}

/*Create a new ZIP archive object*/
$zip = new ZipArchive;
/*Open the received archive file*/
if (!is_file($filename) || $zip->open($filename) !== true) {
    http_response_code(404);
    exit;
}

/*Pick the content type from the entry's own extension*/
$entryName = $zip->getNameIndex($index);
$extension = is_string($entryName) ? strtolower(pathinfo($entryName, PATHINFO_EXTENSION)) : '';

/*Get the content of the specified index of ZIP archive*/
$data = isset($mimeTypes[$extension]) ? $zip->getFromIndex($index) : false;
/*Only reached when open() succeeded, so close() always has a valid archive*/
$zip->close();

if ($data === false) {
    http_response_code(404);
    exit;
}

/*Tell the browser which kind of image follows, only once we have one to send*/
header('Content-Type: ' . $mimeTypes[$extension]);
header('Content-Length: ' . strlen($data));
echo $data;