我想把 DOCX 文件转换为 HTML。经过谷歌搜索,我找到了这个简单的转换库:https://github.com/xylude/Docx-to-HTML/blob/master/docx_reader.php ,但它无法识别文档中的脚注(footnote),
于是我想给这个库加上脚注支持。
我该如何判断一段带样式的文本或普通文本是不是脚注?脚注是否有特殊的样式?
<?php
/**
 * Minimal DOCX reader.
 *
 * Opens a .docx file as a zip archive, reads word/styles.xml (when present)
 * and word/document.xml, and exposes the document either as plain text or as
 * a simple HTML page built from its paragraphs and runs.
 *
 * Usage: call setFile($path), then to_html() or to_plain_text(). Problems
 * encountered along the way are collected and available via get_errors().
 */
class Docx_reader
{
    /** @var string|false Raw XML of word/document.xml; false until a file was loaded successfully. */
    private $fileData = false;

    /** @var string[] Messages collected while loading or converting. */
    private $errors = array();

    /** @var array Style id => array('tags' => string[], 'attrs' => string[]) parsed from word/styles.xml. */
    private $styles = array();

    public function __construct()
    {
    }

    /**
     * Reads word/styles.xml (optional) and word/document.xml from the archive.
     *
     * Styles left over from a previously loaded file are discarded first so
     * they cannot leak into a document that ships without styles.xml.
     *
     * @param string $file Path to the .docx file.
     * @return string|false Raw XML of word/document.xml, or false on failure
     *                      (an explanatory message is added to the error list).
     */
    private function load($file)
    {
        $this->styles = array();

        if (!file_exists($file)) {
            $this->errors[] = 'File does not exist.';
            return false;
        }

        $zip = new ZipArchive();
        if ($zip->open($file) !== true) {
            $this->errors[] = 'Unable to open file as a zip archive.';
            return false;
        }

        // Styles are optional: a missing styles.xml simply leaves the map empty.
        if (($styleIndex = $zip->locateName('word/styles.xml')) !== false) {
            $stylesXml = $zip->getFromIndex($styleIndex);
            if ($stylesXml !== false) {
                $this->styles = $this->parseStyles($stylesXml);
            }
        }

        $data = false;
        if (($index = $zip->locateName('word/document.xml')) !== false) {
            $data = $zip->getFromIndex($index);
        }
        $zip->close();

        if ($data === false) {
            $this->errors[] = 'Unable to read word/document.xml from the archive.';
        }

        return $data;
    }

    /**
     * Builds the style map from the contents of word/styles.xml.
     *
     * @param string $stylesXml Raw XML of word/styles.xml.
     * @return array Style id => array('tags' => string[], 'attrs' => string[]).
     */
    private function parseStyles($stylesXml)
    {
        $styles = array();

        $xml = simplexml_load_string($stylesXml);
        if ($xml === false) {
            $this->errors[] = 'Unable to parse word/styles.xml.';
            return $styles;
        }

        $namespaces = $xml->getNamespaces(true);
        if (!isset($namespaces['w'])) {
            return $styles;
        }

        foreach ($xml->children($namespaces['w'])->style as $s) {
            $attr = $s->attributes('w', true);
            if (isset($attr['styleId'])) {
                list($tags, $attrs) = $this->parseRunProperties($s->rPr, array(), array());
                $styles[(string) $attr['styleId']] = array('tags' => $tags, 'attrs' => $attrs);
            }
        }

        return $styles;
    }

    /**
     * Translates the children of a run-properties (rPr) element into HTML
     * tag names and inline CSS declarations.
     *
     * Handles b (strong), i (em), color and sz; every other property is
     * ignored. color and sz are skipped when they carry no w:val attribute.
     *
     * NOTE(review): OOXML defines w:sz in half-points; the value is emitted
     * unchanged as px to keep the existing output - confirm whether a unit
     * conversion is wanted.
     *
     * @param SimpleXMLElement $rPr   The rPr element (may be an empty placeholder).
     * @param array            $tags  Tag names collected so far.
     * @param array            $attrs CSS declarations collected so far.
     * @return array array($tags, $attrs) with the run's own properties appended.
     */
    private function parseRunProperties($rPr, array $tags, array $attrs)
    {
        foreach (get_object_vars($rPr) as $name => $node) {
            // get_object_vars() may yield a string or an array for unusual
            // markup; only element objects can carry a w:val attribute.
            $val = null;
            if ($node instanceof SimpleXMLElement) {
                $att = $node->attributes('w', true);
                if (isset($att['val'])) {
                    $val = (string) $att['val'];
                }
            }

            switch ($name) {
                case 'b':
                    $tags[] = 'strong';
                    break;
                case 'i':
                    $tags[] = 'em';
                    break;
                case 'color':
                    if ($val !== null) {
                        $attrs[] = 'color:#' . $val;
                    }
                    break;
                case 'sz':
                    if ($val !== null) {
                        $attrs[] = 'font-size:' . $val . 'px';
                    }
                    break;
            }
        }

        return array($tags, $attrs);
    }

    /**
     * Loads the given .docx file; the result is used by to_html() and
     * to_plain_text(). Check get_errors() if those return no content.
     *
     * @param string $path Path to the .docx file.
     */
    public function setFile($path)
    {
        $this->fileData = $this->load($path);
    }

    /**
     * Returns the document XML with all markup stripped.
     *
     * @return string|false The text, or false when no document is loaded.
     */
    public function to_plain_text()
    {
        if ($this->fileData) {
            return strip_tags($this->fileData);
        }

        return false;
    }

    /**
     * Renders the loaded document as a complete HTML page.
     *
     * Each paragraph becomes a block-level span and each run an inline span
     * carrying the formatting from the paragraph style and the run itself.
     *
     * @return string|null The HTML page, or null when no document is loaded
     *                     or the document XML cannot be processed.
     */
    public function to_html()
    {
        if (!$this->fileData) {
            return null;
        }

        $xml = simplexml_load_string($this->fileData);
        if ($xml === false) {
            $this->errors[] = 'Unable to parse word/document.xml.';
            return null;
        }

        $namespaces = $xml->getNamespaces(true);
        if (!isset($namespaces['w'])) {
            $this->errors[] = 'word/document.xml does not declare the "w" namespace.';
            return null;
        }

        $children = $xml->children($namespaces['w']);
        $html = '<!doctype html><html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title></title><style>span.block { display: block; }</style></head><body>';

        if (isset($children->body)) {
            foreach ($children->body->p as $p) {
                $html .= $this->renderParagraph($p);
            }
        }

        return $this->stripInvalidUtf8($html) . '</body></html>';
    }

    /**
     * Renders one paragraph element as a block-level span.
     *
     * NOTE(review): spacing values are divided by 10 and emitted as px, as
     * before; OOXML expresses them in twentieths of a point - confirm the
     * intended conversion.
     *
     * @param SimpleXMLElement $p The paragraph element.
     * @return string HTML for the paragraph.
     */
    private function renderParagraph($p)
    {
        $style = '';
        $startTags = array();
        $startAttrs = array();

        // Formatting inherited from the paragraph's named style, if known.
        if (isset($p->pPr->pStyle)) {
            $objectAttrs = $p->pPr->pStyle->attributes('w', true);
            $objectStyle = (string) $objectAttrs['val'];
            if (isset($this->styles[$objectStyle])) {
                $startTags = $this->styles[$objectStyle]['tags'];
                $startAttrs = $this->styles[$objectStyle]['attrs'];
            }
        }

        if (isset($p->pPr->spacing)) {
            $att = $p->pPr->spacing->attributes('w', true);
            if (isset($att['before'])) {
                $style .= 'padding-top:' . (((float) (string) $att['before']) / 10) . 'px;';
            }
            if (isset($att['after'])) {
                $style .= 'padding-bottom:' . (((float) (string) $att['after']) / 10) . 'px;';
            }
        }

        $html = '<span class="block" style="' . $style . '">';

        // Paragraphs with numbering properties are wrapped in a list item.
        $li = isset($p->pPr->numPr);
        if ($li) {
            $html .= '<li>';
        }

        foreach ($p->r as $part) {
            $html .= $this->renderRun($part, $startTags, $startAttrs);
        }

        if ($li) {
            $html .= '</li>';
        }

        return $html . '</span>';
    }

    /**
     * Renders one run as an inline span with its formatting tags.
     *
     * @param SimpleXMLElement $part       The run element.
     * @param array            $startTags  Tag names inherited from the paragraph style.
     * @param array            $startAttrs CSS declarations inherited from the paragraph style.
     * @return string HTML for the run.
     */
    private function renderRun($part, array $startTags, array $startAttrs)
    {
        $tags = $startTags;
        $attrs = $startAttrs;

        foreach (get_object_vars($part->pPr) as $k => $v) {
            // Comparison, not assignment: only a numPr child adds a list item.
            if ($k === 'numPr') {
                $tags[] = 'li';
            }
        }

        list($tags, $attrs) = $this->parseRunProperties($part->rPr, $tags, $attrs);

        $openTags = '';
        $closeTags = '';
        foreach ($tags as $tag) {
            $openTags .= '<' . $tag . '>';
            // Close in reverse order so the elements nest correctly.
            $closeTags = '</' . $tag . '>' . $closeTags;
        }

        // Document content is untrusted: escape it before embedding in HTML.
        $styleAttr = htmlspecialchars(implode(';', $attrs), ENT_QUOTES, 'UTF-8');
        $text = htmlspecialchars((string) $part->t, ENT_QUOTES, 'UTF-8');

        return '<span style="' . $styleAttr . '">' . $openTags . $text . $closeTags . '</span>';
    }

    /**
     * Removes bytes that do not form a UTF-8 shaped sequence.
     *
     * @param string $html Markup to clean.
     * @return string Cleaned markup; the input is returned unchanged if the
     *                regular expression engine reports an error.
     */
    private function stripInvalidUtf8($html)
    {
        $regex = <<<'END'
/
(
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC0-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
){1,100} # ...one or more times
)
| . # anything else
/x
END;
        $clean = preg_replace($regex, '$1', $html);

        // preg_replace() yields null on failure; keep the original markup then.
        return $clean === null ? $html : $clean;
    }

    /**
     * Returns the messages collected while loading and converting.
     *
     * @return string[]
     */
    public function get_errors()
    {
        return $this->errors;
    }

    /**
     * Unused placeholder kept from the original class; it has no body.
     */
    private function getStyles()
    {
    }
}