自动使用MS Word作为PDF到HTML转换器

时间:2015-05-02 22:24:58

标签: pdf ms-word

我有一项任务是将数百个PDF文档转换为HTML格式。我尝试了多个独立转换器,但它们在处理列和连字符对齐方面存在问题。

但是,如果我只是在MS Word中打开文档,它会完美地处理它们。

所以,基本上,我需要一种方法来自动在Word中打开PDF,等待它处理并将其保存为HTML(已过滤)。

有人知道我该怎么实现吗?

1 个答案:

答案 0 :(得分:0)

我们可以结合使用VBScript和PowerShell来完成(第一步使用VBScript脚本,第二步使用PowerShell脚本)。

将你的要求分成两部分

第一 - 将PDF转换为WORD文档

'**** The script runs in a loop until it detects a new file in the directory
'**** It checks the source folder every 30 seconds.  To change this interval
'**** change the sleep time at the end of the program to the number of seconds x 1000
'****  wscript.sleep (15000) would check the folder every 15 seconds
'****  
'****
'**** The program uses AnyBizSoft PDtoWord converter.  It is available as a free version off of
'***  facebook here - Facebook  - http://www.facebook.com/AnyBizSoft?v=app_6009294086
'***  You can buy it from their website at http://www.anypdftools.com/pdf-to-word.html
'***  I have no connection with them other than they were the first one I found that worked with
'***  a command line.
'***
'***  The script uses two directories.  C:\Source\ is where pdf files are copied to
'***  C:\Converted is where the converted file is placed.  It is either a doc file if you have Office 2003 or
'***  older or a docx if you have Office 2007 or newer.
'***  After the file is converted the original pdf is deleted. This can be changed by commenting out the
'***  Line that deletes the file near the end of the script.
'***
'***  The script can be placed anywhere, but the pdftoword folder needs to be copied from the program files 
'***  directory to the c:\source folder
'***
Option Explicit

'*** Watches the source directory for PDF files, runs the command-line
'*** PDFtoWord converter on each one, and moves the resulting .doc/.docx
'*** (with an hhmmss time stamp appended to its name) to the destination
'*** directory.  Runs until the process is stopped, or until a destination
'*** file with the same name already exists.

Dim objFSO, objWMIService, colMonitoredEvents, wshShell
Dim strComputer, spath, dpath, strProcessKill
Dim sourcefolder, objFile, sourcefile, sourcefilename
Dim convertstr, objLatestEvent
Dim d, hhmmss, newname, oldname, destname, ext, moved
Dim colProcess, objProcess

Set objFSO = CreateObject("Scripting.FileSystemObject")
Set wshShell = WScript.CreateObject("WScript.Shell")

strComputer = "."
spath = "C:\source\"              '*** Source directory
dpath = "C:\converted\"           '*** Destination or Converted Directory
strProcessKill = "PDFtoWord.exe"  '*** Converter process to terminate after each file

Set objWMIService = GetObject("winmgmts:" _
    & "{impersonationLevel=impersonate}!\\" & _
        strComputer & "\root\cimv2")

'*** Subscribe to "file created" events for the source directory.
'*** NOTE: the directory in this query is a literal and must be kept in
'*** sync with spath if the source directory is changed.
Set colMonitoredEvents = objWMIService.ExecNotificationQuery _
    ("SELECT * FROM __InstanceCreationEvent WITHIN 10 WHERE " _
        & "Targetinstance ISA 'CIM_DirectoryContainsFile' and " _
            & "TargetInstance.GroupComponent= " _
                & "'Win32_Directory.Name=""c:\\\\source""'")

Do

    '*** Wait until a PDF is present in the source directory.  Only .pdf
    '*** files are picked up, and the loop sleeps between checks instead
    '*** of spinning at full CPU.
    sourcefile = ""
    Do
        Set sourcefolder = objFSO.GetFolder(spath)
        For Each objFile In sourcefolder.Files
            If LCase(objFSO.GetExtensionName(objFile.Name)) = "pdf" Then
                sourcefile = objFile.Name
                Exit For
            End If
        Next
        If sourcefile = "" Then WScript.Sleep 1000
    Loop Until sourcefile <> ""

    '*** Call pdftoword to convert the file (window minimized, do not wait)
    convertstr = Chr(34) & spath & "pdftoword\pdftoword.exe" & Chr(34) _
        & " " & Chr(34) & spath & sourcefile & Chr(34)
    wshShell.Run convertstr, 6, False

    '*** Wait for the .doc/.docx to be created before continuing.  Match on
    '*** ".doc" (with the dot, case-insensitive) so that a creation event for
    '*** a file such as "mydocument.pdf" does not end the wait prematurely.
    Do
        Set objLatestEvent = colMonitoredEvents.NextEvent
    Loop Until InStr(1, objLatestEvent.TargetInstance.PartComponent, ".doc", vbTextCompare) > 0

    '*** Make time stamp for file name
    d = Now
    hhmmss = Right("00" & Hour(d), 2) & Right("00" & Minute(d), 2) & Right("00" & Second(d), 2)

    '*** Get just the filename without the extension.  GetBaseName strips
    '*** only the last extension and copes with names that contain no dot
    '*** or several dots.
    sourcefilename = objFSO.GetBaseName(sourcefile)

    '*** Add the timestamp to the converted file
    newname = sourcefilename & "-" & hhmmss

    '*** Exit program if file exists in the destination folder.  Highly unlikely since it is timestamped
    For Each ext In Array(".docx", ".doc")
        If objFSO.FileExists(dpath & newname & ext) Then
            WScript.Echo "Destination file " & dpath & newname & ext & " exists already"
            WScript.Quit
        End If
    Next

    '*** Move converted file(s) to the converted folder.  The destination
    '*** path is built in its own variable so newname is never clobbered.
    moved = False
    For Each ext In Array(".docx", ".doc")
        oldname = spath & sourcefilename & ext
        If objFSO.FileExists(oldname) Then
            destname = dpath & newname & ext
            objFSO.MoveFile oldname, destname
            moved = True
        End If
    Next

    '*** Delete the original PDF once, and only after a successful move.
    '*** Delete or comment this block if you do not want the original deleted.
    If moved And objFSO.FileExists(spath & sourcefile) Then
        objFSO.DeleteFile spath & sourcefile
    End If

    '*** Kill PDFtoword process
    Set colProcess = objWMIService.ExecQuery _
        ("Select * from Win32_Process Where Name = '" & strProcessKill & "'")
    For Each objProcess In colProcess
        objProcess.Terminate()
    Next

    WScript.Sleep 30000  'Wait 30 seconds to look for next file.  1000 = 1 second
Loop

第二 - 将WORD文档转换为HTML

# Converts the Word documents found in a folder to filtered HTML.
#   -docpath   folder that holds the .doc/.docx files to convert
#   -htmlpath  folder that receives the .html output (defaults to -docpath)
param([string]$docpath,[string]$htmlpath = $docpath)

# Select .doc and .docx explicitly. -Filter "*.doc" only matches .docx through
# the legacy "3-character extension" wildcard behaviour, which is not available
# on every runtime. @() guarantees an array even for zero or one match.
$srcfiles = @(Get-ChildItem $docPath |
    Where-Object { -not $_.PSIsContainer -and $_.Extension -match '^\.docx?$' })

# Start Word hidden; it does the actual conversion.
$word = new-object -comobject word.application
$word.Visible = $False

# Resolve the save format. The interop enum type is only available when the
# Office primary interop assembly can be loaded, so fall back to the numeric
# value of wdFormatFilteredHTML (10) otherwise.
try {
    Add-Type -AssemblyName Microsoft.Office.Interop.Word -ErrorAction Stop
    $saveFormat = [Microsoft.Office.Interop.Word.WdSaveFormat]::wdFormatFilteredHTML
}
catch {
    $saveFormat = 10
}

# Opens one Word document and saves it as filtered HTML in $htmlpath.
#   $document  file object to convert; defaults to the caller's $doc so the
#              function can still be invoked without arguments.
# Uses the script-level $word, $htmlpath and $saveFormat variables.
function saveas-filteredhtml
    {
        param($document = $doc)

        # Build "<htmlpath>\<name without extension>.html". Inside a
        # double-quoted string "$doc.fullname" would expand only $doc and
        # append the literal text ".fullname", producing a wrong path.
        # NOTE: a.doc and a.docx in the same folder both map to a.html.
        $basename = [System.IO.Path]::GetFileNameWithoutExtension($document.Name)
        $target = Join-Path $htmlpath ($basename + ".html")

        $opendoc = $word.documents.open($document.FullName);
        try {
            $opendoc.saveas([ref]$target, [ref]$saveFormat);
        }
        finally {
            # Close the document even if SaveAs fails so Word does not keep it open.
            $opendoc.close();
        }
    }

# Convert every selected document, then shut Word down.
try {
    ForEach ($doc in $srcfiles)
        {
            Write-Host "Processing :" $doc.FullName
            # saveas-filteredhtml reads the current $doc from this scope.
            saveas-filteredhtml
        }
}
finally {
    # Always quit Word, even when a conversion throws; otherwise a hidden
    # WINWORD.EXE process is left running.
    $word.quit();
}

将此代码保存到convertdoc-tohtml.ps1,您可以在一组word文档上运行它,无论doc或docx扩展名如何。

以下是如何运行它:

convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output"