我有一项任务是将数百个PDF文档转换为HTML格式。我尝试了多个独立转换器,但它们在处理列和连字符对齐方面存在问题。
但是,如果我只是在MS Word中打开文档,它会完美地处理它们。
所以,基本上,我需要一种方法来自动在Word中打开PDF,等待Word处理完成,然后将其另存为HTML("筛选过的网页"格式)。
有人知道我该怎么做吗?
答案 0 :(得分:0)
我们可以用脚本来完成:第一步使用VBScript,第二步使用PowerShell。
将你的要求分成两部分
第一 - 将PDF转换为WORD文档
'**** The script runs in a loop, converting one PDF from the source directory per pass.
'**** While the source folder contains no PDF it re-checks every IDLE_POLL_MS milliseconds.
'**** After each conversion it waits NEXT_FILE_WAIT_MS milliseconds (30 seconds by default)
'**** before looking for the next file. To change either interval, edit the constants
'**** below; the values are in milliseconds (number of seconds x 1000).
'****
'****
'**** The program uses the AnyBizSoft PDFtoWord converter. It is available as a free version off of
'*** facebook here - Facebook - http://www.facebook.com/AnyBizSoft?v=app_6009294086
'*** You can buy it from their website at http://www.anypdftools.com/pdf-to-word.html
'*** I have no connection with them other than they were the first one i found that worked with
'*** a command line.
'***
'*** The script uses two directories. C:\Source\ is where pdf files are copied to
'*** C:\Converted is where the converted file is placed. It is either a doc file if you have Office 2003 or
'*** older or a docx if you have Office 2007 or newer.
'*** After the file is converted the original pdf is deleted. This can be changed by commenting out the
'*** Line that deletes the file near the end of the script.
'***
'*** The script can be placed anywhere, but the pdftoword folder needs to be copied from the program files
'*** directory to the c:\source folder
'***
Option Explicit

Const IDLE_POLL_MS = 1000        '*** Delay between checks while no PDF is waiting. 1000 = 1 second
Const NEXT_FILE_WAIT_MS = 30000  '*** Wait after each conversion before looking for the next file

Dim objFSO, wshShell, objWMIService, colMonitoredEvents, objLatestEvent
Dim strComputer, spath, dpath
Dim sourcefolder, objFile, sourcefile, sourcefilename
Dim convertstr, d, hhmmss, newname, oldname, ext
Dim colProcess, objProcess

Set objFSO = CreateObject("Scripting.FileSystemObject")
'*** Created once and reused for every conversion
Set wshShell = WScript.CreateObject("WScript.Shell")

strComputer = "."
spath = "C:\source\"     '*** Source directory
dpath = "C:\converted\"  '*** Destination or Converted Directory

Set objWMIService = GetObject("winmgmts:" _
    & "{impersonationLevel=impersonate}!\\" & _
    strComputer & "\root\cimv2")

'*** Subscribe to "file created" events for the source directory.
'*** NOTE: the directory name is hard-coded in this query and must be kept in sync with spath.
Set colMonitoredEvents = objWMIService.ExecNotificationQuery _
    ("SELECT * FROM __InstanceCreationEvent WITHIN 10 WHERE " _
    & "Targetinstance ISA 'CIM_DirectoryContainsFile' and " _
    & "TargetInstance.GroupComponent= " _
    & "'Win32_Directory.Name=""c:\\\\source""'")

Do
    '*** Wait until the source folder holds at least one PDF. Sleeping between checks
    '*** keeps the script from spinning at full CPU while the folder is empty.
    sourcefile = ""
    Do
        Set sourcefolder = objFSO.GetFolder(spath)
        For Each objFile In sourcefolder.Files
            '*** Only PDFs are handed to the converter; any other file is ignored
            If LCase(objFSO.GetExtensionName(objFile.Name)) = "pdf" Then
                sourcefile = objFile.Name
            End If
        Next
        If sourcefile = "" Then WScript.Sleep IDLE_POLL_MS
    Loop Until sourcefile <> ""

    '*** Call pdftoword to convert the file (window style 6 = minimized, do not wait for exit)
    convertstr = "c:\source\pdftoword\pdftoword.exe " & Chr(34) & spath & sourcefile & Chr(34)
    wshShell.Run convertstr, 6, False

    '*** Wait for the doc/docx to be created before continuing
    Do
        Set objLatestEvent = colMonitoredEvents.NextEvent
    Loop Until InStr(objLatestEvent.TargetInstance.PartComponent, "doc") > 0

    '*** Make time stamp for file name
    d = Now
    hhmmss = Right("00" & Hour(d), 2) & Right("00" & Minute(d), 2) & Right("00" & Second(d), 2)

    '*** Get just the filename without the extension. GetBaseName strips only the last
    '*** extension, so names containing extra dots (e.g. "my.report.pdf") are handled.
    sourcefilename = objFSO.GetBaseName(sourcefile)

    '*** Add the timestamp to the converted file
    newname = sourcefilename & "-" & hhmmss

    '*** Exit program if file exists in the destination folder. Highly unlikely since it is timestamped
    For Each ext In Array(".docx", ".doc")
        If objFSO.FileExists(dpath & newname & ext) Then
            WScript.Echo "Destination file " & dpath & newname & ext & " exists already"
            WScript.Quit
        End If
    Next

    '*** Work out which output the converter produced (.docx is preferred over .doc)
    If objFSO.FileExists(spath & sourcefilename & ".docx") Then
        ext = ".docx"
    ElseIf objFSO.FileExists(spath & sourcefilename & ".doc") Then
        ext = ".doc"
    Else
        ext = ""
    End If

    '*** Move converted file to the converted folder then delete original
    If ext <> "" Then
        oldname = spath & sourcefilename & ext
        objFSO.MoveFile oldname, dpath & newname & ext
        objFSO.DeleteFile spath & sourcefile 'Delete or comment this line if you do not want the original deleted
    End If

    '*** Kill PDFtoword process
    Set colProcess = objWMIService.ExecQuery _
        ("Select * from Win32_Process Where Name = 'PDFtoWord.exe'")
    For Each objProcess In colProcess
        objProcess.Terminate()
    Next

    WScript.Sleep NEXT_FILE_WAIT_MS 'Wait before looking for the next file. 1000 = 1 second
Loop
第二 - 将Word文档转换为HTML
# Converts every Word document matched by "*.doc" in -docpath to filtered HTML,
# writing the results to -htmlpath (defaults to the same folder as the documents).
#
#   -docpath   Folder containing the Word documents to convert.
#   -htmlpath  Folder that receives the .html files; created if it does not exist.
#
# Each output file is named after the document's base name ("report.docx" -> "report.html"),
# so two documents that differ only by extension will write to the same HTML file.
param([string]$docpath,[string]$htmlpath = $docpath)

if ([string]::IsNullOrEmpty($docpath)) {
    throw "The -docpath parameter is required."
}

# Numeric value of WdSaveFormat.wdFormatFilteredHTML. Using the number avoids depending on
# the Microsoft.Office.Interop.Word assembly being loaded before the enum type is referenced.
$saveFormat = 10

# Word resolves relative paths against its own working directory, so make sure the
# output folder exists and hand Word an absolute path.
if (-not (Test-Path -LiteralPath $htmlpath)) {
    New-Item -ItemType Directory -Path $htmlpath | Out-Null
}
$htmlpath = (Resolve-Path -LiteralPath $htmlpath).ProviderPath

# Skip any directories that happen to match the filter.
$srcfiles = Get-ChildItem $docPath -filter "*.doc" | Where-Object { -not $_.PSIsContainer }

$word = new-object -comobject word.application
$word.Visible = $False

# Opens one document in the shared Word instance and saves it as filtered HTML in $htmlpath.
function saveas-filteredhtml
{
    param($doc)

    $target = Join-Path $htmlpath ($doc.BaseName + ".html")
    $opendoc = $word.documents.open($doc.FullName)
    try {
        $opendoc.saveas([ref]$target, [ref]$saveFormat)
    }
    finally {
        # Always close the document, even if the save failed.
        $opendoc.close()
    }
}

try {
    ForEach ($doc in $srcfiles)
    {
        Write-Host "Processing :" $doc.FullName
        saveas-filteredhtml $doc
    }
}
finally {
    # Make sure the hidden Word process does not stay behind after an error.
    $word.quit()
}
将此代码保存为convertdoc-tohtml.ps1,之后就可以对一组Word文档运行它,无论扩展名是doc还是docx。
以下是运行方式(如果脚本位于当前目录,在PowerShell中需要在脚本名前加上 .\ 前缀):
convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output"