通过gsdll32.dll从PostScript文件中提取纯文本

时间:2013-12-26 01:19:23

标签: delphi pdf

我正在尝试使用 GhostScript 9.10 版的 gsdll32.dll 从 PostScript 文件中提取纯文本,但始终无法让它正常工作,已经试过多种不同的写法。

{ Extracts plain text from a PostScript file by running ps2ascii.ps through
  the Ghostscript DLL API.
  input  - file name handed to Ghostscript as argv[5], unchanged
  output - base name; '.txt' is appended when the last argument is built
  Raises Exception when gsapi_new_instance or gsapi_init_with_args returns a
  negative code.
  NOTE(review): this is the version reported as not working. Two things stand
  out and should be confirmed against the Ghostscript API documentation:
  - several switches, with trailing spaces, are packed into single argv
    entries (argv[0], argv[3], argv[4], argv[6]); the GhostTools procedures
    further down this file pass exactly one switch per element;
  - argv[7] uses shell-style '>' redirection, which is presumably not
    interpreted by the DLL; PDS2TXT further down uses -sstdout= instead. }
procedure PS2TXT(input : AnsiString; output: AnsiString);
var
  code:integer;
  instance: Pointer;
  argv: array of PAnsiChar;
  RunFile: string;
begin
{ ps2ascii.ps is looked up in the folder of the running executable }
RunFile:= ExtractFilePath(ParamStr(0)) + 'ps2ascii.ps';
  code := gsapi_new_instance(instance, nil);
  if code < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error code: '+IntToStr(code));
  try
    SetLength(argv, 8);
    argv[0] := PAnsiChar('-q -dNODISPLAY -dSAFER -dDELAYBIND ');
    argv[1] := PAnsiChar('-dWRITESYSTEMDICT ');
    argv[2] := PAnsiChar('-dSIMPLE ');
    argv[3] := PAnsiChar('-c save ');
    { NOTE(review): RunFile is declared as string while the cast targets
      PAnsiChar - verify this on compilers where string is not AnsiString }
    argv[4] := PAnsiChar('-f ' + RunFile);
    argv[5] := PAnsiChar(input);
    argv[6] := PAnsiChar('-c quit ');
    argv[7] := PAnsiChar('> ' + output + '.txt');
// alternative that was also tried: name the output file with -sOutputFile
//  argv[7] := PAnsiChar('-sOutputFile='+ output + '.txt' );

    code := gsapi_init_with_args(instance, Length(argv), @argv[0]);
    if code < 0 then raise Exception.Create('ERROR: init_args: '+IntToStr(code));
    gsapi_exit(instance);
  finally
    { the instance is released whether or not initialisation succeeded }
    gsapi_delete_instance(instance);
  end;
end;

RunFile 变量只是取得正在运行的 exe 所在的路径,然后在其后附加 ps2ascii.ps 文件名。

另外,我也想通过同一个 ps2ascii.ps 解释器把 PDF 文件转换为纯文本文件。输出必须是纯文本,这样我才能解析它、提取特定的关键字段,再把它们写入数据库。通过 TXTWRITE 设备从 PDF 中提取文本似乎只对部分 PDF 文件有效,并非全部,所以才想用这种方法作为变通。有没有人有能配合该 DLL 正常工作的代码?

1 个答案:

答案 0 :(得分:1)

其他同样苦于找不到正确参数、无法让 GhostScript DLL 在 Delphi 中运行的人,可能会觉得这段代码有用。它写得又快又糙,但确实能用。请根据需要自行整理。祝使用愉快!

GhostTools.pas:封装 GSDLL32.DLL 的单元文件

// GhostTools.pas v.03, 12/2013, Marvi mail: phantomlord@embarqmail.com
//
// Open source, modify to whatever extent
// Class to interact with GhostScript gsdll32.dll for simple PDF manipulation
// i.e. PDF to Jpeg, PDF to PNG, PDF to PS, PDF to Text, PS to PDF, etc.
// Requires GhostScript GSDLL32.DLL to reside in .EXE project folder
// or at the very least somewhere your app can get to it.
// If you use the ps2ascii.ps interpreter, do same as well.
//
// include GhostTools in your uses section, and use as needed

unit GhostTools;

interface

uses SysUtils, gsapi; {gsapi.pas file required as well}

 procedure PDF2PNG(input : AnsiString; output: AnsiString);
 { Renders a PDF to PNG images with the pngalpha device at 300 dpi;
   files are named output + ' Page-NN.png' }

 procedure PDF2JPEG(input : AnsiString; output: AnsiString);
 { Renders a PDF to JPEG images with the jpeg device at 300x300 dpi;
   files are named output + ' Page-NN.jpeg' }

 procedure PDF2PS(input : AnsiString; output: AnsiString);
 { Converts a PDF file to PostScript (ps2write device); writes output + '.ps' }

 procedure PS2PDF(input : AnsiString; output: AnsiString);
 { Converts a PostScript file to PDF (pdfwrite device); writes output + '.pdf' }

 procedure PDF2TXT(input : AnsiString; output: AnsiString);
 { Text extraction, method #1: txtwrite device; writes output + '.txt' }

 procedure PDS2TXT(input : AnsiString; output: AnsiString);
 { Text extraction, method #2: runs the ps2ascii.ps interpreter found in the
   executable's folder and redirects Ghostscript stdout to a '.txt' file
   named after output }

 function SlashSwap(PathVar: string): string;
 { Returns PathVar with every backslash replaced by a forward slash
   (Unix-style path) }

implementation

{ Returns a copy of PathVar in which every backslash is replaced by a forward
  slash (Unix path format). This resolves the path issue seen when
  ps2ascii.ps runs on the Windows platform.
  PathVar - any path string; an empty string is returned unchanged. }
function SlashSwap(PathVar: string): string;
var
  i: Integer;
begin
  Result := PathVar;
  { One pass over the characters. The earlier loop called Pos() from the
    start of the string twice for every backslash, which is quadratic on
    long paths; the result is the same. }
  for i := 1 to Length(Result) do
    if Result[i] = '\' then
      Result[i] := '/';
end;

{ Generates a PostScript file from a PDF file using the Ghostscript ps2write
  device.
  input  - path of the source PDF, handed to Ghostscript unchanged
  output - target path without extension; '.ps' is appended
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PDF2PS(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  Params: array[0..7] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..7] of PAnsiChar;
  i: Integer;
begin
  { Ghostscript reads the list like C's main(): element 0 is skipped. A
    placeholder goes first so that '-q' is no longer silently dropped. }
  Params[0] := 'gs';
  Params[1] := '-q';
  Params[2] := '-dSAFER';
  Params[3] := '-dNOPAUSE';
  Params[4] := '-dBATCH';
  Params[5] := '-sOutputFile=' + output + '.ps';
  Params[6] := '-sDEVICE=ps2write';
  Params[7] := input;
  { Pointers are taken from named strings, not from temporary concatenation
    results, so they stay valid for the whole Ghostscript call. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

{ Generates a PDF file from a PostScript file using the Ghostscript pdfwrite
  device.
  input  - path of the source PostScript file, passed as '-f' + input
  output - target path without extension; '.pdf' is appended
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PS2PDF(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  Params: array[0..8] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..8] of PAnsiChar;
  i: Integer;
begin
  { Element 0 is only a program-name placeholder; Ghostscript skips it. }
  Params[0] := 'ps2pdf';
  Params[1] := '-dNOPAUSE';
  Params[2] := '-dBATCH';
  Params[3] := '-dSAFER';
  Params[4] := '-sDEVICE=pdfwrite';
  Params[5] := '-sOutputFile='+ output+'.pdf';
  Params[6] := '-c';
  { NOTE(review): .setpdfwrite may be deprecated or missing in Ghostscript
    releases newer than the 9.10 DLL this unit was written for - confirm
    before upgrading the DLL. }
  Params[7] := '.setpdfwrite';
  Params[8] := '-f' + input;
  { Pointers are taken from named strings, not from temporary concatenation
    results, so they stay valid for the whole Ghostscript call. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

{ Generates JPEG images from a PDF, one image per page, using the Ghostscript
  jpeg device at 300x300 dpi.
  input  - path of the source PDF, handed to Ghostscript unchanged
  output - base name; each page is written to output + ' Page-NN.jpeg'
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PDF2JPEG(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  Params: array[0..16] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..16] of PAnsiChar;
  i: Integer;
begin
  { Ghostscript reads the list like C's main(): element 0 is skipped. A
    placeholder goes first so that '-q' is no longer silently dropped. }
  Params[0]  := 'gs';
  Params[1]  := '-q';
  Params[2]  := '-dQUIET';
  Params[3]  := '-dPARANOIDSAFER';
  Params[4]  := '-dBATCH';
  Params[5]  := '-dNOPAUSE';
  Params[6]  := '-dNOPROMPT';
  Params[7]  := '-dMaxBitmap=500000000';
  Params[8]  := '-dFirstPage=1';
  Params[9]  := '-dAlignToPixels=0';
  Params[10] := '-dGridFitTT=0';
  Params[11] := '-sDEVICE=jpeg';
  Params[12] := '-dTextAlphaBits=4';
  Params[13] := '-dGraphicsAlphaBits=4';
  Params[14] := '-r300x300';
  { %02d is expanded by Ghostscript to the page number }
  Params[15] := '-sOutputFile='+ output + ' Page-%02d.jpeg';
  Params[16] := input;
  { Pointers are taken from named strings, not from temporary concatenation
    results, so they stay valid for the whole Ghostscript call. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

{ Extracts plain text from a PDF file via the ps2ascii.ps interpreter.
  (Another interpreter, pstotxt.ps, is floating around on the internet.)
  input  - Windows path of the source file; converted to forward slashes
           before it is handed to Ghostscript
  output - base name of the text file; it is created in the folder of the
           input file with '.txt' appended
  ps2ascii.ps must be placed in the folder of the running .EXE.
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PDS2TXT(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  InputFile: AnsiString;
  PSInterpreter: AnsiString;
  OutputFile: AnsiString;
  Params: array[0..9] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..9] of PAnsiChar;
  i: Integer;
begin
  { The folder is taken from the path BEFORE the slashes are swapped. On
    Windows ExtractFilePath only recognises '\' and ':' as delimiters, so
    calling it on the swapped path returned just the drive prefix and the
    text file ended up outside the input file's folder. }
  OutputFile := AnsiString(SlashSwap(ExtractFilePath(string(input)) + string(output) + '.txt'));
  InputFile := AnsiString(SlashSwap(string(input)));
  {*note: place your interpreter in your .EXE project folder }
  PSInterpreter := AnsiString(SlashSwap(ExtractFilePath(ParamStr(0)) + 'ps2ascii.ps'));

  { Ghostscript reads the list like C's main(): element 0 is skipped. A
    placeholder goes first so that '-q' takes effect and the start-up
    messages stay out of the redirected stdout file. }
  Params[0] := 'gs';
  Params[1] := '-q';
  Params[2] := '-sstdout='+ OutputFile; { Your_TXT_File_Out.txt }
  Params[3] := '-dSIMPLE';
  Params[4] := '-sFONTPATH=c:/windows/fonts';
  Params[5] := '-dNODISPLAY';
  Params[6] := '-dDELAYBIND';
  Params[7] := '-dWRITESYSTEMDICT';
  Params[8] := '-f'+ PSInterpreter; { path/to/ps2ascii.ps }
  Params[9] := InputFile; { Your_PDF_File_In.pdf }
  { All argument text lives in AnsiString variables: no PAnsiChar cast of a
    (possibly Unicode) string expression and no pointer into a temporary. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

{ Extracts plain text from a PDF file via the Ghostscript txtwrite device.
  input  - path of the source PDF, handed to Ghostscript unchanged
  output - target path without extension; '.txt' is appended
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PDF2TXT(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  Params: array[0..5] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..5] of PAnsiChar;
  i: Integer;
begin
  { Ghostscript reads the list like C's main(): element 0 is skipped. A
    placeholder goes first so that '-dBATCH' is no longer silently dropped. }
  Params[0] := 'gs';
  Params[1] := '-dBATCH';
  Params[2] := '-dNOPAUSE';
  Params[3] := '-sDEVICE=txtwrite';
  Params[4] := '-sOutputFile='+ output + '.txt';
  Params[5] := input;
  { Pointers are taken from named strings, not from temporary concatenation
    results, so they stay valid for the whole Ghostscript call. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

{ Generates PNG images from a PDF, one image per page, using the Ghostscript
  pngalpha device at 300 dpi.
  input  - path of the source PDF, passed as '-f' + input
  output - base name; each page is written to output + ' Page-NN.png'
  Raises Exception when the Ghostscript instance cannot be created or when
  gsapi_init_with_args returns an error code. }
procedure PDF2PNG(input : AnsiString; output: AnsiString);
const
  { returned when the interpreter executed "quit"; a normal way to finish }
  GS_ERROR_QUIT = -101;
var
  ExitCode: Integer;
  instance: Pointer;
  Params: array[0..10] of AnsiString; { owns the text the Arg pointers refer to }
  Arg: array[0..10] of PAnsiChar;
  i: Integer;
begin
  { Element 0 is only a program-name placeholder; Ghostscript skips it. }
  Params[0]  := 'ps2pdf';
  Params[1]  := '-dNOPAUSE';
  Params[2]  := '-dBATCH';
  Params[3]  := '-dSAFER';
  Params[4]  := '-sDEVICE=pngalpha';
  Params[5]  := '-r300';
  Params[6]  := '-dTextAlphaBits=4';
  { %02d is expanded by Ghostscript to the page number }
  Params[7]  := '-sOutputFile='+output+' Page-%02d.png';
  Params[8]  := '-c';
  { NOTE(review): .setpdfwrite is the helper PS2PDF uses with the pdfwrite
    device; it looks like copy/paste residue here - confirm whether the
    pngalpha device needs it before removing. }
  Params[9]  := '.setpdfwrite';
  Params[10] := '-f'+ input;
  { Pointers are taken from named strings, not from temporary concatenation
    results, so they stay valid for the whole Ghostscript call. }
  for i := Low(Params) to High(Params) do
    Arg[i] := PAnsiChar(Params[i]);

  ExitCode := gsapi_new_instance(instance, nil);
  if ExitCode < 0 then
    raise Exception.Create('Impossible to open an instance of ghostscript. Error ExitCode: '+IntToStr(ExitCode));
  try
    try
      ExitCode := gsapi_init_with_args(instance, Length(Arg), @Arg[0]);
    finally
      { gsapi_exit has to follow every gsapi_init_with_args call, including a
        failed one, before the instance is deleted }
      gsapi_exit(instance);
    end;
    if (ExitCode < 0) and (ExitCode <> GS_ERROR_QUIT) then
      raise Exception.Create('ERROR: init_args: '+IntToStr(ExitCode));
  finally
    gsapi_delete_instance(instance);
  end;
end;

end.

如果你把它改进得更好,请寄给我一份副本 ;-)