从Gmail获取pdf附件作为文字

时间:2014-10-28 16:42:35

标签: pdf text google-apps-script gmail email-attachments

我搜索了网络和Stack Overflow但没有找到解决方案。我尝试做的是以下内容:我通过邮件获得某些附件,我希望将其作为(普通)文本进行进一步处理。我的脚本看起来像这样:

// Asker's original attempt: for every thread labeled 'templabel', read the
// first message and try to mail its attachment content as an HTML body.
// The asker reports that this version does not work.
function MyFunction() {

  // Threads carrying the Gmail label 'templabel'.
  var threads = GmailApp.search ('label:templabel'); 
  // One entry per thread; each entry is indexed again below ([i][0]) to reach a message.
  var messages = GmailApp.getMessagesForThreads(threads); 

   // NOTE(review): `i` is assigned without var/let, so it becomes an implicit global.
   for (i = 0; i < messages.length; ++i)
   {
     // NOTE(review): `j` is also an implicit global and is never read afterwards.
     j = messages[i].length; 
   // Only the first message ([0]) of each thread is examined.
   // NOTE(review): messageBody is assigned but never used.
   var messageBody = messages[i][0].getBody(); 
   var messageSubject = messages [i][0].getSubject();
     var attach = messages [i][0].getAttachments();
     // NOTE(review): the accepted answer indexes the getAttachments() result with [0],
     // i.e. it is a list; calling getContentAsString() on the list itself is
     // presumably why this fails. Even on a single attachment it would not turn
     // a PDF into readable text.
     var attachcontent = attach.getContentAsString();
    // "mail" is a placeholder recipient; the attachment content is sent as the HTML body.
    GmailApp.sendEmail("mail", messageSubject, "", {htmlBody: attachcontent});
    }
}

不幸的是,这不起作用。这里有人知道我该怎么做吗?它甚至可能吗?

非常感谢你。

最好,菲尔

1 个答案:

答案 0 :(得分:15)

编辑:针对DriveApp进行了更新,因为DocsList已弃用。


我建议将其分解为两个问题。第一个是如何从电子邮件中获取pdf附件,第二个是如何将该pdf转换为文本。

正如您所知,getContentAsString()并未将PDF附件神奇地更改为纯文本或HTML。我们需要做一些更复杂的事情。

首先,我们将附件作为Blob获取,这是多个服务用于交换数据的实用程序类。

// Request the first attachment as a PDF blob.
var blob = attachments[0].getAs(MimeType.PDF);

因此,把第二个问题分离出去之后,并继续假设我们只关心每个标记为templabel的线程中第一条消息的第一个附件,myFunction()如下所示:

/**
 * Get messages labeled 'templabel', and send myself the text contents of
 * pdf attachments in new emails.
 *
 * Only the first attachment of the first message of each thread is
 * processed. Threads whose first message has no attachment are skipped
 * instead of aborting the whole run.
 */
function myFunction() {

  var threads = GmailApp.search('label:templabel');
  var threadsMessages = GmailApp.getMessagesForThreads(threads);

  // The recipient is the same for every thread, so look it up once.
  var recipient = Session.getActiveUser().getEmail();

  for (var thread = 0; thread < threadsMessages.length; ++thread) {
    var message = threadsMessages[thread][0];
    var attachments = message.getAttachments();

    // Without this guard attachments[0] is undefined and .getAs() throws.
    if (attachments.length === 0) {
      continue;
    }

    var blob = attachments[0].getAs(MimeType.PDF);
    // keepTextfile: false makes pdfToText() return the text itself.
    var filetext = pdfToText( blob, {keepTextfile: false} );

    GmailApp.sendEmail(recipient, message.getSubject(), filetext);
  }
}

我们依靠辅助函数pdfToText()将pdf blob转换为文本,然后将其作为纯文本电子邮件发送给自己。这个辅助函数有多个选项;通过设置keepTextfile: false,我们选择让它直接返回PDF文件的文本内容,并且不在我们的云端硬盘中留下任何残留文件。

pdfToText()

此实用程序可作为gist获取(链接见下方代码开头的注释),那里还提供了几个示例。

previous answer指出可以使用Drive API的insert方法执行OCR,但没有给出具体代码。借助高级Google服务(Advanced Google Services),可以在Google Apps脚本中轻松访问Drive API。您需要在编辑器的Resources > Advanced Google Services菜单下开启并启用Drive API。

pdfToText()使用云端硬盘服务从PDF文件的内容生成Google文档。不幸的是,生成的文档中会包含每一页的"图片"——对此我们无能为力。然后,它使用常规的文档服务(DocumentApp)将文档正文提取为纯文本。

/**
 * See gist: https://gist.github.com/mogsdad/e6795e438615d252584f
 *
 * Convert pdf file (blob) to a text file on Drive, using built-in OCR.
 * By default, the text file will be placed in the root folder, with the same
 * name as source pdf (but extension 'txt'). Options:
 *   keepPdf (boolean, default false)     Keep a copy of the original PDF file.
 *   keepGdoc (boolean, default false)    Keep a copy of the OCR Google Doc file.
 *   keepTextfile (boolean, default true) Keep a copy of the text file.
 *   path (string, default blank)         Folder path to store file(s) in.
 *   ocrLanguage (ISO 639-1 code)         Default 'en'.
 *   textResult (boolean, default false)  If true and keepTextfile true, return
 *                                        string of text content. If keepTextfile
 *                                        is false, text content is returned without
 *                                        regard to this option. Otherwise, return
 *                                        id of textfile.
 *
 * The options object passed in is only read, never modified.
 *
 * @param {blob}   pdfFile    Blob containing pdf file
 * @param {object} options    (Optional) Object specifying handling details
 *
 * @returns {string}          id of text file (default) or text content
 * @throws {Error}            If the Advanced Drive Service is not enabled, or
 *                            if options.path does not resolve to a folder.
 */
function pdfToText ( pdfFile, options ) {
  // Ensure Advanced Drive Service is enabled. Probe the identifier with typeof
  // instead of issuing a real Files.list() request, so unrelated request
  // failures are not misreported as "service not enabled".
  if (typeof Drive === 'undefined') {
    throw new Error( "To use pdfToText(), first enable 'Drive API' in Resources > Advanced Google Services." );
  }

  // Resolve defaults into locals so the caller's options object stays untouched.
  options = options || {};
  var keepTextfile = Object.prototype.hasOwnProperty.call(options, "keepTextfile") ? options.keepTextfile : true;

  // Prepare resource object for file creation
  var parents = [];
  if (options.path) {
    var folder = getDriveFolderFromPath (options.path);
    if (!folder) {
      throw new Error( "pdfToText(): folder path '" + options.path + "' not found." );
    }
    // Drive API parent references are {id: ...} objects, not DriveApp Folder objects.
    parents.push( {id: folder.getId()} );
  }
  // Unnamed blobs have no name; fall back so the extension swap below cannot crash.
  var pdfName = pdfFile.getName() || 'untitled.pdf';
  var resource = {
    title: pdfName,
    mimeType: pdfFile.getContentType(),
    parents: parents
  };

  // Save PDF to Drive, if requested
  if (options.keepPdf) {
    Drive.Files.insert(resource, pdfFile);
  }

  // Save PDF as GDOC (extension match is case-insensitive: 'X.PDF' -> 'X.gdoc')
  resource.title = pdfName.replace(/pdf$/i, 'gdoc');
  var insertOpts = {
    ocr: true,
    ocrLanguage: options.ocrLanguage || 'en'
  };
  var gdocFile = Drive.Files.insert(resource, pdfFile, insertOpts);

  // Get text from GDOC
  var text;
  try {
    text = DocumentApp.openById(gdocFile.id).getBody().getText();
  }
  finally {
    // We're done using the Gdoc. Unless requested to keepGdoc, delete it,
    // even when reading it failed, so no temporary file is left behind.
    if (!options.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }

  // Save text file, if requested
  var textFile;
  if (keepTextfile) {
    resource.title = pdfName.replace(/pdf$/i, 'txt');
    resource.mimeType = MimeType.PLAIN_TEXT;

    var textBlob = Utilities.newBlob(text, MimeType.PLAIN_TEXT, resource.title);
    textFile = Drive.Files.insert(resource, textBlob);
  }

  // Return result of conversion
  if (!keepTextfile || options.textResult) {
    return text;
  }
  return textFile.id;
}

转换为DriveApp时,借助了Bruce McPherson提供的一个实用函数:
// From: http://ramblings.mcpher.com/Home/excelquirks/gooscript/driveapppathfolder
/**
 * Resolve a slash-separated folder path, starting from the Drive root folder.
 * Empty segments (leading, trailing or doubled slashes) are ignored, and a
 * missing or empty path yields the root folder itself.
 *
 * @param {string} path   Folder path such as "/a/b"
 *
 * @returns {Folder}      The folder reached, or null when a segment has no match
 */
function getDriveFolderFromPath (path) {
  var segments = (path || "/").split("/");
  var folder = DriveApp.getRootFolder();

  for (var idx = 0; idx < segments.length; idx++) {
    var segment = segments[idx];
    if (!segment) {
      continue;
    }
    if (!folder) {
      return null;
    }
    // Only the first folder with a matching name is followed.
    var matches = folder.getFoldersByName(segment);
    if (!matches.hasNext()) {
      return null;
    }
    folder = matches.next();
  }

  return folder;
}