Question

我正在尝试为上传到 S3 存储桶的 PDF 文档建立索引。我的 Lambda 函数中 PDF 文本提取部分工作正常，也能够与 Elasticsearch 端点建立连接，但在将数据上传到 Elasticsearch 建立索引时会抛出错误。Lambda 函数代码如下，请帮我解决这个问题。预先感谢。

from __future__ import print_function

import json
import urllib
import boto3
import slate
import elasticsearch
import datetime

# Elasticsearch target used when posting extracted text: the domain endpoint
# (host name only, no scheme), the index name and the document type.
es_endpoint = 'search-sdjsf-zrtisx]sdaswasfsjmtsyuih3awvu.us-east-1.es.amazonaws.com'
es_index = 'pdf_text_extracts'
es_type = 'document'

print('Loading function')

# S3 client created at import time, outside the handler.
s3 = boto3.client('s3')

# prepare a dict to hold our document data
# NOTE: this timestamp is taken once, when the module is loaded.
doc_data = {}
doc_data['insert_time'] = datetime.datetime.now().isoformat()


def lambda_handler(event, context):
    """Index page 1 of a PDF uploaded to S3 into Elasticsearch.

    Downloads the object named in the first record of the S3 event to /tmp,
    extracts its text with slate, and posts the first page's text to the
    configured Elasticsearch index.

    Written against the Python 2 API (``urllib.unquote_plus`` and
    ``str.decode``).

    Args:
        event: S3 event notification payload; only ``event['Records'][0]``
            is read.
        context: Lambda context object (unused).

    Returns:
        The string "Done" once the document has been posted.

    Raises:
        Exception: any error raised while fetching the object or extracting
            its text is printed and re-raised. Errors from the Elasticsearch
            call propagate unchanged.
    """
    # Get the bucket and key from the first event record. The key is passed
    # through unquote_plus before use.
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    object_key = urllib.unquote_plus(s3_record['object']['key']).decode('utf8')

    pdf_path = '/tmp/tempfile.pdf'
    try:
        # get the file data from s3
        response = s3.get_object(Bucket=bucket, Key=object_key)
        print("CONTENT TYPE: " + response['ContentType'])

        # A PDF is binary data, so write it in 'wb' mode. The `with` block
        # closes the file even if the write fails.
        with open(pdf_path, 'wb') as temp_pdf_file:
            temp_pdf_file.write(response['Body'].read())

        # pull the text from the temporary PDF file using slate
        print("Extracting data from: " + object_key)
        with open(pdf_path, 'rb') as temp_pdf_file:
            doc = slate.PDF(temp_pdf_file)

        # store document data to dict
        # Refresh the timestamp on every call: the module-level value is set
        # only once at load time and would otherwise be stale whenever the
        # loaded module is reused for later invocations.
        doc_data['insert_time'] = datetime.datetime.now().isoformat()
        doc_data['source_pdf_name'] = object_key
        doc_data['document_text'] = doc[0]  # only page 1 for now
    except Exception as e:
        print(e)
        message = ('Error getting object {} from bucket {}. Make sure they exist '
                   'and your bucket is in the same region as this function.')
        print(message.format(object_key, bucket))
        # Bare raise keeps the original traceback (``raise e`` resets it on
        # Python 2).
        raise

    # put the data in ES
    # Deliberately not wrapped in try/except so the underlying error surfaces.
    # TLS is on, but certificate verification is not explicitly enabled here
    # (original note: hold off on validating certs).
    es = elasticsearch.Elasticsearch(
        [{'host': es_endpoint, 'port': 443, 'use_ssl': True}]
    )
    es_response = es.index(index=es_index, doc_type=es_type, body=doc_data)
    print('Data posted to ES: ' + str(es_response))

    return "Done"

为了找出实际的错误，我已去掉最后一个代码块中的 try/except；在尝试将数据上传到 Elasticsearch 时，抛出了以下错误。

Traceback (most recent call last):
  File "/var/runtime/awslambda/bootstrap.py", line 576, in <module>
    main()
  File "/var/runtime/awslambda/bootstrap.py", line 571, in main
    handle_event_request(request_handler, invokeid, event_body, context_objs, invoked_function_arn)
  File "/var/runtime/awslambda/bootstrap.py", line 264, in handle_event_request
    result = report_fault_helper(invokeid, sys.exc_info(), None)
  File "/var/runtime/awslambda/bootstrap.py", line 315, in report_fault_helper
    msgs = [str(value), etype.__name__]

Answer 1

删除末尾的 return "Done"，这在 Lambda 环境中是不允许的。

尝试将数据上传到 Elasticsearch 时，AWS Lambda 的 bootstrap.py 文件抛出错误

1 个答案: