在下面的代码中，一个较大的 PDF 文件被拆分成若干单页文件，每个单页文件先上传到存储桶，随后向 Pub/Sub 发布一条对应的消息：
def publish_messages(project_id, topic_id, enqueue_file):
    """Publish one text message to a Pub/Sub topic and wait for the result.

    Only the string itself is sent as the message payload; if the string is
    a file path, the file's contents are NOT transmitted.

    Args:
        project_id: Google Cloud project that owns the topic.
        topic_id: Name of the topic inside that project.
        enqueue_file: Text to send as the message body (callers in this
            file pass a file location).

    Raises:
        Any exception surfaced by ``future.result()`` when publishing fails.
    """
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)
    # Pub/Sub payloads must be bytes, so encode the text as UTF-8.
    payload = enqueue_file.encode("utf-8")
    # publish() is asynchronous and hands back a future; result() blocks
    # until the publish has completed and its outcome is printed.
    future = publisher.publish(topic_path, data=payload)
    print(future.result())
    # Leading space keeps the payload and the status text from running
    # together in the log line.
    print(enqueue_file + " has been enqueued to Pub/Sub.")
def upload_local_directory_to_gcs(
    local_path, bucket, gcs_path, project_id="Project1", topic_id="my-topic"
):
    """Upload the files in a local directory to GCS and announce each one.

    Every regular file directly inside ``local_path`` is uploaded to
    ``bucket`` under the ``gcs_path`` prefix. After each upload a Pub/Sub
    message carrying the object's ``gs://`` URI is published, so that a
    subscriber running elsewhere can locate the uploaded object (a local
    filesystem path would be meaningless outside this machine).

    Note: the ``'/**'`` pattern is used without ``recursive=True``, so it
    matches only the direct children of ``local_path``; sub-directories are
    skipped by the ``isfile`` check.

    Args:
        local_path: Existing local directory to upload from.
        bucket: Name of the destination GCS bucket.
        gcs_path: Object-name prefix inside the bucket.
        project_id: Project owning the Pub/Sub topic.
        topic_id: Topic that receives one message per uploaded file.

    Raises:
        AssertionError: If ``local_path`` is not a directory (only when
            assertions are enabled).
    """
    import posixpath

    assert os.path.isdir(local_path)

    # One client and one bucket handle are enough for every upload.
    storage_client = storage.Client()
    buck = storage_client.bucket(bucket)

    for local_file in glob.glob(local_path + '/**'):
        if not os.path.isfile(local_file):
            continue
        # relpath is correct whether or not local_path ends with a slash;
        # GCS object names always use forward slashes.
        relative_name = os.path.relpath(local_file, local_path).replace(os.sep, "/")
        remote_path = posixpath.join(gcs_path, relative_name)
        blob = buck.blob(remote_path)
        blob.upload_from_filename(local_file)
        print("Uploaded " + local_file + " to gs bucket " + bucket)
        # Publish where the object now lives, not where it came from.
        gcs_uri = "gs://{}/{}".format(bucket, remote_path)
        publish_messages(project_id, topic_id, gcs_uri)
我使用以下代码接收消息
def receive_messages(project_id, subscription_id , timeout=None):
    """Listen on a Pub/Sub subscription, printing and acking each message.

    Args:
        project_id: Google Cloud project that owns the subscription.
        subscription_id: Name of the subscription to pull from.
        timeout: Seconds to listen before shutting down; ``None`` waits
            without a time limit.
    """
    from concurrent.futures import TimeoutError
    from google.cloud import pubsub_v1

    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(project_id, subscription_id)

    def callback(message):
        # The message carries only the published payload in ``message.data``
        # (bytes); anything it refers to must be fetched separately.
        print("Received message: {}".format(message))
        message.ack()

    streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
    print("Listening for messages on {}..\n".format(subscription_path))

    # The context manager closes the subscriber client on exit.
    with subscriber:
        try:
            # Messages are delivered to the callback while this call blocks.
            streaming_pull_future.result(timeout=timeout)
        except TimeoutError:
            streaming_pull_future.cancel()  # Trigger the shutdown.
            # Wait for the shutdown to finish before the client is closed.
            streaming_pull_future.result()
if __name__ == "__main__":
    # Script entry point: listen on subscription "my-sub" of "Project1".
    # No timeout is passed, so the default (None) applies.
    receive_messages(project_id="Project1", subscription_id="my-sub")
但是收到消息时，我得到的只是一个字符串（文件路径），而不是文件本身：
Received message: Message {
data: b'/tmp/doc_pages/document-page17.pdf'
ordering_key: ''
attributes: {}
}
我的想法是拿到 PDF 文件，然后用 Vision API 对它执行 OCR。能否通过消息获取 PDF 文件本身？如果有其他实现方式，也请告诉我。
谢谢!