如何将本地目录rsync同步到开发服务器上的Google Cloud Storage

时间:2016-02-14 18:23:25

标签: google-app-engine google-cloud-storage

可以使用gsutil命令行工具将本地文件夹中的文件同步到Google Cloud Storage存储桶:

gsutil rsync -r <src> gs://<bucket>

有没有办法在development server上做同样的事情?

2 个答案:

答案 0 :(得分:1)

由于开发服务器是用一个本地目录来模拟云存储的(该目录在运行dev_appserver.py时通过--storage_path=参数指定),您可以直接使用常规的Linux工具rsync,例如(如果您要在不同服务器之间同步文件):

rsync -a ~/dir1 username@remote_host:destination_directory

或者,如果您要在本地系统上同步文件,请使用以下内容:

rsync -r dir1/ dir2 

答案 1 :(得分:1)

我来回答自己的问题:因为找不到任何其他解决方案,所以我自己实现了一个。我不确定这是否是正确的做法,但它实现了我想要的功能。也许其他人也会觉得它有用。

我已经为webapp2创建了上传处理程序,允许我通过http POST请求上传多部分编码的文件。

import cloudstorage as gcs
from webapp2 import uri_for
from google.appengine.ext import blobstore
from google.appengine.ext.webapp import blobstore_handlers


class Upload(blobstore_handlers.BlobstoreUploadHandler):
    """Upload handler that relocates an uploaded blob inside Cloud Storage.

    ``GET`` hands out an upload-session URL; ``POST`` receives the upload
    and copies it to the object named by the request parameters.
    """

    def post(self):
        """Copy the first uploaded file to ``/<bucket>/<dest>``.

        ``bucket`` and ``dest`` are read from the request. After the copy,
        the originally uploaded object is deleted and a plain-text
        confirmation naming the destination is written to the response.
        """
        uploaded = self.get_file_infos()[0]
        bucket_name = self.request.get('bucket')
        dest_name = self.request.get('dest')
        target = '/{}/{}'.format(bucket_name, dest_name)

        # The first three characters of gs_object_name are dropped before
        # the name is handed to the cloudstorage calls (presumably the
        # leading '/gs' prefix -- confirm against the blobstore docs).
        source = uploaded.gs_object_name[3:]

        gcs.copy2(source, target)
        gcs.delete(source)

        self.response.content_type = 'text/plain'
        self.response.write('File created: {}'.format(target))

    def get(self):
        """Write an upload-session URL to the response as plain text.

        The ``bucket`` request parameter is passed through to
        ``_create_upload_url``.
        """
        bucket_name = self.request.get('bucket')
        self.response.content_type = 'text/plain'
        upload_url = _create_upload_url(bucket_name)
        self.response.write(upload_url)


def _create_upload_url(bucket):
    """Create a blobstore upload session and return its URL.

    A truthy ``bucket`` is passed on with a trailing ``/`` appended; a
    falsy value is passed on unchanged. The session's success path is the
    route named ``'upload'``.
    """
    gs_bucket = '{}/'.format(bucket) if bucket else bucket
    success_path = uri_for('upload')
    return blobstore.create_upload_url(
        success_path=success_path,
        gs_bucket_name=gs_bucket
    )

然后我编写了一个可以从控制台上传文件的命令行(CLI)脚本:

#!/usr/bin/env python
from urlparse import urlparse
from xml.dom.minidom import parseString
import argparse
import hashlib
import magic
import os
import requests
import subprocess
import sys
import urllib2


def _sync(file, endpoint):
    """Upload a local file via an upload URL obtained from ``endpoint``.

    First issues a GET to ``endpoint`` and treats the response body as the
    upload URL, then POSTs the file to that URL as a multipart form field
    named ``file``.

    Args:
        file: path of the local file to upload.
        endpoint: URL that returns the upload URL as its response body.

    Returns:
        None on success, otherwise an error message string containing the
        HTTP status code of the failed request.
    """
    # NOTE(review): only `destpath` is sent here; confirm the server-side
    # handler actually reads that parameter name.
    r = requests.get(endpoint, params={'destpath': file})
    if r.status_code != 200:
        return "[{}] Can't retrive upload url".format(r.status_code)
    upload_url = r.text
    mime_type = _get_mime_type(file)

    # Open the file in a context manager so the handle is released even
    # when the POST raises; the original left it open.
    with open(file, 'rb') as payload:
        r = requests.post(
            upload_url,
            files={
                'file': ('file.tmp', payload, mime_type)
            })

    if r.status_code != 200:
        return "[{}] Can't upload file".format(r.status_code)
    return None


def _delete(file, endpoint):
    """Ask ``endpoint`` to delete ``file`` with an HTTP DELETE request.

    The file is identified by the ``path`` query parameter.

    Returns:
        None when the response status is 200, otherwise an error message
        string containing the status code.
    """
    query = {'path': file}
    response = requests.delete(endpoint, params=query)

    if response.status_code == 200:
        return None
    return "[{}] Can't delete file".format(response.status_code)


def _get_mime_type(path):
    """Detect the MIME type of the file at ``path`` using ``magic``."""
    return magic.Magic(mime=True).from_file(path)


def _etag(path):
    """Returns ETag for a given file"""
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def _get_bucket_state(url, bucket, result=None, marker=None):
    """Return the current object listing of a bucket on the local server.

    The listing is fetched from ``<scheme>://<host>/_ah/gcs/<bucket>``,
    where scheme and host are taken from ``url``. Whenever a response
    contains a ``NextMarker`` element, the next page is requested with
    that marker until no marker is returned.

    Args:
        url: any URL on the target server; only scheme and netloc are used.
        bucket: bucket name.
        result: optional dict to accumulate into. A fresh dict is created
            when omitted, so separate calls no longer share state through
            a mutable default.
        marker: optional marker to start listing from.

    Returns:
        dict mapping ``"<bucket>/<key>"`` to a dict with the keys
        ``'etag'``, ``'size'`` and ``'last-modifed'``.
    """
    if result is None:
        result = {}

    o = urlparse(url)
    base_url = "{}://{}/_ah/gcs/{}".format(
            o.scheme, o.netloc, bucket)

    def _text(node, tag):
        # Text content of the first <tag> element below node.
        return node.getElementsByTagName(tag)[0].childNodes[0].data

    # Iterate over pages instead of recursing, so a long listing cannot
    # exhaust the recursion limit.
    while True:
        gcsurl = base_url
        if marker:
            gcsurl += '?marker={}'.format(marker)

        print("Fetching files from bucket: {}".format(gcsurl))

        root = parseString(urllib2.urlopen(gcsurl).read())

        for content in root.getElementsByTagName('Contents'):
            key = _text(content, 'Key')
            size = _text(content, 'Size')
            etag = _text(content, 'ETag')
            lm = _text(content, 'LastModified')
            result['{}/{}'.format(bucket, key)] = {
                    'etag': etag,
                    'size': size,
                    'last-modifed': lm,
                    }

        print("found {} files so far...".format(len(result)))
        next_marker = root.getElementsByTagName('NextMarker')
        if not next_marker:
            return result
        marker = next_marker[0].childNodes[0].data


# Command-line interface: a source directory (whose name doubles as the
# bucket name), an optional upload endpoint URL and a --dry-run switch.
parser = argparse.ArgumentParser(description="""
Synchronize data with local Google Cloud Storage bucket

Usage example:
  % ./sync_local <bucket_dir> http://localhost:8080/upload
""", formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument(
    'bucket',
    help='Source directory, its name will be used as destination bucket name',
    nargs=1
)

# nargs='?' makes this positional optional so that the default can apply;
# with nargs=1 the argument was always required and the default was dead.
parser.add_argument(
    'url',
    help='upload url required for local environemnt',
    nargs='?',
    default='http://localhost:8080/upload'
)

parser.add_argument(
    '--dry-run',
    help="show what will be done but don't send any data",
    action='store_true'
)

args = parser.parse_args()

url = args.url
bucket = args.bucket[0].rstrip('/')
dry_run = args.dry_run

# Start sync
print("Building sync state...")
current_state = _get_bucket_state(url, bucket)
print("Getting local files list...")
# Run find with an argument list (no shell) so directory names containing
# spaces or shell metacharacters are passed through verbatim. stderr is not
# merged into stdout, so diagnostics are never mistaken for file names.
find_output = subprocess.check_output(['find', bucket, '-type', 'f'])
ls = [line for line in find_output.split("\n") if line]

# Split local files into unchanged ones and ones that must be uploaded
# (new, or whose MD5 differs from the stored ETag).
to_update = []
the_same = []

for path in ls:
    remote = current_state.get(path)
    if remote is not None and remote['etag'] == _etag(path):
        the_same.append(path)
    else:
        to_update.append(path)

# Anything the bucket knows about that no longer exists locally.
to_delete = set(current_state) - set(to_update) - set(the_same)

print("Files to sync: {}".format(len(ls)))
print("Current state: {}".format(len(current_state)))
print("Same: {}, To udpate: {}, To delete: {}".format(
        len(the_same), len(to_update), len(to_delete)))

if to_update or to_delete:
    var = raw_input("Do you want to sync data? [yn]: ")
    if var.strip() != 'y':
        sys.exit()
else:
    print("Already up-to-date")

for path in to_update:
    if dry_run:
        print('WILL UPDATE: {}'.format(path))
        continue
    result = _sync(path, url)
    if result:
        print('ERROR: {} {}'.format(result, path))
    else:
        print('UPDATED: {}'.format(path))

for path in to_delete:
    if dry_run:
        print('WILL DELETE: {}'.format(path))
        continue
    result = _delete(path, url)
    if result:
        print('ERROR: {} {}'.format(result, path))
    else:
        print('DELETED: {}'.format(path))

当开发服务器运行时,我只需执行下面的命令,就可以把指定磁盘目录中的文件上传到本地存储桶,并保留源文件名:

./sync_local <dir> http://localhost:8080/upload

其中<dir>与要发送文件的存储桶名称相同。

上传文件后,您可以转到:

列出存储桶
http://localhost:8080/_ah/gcs/<bucket_name>