Reading a dataframe from a .csv in Google Cloud Storage has a bug when running in Google Cloud Functions

Time: 2019-09-04 16:23:59

Tags: python pandas google-cloud-platform google-cloud-functions google-cloud-storage

I am writing lightweight ETL functions in Python. For ease of testing, I have been building them in Google Datalabs.

Part of the workflow involves fetching a .csv from Cloud Storage and saving it as a Pandas DataFrame. This works perfectly in Datalabs, but in Cloud Functions, for some reason, it starts reading the .csv again from the beginning and appends those rows to the bottom of the result, giving roughly 300 duplicate rows.

I have tried several ways of reading the .csv (pd.read_csv, gcsfs, gsutil, %gcs). They all work fine in Datalabs and read the correct number of rows, but when put into Cloud Functions I get the duplicated rows. Here is an example with gcsfs:

 import gcsfs
 import pandas as pd
 bucket = 'my_bucket'
 gc_project = 'my-project'
 latest_filename = 'my.csv'
 gs_current_object = bucket + '/' + latest_filename
 fs = gcsfs.GCSFileSystem(project=gc_project)
 with fs.open(gs_current_object, 'rb') as f:
     df_new = pd.read_csv(f)
 print(df_new.shape)

I expect the shape to be (15097, 26), which is what I get in Datalabs and how many rows are in the test .csv, but instead I get (15428, 26), which is the original .csv with duplicate rows from the beginning appended to it.

I could use drop duplicates, but:

1. I would rather keep the function lightweight, especially since it is running in Cloud Functions, where I have 2GB to run it.
2. The header gets appended as well, so it starts getting messy, since I need to find it and drop it on top of using .drop_duplicates (see the sketch below).
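
For illustration, here is a minimal sketch of the kind of cleanup this forces on me, assuming the duplicated header shows up as a data row whose LeadId cell is the literal string 'LeadId' (clean_duplicated_read is just a hypothetical helper, not part of my actual code):

import pandas as pd

def clean_duplicated_read(df: pd.DataFrame, id_col: str = 'LeadId') -> pd.DataFrame:
    #Hypothetical cleanup: drop the re-appended header row and the duplicated data rows.
    #Assumes the duplicated header appears as a data row whose id cell equals the column name itself.
    df = df[df[id_col] != id_col]
    #The re-read rows are exact repeats, so keeping the first occurrence of each lead ID is enough.
    return df.drop_duplicates(subset=id_col, keep='first')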

Has anyone come across something similar? Is there anything I can do about this bug when reading the .csv, so that I do not have to clean up the badly read file afterwards?

Edit: here is the full code in my Cloud Functions instance (with the actual names and personal information removed, obviously). I tried to handle the duplicated rows in this version but could not. In fact, I get a really strange output where, after dropping duplicates and the top row (the duplicated header), the shape of df_new shows as (15065, 26), but when I do df_new.tail() later on I get 15098 rows, with the last row being the duplicated header as well, which throws an error when I try to parse the dates.

def csv_update(request):
    #Moved all imports and installs to the top
    print('Importing packages and setting variables')
    from datetime import datetime 
    import ftplib
    import gcsfs
    import glob
    from googleapiclient import discovery
    import gspread
    from gspread_dataframe import get_as_dataframe, set_with_dataframe
    from oauth2client.client import GoogleCredentials
    from oauth2client.service_account import ServiceAccountCredentials
    import os
    import pandas as pd

    #Defining function variables.
    ftp_usr = "myemail@dotcom.com"
    ftp_pass = "my_unsecured_pass"
    bucket = 'my_bucket'
    gc_project = 'my-project'
    json_original = {
      "type": "service_account",
      "project_id": "my-project",
      "private_key_id": "my_id",
      "private_key": "-----BEGIN PRIVATE KEY-----\MY KEY\n-----END PRIVATE KEY-----\n",
      "client_email": "my_service_account@my_project.iam.gserviceaccount.com",
      "client_id": "my_client_id",
      "auth_uri": "https://accounts.google.com/o/oauth2/auth",
      "token_uri": "https://oauth2.googleapis.com/token",
      "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
      "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/client_email"
    }
    g_spreadsheet_id = 'my_gsheet_id'
    g_sheet_name = 'test'
    dtypes = {'LeadId': 'str'}
    root_dir = '/tmp'
    ftp_path = 'my_ftp_dir'
    date_col_name = 'LeadCreationDate'
    lead_col_name = 'LeadId'

    #Import ftplib. Connect to box (encrypted FTPES) with my credentials and download latest file from crown_reporting. 
    #Get downloaded file from local to crown_test bucket
    print('Connecting to FTP and downloading most recent file to local and then to GS bucket')
    os.chdir(root_dir)
    ftp = ftplib.FTP_TLS("ftp.box.com") 
    ftp.login(ftp_usr, ftp_pass) 
    ftp.cwd(ftp_path)
    ftp.retrlines('LIST')
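    #NLST with "-t" asks the FTP server for a time-sorted listing; the last entry is taken as the most recent file.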
    lines = ftp.nlst("-t")
    latest_filename = lines[-1]
    print(lines)
    print(latest_filename)
    ftp.retrbinary("RETR " + latest_filename, open(latest_filename, 'wb').write)
    ftp.quit()
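    #Upload the file just downloaded to /tmp into the GCS bucket via the Cloud Storage JSON API.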
    credentials = GoogleCredentials.get_application_default()
    service = discovery.build('storage', 'v1', credentials=credentials)     
    body = {'name': latest_filename}
    req = service.objects().insert(bucket=bucket, body=body, media_body=latest_filename)
    resp = req.execute()
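    #Remove the local copies from /tmp so they do not take up space on the function instance.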
    files = glob.glob(root_dir +'/*')
    for f in files:
        os.remove(f)

    #Read the newest CSV from Google Storage (uses latest_filename from initial FTP download).
    #Had to add .drop_duplicates(keep='first', inplace=True) because some of the lead IDs have multiple rows.
    #Added a custom function to parse the dates as they have 2 different formats and needs to be parsed as datetime in order to sort after appending to df_old later.
    print('Read current csv from GS bucket as df_new')
    gs_current_object = bucket + '/' + latest_filename
    fs = gcsfs.GCSFileSystem(project=gc_project)
    col_names=['LeadId', 'Lead_Status', 'MoveType', 'Relo_Status', 'LeadCreationDate',
       'EstServiceRevenueUSD', 'EstServiceCostUSD', 'ActServiceRevenueUSD',
       'ActInsuranceRevenueUSD', 'ActServiceCostUSD', 'ActInsCostUSD',
       'ActServiceMarginUSD', 'CustomerType', 'SaleDate',
       'ControllingOfficeName', 'ControllingCountry', 'ControllingRegion',
       'OriginCity', 'OriginState', 'OriginCountry', 'DestinationCity',
       'DestinationState', 'DestinationCountry', 'UnqualifyReason',
       'LeadControllingCountry', 'URL']
    with fs.open(gs_current_object, 'rb') as f:
        df_new = pd.read_csv(f, header=None, names=col_names)
    print(df_new.shape)
    print(df_new.dtypes)
    df_new[lead_col_name] = df_new[lead_col_name].astype(str)
    df_new.drop_duplicates(subset=lead_col_name, keep='first', inplace=True)
    print(df_new.shape)
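    #Drop the first row, which is the original header read in as data because header=None was used above.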
    df_new = df_new[1:]
    print(df_new.shape)                       
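    #Strip the time portion from each date string; only the date part is parsed below.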
    dt_strings = []
    for dt_str in df_new[date_col_name]:
        dt_str = dt_str[:dt_str.find(' ')] 
        dt_strings.append(dt_str)
    print(len(dt_strings))
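    #The dates come in two formats; any string that is not 10 characters long gets a leading zero before parsing with %m/%d/%Y.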
    def try_parsing_date(text):
        if len(text) == 10:
            return datetime.strptime(text, '%m/%d/%Y')
        else:
            text = '0' + text
            return datetime.strptime(text, '%m/%d/%Y')
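    #Debug print: show the index of any rows where the date column still contains the header text (a leftover duplicated header).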
    print(df_new.index[(df_new[date_col_name] == date_col_name) | (df_new[date_col_name] == '0LeadCreationDat') ].values)
    print(df_new.tail())
    dt_strings_conv = [try_parsing_date(date) for date in dt_strings]
    df_new[date_col_name] = dt_strings_conv
    print(df_new[date_col_name])
    print(dt_strings_conv)
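    #Index by LeadId so the update/concat with the existing sheet data below aligns rows on lead IDs.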
    df_new.set_index(lead_col_name, drop=True, inplace=True)

    #Authorize for G sheet with JSON. Changed this to JSON parsed dictionary so it's saved within script.  
    scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
    creds = ServiceAccountCredentials.from_json_keyfile_dict(json_original, scope)
    gs = gspread.authorize(creds)

    #Now we can access sheet. NB I had to enable sheets api in console here for this to work. Import pandas and gspread_dataframe. 
    #Set up worksheet via gspread and get the current (old) data in a df. 
    #We also specify a dtype of leadid column as otherwise Pandas thinks it's an integer (first IDs are just numbers). 
    #Had to add .drop_duplicates(keep='first', inplace=True) because some of the lead IDs have multiple rows.
    print('Read current gsheet as df_old')
    sheet = gs.open_by_key(g_spreadsheet_id).worksheet(g_sheet_name) 
    df_old=get_as_dataframe(sheet, dtype=dtypes, parse_dates=[date_col_name])
    df_old.drop_duplicates(subset=lead_col_name, keep='first', inplace=True)
    df_old.set_index(lead_col_name, drop=True, inplace=True)
    print(df_old.dtypes)

    #Update any changed rows in df_old with df_new values. Add any new rows (using append and dropping duplicates). Added sort=True to concat because of future warning.
    print('Update df_old with df_new values')
    df_old.update(df_new)
    #print(df_old.shape)
    #df_old.tail(15)
    print('Concat df_old with df_new and drop duplicates')
    df_combined = pd.concat([df_old, df_new], sort=True).reset_index()
    df_combined.drop_duplicates(subset=lead_col_name, keep='last', inplace=True)
    df_combined.sort_values(by=[date_col_name], inplace=True)
    #df_combined.reset_index(inplace=True, drop=True)
    #print(df_combined.shape)

    #Connect to gsheet and select worksheet again (in case of timeout, these are commented out as was running fine in tests). Replace all data with newly combined df.
    print('Write updated and concat df_combined to gsheet')
    set_with_dataframe(sheet, df_combined)

1 Answer:

Answer 0 (score: 1)

So every way of reading from Google Storage straight into a dataframe produces this bug for me in Cloud Functions. I would love to get to the bottom of it at some point, but for now I just need my function to work.

In case anyone comes across something similar - I ended up using the code below to store the file in local temporary storage first and then use pd.read_csv, and it works fine (note that google.cloud is installed via google-cloud-storage in requirements.txt):

from google.cloud import storage
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)

    blob.download_to_filename(destination_file_name)

    print('Blob {} downloaded to {}.'.format(
        source_blob_name,
        destination_file_name))
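#bucket, latest_filename and root_dir come from the main function above; the download lands in /tmp, the writable scratch space in Cloud Functions.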
download_blob(bucket, latest_filename, latest_filename)
df_new = pd.read_csv(root_dir + "/" + latest_filename, dtype=dtypes)