AWS Glue为什么会生成多个json文件?

时间:2018-07-11 04:58:48

标签: amazon-web-services

我正在做一个 POC(概念验证):使用 AWS Glue 从 AWS RDS PostgreSQL 表中提取数据,并希望最终只生成一个 JSON 文件。

我正在使用以下脚本,但它总是生成多个文件,每个文件中只有 5 行。如何才能只生成 1 个文件?

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
# Glue ETL job: read one catalog table (crawled from RDS PostgreSQL), apply an
# identity column mapping, and write the result to S3 as a single JSON file.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Standard Glue bootstrap: Spark context -> Glue context -> job handle.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = "temp-crawlerdb-xxxxx", table_name = "taxservice__3fa3bf8633994e1a827498190adbe56a_contingencyrunningtotal", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="temp-crawlerdb-xxxxx",
    table_name="taxservice__3fa3bf8633994e1a827498190adbe56a_contingencyrunningtotal",
    transformation_ctx="datasource0",
)

## @type: ApplyMapping
## @args: [mapping = [("stake", "decimal(18,6)", "stake", "decimal(18,6)"), ("branchid", "long", "branchid", "long"), ("winningstake", "decimal(18,6)", "winningstake", "decimal(18,6)"), ("grossrevenue", "decimal(18,6)", "grossrevenue", "decimal(18,6)"), ("vatrate", "decimal(18,6)", "vatrate", "decimal(18,6)"), ("tmstamp", "timestamp", "tmstamp", "timestamp"), ("usrid", "string", "usrid", "string"), ("contingencyexternalreference", "string", "contingencyexternalreference", "string"), ("winnings", "decimal(18,6)", "winnings", "decimal(18,6)"), ("ggrtaxrate", "decimal(18,6)", "ggrtaxrate", "decimal(18,6)"), ("taxpayable", "decimal(18,6)", "taxpayable", "decimal(18,6)"), ("vatpayable", "decimal(18,6)", "vatpayable", "decimal(18,6)")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]

# Each tuple is (source column, source type, target column, target type);
# every column keeps its name and type.
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("stake", "decimal(18,6)", "stake", "decimal(18,6)"),
        ("branchid", "long", "branchid", "long"),
        ("winningstake", "decimal(18,6)", "winningstake", "decimal(18,6)"),
        ("grossrevenue", "decimal(18,6)", "grossrevenue", "decimal(18,6)"),
        ("vatrate", "decimal(18,6)", "vatrate", "decimal(18,6)"),
        ("tmstamp", "timestamp", "tmstamp", "timestamp"),
        ("usrid", "string", "usrid", "string"),
        ("contingencyexternalreference", "string", "contingencyexternalreference", "string"),
        ("winnings", "decimal(18,6)", "winnings", "decimal(18,6)"),
        ("ggrtaxrate", "decimal(18,6)", "ggrtaxrate", "decimal(18,6)"),
        ("taxpayable", "decimal(18,6)", "taxpayable", "decimal(18,6)"),
        ("vatpayable", "decimal(18,6)", "vatpayable", "decimal(18,6)"),
    ],
    transformation_ctx="applymapping1",
)

# Spark writes one output file per partition, which is why the job produced
# many small files. Collapse the frame to a single partition so the sink emits
# exactly one file. NOTE: this funnels all rows through one task, so it is only
# appropriate while the table stays small.
applymapping1 = applymapping1.repartition(1)

## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://tax-service-xxxxx"}, format = "json", transformation_ctx = "datasink2"]
## @return: datasink2
## @inputs: [frame = applymapping1]
# The format must be "json" to match the annotation above and the intended
# output; the script previously passed "csv" here.
datasink2 = glueContext.write_dynamic_frame.from_options(
    frame=applymapping1,
    connection_type="s3",
    connection_options={"path": "s3://tax-service-xxxxx"},
    format="json",
    transformation_ctx="datasink2",
)

job.commit()

2 个答案:

答案 0 :(得分:2)

在应用映射之前,请先把数据重新分区为单个分区(例如 `datasource0 = datasource0.repartition(1)`),这样写出时就只会生成一个文件:

   using System;
   using System.Collections.Generic;
   using System.Net;
   using System.Net.Sockets;
   using System.Text; 
   namespace MultiServer
   {
       /// <summary>
       /// Minimal asynchronous TCP server. It accepts any number of clients, answers every
       /// received message with the current time, and disconnects a client that sends "exit".
       /// Pressing Enter on the console closes every socket and ends the program.
       /// </summary>
       class Program
       {
           /// <summary>
           /// Per-connection state: the client socket together with its own receive buffer,
           /// so that concurrent receives from different clients never share memory.
           /// </summary>
           private sealed class ClientState
           {
               public readonly Socket Client;
               public readonly byte[] Buffer = new byte[BUFFER_SIZE];

               public ClientState(Socket client)
               {
                   Client = client;
               }
           }

           private static readonly Socket serverSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
           private static readonly List<Socket> clientSockets = new List<Socket>();
           // Guards clientSockets: the accept/receive callbacks run on thread-pool threads
           // while Main may be closing the sockets.
           private static readonly object clientsLock = new object();
           private const int BUFFER_SIZE = 2048;
           private const int PORT = 100;

           static void Main()
           {
               Console.Title = "Server";
               SetupServer();
               Console.ReadLine(); // When we press enter close everything
               CloseAllSockets();
           }

           /// <summary>
           /// Binds the listening socket to <see cref="PORT"/> on all interfaces and starts
           /// the first asynchronous accept.
           /// </summary>
           private static void SetupServer()
           {
               Console.WriteLine("Setting up server...");
               serverSocket.Bind(new IPEndPoint(IPAddress.Any, PORT));
               serverSocket.Listen(100);
               serverSocket.BeginAccept(new AsyncCallback(AcceptCallback), null);
               Console.WriteLine("Server setup complete");
           }

           /// <summary>
           /// Closes all connected clients and then the listening socket.
           /// </summary>
           private static void CloseAllSockets()
           {
               // Work on a snapshot so callbacks that remove sockets cannot invalidate the loop.
               Socket[] snapshot;
               lock (clientsLock)
               {
                   snapshot = clientSockets.ToArray();
                   clientSockets.Clear();
               }

               foreach (Socket socket in snapshot)
               {
                   CloseClient(socket, true);
               }

               serverSocket.Close();
           }

           /// <summary>
           /// Removes <paramref name="socket"/> from the client list and closes it.
           /// </summary>
           /// <param name="socket">The client socket to drop.</param>
           /// <param name="shutdown">
           /// When true, attempt a Shutdown before Close; failures are ignored because the
           /// peer may already be gone.
           /// </param>
           private static void CloseClient(Socket socket, bool shutdown)
           {
               lock (clientsLock)
               {
                   clientSockets.Remove(socket);
               }

               try
               {
                   if (shutdown)
                   {
                       socket.Shutdown(SocketShutdown.Both);
                   }
               }
               catch (SocketException)
               {
                   // Connection already broken; nothing left to shut down.
               }
               catch (ObjectDisposedException)
               {
                   // Socket already closed elsewhere.
               }
               finally
               {
                   socket.Close();
               }
           }

           /// <summary>
           /// Completes a pending accept, starts receiving from the new client and re-arms
           /// the accept for the next connection.
           /// </summary>
           private static void AcceptCallback(IAsyncResult AR)
           {
               Socket socket;

               try
               {
                   socket = serverSocket.EndAccept(AR);
               }
               catch (ObjectDisposedException) // Raised on exit once the listening socket has been closed
               {
                   return;
               }

               lock (clientsLock)
               {
                   clientSockets.Add(socket);
               }

               ClientState state = new ClientState(socket);
               socket.BeginReceive(state.Buffer, 0, BUFFER_SIZE, SocketFlags.None, ReceiveCallback, state);
               Console.WriteLine("Client connected, waiting for request...");
               serverSocket.BeginAccept(AcceptCallback, null);
           }

           /// <summary>
           /// Completes a pending receive, replies with the current time and either closes
           /// the connection (on "exit" or when the client has gone) or waits for more data.
           /// </summary>
           private static void ReceiveCallback(IAsyncResult AR)
           {
               ClientState state = (ClientState)AR.AsyncState;
               Socket current = state.Client;
               int received;

               try
               {
                   received = current.EndReceive(AR);
               }
               catch (SocketException)
               {
                   Console.WriteLine("Client forcefully disconnected");
                   // Don't shutdown because the socket may be disposed and it's disconnected anyway.
                   CloseClient(current, false);
                   return;
               }
               catch (ObjectDisposedException)
               {
                   // The socket was closed (e.g. by CloseAllSockets) while this receive was pending.
                   return;
               }

               if (received == 0)
               {
                   // Zero bytes means the client closed the connection gracefully; without
                   // this check the server would keep replying to and re-arming a dead socket.
                   CloseClient(current, true);
                   Console.WriteLine("Client disconnected");
                   return;
               }

               string text = Encoding.ASCII.GetString(state.Buffer, 0, received);
               Console.WriteLine("Received Text: " + text);

               try
               {
                   // The "get time" check is disabled, so every message is answered with the time.
                   Console.WriteLine("Text is a get time request");
                   byte[] data = Encoding.ASCII.GetBytes(DateTime.Now.ToLongTimeString());
                   current.Send(data);
                   Console.WriteLine("Time sent to client");

                   if (text.ToLower() == "exit") // Client wants to exit gracefully
                   {
                       // Always Shutdown before closing
                       CloseClient(current, true);
                       Console.WriteLine("Client disconnected");
                       return;
                   }

                   current.BeginReceive(state.Buffer, 0, BUFFER_SIZE, SocketFlags.None, ReceiveCallback, state);
               }
               catch (SocketException)
               {
                   Console.WriteLine("Client forcefully disconnected");
                   CloseClient(current, false);
               }
               catch (ObjectDisposedException)
               {
                   // Socket closed concurrently during shutdown; nothing more to do.
               }
           }
       }
   }

答案 1 :(得分:1)

以防其他人也遇到这个问题:上面的答案确实能解决问题,但并没有解释原因。下面这篇文章对此做了详细说明。

https://thedataguy.in/aws-glue-custom-output-file-size-and-fixed-number-of-files/

“为什么 Glue 会生成很多小文件?如果您在 Glue 中处理小块文件,它会先读取这些文件,然后将其转换为 DynamicFrame。Glue 运行在 Spark 之上,因此 DynamicFrame 会被分配到 EMR 集群中的各个分区。为了获得更好的性能,Glue 会在所有节点之间平均分配数据。处理完成后,所有分区都会被写入目标位置,每个分区对应一个文件。这就是我们会得到多个文件的原因。”