我正在做一个POC(概念验证):使用AWS Glue从AWS RDS PostgreSQL表中提取数据,并希望生成一个JSON文件。
我正在使用以下脚本,但它总是生成多个文件,每个文件只有5行。如何只生成1个文件?
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# AWS Glue ETL job: reads one catalog table, re-applies the column mapping
# and writes the rows to S3 as a single JSON file.
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "temp-crawlerdb-xxxxx", table_name = "taxservice__3fa3bf8633994e1a827498190adbe56a_contingencyrunningtotal", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "temp-crawlerdb-xxxxx", table_name = "taxservice__3fa3bf8633994e1a827498190adbe56a_contingencyrunningtotal", transformation_ctx = "datasource0")
# The S3 sink writes one output file per Spark partition. Collapse the frame
# into a single partition before mapping so that exactly one file is produced.
# NOTE: this funnels all rows through one task; fine for small tables, but
# reconsider for large data sets.
datasource0 = datasource0.repartition(1)
## @type: ApplyMapping
## @args: [mapping = [("stake", "decimal(18,6)", "stake", "decimal(18,6)"), ("branchid", "long", "branchid", "long"), ("winningstake", "decimal(18,6)", "winningstake", "decimal(18,6)"), ("grossrevenue", "decimal(18,6)", "grossrevenue", "decimal(18,6)"), ("vatrate", "decimal(18,6)", "vatrate", "decimal(18,6)"), ("tmstamp", "timestamp", "tmstamp", "timestamp"), ("usrid", "string", "usrid", "string"), ("contingencyexternalreference", "string", "contingencyexternalreference", "string"), ("winnings", "decimal(18,6)", "winnings", "decimal(18,6)"), ("ggrtaxrate", "decimal(18,6)", "ggrtaxrate", "decimal(18,6)"), ("taxpayable", "decimal(18,6)", "taxpayable", "decimal(18,6)"), ("vatpayable", "decimal(18,6)", "vatpayable", "decimal(18,6)")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("stake", "decimal(18,6)", "stake", "decimal(18,6)"), ("branchid", "long", "branchid", "long"), ("winningstake", "decimal(18,6)", "winningstake", "decimal(18,6)"), ("grossrevenue", "decimal(18,6)", "grossrevenue", "decimal(18,6)"), ("vatrate", "decimal(18,6)", "vatrate", "decimal(18,6)"), ("tmstamp", "timestamp", "tmstamp", "timestamp"), ("usrid", "string", "usrid", "string"), ("contingencyexternalreference", "string", "contingencyexternalreference", "string"), ("winnings", "decimal(18,6)", "winnings", "decimal(18,6)"), ("ggrtaxrate", "decimal(18,6)", "ggrtaxrate", "decimal(18,6)"), ("taxpayable", "decimal(18,6)", "taxpayable", "decimal(18,6)"), ("vatpayable", "decimal(18,6)", "vatpayable", "decimal(18,6)")], transformation_ctx = "applymapping1")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://tax-service-xxxxx"}, format = "json", transformation_ctx = "datasink2"]
## @return: datasink2
## @inputs: [frame = applymapping1]
# The sink format must be "json" to match the annotation above and the goal
# of producing a JSON file (it was previously "csv").
datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": "s3://tax-service-xxxxx"}, format = "json", transformation_ctx = "datasink2")
job.commit()
答案 0 :(得分:2)
在应用映射之前,请执行以下操作:
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Sockets;
using System.Text;
namespace MultiServer
{
    /// <summary>
    /// Console TCP server that accepts any number of clients asynchronously and
    /// answers every received message with the current local time. A client that
    /// sends "exit" (any casing) is disconnected after the reply.
    /// </summary>
    class Program
    {
        private const int BUFFER_SIZE = 2048;
        private const int PORT = 100;

        private static readonly Socket serverSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);

        // Accessed from the main thread and from thread-pool I/O callbacks, so
        // every access goes through clientSocketsLock.
        private static readonly List<Socket> clientSockets = new List<Socket>();
        private static readonly object clientSocketsLock = new object();

        /// <summary>
        /// Per-connection state. Each client owns its receive buffer so that
        /// concurrent receives from different clients cannot overwrite each other.
        /// </summary>
        private sealed class ClientState
        {
            public readonly Socket Connection;
            public readonly byte[] Buffer = new byte[BUFFER_SIZE];

            public ClientState(Socket connection)
            {
                Connection = connection;
            }
        }

        /// <summary>
        /// Starts the server and keeps it running until the user presses Enter.
        /// </summary>
        static void Main()
        {
            Console.Title = "Server";
            SetupServer();
            Console.ReadLine(); // When we press enter close everything
            CloseAllSockets();
        }

        /// <summary>
        /// Binds the listening socket to all interfaces on <c>PORT</c> and starts
        /// the asynchronous accept loop.
        /// </summary>
        private static void SetupServer()
        {
            Console.WriteLine("Setting up server...");
            serverSocket.Bind(new IPEndPoint(IPAddress.Any, PORT));
            serverSocket.Listen(100);
            serverSocket.BeginAccept(new AsyncCallback(AcceptCallback), null);
            Console.WriteLine("Server setup complete");
        }

        /// <summary>
        /// Closes every connected client and then the listening socket. The
        /// listening socket is never connected, so it is closed without Shutdown.
        /// </summary>
        private static void CloseAllSockets()
        {
            // Take a snapshot under the lock: callbacks may remove entries
            // concurrently, which would invalidate a live enumeration.
            Socket[] snapshot;
            lock (clientSocketsLock)
            {
                snapshot = clientSockets.ToArray();
                clientSockets.Clear();
            }

            foreach (Socket socket in snapshot)
            {
                CloseSocket(socket, true);
            }

            serverSocket.Close();
        }

        /// <summary>
        /// Closes a socket, optionally shutting it down first.
        /// </summary>
        /// <param name="socket">The socket to close.</param>
        /// <param name="shutdown">
        /// True to call Shutdown before Close (graceful close); false when the
        /// connection is already known to be broken.
        /// </param>
        private static void CloseSocket(Socket socket, bool shutdown)
        {
            try
            {
                if (shutdown)
                {
                    socket.Shutdown(SocketShutdown.Both);
                }
            }
            catch (SocketException)
            {
                // The peer already dropped the connection; nothing left to shut down.
            }
            catch (ObjectDisposedException)
            {
                // Another thread closed the socket first.
            }
            finally
            {
                socket.Close();
            }
        }

        /// <summary>
        /// Removes a client from the tracking list and closes its socket.
        /// </summary>
        /// <param name="socket">The client socket to drop.</param>
        /// <param name="shutdown">Forwarded to <see cref="CloseSocket"/>.</param>
        private static void DropClient(Socket socket, bool shutdown)
        {
            lock (clientSocketsLock)
            {
                clientSockets.Remove(socket);
            }

            CloseSocket(socket, shutdown);
        }

        /// <summary>
        /// Completes a pending accept, starts receiving from the new client and
        /// re-arms the accept loop.
        /// </summary>
        /// <param name="AR">The asynchronous accept operation.</param>
        private static void AcceptCallback(IAsyncResult AR)
        {
            Socket socket = null;
            try
            {
                socket = serverSocket.EndAccept(AR);
            }
            catch (ObjectDisposedException)
            {
                // The listening socket was closed during shutdown; stop accepting.
                return;
            }
            catch (SocketException)
            {
                // This one connection attempt failed; keep accepting others.
                Console.WriteLine("Client connection failed before it was accepted");
            }

            if (socket != null)
            {
                lock (clientSocketsLock)
                {
                    clientSockets.Add(socket);
                }

                ClientState state = new ClientState(socket);
                try
                {
                    socket.BeginReceive(state.Buffer, 0, BUFFER_SIZE, SocketFlags.None, ReceiveCallback, state);
                    Console.WriteLine("Client connected, waiting for request...");
                }
                catch (SocketException)
                {
                    Console.WriteLine("Client forcefully disconnected");
                    DropClient(socket, false);
                }
                catch (ObjectDisposedException)
                {
                    // Closed by CloseAllSockets while we were setting it up.
                    DropClient(socket, false);
                }
            }

            try
            {
                serverSocket.BeginAccept(AcceptCallback, null);
            }
            catch (ObjectDisposedException)
            {
                // The listening socket was closed during shutdown.
            }
        }

        /// <summary>
        /// Completes a pending receive, replies with the current time and either
        /// disconnects the client (on "exit" or a closed connection) or waits for
        /// the next message.
        /// </summary>
        /// <param name="AR">
        /// The asynchronous receive operation; its state is the client's
        /// <see cref="ClientState"/>.
        /// </param>
        private static void ReceiveCallback(IAsyncResult AR)
        {
            ClientState state = (ClientState)AR.AsyncState;
            Socket current = state.Connection;
            int received;
            try
            {
                received = current.EndReceive(AR);
            }
            catch (SocketException)
            {
                Console.WriteLine("Client forcefully disconnected");
                // Don't shutdown because the connection is already broken.
                DropClient(current, false);
                return;
            }
            catch (ObjectDisposedException)
            {
                // The socket was closed by CloseAllSockets during shutdown.
                return;
            }

            if (received == 0)
            {
                // Zero bytes means the peer closed its side of the connection.
                // Without this check the server would keep replying to and
                // re-arming a receive on a dead socket.
                Console.WriteLine("Client disconnected");
                DropClient(current, true);
                return;
            }

            string text = Encoding.ASCII.GetString(state.Buffer, 0, received);
            Console.WriteLine("Received Text: " + text);

            try
            {
                // Every message is answered with the current time.
                Console.WriteLine("Text is a get time request");
                byte[] data = Encoding.ASCII.GetBytes(DateTime.Now.ToLongTimeString());
                current.Send(data);
                Console.WriteLine("Time sent to client");

                // Ordinal comparison: the protocol keyword must not depend on
                // the server's current culture.
                if (string.Equals(text, "exit", StringComparison.OrdinalIgnoreCase)) // Client wants to exit gracefully
                {
                    // Always Shutdown before closing
                    DropClient(current, true);
                    Console.WriteLine("Client disconnected");
                    return;
                }

                current.BeginReceive(state.Buffer, 0, BUFFER_SIZE, SocketFlags.None, ReceiveCallback, state);
            }
            catch (SocketException)
            {
                Console.WriteLine("Client forcefully disconnected");
                DropClient(current, false);
            }
            catch (ObjectDisposedException)
            {
                // The socket was closed by CloseAllSockets during shutdown.
            }
        }
    }
}
答案 1 :(得分:1)
以防其他人也遇到上述问题:上面的答案给出了解决办法,但并没有解释原因。这篇文章对此做了详细说明。
https://thedataguy.in/aws-glue-custom-output-file-size-and-fixed-number-of-files/
“为什么Glue会产生这么多小文件?如果您在Glue中处理小块文件,它会先读取这些文件,再将其转换为DynamicFrame。Glue运行在Spark之上,因此DynamicFrame会被分布到EMR集群中的各个分区。Glue会在所有节点之间平均分配数据,以获得更好的性能。处理完成后,所有分区都会被写入目标位置,每个分区对应一个文件。这就是我们会得到很多文件的原因。”