I am trying to import CSV file data into DynamoDB. Please give me a suggestion.
first_name last_name
sri ram
Rahul Dravid
JetPay Underwriter
Anil Kumar Gurram
Answer 0 (score: 10)
Which language do you want to use for the import? I just wrote a function in Node.js that imports a CSV file into a DynamoDB table. It first parses the whole CSV into an array, splits that array into chunks of 25, and then calls batchWriteItem on each chunk.
Note: DynamoDB only allows writing 1 to 25 records at a time in a batch insert, so we have to split the array into chunks.
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
var AWS = require('aws-sdk');

// Region and credentials are taken from your environment / AWS config.
var ddb = new AWS.DynamoDB();

var csv_filename = "YOUR_CSV_FILENAME_WITH_ABSOLUTE_PATH";
var table_name = "TABLE_NAME";

var rs = fs.createReadStream(csv_filename);
var parser = parse({
    columns : true,
    delimiter : ','
}, function(err, data) {
    // Split the parsed rows into chunks of 25, the batchWriteItem limit.
    var split_arrays = [], size = 25;
    while (data.length > 0) {
        split_arrays.push(data.splice(0, size));
    }
    var data_imported = false;
    var chunk_no = 1;
    async.each(split_arrays, function(item_data, callback) {
        // Each row becomes a PutRequest with a DynamoDB attribute-value map.
        var params = { RequestItems: {} };
        params.RequestItems[table_name] = item_data.map(function(row) {
            return { PutRequest: { Item: AWS.DynamoDB.Converter.marshall(row) } };
        });
        ddb.batchWriteItem(params, function(err, res) {
            console.log('done going next');
            if (err == null) {
                console.log('Success chunk #' + chunk_no);
                data_imported = true;
            } else {
                console.log(err);
                console.log('Fail chunk #' + chunk_no);
                data_imported = false;
            }
            chunk_no++;
            callback();
        });
    }, function() {
        // runs after all chunks have been written
        console.log('all data imported....');
    });
});
rs.pipe(parser);
Answer 1 (score: 9)
You could use something like AWS Data Pipeline for this. You upload your CSV file to S3 and then use Data Pipeline to retrieve it and populate a DynamoDB table. They have a step-by-step tutorial.
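As a rough sketch of the first step under that approach (the bucket and key names here are placeholders, not anything the tutorial prescribes), uploading the CSV to S3 with boto3 might look like this:

import boto3

# Hypothetical bucket and key -- the Data Pipeline reads the CSV from S3,
# so the file has to be uploaded there first.
s3 = boto3.client("s3")
s3.upload_file("data.csv", "my-import-bucket", "imports/data.csv")

The pipeline itself can then be created in the console by following the tutorial and pointing it at that bucket and your target table.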
Answer 2 (score: 6)
As a lowly dev without permission to create a Data Pipeline, I had to use this JavaScript instead. Hassan Sidique's code was slightly out of date, but this worked for me:
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
const AWS = require('aws-sdk');
const dynamodbDocClient = new AWS.DynamoDB({ region: "eu-west-1" });
var csv_filename = "./CSV.csv";
const rs = fs.createReadStream(csv_filename);
const parser = parse({
columns : true,
delimiter : ','
}, function(err, data) {
var split_arrays = [], size = 25;
while (data.length > 0) {
//split_arrays.push(data.splice(0, size));
let cur25 = data.splice(0, size)
let item_data = []
for (var i = cur25.length - 1; i >= 0; i--) {
const this_item = {
"PutRequest" : {
"Item": {
// your column names here will vary, but you'll need to define the type
"Title": {
"S": cur25[i].Title
},
"Col2": {
"N": cur25[i].Col2
},
"Col3": {
"N": cur25[i].Col3
}
}
}
};
item_data.push(this_item)
}
split_arrays.push(item_data);
}
data_imported = false;
chunk_no = 1;
async.each(split_arrays, (item_data, callback) => {
const params = {
RequestItems: {
"tagPerformance" : item_data
}
}
dynamodbDocClient.batchWriteItem(params, function(err, res, cap) {
if (err === null) {
console.log('Success chunk #' + chunk_no);
data_imported = true;
} else {
console.log(err);
console.log('Fail chunk #' + chunk_no);
data_imported = false;
}
chunk_no++;
callback();
});
}, () => {
// run after loops
console.log('all data imported....');
});
});
rs.pipe(parser);

Answer 3 (score: 5)
I wrote a tool to do this using parallel execution that requires no dependencies or developer tooling to be installed on the machine (it is written in Go).
It is available for MacOS, Linux, Windows and Docker: https://github.com/a-h/ddbimport
Here are my test results, showing that it can import faster by running in parallel with AWS Step Functions.
I will be describing the tool in more detail on 15 May 2020 at 11:55 BST at https://www.twitch.tv/awscomsum
Answer 4 (score: 3)
Here is my solution. I relied on the fact that there is some kind of header indicating which column does what. Simple and straightforward. No pipeline nonsense for a quick upload..
import os, json, csv, yaml, time
from tqdm import tqdm

# For Database
import boto3

# Variable store
environment = {}

# Environment variables
with open("../env.yml", 'r') as stream:
    try:
        environment = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# Get the service resource.
dynamodb = boto3.resource('dynamodb',
                          aws_access_key_id=environment['AWS_ACCESS_KEY'],
                          aws_secret_access_key=environment['AWS_SECRET_KEY'],
                          region_name=environment['AWS_REGION_NAME'])

# Instantiate a table resource object without actually
# creating a DynamoDB table. Note that the attributes of this table
# are lazy-loaded: a request is not made nor are the attribute
# values populated until the attributes
# on the table resource are accessed or its load() method is called.
table = dynamodb.Table('data')

# Header
header = []

# Open CSV
with open('export.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Parse each line
    with table.batch_writer() as batch:
        for index, row in enumerate(tqdm(reader)):
            if index == 0:
                # Save the header to be used as the keys
                header = row
            else:
                if not row:
                    continue
                # Build a dict from the row and push it to DynamoDB
                data = {}
                # Iterate over each column
                for col_index, entry in enumerate(header):
                    data[entry.lower()] = row[col_index]
                batch.put_item(Item=data)
                # Repeat
Answer 5 (score: 3)
I recommend using a local copy of DynamoDB in case you want a sanity check before you start incurring charges. I made some minor modifications before posting this, so be sure to test it in whatever way makes sense to you. There is a fake batch-upload job commented out that you can use in place of any remote or local DynamoDB service, to verify on stdout that this does what you need.
If you go the manual installation route, you can start dynamodb-local with something like:
java -Djava.library.path=<PATH_TO_DYNAMODB_LOCAL>/DynamoDBLocal_lib/\
-jar <PATH_TO_DYNAMODB_LOCAL>/DynamoDBLocal.jar\
-inMemory\
-sharedDb
The npm route is probably simpler.
In addition to that, see dynamodb-admin.
I installed dynamodb-admin with npm i -g dynamodb-admin. It can then be run with:
dynamodb-admin
dynamodb-local defaults to localhost:8000.
dynamodb-admin is a web page that defaults to localhost:8001. Once you launch these two services, open localhost:8001 in your browser to view and manipulate the database.
The script below will not create the database for you. Use dynamodb-admin for that.
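If you would rather create the table from code than through dynamodb-admin, a minimal boto3 sketch against the local endpoint could look like the following (the table name Provider and the id key match the script below, but the key schema and throughput values are assumptions you should adapt to your own data):

import boto3

# Point boto3 at dynamodb-local. The local endpoint accepts any credentials.
dynamodb = boto3.client(
    "dynamodb",
    endpoint_url="http://localhost:8000",
    region_name="us-west-1",
    aws_access_key_id="local",
    aws_secret_access_key="local",
)

# Assumed key schema: "id" as a string partition key.
dynamodb.create_table(
    TableName="Provider",
    AttributeDefinitions=[{"AttributeName": "id", "AttributeType": "S"}],
    KeySchema=[{"AttributeName": "id", "KeyType": "HASH"}],
    ProvisionedThroughput={"ReadCapacityUnits": 5, "WriteCapacityUnits": 5},
)
print(dynamodb.list_tables()["TableNames"])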
To experiment with concurrency, adjust the maxSockets configuration option, and modify the code that creates the batch arrays so that it only passes a single batch forward at a time.
/**
* Uploads CSV data to DynamoDB.
*
* 1. Streams a CSV file line-by-line.
* 2. Parses each line to a JSON object.
* 3. Collects batches of JSON objects.
* 4. Converts batches into the PutRequest format needed by AWS.DynamoDB.batchWriteItem
* and runs 1 or more batches at a time.
*/
const AWS = require("aws-sdk")
const chalk = require('chalk')
const fs = require('fs')
const split = require('split2')
const uuid = require('uuid')
const through2 = require('through2')
const { Writable } = require('stream');
const { Transform } = require('stream');
const CSV_FILE_PATH = __dirname + "/../assets/whatever.csv"
// A whitelist of the CSV columns to ingest.
const CSV_KEYS = [
"id",
"name",
"city"
]
// Inadequate WCU will cause "insufficient throughput" exceptions, which in this script are not currently
// handled with retry attempts. Retries are not necessary as long as you consistently
// stay under the WCU, which isn't that hard to predict.
// The number of records to pass to AWS.DynamoDB.DocumentClient.batchWrite
// See https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html
const MAX_RECORDS_PER_BATCH = 25
// The number of batches to upload concurrently.
// https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/node-configuring-maxsockets.html
const MAX_CONCURRENT_BATCHES = 1
// MAKE SURE TO LAUNCH `dynamodb-local` EXTERNALLY FIRST IF USING LOCALHOST!
AWS.config.update({
region: "us-west-1"
,endpoint: "http://localhost:8000" // Comment out to hit live DynamoDB service.
});
const db = new AWS.DynamoDB()
// Create a file line reader.
var fileReaderStream = fs.createReadStream(CSV_FILE_PATH)
var lineReaderStream = fileReaderStream.pipe(split())
var linesRead = 0
// Attach a stream that transforms text lines into JSON objects.
var skipHeader = true
var csvParserStream = lineReaderStream.pipe(
through2(
{
objectMode: true,
highWaterMark: 1
},
function handleWrite(chunk, encoding, callback) {
// ignore CSV header
if (skipHeader) {
skipHeader = false
callback()
return
}
linesRead++
// transform line into stringified JSON
const values = chunk.toString().split(',')
const ret = {}
CSV_KEYS.forEach((keyName, index) => {
ret[keyName] = values[index]
})
ret.line = linesRead
console.log(chalk.cyan.bold("csvParserStream:",
"line:", linesRead + ".",
chunk.length, "bytes.",
ret.id
))
callback(null, ret)
}
)
)
// Attach a stream that collects incoming json lines to create batches.
// Outputs an array (<= MAX_CONCURRENT_BATCHES) of arrays (<= MAX_RECORDS_PER_BATCH).
var batchingStream = (function batchObjectsIntoGroups(source) {
var batchBuffer = []
var idx = 0
var batchingStream = source.pipe(
through2.obj(
{
objectMode: true,
writableObjectMode: true,
highWaterMark: 1
},
function handleWrite(item, encoding, callback) {
var batchIdx = Math.floor(idx / MAX_RECORDS_PER_BATCH)
if (idx % MAX_RECORDS_PER_BATCH == 0 && batchIdx < MAX_CONCURRENT_BATCHES) {
batchBuffer.push([])
}
batchBuffer[batchIdx].push(item)
if (MAX_CONCURRENT_BATCHES == batchBuffer.length &&
MAX_RECORDS_PER_BATCH == batchBuffer[MAX_CONCURRENT_BATCHES-1].length)
{
this.push(batchBuffer)
batchBuffer = []
idx = 0
} else {
idx++
}
callback()
},
function handleFlush(callback) {
if (batchBuffer.length) {
this.push(batchBuffer)
}
callback()
}
)
)
return (batchingStream);
})(csvParserStream)
// Attach a stream that transforms batch buffers to collections of DynamoDB batchWrite jobs.
var databaseStream = new Writable({
objectMode: true,
highWaterMark: 1,
write(batchBuffer, encoding, callback) {
console.log(chalk.yellow(`Batch being processed.`))
// Create `batchBuffer.length` batchWrite jobs.
var jobs = batchBuffer.map(batch =>
buildBatchWriteJob(batch)
)
// Run multiple batch-write jobs concurrently.
Promise
.all(jobs)
.then(results => {
console.log(chalk.bold.red(`${batchBuffer.length} batches completed.`))
})
.catch(error => {
console.log( chalk.red( "ERROR" ), error )
callback(error)
})
.then( () => {
console.log( chalk.bold.red("Resuming file input.") )
setTimeout(callback, 900) // slow down the uploads. calculate this based on WCU, item size, batch size, and concurrency level.
})
// return false
}
})
batchingStream.pipe(databaseStream)
// Builds a batch-write job that runs as an async promise.
function buildBatchWriteJob(batch) {
let params = buildRequestParams(batch)
// This was being used temporarily prior to hooking up the script to any dynamo service.
// let fakeJob = new Promise( (resolve, reject) => {
// console.log(chalk.green.bold( "Would upload batch:",
// pluckValues(batch, "line")
// ))
// let t0 = new Date().getTime()
// // fake timing
// setTimeout(function() {
// console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
// resolve()
// }, 300)
// })
// return fakeJob
let promise = new Promise(
function(resolve, reject) {
let t0 = new Date().getTime()
let printItems = function(msg, items) {
console.log(chalk.green.bold(msg, pluckValues(batch, "id")))
}
let processItemsCallback = function (err, data) {
if (err) {
console.error(`Failed at batch: ${pluckValues(batch, "line")}, ${pluckValues(batch, "id")}`)
console.error("Error:", err)
reject()
} else {
var params = {}
params.RequestItems = data.UnprocessedItems
var numUnprocessed = Object.keys(params.RequestItems).length
if (numUnprocessed != 0) {
console.log(`Encountered ${numUnprocessed}`)
printItems("Retrying unprocessed items:", params)
db.batchWriteItem(params, processItemsCallback)
} else {
console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
resolve()
}
}
}
db.batchWriteItem(params, processItemsCallback)
}
)
return (promise)
}
// Build request payload for the batchWrite
function buildRequestParams(batch) {
var params = {
RequestItems: {}
}
params.RequestItems.Provider = batch.map(obj => {
let item = {}
CSV_KEYS.forEach((keyName, index) => {
if (obj[keyName] && obj[keyName].length > 0) {
item[keyName] = { "S": obj[keyName] }
}
})
return {
PutRequest: {
Item: item
}
}
})
return params
}
function pluckValues(batch, fieldName) {
var values = batch.map(item => {
return (item[fieldName])
})
return (values)
}
Answer 6 (score: 1)
Here is a simpler solution, and with this solution you do not have to remove empty string attributes.
require('./env'); //contains aws secret/access key
const parse = require('csvtojson');
const AWS = require('aws-sdk');
// --- start user config ---
const CSV_FILENAME = __dirname + "/002_subscribers_copy_from_db.csv";
const DYNAMODB_TABLENAME = '002-Subscribers';
// --- end user config ---
//You could add your credentials here or you could
//store it in process.env like I have done aws-sdk
//would detect the keys in the environment
AWS.config.update({
region: process.env.AWS_REGION
});
const db = new AWS.DynamoDB.DocumentClient({
convertEmptyValues: true
});
(async ()=>{
const json = await parse().fromFile(CSV_FILENAME);
//this is efficient enough if you're processing small
//amounts of data. If your data set is large then I
//suggest using dynamodb method .batchWrite() and send
//in data in chunks of 25 (the limit) and find yourself
//a more efficient loop if there is one
for(var i=0; i<json.length; i++){
console.log(`processing item number ${i+1}`);
let query = {
TableName: DYNAMODB_TABLENAME,
Item: json[i]
};
await db.put(query).promise();
/**
* Note: If "json" contains other nested objects, you would have to
* loop through the json and parse all child objects.
* likewise, you would have to convert all children into their
* native primitive types because everything would be represented
* as a string.
*/
}
console.log('\nDone.');
})();
Answer 7 (score: 1)
One way of importing/exporting stuff:
"""
Batch-writes data from a file to a dynamo-db database.
"""
import json
import boto3
# Get items from DynamoDB table like this:
# aws dynamodb scan --table-name <table-name>
# Create dynamodb client.
client = boto3.client(
'dynamodb',
aws_access_key_id='',
aws_secret_access_key=''
)
with open('', 'r') as file:
data = json.loads(file.read())['Items']
# Execute write-data request for each item.
for item in data:
client.put_item(
TableName='',
Item=item
)
Answer 8 (score: 1)
I used https://github.com/GorillaStack/dynamodb-csv-export-import. It is super simple and worked like a charm. I just followed the instructions in the README:
# Install globally
npm i -g @gorillastack/dynamodb-csv-export-import
# Set AWS region
export AWS_DEFAULT_REGION=us-east-1
# Use it for your CSV and dynamo table
dynamodb-csv-export-import my-exported-file.csv MyDynamoDbTableName
Answer 9 (score: 0)
You can try using batch writes and multiprocessing to speed up your bulk import.
import csv
import time
import boto3
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(4)

current_milli_time = lambda: int(round(time.time() * 1000))
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('table_name')

def add_users_in_batch(data):
    with table.batch_writer() as batch:
        for item in data:
            batch.put_item(Item=item)

def run_batch_migration():
    start = current_milli_time()
    row_count = 0
    batch = []
    batches = []
    with open(CSV_PATH, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            row_count += 1
            item = {
                'email': row[0],
                'country': row[1]
            }
            batch.append(item)
            if row_count % 25 == 0:
                batches.append(batch)
                batch = []
        batches.append(batch)
        pool.map(add_users_in_batch, batches)
    print('Number of rows processed - ', str(row_count))
    end = current_milli_time()
    print('Total time taken for migration : ', str((end - start) / 1000), ' secs')

if __name__ == "__main__":
    run_batch_migration()
Answer 10 (score: 0)
Another quick workaround is to load your CSV into RDS or any other MySQL instance first, which is quite easy to do (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html), and then use DMS (AWS Database Migration Service) to load the entire data set into DynamoDB. You will have to create a role for DMS before you can load the data, but this works wonderfully without having to run any scripts.
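As a rough sketch of that first staging step (the connection details, staging table, and column names below are assumptions, not anything this answer prescribes), loading the CSV into the MySQL instance could look like this; the DMS task itself is then configured separately:

import pymysql

# Hypothetical RDS/MySQL connection and staging table -- adjust to your setup.
# local_infile must also be enabled on the MySQL server side.
conn = pymysql.connect(host="my-rds-endpoint", user="admin",
                       password="secret", database="staging",
                       local_infile=True)
with conn.cursor() as cur:
    cur.execute("""
        LOAD DATA LOCAL INFILE 'data.csv'
        INTO TABLE users_staging
        FIELDS TERMINATED BY ','
        IGNORE 1 LINES
        (first_name, last_name)
    """)
conn.commit()
conn.close()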
Answer 11 (score: 0)
Updated JavaScript code for 2019
I was not too happy with any of the JavaScript code samples above. Starting from Hassan Siddique's answer above, I have updated it to the latest API, included sample credential code, moved all user configuration to the top, added uuid()s when missing, and stripped out blank strings.
const fs = require('fs');
const parse = require('csv-parse');
const async = require('async');
const uuid = require('uuid/v4');
const AWS = require('aws-sdk');
// --- start user config ---
const AWS_CREDENTIALS_PROFILE = 'serverless-admin';
const CSV_FILENAME = "./majou.csv";
const DYNAMODB_REGION = 'eu-central-1';
const DYNAMODB_TABLENAME = 'entriesTable';
// --- end user config ---
const credentials = new AWS.SharedIniFileCredentials({
profile: AWS_CREDENTIALS_PROFILE
});
AWS.config.credentials = credentials;
const docClient = new AWS.DynamoDB.DocumentClient({
region: DYNAMODB_REGION
});
const rs = fs.createReadStream(CSV_FILENAME);
const parser = parse({
columns: true,
delimiter: ','
}, function(err, data) {
var split_arrays = [],
size = 25;
while (data.length > 0) {
split_arrays.push(data.splice(0, size));
}
data_imported = false;
chunk_no = 1;
async.each(split_arrays, function(item_data, callback) {
const params = {
RequestItems: {}
};
params.RequestItems[DYNAMODB_TABLENAME] = [];
item_data.forEach(item => {
for (key of Object.keys(item)) {
// An AttributeValue may not contain an empty string
if (item[key] === '')
delete item[key];
}
params.RequestItems[DYNAMODB_TABLENAME].push({
PutRequest: {
Item: {
id: uuid(),
...item
}
}
});
});
docClient.batchWrite(params, function(err, res, cap) {
console.log('done going next');
if (err == null) {
console.log('Success chunk #' + chunk_no);
data_imported = true;
} else {
console.log(err);
console.log('Fail chunk #' + chunk_no);
data_imported = false;
}
chunk_no++;
callback();
});
}, function() {
// run after loops
console.log('all data imported....');
});
});
rs.pipe(parser);
Answer 12 (score: 0)
I created a gem for this.
You can install it by running the following command, and then use it to import your CSV file into your table:
gem install dynamocli
Answer 13 (score: 0)
Probably the easiest solution is to use a template / solution made by AWS:
Implementing bulk CSV ingestion to Amazon DynamoDB https://aws.amazon.com/blogs/database/implementing-bulk-csv-ingestion-to-amazon-dynamodb/
With this approach, you use the provided template to create a CloudFormation stack that includes an S3 bucket, a Lambda function, and a new DynamoDB table. The Lambda is triggered to run on upload to the S3 bucket and inserts into the table in batches.
In my case, I wanted to insert into an existing table, so I just changed the Lambda function's environment variable after the stack was created.
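As a rough sketch of driving that flow from code (the stack name and output key below are assumptions -- check the CloudFormation console for the values your stack actually exposes), you could look up the bucket created by the stack and drop the CSV into it to trigger the Lambda:

import boto3

# Hypothetical stack name and output key for the bucket created by the
# blog post's template.
cfn = boto3.client("cloudformation")
outputs = cfn.describe_stacks(StackName="csv-to-dynamodb")["Stacks"][0]["Outputs"]
bucket = next(o["OutputValue"] for o in outputs if o["OutputKey"] == "BucketName")

# Uploading the CSV triggers the Lambda, which batch-writes into the table.
boto3.client("s3").upload_file("import.csv", bucket, "import.csv")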
Answer 14 (score: 0)
Follow the instructions at the following link to import data into an existing table in DynamoDB:
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/SampleData.LoadData.html
Note that the name of the table is what you have to look up here: https://console.aws.amazon.com/dynamodbv2/home
The table name is used inside the json file; the json file name itself is not important. For example, I have a table named Country-kdezpod7qrap7nhpjghjj-staging, and to import data into that table I had to make a json file like this:
{
    "Country-kdezpod7qrap7nhpjghjj-staging": [
        {
            "PutRequest": {
                "Item": {
                    "id": {
                        "S": "ir"
                    },
                    "__typename": {
                        "S": "Country"
                    },
                    "createdAt": {
                        "S": "2021-01-04T12:32:09.012Z"
                    },
                    "name": {
                        "S": "Iran"
                    },
                    "self": {
                        "N": "1"
                    },
                    "updatedAt": {
                        "S": "2021-01-04T12:32:09.012Z"
                    }
                }
            }
        }
    ]
}
If you do not know how to create the item for each PutRequest, you can create one item in your DB with a mutation and then try to duplicate it; that will show you the structure of a single item.
If you have a huge list of items in your CSV file, you can use the following npm tool to generate the json file:
https://www.npmjs.com/package/json-dynamo-putrequest
Then we can use the following command to import the data:
aws dynamodb batch-write-item --request-items file://Country.json
If the data is imported successfully, you should see the following output:
{
"UnprocessedItems": {}
}
Also note that with this method you can only have 25 PutRequest items in your array, so if you want to push 100 items you need to create 4 files.
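If you would rather generate those json files from the CSV yourself instead of using the npm tool, a small sketch along these lines might do it (the file names and the all-string attribute typing are assumptions; adjust the type mapping to your columns):

import csv
import json

TABLE_NAME = "Country-kdezpod7qrap7nhpjghjj-staging"  # your table name here

with open("Country.csv", newline="") as f:
    rows = list(csv.DictReader(f))

# batch-write-item accepts at most 25 PutRequests, so write one file per chunk of 25.
for n, start in enumerate(range(0, len(rows), 25)):
    chunk = rows[start:start + 25]
    requests = [
        # Every attribute is written as a string ("S"); use "N" etc. where needed.
        {"PutRequest": {"Item": {k: {"S": v} for k, v in row.items()}}}
        for row in chunk
    ]
    with open(f"Country-{n}.json", "w") as out:
        json.dump({TABLE_NAME: requests}, out, indent=2)

Each generated file can then be passed to aws dynamodb batch-write-item --request-items file://Country-0.json, and so on.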
Answer 15 (score: 0)
Try this. It is very simple and helpful.