In my application, I currently support uploading from a direct-download URL, either entered by the user or generated by the Box file-picker widget. I do this with a Net::HTTP request, writing each segment to the file system.
Now I want to change this to store the file from the URL in S3 instead, because the files are too large to hold in memory.
Here is the code segment I am working with at the moment:
queue = Queue.new
up_url = presigned_url_from_aws
down_uri = remote_download_url

producer = Thread.new do
  # Stream the file from the URL
  # (code based on something currently working).
  Net::HTTP.start(down_uri.host, down_uri.port, :use_ssl => (down_uri.scheme == 'https')) {|http|
    http.request_get(down_uri.path) {|res|
      res.read_body {|seg|
        queue << seg
        update_progress()
      }
    }
  }
end

consumer = Thread.new do
  # turn queue input into body_stream ?
end

# Use presigned url to upload file to aws
Net::HTTP.start(up_url.host) do |http|
  http.send_request("PUT", up_url.request_uri, body_stream, {
    # This is required, or Net::HTTP will add a default unsigned content-type.
    "content-type" => "",
  })
end
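What I think I need in the consumer is an IO-like object that body_stream can read from, since Net::HTTP#body_stream accepts anything that responds to read. A rough, untested sketch of that idea (the QueueIO class and its names are mine, not working code):

# Illustrative sketch: adapts a Queue to the IO-like interface that
# Net::HTTP reads a body_stream from. Assumes the producer closes the
# queue when the download finishes, so deq returns nil at end-of-stream.
class QueueIO
  def initialize(queue)
    @queue = queue
    @buffer = +''
  end

  # Net::HTTP reads the stream with read(length, outbuf); returning nil
  # (for a sized read) signals end-of-stream.
  def read(length = nil, outbuf = nil)
    while (length.nil? || @buffer.bytesize < length) && (seg = @queue.deq)
      @buffer << seg
    end
    if @buffer.empty?
      outbuf&.clear
      return length ? nil : ''
    end
    chunk = length ? @buffer.slice!(0, length) : @buffer.slice!(0, @buffer.bytesize)
    outbuf ? outbuf.replace(chunk) : chunk
  end
end

body_stream = QueueIO.new(queue) # what the PUT above would read from

One caveat: Net::HTTP raises an ArgumentError for a body_stream unless the request has a Content-Length header or uses chunked transfer-encoding, and a plain S3 presigned PUT generally does not accept chunked transfer-encoding, so the full length must be known up front. That limitation is what pushes the answer below to the multipart-upload API instead.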
Answer 0 (score: 0)
I eventually found a working solution. As before, this code lives in a ProgressJob class. I used the AWS multipart upload API. I created a queue for the segments, a producer thread to put segments into the queue, a consumer thread to take them off the queue for further processing, and a controller thread that closes the queue at the right time. In the consumer thread, I accumulate segments in a StringIO object until each part, except the last, is at least 5 MB (the minimum size for an upload part), then send each part to S3 as soon as it is ready, so that neither the disk nor memory fills up. There were a lot of gotchas, but below is the working code I ended up with, in case it helps anyone else:
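Before the full job class, here is the bare shape of the multipart flow it uses. These are the real aws-sdk-s3 client calls, but the bucket, key, and the each_part chunk source are placeholders for illustration:

require 'aws-sdk-s3'

client = Aws::S3::Client.new
bucket = 'my-bucket' # placeholder
key    = 'my-key'    # placeholder

# 1. Start the multipart upload and remember its id.
upload_id = client.create_multipart_upload(bucket: bucket, key: key).upload_id

# 2. Upload each part (at least 5 MB, except the last) and record its etag.
parts = []
each_part do |io, part_number| # hypothetical source of >= 5 MB chunks
  etag = client.upload_part(body: io, bucket: bucket, key: key,
                            part_number: part_number, upload_id: upload_id).etag
  parts << {etag: etag, part_number: part_number}
end

# 3. Tell S3 to assemble the uploaded parts into one object.
client.complete_multipart_upload(bucket: bucket, key: key,
                                 multipart_upload: {parts: parts},
                                 upload_id: upload_id)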
require 'tempfile'
require 'open-uri'
require 'fileutils'
require 'net/http'
require 'aws-sdk-s3'

class CreateDatafileFromRemoteJob < ProgressJob::Base
  Thread.abort_on_exception = true
  FIVE_MB = 1024 * 1024 * 5

  def initialize(dataset_id, datafile, remote_url, filename, filesize)
    @remote_url = remote_url
    @dataset_id = dataset_id
    @datafile = datafile
    @filename = filename
    @filesize = filesize # string, because it is used in display
    if filesize.to_f < 4000
      progress_max = 2
    else
      progress_max = (filesize.to_f / 4000).to_i + 1
    end
    super progress_max: progress_max
  end
  def perform
    more_segs_to_do = true
    upload_incomplete = true
    @datafile.binary_name = @filename
    @datafile.storage_root = Application.storage_manager.draft_root.name
    @datafile.storage_key = File.join(@datafile.web_id, @filename)
    @datafile.binary_size = @filesize
    @datafile.save!

    if IDB_CONFIG[:aws][:s3_mode]
      upload_key = @datafile.storage_key
      upload_bucket = Application.storage_manager.draft_root.bucket
      if Application.storage_manager.draft_root.prefix
        upload_key = "#{Application.storage_manager.draft_root.prefix}#{@datafile.storage_key}"
      end
      client = Application.aws_client

      if @filesize.to_f < FIVE_MB
        # Small files: read the whole body into memory and copy it in one shot.
        web_contents = open(@remote_url) {|f| f.read }
        Application.storage_manager.draft_root.copy_io_to(@datafile.storage_key, web_contents, nil, @filesize.to_f)
        upload_incomplete = false
      else
        parts = []
        seg_queue = Queue.new
        mutex = Mutex.new
        segs_complete = false
        segs_todo = 0
        segs_done = 0
        begin
          upload_id = aws_multipart_start(client, upload_bucket, upload_key)

          seg_producer = Thread.new do
            uri = URI.parse(@remote_url)
            Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == 'https')) {|http|
              http.request_get(uri.path) {|res|
                res.read_body {|seg|
                  mutex.synchronize {
                    segs_todo = segs_todo + 1
                  }
                  seg_queue << seg
                  update_progress
                }
              }
            }
            # The download has finished once the HTTP block returns.
            mutex.synchronize {
              segs_complete = true
            }
          end
          seg_consumer = Thread.new do
            part_number = 1
            partio = StringIO.new("", 'wb+')
            while seg = seg_queue.deq # returns nil once the controller thread closes the queue
              partio << seg
              if partio.size > FIVE_MB
                partio.rewind
                mutex.synchronize {
                  etag = aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)
                  parts_hash = {etag: etag, part_number: part_number}
                  parts.push(parts_hash)
                }
                part_number = part_number + 1
                partio.close unless partio.closed?
                partio = StringIO.new("", 'wb+')
              end
              mutex.synchronize {
                segs_done = segs_done + 1
              }
            end
            # Upload the last part, which may be less than 5 MB.
            mutex.synchronize {
              partio.rewind
              etag = aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)
              parts_hash = {etag: etag, part_number: part_number}
              parts.push(parts_hash)
              Rails.logger.warn("Another part bites the dust: #{part_number}")
              partio.close unless partio.closed?
              aws_complete_upload(client, upload_bucket, upload_key, parts, upload_id)
              upload_incomplete = false
            }
          end
          controller = Thread.new do
            while more_segs_to_do
              sleep 0.9
              mutex.synchronize {
                if segs_complete && (segs_done == segs_todo)
                  more_segs_to_do = false
                end
              }
            end
            # Everything downloaded has been consumed, so wake the consumer's deq.
            seg_queue.close
          end
        rescue Exception => ex
          Rails.logger.warn("something went wrong during multipart upload")
          Rails.logger.warn(ex.class)
          Rails.logger.warn(ex.message)
          ex.backtrace.each do |line|
            Rails.logger.warn(line)
          end
          # Abort so S3 does not keep storing (and billing for) orphaned parts.
          Application.aws_client.abort_multipart_upload({
            bucket: upload_bucket,
            key: upload_key,
            upload_id: upload_id,
          })
          raise ex
        end
      end
    else
      # Not in S3 mode: stream straight to the local file system as before.
      filepath = "#{Application.storage_manager.draft_root.path}/#{@datafile.storage_key}"
      dir_name = File.dirname(filepath)
      FileUtils.mkdir_p(dir_name) unless File.directory?(dir_name)
      File.open(filepath, 'wb+') do |outfile|
        uri = URI.parse(@remote_url)
        Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == 'https')) {|http|
          http.request_get(uri.path) {|res|
            res.read_body {|seg|
              outfile << seg
              update_progress
            }
          }
        }
      end
      upload_incomplete = false
    end

    # Keep the job alive until one of the paths above marks the upload done.
    while upload_incomplete
      sleep 1.3
    end
  end
  def aws_multipart_start(client, upload_bucket, upload_key)
    start_response = client.create_multipart_upload({
      bucket: upload_bucket,
      key: upload_key,
    })
    start_response.upload_id
  end
  def aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)
    part_response = client.upload_part({
      body: partio,
      bucket: upload_bucket,
      key: upload_key,
      part_number: part_number,
      upload_id: upload_id,
    })
    part_response.etag
  end

  def aws_complete_upload(client, upload_bucket, upload_key, parts, upload_id)
    response = client.complete_multipart_upload({
      bucket: upload_bucket,
      key: upload_key,
      multipart_upload: {parts: parts},
      upload_id: upload_id,
    })
  end
end
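For completeness, the job is enqueued like any other delayed_job-backed job, assuming the setup that ProgressJob::Base implies; all argument values here are placeholders:

# Hypothetical enqueue call; adjust to however your app queues ProgressJobs.
Delayed::Job.enqueue CreateDatafileFromRemoteJob.new(
  dataset.id,                    # dataset_id
  datafile,                      # model instance the job updates
  'https://example.org/big.zip', # remote_url
  'big.zip',                     # filename
  '123456789'                    # filesize, a string because it is used in display
)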