我有一个DataFlow作业,它首先读取位于Google云端存储中的2个文本文件。文本文件包含也位于Google云端存储中的图像路径。
经过一些检查后,我可以确认文本文件已成功读取,但DataFlow作业在读取图像时仍然卡住。相同的代码在本地可以完美运行。这让我怀疑图像路径可能不正确,但事实并非如此。
这是我的作业ID:2018-01-10_12_16_56-8294573519126715750
任何建议都将不胜感激。此外,任何关于如何解决/调试此问题的提示也会非常有用,因为我甚至不知道从哪里开始。
谢谢
管道定义
def configure_pipeline(pipeline, args):
    """Attach the image-pair extraction steps to ``pipeline``.

    Two text sources are read: ``args.input_imgs`` (one image path per line)
    and ``args.input_path`` (one data point per line). Every data point is
    then handed to ``ExtractDataDoFn`` together with the full collection of
    image paths, supplied as an iterable side input.

    Args:
        pipeline: The Beam pipeline the steps are applied to.
        args: Object exposing ``input_path`` and ``input_imgs`` attributes.
    """
    # The image-path source is applied first, then the data-point source.
    image_path_lines = pipeline | 'Read image paths' >> beam.io.ReadFromText(
        args.input_imgs, strip_trailing_newlines=True)
    data_point_lines = pipeline | 'Read data point' >> beam.io.ReadFromText(
        args.input_path, strip_trailing_newlines=True)

    # The output of the extraction step is not consumed here.
    extract_step = beam.ParDo(
        ExtractDataDoFn(), beam.pvalue.AsIter(image_path_lines))
    _ = data_point_lines | "Read image" >> extract_step
读取图片 - 大多数代码只是解析文本文件中的图像路径和一些数据来索引图像。
class ExtractDataDoFn(beam.DoFn):
    """Turn one annotation line into two cropped image patches and a label.

    An input line has three ``:``-separated tokens: a pivot record, an
    example record and an integer label. The two records may be wrapped in
    single quotes and are comma-separated; field 0 is the frame number and
    fields 2, 3, 4, 5 are read as ``x``, ``y``, ``w``, ``h`` of a bounding
    box (field 1 is not used here).

    The side input ``img_paths`` lists one image path per frame. The frame
    number is parsed from the six characters that precede the last four
    characters of each path (presumably a 4-character extension such as
    ``.jpg`` -- confirm against the actual file names).
    """

    # Edge length, in pixels, of the square patches that are emitted.
    _PATCH_SIZE = (224, 224)

    def start_bundle(self, context=None):
        """Reset the frame-number -> image-path index for the new bundle."""
        # Each frame has its own path to its image.
        # NOTE(review): because the index is cleared here, `process` walks
        # the whole `img_paths` side input again for every bundle. With a
        # large path list and many small bundles this can be slow; confirm
        # whether that is acceptable for this job.
        self.frame_number_to_name = {}

    def process(self, element, img_paths):
        """Yield ``(pivot_feature, example_feature, label)`` for one line.

        Args:
            element: The annotation line, or an object exposing the line
                through an ``element`` attribute.
            img_paths: Iterable of image paths (side input).

        Yields:
            A single tuple of two ``float64`` arrays (the RGB patches cropped
            from the pivot and example images, resized to 224x224) and the
            integer label. Nothing is yielded if either image cannot be read
            or decoded; the failure is logged instead.

        Raises:
            KeyError: If a frame number in the line has no known image path.
            ValueError, IndexError: If the line or a path is malformed.
        """
        # Accept both a bare line and a wrapper carrying it in `.element`.
        # The original unwrapped value was computed but never used.
        line = getattr(element, 'element', element)

        # Build the frame index lazily, once per bundle.
        if not self.frame_number_to_name:
            for path in img_paths:
                if len(path) > 4:
                    frame_number = int(path[-10:-4])
                    self.frame_number_to_name[frame_number] = path

        line_tokens = line.split(':')
        pivot_example = line_tokens[0].strip('\'')
        example = line_tokens[1].strip('\'')
        label = int(line_tokens[2])

        # Get image paths
        pivot_frame_number = int(pivot_example.split(',')[0])
        pivot_path = self.frame_number_to_name[pivot_frame_number]
        example_frame_number = int(example.split(',')[0])
        example_path = self.frame_number_to_name[example_frame_number]

        # Read images; give up on this line as soon as one of them fails.
        pivot_img = self._load_image_or_log(pivot_path, pivot_example)
        if pivot_img is None:
            return
        example_img = self._load_image_or_log(example_path, example)
        if example_img is None:
            return

        pivot_feature = self._extract_feature(pivot_example, np.array(pivot_img))
        example_feature = self._extract_feature(example, np.array(example_img))
        yield pivot_feature, example_feature, label

    @staticmethod
    def _open_file_read_binary(uri):
        """Open ``uri`` for binary reading, falling back to text mode."""
        try:
            return file_io.FileIO(uri, mode='rb')
        except errors.InvalidArgumentError:
            return file_io.FileIO(uri, mode='r')

    @classmethod
    def _load_image_or_log(cls, path, record):
        """Return the RGB image at ``path``, or ``None`` after logging."""
        try:
            with cls._open_file_read_binary(path) as f:
                image_bytes = f.read()
            return Image.open(io.BytesIO(image_bytes)).convert('RGB')
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one unreadable image must not fail the bundle.
            logging.exception('Error processing image %s: %s', record, str(e))
            return None

    @classmethod
    def _extract_feature(cls, record, img):
        """Crop the record's bounding box from ``img`` and resize it."""
        fields = record.split(',')
        x, y = int(fields[2]), int(fields[3])
        w, h = int(fields[4]), int(fields[5])
        # Get raw content of bounding box (rows are y, columns are x).
        bounding_box = img[y:y + h, x:x + w, :]
        resized = Image.fromarray(bounding_box).resize(cls._PATCH_SIZE)
        # Convert back to numpy
        return np.array(resized, np.float64)