我在理解最近破坏生产代码的现象时遇到了问题。这段代码
with self.s3.get_stream(bucket=self.import_bucket, key_name=self.in_file, mode="rb") as file_obj:
# Check file for any amount data, return false if there is no Data
if not self.check_file_for_data(file_obj):
return False
my_producer = iter(self.line_producer(file_obj))
self.header = next(my_producer)
def check_file_for_data(self, file_obj):
try:
next(islice(file_obj, 1, 2))
except StopIteration:
# File has no Data other than column names.
return False
except Exception as e:
# unknown problem caused in reading file.
self.log.error("Error in reading file: {0} for post processing. error message: {1}".format(self.in_file, e))
else:
# File has readable data other than Column names.
return True
def line_producer(self, file_obj):
# Added recently to solve this problem, but was working without this, till dec
file_obj.seek(0)
self.log.info("LINE PRODUCTION : Started")
csv_reader_obj = csv.reader(file_obj)
header = next(csv_reader_obj)
yield header
for index, row in enumerate(csv_reader_obj, 1):
while self.topic_queue.qsize() > 20000:
pass
packet = ([index, row, header])
try:
# Block at most for 500 sec till an Empty slot is found
self.topic_queue.put(obj=packet, block=True, timeout= 500)
except Exception as e:
self.log.info("LINE PRODUCTION : FAILED")
type_, value_, traceback_ = sys.exc_info()
self.log.exception("traceback :{} ||type: {} ||value: {}".format(traceback.extract_tb(traceback_),
type_,
value_))
raise ValueError("PROBLEM IN LOADING PACKET TO TASK QUEUE. "
"\n \t PACKET -> {} \n \t INDEX -> {}".format(packet, index))
if index % 100000 == 0:
self.log.info("produced -> {}".format(index))
self.log.info("LINE PRODUCTION : FINISHED")
yield index
这种情况一直持续到12月。但最近开始破产。在调试之后,我理解的是。
self.header = next(my_producer)
此行返回文件中的第2行而不是Header。我必须重置对象以获得这样的标题。
file_obj.seek(0)
如果有什么我在这里做错了,请你告诉我们。也为什么现在才打破??
答案 0 :(得分:1)
对我而言,当您执行
时,check_file_for_data
似乎总是从文件中消耗两行
next(islice(file_obj, 1, 2))
跳过一行,然后返回。
如果没有回到line_producer
中文件的开头,我真的不知道你的代码片段是如何工作的。