我现在尝试了很多,过去几天也读了很多,但我无法找到解决问题的正确方法。也许有人可以帮助我。
我必须使用CSV文件,其中wdw_clip_db_2018-01-17_2(4720行)应包含所有数据,而wdw_content_complete(2752行)仅包含子集。
wdw_clip_db_2018-01-17_2.csv:
11,0_7cjgob0v,9000301_AzubiGlueckT1.mxf,0_7cjgob0v.mpg
43,0_heor15yl,,0_heor15yl.mpg
1616,0_dfopff5t,578_Bier.MXF,0_dfopff5t.mpg
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
1931,0_cbx3zgw6,9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6.mpg
wdw_content_complete.csv:
1737,9000301_AzubiGlueckT1.mxf,0_7cjgob0v
1451,578_Bier.MXF,0_dfopff5t
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
1762,9070201_KeinGeldFuerGeschen.mxf,NULL
我需要提出的是Excel可读的以下csv文件:
wdw_clean_assets.csv:
9000301_AzubiGlueckT1.mxf,0_7cjgob0v
578_Bier.MXF,0_dfopff5t
其中wdw_clean_assets包含与文件和external_refernce匹配的每一行(例如0_7cjgob0v)。
wdw_to_add_ext_refs.csv:
9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6
其中wdw_to_add_ext_refs包含与文件匹配但在external_reference字段中为NULL的每一行。 NULL将替换为wdw_clip_db_2018-01-17_2.csv中的外部引用。
当我比较行数时,接缝是wdw_content_complete.csv中不在wdw_clip_db_2018-01-17_2.csv中的行。说实话,不应该这样,我需要找出这些线路的错误。因此,我需要将其余的wdw_content_complete.csv放入新的CSV文件中。
wdw_to_clean_assets.csv:
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
最后,我需要两个CSV中的其余CSV wdw_clip_db_2018-01-17_2.csv和wdw_content_complete.csv。因此,我试图以某种方式从另一个列表中减去一个列表,遗憾的是,这些列表也不能正常工作。
wdw_hansi_assets_rest.csv:
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
wdw_mediahub_assets_rest.csv:
1433,445_Holzverarbeitung.MXF,NULL
到目前为止我得到的是这个Python脚本:
导入csv
# CSV Files
# wdw_clip_db_2018-01-17_2.csv
# wdw_content_complete.csv
# Reading the CSV Files
hansi_assets = []
with open('wdw_clip_db_2018-01-17_2.csv') as hansi_db:
reader = csv.reader(hansi_db)
for row in reader:
hansi_assets.append(row)
hansi_db.close()
mediahub_assets = []
with open('wdw_content_complete.csv') as mediahub_db:
reader = csv.reader(mediahub_db)
for row in reader:
mediahub_assets.append(row)
mediahub_db.close()
clean_asset = []
clean_assets = []
to_add_ext_ref = []
to_add_ext_refs = []
to_clean_assets = []
hansi_assets_rest = []
mediahub_assets_rest = []
hansi_assets_rm = []
mediahub_assets_rm = []
num_clean_rwos = 0
num_to_clean_rows = 0
num_to_add_ext_refs = 0
num_dirty_rows = 0
num_hansi_iterations = 0
num_mediahub_iterations = 0
num_mediahub_null = 0
num_hansi_mediahub_matches = 0
# Looping over the CSV Files
for hansi_asset in hansi_assets:
num_hansi_iterations += 1
for mediahub_asset in mediahub_assets:
num_mediahub_iterations += 1
# Checking if there are similar, clean entries
if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and hansi_asset[1] == mediahub_asset[2]:
clean_assets.append(mediahub_asset)
# Counting for evaluation reasons
num_clean_rwos += 1
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Checking if there are entries which miss the Ext_Ref field and replacing the NULL by the corresponding Ext_Ref in the hansi_asset
elif hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and mediahub_asset[2] == "NULL":
to_add_ext_ref = [mediahub_asset[1], hansi_asset[1]]
to_add_ext_refs.append(to_add_ext_ref)
# Counting for evaluation reasons
num_to_add_ext_refs += 1
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Checking if there are entries that don't match
elif hansi_asset[2] != mediahub_asset[1] or hansi_asset[3] != mediahub_asset[1]:
to_clean_assets.append([mediahub_asset[1], mediahub_asset[2]])
# Counting for evaluation reasons
num_to_clean_rows += 1
# Creating a list to substract from its origin to get the Rest
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Just counting the Matches
for hansi_asset in hansi_assets:
for mediahub_asset in mediahub_assets:
if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1]:
num_hansi_mediahub_matches += 1
# Just counting the NULLs
for mediahub_asset in mediahub_assets:
num_mediahub_iterations += 1
if mediahub_asset[2] == "NULL":
num_mediahub_null += 1
# for mediahub_asset_rm in mediahub_assets_rm:
# if mediahub_asset[1] != mediahub_asset_rm[1]:
# mediahub_assets_rest = Diff(mediahub_assets, mediahub_assets_rm)
# Trying to substract medihub_assets_rm from mediahub_assets to get the Rest
mediahub_assets_rest = [item for item in mediahub_assets_rm if item not in mediahub_assets]
hansi_assets_rest = [item for item in hansi_assets_rm if item not in hansi_assets]
# Printing some lines for evaluation
print hansi_assets[1]
print mediahub_assets[1]
print clean_assets[1]
print to_clean_assets[1]
print to_add_ext_refs[1]
print hansi_assets_rest[1]
print mediahub_assets_rest[1]
print hansi_assets_rm[1]
print mediahub_assets_rm[1]
print "Num Hansi Assets: " + str(len(hansi_assets))
print "Num Mediahub Assets: " + str(len(mediahub_assets))
print "Num Clean Assets: " + str(len(clean_assets))
print "Num Hansi Assets to remove: " + str(len(hansi_assets_rm))
print "Num Mediahub Assets to remove: " + str(len(mediahub_assets_rm))
print "Num Hansi Rest Assets: " + str(len(hansi_assets_rest))
print "Num Mediahub Rest Assets: " + str(len(mediahub_assets_rest))
print "Num Mediahub NULLs: " + str(num_mediahub_null)
print "Num Hansi Mediahub Matches: " + str(num_hansi_mediahub_matches)
print "Num Clean Rows: " + str(num_clean_rwos)
print "Num To Clean Rows: " + str(num_to_clean_rows)
print "Num To Add Ext_Ref: " + str(num_to_add_ext_refs)
print "Num Dirty Rows: " + str(num_dirty_rows)
print "Num Hansi Iterations: " + str(num_hansi_iterations)
print "Num Mediahub Iterations: " + str(num_mediahub_iterations / num_hansi_iterations)
# Writing clean_assets to a file
wdw_clean_assets = []
with open('wdw_clean_assets.csv', 'w') as wdw_clean_assets:
writer = csv.writer(wdw_clean_assets)
for row in clean_assets:
writer.writerow([row])
wdw_clean_assets.close()
wdw_to_add_ext_refs =[]
with open('wdw_to_add_ext_refs.csv', 'w') as wdw_to_add_ext_refs:
writer = csv.writer(wdw_to_add_ext_refs)
for row in to_add_ext_refs:
writer.writerow([row])
wdw_to_clean_assets = []
with open('wdw_to_clean_assets.csv', 'w') as wdw_to_clean_assets:
writer = csv.writer(wdw_to_clean_assets)
for row in to_clean_assets:
writer.writerow([row])
wdw_to_clean_assets.close()
wdw_hansi_assets_rest = []
with open('wdw_hansi_assets_rest.csv', 'w') as wdw_hansi_assets_rest:
writer = csv.writer(wdw_hansi_assets_rest)
for row in hansi_assets_rest:
writer.writerow([row])
wdw_hansi_assets_rest.close()
wdw_mediahub_assets_rest = []
with open('wdw_mediahub_assets_rest.csv', 'w') as wdw_mediahub_assets_rest:
writer = csv.writer(wdw_mediahub_assets_rest)
for row in mediahub_assets_rest:
writer.writerow([row])
wdw_mediahub_assets_rest.close()
任何帮助表示赞赏!
曼努埃尔