我必须使用 pandas 在 Python 中比较两个 CSV 文件。这两个 CSV 文件包含不同形式的产品代码。
第一种形式:
LYSB00LW3ZL3K-ELECTRNCS
LYSB00LW3ZL3K-ELECTRNCS- Standard Packaging- W20 - Dual Driver
LYSB01KH2MDPU-ELECTRNCS
LYSB01KH2MDPU-ELECTRNCS- Small Bangle
LYSB01KH2MDPU-ELECTRNCS- Large Bangle
LYSB06XXD7NYY-ELECTRNCS- Large
LYSB06XXD7NYY-ELECTRNCS- Small
LYSB01KM4T0PO-ELECTRNCS
第二种形式:(将上述产品代码去掉前缀 LYS,并去掉第一个“-”及其之后的所有内容,即得到第二种形式)
B00LW3ZL3K
B01KH2MDPU
因此,我需要比较这两个文件,并生成一个新的 CSV 文件:第一列为产品代码,第二列为状态。
结果应按以下两种情况输出:
1)如果第二个文件中存在某个产品代码(例如 B00LW3ZL3K),则应从第一个文件中返回所有对应的产品代码,并将其状态显示为“有库存”(In-Stock)。
2)如果第二个文件中不存在某个产品代码(例如 B01KM4T0PO),则应从第一个文件中返回所有对应的产品代码,并将其状态显示为“缺货”(Out-of-Stock)。
Output:
In-Stock
LYSB00LW3ZL3K-ELECTRNCS
LYSB00LW3ZL3K-ELECTRNCS- Standard Packaging- W20 - Dual Driver
LYSB01KH2MDPU-ELECTRNCS
LYSB01KH2MDPU-ELECTRNCS- Small Bangle
LYSB01KH2MDPU-ELECTRNCS- Large Bangle
Out-of-Stock
LYSB06XXD7NYY-ELECTRNCS- Large
LYSB06XXD7NYY-ELECTRNCS- Small
LYSB01KM4T0PO-ELECTRNCS
答案 0 :(得分:0)
以下是我解决这个问题的方法:
import pandas as pd
import datetime
import os
class Update(object):
    """Split one category's master-file product codes into in-stock / out-of-stock CSVs.

    Instantiating the class runs the whole job: it resolves the master file and
    the category's update file (both relative to the current working
    directory) and calls :meth:`comparision`, which writes the two result CSVs.
    """

    # Category name -> marker string that is searched for inside each
    # master-file product code to decide whether the code belongs to the category.
    SKU_SUFFIXES = {
        'Electronics': 'ELECTRNCS',
        'Sports Equipment': 'SPRTSEQIP',
        'Health and Beauty': 'HLTHBTY',
        "Women's Fashion Accessories": 'WMNFSHACCSS',
        'Toys and Games': 'TOYS',
        "Men's Fashion Shoes": 'MNFSHSHOE',
        "Other Sports Shoes": 'OTHSPRTSSHOE',
        "Women's Sports Shoes": 'WMNSPORTSHOE',
        "Men's Running Shoes": 'MNSRUNSHOE',
        "Amazon Global-Toys": 'GLBTOYS',
        "Women's Running Shoes": 'WMNRUNSHOE',
        "Women's Fashion Shoes": 'WMNFSHSHOE',
        "Computer & Accessories": 'CMPTRACCS',
        "Office Supplies": "OFFSUPPLIES",
        "Clothing Accessories": "CLTHACCSS",
        "TigerDirect": "TDRCT",
    }

    def __init__(self, category):
        """Resolve the input file paths for *category* and run the comparison.

        :param category: category name; must be a key of ``SKU_SUFFIXES``.
        :raises ValueError: if *category* is not a known category.
        """
        masterfile = os.path.realpath('lys_masterfile.txt')
        update_file = os.path.realpath('Outputs/liveyoursport/Update_Spider/{}_Update.csv'.format(category))
        self.comparision(masterfile, update_file, category)

    def comparision(self, output_file, update_file, category):
        """Compare the master file against the update file and write both result CSVs.

        :param output_file: path to the tab-delimited master file; it must have
            a ``Product Code/SKU`` column.
        :param update_file: path to the header-less update CSV; its third and
            fourth columns are read as ``sku`` and ``price``.
        :param category: category name; must be a key of ``SKU_SUFFIXES``.
        :raises ValueError: if *category* is not a known category.

        A row of the update file counts as in stock when its price is present
        and as out of stock when its price is missing (NaN after parsing).
        Results are written below ``Outputs/liveyoursport/in_stock`` and
        ``Outputs/liveyoursport/out_of_stock`` (relative to the working
        directory); those directories must already exist.
        """
        sku = self.SKU_SUFFIXES.get(category)
        if sku is None:
            # Fail early with a clear message instead of a TypeError raised
            # from ``None in value`` deep inside ``Series.apply``.
            raise ValueError('Unknown category: {!r}'.format(category))

        def extraction(value):
            """Return the bare code in front of the first '-' or the sentinel 'None'."""
            # NOTE(review): this is a substring test, so markers that contain
            # one another (e.g. 'TOYS' inside 'GLBTOYS', 'MNFSHSHOE' inside
            # 'WMNFSHSHOE') also match codes of the other category -- confirm
            # whether that is intended.
            if isinstance(value, str) and sku in value:
                asin = value.split('-')[0]
                # Strip only the leading 'LYS' prefix; a blanket replace()
                # would also delete 'LYS' occurring inside the code itself.
                if asin.startswith('LYS'):
                    asin = asin[len('LYS'):]
                return asin
            return 'None'

        # Extract only the necessary field from the master file.
        masterfile_sku = pd.read_csv(output_file, usecols=['Product Code/SKU'], delimiter='\t',
                                     skip_blank_lines=True)

        # Derive the bare code for every master-file product code.
        masterfile_asin = masterfile_sku['Product Code/SKU'].apply(extraction)

        # Keep only the rows that belong to the requested category.
        products_df = pd.DataFrame(
            {'sku': masterfile_asin, 'Product Code/SKU': masterfile_sku['Product Code/SKU']}
        ).query("sku != 'None'")

        # Load the update file and separate in-stock from out-of-stock rows.
        # read_csv turns empty / 'nan' cells into real NaN values, which never
        # compare equal to the string 'nan', so test for missing values directly.
        update_df = pd.read_csv(update_file, usecols=[2, 3], names=['sku', 'price'])
        has_price = update_df['price'].notnull()
        update_in_stock_df = update_df[has_price]
        update_out_stock_df = update_df[~has_price]

        # In stock: category products whose code has a priced row in the update file.
        in_stock = pd.merge(products_df, update_in_stock_df, on='sku', how='inner')

        # Out of stock: category products that did not end up in ``in_stock`` ...
        out_of_stock = pd.merge(in_stock, products_df, on='sku', how='right', indicator=True).query(
            "_merge == 'right_only'")
        # ... combined with every update row that has no price.
        out_of_stock = pd.merge(out_of_stock, update_out_stock_df, on='sku', how='outer')
        # Keeps only the first row per sku, so several variants of the same
        # product collapse into one row.
        # NOTE(review): confirm this is intended if every variant must be listed.
        out_of_stock = out_of_stock.drop_duplicates(subset='sku')

        # Write both result frames.
        in_stock.to_csv(os.path.realpath('Outputs/liveyoursport/in_stock/Lys_{}_in_stock.csv'.format(category)))
        out_of_stock.to_csv(
            os.path.realpath('Outputs/liveyoursport/out_of_stock/Lys_{}_out_of_stock.csv'.format(category)))
if __name__ == '__main__':
    # Script entry point: run the update for one category and report how long
    # it took. print() is used in its call form so the script parses on both
    # Python 2 and Python 3.
    started = datetime.datetime.now()
    Update("Women's Running Shoes")
    print('Done')
    print('Completed in {}'.format(datetime.datetime.now() - started))