If a value is in

Time: 2018-08-03 15:21:55

Tags: python

I have a .csv file with values that interfere with my calculations

Method

I want to remove the values that appear on a row after a given value. For example, if a row contains a "(B)" that comes before a "(D)" or before any other "(B)", I want to keep only that first "(B)".

The same goes for "+", "++" and "+++": I want to keep only the first one on each row.

Desired result

1277|2013-12-17 16:00:00|100|+|
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100||D

Sample of the csv file:

1277|2013-12-17 16:00:00|100|+|++|
1360|2014-01-15 16:00:00|(B)|(D)|99|++|+++||+|E
1402|2014-02-05 20:00:00|(D)|(B)|99|++|+||D
1360|2014-01-29 08:00:00|(D)|(B)|99||C
1378|2014-01-21 20:00:00|(B)|100||D

3 Answers:

Answer 0 (score: 0)

Here is a short little program that uses a list of tuples, invalid_together, and removes values as described in the question. It simply iterates through the data and, once it finds a value from an invalid grouping, removes all subsequent values from that group.
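
A minimal sketch of that approach, assuming invalid_together is the list of tuples the answer refers to (the "|" splitting and the file names are illustrative, not from the answer):

# Sketch: keep only the first value seen from each "invalid together" group per row.
invalid_together = [
    ("(B)", "(D)"),
    ("+", "++", "+++"),
]

def clean_row(fields):
    seen_groups = set()  # indices of groups already represented in this row
    kept = []
    for field in fields:
        group = next((i for i, g in enumerate(invalid_together) if field in g), None)
        if group is None:
            kept.append(field)        # not part of any group: always keep
        elif group not in seen_groups:
            seen_groups.add(group)    # first member of this group: keep it
            kept.append(field)
        # any later member of an already-seen group is dropped
    return kept

with open("in.csv") as f_in, open("out.csv", "w") as f_out:  # assumed file names
    for line in f_in:
        f_out.write("|".join(clean_row(line.rstrip("\n").split("|"))) + "\n")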

Answer 1 (score: 0)

You can read the CSV with the built-in csv module, filter each of its rows so that it contains no repeated elements from the same category, and finally write everything back out as a new CSV. First, build a category filter:

categories = [  # make a list of tuples containing elements that should appear only once
    ("(B)", "(D)"),
    ("+", "++", "+++")
]

categories_map = {e: c[0] for c in categories for e in c}  # turn it into a quick lookup map

def filter_elements(row):  # and then build your filters
    unique = set()  # a set to hold our unique values
    for column in row:
        if column in categories_map:
            if categories_map[column] not in unique:
                unique.add(categories_map[column])
                yield column
        elif column:  # use `else:` instead if you want to keep the empty fields
            yield column
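
As a quick sanity check, running the filter over the second sample row (hard-coded here for illustration) yields the expected fields:

row = "1360|2014-01-15 16:00:00|(B)|(D)|99|++|+++||+|E".split("|")
print(list(filter_elements(row)))
# ['1360', '2014-01-15 16:00:00', '(B)', '99', '++', 'E']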

Finally, open the input CSV, read it, filter its rows, and write them straight out to the output CSV:

with open("in.csv", "r", newline="") as f_in, open("out.csv", "w", newline="") as f_out:
    writer = csv.writer(f_out, delimiter="|")  # create a CSV writer
    for row in csv.reader(f_in, delimiter="|"):  # iterate over a CSV reader
        writer.writerow(c for c in filter_elements(row))  # filter + write to the out.csv

For the sample data you posted, this produces an out.csv containing:

1277|2013-12-17 16:00:00|100|+
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100|D

Answer 2 (score: 0)

You can use a regular expression to extract the parts you need:

import re

pattern = re.compile(r'^(.* \d+:\d+:\d+(?=\|))\|(\(\S\)(?=\|))?.*?(\d+)\|(\++)?.*?\|(\S)?$')

with open('data.csv', 'r') as infile, open('result.csv', 'w') as outfile:
    for line in infile:
        outfile.write('|'.join(str(x) for x in pattern.match(line).groups() if x) + '\n')

This results in:

1277|2013-12-17 16:00:00|100|+
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100|D

If you want to post-process the output, it is probably better to keep the number of elements per line constant rather than skipping the empty ones. To do that, replace the last line with:

outfile.write('|'.join(str(x) for x in pattern.match(line).groups()) + '\n')

This gives the following output:

1277|2013-12-17 16:00:00|None|100|+|None
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|None|C
1378|2014-01-21 20:00:00|(B)|100|None|D

Edit:

To also catch lines like:

325|2014-01-18 20:00:00|(B)|93|++|+||Calme 

the pattern can be modified to:

pattern = re.compile(r'^(.* \d+:\d+:\d+(?=\|))\|(\(\S\)(?=\|))?.*?(\d+)\|(\++)?.*\|(\S+)?\s*?$')

You can quickly verify it here, as well as against the remaining lines on which it originally failed.
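
For a quick local check instead of the online tester, a small sketch reusing the modified pattern above, with the problem line hard-coded:

import re

pattern = re.compile(r'^(.* \d+:\d+:\d+(?=\|))\|(\(\S\)(?=\|))?.*?(\d+)\|(\++)?.*\|(\S+)?\s*?$')

line = '325|2014-01-18 20:00:00|(B)|93|++|+||Calme '
print(pattern.match(line).groups())
# -> ('325|2014-01-18 20:00:00', '(B)', '93', '++', 'Calme')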