I have a .csv file with values that interfere with my calculations
Method
I want to delete values that appear on a line after a certain value of the same kind. For example, if a line has a "(B)" that comes before a "(D)" or before any other "(B)", only that first "(B)" should be kept.
The same goes for "+", "++" and "+++": I only want to keep the first one on each line.
Desired result
1277|2013-12-17 16:00:00|100|+|
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100||D
Example of the csv file:
1277|2013-12-17 16:00:00|100|+|++|
1360|2014-01-15 16:00:00|(B)|(D)|99|++|+++||+|E
1402|2014-02-05 20:00:00|(D)|(B)|99|++|+||D
1360|2014-01-29 08:00:00|(D)|(B)|99||C
1378|2014-01-21 20:00:00|(B)|100||D
Answer 0 (score: 0)
Here is a short little program that uses a list of tuples, invalid_together, and removes the values as described in the question. It simply iterates over the data and, once it finds a value that belongs to an invalid grouping, removes all subsequent values from that group.
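A minimal sketch of that idea, assuming the groups live in a list of tuples named invalid_together and the files are called in.csv and out.csv:
import csv

# The group list and the file names below are assumptions for illustration.
invalid_together = [
    ("(B)", "(D)"),      # only the first of these may survive on a row
    ("+", "++", "+++"),  # likewise for the plus markers
]

with open("in.csv", newline="") as f_in, open("out.csv", "w", newline="") as f_out:
    reader = csv.reader(f_in, delimiter="|")
    writer = csv.writer(f_out, delimiter="|")
    for row in reader:
        seen = set()   # groups already represented on this row
        kept = []
        for value in row:
            group = next((g for g in invalid_together if value in g), None)
            if group is not None:
                if group in seen:
                    continue       # a later member of an already-seen group: drop it
                seen.add(group)
            kept.append(value)     # empty fields are kept as-is
        writer.writerow(kept)
Empty fields pass through untouched here; add a check on value if they should be dropped as well.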
Answer 1 (score: 0)
You can use the built-in csv module to read the CSV, then filter each of its rows so that no duplicate elements of the same category appear, and finally write everything out as a new CSV. First, create a category filter:
import csv

categories = [  # make a list of tuples containing elements that should appear only once
    ("(B)", "(D)"),
    ("+", "++", "+++")
]
categories_map = {e: c[0] for c in categories for e in c}  # turn it into a quick lookup map

def filter_elements(row):  # and then build your filters
    unique = set()  # a set to hold our unique values
    for column in row:
        if column in categories_map:
            if categories_map[column] not in unique:
                unique.add(categories_map[column])
                yield column
        elif column:  # use `else:` instead if you want to keep the empty fields
            yield column
Finally, open the input CSV, read it, filter its rows and immediately write them out to the output CSV:
with open("in.csv", "r", newline="") as f_in, open("out.csv", "w", newline="") as f_out:
writer = csv.writer(f_out, delimiter="|") # create a CSV writer
for row in csv.reader(f_in, delimiter="|"): # iterate over a CSV reader
writer.writerow(c for c in filter_elements(row)) # filter + write to the out.csv
For the sample data you posted, this will produce an out.csv containing:
1277|2013-12-17 16:00:00|100|+
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100|D
Answer 2 (score: 0)
You can use a regular expression to extract the parts you need:
import re

pattern = re.compile(r'^(.* \d+:\d+:\d+(?=\|))\|(\(\S\)(?=|))?.*?(\d+)\|(\++)?.*?\|(\S)?$')

with open('data.csv', 'r') as infile:
    with open('result.csv', 'w') as outfile:
        for line in infile:
            outfile.write('|'.join(str(x) for x in pattern.match(line).groups() if x) + '\n')
This will result in:
1277|2013-12-17 16:00:00|100|+
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|C
1378|2014-01-21 20:00:00|(B)|100|D
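Roughly, the pattern breaks down as follows; this re.VERBOSE rewrite is equivalent to the one-liner above, and the comments are an interpretation of what each group captures:
import re

pattern = re.compile(r"""
    ^(.*\ \d+:\d+:\d+(?=\|))  # group 1: the id and the timestamp, up to the next "|"
    \|(\(\S\)(?=|))?          # group 2: the first "(B)"/"(D)"-style marker, if any
    .*?(\d+)\|                # group 3: the numeric value, lazily skipping later markers
    (\++)?                    # group 4: the first run of "+" signs, if any
    .*?\|(\S)?$               # group 5: the single trailing letter, if any
    """, re.VERBOSE)          # the space is escaped (\ ) because VERBOSE ignores bare whitespace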
If you want to post-process the output, it is probably better to keep the number of elements per line constant instead of skipping the empty ones. To do that, you can replace the last line with:
outfile.write('|'.join(str(x) for x in pattern.match(line).groups()) + '\n')
This will produce as output:
1277|2013-12-17 16:00:00|None|100|+|None
1360|2014-01-15 16:00:00|(B)|99|++|E
1402|2014-02-05 20:00:00|(D)|99|++|D
1360|2014-01-29 08:00:00|(D)|99|None|C
1378|2014-01-21 20:00:00|(B)|100|None|D
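If the literal None placeholders then get in the way, a small variation of that line (assuming empty fields are the preferred representation) writes them out as empty columns instead:
outfile.write('|'.join(x or '' for x in pattern.match(line).groups()) + '\n')
The first sample row then becomes 1277|2013-12-17 16:00:00||100|+| rather than containing None.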
EDIT:
To also catch lines like this one:
325|2014-01-18 20:00:00|(B)|93|++|+||Calme
the pattern can be modified to:
pattern = re.compile(r'^(.* \d+:\d+:\d+(?=\|))\|(\(\S\)(?=|))?.*?(\d+)\|(\++)?.*\|(\S+)?\s*?$')
You can quickly verify it here, along with the remaining lines on which it originally failed.