我想在熊猫中做一些我可以在熊猫之外做的事情(下面的代码),但它的可读性很差。
目标:以10行或更少的最大间隔对列表(或DataFrame)列表中的行进行子采样,具体取决于“状态”列的值是否更改。此外,这应该针对dtype
列的列值'a'和'b'单独完成。
重现预期输出的代码:
# input (list of list, but could be converted to DataFrame)
# columns: 1:index, 2:state, 3:dtype, 4:value.
x = [
[1, 0, 'b', 93.8],
[2, 0, 'b', 97.4],
[3, 0, 'b', 76.1],
[4, 0, 'b', 21.1],
[5, 0, 'b', 65.7],
[6, 0, 'b', 90.8],
[7, 0, 'b', 63.8],
[8, 0, 'b', 82.9],
[9, 0, 'b', 19.8],
[10, 0, 'b', 10.2],
[11, 0, 'b', 1.3],
[12, 1, 'b', 37.6],
[13, 0, 'b', 18.2],
[14, 0, 'b', 16.9],
[15, 0, 'b', 95.6],
[16, 1, 'b', 23.7],
[17, 0, 'b', 54.1],
[18, 0, 'b', 99.0],
[19, 0, 'b', 16.3],
[20, 0, 'a', 80.7],
[21, 0, 'a', 23.1],
[22, 0, 'a', 96.6],
[23, 0, 'a', 56.7],
[24, 0, 'a', 45.3],
[25, 1, 'a', 58.0],
[26, 0, 'a', 49.9],
[27, 0, 'a', 91.3],
[28, 0, 'b', 60.2],
[29, 0, 'b', 76.8],
[30, 0, 'b', 45.3],
[31, 0, 'b', 69.6],
[32, 0, 'b', 99.0],
[33, 0, 'b', 29.5],
[34, 0, 'b', 11.0],
[35, 0, 'b', 68.9],
[36, 0, 'b', 75.8],
[37, 1, 'b', 89.8],
[38, 0, 'b', 57.7],
[39, 1, 'b', 20.3],
[40, 0, 'b', 98.6],
[41, 0, 'b', 96.7],
[42, 0, 'b', 17.9],
[43, 1, 'b', 14.6],
[44, 0, 'b', 92.5],
[45, 0, 'b', 33.6],
[46, 1, 'b', 58.9],
[47, 1, 'b', 71.9],
[48, 0, 'b', 74.9],
[49, 0, 'b', 43.3],
[50, 1, 'b', 29.5],
[51, 0, 'b', 24.6],
[52, 0, 'b', 2.3],
[53, 0, 'b', 19.1],
[54, 0, 'b', 31.6],
[55, 0, 'b', 80.6],
[56, 0, 'b', 3.2],
[57, 0, 'b', 58.5],
[58, 1, 'b', 30.2],
[59, 1, 'b', 29.1],
[60, 0, 'b', 47.6],
[61, 0, 'b', 76.4],
[62, 0, 'b', 21.6],
[63, 0, 'b', 82.7],
[64, 0, 'b', 0.2],
[65, 0, 'b', 9.4],
[66, 0, 'b', 75.1],
[67, 0, 'b', 33.8],
[68, 0, 'b', 82.0],
[69, 0, 'b', 56.9],
[70, 0, 'b', 62.5],
[71, 0, 'b', 53.5],
[72, 0, 'b', 7.0],
[73, 0, 'a', 37.4],
[74, 0, 'a', 88.8],
[75, 0, 'a', 46.4],
[76, 0, 'a', 86.3],
[77, 0, 'a', 54.3],
[78, 0, 'b', 23.4],
[79, 0, 'b', 1.1],
[80, 0, 'b', 78.5],
[81, 0, 'b', 39.1],
[82, 1, 'b', 79.0],
[83, 0, 'b', 41.0],
[84, 0, 'b', 40.3],
[85, 0, 'a', 66.5],
[86, 0, 'a', 66.8],
[87, 0, 'a', 86.8],
[88, 1, 'b', 96.9],
[89, 0, 'b', 2.1],
[90, 0, 'b', 46.3],
[91, 0, 'b', 28.9],
[92, 0, 'b', 43.2],
[93, 0, 'b', 58.9],
[94, 0, 'b', 60.6],
[95, 0, 'b', 15.4],
[96, 0, 'b', 69.4],
[97, 1, 'b', 18.4],
[98, 0, 'b', 41.3],
[99, 0, 'b', 40.5]
]
]
代码重新取样 x
state
'a'和'b':
def resample(x, log_interval, dtype):
if not x:
return
red = []
prev_state, next_val, last_val = 0, 0, 0
for row in x:
if row[2] == dtype:
if row[0] >= next_val or row[1] != prev_state and row[0] > last_val:
red.append(row)
prev_state = row[1]
next_val = row[0] + log_interval
last_val = row[0]
return red
red_a = resample(x, 10, 'a')
red_b = resample(x, 10, 'b')
red_a
和red_b
的预期结果:
red_a = [
[20, 0, a, 80.7],
[25, 1, a, 58.0],
[26, 0, a, 49.9],
[73, 0, a, 37.4],
[85, 0, a, 66.5]
]
red_b = [
[1, 0, b, 93.8],
[11, 0, b, 1.3],
[12, 1, b, 37.6],
[13, 0, b, 18.2],
[16, 1, b, 23.7],
[17, 0, b, 54.1],
[28, 0, b, 60.2],
[37, 1, b, 89.8],
[38, 0, b, 57.7],
[39, 1, b, 20.3],
[40, 0, b, 98.6],
[43, 1, b, 14.6],
[44, 0, b, 92.5],
[46, 1, b, 58.9],
[48, 0, b, 74.9],
[50, 1, b, 29.5],
[51, 0, b, 24.6],
[58, 1, b, 30.2],
[60, 0, b, 47.6],
[70, 0, b, 62.5],
[80, 0, b, 78.5],
[82, 1, b, 79.0],
[83, 0, b, 41.0],
[88, 1, b, 96.9],
[89, 0, b, 2.1],
[97, 1, b, 18.4],
[98, 0, b, 41.3]
]
我怎么能在熊猫中做到这一点?
一个起点是:
columns = ['ind', 'state', 'dtype', 'value']
df = pd.DataFrame(x, columns=columns)
但是,如果我尝试for循环,它会非常慢(例如for row in df: ...
)。
知道如何从这里开始吗?
答案 0 :(得分:1)
首先从using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Microsoft.AspNetCore.SpaServices.AngularCli;
namespace AngularSPA
{
public class Startup
{
public Startup(IConfiguration configuration)
{
Configuration = configuration;
}
public IConfiguration Configuration { get; }
// This method gets called by the runtime. Use this method to add services to the container.
public void ConfigureServices(IServiceCollection services)
{
services.AddMvc();
// In production, the Angular files will be served from this directory
services.AddSpaStaticFiles(configuration =>
{
configuration.RootPath = "ClientApp/dist";
});
}
// This method gets called by the runtime. Use this method to configure the HTTP request pipeline.
public void Configure(IApplicationBuilder app, IHostingEnvironment env)
{
if (env.IsDevelopment())
{
app.UseDeveloperExceptionPage();
}
else
{
app.UseExceptionHandler("/Home/Error");
}
app.UseStaticFiles();
app.UseSpaStaticFiles();
app.UseMvc(routes =>
{
routes.MapRoute(
name: "default",
template: "{controller}/{action=Index}/{id?}");
});
app.UseSpa(spa =>
{
spa.Options.SourcePath = "ClientApp";
if (env.IsDevelopment())
{
spa.UseAngularCliServer(npmScript: "start");
}
});
}
}
}
开始,您首先可以创建两个DF(df = pd.DataFrame(x, columns=['ind', 'state', 'dtype', 'value'])
和df_a
)来选择状态,例如:
df_b
然后你创建一个函数df_a = df[df['dtype'] =='a'].copy()
df_b = df[df['dtype'] =='b'].copy()
,你将select_row
到这些DF:
apply
现在,您可以使用布尔值在def select_row( row, log_interval):
# using global varaibles might be a bit dangerous but I didn't find another way
global prev_state, next_val, last_val
# Here your conditions
if (row['ind'] >= next_val) or (row['state'] != prev_state and row['ind'] > last_val):
# change the values of the global variables
prev_state = row['state']
next_val = row['ind'] + log_interval
last_val = row['ind']
return True # return True if your condition is met
else: # return False otherwise
return False
和df_a
中创建列,例如:
df_b
最后,您可以通过在log_interval = 10
prev_state, next_val, last_val = 0, 0, 0
df_a['bool'] = df_a.apply(select_row, args = ([log_interval ]), axis = 1)
#same for df_b but don't forget to reset your global values
prev_state, next_val, last_val = 0, 0, 0
df_b['bool'] = df_b.apply(select_row, args = ([log_interval ]), axis = 1)
列中选择df_a
(和df_b
)行True
并删除此列来创建您的两个输出:
'bool'