使用特定条件对 pandas DataFrame 进行排序

时间:2019-06-06 21:05:00

标签: python pandas

SO社区

我正在用 Python 编写代码,需要创建一个函数,它的输入是:一个按预期工资(expected_wage)排序的 DataFrame、一个约束(Broad 广泛、Targeted 有针对性或 Diverse 多样化)、一个区域(1-13)和一个级别(1-2)。

数据示例为:

     MajorID Area_ID Level_ID expected_wage
2655    52  1   1   0.907616
2621    18  11  2   0.776567
2652    49  1   2   0.730930
2608    5   10  1   0.628810
2644    41  1   1   0.505208
2659    56  2   1   0.503492
2617    14  11  2   0.471512
2667    64  3   1   0.445349
2704    101 7   2   0.436971
2673    70  3   2   0.412259
2612    9   11  1   0.411015
2717    114 9   1   0.408277
2653    50  1   2   0.407092

我想从此处返回基于约束的3个数据帧。 例如,广泛约束定义为:

  • DataFrame1 :首先是区域内且级别内的行(组内按较高的预期工资在前),然后是区域内但级别外的行,然后是区域外但级别内的行,最后是区域外且级别外的行。
  • DataFrame2 :首先是区域外但级别内的行,然后是区域外且级别外的行,然后是区域内但级别外的行,最后是区域内且级别内的行。
  • DataFrame3 :首先是区域外且级别外的行,然后是区域外但级别内的行,然后是区域内但级别外的行,最后是区域内且级别内的行。

实际上,我正在使用此功能

def _ordered_major_ids(sample_data, masks):
    """Return the ``MajorID`` column with its rows regrouped in mask order.

    Rows selected by the first mask come first, then the rows of the second
    mask, and so on. Inside each group the row order of *sample_data* is kept.
    """
    import numpy as np

    # Work with positions rather than labels so that a non-unique index
    # cannot duplicate rows when the groups are stitched back together.
    positions = np.concatenate([np.flatnonzero(mask.values) for mask in masks])
    return sample_data['MajorID'].iloc[positions].astype(int)


def sortOptions(sample_data, constrain, area, level):
    """Build the prioritised lists of majors for a given area and level.

    Every row of *sample_data* falls into exactly one of four groups,
    depending on whether its ``Area_ID`` equals *area* and whether its
    ``Level_ID`` equals *level*. Each returned Series lists all ``MajorID``
    values, group after group, in a different group priority. Rows keep
    their input order inside a group, so if the caller passes a frame sorted
    by ``expected_wage`` (descending), each group stays sorted that way.

    Parameters
    ----------
    sample_data : pandas.DataFrame
        Must contain the columns ``MajorID``, ``Area_ID`` and ``Level_ID``.
    constrain : str
        ``'Broad'`` is the only implemented constraint.
    area, level
        Values compared for equality against ``Area_ID`` / ``Level_ID``.

    Returns
    -------
    tuple of four pandas.Series
        ``MajorID`` as integers, keeping the original index labels:

        1. in area & level, in area only, in level only, neither;
        2. in level only, neither, in area only, in area & level;
        3. same ordering as 2 (the original code duplicated that section);
        4. neither, in level only, in area only, in area & level.

    Raises
    ------
    NotImplementedError
        For ``'Targeted'`` and ``'Diverse'``, which have no implementation.
    ValueError
        For any other unknown constraint.
    """
    if constrain in ('Targeted', 'Diverse'):
        # Previously these branches fell through to an unbound-variable
        # error at the return statement; fail with an explicit reason instead.
        raise NotImplementedError(
            "Constrain '{0}' is not implemented yet".format(constrain))
    if constrain != 'Broad':
        raise ValueError("Constrain is not define. It must be Broad, Diverse or Targeted")

    in_area = sample_data['Area_ID'] == area
    in_level = sample_data['Level_ID'] == level

    # Four mutually exclusive groups that together cover every row.
    # Boolean masks (instead of isin()/dropna()) keep integer dtypes intact
    # and do not discard rows that contain missing values.
    area_level = in_area & in_level
    area_only = in_area & ~in_level
    level_only = ~in_area & in_level
    neither = ~in_area & ~in_level

    opts1 = _ordered_major_ids(
        sample_data, [area_level, area_only, level_only, neither])
    opts2 = _ordered_major_ids(
        sample_data, [level_only, neither, area_only, area_level])
    # Independent copy so callers may mutate one result without the other.
    opts3 = opts2.copy()
    opts4 = _ordered_major_ids(
        sample_data, [neither, level_only, area_only, area_level])

    return opts1, opts2, opts3, opts4

它返回了我想要的结果,但一定有更优雅、更高效的方法来实现它!

# Sample input, one tuple per major: (MajorID, Area_ID, Level_ID, expected_wage).
# Rows are listed from the highest expected wage to the lowest.
df = pd.DataFrame(
    [
        (52, 1, 1, 0.907616),
        (18, 11, 2, 0.776567),
        (49, 1, 2, 0.730930),
        (5, 10, 1, 0.628810),
        (41, 1, 1, 0.505208),
        (56, 2, 1, 0.503492),
        (14, 11, 2, 0.471512),
        (64, 3, 1, 0.445349),
        (101, 7, 2, 0.436971),
        (70, 3, 2, 0.412259),
        (9, 11, 1, 0.411015),
        (114, 9, 1, 0.408277),
        (50, 1, 2, 0.407092),
    ],
    columns=['MajorID', 'Area_ID', 'Level_ID', 'expected_wage'],
)

# Request the "Broad" orderings for area 2, level 1.
opts1, opts2, opts3, opts4 = sortOptions(df, "Broad", 2, 1)

opts1
MajorID Area_ID Level_ID
5   56  2   1
0   52  1   1
3   5   10  1
4   41  1   1
7   64  3   1
10  9   11  1
11  114 9   1
1   18  11  2
2   49  1   2
6   14  11  2
8   101 7   2
9   70  3   2
12  50  1   2

opts2
    MajorID Area_ID Level_ID
1   18  11  2
2   49  1   2
6   14  11  2
8   101 7   2
9   70  3   2
12  50  1   2
5   56  2   1
0   52  1   1
3   5   10  1
4   41  1   1
7   64  3   1
10  9   11  1
11  114 9   1

0 个答案:

没有答案