如何在行中添加值?

时间:2016-05-26 11:28:23

标签: python pandas

如何向行添加值

  1. 我在数据框中创建了一列,并将值赋值为0。
  2. 我编写了用于更新该列值的逻辑,但更新并没有反映到数据框中。
  3. 输入:

    >>> parafix_df = main_df[["line_width", "para_num", "bbox" ]]
    >>> parafix_df
       line_width para_num                             bbox
    0     238.546      NaN  (50.0, 579.3, 288.546, 598.022)
    1         318        1    (64.0, 564.9, 382.0, 583.622)
    2         332        2    (50.0, 550.5, 382.0, 569.222)
    3         332        2    (50.0, 536.1, 382.0, 554.822)
    4     328.977        2  (50.0, 521.7, 378.977, 540.422)
    5         318        3    (64.0, 507.3, 382.0, 526.022)
    6         332        3    (50.0, 492.9, 382.0, 511.622)
    7         332        3    (50.0, 478.5, 382.0, 497.222)
    8         332        3    (50.0, 464.1, 382.0, 482.822)
    9         332        3    (50.0, 449.7, 382.0, 468.422)
    10      59.04        3   (50.0, 435.3, 109.04, 454.022)
    11    304.007        4  (64.0, 420.9, 368.007, 439.622)
    12        318        5    (64.0, 406.5, 382.0, 425.222)
    13        332        5    (50.0, 392.1, 382.0, 410.822)
    14        332        5    (50.0, 377.7, 382.0, 396.422)
    15        332        5    (50.0, 363.3, 382.0, 382.022)
    16     43.252        5   (50.0, 348.9, 93.252, 367.622)
    17        318        6    (64.0, 334.5, 382.0, 353.222)
    18        332        6    (50.0, 320.1, 382.0, 338.822)
    19        332        6    (50.0, 305.7, 382.0, 324.422)
    20        332        6    (50.0, 291.3, 382.0, 310.022)
    21        332        6    (50.0, 276.9, 382.0, 295.622)
    22     317.02        6   (50.0, 262.5, 367.02, 281.222)
    23        318        7    (64.0, 248.1, 382.0, 266.822)
    24        332        7    (50.0, 233.7, 382.0, 252.422)
    25     47.014        7   (50.0, 219.3, 97.014, 238.022)
    26        318        8    (64.0, 204.9, 382.0, 223.622)
    27    316.723        8  (50.0, 190.5, 366.723, 209.222)
    28        318        9    (64.0, 176.1, 382.0, 194.822)
    29    326.766        9  (50.0, 161.7, 376.766, 180.422)
    30        318       10    (64.0, 147.3, 382.0, 166.022)
    31        332       10    (50.0, 132.9, 382.0, 151.622)
    32        332       10    (50.0, 118.5, 382.0, 137.222)
    33    305.393       11  (64.0, 104.1, 369.393, 122.822)
    34        318       12     (64.0, 89.7, 382.0, 108.422)
    35        318       13      (64.0, 75.3, 382.0, 94.022)
    36    319.165       13    (50.0, 60.9, 369.165, 79.622)
    37    308.165       14    (64.0, 46.5, 372.165, 65.222)
    38        318       15      (64.0, 32.1, 382.0, 50.822)
    39    329.153       15    (50.0, 17.7, 379.153, 36.422)
    40        318       16       (64.0, 3.3, 382.0, 22.022)
    41    324.335       16    (50.0, -11.1, 374.335, 7.622)
    

    代码

    parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
    parafix_df["new_para_num"] = 0
    
    max_width = parafix_df['line_width'].max()
    bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]
    
    previous = None
    para1 = 1
    for current, next in izip(parafix_df.iterrows(), parafix_df.iloc[1:].iterrows()):
        if previous==None:
            current[1]["new_para_num"] = para1
        else:
            bbox_current = current[1]["bbox"]
            bbox_next = next[1]["bbox"]
            bbox_previous = previous[1]["bbox"]
            if bbox_current[0]>bbox_max_width[0]:
                para1 += 1
                print "para1:", para1
            current[1]["new_para_num"] = para1
    
        previous = current
    

    上述代码的输出:

                                  bbox  new_para_num  
    0   (50.0, 579.3, 288.546, 598.022)             0  
    1     (64.0, 564.9, 382.0, 583.622)             0  
    2     (50.0, 550.5, 382.0, 569.222)             0  
    3     (50.0, 536.1, 382.0, 554.822)             0  
    4   (50.0, 521.7, 378.977, 540.422)             0  
    5     (64.0, 507.3, 382.0, 526.022)             0  
    6     (50.0, 492.9, 382.0, 511.622)             0  
    7     (50.0, 478.5, 382.0, 497.222)             0  
    8     (50.0, 464.1, 382.0, 482.822)             0  
    9     (50.0, 449.7, 382.0, 468.422)             0  
    10   (50.0, 435.3, 109.04, 454.022)             0  
    11  (64.0, 420.9, 368.007, 439.622)             0  
    12    (64.0, 406.5, 382.0, 425.222)             0  
    13    (50.0, 392.1, 382.0, 410.822)             0  
    14    (50.0, 377.7, 382.0, 396.422)             0  
    15    (50.0, 363.3, 382.0, 382.022)             0  
    16   (50.0, 348.9, 93.252, 367.622)             0  
    17    (64.0, 334.5, 382.0, 353.222)             0  
    18    (50.0, 320.1, 382.0, 338.822)             0  
    19    (50.0, 305.7, 382.0, 324.422)             0  
    20    (50.0, 291.3, 382.0, 310.022)             0  
    21    (50.0, 276.9, 382.0, 295.622)             0  
    22   (50.0, 262.5, 367.02, 281.222)             0  
    23    (64.0, 248.1, 382.0, 266.822)             0  
    24    (50.0, 233.7, 382.0, 252.422)             0  
    25   (50.0, 219.3, 97.014, 238.022)             0  
    26    (64.0, 204.9, 382.0, 223.622)             0  
    27  (50.0, 190.5, 366.723, 209.222)             0  
    28    (64.0, 176.1, 382.0, 194.822)             0  
    29  (50.0, 161.7, 376.766, 180.422)             0  
    30    (64.0, 147.3, 382.0, 166.022)             0  
    31    (50.0, 132.9, 382.0, 151.622)             0  
    32    (50.0, 118.5, 382.0, 137.222)             0  
    33  (64.0, 104.1, 369.393, 122.822)             0  
    34     (64.0, 89.7, 382.0, 108.422)             0  
    35      (64.0, 75.3, 382.0, 94.022)             0  
    36    (50.0, 60.9, 369.165, 79.622)             0  
    37    (64.0, 46.5, 372.165, 65.222)             0  
    38      (64.0, 32.1, 382.0, 50.822)             0  
    39    (50.0, 17.7, 379.153, 36.422)             0  
    40       (64.0, 3.3, 382.0, 22.022)             0  
    41    (50.0, -11.1, 374.335, 7.622)             0  
    

    但我希望得到新的 para 值(即下面打印出的 para1 的值):

    para1: 2
    para1: 3
    para1: 4
    para1: 5
    para1: 6
    para1: 7
    para1: 8
    para1: 9
    para1: 10
    para1: 11
    para1: 12
    para1: 13
    para1: 14
    para1: 15
    para1: 16
    

    你能帮助我吗?

    以下是我最终可以正常运行的代码:

    parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
    parafix_df["new_para_num"] = 0
    
    max_width = parafix_df['line_width'].max()
    bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]
    
    para1 = 1
    for indx, current in enumerate(parafix_df.iterrows(), start=0):
        if indx!=0:
            bbox_current = current[1]["bbox"]
            if bbox_current[0]>bbox_max_width[0]:
                para1 += 1
        parafix_df.iloc[indx, 4] = para1
    

    这段代码还能进一步优化吗?

1 个答案:

答案 0 :(得分:1)

**更新:**

如果我理解正确(IIUC),你可以这样做:

df.new_para_num = 1

In [210]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'].cumsum() + 1
Out[210]:
2      2
3      3
6      4
7      5
8      6
9      7
13     8
14     9
15    10
18    11
19    12
20    13
21    14
24    15
31    16
32    17
Name: new_para_num, dtype: int64

如果您想有条件地更新原始DF中的new_para_num列:

In [223]: df.new_para_num = 1

In [224]: selected = df.loc[df.line_width == df.line_width.max()].copy()

In [226]: selected.new_para_num = selected.new_para_num.cumsum() + 1

In [227]: selected
Out[227]:
    line_width  para_num                           bbox  new_para_num
2        332.0       2.0  [50.0, 550.5, 382.0, 569.222]             2
3        332.0       2.0  [50.0, 536.1, 382.0, 554.822]             3
6        332.0       3.0  [50.0, 492.9, 382.0, 511.622]             4
7        332.0       3.0  [50.0, 478.5, 382.0, 497.222]             5
8        332.0       3.0  [50.0, 464.1, 382.0, 482.822]             6
9        332.0       3.0  [50.0, 449.7, 382.0, 468.422]             7
13       332.0       5.0  [50.0, 392.1, 382.0, 410.822]             8
14       332.0       5.0  [50.0, 377.7, 382.0, 396.422]             9
15       332.0       5.0  [50.0, 363.3, 382.0, 382.022]            10
18       332.0       6.0  [50.0, 320.1, 382.0, 338.822]            11
19       332.0       6.0  [50.0, 305.7, 382.0, 324.422]            12
20       332.0       6.0  [50.0, 291.3, 382.0, 310.022]            13
21       332.0       6.0  [50.0, 276.9, 382.0, 295.622]            14
24       332.0       7.0  [50.0, 233.7, 382.0, 252.422]            15
31       332.0      10.0  [50.0, 132.9, 382.0, 151.622]            16
32       332.0      10.0  [50.0, 118.5, 382.0, 137.222]            17

In [228]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'] = selected

In [229]: df
Out[229]:
    line_width  para_num                             bbox  new_para_num
0      238.546       NaN  [50.0, 579.3, 288.546, 598.022]             1
1      318.000       1.0    [64.0, 564.9, 382.0, 583.622]             1
2      332.000       2.0    [50.0, 550.5, 382.0, 569.222]             2
3      332.000       2.0    [50.0, 536.1, 382.0, 554.822]             3
4      328.977       2.0  [50.0, 521.7, 378.977, 540.422]             1
5      318.000       3.0    [64.0, 507.3, 382.0, 526.022]             1
6      332.000       3.0    [50.0, 492.9, 382.0, 511.622]             4
7      332.000       3.0    [50.0, 478.5, 382.0, 497.222]             5
8      332.000       3.0    [50.0, 464.1, 382.0, 482.822]             6
9      332.000       3.0    [50.0, 449.7, 382.0, 468.422]             7
10      59.040       3.0   [50.0, 435.3, 109.04, 454.022]             1
11     304.007       4.0  [64.0, 420.9, 368.007, 439.622]             1
12     318.000       5.0    [64.0, 406.5, 382.0, 425.222]             1
13     332.000       5.0    [50.0, 392.1, 382.0, 410.822]             8
14     332.000       5.0    [50.0, 377.7, 382.0, 396.422]             9
15     332.000       5.0    [50.0, 363.3, 382.0, 382.022]            10
16      43.252       5.0   [50.0, 348.9, 93.252, 367.622]             1
17     318.000       6.0    [64.0, 334.5, 382.0, 353.222]             1
18     332.000       6.0    [50.0, 320.1, 382.0, 338.822]            11
19     332.000       6.0    [50.0, 305.7, 382.0, 324.422]            12
20     332.000       6.0    [50.0, 291.3, 382.0, 310.022]            13
21     332.000       6.0    [50.0, 276.9, 382.0, 295.622]            14
22     317.020       6.0   [50.0, 262.5, 367.02, 281.222]             1
23     318.000       7.0    [64.0, 248.1, 382.0, 266.822]             1
24     332.000       7.0    [50.0, 233.7, 382.0, 252.422]            15
25      47.014       7.0   [50.0, 219.3, 97.014, 238.022]             1
26     318.000       8.0    [64.0, 204.9, 382.0, 223.622]             1
27     316.723       8.0  [50.0, 190.5, 366.723, 209.222]             1
28     318.000       9.0    [64.0, 176.1, 382.0, 194.822]             1
29     326.766       9.0  [50.0, 161.7, 376.766, 180.422]             1
30     318.000      10.0    [64.0, 147.3, 382.0, 166.022]             1
31     332.000      10.0    [50.0, 132.9, 382.0, 151.622]            16
32     332.000      10.0    [50.0, 118.5, 382.0, 137.222]            17
33     305.393      11.0  [64.0, 104.1, 369.393, 122.822]             1
34     318.000      12.0     [64.0, 89.7, 382.0, 108.422]             1
35     318.000      13.0      [64.0, 75.3, 382.0, 94.022]             1
36     319.165      13.0    [50.0, 60.9, 369.165, 79.622]             1
37     308.165      14.0    [64.0, 46.5, 372.165, 65.222]             1
38     318.000      15.0      [64.0, 32.1, 382.0, 50.822]             1
39     329.153      15.0    [50.0, 17.7, 379.153, 36.422]             1
40     318.000      16.0       [64.0, 3.3, 382.0, 22.022]             1
41     324.335      16.0    [50.0, -11.1, 374.335, 7.622]             1

PS:不过我仍然不确定自己是否正确理解了你的目标。

旧回答:

您可以使用shift函数来访问上一行和下一行:

df.shift(-1)  # df will be shifted one row backwards (will show `next` row) 

df.shift(1)  # df will be shifted one row forwards (will show `prev` row)

示例:

In [142]: df
Out[142]:
   a  b  c
0  8  3  0
1  8  3  4
2  9  4  1
3  2  1  8
4  5  6  3

In [147]: df['prev_a'] = df.a.shift(1)

In [148]: df['next_a'] = df.a.shift(-1)

In [149]: df
Out[149]:
   a  b  c  prev_a  next_a
0  8  3  0     NaN     8.0
1  8  3  4     8.0     9.0
2  9  4  1     8.0     2.0
3  2  1  8     9.0     5.0
4  5  6  3     2.0     NaN