通过正则表达式函数在熊猫列中进行字符串标记

时间:2018-08-03 21:30:06

标签: python regex pandas

首先,我有一个很大的数据集,这就是为什么我通过https://drive.google.com/open?id=1qUh2yZCWnCOoYy5SstIBzr9uqrjfWx6y共享它的原因 此链接供您尝试代码。

我使用以下代码的目的是获取birth_date,birth_place和death_date值旁边的信息,然后在句子中搜索这些值以替换如下标记;

INPUT DATA
Column1                                                                Column2
his name is Ali. Ali born on 08 August 1985, in Kadikoy, Istanbul      birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul

OUTPUT DATA
Column1                                                                                                                                                    Column2
his name is Ali. Ali born on [bbdate]08[bedate] [bbdate]August[bedate] [bbdate]1985[bedate], in [bbplace]Kadikoy[beplace], [bbplace]Istanbul[beplace]      birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul

为实现这一点,我在下面的代码中编写了代码。我在使用此代码时遇到一些性能问题。如果您能提供更快的替代方法,我将感到很高兴。

下面的代码也会产生一些错误的标记,例如下面的标记。它添加了一些针对某些行重复的标志,这些标志在其原始来源与其他行相比没有任何特定差异。

OUTPUT DATA
Column1                                                                                                                                                    Column2
his name is Ali. Ali born on [bbdate]08[bedate] [bbdate][bbdate]August[bedate][bedate] [bbdate]1985[bedate], in [bbplace]Kadikoy[beplace], [bbplace][bbplace]Istanbul[beplace][beplace]        birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul

df = pd.read_csv('test_article_consolidated.csv',sep=';',names = ["Wiki_ID", "Wiki_Title", "First_Sentence","Info_Box"])

def replace_birth_date (crh_extract,x,dataframe):
    bbdate = "[bbdate]"
    ebdate = "[ebdate]"
    crh_find = crh_extract
    crh_replace = bbdate + str(crh_extract) + ebdate
    dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)

def replace_birth_place (crh_extract,x,dataframe):
    bbplace = "[bbplace]"
    ebplace = "[ebplace]"
    crh_find = crh_extract
    crh_replace = bbplace + str(crh_extract) + ebplace
    dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)

def replace_death_date (crh_extract,x,dataframe):
    bddate = "[bddate]"
    eddate = "[eddate]"
    crh_find = crh_extract
    crh_replace = bddate + str(crh_extract) + eddate
    dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)

def month_check(crh_extract):
    m = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12
    }
    s = crh_extract.strip()[:3].lower()
    if m.__contains__(s):
        return True
    else:
        return False



for x in range(df.shape[0]):
if re.search(r"\w*birth_date:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*birth_date:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_date:\b", df['Info_Box'][x]).end()
    crh_extract =df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_birth_date(crh_extract,x,df)
        if int(crh_extract) < 2100 and int(crh_extract) >=32:
            replace_birth_date(crh_extract,x,df)
    if month_check(crh_extract):
        replace_birth_date(crh_extract, x, df)

if re.search(r"\w*birth_date_1:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*birth_date_1:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_date_1:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_birth_date(crh_extract,x,df)
        if int(crh_extract) < 2100 and int(crh_extract) >=32:
            replace_birth_date(crh_extract,x,df)
    if month_check(crh_extract):
        replace_birth_date(crh_extract, x, df)

if re.search(r"\w*birth_date_2:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*birth_date_2:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_date_2:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_birth_date(crh_extract, x, df)
        if int(crh_extract) < 2100 and int(crh_extract) >= 32:
            replace_birth_date(crh_extract, x, df)
    if month_check(crh_extract):
        replace_birth_date(crh_extract, x, df)

if re.search(r"\w*birth_date_3:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*birth_date_3:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_date_3:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_birth_date(crh_extract, x, df)
        if int(crh_extract) < 2100 and int(crh_extract) >= 32:
            replace_birth_date(crh_extract, x, df)
    if month_check(crh_extract):
        replace_birth_date(crh_extract, x, df)

if re.search(r"\w*birth_place:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_1:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_1:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_1:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_2:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_2:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_2:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_3:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_3:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_3:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_4:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_4:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_4:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_5:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_5:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_5:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric() != True:
        replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_6:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_6:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_6:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_7:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_7:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_7:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*birth_place_8:\b", df['Info_Box'][x]):

    finish = re.search(r"\w*birth_place_8:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*birth_place_8:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric()!= True:
            replace_birth_place(crh_extract, x, df)

if re.search(r"\w*death_date:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*death_date:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*death_date:\b", df['Info_Box'][x]).end()
    crh_extract =df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_death_date(crh_extract,x,df)
        if int(crh_extract) < 2100 and int(crh_extract) >=32:
            replace_death_date(crh_extract,x,df)
    if month_check(crh_extract):
        replace_death_date(crh_extract, x, df)

if re.search(r"\w*death_date_1:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*death_date_1:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*death_date_1:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_death_date(crh_extract,x,df)
        if int(crh_extract) < 2100 and int(crh_extract) >=32:
            replace_death_date(crh_extract,x,df)
    if month_check(crh_extract):
        replace_death_date(crh_extract, x, df)

if re.search(r"\w*death_date_2:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*death_date_2:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*death_date_2:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_death_date(crh_extract, x, df)
        if int(crh_extract) < 2100 and int(crh_extract) >= 32:
            replace_death_date(crh_extract, x, df)
    if month_check(crh_extract):
        replace_death_date(crh_extract, x, df)

if re.search(r"\w*death_date_3:\b",df['Info_Box'][x]):

    finish = re.search(r"\w*death_date_3:\w*", df['Info_Box'][x]).end()
    start = re.search(r"\w*death_date_3:\b", df['Info_Box'][x]).end()
    crh_extract = df['Info_Box'][x][start:(finish)]

    if crh_extract.isnumeric():
        if int(crh_extract) <= 31:
            replace_death_date(crh_extract, x, df)
        if int(crh_extract) < 2100 and int(crh_extract) >= 32:
            replace_death_date(crh_extract, x, df)
    if month_check(crh_extract):
        replace_death_date(crh_extract, x, df)

0 个答案:

没有答案