首先,我有一个很大的数据集,这就是为什么我通过https://drive.google.com/open?id=1qUh2yZCWnCOoYy5SstIBzr9uqrjfWx6y共享它的原因 此链接供您尝试代码。
我使用以下代码的目的是获取birth_date,birth_place和death_date值旁边的信息,然后在句子中搜索这些值以替换如下标记;
INPUT DATA
Column1 Column2
his name is Ali. Ali born on 08 August 1985, in Kadikoy, Istanbul birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul
OUTPUT DATA
Column1 Column2
his name is Ali. Ali born on [bbdate]08[bedate] [bbdate]August[bedate] [bbdate]1985[bedate], in [bbplace]Kadikoy[beplace], [bbplace]Istanbul[beplace] birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul
为实现这一点,我在下面的代码中编写了代码。我在使用此代码时遇到一些性能问题。如果您能提供更快的替代方法,我将感到很高兴。
下面的代码也会产生一些错误的标记,例如下面的标记。它添加了一些针对某些行重复的标志,这些标志在其原始来源与其他行相比没有任何特定差异。
OUTPUT DATA
Column1 Column2
his name is Ali. Ali born on [bbdate]08[bedate] [bbdate][bbdate]August[bedate][bedate] [bbdate]1985[bedate], in [bbplace]Kadikoy[beplace], [bbplace][bbplace]Istanbul[beplace][beplace] birth_date_1:08 \t birth_date_2:august \t birth_date_3:1985 \t birth_place_1:kadikoy \t birth_place_2:istanbul
。
df = pd.read_csv('test_article_consolidated.csv',sep=';',names = ["Wiki_ID", "Wiki_Title", "First_Sentence","Info_Box"])
def replace_birth_date (crh_extract,x,dataframe):
bbdate = "[bbdate]"
ebdate = "[ebdate]"
crh_find = crh_extract
crh_replace = bbdate + str(crh_extract) + ebdate
dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)
def replace_birth_place (crh_extract,x,dataframe):
bbplace = "[bbplace]"
ebplace = "[ebplace]"
crh_find = crh_extract
crh_replace = bbplace + str(crh_extract) + ebplace
dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)
def replace_death_date (crh_extract,x,dataframe):
bddate = "[bddate]"
eddate = "[eddate]"
crh_find = crh_extract
crh_replace = bddate + str(crh_extract) + eddate
dataframe['First_Sentence'][x] = dataframe['First_Sentence'][x].replace(crh_find, crh_replace)
def month_check(crh_extract):
m = {
'jan': 1,
'feb': 2,
'mar': 3,
'apr': 4,
'may': 5,
'jun': 6,
'jul': 7,
'aug': 8,
'sep': 9,
'oct': 10,
'nov': 11,
'dec': 12
}
s = crh_extract.strip()[:3].lower()
if m.__contains__(s):
return True
else:
return False
for x in range(df.shape[0]):
if re.search(r"\w*birth_date:\b",df['Info_Box'][x]):
finish = re.search(r"\w*birth_date:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_date:\b", df['Info_Box'][x]).end()
crh_extract =df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_birth_date(crh_extract,x,df)
if int(crh_extract) < 2100 and int(crh_extract) >=32:
replace_birth_date(crh_extract,x,df)
if month_check(crh_extract):
replace_birth_date(crh_extract, x, df)
if re.search(r"\w*birth_date_1:\b",df['Info_Box'][x]):
finish = re.search(r"\w*birth_date_1:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_date_1:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_birth_date(crh_extract,x,df)
if int(crh_extract) < 2100 and int(crh_extract) >=32:
replace_birth_date(crh_extract,x,df)
if month_check(crh_extract):
replace_birth_date(crh_extract, x, df)
if re.search(r"\w*birth_date_2:\b",df['Info_Box'][x]):
finish = re.search(r"\w*birth_date_2:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_date_2:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_birth_date(crh_extract, x, df)
if int(crh_extract) < 2100 and int(crh_extract) >= 32:
replace_birth_date(crh_extract, x, df)
if month_check(crh_extract):
replace_birth_date(crh_extract, x, df)
if re.search(r"\w*birth_date_3:\b",df['Info_Box'][x]):
finish = re.search(r"\w*birth_date_3:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_date_3:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_birth_date(crh_extract, x, df)
if int(crh_extract) < 2100 and int(crh_extract) >= 32:
replace_birth_date(crh_extract, x, df)
if month_check(crh_extract):
replace_birth_date(crh_extract, x, df)
if re.search(r"\w*birth_place:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_1:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_1:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_1:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_2:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_2:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_2:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_3:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_3:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_3:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_4:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_4:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_4:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_5:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_5:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_5:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric() != True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_6:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_6:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_6:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_7:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_7:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_7:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*birth_place_8:\b", df['Info_Box'][x]):
finish = re.search(r"\w*birth_place_8:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*birth_place_8:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric()!= True:
replace_birth_place(crh_extract, x, df)
if re.search(r"\w*death_date:\b",df['Info_Box'][x]):
finish = re.search(r"\w*death_date:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*death_date:\b", df['Info_Box'][x]).end()
crh_extract =df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_death_date(crh_extract,x,df)
if int(crh_extract) < 2100 and int(crh_extract) >=32:
replace_death_date(crh_extract,x,df)
if month_check(crh_extract):
replace_death_date(crh_extract, x, df)
if re.search(r"\w*death_date_1:\b",df['Info_Box'][x]):
finish = re.search(r"\w*death_date_1:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*death_date_1:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_death_date(crh_extract,x,df)
if int(crh_extract) < 2100 and int(crh_extract) >=32:
replace_death_date(crh_extract,x,df)
if month_check(crh_extract):
replace_death_date(crh_extract, x, df)
if re.search(r"\w*death_date_2:\b",df['Info_Box'][x]):
finish = re.search(r"\w*death_date_2:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*death_date_2:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_death_date(crh_extract, x, df)
if int(crh_extract) < 2100 and int(crh_extract) >= 32:
replace_death_date(crh_extract, x, df)
if month_check(crh_extract):
replace_death_date(crh_extract, x, df)
if re.search(r"\w*death_date_3:\b",df['Info_Box'][x]):
finish = re.search(r"\w*death_date_3:\w*", df['Info_Box'][x]).end()
start = re.search(r"\w*death_date_3:\b", df['Info_Box'][x]).end()
crh_extract = df['Info_Box'][x][start:(finish)]
if crh_extract.isnumeric():
if int(crh_extract) <= 31:
replace_death_date(crh_extract, x, df)
if int(crh_extract) < 2100 and int(crh_extract) >= 32:
replace_death_date(crh_extract, x, df)
if month_check(crh_extract):
replace_death_date(crh_extract, x, df)