我正在尝试在CSV文档中应用多个功能。我想有一个第一个函数,根据列的值
将数据重新发送到其他函数数据(test.csv):
句子,语言
,FR
,烯
,烯
,它
,ES
,FR
,FR
,FR
,ES
,GE
,FR
,FR
“Prezzi” 时,它
“这不贵”,en
“prixàbaisser”,fr
“casi 50 euros la alfombra es cara”,es
“披,FR
“PREZZipiùbassi”,它
“PREIS”,GE
“PRECIO”,ES
“价格”,EN
“es ist nicht teuer”,fr
脚本:
import string
import pandas as pd
def main(dataset):
dataset = pd.read_csv(dataset, sep =',')
text = dataset['sentence']
language = dataset['language']
for language in dataset:
if language == 'fr':
cleanText_FR()
if language == 'es':
cleanText_ES()
if language == 'it':
cleanText_IT()
if language == 'en':
cleanText_EN()
if language == 'ge':
cleanText_EN()
def cleanText_FR():
text_lower = text.str.lower()
punct = string.punctuation
pattern = r"[{}]".format(punct)
text_no_punct = text_lower.str.replace(pattern, ' ')
text_no_blancks = text_no_punct.replace('\s+', ' ', regex=True)
text_no_blancks = text_no_blancks.str.rstrip()
text_no_duplicate = text_no_blancks.drop_duplicates(keep=False)
text_cluster_random = text_no_small.sample(n=1000)
text_list = text_cluster_random.tolist()
return text_list
def cleanText_ES():
text_lower = text.str.lower()
punct = string.punctuation
pattern = r"[{}]".format(punct)
text_no_punct = text_lower.str.replace(pattern, ' ')
text_no_blancks = text_no_punct.replace('\s+', ' ', regex=True)
text_no_blancks = text_no_blancks.str.rstrip()
text_no_duplicate = text_no_blancks.drop_duplicates(keep=False)
text_cluster_random = text_no_small.sample(n=1000)
text_list = text_cluster_random.tolist()
return text_list
def cleanText_IT():
text_lower = text.str.lower()
punct = string.punctuation
pattern = r"[{}]".format(punct)
text_no_punct = text_lower.str.replace(pattern, ' ')
text_no_blancks = text_no_punct.replace('\s+', ' ', regex=True)
text_no_blancks = text_no_blancks.str.rstrip()
text_no_duplicate = text_no_blancks.drop_duplicates(keep=False)
text_cluster_random = text_no_small.sample(n=1000)
text_list = text_cluster_random.tolist()
return text_list
def cleanText_EN():
text_lower = text.str.lower()
punct = string.punctuation
pattern = r"[{}]".format(punct)
text_no_punct = text_lower.str.replace(pattern, ' ')
text_no_blancks = text_no_punct.replace('\s+', ' ', regex=True)
text_no_blancks = text_no_blancks.str.rstrip()
text_no_duplicate = text_no_blancks.drop_duplicates(keep=False)
text_cluster_random = text_no_small.sample(n=1000)
text_list = text_cluster_random.tolist()
return text_list
def cleanText_GE():
text_lower = text.str.lower()
punct = string.punctuation
pattern = r"[{}]".format(punct)
text_no_punct = text_lower.str.replace(pattern, ' ')
text_no_blancks = text_no_punct.replace('\s+', ' ', regex=True)
text_no_blancks = text_no_blancks.str.rstrip()
text_no_duplicate = text_no_blancks.drop_duplicates(keep=False)
text_cluster_random = text_no_small.sample(n=1000)
text_list = text_cluster_random.tolist()
return text_list
main("test.csv")
我没有任何结果
在[3]中:runfile('/ home / marin / Bureau / preprocess / preprocess.py',wdir ='/ home / marin / Bureau / preprocess')
在[4]中:
我希望我可以将所有数据都视为输出。
我的问题不重复!这是Python而不是R!
答案 0 :(得分:2)
使用.iterrows()通过您的DataFrame进行iterrate,如下所示:
dataset = pd.read_csv(dataset, sep =',')
for num, row in dataset.iterrows():
text = row['sentence']
language = row['language']
#if statements and language clean method calls go here