我在Python中有一个代码来预处理一些文本并将其写入文件。 它删除了主题标签,用户名,符号和链接,停用词,也获得了词根
import tweepy
import time
import os
import sys
import json
import argparse
import re
from collections import defaultdict
import glob
from nltk.stem.snowball import SnowballStemmer
text = "shit.txt"
def process_text(text=text):
text=re.sub('\\B@[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\B$[a-zA-Z0-9_]*\\b','',text)
text=re.sub('\\bRT\\b','',text)
text = text.lower()
text = re.sub("(https?://[^ ]+)",'',text)
if text:
a1 = [line.split("-")[0] for line in file("ListOfShortWords.txt")]
a2 = [re.sub("\n",'',line.split("-")[1]).encode("utf-8")for line in file("ListOfShortWords.txt")]
HashList = defaultdict(lambda:"nil")
for c in range(0,len(a1)):
HashList[a1[c]] = a2[c]
text = re.sub(r'([aeiou])\1{2,}', r'\1', text)
text = re.sub(r'([^aeiou])\1{2,}', r'\1\1',text)
text = re.sub(r'(.)\1{2,}\\b', r'\1', text)
for key in HashList.keys():
text = re.sub("\\b"+str(key)+"\\b",str(HashList[key]),text)
for stopword in ['about','above','after','ain\'t','aint','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','could','did','do','does','doing','down','during','each','few','for','from','further','had','has','have','having','he','he\'d','he\'ll','he\'s''here''here\'s''hers''herself''him''himself','her','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','ive','if','in','into','is','it','it\'s','its','itself','let\'s','lets','me','more','most','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','she','she\'d','she\'ll','she\'s','shes','should','so','some','such','than','that','that\'s','thats','the','their','theirs','them','themselves','then','there','there\'s','theres','these','they','they\'d','theyd','they\'ll','they\'re','they\'ve','theyll','theyre','theyve','this','those','through','to','too','under','until','up','very','was','we','we\'d','we\'ll','we\'re','we\'ve','were','what','what\'s','whats','when','when\'s','whens','where','where\'s','wheres','which','while','who','who\'s','whos','whom','why','why\'s','whys','with','won\'t','wont','would','you','you\'d','youd','you\'ll','youll','you\'re','you\'ve','youre','youve','your','yours','yourself','yourselves','\'tis','\'twas','tis','twas']:
text = re.sub("\\b"+stopword+"\\b",'',text)
for ch in ['&','$',',','.','/',':',';','"','{','[','}',']','|','\\','+','=','-','_',')','(','*','^','%','!','~','`','?']:
text = text.replace(ch,' ')
text = re.sub("\\b[0-9]*\\b",'',text)
text = text.replace('\'','')
text = re.sub('\\b[a-z]\\b','',text)
text = re.sub(r'[^\x00-\x7F]+',' ',text)
text = ' '.join(text.split())
return text
for pp in ['pos','neg','neu','irr']:
a = 1
for fil in glob.glob("Senti/"+str(pp)+"/*.txt"):
for line in file(fil):
t = process_text(text=line)
realline=''
for word in t.split():
realline = realline+" "+str(SnowballStemmer("english").stem(word)
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
outf.write(realline)
a=a+1
我收到错误说
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
^
SyntaxError: invalid syntax
代码有什么问题?存在所有必需的文件夹和文件
答案 0 :(得分:1)
上一行中有a)缺失... str()函数未正确关闭。