我试过这个命令:
#Cleaning Text (RT, Punctuation etc)
#Creating new dataframe and new features
tw_list = pd.DataFrame(tweet_list)
tw_list["text"] = tw_list[0]
#Removing RT, Punctuation etc
remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([°-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
tw_list["text"] = tw_list.text.str.lower()
tw_list.head(10)
我遇到了一个错误,名为Traceback(最近一次调用最后一次) 下面是结果
<ipython-input-15-e640b99d08dd> in <module>
8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
---> 10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
11 tw_list["text"] = tw_list.text.str.lower()
12 tw_list.head(10)
c:\program files\python39\lib\site-packages\pandas\core\series.py in map(self, arg, na_action)
3907 dtype: object
3908 """
-> 3909 new_values = super()._map_values(arg, na_action=na_action)
3910 return self.constructor(new_values, index=self.index).finalize_(
3911 self, method="map"
c:\program files\python39\lib\site-packages\pandas\core\base.py in _map_values(self, mapper, na_action)
935
936 # mapper is a function
--> 937 new_values = map_f(values, mapper)
938
939 return new_values
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-15-e640b99d08dd> in <lambda>(x)
7 #Removing RT, Punctuation etc
8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
----> 9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
11 tw_list["text"] = tw_list.text.str.lower()
c:\program files\python39\lib\re.py in sub(pattern, repl, string, count, flags)
208 a callable, it's passed the Match object and must return
209 a replacement string to be used."""
--> 210 return _compile(pattern, flags).sub(repl, string, count)
211
212 def subn(pattern, repl, string, count=0, flags=0):
c:\program files\python39\lib\re.py in _compile(pattern, flags)
302 if not sre_compile.isstring(pattern):
303 raise TypeError("first argument must be string or compiled pattern")
--> 304 p = sre_compile.compile(pattern, flags)
305 if not (flags & DEBUG):
306 if len(_cache) >= _MAXCACHE:
c:\program files\python39\lib\sre_compile.py in compile(p, flags)
762 if isstring(p):
763 pattern = p
--> 764 p = sre_parse.parse(p, flags)
765 else:
766 pattern = None
c:\program files\python39\lib\sre_parse.py in parse(str, flags, state)
946
947 try:
--> 948 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949 except Verbose:
950 # the VERBOSE flag was switched on inside the pattern. to be
c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
--> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch("|"):
c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
832 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
833 not (del_flags & SRE_FLAG_VERBOSE))
--> 834 p = _parse_sub(source, state, sub_verbose, nested + 1)
835 if not source.match(")"):
836 raise source.error("missing ), unterminated subpattern",
c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
--> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch("|"):
c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
596 if hi < lo:
597 msg = "bad character range %s-%s" % (this, that)
--> 598 raise source.error(msg, len(this) + 1 + len(that))
599 setappend((RANGE, (lo, hi)))
600 else:
错误说错误:位置18处的字符范围⁰-9
我正在尝试创建新数据框 (tw_list) 和新功能 (text),然后使用 lambda 函数清理文本并清理 RT、链接、标点字符并转换为小写。
答案 0 :(得分:0)
由于此原因,您的正则表达式存在问题:“...[°-9]...” 如果你想要这个字符“°”和“-”,你需要反斜杠“-”以避免正则表达式试图做一个范围。
如果它是“0”而不是“°”,您可以替换“°”字符
您可以在此处查看您的问题: https://regex101.com/r/hhf27i/1
修复方法如下: https://regex101.com/r/8d1VxP/1