Question

我试过这个命令：

#Cleaning Text (RT, Punctuation etc)

#Creating new dataframe and new features
tw_list = pd.DataFrame(tweet_list)
tw_list["text"] = tw_list[0]

#Removing RT, Punctuation etc
remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
tw_list["text"] = tw_list.text.str.lower()
tw_list.head(10)

我遇到了一个错误，Traceback (most recent call last) 如下：

<ipython-input-15-e640b99d08dd> in <module>
      8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
      9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
---> 10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
     11 tw_list["text"] = tw_list.text.str.lower()
     12 tw_list.head(10)

c:\program files\python39\lib\site-packages\pandas\core\series.py in map(self, arg, na_action)
   3907         dtype: object
   3908         """
-> 3909         new_values = super()._map_values(arg, na_action=na_action)
   3910         return self._constructor(new_values, index=self.index).__finalize__(
   3911             self, method="map"

c:\program files\python39\lib\site-packages\pandas\core\base.py in _map_values(self, mapper, na_action)
    935 
    936         # mapper is a function
--> 937         new_values = map_f(values, mapper)
    938 
    939         return new_values

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-15-e640b99d08dd> in <lambda>(x)
      7 #Removing RT, Punctuation etc
      8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
----> 9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
     10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
     11 tw_list["text"] = tw_list.text.str.lower()

c:\program files\python39\lib\re.py in sub(pattern, repl, string, count, flags)
    208     a callable, it's passed the Match object and must return
    209     a replacement string to be used."""
--> 210     return _compile(pattern, flags).sub(repl, string, count)
    211 
    212 def subn(pattern, repl, string, count=0, flags=0):

c:\program files\python39\lib\re.py in _compile(pattern, flags)
    302     if not sre_compile.isstring(pattern):
    303         raise TypeError("first argument must be string or compiled pattern")
--> 304     p = sre_compile.compile(pattern, flags)
    305     if not (flags & DEBUG):
    306         if len(_cache) >= _MAXCACHE:

c:\program files\python39\lib\sre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

c:\program files\python39\lib\sre_parse.py in parse(str, flags, state)
    946 
    947     try:
--> 948         p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
    949     except Verbose:
    950         # the VERBOSE flag was switched on inside the pattern.  to be

c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    832             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
    833                            not (del_flags & SRE_FLAG_VERBOSE))
--> 834             p = _parse_sub(source, state, sub_verbose, nested + 1)
    835             if not source.match(")"):
    836                 raise source.error("missing ), unterminated subpattern",

c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    596                     if hi < lo:
    597                         msg = "bad character range %s-%s" % (this, that)
--> 598                         raise source.error(msg, len(this) + 1 + len(that))
    599                     setappend((RANGE, (lo, hi)))
    600                 else:

错误信息是：error: bad character range ⁰-9 at position 18（位置 18 处的字符范围 ⁰-9 无效）

我想创建一个新的数据框 (tw_list) 和一个新的特征列 (text)，然后用 lambda 函数清理文本：去除 RT、链接和标点符号，并转换为小写。

Answer 1

您的正则表达式的问题出在这一段：“...[⁰-9]...”。在方括号内，“-”会被解析为字符范围，而“⁰”的码位大于“9”，所以“⁰-9”是一个无效的范围。如果你确实想匹配字符“⁰”和“-”本身，需要用反斜杠转义“-”（写成“\-”），以免正则表达式把它当作范围。

如果这里本来应该是“0”而不是“⁰”，把“⁰”替换成“0”即可。

您可以在此处查看您的问题： https://regex101.com/r/hhf27i/1

修复方法如下： https://regex101.com/r/8d1VxP/1

Python正则表达式坏字符范围

1 个答案: