无法从数据框中删除空格,因此无法找到平均值

时间:2017-06-03 12:32:38

标签: r regex list matrix whitespace

我需要求这些值的平均值。class(newtemp) 的结果是 character(字符型)。原始数据的形式是 "-10.6 °C"。我已经删除了 °C,但后面的空格仍然存在,因此 as.numeric 不起作用。

newtemp

 [1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 "

[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 "

[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 " 

[34] "-8.9 "  "-8.3 "  "-7.2 "  "-7.2 "  "-5.6 "  "-5.0 "  "-3.9 "  "-3.9 "  "-3.9 "  "-3.3 "  "-3.3 " 

[45] "-3.9 "  "-6.1 "  "-8.3 "  "-7.8 "  "-8.9 "  "-10.0 " "-11.7 " "-12.8 "


#Tried this
library(stringr)
try=str_replace_all(newtemp, fixed(" "), "") # but not able to remove
#Tried this also
# Strip leading and trailing whitespace (whatever \s matches) from each
# element of x; interior whitespace is left alone.
trim <- function(x) {
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(edge_whitespace, "", x)
}
trim(x =newtemp)
# Still not removed

as.numeric(try)

 [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

[36] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#Warning message:
#NAs introduced by coercion 
#Still no output.

我使用的代码是:

    library(rvest)
# Page holding the observation table for the requested day.
linkurl <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1"

# Read the page, select the node with id "obsTable" and convert it to a
# data frame.
weathertable <- html_table(html_node(read_html(linkurl), "#obsTable"))
weathertable
newtemp <- weathertable$Temp.

# Remove the "°C" suffix; whatever separates it from the number stays.
abc <- gsub("°C", "", newtemp)
abc

# Attempt 1: delete everything [[:space:]] matches, then convert.
abc_new <- gsub("[[:space:]]", "", abc)
as.numeric(abc_new)

# Attempt 2: trim the ends with trimws(), then convert.
trimws(x = abc)
as.numeric(trimws(x = abc))

更新

        > newtemp
     [1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
    [10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
    [19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C"
    [28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C"  "-8.9 °C"  "-8.3 °C"  "-7.2 °C" 
    [37] "-7.2 °C"  "-5.6 °C"  "-5.0 °C"  "-3.9 °C"  "-3.9 °C"  "-3.9 °C"  "-3.3 °C"  "-3.3 °C"  "-3.9 °C" 
    [46] "-6.1 °C"  "-8.3 °C"  "-7.8 °C"  "-8.9 °C"  "-10.0 °C" "-11.7 °C" "-12.8 °C"
    > abc=(gsub(" °C", "", newtemp))
    > abc
     [1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
    [10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
    [19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C"
    [28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C"  "-8.9 °C"  "-8.3 °C"  "-7.2 °C" 
    [37] "-7.2 °C"  "-5.6 °C"  "-5.0 °C"  "-3.9 °C"  "-3.9 °C"  "-3.9 °C"  "-3.3 °C"  "-3.3 °C"  "-3.9 °C" 
    [46] "-6.1 °C"  "-8.3 °C"  "-7.8 °C"  "-8.9 °C"  "-10.0 °C" "-11.7 °C" "-12.8 °C"
    > abc=(gsub("°C", "", newtemp))
> abc
 [1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 "
[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 "
[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 " 
[34] "-8.9 "  "-8.3 "  "-7.2 "  "-7.2 "  "-5.6 "  "-5.0 "  "-3.9 "  "-3.9 "  "-3.9 "  "-3.3 "  "-3.3 " 
[45] "-3.9 "  "-6.1 "  "-8.3 "  "-7.8 "  "-8.9 "  "-10.0 " "-11.7 " "-12.8 "
> 

3 个答案:

答案 0 :(得分:2)

您可以使用函数trimws

> x <- "-10.6 "
> trimws(x)
[1] "-10.6"
> as.numeric(trimws(x))
[1] -10.6

更新

这似乎适用于您的情况。

# Keep only capture group 1 (a leading "-", digits, ".", digits) and discard
# everything that follows it. Elements that do not match the pattern are
# returned unchanged by gsub().
abc <- gsub("(^[-]\\d+\\.\\d+)(.*$)", "\\1", newtemp)
# Show the cleaned values next to the originals for comparison.
data.frame(new = abc, old = newtemp)
     new      old
1  -10.6 -10.6 °C
2  -10.6 -10.6 °C
3  -11.0 -11.0 °C
4  -10.6 -10.6 °C
5  -10.6 -10.6 °C
6  -10.6 -10.6 °C
7  -10.6 -10.6 °C
8  -10.6 -10.6 °C
9  -10.6 -10.6 °C
10 -10.6 -10.6 °C
11 -10.6 -10.6 °C
12 -10.6 -10.6 °C
13 -10.6 -10.6 °C
14 -10.6 -10.6 °C
...

字符"转换"过程中发生了一些超出我理解范围的事情,所以我无法解释到底是哪里出了问题。我解决问题的方法是忽略看似有问题的°C部分,只使用正则表达式(regular expression)提取数字部分。简而言之,我使用特殊字符来匹配减号([-];注意上面的模式要求字符串以减号开头)、数字(\\d+ 匹配连续的一串数字)、小数点(\\.)和字符串结尾($),用 () 把各部分放入分组,然后只提取第一个分组,把其余内容全部"丢弃"。

这是原始字符的字节表示:

> charToRaw(newtemp[1])
 [1] 2d 31 30 2e 36 c2 a0 c2 b0 43

如果我将其复制/粘贴到R

> charToRaw("-10.6 °C")
[1] 2d 31 30 2e 36 20 b0 43

也许拥有更多计算机技能的人可以参与其中。

答案 1 :(得分:2)

您的字符串中包含的空白字符并不是常规的ASCII空格(十进制值32)。因此,您需要一个能匹配任意Unicode空白字符的正则表达式。奇怪的是,简单的gsub("[[:space:]]*°C", "", newtemp)并不是在所有R环境中都有效。

通常有效的是PCRE正则表达式:

gsub("(*UCP)\\s*°C", "", newtemp, perl=TRUE)

这里,(*UCP)是一个PCRE动词,使速记字符类可识别Unicode,\s可以匹配任何Unicode空格。 perl=TRUE参数使R使用PCRE正则表达式引擎而不是默认的TRE正则表达式引擎。

答案 2 :(得分:1)

您可以使用readr::parse_number来避免整个问题:

library(rvest)
library(tidyverse)

url <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1"

# Download and parse the page.
h <- url %>% read_html()

# Select the node with id "obsTable" and convert it to a data frame.
obs <- h %>%
  html_node("#obsTable") %>%
  html_table()

# parse_number() pulls the number out of each Temp. string, so the unit and
# the separator in front of it never have to be stripped by hand.
obs_clean <- obs %>%
  mutate(Temp. = parse_number(Temp.)) %>%
  as_tibble()   # for printing; replaces the deprecated tbl_df()

obs_clean
#> # A tibble: 52 x 13
#>    `Time (EST)` Temp. Windchill `Dew Point` Humidity Pressure Visibility
#>           <chr> <dbl>     <chr>       <chr>    <chr>    <chr>      <chr>
#>  1     12:33 AM  12.9         -      8.1 °F      81% 30.08 in     2.0 mi
#>  2     12:45 AM  12.9         -      8.1 °F      81% 30.05 in     1.2 mi
#>  3     12:51 AM  12.2         -      8.6 °F      85% 30.05 in     0.8 mi
#>  4     12:54 AM  12.9         -      9.0 °F      84% 30.06 in     0.8 mi
#>  5      1:02 AM  12.9         -      9.0 °F      84% 30.05 in     0.5 mi
#>  6      1:25 AM  12.9         -      9.0 °F      84% 30.02 in     1.0 mi
#>  7      1:37 AM  12.9    6.9 °F      9.0 °F      84% 30.03 in     0.8 mi
#>  8      1:54 AM  12.9    6.9 °F      9.0 °F      84% 30.02 in     0.8 mi
#>  9      2:18 AM  12.9    5.2 °F      9.0 °F      84% 30.01 in     1.0 mi
#> 10      2:40 AM  12.9    5.2 °F      9.0 °F      84% 29.99 in     1.0 mi
#> # ... with 42 more rows, and 6 more variables: `Wind Dir` <chr>, `Wind
#> #   Speed` <chr>, `Gust Speed` <chr>, Precip <chr>, Events <chr>,
#> #   Conditions <chr>

或使用正则表达式,

# Same conversion done with a regular expression instead of parse_number().
# NOTE(review): if the regex engine honours \W inside the bracket expression,
# "-" and "." are removed as well, which would change the values — verify the
# result against parse_number() before relying on this.
obs %>%
  mutate(Temp. = as.numeric(gsub('[\\W°FC]', '', Temp.))) %>%
  as_tibble()   # replaces the deprecated tbl_df()

返回同样的东西。