Question

我正在根据自己的需要修改一段代码，但遇到了问题。大多数问题我已经解决，但卡在了当前这一步。我已经把 pdf 文件读入 R，并执行了一系列步骤来处理该文件，以便进行文本挖掘。

我现在正尝试拆分文本的每一行。`useful` 是一个字符列表（？），下面显示的是它的第 11 个元素。

useful[11]
>"                                          Busti                
169    425          Total             2,786 5,259        Franklin             
256   410"

如您所见，Busti 前面有一大段空白。`useful[11]` 是 pdf 页面的最后一行。本质上，第一列为空白，Busti 是第二列，Total 是第三列，而 Franklin 是同一行的第四列。

然后我对 `useful` 进行拆分（其中包括 `useful[11]`），使每一列成为一个单独的对象。

# Split every element of `useful` into one "Name  n  n" piece per column.
# A split point is a run of whitespace that sits either
#   * between a digit and the capital letter that starts the next name, or
#   * between a letter and a capital letter, when the run is 2+ characters.
# Only the whitespace is consumed; the lookarounds keep the surrounding
# characters in the pieces.
# NOTE: whitespace at the very start of an element has nothing before it for
# the lookbehinds to match, so a blank first column never becomes its own piece.
split <-
  strsplit(
    useful,
    "(?<=[0-9])\\s+(?=[A-Z])|(?<=[A-Za-z])\\s{2,}+(?=[A-Z])",
    perl = TRUE
  )

split[11]
 [[1]]
 [1] "                                          Busti                 
 169    425"
 [2] "Total             2,786 5,259"                                            
 [3] "Franklin            256   410"

R 并没有把每一列都识别为一个对象，而是得到：对象 1 - Busti，对象 2 - Total，对象 3 - Franklin；而我想要的是：对象 1 - 空白，对象 2 - Busti，依此类推。

例如，在上一行 `useful[10]` 中，任何一列都不是空白，因此：

useful[10]
[1] "Total               1,399 2,915           Arkwright            154    320          Smyrna              179    319       Deposit             110   169"

因此，当我使用上面的拆分代码时，我得到：

split[10]
[[1]]
[1] "Total               1,399 2,915" "Arkwright            154    320" "Smyrna              179    319" 
[4] "Deposit             110   169"

有人可以帮我弄清楚如何解决此问题的正则表达式吗？预先谢谢你！

Answer 1

这是使用tidyverse和purrr的一种方法：

library(tidyverse)

useful <- c("                                          Busti                
169    425          Total             2,786 5,259        Franklin             
256   410", "Total               1,399 2,915           Arkwright            154    320          Smyrna              179    319       Deposit             110   169")

# str_squish() is vectorised, so it is applied to the whole character vector
# at once; the result stays a character vector instead of a list.
# Splitting on whitespace then gives one token per name or number.
useful %>%
  str_squish() %>%
  str_split("\\s+")

# [[1]]
# [1] "Busti"    "169"      "425"      "Total"    "2,786"    "5,259"    "Franklin" "256"      "410"     
# 
# [[2]]
# [1] "Total"     "1,399"     "2,915"     "Arkwright" "154"       "320"       "Smyrna"    "179"       "319"       "Deposit"   "110"      
# [12] "169"

或者：

# Squish the whole vector first (str_squish() is vectorised), then split only
# on whitespace that is immediately followed by a letter. Each piece therefore
# starts at a name and keeps the numbers that follow it.
useful %>%
  str_squish() %>%
  str_split("\\s+(?=[[:alpha:]])")

# [[1]]
# [1] "Busti 169 425"     "Total 2,786 5,259" "Franklin 256 410" 
# 
# [[2]]
# [1] "Total 1,399 2,915" "Arkwright 154 320" "Smyrna 179 319"    "Deposit 110 169"

然后您可能要考虑...

useful %>%
  str_squish() %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  # One row per element of `useful`: `name` is the element index and `value`
  # is a list-column holding that element's pieces.
  enframe() %>%
  # Name the list-column explicitly so unnest() does not have to guess it;
  # the result has one row per piece.
  unnest(value)

# # A tibble: 7 x 2
#   name value            
#   <int> <chr>            
# 1     1 Busti 169 425    
# 2     1 Total 2,786 5,259
# 3     1 Franklin 256 410 
# 4     2 Total 1,399 2,915
# 5     2 Arkwright 154 320
# 6     2 Smyrna 179 319   
# 7     2 Deposit 110 169

甚至...

useful %>%
  str_squish() %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  # One row per element of `useful`, then one row per "Name n n" piece.
  enframe() %>%
  unnest(value) %>%
  # After squishing, the fields of a piece are separated by single whitespace
  # characters: the name first, then the two numbers.
  separate(value, c("Group", "Item1", "Item2"), sep = "\\s") %>%
  # Drop every thousands separator before converting. Removing only the first
  # comma would leave a value such as "1,234,567" unparseable (NA).
  mutate(
    across(starts_with("Item"), ~ as.numeric(str_remove_all(.x, ",")))
  )

# # A tibble: 7 x 4
#    name Group     Item1 Item2
#   <int> <chr>     <dbl> <dbl>
# 1     1 Busti       169   425
# 2     1 Total      2786  5259
# 3     1 Franklin    256   410
# 4     2 Total      1399  2915
# 5     2 Arkwright   154   320
# 6     2 Smyrna      179   319
# 7     2 Deposit     110   169

最后，如果“Item”列的数量未知，或者各行的数量不同，则需要执行类似下面的操作，和/或参考 this question（原文中的链接）：

useful %>%
  str_squish() %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  # One row per element of `useful`, then one row per "Name n n" piece.
  enframe() %>%
  unnest(value) %>%
  # Break each piece into its fields and give every field its own row, so the
  # number of fields per piece does not have to be known in advance.
  mutate(to_sep = str_split(value, "\\s")) %>%
  unnest(to_sep) %>%
  # Number the fields within each piece; these numbers become column names.
  # NOTE(review): grouping is by `value` only, so two identical pieces would
  # share one group and be numbered together -- confirm pieces are unique.
  group_by(value) %>%
  mutate(row = row_number()) %>%
  # Back to one row per piece, with one column per field position.
  # spread() is kept (rather than pivot_wider()) so that the result matches
  # the printed output that follows; the result is still grouped by `value`.
  spread(row, to_sep)

# # A tibble: 7 x 5
# # Groups:   value [7]
#    name value             `1`       `2`   `3`
#   <int> <chr>             <chr>     <chr> <chr>
# 1     1 Busti 169 425     Busti     169   425
# 2     1 Franklin 256 410  Franklin  256   410
# 3     1 Total 2,786 5,259 Total     2,786 5,259
# 4     2 Arkwright 154 320 Arkwright 154   320
# 5     2 Deposit 110 169   Deposit   110   169
# 6     2 Smyrna 179 319    Smyrna    179   319
# 7     2 Total 1,399 2,915 Total     1,399 2,915

您可能需要考虑把它拆成一个更具体的新问题，尤其是提供该 pdf 并更直接地说明您要实现的目标。话虽如此，我不确定您是否真的需要这个空白列，因为您可以使用下面的管道。

library(pdftools)
library(tidyverse)

# Read the report text. Presumably one element per page -- the code below
# keeps elements 14 to 17 only (TODO confirm against pdftools docs).
text <- pdf_text("https://www.dec.ny.gov/docs/wildlife_pdf/09deerrpt.pdf")

# Build one cleaned-up string from the selected elements. The intermediate
# steps live inside local() so that only `clean_text` is created.
clean_text <- local({
  # Normalise whitespace, keep elements 14-17 and glue them together.
  body <- paste(str_squish(text)[14:17], collapse = " ")

  # Strip the header text.
  body <- str_remove(
    body,
    "New York State Department of Environmental.*TOTAL TAKE. "
  )

  # Strip page numbers such as "Page 14", "Page 15".
  body <- str_remove_all(body, "Page [[:digit:]]{2}")

  # Strip the COUNTY labels (runs of 2+ capitals); they would not line up
  # with the rows anyway.
  body <- str_remove_all(body, "[A-Z]{2,}")

  # Drop everything from the statewide totals onwards.
  body <- str_remove(body, "Statewide Totals.*")

  # Remove the thousands separators from the numbers.
  body <- str_remove_all(body, ",")

  # Final squish to tidy up whitespace left behind by the removals above.
  str_squish(body)
})

clean_text %>%
  # Remove the individual total lines ("Total" followed by two words).
  str_remove_all("Total\\s\\w+\\s\\w+") %>%
  str_squish() %>%
  # Pull out every "<letters and spaces> <digits> <digits>" record.
  str_extract_all("[A-Za-z ]+\\s\\d+\\s\\d+") %>%
  unlist() %>%
  str_squish() %>%
  # One row per record; `.` places the piped vector in the `by_line` column.
  tibble(by_line = .) %>%
  # Namespaced on purpose: magrittr also exports an `extract()`.
  tidyr::extract(
    by_line,
    c("location", "adult_take", "total_take"),
    regex = "([A-Za-z ]+\\s?)(\\d+\\s?)(\\d+\\s?)"
  ) %>%
  # The capture groups may keep a trailing whitespace character, so squish
  # before converting the counts to numbers.
  mutate(
    location = str_squish(location),
    adult_take = str_squish(adult_take) %>% as.numeric,
    total_take = str_squish(total_take) %>% as.numeric
  )

# # A tibble: 943 x 3
#    location    adult_take total_take
#    <chr>            <dbl>      <dbl>
#  1 Carroll            103        215
#  2 Albany City         24         41
#  3 Allegany           115        231
#  4 Charlotte          116        248
#  5 Altona              50         87
#  6 Berne              163        292
#  7 Ashford            338        721
#  8 Chautauqua         242        613
#  9 Ausable             18         21
# 10 Bethlehem          141        280
# # ... with 933 more rows

字符串拆分：是否需要在字符前考虑空格？

1 个答案: