Question

我想创建一个数据框，其中包含一个类为 "character" 的对象中的元素。

9 15 22 23 0 1,052 393 

10 16 23 0 1 1,652 291

11 17 0 1 2 1,593 228 

12 18 1 2 3 1,097 170

我一直试图将所有这些信息分成7列但没有成功。

以下是我使用的代码：

# Fetch the ASX market-dynamics report; mode = "wb" writes it as a binary file.
download.file(
  url = "https://www.asx.com.au/documents/products/asx-24-market-dynamics-mar-18.pdf",
  destfile = "asx-24-market-dynamics-mar-18.pdf",
  mode = "wb"
)

# Packages used below (pdftools may need installing first).
#install.packages("pdftools")
library(pdftools)
library(dplyr)
library(stringr)

# pdf_text() is indexed by page below: keep the second page's text.
txt <- pdf_text("asx-24-market-dynamics-mar-18.pdf")
page_2 <- txt[2]

# Break the page into individual lines (one character element per line).
page_2a <- unlist(str_split(page_2, pattern = "\n"))

# The two tables occupy fixed line ranges on this page.
tbl1 <- page_2a[seq(6, 29)]

tbl2 <- page_2a[seq(33, 56)]

 # transform into a data frame
 tbl1 <- ?

Answer 1

我们可以先创建一个通用模式，然后使用separate生成所需的输出

library(stringr)
library(dplyr)
library(tidyr) # separate() is exported by tidyr, not by dplyr or stringr

# Remove every thousands separator, not just the first one in each string,
# so values such as "1,052" and "6,119" on the same row are both cleaned.
data$A <- str_replace_all(data$A, ",", "")

# Each row of the sample data holds seven space-delimited values, so supply
# exactly seven column names; an eighth name would only produce an all-NA
# column plus a "missing pieces" warning.
data %>%
  separate(A, c("a1", "a2", "a3", "a4", "a5", "a6", "a7"), sep = " ")

数据

# Sample data: one column `A`, one quoted string per row.
# The header and each record must sit on their own line; when everything is
# on a single line, read.table() treats it all as the header and returns
# zero rows.
data <- read.table(
  text = "A
'9 15 22 23 0 1,052 393'
'10 16 23 0 1 1,652 291'",
  header = TRUE,
  stringsAsFactors = FALSE # keep A as character on R < 4.0 as well
)

**更新**

library(tidyr) # provides separate()

# Each statement needs its own line: when they share one line, everything
# after the first "#" is a comment and never runs.

# Replace the "\r" at the end of each line with "".
tbl1 <- str_replace(tbl1, "\r", "")
# Trim the white space where each line starts.
tbl1 <- trimws(tbl1, which = c("left"))
# Replace any run of blanks with a single ":" delimiter.
tbl1 <- gsub("[[:blank:]]+", ":", tbl1)

test_tbl1 <- data.frame(tbl1)

# Split the single text column into the thirteen table columns.
# NOTE(review): "90 Day", "3 Year" and "10 Year" each appear twice in the
# name vector, and "SPI 20" vs "SPI 200" looks like a typo -- confirm the
# intended headers and how separate() treats duplicate names.
tbl1_final <- test_tbl1 %>%
  separate(
    tbl1,
    c(
      "Chicago*", "London*", "HK/Sing", "Tokyo", "Aust\n(AEST)",
      "ASX\nSPI 20\U2122", "90 Day", "3 Year", "10 Year",
      "ASX\nSPI 200\U2122", "90 Day", "3 Year", "10 Year"
    ),
    ":"
  )

Answer 2

library(pdftools); library(tidyverse)

现在您有两个以空格作为分隔符的字符向量，因此请使用read.table并传递给text参数：

# Parse the whitespace-delimited lines in tbl1 and show the first rows.
head(read.table(text = tbl1))
#---------
V1 V2 V3 V4 V5    V6  V7    V8    V9 V10    V11   V12 V13
1  9 15 22 23  0 1,052 393 6,119 8,176  14 10,196 1,279 646
2 10 16 23  0  1 1,652 291 5,222 7,325  11 10,126 1,053 600
3 11 17  0  1  2 1,593 228 4,750 6,917  13  9,754   923 617
4 12 18  1  2  3 1,097 170 4,193 4,801  12  9,937   910 549
5 13 19  2  3  4   644 624 1,733 2,605  12 10,259   851 512
6 14 20  3  4  5   658 103 1,910 2,081  11 10,070   808 421

由于您似乎刚接触 R，当您在看似数值的内容中看到逗号时，建议您格外小心。R 不会把逗号识别为千位分隔符，因此 V6、V8、V9、V11 和 V12 这些列至少会被读成因子列。（如下面 str 的输出所示，V7 和 V13 也是如此。）

 # Inspect the column types that read.table() inferred from tbl1.
 str(read.table(text = tbl1))
'data.frame':   24 obs. of  13 variables:
 $ V1 : int  9 10 11 12 13 14 15 16 17 18 ...
 $ V2 : int  15 16 17 18 19 20 21 22 23 0 ...
 $ V3 : int  22 23 0 1 2 3 4 5 6 7 ...
 $ V4 : int  23 0 1 2 3 4 5 6 7 8 ...
 $ V5 : int  0 1 2 3 4 5 6 7 8 9 ...
 $ V6 : Factor w/ 24 levels "1,052","1,097",..: 1 5 4 2 16 17 21 20 24 3 ...
 $ V7 : Factor w/ 24 levels "1,324","1,593",..: 19 15 13 7 22 4 21 20 3 9 ...
 $ V8 : Factor w/ 24 levels "1,323","1,733",..: 24 22 21 18 2 3 11 1 4 6 ...
 $ V9 : Factor w/ 24 levels "1,222","10,959",..: 22 21 19 12 9 7 8 1 16 18 ...
 $ V10: int  14 11 13 12 12 11 9 10 0 13 ...
 $ V11: Factor w/ 24 levels "10,070","10,119",..: 5 3 21 22 7 1 19 20 23 6 ...
 $ V12: Factor w/ 24 levels "1,053","1,212",..: 4 1 24 23 22 19 20 21 10 11 ...
 $ V13: Factor w/ 24 levels "1,009","1,019",..: 16 14 15 12 11 10 9 8 13 17 ...

您可以使用gsub修复大多数但不是所有列：

# Strip the commas from tbl1 before parsing, then inspect the column types.
str(read.table(text = gsub(",", "", tbl1)))
'data.frame':   24 obs. of  13 variables:
 $ V1 : int  9 10 11 12 13 14 15 16 17 18 ...
 $ V2 : int  15 16 17 18 19 20 21 22 23 0 ...
 $ V3 : int  22 23 0 1 2 3 4 5 6 7 ...
 $ V4 : int  23 0 1 2 3 4 5 6 7 8 ...
 $ V5 : int  0 1 2 3 4 5 6 7 8 9 ...
 $ V6 : Factor w/ 24 levels "1052","1097",..: 1 5 4 2 14 16 21 20 24 3 ...
 $ V7 : int  393 291 228 170 624 103 525 44 1690 2308 ...
 $ V8 : int  6119 5222 4750 4193 1733 1910 2116 1323 10606 12539 ...
 $ V9 : int  8176 7325 6917 4801 2605 2081 2486 1222 5319 6119 ...
 $ V10: int  14 11 13 12 12 11 9 10 0 13 ...
 $ V11: int  10196 10126 9754 9937 10259 10070 9465 9606 9960 10239 ...
 $ V12: int  1279 1053 923 910 851 808 839 843 1714 1874 ...
 $ V13: int  646 600 617 549 512 421 316 297 585 688 ...

# Tabulate the sixth column of the comma-stripped table.
table(read.table(text = gsub(",", "", tbl1))[6])

  1052   1097   1368   1593   1652   3323   3773   4054   4243   4648    565    599    608    644   6573    658    684   6966 
     1      1      1      1      1      1      1      1      1      1      1      1      1      1      1      1      1      1 
   782    814    823    863    875 Closed 
     1      1      1      1      1      1

拆分或分离类为 "character" 的对象

2 个答案: