我正在尝试使用 R 将下面这种多行文本数据转换为数据框,但一直无法有效地使用 read.delim。每条记录由 `***` 行分隔,每一行是一个以冒号(`:`)分隔的"字段: 值"对;我希望把所有这些行填充到 10 个固定的列中。谢谢。
***
Type:status
Origin: abc
Text: abc
URL:
ID: 123
Time: Fri Jul 22 15:07:37 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags:
***
***
Type:status
Origin: cde
Text: rty
URL: http://ocs
ID: 456
Time: Thu Jul 21 14:09:47 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags: rty
***
***
.
..
...
答案 0 :(得分:0)
这样的事情可行:
# Sample input: records fenced by '***' lines, one "Key: value" pair per line.
con <- textConnection("
***
Type:status
Origin: abc
Text: abc
URL:
ID: 123
Time: Fri Jul 22 15:07:37 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags:
***
***
Type:status
Origin: cde
Text: rty
URL: http://ocs
ID: 456
Time: Thu Jul 21 14:09:47 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags: rty
***
***")
a <- readLines(con)
close(con)  # textConnection() returns an open connection; release it explicitly

# Field names, in the column order wanted in the result.
ids <- c("Type", "Origin", "Text", "URL", "ID", "Time", "RetCount", "Favorite", "MentionedEntities", "Hashtags")

# Number of records = number of lines carrying the first field.
n_records <- sum(startsWith(a, paste0(ids[1], ":")))

# For each field, take the lines that START with "<field>:" and keep everything
# after the FIRST colon only. Splitting on every colon would truncate values
# that contain colons themselves (the timestamp, "http://...") and would fail
# with "subscript out of bounds" on empty values such as "URL:".
# vapply() additionally guarantees every field occurs once per record.
records <- vapply(ids, function(id) {
  field_lines <- a[startsWith(a, paste0(id, ":"))]
  trimws(sub("^[^:]*:", "", field_lines))
}, character(n_records))
records
答案 1 :(得分:0)
使用此答案将文件作为一个字符串读取:https://stackoverflow.com/a/9069670/1412059
现在像这样处理字符串:
# Sample input held as ONE string: records fenced by '***' lines, one
# "Key: value" pair per line.
text <- "***
Type:status
Origin: abc
Text: abc
URL:
ID: 123
Time: Fri Jul 22 15:07:37 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags:
***
***
Type:status
Origin: cde
Text: rty
URL: http://ocs
ID: 456
Time: Thu Jul 21 14:09:47 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags: rty
***
***"
# Replace ':' with ';' to get a clean separator. The negative lookbehind leaves
# alone any ':' preceded by a digit (the HH:MM:SS timestamp) or by 'p' ("http:").
# NOTE(review): a ':' after "https" is preceded by 's', so it would still be
# replaced; a value containing "word: word" would be split as well.
text <- gsub("(?<![[:digit:]p]):", ";", text, perl=TRUE)
# Parse into two ';'-separated columns (V1 = key, V2 = value). '*' is declared
# the comment character so the '***' fence lines are discarded; fill=TRUE
# tolerates rows with a missing field.
dat <- read.table(text=text, sep=";", comment.char = "*", fill=TRUE)
# Tag each consecutive run of 10 rows with a record id (assumes exactly 10
# key/value lines per record).
dat$id <- rep(seq_len(nrow(dat)/10), each=10)
library(reshape2)
# Long -> wide: one row per record id, one column per key taken from V1.
dcast(dat, id~V1, value.var="V2")
# id Favorite Hashtags ID MentionedEntities Origin RetCount Text Time Type URL
#1 1 false 123 abc 0 abc Fri Jul 22 15:07:37 CDT 2011 status
#2 2 false rty 456 cde 0 rty Thu Jul 21 14:09:47 CDT 2011 status http://ocs
答案 2 :(得分:0)
你也可以这样做(使用 @rengis 答案中的 `a`),前提是 URL 以 `http` 或 `https` 开头:
# Turn the key/value colon into ';'. A ':' that directly follows a digit,
# "http" or "https" is skipped, so timestamps and URL schemes stay intact.
text1 <- gsub(
  pattern = "(?<=[0-9]|http|https):(*SKIP)(*F)|:",
  replacement = ";",
  x = a,
  perl = TRUE
)

# Discard the '***' fence lines and the empty lines.
is_noise <- grepl("\\*|^$", text1)
text2 <- text1[!is_noise]

# Key = text before the ';', value = text after it (optional leading space dropped).
field_names <- gsub(";.*", "", text2)
field_values <- gsub(".*; ?", "", text2)

# One character column per key; split() orders the keys alphabetically.
columns <- split(field_values, field_names)
res <- do.call(data.frame, c(columns, stringsAsFactors = FALSE))
res
# Favorite Hashtags ID MentionedEntities Origin RetCount Text
#1 false 123 abc 0 abc
#2 false rty 456 cde 0 rty
# Time Type URL
#1 Fri Jul 22 15:07:37 CDT 2011 status
#2 Thu Jul 21 14:09:47 CDT 2011 status http://ocs
或使用cSplit
# cSplit() is not defined in this snippet: it is loaded by sourcing GitHub gist
# 11380733 through devtools, which needs network access.
# NOTE(review): cSplit() also appears to ship in the splitstackshape package —
# confirm before swapping the gist for a library() call.
library(data.table)
library(devtools)
source_gist(11380733)
# Split every "key;value" string of text2 on ';' into wide columns (presumably
# text2_1 = key and text2_2 = value, the names used below), then number the
# occurrences of each key so that n identifies the record a row belongs to.
DT <- cSplit(as.data.frame(text2), "text2",";", "wide")[,
n:= seq_len(.N), by=text2_1]
# Reshape to wide form: one row per record n, one column per key.
dcast.data.table(DT, n~text2_1, value.var="text2_2")
# n Favorite Hashtags ID MentionedEntities Origin RetCount Text
# 1: 1 false 123 abc 0 abc
# 2: 2 false rty 456 cde 0 rty
# Time Type URL
#1: Fri Jul 22 15:07:37 CDT 2011 status
#2: Thu Jul 21 14:09:47 CDT 2011 status http://ocs
根据新的信息,即字段的值中也可能出现冒号(colons):
# Sample input in which some values ("Origin", "Text") contain colons themselves.
a <- readLines(textConnection("
***
Type:status
Origin: abc
Text: abc
URL:
ID: 123
Time: Fri Jul 22 15:07:37 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags:
***
***
Type:status
Origin: cde: andgg
Text: rty: asndf
URL: http://ocs
ID: 456
Time: Thu Jul 21 14:09:47 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags: rty
***
***"))

# Rewrite only the FIRST colon of each line as ';' (key;value); any later
# colon stays part of the value.
text1 <- gsub(
  "(?<=[0-9]|http|https):(*SKIP)(*F)|^([^:]+):(.*)",
  "\\1;\\2",
  a,
  perl = TRUE
)

# Discard the '***' fence lines and the empty lines.
keep <- !grepl("\\*|^$", text1)
text2 <- text1[keep]

# Key of every line, plus the keys in order of first appearance so the
# columns keep the input order instead of being sorted alphabetically.
splitGroup <- sub(";.*", "", text2)
field_order <- unique(splitGroup)

# Value of every line: everything after the ';' and an optional space.
field_values <- gsub(".*; ?", "", text2)

by_field <- split(field_values, factor(splitGroup, levels = field_order))
res <- do.call(data.frame, c(by_field, stringsAsFactors = FALSE))
res
# Type Origin Text URL ID Time
#1 status abc abc 123 Fri Jul 22 15:07:37 CDT 2011
#2 status cde: andgg rty: asndf http://ocs 456 Thu Jul 21 14:09:47 CDT 2011
# RetCount Favorite MentionedEntities Hashtags
#1 0 false
#2 0 false rty
答案 3 :(得分:0)
这是一个看起来可以完成这项工作的函数。它不依赖分隔符解析,而是使用 readLines 和几个正则表达式。
## Read a file of '***'-fenced records into a data frame.
##
## Each record is a run of "Key: value" lines, and every record must list the
## same keys in the same order. Column names come from the keys of the first
## record, so the number of columns is derived from the data rather than
## fixed at 10.
##
## Arguments:
##   file             path or connection, handed straight to readLines()
##   stringsAsFactors forwarded to as.data.frame()
## Returns: a data frame with one row per record and one column per key
##   (an empty data frame when the file holds no "Key: value" lines).
## Errors: stops when the records do not all share the same key sequence.
readData <- function(file, stringsAsFactors = TRUE)
{
  rl <- trimws(readLines(file))                  ## read the file
  ## keep data lines only: drop blank lines and lines made up solely of '*'
  ## (a '*' inside a value must not discard the whole line)
  rl2 <- rl[nzchar(rl) & !grepl("^[*]+$", rl)]
  if (length(rl2) == 0L) {
    return(data.frame())
  }
  keys <- sub("[:].*", "", rl2)                  ## text before the first ':'
  fields <- unique(keys)                         ## column names, input order
  ## every record must repeat the key sequence of the first one, otherwise
  ## the row-wise matrix fill below would silently misalign the values
  if (length(keys) %% length(fields) != 0L ||
      !all(keys == rep(fields, length.out = length(keys)))) {
    stop("records do not all contain the same fields in the same order",
         call. = FALSE)
  }
  ## text after the first ':' only, so colons inside a value are preserved
  values <- sub("^[^:]*[:] *", "", rl2)
  mat <- matrix(values, ncol = length(fields), byrow = TRUE,
                dimnames = list(NULL, fields))
  as.data.frame(mat, stringsAsFactors = stringsAsFactors)
}
下面是在您的数据上运行的结果,其中文件 "new.txt" 是用您的示例数据创建的。
# Parse the sample file with the default stringsAsFactors = TRUE; the returned
# data frame is auto-printed.
readData("new.txt")
# Type Origin Text URL ID Time RetCount Favorite MentionedEntities Hashtags
# 1 status abc abc 123 Fri Jul 22 15:07:37 CDT 2011 0 false
# 2 status cde rty http://ocs 456 Thu Jul 21 14:09:47 CDT 2011 0 false rty