我花了比预期更多的时间将xml传输到数据帧(代码片段包含xml和xmlTreeParse
以使帖子更小,整个解决方案就在此之后):
users = xmlTreeParse(file=
'<?xml version="1.0" encoding="utf-8"?>
<users>
<row Id="-1" Reputation="1" CreationDate="2010-07-19T06:55:26.860" DisplayName="Community" LastAccessDate="2010-07-19T06:55:26.860" Location="on the server farm" AboutMe="some text" Views="0" UpVotes="4382" DownVotes="771" EmailHash="a007be5a61f6aa8f3e85ae2fc18dd66e" />
<row Id="2" Reputation="101" CreationDate="2010-07-19T14:01:36.697" DisplayName="Geoff Dalgas" LastAccessDate="2012-09-13T17:41:48.300" WebsiteUrl="http://stackoverflow.com" Location="Corvallis, OR" AboutMe="some text 2" Views="7" UpVotes="3" DownVotes="0" EmailHash="b437f461b3fd27387c5d8ab47a293d35" Age="36" />
<row Id="3" Reputation="101" CreationDate="2010-07-19T15:34:50.507" DisplayName="Jarrod Dixon" LastAccessDate="2013-01-15T03:28:47.657" WebsiteUrl="http://stackoverflow.com" Location="New York, NY" AboutMe="some text 3" Views="9" UpVotes="19" DownVotes="0" EmailHash="2dfa19bf5dc5826c1fe54c2c049a1ff1" Age="34" />
<row Id="4" Reputation="101" CreationDate="2010-07-19T19:03:27.400" DisplayName="Emmett" LastAccessDate="2013-04-16T16:51:04.780" WebsiteUrl="http://minesweeperonline.com" Location="New York, NY" AboutMe="some text 4" Views="3" UpVotes="0" DownVotes="0" EmailHash="129bc58fc3f1e3853cdd3cefc75fe1a0" Age="27" />
<row Id="5" Reputation="6182" CreationDate="2010-07-19T19:03:57.227" DisplayName="Shane" LastAccessDate="2013-02-05T11:23:09.587" WebsiteUrl="http://www.statalgo.com" Location="New York, NY" AboutMe="some text 5" Views="605" UpVotes="659" DownVotes="5" EmailHash="0cee97ffd90277bf4ac753331d50af60" Age="34" />
<row Id="6" Reputation="442" CreationDate="2010-07-19T19:04:07.647" DisplayName="Harlan" LastAccessDate="2013-05-09T13:11:29.027" WebsiteUrl="http://www.harlan.harris.name" Location="District of Columbia" AboutMe="some text 6" Views="30" UpVotes="42" DownVotes="0" EmailHash="9f1a68b9e623be5da422b44e733fa8bc" Age="40" />
<row Id="7" Reputation="329" CreationDate="2010-07-19T19:04:37.257" DisplayName="Vince" LastAccessDate="2013-05-21T22:49:10.237" WebsiteUrl="http://bioinformatics.ucdavis.edu" Location="Davis, CA" AboutMe="some text 7" Views="21" UpVotes="14" DownVotes="0" EmailHash="4f7cebc8ac200d15bac5dcff51469425" Age="27" />
<row Id="8" Reputation="6104" CreationDate="2010-07-19T19:04:52.280" DisplayName="csgillespie" LastAccessDate="2013-05-21T17:32:58.693" WebsiteUrl="http://www.mas.ncl.ac.uk/~ncsg3/" Location="Newcastle, United Kingdom" AboutMe="some text 8" Views="399" UpVotes="576" DownVotes="18" EmailHash="3c3eea4eda77ffe95ae18c78c3fc7e55" Age="35" />
<row Id="10" Reputation="121" CreationDate="2010-07-19T19:05:40.403" DisplayName="Pierre" LastAccessDate="2012-10-04T17:17:01.430" WebsiteUrl="http://plindenbaum.blogspot.com" Location="France" AboutMe="some text 10" Views="8" UpVotes="2" DownVotes="0" EmailHash="61200477cf8983809ec152f484750204" Age="43" />
<row Id="11" Reputation="136" CreationDate="2010-07-19T19:06:02.713" DisplayName="wahalulu" LastAccessDate="2013-05-26T20:36:24.567" WebsiteUrl="http://www.linkedin.com/in/marckvaisman" Location="Washington, DC" AboutMe="some text 11" Views="2" UpVotes="10" DownVotes="0" EmailHash="9a9a05e41ae6e3b127697967cea5f8fb" Age="39" />
<row Id="12" Reputation="101" CreationDate="2010-07-19T19:06:34.507" DisplayName="Jin" LastAccessDate="2013-04-11T18:31:58.360" WebsiteUrl="http://www.8164.org" Location="Raleigh, NC" AboutMe="some text 12" Views="5" UpVotes="4" DownVotes="0" EmailHash="70ad2c2830eb9a7753bd6312f3811e3e" Age="37" />
<row Id="13" Reputation="677" CreationDate="2010-07-19T19:06:49.527" DisplayName="Sharpie" LastAccessDate="2012-01-02T22:55:04.743" WebsiteUrl="http://www.sharpsteen.net" Location="United States" AboutMe="Undergraduate studying Environmental Engineering and Applied Mathematics." Views="37" UpVotes="44" DownVotes="1" EmailHash="a52001938ed33a87334447413cc5beaa" Age="27" />
<row Id="15" Reputation="11" CreationDate="2010-07-19T19:07:32.537" DisplayName="hannes.koller" LastAccessDate="2010-08-24T14:23:18.050" WebsiteUrl="http://soma.denkt.org" Location="Vienna, Austria" AboutMe="" Views="2" UpVotes="0" DownVotes="0" EmailHash="0ecd144e2f3d05e6ee6b89404d1d4c53" Age="34" />
<row Id="16" Reputation="101" CreationDate="2010-07-19T19:08:13.957" DisplayName="slashnick" LastAccessDate="2010-08-19T20:40:59.080" Location="London, United Kingdom" Views="2" UpVotes="7" DownVotes="0" EmailHash="5691ff74e21c78cd1563b5123254cbd6" Age="30" />
<row Id="17" Reputation="192" CreationDate="2010-07-19T19:08:28.243" DisplayName="Random" LastAccessDate="2010-09-10T07:34:36.123" AboutMe="" Views="6" UpVotes="13" DownVotes="1" EmailHash="5a3c78de1408aae57797dffd0782b992" />
<row Id="18" Reputation="128" CreationDate="2010-07-19T19:08:29.070" DisplayName="grokus" LastAccessDate="2012-08-09T15:02:00.600" WebsiteUrl="http://wikipedia.org" Location="United States" AboutMe="about me 18" Views="6" UpVotes="16" DownVotes="0" EmailHash="7d1f931327bfab7b214758be17627adc" Age="43" />
<row Id="19" Reputation="101" CreationDate="2010-07-19T19:08:45.250" DisplayName="Noah Snyder" LastAccessDate="2012-06-17T15:53:43.550" WebsiteUrl="http://sbseminar.wordpress.com" Location="New York, NY" AboutMe="about me 19" Views="11" UpVotes="2" DownVotes="0" EmailHash="895385d49eb1f04c5ee1f8d7734f3a62" Age="33" />
</users>',
asText=TRUE)
XML只是表示来自stackexchange数据转储的 Users 表:
<users>
<row Id=..... />
<row Id=..... />
.....
<row Id=..... />
</users>
映射到数据帧与映射表的方式相同。这是为我完成工作的代码:
require(XML)
require(plyr)
# insert xmlTreeParse here
r = xmlRoot(users)
attrs = c('Id', 'Reputation', 'CreationDate', 'DisplayName', 'LastAccessDate',
'WebsiteUrl', 'Location', 'AboutMe', 'Views', 'UpVotes', 'DownVotes',
'EmailHash', 'Age')
mapUserAttrs = function(x, colNames) {
t = data.frame(as.integer(x['Id']),
as.integer(x['Reputation']),
strptime(x['CreationDate'], '%Y-%m-%dT%H:%M:%OS'),
as.character(x['DisplayName']),
strptime(x['LastAccessDate'], '%Y-%m-%dT%H:%M:%OS'),
as.character(x['WebsiteUrl']),
as.character(x['Location']),
as.character(x['AboutMe']),
as.integer(x['Views']),
as.integer(x['UpVotes']),
as.integer(x['DownVotes']),
as.character(x['EmailHash']),
as.integer(x['Age']))
names(t) = colNames
return(t)
}
result = ldply(lapply(xmlChildren(r), xmlAttrs), mapUserAttrs, attrs)
对我来说看起来过于繁忙 - 但我发现没有更好的方法来完成XML包的任务以及我找到的大量示例和文档。
我想知道是否有更简单(或更短)的方法来完成相同的工作?
答案 0 :(得分:3)
您可以使用xmlToList
包中的XML
函数执行此操作,并且由于某些节点包含其他节点不包含的选项,因此您还需要rbind.fill
函数来自plyr
包。
下面的代码行将您的XML转换为列表,循环遍历节点并将字符串转换为数据帧,然后将所有这些数据帧整理在一起。
require(xml)
require(plyr)
out <- do.call("rbind.fill",
lapply(xmlToList(users),
function(x) as.data.frame(as.list(x), stringsAsFactors = FALSE)))
head(out)
Id Reputation CreationDate DisplayName LastAccessDate Location AboutMe Views UpVotes DownVotes
1 -1 1 2010-07-19T06:55:26.860 Community 2010-07-19T06:55:26.860 on the server farm some text 0 4382 771
2 2 101 2010-07-19T14:01:36.697 Geoff Dalgas 2012-09-13T17:41:48.300 Corvallis, OR some text 2 7 3 0
3 3 101 2010-07-19T15:34:50.507 Jarrod Dixon 2013-01-15T03:28:47.657 New York, NY some text 3 9 19 0
4 4 101 2010-07-19T19:03:27.400 Emmett 2013-04-16T16:51:04.780 New York, NY some text 4 3 0 0
5 5 6182 2010-07-19T19:03:57.227 Shane 2013-02-05T11:23:09.587 New York, NY some text 5 605 659 5
6 6 442 2010-07-19T19:04:07.647 Harlan 2013-05-09T13:11:29.027 District of Columbia some text 6 30 42 0
EmailHash WebsiteUrl Age
1 a007be5a61f6aa8f3e85ae2fc18dd66e <NA> <NA>
2 b437f461b3fd27387c5d8ab47a293d35 http://stackoverflow.com 36
3 2dfa19bf5dc5826c1fe54c2c049a1ff1 http://stackoverflow.com 34
4 129bc58fc3f1e3853cdd3cefc75fe1a0 http://minesweeperonline.com 27
5 0cee97ffd90277bf4ac753331d50af60 http://www.statalgo.com 34
6 9f1a68b9e623be5da422b44e733fa8bc http://www.harlan.harris.name 40
修改强>
结果数据框将完全由字符向量组成。如果要将这些向量转换为日期,日期时间,数字等,您可以逐个进行,也可以编写一个函数来指定应该为具有某些名称的列分配哪些类,或者您可以编写一个函数来尝试从数据中推断出正确的类。以下是最后一个选项的示例:
giveClasses <- function(df, threshold = 0.1) {
df_classes <- sapply(df, class)
df_alpha <- sapply(df, function(x) {
mean(grepl("[[:alpha:]]", x)) >= threshold}) &
df_classes == "character"
df_digits <- sapply(df, function(x) mean(grepl("\\d", x))) >= threshold &
df_classes == "character" &
!df_alpha
df_percent <- sapply(df, function(x) mean(grepl("%", x))) >= threshold &
df_classes == "character" &
!df_alpha &
df_digits
df_digits[df_percent] <- FALSE
df_decimal <- sapply(df, function(x) mean(grepl("\\.", x))) >= threshold &
df_classes == "character" &
!df_percent &
df_digits &
!df_alpha
df_dates <- sapply(df, function(x) {
mean(grepl(
"^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}$", x)) >= threshold}) &
df_classes == "character"
df_datetime <- sapply(df, function(x) {
mean(grepl(
"^\\d{2,4}[[:punct:]]\\d{2}[[:punct:]]\\d{2,4}\\D\\d{2}:\\d{2}(:\\d{2})?(\\.\\d{1,})?$", x)) >= threshold}) &
df_classes == "character"
# convert character data to appropriate classes
df_logical <- sapply(df, function(x) {
y <- unique(na.omit(x))
length(y) == 2 &
mean(grepl("^n", y, ignore.case = TRUE) |
grepl("^y", y, ignore.case = TRUE)) == 1
})
df_digits[df_dates | df_datetime] <- FALSE
df[,df_percent] <- lapply(df[,df_percent, drop = FALSE], function(x) {
as.numeric(gsub("[^[:digit:].]", "", x)) / 100})
df[,df_logical] <- lapply(df[,df_logical, drop = FALSE], function(x) {
x[grep("^y", x, ignore.case = TRUE)] <- TRUE
x[grep("^n", x, ignore.case = TRUE)] <- FALSE
as.logical(x)
})
df[,df_decimal] <- lapply(df[,df_decimal, drop = FALSE], function(x) {
as.numeric(gsub("[^[:digit:].]", "", x))})
df[,df_digits] <- lapply(df[,df_digits, drop = FALSE], function(x) {
as.integer(gsub("[^[:digit:]]", "", x))})
df[,df_dates] <- lapply(df[,df_dates, drop = FALSE], function(x) {
as.Date(x)})
df[,df_datetime] <- lapply(df[,df_datetime, drop = FALSE], function(x) {
strptime(x, '%Y-%m-%dT%H:%M:%OS')})
df_ischaracter <- sapply(df, function(x) any(class(x) == "character"))
df[,df_ischaracter] <- lapply(df[,df_ischaracter, drop = FALSE], function(x) {
x <- gsub("^\\s+|\\s+$|(?<=\\s)\\s+", "", x, perl = TRUE)})
df
}
如果该列中超过90%的值符合适合该类的模式,则上述函数会为列分配一个类。否则,它会将它们保留为字符。它解决了示例数据集中没有的模式 - 我只是从我正在处理的另一个项目中复制了代码。所以:
str(giveClasses(out))
'data.frame': 17 obs. of 13 variables:
$ Id : int 1 2 3 4 5 6 7 8 10 11 ...
$ Reputation : int 1 101 101 101 6182 442 329 6104 121 136 ...
$ CreationDate : POSIXlt, format: "2010-07-19 06:55:26" "2010-07-19 14:01:36" "2010-07-19 15:34:50" "2010-07-19 19:03:27" ...
$ DisplayName : chr "Community" "Geoff Dalgas" "Jarrod Dixon" "Emmett" ...
$ LastAccessDate: POSIXlt, format: "2010-07-19 06:55:26" "2012-09-13 17:41:48" "2013-01-15 03:28:47" "2013-04-16 16:51:04" ...
$ Location : chr "on the server farm" "Corvallis, OR" "New York, NY" "New York, NY" ...
$ AboutMe : chr "some text" "some text 2" "some text 3" "some text 4" ...
$ Views : int 0 7 9 3 605 30 21 399 8 2 ...
$ UpVotes : int 4382 3 19 0 659 42 14 576 2 10 ...
$ DownVotes : int 771 0 0 0 5 0 0 18 0 0 ...
$ EmailHash : chr "a007be5a61f6aa8f3e85ae2fc18dd66e" "b437f461b3fd27387c5d8ab47a293d35" "2dfa19bf5dc5826c1fe54c2c049a1ff1" "129bc58fc3f1e3853cdd3cefc75fe1a0" ...
$ WebsiteUrl : chr NA "http://stackoverflow.com" "http://stackoverflow.com" "http://minesweeperonline.com" ...
$ Age : int NA 36 34 27 34 40 27 35 43 39 ...