如何在此数据框中对列名称进行分组

时间:2015-07-21 15:32:42

标签: r dataframe

我有一个为我准备好的数据框,显然有些列是通过一些基础机制组合在一起的。如何以这种方式对列名称进行分组以及如何再次将它们分开?

通过 y.1$Address 可以访问所有以“Address.XXX”开头的列:
 > y.1
       Address.streetAddress Address.position.latitude Address.position.longitude Address.namedAreas Address.region.municipalityName Address.region.countyName Address.ocean nothing rent floor livingArea
    19    Västmannagatan 85C                  59.34500                   18.04370           Vasastan                       Stockholm            Stockholms län          2325 4100000 1586   1.0       40.0
    29          Redargatan 3                  59.30279                   18.09048   Hammarby Sjöstad                       Stockholm            Stockholms län          1570 2800000 2829   4.0       43.5
    18 Doktor Abelins gata 6                  59.31596                   18.05454          Södermalm                       Stockholm            Stockholms län          1223 4875000 3092    NA       70.0
    75     Sibeliusgången 34                  59.41581                   17.91272             Akalla                       Stockholm            Stockholms län            NA 1800000 4876   4.0       80.9
    16        Standarvägen 1                  59.27604                   18.00459       Gamla Älvsjö                       Stockholm            Stockholms län          6360 2950000 3983   1.0       91.0
    32    Kungsbro Strand 17                  59.33027                   18.05143        Kungsholmen                       Stockholm            Stockholms län          1086 1995000 2017   1.0       25.5
    54        Pipersgatan 16                  59.33057                   18.04588        Kungsholmen                       Stockholm            Stockholms län          1405 2195000 2105   3.0       27.0
    22   Alva Myrdals gata 4                  59.28650                   17.95199 Fruängen-Hägersten                       Stockholm            Stockholms län            NA 1995000 2587   3.0       37.0
    35   Norr Mälarstrand 24                  59.32687                   18.04522        Kungsholmen                       Stockholm            Stockholms län          1437 2195000  910   4.0       23.0
    4    Beckbrännarbacken 7                  59.31487                   18.08901          Södermalm                       Stockholm            Stockholms län           329 1395000  520   0.5       11.0
> colnames(y.1)[1]  <- "nothing"
> y.1
   nothing.streetAddress nothing.position.latitude nothing.position.longitude nothing.namedAreas nothing.region.municipalityName nothing.region.countyName nothing.ocean listPrice rent floor livingArea
19    Västmannagatan 85C                  59.34500                   18.04370           Vasastan                       Stockholm            Stockholms län          2325   4100000 1586   1.0       40.0
29          Redargatan 3                  59.30279                   18.09048   Hammarby Sjöstad                       Stockholm            Stockholms län          1570   2800000 2829   4.0       43.5
18 Doktor Abelins gata 6                  59.31596                   18.05454          Södermalm                       Stockholm            Stockholms län          1223   4875000 3092    NA       70.0
75     Sibeliusgången 34                  59.41581                   17.91272             Akalla                       Stockholm            Stockholms län            NA   1800000 4876   4.0       80.9
16        Standarvägen 1                  59.27604                   18.00459       Gamla Älvsjö                       Stockholm            Stockholms län          6360   2950000 3983   1.0       91.0
32    Kungsbro Strand 17                  59.33027                   18.05143        Kungsholmen                       Stockholm            Stockholms län          1086   1995000 2017   1.0       25.5
54        Pipersgatan 16                  59.33057                   18.04588        Kungsholmen                       Stockholm            Stockholms län          1405   2195000 2105   3.0       27.0
22   Alva Myrdals gata 4                  59.28650                   17.95199 Fruängen-Hägersten                       Stockholm            Stockholms län            NA   1995000 2587   3.0       37.0
35   Norr Mälarstrand 24                  59.32687                   18.04522        Kungsholmen                       Stockholm            Stockholms län          1437   2195000  910   4.0       23.0
4    Beckbrännarbacken 7                  59.31487                   18.08901          Södermalm                       Stockholm            Stockholms län           329   1395000  520   0.5       11.0

> dput(y.1)
structure(list(Address = structure(list(address = structure(list(
    streetAddress = c("Västmannagatan 85C", "Redargatan 3", "Doktor Abelins gata 6", 
    "Sibeliusgången 34", "Standarvägen 1", "Kungsbro Strand 17", 
    "Pipersgatan 16", "Alva Myrdals gata 4", "Norr Mälarstrand 24", 
    "Beckbrännarbacken 7")), .Names = "streetAddress", row.names = c(19L, 
29L, 18L, 75L, 16L, 32L, 54L, 22L, 35L, 4L), class = "data.frame"), 
    position = structure(list(latitude = c(59.3449965, 59.3027897, 
    59.3159556, 59.4158109, 59.27603539, 59.33027358, 59.330567, 
    59.28649604, 59.326869, 59.314867), longitude = c(18.0437004, 
    18.0904824, 18.054536, 17.91271847, 18.00459327, 18.05143325, 
    18.045882, 17.95199275, 18.045217, 18.089009)), .Names = c("latitude", 
    "longitude"), row.names = c(19L, 29L, 18L, 75L, 16L, 32L, 
    54L, 22L, 35L, 4L), class = "data.frame"), namedAreas = list(
        "Vasastan", "Hammarby Sjöstad", "Södermalm", "Akalla", 
        "Gamla Älvsjö", "Kungsholmen", "Kungsholmen", "Fruängen-Hägersten", 
        "Kungsholmen", "Södermalm"), region = structure(list(
        municipalityName = c("Stockholm", "Stockholm", "Stockholm", 
        "Stockholm", "Stockholm", "Stockholm", "Stockholm", "Stockholm", 
        "Stockholm", "Stockholm"), countyName = c("Stockholms län", 
        "Stockholms län", "Stockholms län", "Stockholms län", 
        "Stockholms län", "Stockholms län", "Stockholms län", 
        "Stockholms län", "Stockholms län", "Stockholms län")), .Names = c("municipalityName", 
    "countyName"), row.names = c(19L, 29L, 18L, 75L, 16L, 32L, 
    54L, 22L, 35L, 4L), class = "data.frame"), distance = structure(list(
        ocean = c(2325L, 1570L, 1223L, NA, 6360L, 1086L, 1405L, 
        NA, 1437L, 329L)), .Names = "ocean", row.names = c(19L, 
    29L, 18L, 75L, 16L, 32L, 54L, 22L, 35L, 4L), class = "data.frame")), .Names = c("address", 
"position", "namedAreas", "region", "distance"), row.names = c(19L, 
29L, 18L, 75L, 16L, 32L, 54L, 22L, 35L, 4L), class = "data.frame"), 
    nothing = c(4100000L, 2800000L, 4875000L, 1800000L, 2950000L, 
    1995000L, 2195000L, 1995000L, 2195000L, 1395000L), rent = c(1586L, 
    2829L, 3092L, 4876L, 3983L, 2017L, 2105L, 2587L, 910L, 520L
    ), floor = c(1, 4, NA, 4, 1, 1, 3, 3, 4, 0.5), livingArea = c(40, 
    43.5, 70, 80.9, 91, 25.5, 27, 37, 23, 11), source = structure(list(
        name = c("BOSTHLM", "Fastighetsbyrån", "Gripsholms Fastighetsförmedling", 
        "Fastighetsbyrån", "Fastighetsbyrån", "Mäklarhuset", 
        "SkandiaMäklarna", "Svenska Mäklarhuset", "Svensk Fastighetsförmedling", 
        "Svensk Fastighetsförmedling"), id = c(1499L, 1573L, 
        9895524L, 1573L, 1573L, 204L, 1570L, 58L, 713L, 713L), 
        type = c("Broker", "Broker", "Broker", "Broker", "Broker", 
        "Broker", "Broker", "Broker", "Broker", "Broker"), url = c("http://www.bosthlm.se/", 
        "http://www.fastighetsbyran.se/", "http://gripsholms.se/", 
        "http://www.fastighetsbyran.se/", "http://www.fastighetsbyran.se/", 
        "http://www.maklarhuset.se/", "http://www.skandiamaklarna.se/", 
        "http://www.svenskamaklarhuset.se/", "http://www.svenskfast.se/", 
        "http://www.svenskfast.se/")), .Names = c("name", "id", 
    "type", "url"), row.names = c(19L, 29L, 18L, 75L, 16L, 32L, 
    54L, 22L, 35L, 4L), class = "data.frame"), rooms = c(2, 1.5, 
    2.5, 3, 3.5, 1, 1, 2, 1, 1), published = structure(c(16632, 
    16631, 16631, 16629, 16626, 16626, 16626, 16626, 16626, 16626
    ), class = "Date"), constructionYear = c(NA, 2008L, 1929L, 
    1977L, 1937L, 1934L, 1934L, NA, 1907L, 1929L), objectType = c("Lägenhet", 
    "Lägenhet", "Lägenhet", "Lägenhet", "Lägenhet", "Lägenhet", 
    "Lägenhet", "Lägenhet", "Lägenhet", "Lägenhet"), booliId = c(1919949L, 
    1893141L, 1896584L, 1898347L, 1917520L, 1918305L, 1918270L, 
    1918145L, 1918063L, 1918049L), soldDate = structure(c(16635, 
    16633, 16636, 16630, 16636, 16632, 16632, 16635, 16632, 16636
    ), class = "Date"), soldPrice = c(4100000L, 2950000L, 5175000L, 
    1800000L, 4200000L, 2510000L, 2610000L, 2500000L, 2950000L, 
    1850000L), url = c("https://www.booli.se/bostad/lagenhet/vasastan/vastmannagatan+85c/1919949", 
    "https://www.booli.se/bostad/lagenhet/hammarby+sjostad/redargatan+3/1893141", 
    "https://www.booli.se/bostad/lagenhet/sodermalm/doktor+abelins+gata+6/1896584", 
    "https://www.booli.se/bostad/lagenhet/akalla/sibeliusgangen+34/1898347", 
    "https://www.booli.se/bostad/lagenhet/gamla+alvsjo/standarvagen+1/1917520", 
    "https://www.booli.se/bostad/lagenhet/kungsholmen/kungsbro+strand+17/1918305", 
    "https://www.booli.se/bostad/lagenhet/kungsholmen/pipersgatan+16/1918270", 
    "https://www.booli.se/bostad/lagenhet/fruangen-hagersten/alva+myrdals+gata+4/1918145", 
    "https://www.booli.se/bostad/lagenhet/kungsholmen/norr+malarstrand+24/1918063", 
    "https://www.booli.se/bostad/lagenhet/sodermalm/beckbrannarbacken+7/1918049"
    ), isNewConstruction = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), plotArea = c(NA, NA, NA, NA, NA, 
    0L, NA, 0L, NA, NA), additionalArea = c(NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), AreaSize = structure(c(4L, 
    4L, 7L, 8L, 9L, 2L, 2L, 3L, 2L, 1L), .Label = c("10", "20", 
    "30", "40", "50", "60", "70", "80", "90", "100", "110", "120", 
    "130"), class = "factor"), PriceDiff = c(0L, 150000L, 300000L, 
    0L, 1250000L, 515000L, 415000L, 505000L, 755000L, 455000L
    )), .Names = c("Address", "nothing", "rent", "floor", "livingArea", 
"source", "rooms", "published", "constructionYear", "objectType", 
"booliId", "soldDate", "soldPrice", "url", "isNewConstruction", 
"plotArea", "additionalArea", "AreaSize", "PriceDiff"), row.names = c(19L, 
29L, 18L, 75L, 16L, 32L, 54L, 22L, 35L, 4L), class = "data.frame")

1 个答案:

答案 0 :(得分:0)

看起来这个数据结构来自解析 JSON。您可以使用 jsonlite 包中的 flatten 函数,将嵌套的 data.frame 转换为常规的 data.frame:
## Attach jsonlite, which provides the flatten() function used below.
library(jsonlite)
## Expand the nested data.frame columns of y.1 into ordinary top-level
## columns; the result is a plain (non-nested) data.frame.
flat <- flatten(y.1)
## Show the structure of the flattened data.frame.
str(flat)
# 'data.frame': 10 obs. of  28 variables:
#  $ nothing                        : int  4100000 2800000 4875000 1800000 2950000 1995000 2195000 1995000 2195000 1395000
#  $ rent                           : int  1586 2829 3092 4876 3983 2017 2105 2587 910 520
#  $ floor                          : num  1 4 NA 4 1 1 3 3 4 0.5
# ...

因此,嵌套的 data.frames 已被展开为各自独立的列。反方向的转换似乎更难,这个包看起来并没有提供该功能。您可以使用生成嵌套列表的递归函数来获得类似的结构,其中基本元素可以是 data.frames。data.frames 本质上都是列表,所以这种结构与原始结构相差不远。这里有两个函数,一个生成嵌套的 data.frames,另一个生成嵌套列表(我认为后者是更标准的结构)。

## Rebuild nested data.frames from a flat data.frame whose column names use
## "." as a path separator (e.g. "source.name", "source.id").
##
## Columns that share the same first name segment are gathered into one
## nested data.frame column named after that segment, with the segment and
## its dot stripped from the child names. The function recurses so that
## deeper levels ("a.b.c") are nested as well. Columns whose first segment is
## unique are kept unchanged, including any dots in their names.
##
## Args:
##   dat: a data.frame with (possibly) dot-separated column names.
##
## Returns:
##   A data.frame with the same rows as `dat`: the ungrouped columns first (in
##   their original order), followed by one nested data.frame column per
##   shared prefix (in the sorted order produced by table()).
collapse <- function(dat) {
    nms <- names(dat)
    ## First name segment = everything before the first dot. Names without a
    ## leading "segment." are left untouched and act as their own prefix.
    prefixes <- sub("^([^.]+)\\..*$", "\\1", nms)
    dotted <- prefixes != nms
    counts <- table(prefixes)
    grouped <- names(counts)[counts > 1]
    ## Only group a prefix when at least one of its columns really is dotted;
    ## otherwise stripping the prefix changes nothing and the recursion would
    ## never make progress.
    grouped <- grouped[grouped %in% prefixes[dotted]]
    ## drop = FALSE keeps a data.frame even when exactly one column remains.
    out <- dat[, !(prefixes %in% grouped), drop = FALSE]
    for (n in grouped) {
        ## Exact prefix match: an unanchored grepl(n, ...) would also pick up
        ## unrelated columns that merely contain `n` (e.g. "flat" for "lat").
        inds <- prefixes == n
        subDat <- dat[, inds, drop = FALSE]
        names(subDat) <- ifelse(
            dotted[inds],
            substring(nms[inds], nchar(n) + 2L),
            nms[inds]
        )
        out[[n]] <- collapse(subDat)
    }
    return(out)
}

## Re-nest the flattened data.frame and show the resulting structure.
out <- collapse(flat)
str(out)

# 'data.frame': 10 obs. of  19 variables:
#  $ nothing          : int  4100000 2800000 4875000 1800000 2950000 1995000 2195000 1995000 2195000 1395000
#  $ rent             : int  1586 2829 3092 4876 3983 2017 2105 2587 910 520
#  $ floor            : num  1 4 NA 4 1 1 3 3 4 0.5
# ...
#  $ source           :'data.frame':    10 obs. of  4 variables:
#   ..$ name: chr  "BOSTHLM" "Fastighetsbyrån" "Gripsholms Fastighetsförmedling" "Fastighetsbyrån" ...
#   ..$ id  : int  1499 1573 9895524 1573 1573 204 1570 58 713 713
#   ..$ type: chr  "Broker" "Broker" "Broker" "Broker" ...
# ...

## Produce a nested list from a flat data.frame whose column names use "." as
## a path separator. The list may contain data.frames as base elements.
##
## The top level is a plain named list: ungrouped columns become list
## elements, and each shared first name segment becomes a sub-list built with
## as.list(collapse(...)). Levels below that are therefore whatever
## collapse() returns for them (nested data.frames).
##
## Args:
##   dat: a data.frame with (possibly) dot-separated column names.
##
## Returns:
##   A named list with the ungrouped columns first, followed by one sub-list
##   per shared prefix (in the sorted order produced by table()).
collapseNestedLists <- function(dat) {
    nms <- names(dat)
    ## First name segment = everything before the first dot. Names without a
    ## leading "segment." are left untouched and act as their own prefix.
    prefixes <- sub("^([^.]+)\\..*$", "\\1", nms)
    dotted <- prefixes != nms
    counts <- table(prefixes)
    grouped <- names(counts)[counts > 1]
    ## Only group a prefix when at least one of its columns really is dotted,
    ## so that stripping the prefix always changes something.
    grouped <- grouped[grouped %in% prefixes[dotted]]
    ## drop = FALSE matters here: with a single remaining column, as.list()
    ## on the dropped vector would give an unnamed list of its cell values
    ## instead of one named column.
    out <- as.list(dat[, !(prefixes %in% grouped), drop = FALSE])
    for (n in grouped) {
        ## Exact prefix match instead of an unanchored grepl(n, ...), which
        ## would also pick up columns that merely contain `n`.
        inds <- prefixes == n
        subDat <- dat[, inds, drop = FALSE]
        names(subDat) <- ifelse(
            dotted[inds],
            substring(nms[inds], nchar(n) + 2L),
            nms[inds]
        )
        out[[n]] <- as.list(collapse(subDat))
    }
    return(out)
}

## Build the nested-list variant from the same flattened data.frame.
outList <- collapseNestedLists(flat)

这两种数据结构都可以用基本相同的方式访问。例如,如果想取出某个地址的位置纬度(即原始结构中嵌套在 data.frame 内的 data.frame 的一个元素),以下几种写法得到的结果是相同的:

## all.equal() compares only its first two arguments (target, current); any
## further positional arguments are taken as `tolerance` and `scale`. Passing
## four values therefore does not check that all of them agree, so each value
## is compared against the first one instead.
latitudes <- list(
    outList$Address$position$latitude[1],
    out$Address$position$latitude[1],
    y.1$Address$position$latitude[1],
    y.1[["Address"]][["position"]][["latitude"]][1]
)
all(vapply(
    latitudes[-1],
    function(value) isTRUE(all.equal(latitudes[[1]], value)),
    logical(1)
))
# TRUE