从R中复杂的嵌套JSON结构中提取数据

时间:2019-08-04 19:14:29

标签: r json rvest

数据需要从一个复杂的JSON对象中提取,该对象可以跨越49页。

数据位于无法展平的复杂JSON对象中。我尝试过:

tmp1 <- bind_rows(lapply(tmp, as.data.frame.list, stringsAsFactors=FALSE))

但错误消息是:

Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,  : 
  arguments imply differing number of rows: 0, 1
Min_TPL = 40

body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes("body") %>% 
                    html_text() %>% 
                    toString() # to produce a single character string describing an R object.
                })


# str_match_all - Extract matched groups from a string.
# output - a list of character matrices
# search window Server data for all items
tmp <- str_match_all(body_tags_1,'window\\.__SERVER_DATA__ = (.*);')  
tmp <- lapply(seq_along(tmp), function(i){jsonlite::fromJSON(tmp[[i]][,2], flatten=TRUE)})

# get Event name from json
Event_Name <- json$suggestions$events$name
# convert to data frame
Event_Name <- as.data.frame(Event_Name)
# convert column variable to character from factor
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)

# get event date from dates 
Event_Date <- json$suggestions$events$start_date

# convert dates to data frame
Event_Date <- as.data.frame(Event_Date)

# Convert dates to character
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)

#`## convert Date from chr format to Date format using lubridate
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# get Location from json
Location <- json$suggestions$events$primary_venue.address.city 

# convert to data frame
Location <- as.data.frame(Location)
# convert column variable to character from factor
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)

Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value  

# transform a vector into data frame with fixed dimension
# by converting to a matrix, specify the ncol
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)

# convert to data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
Tickets <- as.data.frame(Tickets)

# get Currency from json
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency

# convert to data frame
Currency <- as.data.frame(Currency)

# convert column variable to character from factor
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)

# bind all the data together by columns
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)

# rename V1 as Mininum Price
all_data <- all_data_bind %>% 
  rename(Min_Price = V1)

all_data$Min_Price <- as.numeric(all_data$Min_Price)

# remove rows with na
all_data_1 <- all_data %>% drop_na()
dput(all_data_1)

str(all_data_1)
# keep rows with price > 40
all_data_filter_Price <- filter(all_data_1, Min_Price > Min_TPL)
all_data_filter_Price

dput(all_data_filter_Price)

从all_data_filter_Price输出的数据

structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019", 
"New York: The Wizard's Brunch & Dinner ", "Mac DeMarco plus special guests / Ex Hex"
), Event_Date = structure(c(18132, 18124, 18114), class = "Date"), 
    Location = c("Brooklyn", "New York City", "Brooklyn"), Min_Price = c(60, 
    45, 45), Currency = c("USD", "USD", "USD")), row.names = c(NA, 
-3L), class = "data.frame")

之类的代码
Event_Name <- json$suggestions$events$name

适用于单个页面,但不适用于返回为NULL的多个页面。

0 个答案:

没有答案
相关问题