数据需要从一个复杂的JSON对象中提取,该对象可以跨越49页。
数据位于无法展平的复杂JSON对象中。我尝试过:
tmp1 <- bind_rows(lapply(tmp, as.data.frame.list, stringsAsFactors=FALSE))
但错误消息是:
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
arguments imply differing number of rows: 0, 1
Min_TPL = 40
body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
function(url){
url %>% read_html() %>%
html_nodes("body") %>%
html_text() %>%
toString() # to produce a single character string describing an R object.
})
# str_match_all - Extract matched groups from a string.
# output - a list of character matrices
# search window Server data for all items
tmp <- str_match_all(body_tags_1,'window\\.__SERVER_DATA__ = (.*);')
tmp <- lapply(seq_along(tmp), function(i){jsonlite::fromJSON(tmp[[i]][,2], flatten=TRUE)})
# get Event name from json
Event_Name <- json$suggestions$events$name
# convert to data frame
Event_Name <- as.data.frame(Event_Name)
# convert column variable to character from factor
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)
# get event date from dates
Event_Date <- json$suggestions$events$start_date
# convert dates to data frame
Event_Date <- as.data.frame(Event_Date)
# Convert dates to character
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)
#`## convert Date from chr format to Date format using lubridate
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# get Location from json
Location <- json$suggestions$events$primary_venue.address.city
# convert to data frame
Location <- as.data.frame(Location)
# convert column variable to character from factor
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)
Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value
# transform a vector into data frame with fixed dimension
# by converting to a matrix, specify the ncol
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)
# convert to data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
Tickets <- as.data.frame(Tickets)
# get Currency from json
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency
# convert to data frame
Currency <- as.data.frame(Currency)
# convert column variable to character from factor
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)
# bind all the data together by columns
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)
# rename V1 as Mininum Price
all_data <- all_data_bind %>%
rename(Min_Price = V1)
all_data$Min_Price <- as.numeric(all_data$Min_Price)
# remove rows with na
all_data_1 <- all_data %>% drop_na()
dput(all_data_1)
str(all_data_1)
# keep rows with price > 40
all_data_filter_Price <- filter(all_data_1, Min_Price > Min_TPL)
all_data_filter_Price
dput(all_data_filter_Price)
从all_data_filter_Price输出的数据
structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019",
"New York: The Wizard's Brunch & Dinner ", "Mac DeMarco plus special guests / Ex Hex"
), Event_Date = structure(c(18132, 18124, 18114), class = "Date"),
Location = c("Brooklyn", "New York City", "Brooklyn"), Min_Price = c(60,
45, 45), Currency = c("USD", "USD", "USD")), row.names = c(NA,
-3L), class = "data.frame")
之类的代码
Event_Name <- json$suggestions$events$name
适用于单个页面,但不适用于返回为NULL的多个页面。