我试图分析一些Twitter数据。你可以在这里找到我的R代码:
我不明白如何从数据集中提取"小时"值,并将其绘制在 y 轴上。谢谢!
library(ggplot2)
# Load the tweet metrics. To read from disk instead of the inline sample:
# data <- read.csv("data.csv", header = TRUE, stringsAsFactors = FALSE, sep = ",")
# The header names three columns while every record carries four fields, so
# read.csv() takes the first field of each record as the row names.
data <- read.csv(header = TRUE, stringsAsFactors = FALSE,
  text = '"time","impressions","engagements"
"1",2015-10-24 15:39:00,400,"8.0"
"2",2015-10-24 15:28:00,575,"17.0"
"3",2015-10-23 16:52:00,1646,"29.0"
"4",2015-10-23 16:45:00,1489,"46.0"')
# remove duplicate rows
# data <- data[-c(177, 323, 615, 497, 809), ]
# Select columns by name rather than by position so a reordered file cannot
# silently feed the wrong column into the plot. With stringsAsFactors = FALSE
# the columns are never factors, so as.numeric() alone is sufficient.
data$impressions <- as.numeric(data[["impressions"]])
# `engagement` (singular) is a numeric copy of `engagements`; the plotting
# code reads it under that name.
data$engagement <- as.numeric(data[["engagements"]])
# Convert times to POSIXct, stating the timestamp layout explicitly instead
# of relying on format guessing. Times are interpreted in the session's
# local time zone.
data$time <- as.POSIXct(data$time, format = "%Y-%m-%d %H:%M:%S")
# Scatter plot of impressions over time. Each point is a red open circle
# whose size is one tenth of the tweet's engagement value; points are
# jittered to reduce overplotting. The y axis is fixed to 0-2000 with a
# major break every 200.
ggplot(data = data, mapping = aes(x = time, y = impressions)) +
  geom_point(
    shape = 21,
    color = "red",
    size = 0.1 * data[["engagement"]],
    position = position_jitter(width = 0.5)
  ) +
  scale_y_continuous(
    minor_breaks = 0,
    breaks = seq(from = 0, to = 2000, by = 200),
    limits = c(0, 2000)
  ) +
  theme_bw()
数据如下所示:
"time","impressions","engagements"
"1",2015-10-24 15:39:00,400,"8.0"
"2",2015-10-24 15:28:00,575,"17.0"
"3",2015-10-23 16:52:00,1646,"29.0"
"4",2015-10-23 16:45:00,1489,"46.0"
答案 0 :(得分:4)
由于你的时间列格式正确,可以使用 R 的 lubridate 包中的 hour() 函数来提取小时数。
library(lubridate)
library(ggplot2)
# Read the tweet metrics from data.csv in the working directory.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
data <- read.csv("data.csv", header = TRUE, stringsAsFactors = FALSE,
                 sep = ",")
# remove duplicate rows (identified by hard-coded row positions)
data <- data[-c(177, 323, 615, 497, 809), ]
# Coerce the second and third columns to numeric.
# NOTE(review): positional indexing assumes the file's column order is
# time, impressions, engagements -- confirm against the actual file.
data$impressions <- as.numeric(as.character(data[, 2]))
data$engagement <- as.numeric(as.character(data[, 3]))
# extract hour from time column
data$Hour <- lubridate::hour(data$time)