使用R

时间:2017-08-14 12:05:52

标签: r twitter

我正在尝试用 R 分析由转推(retweet)和提及(@-mention)构成的网络图。

我的问题: 我尝试创建数据框,其中包括转推/提及的“发送者”和“接收者”。该代码适用于大多数推文,但有时“接收者”列不仅包括用户名,还包括推文文本的其余部分。我该如何解决?

这是我的代码:

    # Get @-messages, senders, receivers
# A tweet counts as an @-message when it starts with "@name" or ".@name".
# Lower-case the text once and build one logical mask, so that `ats` and
# `at.sender` are guaranteed to refer to the same rows.
at.text <- tolower(munich_data_500$text_clean)
at.is.message <- grepl("^\\.?@[a-z0-9_]{1,15}", at.text, perl = TRUE)
ats <- at.text[at.is.message]
at.sender <- tolower(as.character(munich_data_500$screen_name[at.is.message]))

# Keep only the addressed user name. With perl = TRUE, "." does not match a
# line break unless (?s) is set; without it, a tweet containing "\n" does not
# match the pattern at all and is returned unchanged, i.e. the whole text ends
# up in the receiver column. The tail is plain ".*" so that a tweet consisting
# of nothing but the name is handled too.
at.receiver <- sub("(?s)^\\.?@([a-z0-9_]{1,15}).*$", "\\1", ats, perl = TRUE)

print(paste0(length(ats), " @-messages from ", length(unique(at.sender)),
             " senders and ", length(unique(at.receiver)), " receivers."))


# Get RTs, senders, receivers
# A tweet counts as a retweet when it starts with "rt @name". Lower-case the
# text once and build one logical mask, so that `rts` and `rt.sender` are
# guaranteed to refer to the same rows.
rt.text <- tolower(munich_data_500$text_clean)
rt.is.retweet <- grepl("^rt @[a-z0-9_]{1,15}", rt.text, perl = TRUE)
rts <- rt.text[rt.is.retweet]
rt.sender <- tolower(as.character(munich_data_500$screen_name[rt.is.retweet]))

# Keep only the retweeted user name. With perl = TRUE, "." does not match a
# line break unless (?s) is set; without it, a tweet containing "\n" does not
# match the pattern at all and is returned unchanged, i.e. the whole text ends
# up in the receiver column. The tail is plain ".*" so that a tweet consisting
# of nothing but "rt @name" is handled too.
rt.receiver <- sub("(?s)^rt @([a-z0-9_]{1,15}).*$", "\\1", rts, perl = TRUE)

print(paste0(length(rts), " RTs from ", length(unique(rt.sender)),
             " senders and ", length(unique(rt.receiver)), " receivers."))


# This is necessary to avoid problems with empty entries, usually caused by
# encoding issues in the source files.
# NOTE(review): as posted, these lines replaced "" with "", which does nothing.
# The placeholder was presumably "<NA>" and got stripped as an HTML-like tag
# when the code was copied -- confirm against the script this was taken from.
# `%in%` is used instead of `==` so NA entries never produce an NA index.
at.sender[at.sender %in% ""] <- "<NA>"
at.receiver[at.receiver %in% ""] <- "<NA>"
rt.sender[rt.sender %in% ""] <- "<NA>"
rt.receiver[rt.receiver %in% ""] <- "<NA>"

# Pair each sender with its receiver: one row per @-message / retweet.
# Column names are spelled out; they are the same names data.frame() would
# derive from the argument expressions.
ats.df <- data.frame(
  at.sender = at.sender,
  at.receiver = at.receiver
)
rts.df <- data.frame(
  rt.sender = rt.sender,
  rt.receiver = rt.receiver
)

代码来源

https://www.r-bloggers.com/generating-graphs-of-retweets-and-messages-on-twitter-using-r-and-gephi/

我的数据副本

structure(list(user_id_str = c("4193500174", "2198130404", "455851683", 
"2316380063", "3192451", "2372023345"), screen_name = c("unterdembettRU", 
"NejlaBevab", "MatzePirwitz", "nowakberlin", "leyrer", "RockiDave"
), name = c("Amnesia Anderson", "nejla bevab", "marcel pirwitz", 
"Amaruq", "Martin Leyrer", "dave rocki"), description = c("Eine 1 Mann Armee die jeden zerfickt #pravibalkanci since 2016  backup @hurensohn8811", 
"snapchat:nejlabevab      Instagram; nejla_wiebke", NA, NA, "Mitglied der demokratisch nicht legitimierten Internetgemeinde, CCC Wien, here private, ask before quoting me offline, Whitehat-Troll, OE1LYA", 
"TRUTHER; ANTI NWO; ANTI ILLUMINATI;"), created_at = c("Sat Jul 23 15:41:46 +0000 2016", 
"Sat Jul 23 15:41:46 +0000 2016", "Sat Jul 23 15:41:47 +0000 2016", 
"Sat Jul 23 15:41:52 +0000 2016", "Sat Jul 23 15:41:57 +0000 2016", 
"Sat Jul 23 15:41:57 +0000 2016"), id_str = c("756876968612732928", 
"756876971875897344", "756876974384091136", "756876996366532610", 
"756877014758518784", "756877015173697540"), url = c(NA, NA, 
NA, "https://www.welt.de/politik/ausland/article157243137/Europa-nicht-in-der-Lage-seine-Buerger-zu-schuetzen.html", "https://www.welt.de/newsticker/news2/article157245377/Von-der-Leyen-Einsatz-der-Bundeswehr-in-Muenchen-wurde-erwogen.html", 
NA), expanded_url = c(NA, NA, "https://twitter.com/kippi666/status/756592450450382853", 
"http://www.welt.de/157243137", "http://www.welt.de/newsticker/news2/article157245377/Von-der-Leyen-Einsatz-der-Bundeswehr-in-Muenchen-wurde-erwogen.html", 
NA), followers_count = c(290, 18, 2, 35, 3278, 332), statuses_count = c(3526, 
266, 7, 3411, 115589, 4543), lang = c("de", "de", "de", "de", 
"de", "de"), text_clean = c("ich ficke den #muenchen in den arsch !", 
"rt @bmi_bund: #demaizière ist jetzt auf dem weg nach #muenchen, um sich vor ort ein bild zu machen. ", 
"rt @nielsruf: du wixa, du arschloch, du ", 
"rt @rudisagmal: #muenchen #wuerzburg #nizza \n\n\"europa nicht in der lage, seine bürger zu schützen\"\n ", 
"rt @metronaut: es brechen wirklich alle dämme: die regierung erwog ernsthaft beim amoklauf die bundeswehr im innern einzusetzen¦", 
"rt @mhoepflinger: #hamburg #muenchen #ard\nblut an ihren händen! angela merkel!\nniemand glaubt mehr an einen rücktritt der diktatorin!\nhttps…"
)), .Names = c("user_id_str", "screen_name", "name", "description", 
"created_at", "id_str", "url", "expanded_url", "followers_count", 
"statuses_count", "lang", "text_clean"), row.names = c(NA, 6L
), class = "data.frame") 

(我不得不删除推文中的短链接,我希望副本仍然有效)

系统信息(R.version 输出):

platform       x86_64-w64-mingw32
arch           x86_64
os             mingw32
system         x86_64, mingw32
status
major          3
minor          4.1
year           2017
month          06
day            30
svn rev        72865
language       R
version.string R version 3.4.1 (2017-06-30)
nickname       Single Candle

1 个答案:

答案 0 :(得分:0)

我正在寻找的是一个正则表达式,以下代码可以很好地解决问题:

# Keep only the tweets that contain the string "rt @" (i.e. the retweets).
# filter() is called through its namespace so that stats::filter() is not
# picked up by accident when dplyr has not been attached.
new_df <- dplyr::filter(twitter_data, grepl("rt @", text_clean))

# Split every text at ":" and keep the first piece ("rt @<name>").
# strsplit() is vectorised, so no index loop is needed; this also avoids
# `1:length(x)`, which counts 1, 0 and fails when no tweet matched.
# vapply() pins the result to one string per tweet (NA when a piece is missing).
before_colon <- vapply(strsplit(new_df$text_clean, ":"), `[`, character(1), 1)

# Split that piece at "@" and keep the second part ("<name>").
new_df$text_clean <- vapply(strsplit(before_colon, "@"), `[`, character(1), 2)

# The column new_df$text_clean now holds the name of the retweet receiver.