我正在编写一个R脚本,用于查找文本中的关键词。目前,我只能得到一个.csv文件(abstract_atomized.csv),其中的单词已被拆分(atomized)并按词频排序。我还有另一个.csv列表(wordlistenglish.csv),其中包含一组常见但无用的英语单词,我想从第一个列表中删除这些单词,以便仅保留相关的词条。我不知道如何在R中高效地完成此操作。您能帮我吗?
谢谢。
library(pubmed.mineR)
library(scholar)
library(tools)
library(stringr)
# Build the output file name from the input: the value of `abstract` with its
# extension removed, followed by "_atomized.csv".
# NOTE(review): `abstract` is defined outside this excerpt; presumably it is
# the path to the abstracts file - confirm.
file_without_ext <- file_path_sans_ext(abstract)
atomized_file_name <- paste0(file_without_ext, "_atomized.csv")

# Read the abstracts and split them into a word table, then save that table
# as CSV under the derived name.
abstractR <- readabs(abstract)
atomized_text <- word_atomizations(abstractR)
write.csv(atomized_text, file = atomized_file_name)

# Tell the user where the word table was written.
output_text <- paste0(
  "Most used words described in ",
  atomized_file_name,
  " take the time to read them and to select the relevent key words"
)
print(output_text)
# Remove common English words from the atomized word table and save the
# remaining rows to "abstract_atom.csv".

# Word/frequency table written by the atomization step; the column `words`
# holds the tokens. Read as plain character strings rather than factors.
abstract_atom <- read.csv("abstract_atomized.csv", stringsAsFactors = FALSE)

# NOTE(review): the posted structure of this table has a single column named
# `the`, which indicates the file has no header row and its first word was
# consumed as the column name. Read it without a header so that no word is
# lost - confirm against the actual file.
wordlist <- read.csv(
  "wordlistenglish.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Take the first column by position; there is no column called `words` here.
common_words <- wordlist[[1]]

# Keep only the rows whose word is not in the common-word list. The result
# must be assigned: subsetting alone does not modify `abstract_atom`.
abstract_atom <- abstract_atom[!(abstract_atom$words %in% common_words), ]

write.csv(abstract_atom, file = "abstract_atom.csv")
更新
数据结构(由于字符太多,我不能放整个结构:第一个列表约6000个单词,第二个约1000个单词)
atomlist结构:
words = structure(c(2772L, 4003L, 737L, 2371L, 3797L, 4988L
), .Label = c("-29", "-325", "-328", "-337", "-59", "-dependent",
"-dichlorophenyl)-1", "-disulfonic", "-induced", "-maleimidyldistilbene-2",
"-sh", "-so(3)(-))", "\"giacomo", "\"paradox", "(-323", "(-335",
"(-s-)", "(-so(-))", "(-so(2)(-)", "(#)contributed", "(1)department",
"(1)institut", "(1)instituto", "(1)laboratoire", "(1)laboratory",
"(1)plant", "(1)unité", "(10)laboratory", "(2-cys", "(2)bio-pharmaceutical",
"(2)department", "(2)sorbonne", "(2)université", "(219)cgpc(222)",
"(28)wcsys(32)", "(3)institute", "(3)laboratoire", "(3)laboratory",
"(3)plant", "(3d)", "(4)laboratoire", "(4)laboratory", "(4)sorbonne",
"(5)bio-pharmaceutical", "(5)department", "(6)laboratoire",
"(6)laboratory", "(6)plant", "(7)laboratoire", "(7)laboratory",
"(7)spemann", "(8)laboratory", "(8)université", "(9)laboratoire",
"(a(4)", "(a(4)-gapdh)", "(and", "(arabidopsis", "(aromatic",
"(atprk)", "(b-containing", "(bio-phase)", "(biogssg)", "(bioss)",
"(c86)", "(cb)", "(cr)", "(crpgk1)", "(crprk)", "(crtk)",
"(crtkapo)", "(cx(2)c)", "(cys(149)-ssg)", "(cys(29))", "(cys(87))",
"(dcmu)", "(deduced", "(diamide", "(e", "(e(m))", "(eda)",
"(eeg)", "(er)", "(fd)", "(for", "(frias)", "(ftr)", "(gapc1",
"(gapdh)", "(glyceraldehyde-3-phosphate", "(glycine", "(gpxs)",
"(grx)", "(grxs)", "(gsh)", "(h2o2)", "(heat-shock", "(hsp70",
"(i", "(icl)", "(inra)", "(isoform", "(lhcii)", "(metso)",
"(mms)", "(msrs)", "(multifunctional)", "(nadp-mdh)", "(negative",
"(no)", "(o2*)", "(pdi)", "(pgk1)", "(phaseolus", "(pk(a)=5",
"(pm)", "(populus", "(prk)", "(prxii)", "(prxs)", "(ptm)",
"(real", "(reduced", "(rns)", "(ros)", "(sll1621)", "(sll1908)",
"(slr1562", "(slr1849)", "(sno)", "(sorghum", "(spinacia",
"(ss)", "(ssg)", "(tk)", "(tpi)", "(tpp)", "(trx", "(trx)",
"(trx)-dependent", "(trxf)", "(trxh1)", "(trxh2)", "(trxs)",
"(upmc)", "(β/α)8-barrel", "[(35)s]cysteine", "[4fe-4s]",
"[fe2s2]", "[gsh]/[gsno]", "[gsh]/[gssg]", "&", "+/-", "+300",
"+80", "<2-fold", "~10-fold", "~20-fold", "~6-fold", "06",
"1-cys", "1-dimethylurea", "10 μm", "1052", "11", "110",
"1136", "1188", "119", "12", "125", "1278", "13", "133",
"1417", "16", "18", "190", "2-cys", "2-cys-peroxiredoxin",
"2)", "200", "225", "24", "25", "26", "29", "2nd", "3-(3",
"3-bisphosphoglycerate", "3-phosphoglycerate", "30", "33",
"381", "383", "392", "3d", "3d-structure", "3rd", "4-acetamido-4",
"40126", "41092", "42", "492", "54506", "55", "56", "561",
"7)", "70 kda)", "70803", "75005", "79104", "81", "8226",
"86", "8600", "8618", "9)", "90", "90095", "91405", "94720-3102",
"98%", "å", "a(2)b(2)", "a(4)-gapdh", "a(4)-glyceraldehyde-3-phosphate",
"a(8)b(8)", "a(n)b(n)-gapdh", "aa", "ab", "åb", "abdelmohsen",
"abe", "abedin", "abeliovich", "ability", "abiotic", "able",
"abnormally", "absence", [...] "zhong", "zhou", "zhu", "zhuang",
"zimmer", "zio", "ziparo", "zj", "zm", "zn", "zois", "zoladek",
"zong", "zorzano", "zughaier", "zw", "zx", "β-sheet"), class = "factor"),
Freq = c(161L, 150L, 114L, 98L, 90L, 79L)), row.names = c(NA,
6L), class = "data.frame")
单词列表结构
structure(list(the = structure(c(8762L, 9971L, 9929L, 9917L,
9628L, 9437L), .Label = c("a", "aa", "aaa", "aaron", "ab", "abandoned",
"abc", "aberdeen", "abilities", "ability", "able", "aboriginal",
"abortion", "about", "above" [...] "yukon", "z", "za", "zambia", "zdnet", "zealand", "zen", "zero",
"zimbabwe", "zinc", "zip", "zoloft", "zone", "zones", "zoning",
"zoo", "zoom", "zoophilia", "zope", "zshops", "zu", "zum", "zus"
), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
答案 0 :(得分:2)
这是您应该使用的通用格式,请根据您的实际数据结构进行调整:
# Flag the rows of list1 whose word also occurs in list2, then keep the rest.
is_listed <- list1$words %in% list2$words
filtered_list <- list1[!is_listed, ]
它会保留list1中所有其单词未出现在list2中的行。
如果您想使用tidyverse(速度较慢,但通常更便于编写代码)来完成此操作,
则代码如下所示:
# Load dplyr with library(): it stops with an error when the package is not
# installed, whereas require() only returns FALSE and lets the script fail
# later at the first dplyr call.
library(dplyr)

# Keep the rows of list1 whose `words` value does not occur in list2$words.
filtered_list <- list1 %>%
  filter(!(words %in% list2$words))