R中的wordcloud
库提供了一种可视化一袋单词的简洁方法。我正在进行潜在的语义分析,并希望可视化空间组织的单词。我一直在重新编写库中的wordcloud
函数来绘制没有碰撞的空间组织单词。
我遇到的问题是算法的基本前提。我遍历每个单词,并测量它的大小。然后,使用overlap
函数检查单词框是否重叠,我沿着它们的向量从原点滑动单词,直到它们不重叠。现在有一个方框列表,没有一个重叠,我可以绘制这个图:
然而,当我使用text
函数来编写单词时,我的语气会出现冲突:
如何强制文本在最初测量的框内绘制以适合内部?我很确定这个问题与cex
如何根据图表中的像素数量而不是轴的极限大小相关,但我不知道如何根据轴而不是像素。或者,如果有一种设置图形中像素的方法,以及如何增加像素数量以调整文本大小的一般规则,我也有兴趣知道这一点。
以下是重现的数据和代码:
x = c(0.0224632572704955, 0.0789587202250317, 0.0274591067722305, -0.0215217178576816, -0.0463136563269626, 0.0777315272157781, -0.0136566199077556, -0.0822360321803568, 0.01830065120482, 0.0837054635574331, -0.0838847576602805, 0.0581030843550272, -0.0846639528682175, -0.0363582203093014, -0.0835228445897983, -0.0857369307718219, -0.0278922709863537, 0.0849891449442603, 0.0778496060942224, 0.0494913027256782, -0.0861355504659213, 0.0831126881041073, 0.0681440084766314, -0.0650637986694017, -0.0863135597714858, -0.0373928848380089, 0.074049835693315, 0.0398077951711342, 0.0843457836924724, -0.0899133240081401, 0.0871469163578526, -0.0213824408924798, -0.0882376644590973, 0.000804796205293155, 0.0790127624196678, 0.0131045301900216, -0.0097660701059642, 0.060740028037966, -0.0924082870436164, -0.0251299367403581, -0.0926467825295682, -0.0141241186540234, -0.00310159424174361, 0.0787547867577223, -0.0260184580833716, 0.0517140709157018, -0.0491806875142526, -0.00524874012683386, -0.00464000490379075, -0.00420786759489088, 0.00322708905712382, -0.101134456332954, -0.0418549470890533, 0.102100219195374, -0.071862111421196, -0.0434097834006479, 0.0776191187925381, 0.1060046720406, -0.0272119968043285, -0.106816653394793, 0.0287332895600156, -0.0912972395214369, -0.0366978670141858, -0.00838648870281447, 0.115639532976652, 0.0361645175825202, 0.0713906746569677, -0.0431317109494537, -0.028564774163145, -0.00523293700560141, -0.00575635646273777, -0.0331782186958773, 0.124661327465304, 0.0315968070860678, -0.0558444285994164, -0.114728260657278, -0.0217221489719173, 0.136336954261077, -0.137719537096706, 0.0641498138158443, -0.139031136774261, 0.00680615901169146, -0.0379560303006893, 0.0475675308733084, -0.141738379598886, 0.0619691952983009, -0.0324363288760584, 0.0350539739326913, 0.0076137234015135, -0.0739266927217484, 0.0372857817004895, 0.153202967109765, -0.0155256753396782, 0.00857420592583181, -0.177745604714841, 0.0795229711787737, -0.0471817920012264, -0.02409890273062, 0.196853164152666, 0.204447953262544)
y = c(-0.0752776571230846, -0.00672122916072416, 0.0758086939339219, 0.0785861789639402, -0.0677055410096309, 0.0272785745148846, -0.0813248054739799, 0.0078475780115705, 0.0810146964880099, -0.00478084767048896, 0.00565925463584872, -0.0608790343449287, -0.00555148434178386, -0.0766928751630864, 0.0169646486204334, -0.00334540779833788, 0.0816797036428977, 0.0159111526606456, -0.0377749477564506, 0.0713097047786406, 0.0156471598878643, -0.0276247455980099, 0.0553499635679889, -0.0602588336256788, 0.0203869390323236, 0.0810150278030348, -0.0497827486612123, -0.0799237894454277, -0.0302989442794288, -0.00298711542054578, 0.0236117049639803, -0.0877260872513847, -0.0193599518017899, -0.0908245510012227, -0.0458884920692699, 0.0911063173756906, -0.0916168143543014, -0.0695525217155883, 0.00250090928101712, 0.0891631284676921, 0.00538569085280678, 0.0919523561635045, 0.0940243248638534, 0.0524309559381309, -0.0912540520367487, 0.0800810133995442, -0.0840852427280262, -0.0984769093381384, -0.0985077379630162, -0.0987069709043514, -0.0989748998527878, 0.00452437059722756, -0.0930322733809224, 0.00069319951483364, 0.0740813318736554, 0.0948855232245366, 0.070645224239989, -0.0103845525763274, -0.103024065101316, -0.00789743078596155, 0.105453142673941, 0.0652869710460866, -0.10890208867039, -0.114697484904734, 0.00413462223558936, 0.110353491923549, 0.0944715994210254, 0.110990259989688, 0.115707989007745, -0.120175636155138, -0.120501392622881, 0.1163418318335, -0.0134237550525085, 0.122300074894697, -0.114937000286942, 0.0634133444114911, 0.132638479157556, 0.00851766613319355, -0.00269271356832806, 0.12226959905812, -0.00682649454407891, 0.13966123914174, 0.135636860115536, 0.133453551172787, 0.0124344798034275, -0.13074953842002, 0.141239398728931, 0.141869629560279, -0.152407018146679, -0.133604473268183, -0.148181211362048, 0.00171078006359565, 0.15792413596998, -0.177085008252531, 0.00233033011911132, -0.159325513391783, 0.172886877895966, -0.187762962706901, -0.0305548311528385, -0.00860796795425707)
words = c("self", "due", "pretty", "retire", "location.", "free", "oriented", "different", "notice", "excel", "environment", "superior", "cooperate", "stability", "goal", "effective", "value.", "important", "now", "practice", "employer", "last", "top", "strength", "genuine", "significant", "sustainable", "proud", "meet", "decent", "research", "specialty", "grew", "success.", "lead", "provide.", "wage.", "manager.", "knowledge", "nothing", "impact", "salary.", "thing", "type", "progress", "pursue", "show", "play", "result.", "principles", "realize", "limited", "time", "deal", "special", "think", "place", "door", "throughout", "effort.", "want", "treat", "still", "raise", "everyday", "texas", "something", "well", "reputable", "recognizes", "origin", "organic", "day.", "resources", "quit", "supervisor", "result", "customer", "daily", "success", "coworkers.", "without", "task", "solid", "growth.", "succeed", "workforce", "talent", "provide", "see", "tackle", "deliver", "quality", "present.", "decisions.", "really", "program", "year.", "gain", "education")
library(wordcloud)
freq = rep(1,length(x))
max.words = 100
min.freq = 1
scale = c(0.7,.1)
colors='black'
tails <- "g|j|p|q|y"
last <- 1
nc<- length(colors)
dfc <- sqrt(x^2+y^2)
overlap <- function(x1, y1, sw1, sh1) {
s <- 0
if (length(boxes) == 0)
return(FALSE)
for (i in c(last,1:length(boxes))) {
bnds <- boxes[[i]]
x2 <- bnds[1]
y2 <- bnds[2]
sw2 <- bnds[3]
sh2 <- bnds[4]
if (x1 < x2)
overlap <- x1 + sw1 > x2-s
else
overlap <- x2 + sw2 > x1-s
if (y1 < y2)
overlap <- overlap && (y1 + sh1 > y2-s)
else
overlap <- overlap && (y2 + sh2 > y1-s)
if(overlap){
last <<- i
return(TRUE)
}
}
FALSE
}
ord <- rank(-dfc, ties.method = "random")
words <- words[ord<=max.words]
freq <- freq[ord<=max.words]
dfc <- dfc[ord<=max.words]
x <- x[ord<=max.words]
y <- y[ord<=max.words]
ord <- order(dfc,decreasing=FALSE)
words <- words[ord]
freq <- freq[ord]
x <- x[ord]
y <- y[ord]
words <- words[freq>=min.freq]
freq <- freq[freq>=min.freq]
x <- x[freq>=min.freq]
y <- y[freq>=min.freq]
rStep <- .05
normedFreq <- freq/max(freq)
size <- (scale[1]-scale[2])*normedFreq + scale[2]
boxes <- list()
xy <- list()
plot.new()
op <- par("mar")
par(mar=c(0,0,0,0))
for(i in 1:length(words)){
x1<-x[i]
y1<-y[i]
if(x1==0 & y1==0){
bigger<-sample(c(1,2),1)
bSign<-sample(c(-1,1),1)
ratio<-sample(c(-1,1),1)
} else {
coords<-c(x1,y1)
bigger<-which.max(abs(coords))
ratio<-coords[-bigger]/coords[bigger]
bSign<-sign(coords[bigger])
}
wid <- strwidth(words[i],cex=size[i])
ht <- strheight(words[i],cex=size[i])
#mind your ps and qs
if(grepl(tails,words[i]))
ht <- ht + ht*.2
isOverlaped <- TRUE
while(isOverlaped){
print(paste0(words[i],": ",overlap(x1-.5*wid,y1-.5*ht,wid,ht)))
if(!overlap(x1-.5*wid,y1-.5*ht,wid,ht)){
#rect(x1-.5*wid,y1-.5*ht,x1+.5*wid,y1+.5*ht)
boxes[[length(boxes)+1]] <- c(x1-.5*wid,y1-.5*ht,wid,ht)
xy[[length(xy)+1]] <- c(x1,y1)
isOverlaped <- FALSE
}else{
if(bigger==1){
x1<-x1+rStep*bSign
y1<-y1+rStep*ratio
} else {
x1<-x1+rStep*ratio
y1<-y1+rStep*bSign
}
}
}
}
points<-do.call(rbind,boxes)
points<-rbind(cbind(points[,1],points[,1]+points[,3]),cbind(points[,2],points[,2]+points[,4]))
lims<-c(min(points[,1]),max(points[,1]))
lims<-c(-max(abs(lims)),max(abs(lims)))
plot.window(c(min(points[,2])*1.05,max(points[,2])*1.05),c(min(points[,1])*1.05,max(points[,1])*1.05))
for(i in 1:length(words)){
cc <- colors[sample(1:nc,1)]
text(xy[[i]][1],xy[[i]][2],words[i],cex=size[i],offset=0,col=cc)
#rect(boxes[[i]][1],boxes[[i]][2],boxes[[i]][1]+boxes[[i]][3],boxes[[i]][2]+boxes[[i]][4])
}
par(mar=op)
答案 0 :(得分:1)
strWidth
和strHeight
函数根据当前坐标系计算宽度和高度。但是当你推出改变坐标集的框时,这样当你重新绘制数据时,高度和宽度不再与文本匹配(框看起来很好,因为坐标与坐标系一致)。想想你是否将一个盒子移动得很远,所有盒子都会收缩以匹配新的坐标系。
有两种选择:
从足够大的绘图开始,使得框不会到达边界并更改坐标系。
计算总体范围增加了多少,并使用cex
参数将文本缩小到适当的数量。
在每次迭代时使用新坐标系重新计算框的大小,以便框与文本的实际大小相匹配。