我正在尝试在语料库中使用单词核心,但是当我使用" mv"重新调整代码无法将我选择的参考文本设置为参考文本。此外,即使我将-1和1作为参考值建立,它在重新缩放时也超出了它们。它适用于" lbg"尽管如此重新调整。我希望将-1值分配给" 1999_St_CON"和1到" 1999_St_SNP"。虽然它适用于前者,但它不适用于第二种,而是将其分配给" 1999_St_FAW",子集语料库的第二个。感谢。
以下是代码:
# load
library(quanteda)
require(readtext)
library(stringr)
library(dplyr)
library(tidyr)
library(stringr)
library(rowr)
###Load all general debates
DPG <- readtext("~Parliamentary session/CP/*.txt", encoding="utf-8")
DPG
DPGcorp <- corpus(DPG)
docnames(DPGcorp) <- (DPG$doc_id)#change the names of the documents extracting the text from the default column created by quanteda
summary(DPGcorp)
###Create a new docvar (create a new variable for the document, the party variable)
docvars(DPGcorp, "Year") <- substring(names(texts(DPGcorp)),1,4)
docvars(DPGcorp, "Party") <- substring(names(texts(DPGcorp)),9,11)
summary(DPGcorp)
#wordscores
corpus1999 <- corpus_subset(DPGcorp, Year==1999)#select year 1999
summary(corpus1999)
dfm1999 <- dfm(corpus1999, stem = TRUE, remove = stopwords("english"), remove_punct = TRUE)
head(dfm1999)
#Reference scores
refscores <- rep(NA,nrow(dfm1999))#repeat NA for the number of rows of the dfm
refscores[str_detect(rownames(dfm1999), "1999_St_CON")] <- -1
refscores[str_detect(rownames(dfm1999), "1999_St_SNP")] <- 1
#Wordscore model
ws1999 <- textmodel_wordscores(dfm1999, refscores, scale="linear", smooth=1)
ws1999
wordscore1999 <- predict(ws1999, rescaling="mv")
wordscore1999
#Writing the results into data frame
ws.1999 <- data.frame(cbind(docvars(corpus1999),
wordscore1999))
ws.1999
ws.1999 <- dplyr::rename(ws.1999, wscore = wordscore1999)
ws.1999
这是输出:
> corpus1999 <- corpus_subset(DPGcorp, Year==1999)
> summary(corpus1999)
Corpus consisting of 7 documents:
Text Types Tokens Sentences doc_id Year Party
1999_St_CON.txt 390 948 32 1999_St_CON.txt 1999 CON
1999_St_FAW.txt 181 394 16 1999_St_FAW.txt 1999 FAW
1999_St_GOV.txt 560 2126 84 1999_St_GOV.txt 1999 GOV
1999_St_LAB.txt 289 747 36 1999_St_LAB.txt 1999 LAB
1999_St_LIB.txt 258 640 26 1999_St_LIB.txt 1999 LIB
1999_St_SNP.txt 393 1201 41 1999_St_SNP.txt 1999 SNP
1999_St_SSP.txt 278 632 25 1999_St_SSP.txt 1999 SSP
>
> dfm1999 <- dfm(corpus1999, stem = TRUE, remove = stopwords("english"),
remove_punct = TRUE)
> head(dfm1999)
Document-feature matrix of: 6 documents, 939 features (75.6% sparse).
>
> #Reference scores
> refscores <- rep(NA,nrow(dfm1999))#repeat NA for the number of rows of
the dfm
>
> refscores[str_detect(rownames(dfm1999), "1999_St_CON")] <- -1
> refscores[str_detect(rownames(dfm1999), "1999_St_SNP")] <- 1
>
> #Wordscore model
> ws1999 <- textmodel_wordscores(dfm1999, refscores, scale="linear",
smooth=1)
> ws1999
Call:
textmodel_wordscores.dfm(x = dfm1999, y = refscores, scale = "linear",
smooth = 1)
Scale: linear; 2 reference scores; 939 scored features.
> wordscore1999 <- predict(ws1999, rescaling="mv")
> wordscore1999
1999_St_CON.txt 1999_St_FAW.txt 1999_St_GOV.txt 1999_St_LAB.txt
-1.0000000 1.0000000 0.7614462 1.3593657
1999_St_LIB.txt 1999_St_SNP.txt 1999_St_SSP.txt
1.0536728 3.5124870 0.9350710
>
> #Writing the results into data frame
> ws.1999 <- data.frame(cbind(docvars(corpus1999),
+ wordscore1999))
> ws.1999
doc_id Year Party wordscore1999
1999_St_CON.txt 1999_St_CON.txt 1999 CON -1.0000000
1999_St_FAW.txt 1999_St_FAW.txt 1999 FAW 1.0000000
1999_St_GOV.txt 1999_St_GOV.txt 1999 GOV 0.7614462
1999_St_LAB.txt 1999_St_LAB.txt 1999 LAB 1.3593657
1999_St_LIB.txt 1999_St_LIB.txt 1999 LIB 1.0536728
1999_St_SNP.txt 1999_St_SNP.txt 1999 SNP 3.5124870
1999_St_SSP.txt 1999_St_SSP.txt 1999 SSP 0.9350710
>
> ws.1999 <- dplyr::rename(ws.1999, wscore = wordscore1999)
> ws.1999
doc_id Year Party wscore
1999_St_CON.txt 1999_St_CON.txt 1999 CON -1.0000000
1999_St_FAW.txt 1999_St_FAW.txt 1999 FAW 1.0000000
1999_St_GOV.txt 1999_St_GOV.txt 1999 GOV 0.7614462
1999_St_LAB.txt 1999_St_LAB.txt 1999 LAB 1.3593657
1999_St_LIB.txt 1999_St_LIB.txt 1999 LIB 1.0536728
1999_St_SNP.txt 1999_St_SNP.txt 1999 SNP 3.5124870
1999_St_SSP.txt 1999_St_SSP.txt 1999 SSP 0.9350710
>