我创建了一个文档-词项矩阵(document-term matrix),用于在文本中查找 100000 到 600000 之间的数字,以解决一些数据挖掘问题。但我注意到结果并不是我想要的数字:它似乎会把被空格或小数点隔开的数字拼接在一起,并作为单个数字返回。
这是我的代码:
library(text2vec)

# Three sample documents mixing words, integers and decimal numbers.
docs <- c(
  doc1 = " letter ltetter (-è) 323.456 1 789 ",
  dc2 = "letters 123.45 1letters 100000 98 76 54 ",
  dc3 = "123456789 454321 letters 124 258 "
)

# Strip every character that is neither a digit nor a space. The decimal
# point is stripped as well, so "323.456" is left as "323456".
docs <- gsub("[^0-9 ]", "", docs, perl = TRUE)

# Build the document-term matrix from the cleaned documents.
doc_tokens <- itoken(docs, tokenizer = word_tokenizer, ids = names(docs))
doc_vocab <- create_vocabulary(doc_tokens)
doc_vectorizer <- vocab_vectorizer(doc_vocab)
dtm <- create_dtm(doc_tokens, doc_vectorizer)

# Show only the term columns whose name is an integer in 100000..600000.
(dtm[, colnames(dtm) %in% 100000:600000])
3 x 4 sparse Matrix of class "dgCMatrix"
100000 454321 323456
doc1 . . 1
dc2 1 . .
dc3 . 1 .
提取出的 100000 是正确的:它在所需范围内(100000 到 600000)。
454321 是正确的:它在所需范围内(100000 到 600000)。
323456 是错误的:文档中的数字是 323.456,它不在该范围内,却被提取了出来。
如何调整代码,使其只返回 100000 到 600000 之间的数字?
答案 0 :(得分:1)
您可以用单词边界 \\b 来匹配恰好 6 位的数字:第一位是 1-6(即 [1-6]),后面跟任意 5 位数字(即 [0-9]{5})。
注意:该模式本身也会匹配 600001 到 699999,如果需要严格的上限,还要对提取结果做数值比较。
library(stringr)

# For each document, extract the stand-alone 6-digit tokens and keep only those
# whose numeric value is at most 600000. The pattern on its own also accepts
# 600001-699999, so the upper bound has to be enforced numerically; the lower
# bound (100000) is already guaranteed by the pattern.
docs_list <- lapply(docs, function(x) {
  candidates <- str_extract_all(x, "\\b[1-6][0-9]{5}\\b")[[1]]
  candidates[as.integer(candidates) <= 600000L]
})

# Report documents without any match as NA; otherwise unlist() would silently
# drop them from the result.
docs_list[lengths(docs_list) == 0L] <- NA
unlist(docs_list)
doc1 dc2 dc3
NA "100000" "454321"
答案 1 :(得分:1)
如果我正确理解了您的问题,您是希望从文档中提取所有数字,并保留其中的小数点。
所以您可以这样做:
# Reduce each document to its numbers (decimal parts kept), separated by
# single spaces. vapply() guarantees a character vector even for empty input.
docs <- vapply(docs, function(doc) {
  # A number is a run of digits optionally followed by one "." and more
  # digits, so "323.456" stays whole while a trailing or repeated "." is not
  # absorbed into the match.
  nums <- regmatches(doc, gregexpr("[0-9]+(\\.[0-9]+)?", doc))
  paste(unlist(nums), collapse = " ")
}, character(1))
docs
# doc1 dc2
# "323.456 1 789" "123.45 1 100000 98 76 54"
# dc3
# "123456789 454321 124 258"
如果我们对处理后的 docs 运行其余代码:
library(text2vec)

# Tokenise the cleaned documents and build the document-term matrix.
doc_tokens <- itoken(docs, tokenizer = word_tokenizer, ids = names(docs))
doc_vocab <- create_vocabulary(doc_tokens)
doc_vectorizer <- vocab_vectorizer(doc_vocab)
dtm <- create_dtm(doc_tokens, doc_vectorizer)

# Keep only the term columns named after an integer in 100000..600000.
wanted_terms <- colnames(dtm) %in% 100000:600000
dtm[, wanted_terms]
# 3 x 2 sparse Matrix of class "dgCMatrix"
# 454321 100000
# doc1 . .
# dc2 . 1
# dc3 1 .
答案 2 :(得分:1)
您必须在 gsub 的正则表达式中保留小数点,否则它会和其他字符一起被删除。
library(text2vec)
library(reshape)

docs <- c(
  doc1 = " letter ltetter (-è) 323.456 1 789 ",
  dc2 = "letters 123.45 1letters 100000 98 76 54 ",
  dc3 = "123456789 454321 letters 124 258 "
)

# If the documents use decimal commas, turn every comma into a decimal point
# first (gsub rather than sub, so that all occurrences are converted).
docs <- gsub(",", ".", docs, fixed = TRUE)

# Drop everything except digits, decimal points and spaces, so "323.456"
# stays a decimal number instead of collapsing into "323456".
docs <- gsub("[^0-9. ]", "", docs, perl = TRUE)

# Create the document-term matrix.
doc_tokens <- itoken(docs, tokenizer = word_tokenizer, ids = names(docs))
doc_vocab <- create_vocabulary(doc_tokens)
doc_vectorizer <- vocab_vectorizer(doc_vocab)
dtm <- create_dtm(doc_tokens, doc_vectorizer)

# Numeric value of every term. Terms that are not valid numbers become NA
# (the coercion warning is expected) and are excluded from the selection.
term_values <- suppressWarnings(as.numeric(colnames(dtm)))
in_range <- !is.na(term_values) &
  term_values >= 100000 & term_values <= 600000

# drop = FALSE keeps a matrix even when only one term falls in the range.
counts <- as.matrix(dtm[, in_range, drop = FALSE])

# Long format: one row per (document, number) pair that actually occurs.
df_melted <- melt(counts)
df_melted <- df_melted[which(df_melted$value != 0), ]
colnames(df_melted) <- c("Document", "Number Found", "times")
结果如下:
Document Number Found times
2 dc2 100000 1
6 dc3 454321 1