我有一个带文本的数据集。
# Toy data set: five short documents, each with a character id.
dat <- data.frame(
  id = c("1", "2", "3", "4", "5"),
  text = c(
    "hello", "hello you", "hello duck", "Dogs and cats",
    "hello cats, ducks and dogs"
  ),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
# Print the structure to confirm both columns are character vectors.
str(dat)
我想用 keras 为文本分类准备文本。当词表大小(num_words)和填充长度(maxlen)都比较小时,这样做没有问题。
library(keras)
install_keras()
library(dplyr)
# Tokenise the example texts, keeping at most the 10 most frequent words.
data <- dat$text
tok <- keras::text_tokenizer(
  num_words = 10, lower = TRUE, split = " ", char_level = FALSE
)
# The return value is deliberately not assigned: the fitted vocabulary is
# read back from `tok` itself in the next call.
keras::fit_text_tokenizer(tok, data)
# Convert each text into a vector of integer word indices.
data_idx <- keras::texts_to_sequences(tok, data)
# Pad every sequence with trailing zeros up to a fixed length of 10, which
# yields a matrix with one row per text.
data_idx <- data_idx %>%
  pad_sequences(maxlen = 10, padding = "post", value = 0)
> data_idx
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 0 0 0 0 0 0 0 0 0
[2,] 1 5 0 0 0 0 0 0 0 0
[3,] 1 6 0 0 0 0 0 0 0 0
[4,] 2 3 4 0 0 0 0 0 0 0
[5,] 1 4 7 3 2 0 0 0 0 0
但是,如果我增大词表大小和填充长度(处理真实文本时必须这样做),就会得到一个看起来很奇怪的填充结果。
# Same pipeline as before, but with a 10000-word vocabulary cap and a padded
# length of 10000 columns per text.
data <- dat$text
tok <- keras::text_tokenizer(
  num_words = 10000,
  lower = TRUE,
  split = " ",
  char_level = FALSE
)
keras::fit_text_tokenizer(tok, data)
data_idx <- keras::texts_to_sequences(tok, data)
# Post-pad with zeros; the result has one row per text and `maxlen` columns.
data_idx <- pad_sequences(
  data_idx,
  maxlen = 10000,
  padding = "post",
  value = 0
)
> data_idx
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
[,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49]
[,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73]
[,74] [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97]
[,98] [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118]
[,119] [,120] [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128] [,129] [,130] [,131] [,132] [,133] [,134] [,135] [,136] [,137] [,138]
[,139] [,140] [,141] [,142] [,143] [,144] [,145] [,146] [,147] [,148] [,149] [,150] [,151] [,152] [,153] [,154] [,155] [,156] [,157] [,158]
[,159] [,160] [,161] [,162] [,163] [,164] [,165] [,166] [,167] [,168] [,169] [,170] [,171] [,172] [,173] [,174] [,175] [,176] [,177] [,178]
[,179] [,180] [,181] [,182] [,183] [,184] [,185] [,186] [,187] [,188] [,189] [,190] [,191] [,192] [,193] [,194] [,195] [,196] [,197] [,198]
[,199] [,200] [,201] [,202] [,203] [,204] [,205] [,206] [,207] [,208] [,209] [,210] [,211] [,212] [,213] [,214] [,215] [,216] [,217] [,218]
我觉得是我哪里完全弄错了,但一直找不出问题所在。
答案 0 :(得分:2)
输出并没有错,我们只需要检查一下它的维度:
#include <stdio.h>
#include <stdlib.h>
/*
 * Split off the next field from the string at *pp.
 *
 * Scans forward from *pp for the separator `sep`. When it is found, the
 * separator is overwritten with '\0', *pp is advanced to the character just
 * after it, and a pointer to the start of the field is returned.
 *
 * Returns NULL, leaving *pp and the buffer untouched, when the end of the
 * string is reached before a separator is seen.
 */
char *getfield(char **pp, char sep) {
    char *start = *pp;
    char *cur = start;

    /* The separator is tested first, so sep == '\0' matches the terminator. */
    while (*cur != sep) {
        if (*cur == '\0')
            return NULL;
        cur++;
    }
    *cur = '\0';
    *pp = cur + 1;
    return start;
}
/*
 * Read tab-separated records from input_file.txt and print each one.
 *
 * Each line is expected to hold four fields: reference, description,
 * quantity and price, separated by tabs and ended by a newline. Lines that
 * do not yield all four fields are reported on stderr with their line number.
 *
 * Returns 0 on success, EXIT_FAILURE if the input file cannot be opened.
 */
int main() {
    char line[256];
    char filename[] = "input_file.txt";
    int lineno = 0;
    FILE *fp = fopen(filename, "r");

    if (fp == NULL) {
        /* Report the failure instead of silently doing nothing. */
        perror(filename);
        return EXIT_FAILURE;
    }
    while (fgets(line, sizeof line, fp)) {
        char *p = line;
        char *reference = getfield(&p, '\t');
        char *description = getfield(&p, '\t');
        char *quantity = getfield(&p, '\t');
        /* Keep the price as text here; it is converted when printed. */
        char *price = getfield(&p, '\n');

        lineno++;
        /*
         * Every field must be checked: when a tab is missing, getfield()
         * returns NULL without advancing p, so the final call can still
         * succeed while earlier fields are NULL.
         */
        if (reference != NULL && description != NULL &&
            quantity != NULL && price != NULL) {
            /* all fields were parsed correctly */
            printf("reference: %s\n", reference);
            printf("description: %s\n", description);
            printf("quantity: %d\n", atoi(quantity));
            printf("price: %.2f\n\n", strtod(price, NULL));
        } else {
            fprintf(stderr, "%s:%d: invalid line\n", filename, lineno);
        }
    }
    fclose(fp);
    return 0;
}
只是控制台只打印出了列标题;`dim(data_idx)` 的结果是:
#[1] 5 10000
受 `max.print` 选项的限制,控制台无法显示完整的输出。
如果只取其中一个子集(例如 `data_idx[, 1:10]`),就可以看到输出:
#[ reached getOption("max.print") -- omitted 5 rows ]