如何使用keras和pad_sequences填充R中的文本序列?

时间:2018-04-01 04:05:12

标签: r tensorflow keras

我有一个带文本的数据集。

# Toy corpus: five short documents, each keyed by a character id.
dat <- data.frame(
  id = c("1", "2", "3", "4", "5"),
  text = c(
    "hello",
    "hello you",
    "hello duck",
    "Dogs and cats",
    "hello cats, ducks and dogs"
  ),
  # Spell out FALSE: `F` is an ordinary variable and can be reassigned.
  stringsAsFactors = FALSE
)
str(dat)

我想用keras准备文本分类的文本。这适用于少量令牌和填充。

library(keras)
# NOTE(review): install_keras() sets up the Keras backend; presumably it only
# needs to run once per machine rather than on every execution -- confirm.
install_keras()
library(dplyr)

# Raw documents to tokenize; `dat` is the data frame built earlier.
data <- dat$text

# Build a word-level tokenizer (num_words = 10, lower-cased, split on spaces).
tok <- keras::text_tokenizer(
  num_words = 10,
  lower = TRUE,
  split = " ",
  char_level = FALSE
)
# The return value is deliberately not assigned: the script relies on `tok`
# itself being updated by the fit.
keras::fit_text_tokenizer(tok, data)
# Convert every document into a sequence of integer word indices.
data_idx <- keras::texts_to_sequences(tok, data)

# Pad each sequence at the end ("post") with 0 up to a length of 10.
data_idx <- data_idx %>%
  pad_sequences(maxlen = 10, padding = "post", value = 0)

> data_idx
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    0    0    0    0    0    0    0    0     0
[2,]    1    5    0    0    0    0    0    0    0     0
[3,]    1    6    0    0    0    0    0    0    0     0
[4,]    2    3    4    0    0    0    0    0    0     0
[5,]    1    4    7    3    2    0    0    0    0     0

但是,如果我增大令牌数量和填充长度(处理真实文本时必须这样做),得到的填充序列看起来很奇怪。

# Same pipeline as before, but with a 10000-word vocabulary and a padded
# length of 10000, so the result has 10000 columns per document.
data <- dat$text

tok <- keras::text_tokenizer(
  10000,
  lower = TRUE,
  split = " ",
  char_level = FALSE
)
keras::fit_text_tokenizer(tok, data)

# Tokenize and pad in a single nested call instead of a pipe.
data_idx <- pad_sequences(
  keras::texts_to_sequences(tok, data),
  maxlen = 10000,
  padding = "post",
  value = 0
)

> data_idx
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
     [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49]
     [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73]
     [,74] [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97]
     [,98] [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118]
     [,119] [,120] [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128] [,129] [,130] [,131] [,132] [,133] [,134] [,135] [,136] [,137] [,138]
     [,139] [,140] [,141] [,142] [,143] [,144] [,145] [,146] [,147] [,148] [,149] [,150] [,151] [,152] [,153] [,154] [,155] [,156] [,157] [,158]
     [,159] [,160] [,161] [,162] [,163] [,164] [,165] [,166] [,167] [,168] [,169] [,170] [,171] [,172] [,173] [,174] [,175] [,176] [,177] [,178]
     [,179] [,180] [,181] [,182] [,183] [,184] [,185] [,186] [,187] [,188] [,189] [,190] [,191] [,192] [,193] [,194] [,195] [,196] [,197] [,198]
     [,199] [,200] [,201] [,202] [,203] [,204] [,205] [,206] [,207] [,208] [,209] [,210] [,211] [,212] [,213] [,214] [,215] [,216] [,217] [,218]

我认为我完全错了,但我无法解决。

1 个答案:

答案 0 :(得分:2)

输出没有错。我们需要检查维度

#include <stdio.h>
#include <stdlib.h>

/*
 * Extract the next field from the string at *pp, splitting in place.
 *
 * Scans forward from *pp for the first occurrence of `sep`. When found, the
 * separator is overwritten with '\0', *pp is advanced to the character just
 * after it, and a pointer to the start of the field is returned.
 *
 * Returns NULL, leaving *pp untouched, if the end of the string is reached
 * before any separator is seen.
 */
char *getfield(char **pp, char sep) {
    char *start = *pp;
    char *cur = start;

    /* The separator test comes first, exactly as in the scan it replaces. */
    while (*cur != sep) {
        if (*cur == '\0')
            return NULL;
        cur++;
    }

    *cur = '\0';      /* terminate the field where the separator was */
    *pp = cur + 1;    /* the next call resumes after the separator */
    return start;
}

/*
 * Read "input_file.txt" line by line. Each line is expected to hold four
 * fields -- reference, description, quantity, price -- where the first three
 * are terminated by a tab and the last by a newline.
 *
 * Well-formed lines are printed to stdout; malformed lines are reported on
 * stderr with the file name and line number.
 *
 * Returns 0 on success, EXIT_FAILURE if the input file cannot be opened.
 */
int main(void) {
    char line[256];
    char filename[] = "input_file.txt";
    int lineno = 0;
    FILE *fp = fopen(filename, "r");

    if (fp == NULL) {
        /* Report the failure instead of silently exiting with success. */
        perror(filename);
        return EXIT_FAILURE;
    }

    while (fgets(line, sizeof line, fp)) {
        char *p = line;
        char *reference = getfield(&p, '\t');
        char *description = getfield(&p, '\t');
        char *quantity = getfield(&p, '\t');
        /* Keep the raw text here; it is converted with strtod() on output. */
        char *price = getfield(&p, '\n');

        lineno++;
        /*
         * getfield() returns NULL without advancing when its separator is
         * missing, so a line with too few tabs can still yield a non-NULL
         * price. Every field must be checked before it is used.
         */
        if (reference != NULL && description != NULL &&
            quantity != NULL && price != NULL) {
            /* all fields were parsed correctly */
            printf("reference: %s\n", reference);
            printf("description: %s\n", description);
            printf("quantity: %d\n", atoi(quantity));
            printf("price: %.2f\n\n", strtod(price, NULL));
        } else {
            fprintf(stderr, "%s:%d: invalid line\n", filename, lineno);
        }
    }

    fclose(fp);
    return 0;
}

控制台只打印出了列标题:由 dim(data_idx) #[1] 5 10000 可知矩阵共有 10000 列,受 max.print 选项的限制,R 无法显示完整的输出。

max.print

如果只取矩阵的一个子集(例如 data_idx[, 1:10]),就可以看到输出

#[ reached getOption("max.print") -- omitted 5 rows ]