Question

数据中有 10000 个 item_id，每条记录都带有项目描述，并且同一个 item_id 会在多行中重复出现。如何使用 SAS（不使用数组）统计某个特定 item_id 的项目描述列中每个单词出现的频率？

目标是确定特定item_id的关键字。

Answer 1

以下方法利用Proc Freq获取'关键字'分布。

/* Sample input: one row per (id, free-text description); ids repeat. */
data have;
    /* TRUNCOVER: a line shorter than the informat width is read as-is
       instead of pulling in the next record. */
    infile datalines truncover;
    input id var $ 100.;
datalines;
1 This is test test
2 failed
1 be test
2 failed is
3 success
3 success ok
;

/* Explode each description into one output row per word.
   Output columns: id, var (the original text) and new_var (a single word). */
data want;
    set have;
    drop word_idx;                      /* loop counter is not part of the result */
    do word_idx = 1 to countw(var);     /* countw() is 0 for a blank description */
        new_var = scan(var, word_idx);  /* word_idx-th word, default delimiters */
        output;                         /* one row per extracted word */
    end;
run;

/* Word frequency per id.
   ODS OUTPUT captures the LIST-format table produced by PROC FREQ into the
   dataset MYLIST, keeping only id, new_var and Frequency.
   ORDER=FREQ orders the levels by descending frequency.
   Only LIST, NOPERCENT and NOCUM are requested: NOCOL, SCORES= and ALPHA=
   have no effect on a LIST table when no statistics are requested.
   No ODS destination is closed here, so destinations already open in the
   session (HTML, LISTING, ...) are left untouched. */
ods output list=mylist (keep=id new_var frequency);
proc freq data=want order=freq;
    tables id * new_var / list nopercent nocum;
run;

Answer 2

数组用于读取多个列，因此这里没有任何特殊用途。这听起来有点像家庭作业问题，你应该展示一些你做过的尝试。但是，这不是一个容易解决的问题，因此我将发布一个解决方案。

我对如何解决这个问题的想法是：

1. 按 item_id 排序。
2. 对于每个 item_id，逐个扫描描述中的单词，并检查该单词是否已存在于该 item_id 的唯一单词列表中。如果已存在，则跳到下一个单词；否则将该单词加入唯一列表，并将计数器加 1。
3. 处理到当前 item_id 的最后一行时，输出唯一单词列表及其计数。

我希望下面代码中的注释已经足够清楚；如果还不够，请在线查阅相应的函数或语句。

/* Demo data: item_id plus a free-text description of up to 30 characters;
   the same item_id appears on several rows. */
data have;
    infile datalines;
    input item_id item_desc $30.;
    datalines;
1   this is one
1   this is two
2   how many words are here
2   not many
3   random selection
;
run;

/* Order the rows by item_id in place (needed for any later BY item_id step). */
proc sort data=have out=have;
    by item_id;
run;

/* Collect the distinct words used in the descriptions of each item_id.
   Input : HAVE, sorted by item_id, with a text column item_desc.
   Output: one row per item_id with
             unique_words - distinct words in order of first appearance,
                            joined by '|'
             unique_count - number of distinct words found */
data want;
    set have;
    by item_id;
    length unique_words $200;           /* maximum length of the word list */
    retain unique_words unique_count;   /* carry list and count across the rows of one item_id */
    drop item_desc i word;              /* keep only item_id, unique_words, unique_count */

    if first.item_id then do;           /* new item_id: start with an empty list */
        call missing(unique_words);
        unique_count = 0;
    end;

    do i = 1 to countw(item_desc);      /* visit every word of the description */
        word = scan(item_desc, i);      /* extract once instead of re-scanning per use */
        /* With '|' given as the delimiter, blanks are not word boundaries for
           INDEXW, so the padding of the fixed-length variables is stripped
           from both arguments before the lookup. */
        if indexw(strip(unique_words), strip(word), '|') = 0 then do;
            call catx('|', unique_words, word);  /* append the new word to the list */
            unique_count + 1;                    /* one more distinct word */
        end;
    end;

    if last.item_id then output;        /* emit one summary row per item_id */
run;

如何显示列表中单个单词的总数

2 个答案: