在一个包含 10000 个 item_id 的数据集中,每条记录都带有一段项目描述,并且同一个 item_id 可能重复出现。如何使用 SAS(不使用数组)计算某个 item_id 的项目描述列中每个单词出现的频率?
目标是确定特定item_id的关键字。
答案 0 :(得分:2)
以下方法利用Proc Freq获取'关键字'分布。
/* Build the sample input: one row per description line, keyed by id. */
data have;
    /* TRUNCOVER: a line shorter than the informat width yields a short value
       instead of making INPUT move on to the next record. */
    infile cards truncover;
    /* id is read with list input; var then takes up to 100 characters of the
       rest of the line. The informat is written in its canonical form
       ($100. with no embedded space). */
    input id var $100.;
    cards;
1 This is test test
2 failed
1 be test
2 failed is
3 success
3 success ok
;
/* Split every description into its individual words: one output row per
   word, carrying the original id and var alongside the word in new_var. */
data want;
    set have;
    n_words = countw(var);              /* number of words in this description */
    do word_pos = 1 to n_words;
        new_var = scan(var, word_pos);  /* word_pos-th word, default delimiters */
        output;
    end;
    drop n_words word_pos;              /* loop bookkeeping is not part of the result */
run;
/* Word frequencies by id.
   ODS OUTPUT captures the LIST-format table produced by PROC FREQ into the
   data set MYLIST, keeping only the id, the word and its frequency. */
ods output list=mylist(keep=id new_var frequency);
proc freq data=want order=freq;
    /* LIST prints one line per id/word combination; NOPERCENT and NOCUM
       suppress the percentage and cumulative columns of the printed table.
       Options that have no effect on a LIST-format frequency table
       (NOCOL, SCORES=, ALPHA=) are omitted. */
    tables id * new_var / list nopercent nocum;
run;
/* Close every open ODS destination, then reopen the LISTING destination. */
ods _all_ close;
ods listing;
答案 1 :(得分:0)
数组用于读取多个列,因此这里没有任何特殊用途。这听起来有点像家庭作业问题,你应该展示一些你做过的尝试。但是,这不是一个容易解决的问题,因此我将发布一个解决方案。
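我解决这个问题的思路是:
1. 按 item_id 对数据集排序;
2. 对每个 item_id,逐个扫描描述中的单词,如果该单词尚未出现在唯一单词列表中,就把它加入列表并将计数加一;
3. 在每个 item_id 的最后一行输出唯一单词列表及其数量。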
希望下面代码中的注释已经足够清楚;如果仍有不明白的地方,请在线查阅相应的函数或语句。
/* Sample data: an item id followed by a free-text description. */
data have;
    infile cards;
    input item_id @;          /* read the id and hold the current record */
    input item_desc $30.;     /* next 30 characters of the same record */
    cards;
1 this is one
1 this is two
2 how many words are here
2 not many
3 random selection
;
run;
/* Order HAVE by item_id, writing the sorted rows back over the same data set. */
proc sort data=have out=have;
    by item_id;
run;
/* Collect the distinct words used across all descriptions of each item_id.
   Input : HAVE, sorted by item_id (required for BY-group processing).
   Output: one row per item_id with
             unique_words - '|'-separated list of distinct words, in
                            first-seen order
             unique_count - number of words in that list */
data want;
    set have;
    by item_id;
    length unique_words $200            /* accumulated list of distinct words */
           word         $200;           /* current word, working variable */
    retain unique_words unique_count;   /* carry values across rows of a BY group */
    if first.item_id then do;           /* start a fresh list for each item_id */
        call missing(unique_words);
        unique_count = 0;
    end;
    do i = 1 to countw(item_desc);      /* visit every word of the description */
        word = scan(item_desc, i);      /* extract once, reuse below */
        /* Trim both arguments: with '|' as the only delimiter, the padding
           blanks after the last stored word are not a word boundary, so an
           untrimmed search can miss the most recently added word and add it
           a second time. */
        if indexw(trimn(unique_words), trimn(word), '|') = 0 then do;
            call catx('|', unique_words, word);   /* append, separated by | */
            unique_count + 1;                     /* count the new distinct word */
        end;
    end;
    drop item_desc i word;              /* keep only id, list and count */
    if last.item_id then output;        /* one row per item_id */
run;