我必须按以下格式写出数据:support:category_1; category_2; category_3; ...,其中每一行对应一组类别。support 是频率(支持度),category 是类别名称,例如 Armenian(亚美尼亚)。
## Read the category file; each row is expected to hold one ";"-separated
## set of categories.
## NOTE(review): read.csv() treats the first line as a header by default and
## splits on commas; if the file has no header row or categories may contain
## commas, header = FALSE or readLines() may be needed -- confirm against the
## file.
dat <- read.csv("https://d3c33hcgiwev3.cloudfront.net/_9b0d0ff87935997de01d221fd74bae90_categories.txt?Expires=1486252800&Signature=P~Q4jU6ufMqC12Usn0n6feJPZdDMERgOVD5WNflxwnbAxREulmhdpo2~YGO7yK5STLrp1KuQeq-06q4IVsCIz3jTxe-u-kL6sh-ZcgYWFC~hUh3zzjL0x6fEKJ5rtYyR8ztzYz9utGAkvhBrNmhIYWB7r36PNvkfl8lV36qfr50_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A")
## Print one row as a quick sanity check
dat[10, ]
## Split every row on ";". The whole first column is used rather than a
## hard-coded 1:79137 row range, so the script follows the actual file size.
out <- strsplit(as.character(dat[[1]]), ";")
## Flatten the per-row pieces into one vector of category occurrences
result <- unlist(out)
length(result)
## Frequency (support) of each category
table(result)
interm_result <- as.data.frame(table(result))
## Sort rows by frequency, then by category. The trailing comma is required:
## without it the order() vector is used to select columns, which fails with
## "undefined columns selected".
order_result <- interm_result[order(interm_result$Freq, interm_result$result), ]
`order_result` looks like the output below:
result Freq
39 Armenian 1
42 Art Restoration 1
45 Art Tours 1
49 Assisted Living Facilities 1
64 Backshop 1
90 Beer Hall 1
96 Bike Sharing 1
153 Carpenters 1
158 Castles 1
168 Childbirth Education 1
169 Childproofing 1
175 Choirs 1
186 College Counseling 1
195 Community Gardens 1
198 Concept Shops 1
233 Debt Relief Services 1
244 Dialysis Clinics 1
答案 0 :(得分:0)
## Load the category file from its remote URL into a data frame
dat <- read.csv(
  file = "https://d3c33hcgiwev3.cloudfront.net/_9b0d0ff87935997de01d221fd74bae90_categories.txt?Expires=1486252800&Signature=P~Q4jU6ufMqC12Usn0n6feJPZdDMERgOVD5WNflxwnbAxREulmhdpo2~YGO7yK5STLrp1KuQeq-06q4IVsCIz3jTxe-u-kL6sh-ZcgYWFC~hUh3zzjL0x6fEKJ5rtYyR8ztzYz9utGAkvhBrNmhIYWB7r36PNvkfl8lV36qfr50_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A"
)
## Display the tenth row to check what was read
dat[10, ]
Manipulate the data: split each line into categories and count them
## Split each line into its individual categories on the ";" separator.
## The whole first column of dat is used rather than a hard-coded 1:79137
## row range, so the script follows the actual number of rows read.
out <- strsplit(as.character(dat[[1]]), ";")
## A line can hold several categories; flatten so that every category
## occurrence becomes one element
result <- unlist(out)
## Total number of category occurrences
length(result)
## Frequency (support) of each category
table(result)
interm_result <- as.data.frame(table(result))
## Build one "support:category" line per category. The values are pasted
## together because data.frame(..., sep = ":") does not join anything: it
## just adds a literal column named `sep`.
final <- paste0(interm_result$Freq, ":", interm_result$result)
## Write the lines verbatim; writeLines() emits no header, row names or
## quotes, so the file contains exactly the "support:category" lines.
writeLines(final, "DataMining.txt")
Part 2: organize the data as support:category_1;category_2;…
## Build the grouped output "support:category_1;category_2;..." from the
## per-category frequency table `interm_result` (columns `result`, `Freq`).

## Working copy with descriptive column names. Categories are stored as
## plain strings so they can be pasted together; Freq stays numeric so the
## groups sort numerically (as character, "10" would sort before "2").
d_result <- data.frame(
  Freq = interm_result$Freq,
  Category = as.character(interm_result$result),
  stringsAsFactors = FALSE
)
## Categories arranged by frequency: one list element per distinct frequency
new_arr <- split(d_result$Category, d_result$Freq, drop = FALSE)
## Number of categories that share the same frequency
t_result <- table(d_result$Freq)
## Collapse all categories with the same frequency into one ";"-separated
## string. The formula refers to the columns of d_result by their own names.
Result <- aggregate(
  Category ~ Freq,
  data = d_result,
  FUN = function(x) paste(x, collapse = ";")
)
## Final lines, one per frequency, in the form "support:cat_1;cat_2;...".
## paste0() joins the values; data.frame(..., sep = ":") would only add a
## literal column named `sep`.
Data_result <- paste0(Result$Freq, ":", Result$Category)