下面是来自 8 个不同数据集的列名列表。有些列是多个数据集共有的,例如学校名称出现在数据集 1、4、6、7 和 8 中,但各数据集中的写法并不相同。由于 R 区分大小写,这使我很难从这些数据集中提取公共列。在这种情况下应该怎么办?我曾想到把所有列名都转换为大写,但这样做并没有奏效。
library(xlsx)
# Read every CSV file found in the working directory and inspect its columns.
# NOTE(review): `pattern` is presumably treated as a regular expression rather
# than a shell glob -- confirm against the list.files() documentation.
file_names = list.files(pattern = "*.csv")
# One data frame per file, in the same order as `file_names`.
files = lapply(file_names, read.csv )
# Print the first rows of each data frame for a quick visual check.
lapply(files,head)
#Reduce(intersect, lapply(files,names))
# `p` is a list holding one character vector of column names per data frame.
p = lapply(files,names)
# NOTE(review): toupper() is called on the whole list instead of on each
# element, and its result is never assigned, so `p` is left unchanged. This is
# likely why the upper-casing attempt "did not work"; applying it per element
# (e.g. via lapply) and storing the result would be the usual approach.
toupper(p)
# Recomputes the same list of column names as above.
p = lapply(files,names)
> p
[[1]]
[1] "Demographic"
[2] "DBN"
[3] "School.Name"
[4] "Cohort"
[5] "Total.Cohort"
[6] "Total.Grads...n"
[7] "Total.Grads.....of.cohort"
[8] "Total.Regents...n"
[9] "Total.Regents.....of.cohort"
[10] "Total.Regents.....of.grads"
[11] "Advanced.Regents...n"
[12] "Advanced.Regents.....of.cohort"
[13] "Advanced.Regents.....of.grads"
[14] "Regents.w.o.Advanced...n"
[15] "Regents.w.o.Advanced.....of.cohort"
[16] "Regents.w.o.Advanced.....of.grads"
[17] "Local...n"
[18] "Local.....of.cohort"
[19] "Local.....of.grads"
[20] "Still.Enrolled...n"
[21] "Still.Enrolled.....of.cohort"
[22] "Dropped.Out...n"
[23] "Dropped.Out.....of.cohort"
[[2]]
[1] "DBN" "Grade" "Year"
[4] "Category" "Number.Tested" "Mean.Scale.Score"
[7] "Level.1.." "Level.1...1" "Level.2.."
[10] "Level.2...1" "Level.3.." "Level.3...1"
[13] "Level.4.." "Level.4...1" "Level.3.4.."
[16] "Level.3.4...1"
[[3]]
[1] "DBN" "Name" "schoolyear"
[4] "fl_percent" "frl_percent" "total_enrollment"
[7] "prek" "k" "grade1"
[10] "grade2" "grade3" "grade4"
[13] "grade5" "grade6" "grade7"
[16] "grade8" "grade9" "grade10"
[19] "grade11" "grade12" "ell_num"
[22] "ell_percent" "sped_num" "sped_percent"
[25] "ctt_num" "selfcontained_num" "asian_num"
[28] "asian_per" "black_num" "black_per"
[31] "hispanic_num" "hispanic_per" "white_num"
[34] "white_per" "male_num" "male_per"
[37] "female_num" "female_per"
[[4]]
[1] "CSD"
[2] "BOROUGH"
[3] "SCHOOL.CODE"
[4] "SCHOOL.NAME"
[5] "GRADE"
[6] "PROGRAM.TYPE"
[7] "CORE.SUBJECT..MS.CORE.and.9.12.ONLY."
[8] "CORE.COURSE..MS.CORE.and.9.12.ONLY."
[9] "SERVICE.CATEGORY.K.9..ONLY."
[10] "NUMBER.OF.STUDENTS...SEATS.FILLED"
[11] "NUMBER.OF.SECTIONS"
[12] "AVERAGE.CLASS.SIZE"
[13] "SIZE.OF.SMALLEST.CLASS"
[14] "SIZE.OF.LARGEST.CLASS"
[15] "DATA.SOURCE"
[16] "SCHOOLWIDE.PUPIL.TEACHER.RATIO"
[[5]]
[1] "District" "YTD...Attendance..Avg."
[3] "YTD.Enrollment.Avg."
[[6]]
[1] "DBN"
[2] "SchoolName"
[3] "AP.Test.Takers"
[4] "Total.Exams.Taken"
[5] "Number.of.Exams.with.scores.3.4.or.5"
[[7]]
[1] "DBN"
[2] "SCHOOL.NAME"
[3] "Num.of.SAT.Test.Takers"
[4] "SAT.Critical.Reading.Avg..Score"
[5] "SAT.Math.Avg..Score"
[6] "SAT.Writing.Avg..Score"
[[8]]
[1] "dbn"
[2] "school_name"
[3] "borough"
[4] "building_code"
[5] "phone_number"
[6] "fax_number"
[7] "grade_span_min"
[8] "grade_span_max"
[9] "expgrade_span_min"
[10] "expgrade_span_max"
[11] "bus"
[12] "subway"
[13] "primary_address_line_1"
[14] "city"
[15] "state_code"
[16] "postcode"
[17] "website"
[18] "total_students"
[19] "campus_name"
[20] "school_type"
[21] "overview_paragraph"
[22] "program_highlights"
[23] "language_classes"
[24] "advancedplacement_courses"
[25] "online_ap_courses"
[26] "online_language_courses"
[27] "extracurricular_activities"
[28] "psal_sports_boys"
[29] "psal_sports_girls"
[30] "psal_sports_coed"
[31] "school_sports"
[32] "partner_cbo"
[33] "partner_hospital"
[34] "partner_highered"
[35] "partner_cultural"
[36] "partner_nonprofit"
[37] "partner_corporate"
[38] "partner_financial"
[39] "partner_other"
[40] "addtl_info1"
[41] "addtl_info2"
[42] "start_time"
[43] "end_time"
[44] "se_services"
[45] "ell_programs"
[46] "school_accessibility_description"
[47] "number_programs"
[48] "priority01"
[49] "priority02"
[50] "priority03"
[51] "priority04"
[52] "priority05"
[53] "priority06"
[54] "priority07"
[55] "priority08"
[56] "priority09"
[57] "priority10"
[58] "Location.1"
[59] "Community.Board"
[60] "Council.District"
[61] "Census.Tract"
[62] "BIN"
[63] "BBL"
[64] "NTA"
答案 0 :(得分:2)
如果要把 `list` 中每个元素的 `names` 改为大写,请对各元素的 `names` 使用 `toupper`,并用 `setNames` 在转换的同时重新设置名称,然后提取公共列:
# Upper-case the column names of every data frame so that comparing names
# across datasets is case-insensitive. Only case is normalised: spellings that
# differ in punctuation (e.g. "SCHOOL.NAME" vs "SCHOOLNAME") stay distinct.
files1 <- lapply(files, function(x) setNames(x, toupper(names(x))))
# Column names that are present in every data frame.
nm1 <- Reduce(intersect, lapply(files1, names))
# Keep only the shared columns of each data frame. `[` selects a set of
# columns by name and returns a data frame, whereas `[[` extracts a single
# element and cannot select several columns at once.
lst2 <- lapply(files1, `[`, nm1)
在 `tidyverse` 中,这可以通过以下方式完成: