我正在尝试生成500个DataFrame,每个学生一个。每个DataFrame都是他们的出勤记录。
(奖金)如何将每个DataFrame的名称设置为attendance_i
。我是学生的身份证。因此,学生1的数据框将是出勤_1,学生2的数据框将是出勤_2,依此类推。
这是代码:
import pandas as pd
import numpy as np
from numpy import random
from datetime import datetime
from datetime import timedelta
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR, SA, SU
import calendar
# DEFINE GLOBAL VARIABLES
student_count = 500
# DECLARE START AND END DATES
start = '2013-03-01'
end = '2015-07-31'
# GET NUMBER OF WORKDAYS
weekmask = 'Wed Thu Fri Sat Sun'
customworkweek = pd.offsets.CustomBusinessDay(weekmask=weekmask)
workdates = pd.bdate_range(start = start, end = end, freq = customworkweek).to_pydatetime().tolist()
# GET NUMBER OF WEEKS AND MONTHS
x = pd.to_datetime(end) - pd.to_datetime(start)
workweeks_int = int(x / np.timedelta64(1, 'W'))
workmonths_int = int(x / np.timedelta64(1, 'M'))
# MORE GLOBAL VARAIBLES
wed = pd.offsets.CustomBusinessDay(weekmask='Wed')
thu = pd.offsets.CustomBusinessDay(weekmask='Thu')
fri = pd.offsets.CustomBusinessDay(weekmask='Fri')
sat = pd.offsets.CustomBusinessDay(weekmask='Sat')
sun = pd.offsets.CustomBusinessDay(weekmask='Sun')
wed_classes = pd.bdate_range(start = start, end = end, freq = wed).to_pydatetime().tolist()
thu_classes = pd.bdate_range(start = start, end = end, freq = thu).to_pydatetime().tolist()
fri_classes = pd.bdate_range(start = start, end = end, freq = fri).to_pydatetime().tolist()
sat_classes = pd.bdate_range(start = start, end = end, freq = sat).to_pydatetime().tolist()
sun_classes = pd.bdate_range(start = start, end = end, freq = sun).to_pydatetime().tolist()
# SET UP STUDENT DATAFRAME
student_age = random.randint(4, 13, size = (student_count))
student_sex = random.choice(["male", "female"], size = (student_count))
student_id = list(range(0,student_count))
student_name_local = pd.read_csv('Database\student_name_local.csv')
student_name_english = pd.read_csv('Database\student_name_english.csv')
class_id = random.randint(1, 27, size = (student_count))
column_names = ['STUDENT_ID', 'STUDENT_NAME_LOCAL', 'STUDENT_NAME_ENGLISH', 'STUDENT_AGE', 'STUDENT_SEX']
students = pd.DataFrame(columns = column_names)
students['STUDENT_ID'] = student_id
students['STUDENT_NAME_LOCAL'] = student_name_local
students['STUDENT_NAME_ENGLISH'] = student_name_english
students['STUDENT_AGE'] = student_age
students['STUDENT_SEX'] = student_sex
students['CLASS_ID'] = class_id
# SET UP CLASSES DATAFRAME
data = [['0', 'Wednesday', '16:30:00'], ['1', 'Wednesday', '17:30:00'], ['2', 'Wednesday', '18:30:00'], ['3', 'Thursday', '16:30:00'], ['4', 'Thursday', '17:30:00'], ['5', 'Thursday', '18:30:00'], ['6', 'Friday', '16:30:00'], ['7', 'Friday', '17:30:00'], ['8', 'Friday', '18:30:00'], ['9', 'Saturday', '8:30:00'], ['10', 'Saturday', '9:30:00'], ['11', 'Saturday', '10:30:00'], ['12', 'Saturday', '11:30:00'], ['13', 'Saturday', '13:30:00'], ['14', 'Saturday', '14:30:00'], ['15', 'Saturday', '15:30:00'], ['16', 'Saturday', '16:30:00'], ['17', 'Saturday', '17:30:00'], ['18', 'Sunday', '8:30:00'], ['19', 'Sunday', '9:30:00'], ['20', 'Sunday', '10:30:00'], ['21', 'Sunday', '11:30:00'], ['22', 'Sunday', '13:30:00'], ['23', 'Sunday', '14:30:00'], ['24', 'Sunday', '15:30:00'], ['25', 'Sunday', '16:30:00'], ['26', 'Sunday', '17:30:00']]
column_names = ['CLASS_ID', 'CLASS_DAY', 'CLASS_TIME']
classes = pd.DataFrame(data = data, columns = column_names)
classes['CLASS_ID'] = classes['CLASS_ID'].astype(int)
# FOR LOOP VARIABLES
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
# THE FOR LOOP
for i in student_id:
ids = list([i])*workweeks_int
column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
i = pd.DataFrame(columns = column_names)
i['STUDENT_ID'] = ids
i['ATTENDANCE_STATUS'] = attendance_status
x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]
if c == 'Wednesday':
i['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
i['CLASS_DATE'] = thu_classes
elif c == 'Friday':
i['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
i['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
i['CLASS_DATE'] = sun_classes
这是在for循环中发生的错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-c04451cf4f9b> in <module>
69 i['ATTENDANCE_STATUS'] = attendance_status
70
---> 71 x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
72 x = x[-1]
73 c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1766
1767 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768 return self._getitem_axis(maybe_callable, axis=axis)
1769
1770 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1950
1951 if hasattr(key, "ndim") and key.ndim > 1:
-> 1952 raise ValueError("Cannot index with multidimensional key")
1953
1954 return self._getitem_iterable(key, axis=axis)
ValueError: Cannot index with multidimensional key
有趣的是,以下代码在for循环之外起作用:
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
id = 326
i = list([id])*workweeks_int
column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
att1 = pd.DataFrame(columns = column_names)
att1['STUDENT_ID'] = i
att1['ATTENDANCE_STATUS'] = attendance_status
x = students['CLASS_ID'].loc[students['STUDENT_ID'] == id].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]
if c == 'Wednesday':
att1['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
att1['CLASS_DATE'] = thu_classes
elif c == 'Friday':
att1['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
att1['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
att1['CLASS_DATE'] = sun_classes
att1