使用For循环生成熊猫数据帧

时间:2020-10-20 21:57:04

标签: python python-3.x pandas dataframe for-loop

我正在尝试生成500个DataFrame,每个学生一个。每个DataFrame都是他们的出勤记录。

(奖金)如何将每个DataFrame的名称设置为attendance_i。我是学生的身份证。因此,学生1的数据框将是出勤_1,学生2的数据框将是出勤_2,依此类推。

这是代码:

import pandas as pd
import numpy as np
from numpy import random
from datetime import datetime
from datetime import timedelta
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR, SA, SU
import calendar

# DEFINE GLOBAL VARIABLES
student_count = 500

# DECLARE START AND END DATES
start = '2013-03-01'
end = '2015-07-31'
# GET NUMBER OF WORKDAYS
weekmask = 'Wed Thu Fri Sat Sun'
customworkweek = pd.offsets.CustomBusinessDay(weekmask=weekmask)
workdates = pd.bdate_range(start = start, end = end, freq = customworkweek).to_pydatetime().tolist()
# GET NUMBER OF WEEKS AND MONTHS 
x = pd.to_datetime(end) - pd.to_datetime(start)
workweeks_int = int(x / np.timedelta64(1, 'W'))
workmonths_int = int(x / np.timedelta64(1, 'M'))
# MORE GLOBAL VARAIBLES
wed = pd.offsets.CustomBusinessDay(weekmask='Wed')
thu = pd.offsets.CustomBusinessDay(weekmask='Thu')
fri = pd.offsets.CustomBusinessDay(weekmask='Fri')
sat = pd.offsets.CustomBusinessDay(weekmask='Sat')
sun = pd.offsets.CustomBusinessDay(weekmask='Sun')

wed_classes = pd.bdate_range(start = start, end = end, freq = wed).to_pydatetime().tolist()
thu_classes = pd.bdate_range(start = start, end = end, freq = thu).to_pydatetime().tolist()
fri_classes = pd.bdate_range(start = start, end = end, freq = fri).to_pydatetime().tolist()
sat_classes = pd.bdate_range(start = start, end = end, freq = sat).to_pydatetime().tolist()
sun_classes = pd.bdate_range(start = start, end = end, freq = sun).to_pydatetime().tolist()

# SET UP STUDENT DATAFRAME
student_age = random.randint(4, 13, size = (student_count))
student_sex = random.choice(["male", "female"], size = (student_count))
student_id = list(range(0,student_count))
student_name_local = pd.read_csv('Database\student_name_local.csv')
student_name_english = pd.read_csv('Database\student_name_english.csv')
class_id = random.randint(1, 27, size = (student_count))

column_names = ['STUDENT_ID', 'STUDENT_NAME_LOCAL', 'STUDENT_NAME_ENGLISH', 'STUDENT_AGE', 'STUDENT_SEX']
students = pd.DataFrame(columns = column_names)

students['STUDENT_ID'] = student_id
students['STUDENT_NAME_LOCAL'] = student_name_local
students['STUDENT_NAME_ENGLISH'] = student_name_english
students['STUDENT_AGE'] = student_age
students['STUDENT_SEX'] = student_sex
students['CLASS_ID'] = class_id

# SET UP CLASSES DATAFRAME
data = [['0', 'Wednesday', '16:30:00'], ['1', 'Wednesday', '17:30:00'], ['2', 'Wednesday', '18:30:00'], ['3', 'Thursday', '16:30:00'], ['4', 'Thursday', '17:30:00'], ['5', 'Thursday', '18:30:00'], ['6', 'Friday', '16:30:00'], ['7', 'Friday', '17:30:00'], ['8', 'Friday', '18:30:00'], ['9', 'Saturday', '8:30:00'], ['10', 'Saturday', '9:30:00'], ['11', 'Saturday', '10:30:00'], ['12', 'Saturday', '11:30:00'], ['13', 'Saturday', '13:30:00'], ['14', 'Saturday', '14:30:00'], ['15', 'Saturday', '15:30:00'], ['16', 'Saturday', '16:30:00'], ['17', 'Saturday', '17:30:00'], ['18', 'Sunday', '8:30:00'], ['19', 'Sunday', '9:30:00'], ['20', 'Sunday', '10:30:00'], ['21', 'Sunday', '11:30:00'], ['22', 'Sunday', '13:30:00'], ['23', 'Sunday', '14:30:00'], ['24', 'Sunday', '15:30:00'], ['25', 'Sunday', '16:30:00'], ['26', 'Sunday', '17:30:00']]

column_names = ['CLASS_ID', 'CLASS_DAY', 'CLASS_TIME']
classes = pd.DataFrame(data = data, columns = column_names)
classes['CLASS_ID'] = classes['CLASS_ID'].astype(int)

# FOR LOOP VARIABLES
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)

# THE FOR LOOP
for i in student_id:
    ids = list([i])*workweeks_int
    column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
    i = pd.DataFrame(columns = column_names)

    i['STUDENT_ID'] = ids
    i['ATTENDANCE_STATUS'] = attendance_status

    x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
    x = x[-1]
    c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
    c = c[-1]

    if c == 'Wednesday':
        i['CLASS_DATE'] = wed_classes
    elif c == 'Thursday':
        i['CLASS_DATE'] = thu_classes
    elif c == 'Friday':
        i['CLASS_DATE'] = fri_classes
    elif c == 'Saturday':
        i['CLASS_DATE'] = sat_classes
    elif c == 'Sunday':
        i['CLASS_DATE'] = sun_classes

这是在for循环中发生的错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-c04451cf4f9b> in <module>
     69     i['ATTENDANCE_STATUS'] = attendance_status
     70 
---> 71     x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
     72     x = x[-1]
     73     c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1766 
   1767             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768             return self._getitem_axis(maybe_callable, axis=axis)
   1769 
   1770     def _is_scalar_access(self, key: Tuple):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1950 
   1951                 if hasattr(key, "ndim") and key.ndim > 1:
-> 1952                     raise ValueError("Cannot index with multidimensional key")
   1953 
   1954                 return self._getitem_iterable(key, axis=axis)

ValueError: Cannot index with multidimensional key

有趣的是,以下代码在for循环之外起作用:

attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
id = 326
i = list([id])*workweeks_int

column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
att1 = pd.DataFrame(columns = column_names)

att1['STUDENT_ID'] = i
att1['ATTENDANCE_STATUS'] = attendance_status

x = students['CLASS_ID'].loc[students['STUDENT_ID'] == id].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]

if c == 'Wednesday':
    att1['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
    att1['CLASS_DATE'] = thu_classes
elif c == 'Friday':
    att1['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
    att1['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
    att1['CLASS_DATE'] = sun_classes

att1

0 个答案:

没有答案