如何使用 pandas 的 Series / DataFrame 从类字典(dict-like)对象中提取数据

时间:2018-10-19 17:02:13

标签: python pandas dictionary dataframe

这是我正在做的学校作业……

因此,基本上,我被要求扫描给定目录并找到其中的所有 .py 文件,并对给定属性进行计数:每个文件中定义的类和函数(包括类中的方法)的数量,以及每个文件的行数和字符数。最后在终端上以表格形式打印所有数据。

要打印表格,我的讲师建议使用一个名为 prettytable 的程序包,尽管对我而言这并不漂亮。

我想使用 pandas
原因很简单:每个文件都要计数其4个属性->在这里自然会调用一个嵌套字典。而且pandas.DataFrame非常适合记录嵌套字典。

扫描和汇总是简单的部分,实际上让我陷入困境的是如何使数据容器灵活且可扩展。

内置字典无法在创建时自动带上这 4 个预设的键值对,因此我构建了一个类 CounterAttr(MutableMapping),并使用另一个类 FileCounter 为每个文件创建并计算每个属性。

但是,pandas.DataFrame只识别此dict类对象的第一层。而且我已经阅读了DataFrame和Series的源文件,但仍然无法找出解决方法。

所以我的问题是,
如何使pandas.DataFrame / Series从其值是类似dict的对象的字典中提取数据?

P.S。我愿意接受以下代码,编码风格,实现方式以及所有内容的所有建议。非常感谢!

from collections.abc import MutableMapping
from collections import defaultdict
import pandas as pd
import os

class CounterAttr(MutableMapping):
    """Dict-like container for the four statistics counted per file.

    A new instance starts with these keys, every one set to 0:

    - 'class'
    - 'function'
    - 'line'
    - 'char'

    The counts are stored directly in the instance ``__dict__``, so
    ``vars(counter)`` exposes the same data as the mapping interface.
    """

    def __init__(self):
        """Create the four counters, each starting at 0."""
        # key: counted attribute | value: running count
        self.__dict__ = {'class': 0, 'function': 0, 'line': 0, 'char': 0}

    def __getitem__(self, key):
        """Return the count stored under *key*.

        Raises:
            KeyError: if *key* is not present (the exception carries the key).
        """
        return self.__dict__[key]

    def get(self, key, default=None, defaut=None):
        """Return the count stored under *key*, or a fallback if it is missing.

        Args:
            key: name of the counter to look up.
            default: value returned when *key* is missing; same name as in
                the standard mapping protocol.
            defaut: historical misspelling of *default*, still accepted so
                existing keyword callers keep working. It is only consulted
                when *default* is None.
        """
        fallback = defaut if default is None else default
        return self.__dict__.get(key, fallback)

    def __setitem__(self, key, value):
        """Store *value* under *key*, adding the key if it is new."""
        self.__dict__[key] = value

    def __delitem__(self, key):
        """Remove *key*; raises KeyError if it is not present."""
        del self.__dict__[key]

    def __len__(self):
        """Return the number of counters currently stored."""
        return len(self.__dict__)

    def __iter__(self):
        """Iterate over the counter names."""
        return iter(self.__dict__)

    def get_all(self):
        """Return a shallow copy of the counters.

        Changing the returned dict does not affect this object.
        """
        return self.__dict__.copy()

    def to_dict(self):
        """Return the internal dict itself (not a copy).

        Changes made to the returned dict are visible through this object;
        use ``get_all()`` when an independent snapshot is needed.
        """
        return self.__dict__

    def __repr__(self):
        # Show the counts so the object is readable when printed or
        # displayed inside another container.
        return '{0}({1!r})'.format(type(self).__name__, self.__dict__)

class FileCounter(MutableMapping):
    """Mapping from a file name to the counter object kept for that file.

    Behaviour that differs from a plain dict:

    - Looking up a name that has no entry yet creates a new ``CounterAttr``
      for it, stores it and returns it.
    - Assigning to a name that already has an entry is refused: the stored
      value is kept and "Attribute exist!" is printed.
    """

    # Marks "no value supplied" so that an explicitly passed value,
    # including None, is always stored as given.
    _UNSET = object()

    def __init__(self):
        # key: file name | value: counter object for that file
        self._storage = dict()

    def __setitem__(self, key, value=_UNSET):
        """Store *value* under *key* unless *key* already has an entry.

        When called directly without *value*, a new ``CounterAttr`` is
        created for this call instead of sharing one default instance.
        """
        if key in self._storage:
            print("Attribute exist!")
            return
        if value is self._UNSET:
            value = CounterAttr()
        self._storage[key] = value

    def __getitem__(self, key):
        """Return the counter for *key*, creating it on first access."""
        try:
            return self._storage[key]
        except KeyError:
            created = self._storage[key] = CounterAttr()
            return created

    def __contains__(self, key):
        # The inherited membership test goes through __getitem__, which
        # would create the entry and always report True; check the
        # storage directly instead.
        return key in self._storage

    def __delitem__(self, key):
        """Remove the entry for *key*; raises KeyError if it is missing."""
        del self._storage[key]

    def __len__(self):
        """Return the number of files that have an entry."""
        return len(self._storage)

    def __iter__(self):
        """Iterate over the stored file names."""
        return iter(self._storage)






def scan_summerize_pyfile(directory, give_me_dict = False):
    """Count classes, functions, lines and characters of each .py file in *directory*.

    Only files directly inside *directory* are examined. As a side effect the
    process working directory is changed to *directory* and announced on
    stdout.

    Args:
        directory: path of the directory to scan.
        give_me_dict: accepted for backward compatibility; the function does
            not use it.

    Returns:
        A ``FileCounter`` with one entry per .py file found. It is empty when
        *directory* is not an existing directory (a message is printed in
        that case).
    """
    file_counter = FileCounter()

    if not os.path.isdir(directory):
        print("The directory", directory, "is not existed.\nI'm sorry, program ends.")
        return file_counter

    # Resolve the path before changing directory: a relative path would
    # otherwise be interpreted relative to the new working directory by
    # the listdir() call below.
    directory = os.path.abspath(directory)

    os.chdir(directory)
    print("\nThe current working directory is {}\n".format(os.getcwd()))

    for a_file in os.listdir(directory):
        if not a_file.endswith(".py"):
            continue

        # The first lookup creates the entry for this file (so it is listed
        # even if it cannot be opened); the second retrieves the stored
        # counter object.
        file_counter[a_file]
        counts = file_counter[a_file]

        try:
            open_file = open(os.path.join(directory, a_file), 'r')
        except OSError:
            # Covers missing files as well as unreadable ones or
            # directories whose name ends in ".py".
            print("File {0} can't be opened!".format(a_file))
            continue

        with open_file:
            for line in open_file:
                stripped = line.lstrip()

                # Require the trailing space so names that merely begin with
                # the keyword ("classes = []", "default = 1") are not counted.
                if stripped.startswith("class "):
                    counts['class'] += 1

                if stripped.startswith("def "):
                    counts['function'] += 1

                counts['line'] += 1

                # len(line) includes whitespace and the line terminator.
                counts['char'] += len(line)

    return file_counter

# Haven't had the pandas codes part yet

2 个答案:

答案 0 :(得分:0)

我不明白你为什么需要写成这样……在我看来这有些过度设计了。

假设read_file()返回您想要的class, function, line, chars的4个属性,并且在list_of_files中有一个python文件列表,您可以这样做:

# Gather one row of counts per file, then build the table from all rows.
result = []
for path in list_of_files:
    n_class, n_func, n_line, n_char = read_file(path)
    result.append({
        'class': n_class,
        'function': n_func,
        'line': n_line,
        'chars': n_char,
    })
your_table = pd.DataFrame(result)

这就是您所需要的。

您应该生成文件列表和函数以分别读取它们,每一个不同的事物都应包含在它自己的函数中-绝对有助于分离逻辑。

答案 1 :(得分:0)

这是我对这个问题的解决方案。我没有继续纠结于 pandas 的内部实现,而是尝试调整自己的方案,使 pandas 能够轻松读取我的数据。感谢 @RockyLi 的建议

class FileCounter(object):
    """Collect simple statistics for the .py files of one directory.

    On construction the directory is scanned (not recursively) and, for every
    file whose name ends in '.py', the number of class definitions, function
    definitions, lines and characters is recorded.

    Public interface:
    - ``pretty_print()`` prints the statistics as a table.
    - ``get_data()`` returns a copy of the statistics.
    """

    def __init__(self, directory):
        """Scan *directory* immediately.

        Raises:
            OSError: if *directory* is not an existing directory.
        """
        self._directory = directory
        self._data = dict()        # key: file name | value: dict of counted attributes
        self._update_data()

    def _read_file(self, filename):
        """Return the statistics of one file.

        *filename* is looked up inside the scanned directory, so the result
        does not depend on the process working directory.

        Returns:
            dict with the keys 'class', 'function', 'line' and 'char'. All
            values are 0 when the file cannot be opened (a message is
            printed in that case).
        """
        counts = {'class': 0, 'function': 0, 'line': 0, 'char': 0}

        try:
            open_file = open(os.path.join(self._directory, filename), 'r')
        except OSError:
            # Covers missing files as well as unreadable ones.
            print("File {0} can't be opened!".format(filename))
            return counts

        with open_file:
            for line in open_file:
                stripped = line.lstrip()

                if stripped.startswith("class "):
                    counts['class'] += 1

                if stripped.startswith("def "):
                    counts['function'] += 1

                counts['line'] += 1

                # len(line) includes whitespace and the line terminator.
                counts['char'] += len(line)

        return counts

    def _scan_dir(self):
        """Return the names of all entries in the directory.

        Raises:
            OSError: if the directory is not an existing directory.
        """
        if not os.path.isdir(self._directory):
            raise OSError("The directory doesn't exist!")

        # No chdir here: files are opened by their joined path instead, so
        # the process working directory is left untouched and relative
        # directories work.
        return os.listdir(self._directory)

    def _find_py(self, lst_of_file):
        """Return the names from *lst_of_file* that end in '.py'."""
        return [filename for filename in lst_of_file if filename.endswith('.py')]

    def _update_data(self):
        """Fill ``_data`` with the statistics of every .py file found.

        This is the only method that modifies ``_data``.
        """
        for filename in self._find_py(self._scan_dir()):
            self._data[filename] = self._read_file(filename)

    def pretty_print(self):
        """Print the statistics as a table, one row per file."""
        df_prettyprint = pd.DataFrame.from_dict(self._data, orient = 'index')

        if not df_prettyprint.empty:
            print(df_prettyprint)
        else:
            print("Oops, seems like you don't get any .py file.\n You must be Java people :p")

    def get_data(self):
        """Return a copy of the statistics.

        The per-file dicts are copied as well, so changes made to the
        returned structure never reach the internal data.
        """
        return {filename: dict(counts) for filename, counts in self._data.items()}

此类提供两个接口:A. 打印表格;B. 获取数据以备将来使用,同时保护数据不被直接访问和修改。