Django - 获取/保存大型对象需要花费大量时间

时间:2014-07-29 16:06:57

标签: python django

我正在尝试从模型中获取几百万个项目并解析它们。但是,不知何故,它花费了大量时间来保存数据。

这些是我现有的模型:

class mapt(models.Model):
    s = models.IntegerField(primary_key=True)
    name = models.CharField(max_length=2000)

    def __unicode__(self):
        return str(self.s)

class datt(models.Model):
    s = models.IntegerField(primary_key=True)
    setid = models.IntegerField()
    var = models.IntegerField()
    val = models.IntegerField()

    def __unicode(self):
        return str(self.s)

class sett(models.Model):
    setid = models.IntegerField(primary_key=True)
    block = models.IntegerField()
    username = models.IntegerField()
    ts = models.IntegerField()

    def __unicode__(self):
        return str(self.setid)

class data_parsed(models.Model):
    setid = models.IntegerField(max_length=2000, primary_key=True)
    block = models.CharField(max_length=2000)
    username = models.CharField(max_length=2000)
    data = models.CharField(max_length=200000)
    time = models.IntegerField()

    def __unicode__(self):
        return str(self.setid)

datt模型的s参数实际上应该充当mapt参数的外键。此外,沉积的setid字段应该作为setid's setid的外键。

最后,data_parsed的setid是解决模型的外键。

该算法目前以这种方式编写:

def database_rebuild(start_data_parsed):
    listSetID = []
    #Part 1
    for items in sett.objects.filter(setid__gte=start_data_parsed):
        listSetID.append(items.setid)
    uniqueSetID = listSetID 

    #Part 2
    for items in uniqueSetID:
        try:
            SetID = items
            settObject = sett.objects.get(setid=SetID)

            UserName = mapt.objects.get(pk=settObject.username).name
            TS = pk=settObject.ts
            BlockName = mapt.objects.get(pk=settObject.block).name

            DataPairs_1 = []
            DataPairs_2 = []
            DataPairs_1_Data = []
            DataPairs_2_Data = []

            for data in datt.objects.filter(setid__exact=SetID):
                DataPairs_1.append(data.var)
                DataPairs_2.append(data.val)

            for Data in DataPairs_1:
                DataPairs_1_Data.append(mapt.objects.get(pk=Data).name)

            for Data in DataPairs_2:
                DataPairs_2_Data.append(mapt.objects.get(pk=Data).name)

            assert (len(DataPairs_1) == len(DataPairs_2)), "Length not equal"

            #Part 3
            Serialize = []
            for idx, val in enumerate(DataPairs_1_Data):
                Serialize.append(str(DataPairs_1_Data[idx]) + ":PARSEABLE:" + str(DataPairs_2_Data[idx]) + ":PARSEABLENEXT:")


            Serialize_Text = ""
            for Data in Serialize:
                Serialize_Text += Data


            Data = Serialize_Text
            p = data_parsed(SetID,  BlockName, UserName, Data, TS)
            p.save()
        except AssertionError, e:
            print "Error:" + str(e.args)
            print "Possibly DataPairs does not have equal length"
        except Exception as e:
            print "Error:" + str(sys.exc_info()[0])
            print "Stack:" + str(e.args)

基本上,它的作用是:

  1. 查找大于数字的所有设置对象

  2. 获取UserName,TS和BlockName,然后获取datt字段中与var和val字段对应的所有字段映射到mapt的'字段。 Var和Val基本上是NAME_OF_FIELD:VALUE类型的关系。

  3. 序列化所有var和val参数,以便我可以从var和val中获取所有参数,这些参数分布在data_parsed的行中的mapt表中。

  4. 当前的解决方案可以完成我想要的一切,但是,在英特尔酷睿i5-4300U CPU @ 1.90Ghz上,它每天在芹菜周期工作者上解析大约15000行数据。我的设置表中有大约3355566行数据,解析它们大约需要23天。

    有没有办法加快这个过程?

    ============================更新================== ==========

    新模特:

    class mapt(models.Model):
        s = models.IntegerField(primary_key=True)
        name = models.CharField(max_length=2000)
    
        def __unicode__(self):
            return str(self.s)
    
    class sett(models.Model):
        setid = models.IntegerField(primary_key=True)
        block = models.ForeignKey(mapt, related_name='sett_block')
        username = models.ForeignKey(mapt, related_name='sett_username')
        ts = models.IntegerField()
    
        def __unicode__(self):
            return str(self.setid)
    
    # class sett(models.Model):
        # setid = models.IntegerField(primary_key=True)
        # block = models.IntegerField()
        # username = models.IntegerField()
        # ts = models.IntegerField()
    
        # def __unicode__(self):
            # return str(self.setid)
    
    class datt(models.Model):
        s = models.IntegerField(primary_key=True)
        setid = models.ForeignKey(sett, related_name='datt_setid')
        var = models.ForeignKey(mapt, related_name='datt_var')
        val = models.ForeignKey(mapt, related_name='datt_val')
    
        def __unicode(self):
            return str(self.s)
    
    # class datt(models.Model):
        # s = models.IntegerField(primary_key=True)
        # setid = models.IntegerField()
        # var = models.IntegerField()
        # val = models.IntegerField()
    
        # def __unicode(self):
            # return str(self.s)
    
    class data_parsed(models.Model):
        setid = models.ForeignKey(sett, related_name='data_parsed_setid', primary_key=True)
        block = models.CharField(max_length=2000)
        username = models.CharField(max_length=2000)
        data = models.CharField(max_length=2000000)
        time = models.IntegerField()
    
        def __unicode__(self):
            return str(self.setid)
    

    新解析:

    def database_rebuild(start_data_parsed, end_data_parsed):
        for items in sett.objects.filter(setid__gte=start_data_parsed, setid__lte=end_data_parsed):
            try:
                UserName = mapt.objects.get(pk=items.username_id).name
                TS = pk=items.ts
                BlockName = mapt.objects.get(pk=items.block_id).name
    
                DataPairs_1 = []
                DataPairs_2 = []
                DataPairs_1_Data = []
                DataPairs_2_Data = []
    
                for data in datt.objects.filter(setid_id__exact=items.setid):
                    DataPairs_1.append(data.var_id)
                    DataPairs_2.append(data.val_id)
    
                for Data in DataPairs_1:
                    DataPairs_1_Data.append(mapt.objects.get(pk=Data).name)
    
                for Data in DataPairs_2:
                    DataPairs_2_Data.append(mapt.objects.get(pk=Data).name)
    
                assert (len(DataPairs_1) == len(DataPairs_2)), "Length not equal"
    
                Serialize = []
                for idx, val in enumerate(DataPairs_1_Data):
                    Serialize.append(str(DataPairs_1_Data[idx]) + ":PARSEABLE:" + str(DataPairs_2_Data[idx]))
    
                Data = ":PARSEABLENEXT:".join(Serialize)
                p = data_parsed(items.setid, BlockName, UserName, Data, TS)
                p.save()
            except AssertionError, e:
                print "Error:" + str(e.args)
                print "Possibly DataPairs does not have equal length"
            except Exception as e:
                print "Error:" + str(sys.exc_info()[0])
                print "Stack:" + str(e.args)
    

1 个答案:

答案 0 :(得分:1)

通过追加附加来定义列表非常慢。使用列表推导甚至只是list()构造函数。

在python中,你不应该使用for循环和+ =加入字符串列表,你应该使用join()

但这不是这里的主要瓶颈。你有很多objects.get()个,每个都需要数据库往返。如果你在mapt表中没有数百万行,你应该只是创建一个字典映射mapt主键到mapt对象。

如果您将外键定义为外键,那么django orm可以帮助您完成大部分查询。也就是说,代替SomeModel.objects.get(id=some_instance.some_fk_id),您可以some_instance.some_fk(只有在第一次为每个实例执行数据时才会触及数据库)。如果some_instance已初始化为some_instance = SomeOtherModel.objects.select_related('some_fk').get(id=id_of_some_instance),您甚至可以删除外键查询。

也许在不更改数据库的情况下更改模型也可以。