如何使用Scrapy处理多对多(ManyToMany)字段

时间:2016-09-10 05:11:52

标签: python django scrapy manytomanyfield

我想将Scrapy与Django一起使用。

我的目标是将Movies模型的actors字段与Actors模型的names字段关联起来,但我不知道如何处理Django的多对多字段(ManyToManyField)。我的数据库是MySQL(我没有使用DjangoItem)。

models.py

class Movies(models.Model):
    """Movie record; every scraped attribute is stored as a text column."""
    # Identifier of the movie; also used as the string representation below.
    content_ID = models.CharField(max_length=30)
    # Stored as free text, not as a date column.
    release_date = models.CharField(max_length=30)
    running_time = models.CharField(max_length=10)
    # Plain text column: this is NOT a relation to the Actors model.
    actors = models.CharField(max_length=300)
    series = models.CharField(max_length=30)
    director = models.CharField(max_length=30)
    label = models.CharField(max_length=30)
    # NOTE(review): image_urls / images / image_paths are presumably filled
    # by a Scrapy images pipeline -- confirm against the spider settings.
    image_urls = models.CharField(max_length=200, null=True)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the movie's content_ID."""
        return self.content_ID

class Actors(models.Model):
    """Actor record linked to movies through a many-to-many relation."""

    names = models.CharField(max_length=100, null=True)
    # NOTE(review): EnMovielist is not defined in the visible code;
    # presumably the relation should target the movie model -- confirm.
    movielist = models.ManyToManyField(EnMovielist)
    image_urls = models.CharField(max_length=200)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the actor's name, or an empty string when it is NULL."""
        # The column is declared as ``names``; reading ``self.name`` raised
        # AttributeError. ``names`` is nullable and __str__ must return a
        # str, so fall back to "" instead of returning None.
        return self.names or ""

1 个答案:

答案 0 :(得分:1)

我使用 https://github.com/DevProfi/scrapy-djangoitem ,并在Scrapy中通过管道(pipeline)来处理:

class ItemPersistencePipeline(object):
    """Scrapy item pipeline that persists each item through its Django model."""

    def process_item(self, item, spider, partial=True):
        """Create or update the model row behind *item*, then pass the item on.

        Args:
            item: Scraped item; must expose ``get()`` and ``_model_fields_m2m``.
            spider: Running spider; ``spider.unique_fields`` is used to look up
                an existing row.
            partial: Forwarded to ``update_model``; when true only non-empty,
                changed values overwrite the stored ones.

        Returns:
            The unchanged *item*, so later pipelines keep receiving it.
        """
        import logging

        try:
            item_model = item_to_model(item)
        except TypeError:
            # Presumably raised for items that are not backed by a model;
            # such items are passed through untouched.
            return item

        model, created = get_or_create(item_model, spider.unique_fields)

        if created:
            # The row is new: attach its many-to-many values, if any.
            for field_name in sorted(item._model_fields_m2m):
                related = item.get(field_name)
                if related:
                    getattr(model, field_name).set(related)
            # TODO add bulk insert model fields
            return item

        # The row already existed, so refresh it from the scraped data.
        try:
            update_model(
                destination=model,
                source=item_model,
                item=item,
                fields=spider.unique_fields,
                partial=partial,
            )
        except Exception:
            # A pipeline has to hand an item (not an exception object) to the
            # next stage; record the failure and keep the item flowing.
            logging.getLogger(__name__).exception(
                "Failed to update %r from scraped item", model
            )
        return item


def update_model(destination, source, item, fields, partial, commit=False):
    """Copy field values from *source* onto *destination* and save if needed.

    Args:
        destination: Stored model instance to update.
        source: Model instance built from the scraped item.
        item: Scraped item; supplies the many-to-many values through
            ``item.get()`` and ``item._model_fields_m2m``.
        fields: Accepted for interface compatibility; not used here.
        partial: When true, a value is copied only if it is truthy and
            differs from the stored one. When false, every value is copied.
        commit: Initial "needs saving" flag; it is set to true as soon as a
            value is copied, so unchanged objects are not written back.

    Returns:
        The updated *destination* instance.
    """
    original_pk = destination.pk
    m2m_names = [f.name for f in source._meta.many_to_many]
    # Many-to-many fields are handled separately after the save below.
    source_fields = fields_for_model(source, exclude=m2m_names)

    for key in source_fields:
        val_old = getattr(destination, key)
        try:
            val_new = getattr(source, key)
        except ObjectDoesNotExist:
            continue
        if partial:
            if val_new and val_new != val_old:
                setattr(destination, key, val_new)
                commit = True
        else:
            setattr(destination, key, val_new)
            commit = True

    # Copying fields must never repoint the instance at another row (or turn
    # the update into an insert), so restore the primary key if it changed.
    if destination.pk != original_pk:
        destination.pk = original_pk

    if commit:
        destination.save()

    for name in sorted(item._model_fields_m2m):
        val_new = item.get(name)
        if not val_new:
            continue
        manager = getattr(destination, name)
        # The item may carry one related object or a collection of them.
        if isinstance(val_new, (str, bytes)) or not hasattr(val_new, "__iter__"):
            candidates = [val_new]
        else:
            candidates = list(val_new)
        existing_pks = {obj.pk for obj in manager.all()}
        # Entries may be model instances or raw primary keys.
        missing = [c for c in candidates if getattr(c, "pk", c) not in existing_pks]
        if missing:
            manager.add(*missing)

    return destination