Scrapy not running after integrating discord.py

Asked: 2018-04-05 13:40:02

Tags: python scrapy discord

I made a script that scrapes several places for free PC games. I got it working and wanted to integrate it into my Discord channel. My problem is that after I added the Discord commands it no longer scrapes; it's as if Discord blocks the rest of the script from running. What I'd like to do is separate the two and call the scrape function whenever I type #update (or whatever) in Discord. What is the best way to do that? To run the scraper on its own I just use "scrapy crawl botname". Can I use os.system("scrapy crawl botname"), or is there a better way? I'll update this with my code samples and fix the formatting when I get to my desktop. Thanks for any advice!
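For example, what I had in mind for the separate bot script is roughly this (just a sketch, not tested -- the spider name redditbot and the project path are placeholders, and asyncio's subprocess support is used instead of os.system so the crawl doesn't block the bot's event loop):

import asyncio
from discord.ext import commands

bot = commands.Bot(command_prefix='#')

@bot.command(pass_context=True)
async def update(ctx):
    # launch "scrapy crawl redditbot" as a child process so the event loop keeps running
    proc = await asyncio.create_subprocess_exec(
        'scrapy', 'crawl', 'redditbot',
        cwd='/path/to/redditfreegames')  # placeholder: the Scrapy project folder
    await proc.wait()
    await bot.say('Scrape finished, CSV updated.')

bot.run('TOKEN')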

Here is my pipelines.py:

# -*- coding: utf-8 -*-

import os
import pymongo
from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem
import discord
from discord.ext import commands
from discord.ext.commands import Bot
import asyncio
import chalk
import csv
from more_itertools import unique_everseen
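# de-duplicate the scraped CSV into redditu.csv; note this runs at import time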
with open("reddit.csv", 'r') as f, open("redditu.csv", 'w') as out_file:
    out_file.writelines(unique_everseen(f))


class DuplicatesPipeline(object):

    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.titles_seen.add(item['title'])
            return item


class MongoDBPipeline(object):

    def __init__(self):
        uri = "HIDDEN"
        client = pymongo.MongoClient(uri)
        db = client["freegames"]
        self.collection = db['post']

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
bot = commands.Bot(command_prefix='#')

@bot.event
async def on_ready():
    print("Scrappy Bot loaded!")

@bot.command(pass_context=True)
async def free(ctx):
    os.system('python redditbot.py')

    f = open('redditu.csv')
    csv_f = csv.reader(f)

    for row in csv_f:
        #print(row)
        await bot.say('Free Games: {}'.format(row))

@bot.command(pass_context=True)
async def info(ctx, user: discord.Member):
    await bot.say("The users name is: {}".format(user.name))
    await bot.say("The users ID is: {}".format(user.id))
    await bot.say("The users status is: {}".format(user.status))
    await bot.say("The users highest role is: {}".format(user.top_role))
    await bot.say("The user joined at: {}".format(user.joined_at))

bot.run("HIDDEN")

settings.py

BOT_NAME = 'redditfreegames'

SPIDER_MODULES = ['redditfreegames.spiders']
NEWSPIDER_MODULE = 'redditfreegames.spiders'

#Export as CSV Feed
FEED_FORMAT = "csv"
FEED_URI = "reddit.csv"

#DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    'redditfreegames.pipelines.DuplicatesPipeline': 300,
    'redditfreegames.pipelines.MongoDBPipeline': 800,
}

And my actual scraping script:

import scrapy


class RedditbotSpider(scrapy.Spider):
    name = 'redditbot'
    allowed_domains = ['www.reddit.com']
    start_urls = ['https://www.reddit.com/r/FreeGamesOnSteam/']

    def parse(self, response):
        #Extracting the content using css selectors
        titles = response.css('.title.may-blank::text').extract()
        links = response.css(".title::attr(href)").extract()

        #Give the extracted content row wise
        for item in zip(titles,links):
            #create a dictionary to store the scraped info
            scraped_info = {
                'title' : item[0],
                'links' : item[1]

            }

            #yield or give the scraped info to scrapy
            yield scraped_info

There is a lot of code in here that isn't even used yet, such as saving to MongoDB, since right now I'm only saving to CSV and reading back from it. I'll go through and clean it up.
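The other option I've seen is skipping the shell command entirely and running the spider from a small Python script using Scrapy's own API, which the bot could then call instead of "scrapy crawl". Again just a sketch -- it assumes the script lives inside the project folder so get_project_settings() picks up settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('redditbot')   # spider name from the spider's "name" attribute
process.start()              # blocks until the crawl and the CSV feed export finish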

0 Answers:

There are no answers yet.