Unable to access new method in derived class

Time: 2018-03-27 15:34:36

Tags: python scrapy web-crawler

I have written the following code to crawl web pages and store them in a Solr index.

import asyncio
import json

import nltk
import pysolr
import requests
from bs4 import BeautifulSoup, Comment
from nltk.corpus import stopwords
from nltk.tokenize.moses import MosesDetokenizer  # moved to the sacremoses package in later NLTK releases
from scrapy import Request, Spider
from urllib.parse import urljoin, urlparse

# TutsplusItem comes from the project's items module (import path omitted in the original)

crawledLinks = []
solr = pysolr.Solr('some url', timeout=10)

class MySpider(Spider):
    name = "tutsplus"
    start_urls = ["some url"]
    allowed_domains = ["some domain"]

    custom_settings = {
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
        'DEPTH_LIMIT': 100,
        'LOG_ENABLED': True,
    }

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        current_url = response.url

        # Fire-and-forget: schedule indexing without blocking the crawl
        asyncio.ensure_future(add_to_index(response.body, current_url))

        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            internal_link = urljoin(current_url, link)
            result = urlparse(internal_link)

            if result.scheme and result.netloc and result.path and internal_link not in crawledLinks:
                crawledLinks.append(internal_link)
                yield Request(internal_link, self.parse)

        item = TutsplusItem()
        item["url"] = current_url

        yield item

    async def add_to_index(body, current_url):
        soup = BeautifulSoup(body)
        texts = soup.find_all(text=True)

        # Keep only the text nodes that are actually rendered on the page
        visible_texts = []
        for text in texts:
            if text.parent.name not in ['style', 'script', 'meta', '[document]'] and not isinstance(text, Comment):
                visible_texts.append(text)

        # Tokenize the visible text and drop English stop words
        fetched_text = u" ".join(t.strip() for t in visible_texts)
        words = nltk.word_tokenize(fetched_text)
        stop = set(stopwords.words('english'))
        stopwordsfree_words = [word for word in words if word not in stop]

        detokenizer = MosesDetokenizer()
        doc = detokenizer.detokenize(stopwordsfree_words, return_str=True)
        doc = doc.encode('utf-8')

        url = "some url"

        res = None
        try:
            res = requests.post(url, data=doc)
        except Exception as e:
            print(e)

        if not doc:
            doc = soup.title.string

        if res is not None and res.status_code == 200:
            words = json.loads(res.text)
            doc = detokenizer.detokenize(words, return_str=True)
            solr.add([{"doc": doc, "url": str(current_url)}])

I want to call the function add_to_index() in a "fire and forget" fashion. The problem I am facing is that I get the error

name 'add_to_index' is not defined

in the parse method, so the function is not being recognized. I am new to Python. Can you help me solve this?
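For reference, here is a minimal reproduction of the same error with a hypothetical Demo class. A name defined in a class body becomes an attribute of the class, not a module-level name, so a bare call inside another method cannot find it:

class Demo:
    def helper(self):
        return "indexed"

    def run(self):
        return helper()  # NameError: name 'helper' is not defined


Demo().run()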

Thanks,

Nilesh.

1 Answer:

Answer 0 (score: 0)

Have you tried calling add_to_index like this: self.add_to_index(response.body, current_url)?
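A minimal sketch of that fix, with the method bodies trimmed (note that stock Scrapy runs on Twisted, so asyncio.ensure_future also needs an asyncio event loop to be set up separately for the coroutine to actually run):

import asyncio

from scrapy import Spider


class MySpider(Spider):
    name = "tutsplus"
    start_urls = ["some url"]

    def parse(self, response):
        # Look the coroutine up on the instance so Python can resolve it,
        # then schedule it fire-and-forget style.
        asyncio.ensure_future(self.add_to_index(response.body, response.url))

    async def add_to_index(self, body, current_url):
        # 'self' must be added to the signature as well; otherwise the
        # bound call above passes one positional argument too many.
        ...

Note that the original definition omits self, so besides changing the call site, the signature needs a self parameter (or a @staticmethod decorator) for self.add_to_index(...) to work.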