I wrote the following code to crawl web pages and then store them in a Solr index.
import asyncio
import json

import nltk
import pysolr
import requests
from bs4 import BeautifulSoup, Comment
from nltk.corpus import stopwords
from nltk.tokenize.moses import MosesDetokenizer
from scrapy import Request, Spider
from urllib.parse import urljoin, urlparse

# TutsplusItem is the scrapy.Item defined elsewhere in the project

crawledLinks = []
solr = pysolr.Solr('some url', timeout=10)


class MySpider(Spider):
    name = "tutsplus"
    start_urls = ["some url"]
    allowed_domains = ["some domain"]
    custom_settings = {
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
        'DEPTH_LIMIT': 100,
        'LOG_ENABLED': True,
    }

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        current_url = response.url
        # Fire-and-forget indexing; this is the line that raises
        # NameError: name 'add_to_index' is not defined
        asyncio.ensure_future(add_to_index(response.body, current_url))
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            internal_link = urljoin(current_url, link)
            result = urlparse(internal_link)
            if result.scheme and result.netloc and result.path and internal_link not in crawledLinks:
                crawledLinks.append(internal_link)
                yield Request(internal_link, self.parse)
        item = TutsplusItem()
        item["url"] = current_url
        yield item

    async def add_to_index(body, current_url):
        # Extract the text that is visible on the rendered page
        soup = BeautifulSoup(body)
        texts = soup.find_all(text=True)
        visible_texts = []
        for text in texts:
            if text.parent.name not in ['style', 'script', 'meta', '[document]'] and not isinstance(text, Comment):
                visible_texts.append(text)
        fetched_text = u" ".join(t.strip() for t in visible_texts)
        # Tokenize and drop English stop words
        words = nltk.word_tokenize(fetched_text)
        stop = set(stopwords.words('english'))
        stopwordsfree_words = [word for word in words if word not in stop]
        detokenizer = MosesDetokenizer()
        doc = detokenizer.detokenize(stopwordsfree_words, return_str=True)
        doc = doc.encode('utf-8')
        url = "some url"
        try:
            res = requests.post(url, data=doc)
        except Exception as e:
            print(e)
        if not doc:
            doc = soup.title.string
        if res.status_code == 200:
            words = json.loads(res.text)
            doc = detokenizer.detokenize(words, return_str=True)
        # Store the processed document in the Solr index
        solr.add([{"doc": doc, "url": str(current_url)}])
I want to call the add_to_index() function in a "fire and forget" fashion. But the problem I am facing is that I get the error

name 'add_to_index' is not defined

in the parse method, so the function is not being recognized. I am new to Python. Could you help me solve this?

Thanks,
Nilesh
Answer 0 (score: 0)
Have you tried calling add_to_index like this:

self.add_to_index(response.body, current_url)
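
That is the right direction: in the posted code, add_to_index is indented inside the MySpider class, so it is a method of the spider, and the bare name add_to_index is not defined in the scope of parse. A minimal sketch of the fix, assuming you keep add_to_index as a method (the added self parameter and the self. prefix are the two changes; the spider name and URL are placeholders):

import asyncio

from scrapy import Spider


class MySpider(Spider):
    name = "tutsplus"
    start_urls = ["some url"]

    def parse(self, response):
        # Call the coroutine through self, because add_to_index is
        # defined on the class rather than at module level
        asyncio.ensure_future(self.add_to_index(response.body, response.url))

    async def add_to_index(self, body, current_url):
        # As a method, add_to_index receives the spider instance as
        # its first argument, so 'self' is added to the signature too
        print("indexing", current_url)

Alternatively, dedent add_to_index to module level; the name is looked up when parse runs, so the original bare call would then resolve. One caveat for the fire-and-forget part: asyncio.ensure_future only schedules the coroutine on an asyncio event loop, and Scrapy runs on Twisted, so the task will never actually execute unless the asyncio-backed reactor is enabled (on Scrapy 2.0+ this means setting TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' in the project settings).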