Scrapy with multiple search terms

Date: 2014-01-05 20:19:59

Tags: python python-3.x web-scraping scrapy web-crawler

I'm new to Python and I've been learning how to scrape web pages (for about a day). The task I'm trying to accomplish is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started with Scrapy and have managed to get the workflow working for a single company (not elegant, but at least I'm trying) - but I can't figure out how to load the list of companies and loop through it to run multiple searches. I have a feeling this is a fairly simple thing to do.

So my main question is - should I define an array of company queries to loop over inside the spider class? I don't know the exact URLs, since each company has a unique ID and belongs to a specific market, so I can't enter them as start_urls.
Is Scrapy the right tool for this kind of task, or should I use mechanize instead?

Here is my current code.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

# Search on the website, currently I have just put in a static search term here, but I would like to loop over a list of companies.

    def parse(self, response):
        return FormRequest.from_response(response, formdata={'q': 'rebtel'}, callback=self.search_result)

# I fetch the url from the search result and convert it to the correct financials URL where the information is located.

    def search_result(self,response):
        sel = HtmlXPathSelector(response)
        link = sel.xpath('//ul[@class="company-list two-columns"]/li/a/@href').extract()
        finance_url=str(link[0]).replace("/foretag","http://www.proff.se/nyckeltal")
        return Request(finance_url,callback=self.parse_finance)

# I scrape the information for this particular company; this is hardcoded and will not
# work for other responses. I had some issues with character encoding initially
# since the pages are in Swedish. I also tried to target the JSON element directly with
# revenue = sel.xpath('//*[@id="accountTable1"]/tbody/tr[3]/@data-chart').extract()
# but was not able to parse it (error - expected string or buffer). I tried to convert it
# to a string with str() with no luck; something is off with the formatting, which is messing up the data types.

    def parse_finance(self, response):
        sel = HtmlXPathSelector(response)
        datachart = sel.xpath('//tr/@data-chart').extract()
        employees=json.loads(datachart[36])
        revenue = json.loads(datachart[0])
        items = []
        item = DmozItem()
        item['company']=response.url.split("/")[-5]
        item['market']=response.url.split("/")[-3]
        item['employees']=employees
        item['revenue']=revenue
        items.append(item)
        return item

2 Answers:

Answer 0 (score: 2):

The common approach is to do this with command-line arguments. Give the spider's __init__ method an argument:

class ProffSpider(BaseSpider):
    name = "proff"
    ...

    def __init__(self, query=None, *args, **kwargs):
        super(ProffSpider, self).__init__(*args, **kwargs)
        self.query = query

    def parse(self, response):
        return FormRequest.from_response(response,
            formdata={'q': self.query},
            callback=self.search_result
        )

    ...

Then start your spider (with Scrapyd, perhaps):

$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
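
A variant on the same idea (just my sketch, not part of the original answer): accept a comma-separated list of queries in a single -a argument and issue one search per query, reusing the question's search_result/parse_finance callbacks. The spider name and the example queries below are placeholders:

class ProffQueriesSpider(BaseSpider):
    name = "proff_queries"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

    def __init__(self, queries="", *args, **kwargs):
        # e.g. scrapy crawl proff_queries -a queries="rebtel,some other company"
        super(ProffQueriesSpider, self).__init__(*args, **kwargs)
        self.queries = [q.strip() for q in queries.split(",") if q.strip()]

    def parse(self, response):
        # Submit the proff.se search form once per query.
        for query in self.queries:
            yield FormRequest.from_response(response,
                formdata={'q': query},
                callback=self.search_result
            )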

If you want to run a whole bunch of spiders in one go by passing the arguments in from a file, you can create a new command that runs multiple instances of a spider. It simply mixes the built-in crawl command with the example code for running multiple spiders with a single crawler:

your_project/settings.py

COMMANDS_MODULE = 'your_project_module.commands'

your_project/commands/__init__.py

# empty file

your_project/commands/crawl_many.py

import os
import csv

from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Run many instances of a spider'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)

        parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
        parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
        parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

        if not opts.output:
            return

        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')

        feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()

        if not opts.output_format:
            opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')

        if opts.output_format not in valid_output_formats:
            raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))

        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')

    def run(self, args, opts):
        if args:
            raise UsageError()

        with open(opts.input_file, 'r') as handle:
            for spider_options in csv.DictReader(handle):
                spider = spider_options.pop('spider')
                self.crawler_process.crawl(spider, **spider_options)

        self.crawler_process.start()

You can run it like this:

$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl

The format of the crawl options CSV is simple:

spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
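
One detail worth spelling out (my note, not part of the original answer): every CSV column other than spider is passed to the spider's __init__ as a keyword argument, so query, arg2 and arg3 above are just illustrative names. For the ProffSpider shown earlier, which only uses query, a minimal file could be:

spider,query
proff,query1
proff,query2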

Answer 1 (score: 0):

The first thing I would do is create the list of companies and find a way to get each company's URL. That kind of crawling is easy. I wrote a crawler to extract disease information from Wikipedia, starting from a list of diseases. See how it fits your use case.

import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool
#nltk.download()

def crawlwiki(keywords):
    print (keywords)
    columns = ['Category', 'Text']
    page=1
    print ('Fetching for {}....'.format(keywords))
    url = 'https://en.wikipedia.org/wiki/'
    for i in range(len(keywords)):
        url = url + keywords[i]
        url = url + '%20'

    url = url[0:(len(url)-3)]   
    output_obj = {}
    #curr_page = url+str(page)
    while True:
        try:
            page_source = requests.get(url)
            break
        except requests.exceptions.RequestException:
            # What you should do if the internet connection fails
            break

    plain_text = page_source.text
    bs_obj = BeautifulSoup(plain_text, "lxml")
    '''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
    base_url = 'http://www.webmd.com'
    for div in toc_links:
        links = div.findAll('a')
        for a in links:
            output_obj[a.text] = base_url + a.get('href')
            print (base_url + a.get('href'))
    data = bs_obj.findAll('div', {'class':'search-text-container'})
    for div in data:
        links = div.findAll('a')
        for a in links:
            output_obj[a.text] = a.get('href')
            print (a.get('href'))'''


    """
        Mapping:
        1 : Signs and symptoms
        2 : Diagnosis
        3 : Prognosis
        4 : Treatment

    """

    symptom_text = re.findall ( '<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(symptom_text)
    symptoms_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    symptom_data = symptoms_object.findAll('p')
    symptom_paragraphs = ""
    for p in symptom_data:
        symptom_paragraphs += p.text

    symptom_paragraphs = re.sub(r"/?\[\d+]", '', symptom_paragraphs, flags=re.DOTALL)
    df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)

    diagnosis_text = re.findall ( '<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(diagnosis_text)
    diagnosis_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    diagnosis_data = diagnosis_object.findAll('p')
    diagnosis_paragraphs = ""
    for p in diagnosis_data:
        diagnosis_paragraphs += p.text

    diagnosis_paragraphs = re.sub(r"/?\[\d+]", '', diagnosis_paragraphs, flags=re.DOTALL)
    df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)

    prognosis_text = re.findall ( '<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(prognosis_text)
    prognosis_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    prognosis_data = prognosis_object.findAll('p')
    prognosis_paragraphs = ""
    for p in prognosis_data:
        prognosis_paragraphs += p.text

    prognosis_paragraphs = re.sub(r"/?\[\d+]", '', prognosis_paragraphs, flags=re.DOTALL)
    df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)

    treatment_text = re.findall ( '<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(treatment_text)
    treatment_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    treatment_data = treatment_object.findAll('p')
    treatment_paragraphs = ""
    for p in treatment_data:
        treatment_paragraphs += p.text

    treatment_paragraphs = re.sub(r"/?\[\d+]", '', treatment_paragraphs, flags=re.DOTALL)
    df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)

    df = pd.DataFrame(columns = columns)

    df = df.append(df_1.append(df_2.append(df_3.append(df_4))))

    print('Fetch completed....')
    return df



def main():

    disease_df = pd.read_csv("disease.txt", sep="\n", header=None)

    columns = ['Category', 'Text']
    df_data = pd.DataFrame(columns=columns)
    size = disease_df.size
    print("Initializing....")
    p = Pool(5)
    df_data = pd.concat(p.map(crawlwiki, disease_df.values.tolist()))
    """for index, row in disease_df.iterrows():
        print('Iteration {0} out of {1}.....'.format(index+1, size))
        df = crawlwiki(row, columns)
        df_data = df_data.append(df)"""

    df_data.to_csv("TagDataset.csv", index=False)




if __name__ == '__main__':
    main()
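
For the first step this answer recommends (turning the company list into per-company URLs before crawling), a rough sketch might look like the code below. resolve_search_link is a hypothetical callable standing in for whatever lookup works on proff.se (for example, submitting the search form the way the question's spider does), and the "/foretag" to "http://www.proff.se/nyckeltal" rewrite is taken from the question's search_result method:

def load_companies(path="companies.txt"):
    # One company name per non-empty line; companies.txt is a placeholder filename.
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


def build_url_map(companies, resolve_search_link):
    # Map company name -> financials URL, skipping names with no search hit.
    urls = {}
    for name in companies:
        link = resolve_search_link(name)  # hypothetical: returns the "/foretag/..." href, or None
        if link:
            urls[name] = link.replace("/foretag", "http://www.proff.se/nyckeltal")
    return urls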