Ruby抓取多个页面

时间:2019-01-05 05:15:28

标签: ruby

我想请教如何从多个页面抓取数据。

下面的代码以 list_1 为基础；我想把 list_2 抓取到的电子邮件附加到对应的公司名称之后。

另外，list_2 中的 URL 以 “r + 9 位数字” 结尾（例如 https://job.rikunabi.com/2019/company/r294900083/ ），这一部分我不知道该怎么写。

list_1

# Fetches one search-result page and appends every company name found on it.
#
# NOTE: relies on open-uri (for URI.open) and Nokogiri already being loaded by
# the surrounding script.
#
# @param uri [String] URL of the search-result page to download
# @param companies [Array<String>] accumulator; mutated in place
# @return [Array<String>] the same +companies+ array, with this page's names appended
def get_data(uri, companies)
  # URI.open instead of Kernel#open: Kernel#open treats strings starting with
  # "|" as shell commands and no longer opens URLs on Ruby 3.0+. The block form
  # also closes the response stream once it has been read.
  html = URI.open(uri, &:read)
  documents = Nokogiri::HTML(html)

  title_links = documents.xpath("//a[@class='ts-h-search-cassetteTitleMain js-h-search-cassetteTitleMain']")
  companies.concat(title_links.map { |link| link.text.strip })
end

# Scrapes company names from the Rikunabi 2019 search results and appends them
# to a dated CSV file in the current directory.
#
# Reads the last page number to fetch from standard input; page 1 is always
# fetched, pages 2..page use the paginated URL.
def main
  base_uri = "https://job.rikunabi.com/2019/s/__13_0_______/"

  puts "What is the maximum page? "
  page = gets.to_i

  companies = []

  # get_data appends into +companies+ in place, so its return value is not needed.
  get_data(base_uri, companies)
  (2..page).each do |idx|
    get_data("#{base_uri}?moduleCd=2&isc=ps054&pn=#{idx}", companies)
  end

  headers = ["company_name"]
  path = "rikunabi_tokyo_2019-#{Time.now.strftime('%Y-%m-%d')}.csv"

  # The file is opened in append mode, so only emit the header row when the file
  # is new or empty; otherwise every re-run on the same day would add another
  # header line in the middle of the data.
  needs_header = !File.exist?(path) || File.zero?(path)
  CSV.open(path, "a", headers: headers, write_headers: needs_header) do |csv|
    companies.each { |name| csv << [name] }
  end
end

# Run the scraper only when this file is executed directly, not when it is
# loaded from another script.
if $PROGRAM_NAME == __FILE__
  puts "Process Start"
  main
  puts "Process Finished"
end

list_2

# Fetches one company page and appends its title and the text of its second
# "ts-h-company-sentence" block to the accumulators carried in +data+.
#
# NOTE: relies on open-uri (URI.open), kconv (String#toutf8) and Nokogiri
# already being loaded by the surrounding script.
#
# @param uri [String] URL of the company page to download
# @param data [Array(Array, Array)] pair of accumulators: +data[0]+ collects
#   company names, +data[1]+ collects the sentence text; both are mutated in place
# @return [Array(Array, Array)] a new pair holding the same two accumulators
def get_data(uri, data)
  companies, parameters = data

  # URI.open instead of Kernel#open: Kernel#open treats strings starting with
  # "|" as shell commands and no longer opens URLs on Ruby 3.0+. The block form
  # also closes the response stream once it has been read.
  html = URI.open(uri, &:read)
  documents = Nokogiri::HTML(html.toutf8, nil, 'utf-8')

  companies << documents.xpath("//h1[@class='ts-h-company-mainTitle']").text

  # The wanted text is the second matching block. Pages that lack it get nil so
  # both arrays stay the same length instead of raising NoMethodError on nil
  # after +companies+ has already been extended.
  sentence = documents.xpath("//div[@class='ts-h-company-sentence']")[1]
  parameters << (sentence && sentence.text.strip.tr("\r", " "))

  [companies, parameters]
end

# Scrapes a Rikunabi 2019 company page and appends the collected values as
# "company_name" / "Email" rows to a dated CSV file in the current directory.
#
# Reads a count from standard input; the company page is fetched once, plus
# once more for every value in 2..count.
def main
  uri = "https://job.rikunabi.com/2019/company/r294900083/"
  puts "What is the maximum page? "
  page = gets.to_i

  companies = []
  parameters = []
  data = [companies, parameters]

  data = get_data(uri, data)

  # NOTE(review): every iteration requests the same company URL, so the CSV
  # receives the same row repeatedly. The per-company ids ("r" + 9 digits) are
  # not available here — TODO: iterate over the real list of company URLs.
  (2..page).each { data = get_data(uri, data) }

  # Only emit rows for which both a name and a value were collected.
  row_count = [companies.size, parameters.size].min

  headers = ["company_name", "Email"]
  path = "rikunabi_tokyo_2019-#{Time.now.strftime('%Y-%m-%d')}.csv"

  # The file is opened in append mode, so only emit the header row when the file
  # is new or empty; otherwise every re-run on the same day would add another
  # header line in the middle of the data.
  needs_header = !File.exist?(path) || File.zero?(path)
  CSV.open(path, "a", headers: headers, write_headers: needs_header) do |csv|
    companies.first(row_count).zip(parameters).each { |row| csv << row }
  end
end

# Run the scraper only when this file is executed directly, not when it is
# loaded from another script.
if $PROGRAM_NAME == __FILE__
  puts "Process Start"
  main
  puts "Process Finished"
end

0 个答案:

没有答案