对搜索引擎抓取结果数量的限制

时间:2012-07-03 04:49:40

标签: ruby seo screen-scraping

我正在使用 Mechanize 抓取搜索引擎的结果。但是通过程序抓取时,我最多只能得到 200 条结果;而在 bing.com 上手动执行相同的搜索查询,却会返回 1400 条结果。这里有什么问题?

 # Builds a search query from +options+, runs it through +bing_search+, and
 # collects both contact candidates and cited links from the result pages.
 #
 # Both collectors receive the same first result page and follow the
 # pagination on their own.
 #
 # @param options [Hash] search criteria forwarded to +build_query+
 #   (:name, :title, :company, :group)
 # @return [Array(Array<Hash>, Array<String>)] the contacts returned by
 #   +extract_contacts_from_bing_page+ followed by the links returned by
 #   +bing_links+
 # @raise [TypeError] when +options+ is not a Hash
 def generate_profiles_from_group(options = {})
   # Pass the exception class itself: raising with the String "TypeError"
   # makes Ruby complain about the raise call and drops the intended message.
   raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)

   page = bing_search(build_query(options))
   contacts_stack = extract_contacts_from_bing_page(page: page)
   bing_links_stack = bing_links(page)
   [contacts_stack, bing_links_stack]
 end

# Collects contact candidates from a result page and from every following
# page reachable through the pager element.
#
# For each 'h3 a' node the first two whitespace-separated words of its text
# are joined into a name. Names containing anything other than ASCII letters,
# apostrophes, commas or whitespace are discarded.
#
# @param options [Hash]
#   :page    first result page; must respond to +parser+ and +link_with+
#   :company value copied into every entry (optional)
#   :title   value copied into every entry (optional)
# @return [Array<Hash>] entries shaped like {name:, company:, title:}
def extract_contacts_from_bing_page(options)
  page = options[:page]
  company = options[:company]
  title = options[:title]
  stack = []

  loop do
    page.parser.search('h3 a').each do |heading|
      words = heading.text.split(' ')
      next if words.size < 2 # empty or single-word headings carry no full name

      name = words[0] + ' ' + words[1]
      stack << {name: name, company: company, title: title} unless name =~ /[^a-zA-Z',\s]/i
    end

    # Text of the element(s) whose class list contains "sb_pagN"
    # (presumably Bing's "next page" control -- confirm against live markup).
    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    # Stop cleanly when no link carries that text instead of handing nil to
    # the agent and losing everything collected so far.
    next_link = page.link_with(text: keyw)
    break unless next_link

    page = @agent.click(next_link)
  end

  stack
end

# Collects the non-empty text of every <cite> element on a result page and on
# every following page reachable through the pager element.
#
# Sleeps a random 10-49 seconds before each page transition.
#
# @param page first result page; must respond to +parser+ and +link_with+
# @return [Array<String>] cite texts in page order
def bing_links(page)
  stack = []

  loop do
    page.parser.xpath('//cite').each do |cite|
      text = cite.text
      stack << text unless text == ""
    end

    # Text of the element(s) whose class list contains "sb_pagN"
    # (presumably Bing's "next page" control -- confirm against live markup).
    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    # Resolve the link before sleeping: if none carries that text, stop
    # cleanly instead of waiting and then handing nil to the agent.
    next_link = page.link_with(text: keyw)
    break unless next_link

    sleep(10 + rand(40))
    page = @agent.click(next_link)
  end

  stack
end
# Turns search criteria into a LinkedIn-restricted query string.
#
# The first matching combination wins, checked in this order:
# name + company, name + title, title + company, group.
#
# @param options [Hash] may contain :name, :title, :company, :group
# @return [String, nil] the query, or nil when no combination applies
def build_query(options)
  lookup = lambda { |key| options.has_key?(key) ? options[key] : nil }
  name, title, company, group = [:name, :title, :company, :group].map(&lookup)

  return "site:linkedin.com \"#{name}\" \"at #{company}\"" if name && company
  return "site:linkedin.com \"#{name}\" \"#{title}\"" if name && title
  return "site:linkedin.com/ \"#{title}\" \"at #{company}\"" if title && company
  return "site:linkedin.com \"groups and association\" + \"#{group}\"" if group

  nil
end

0 个答案:

没有答案