我正在使用 Mechanize 抓取搜索引擎的结果。通过程序抓取时,我最多只能得到 200 个结果;但如果我直接在 bing.com 上执行相同的搜索查询,它会返回 1400 个结果。这里出了什么问题?
# Builds a search query from +options+, runs it through bing_search, and
# collects both the contacts and the result links found on the result pages.
#
# @param options [Hash] search criteria handed to build_query
#   (it reads :name, :title, :company and :group)
# @return [Array] two elements: the contacts returned by
#   extract_contacts_from_bing_page and the link texts returned by bing_links
# @raise [TypeError] if +options+ is not a Hash
def generate_profiles_from_group(options = {})
  # The exception class must be the constant, not the String "TypeError":
  # a String first argument followed by a message makes `raise` itself fail
  # and the intended message is lost.
  raise TypeError, "Invalid Arguments" unless options.is_a? Hash

  page = bing_search(build_query(options))
  # NOTE(review): both helpers page through the results starting from the same
  # first page, so every following page is requested twice.
  contacts_stack = extract_contacts_from_bing_page page: page
  bing_links_stack = bing_links page
  [contacts_stack, bing_links_stack]
end
# Collects contacts from a result page and from every page reachable through
# the pager link, using the text of each 'h3 a' element as the source of names.
#
# A contact is recorded when the element text has at least two
# whitespace-separated words and the first two words contain only letters,
# apostrophes, commas and whitespace.
#
# @param options [Hash] :page is the first result page (must respond to
#   #parser and #link_with); :company and :title are copied unchanged into
#   every contact and default to nil
# @return [Array<Hash>] contacts shaped as {name:, company:, title:}
def extract_contacts_from_bing_page(options)
  page = options[:page]
  company = options[:company]
  title = options[:title]
  stack = []
  loop do
    page.parser.search('h3 a').each do |anchor|
      # Empty text splits to [], so it is rejected by the same check as
      # single-word text.
      first_word, second_word = anchor.text.split(' ')
      next unless first_word && second_word

      name = "#{first_word} #{second_word}"
      stack << {name: name, company: company, title: title} unless name =~ /[^a-zA-Z',\s]/i
    end
    # Text of the element(s) carrying the "sb_pagN" class — presumably Bing's
    # "next page" control; empty text means there is nothing left to follow.
    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    next_link = page.link_with(text: keyw)
    # Stop paging (and keep what was gathered) instead of handing nil to
    # @agent.click when no link carries exactly that text.
    break unless next_link

    page = @agent.click next_link
  end
  stack
end
# Collects the non-empty text of every <cite> element on a result page and on
# every page reachable through the pager link.
#
# @param page [#parser, #link_with] first result page
# @param delay [Range] pause in seconds before each following page is
#   requested; a random value from the range is drawn every time. The default
#   covers the same 10..49 second span as the former `10 + rand(40)`.
# @return [Array<String>] cite texts in page order
def bing_links(page, delay: 10..49)
  stack = []
  loop do
    page.parser.xpath('//cite').each do |cite|
      text = cite.text
      stack << text unless text == ""
    end
    # Text of the element(s) carrying the "sb_pagN" class — presumably Bing's
    # "next page" control; empty text means there is nothing left to follow.
    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    next_link = page.link_with(text: keyw)
    # Resolve the link before sleeping: without a matching link there is no
    # request to throttle, and nil must not reach @agent.click.
    break unless next_link

    sleep(rand(delay))
    page = @agent.click next_link
  end
  stack
end
# Assembles the search-engine query string for the given criteria.
#
# The first matching combination wins, checked in this order:
# name + company, name + title, title + company, group alone.
#
# @param options [Hash] may contain :name, :title, :company and :group
# @return [String, nil] the query, or nil when no combination applies
def build_query(options)
  # fetch with a nil fallback reads only keys that are actually present.
  name, title, company, group = %i[name title company group].map { |key| options.fetch(key, nil) }

  return "site:linkedin.com \"#{name}\" \"at #{company}\"" if name && company
  return "site:linkedin.com \"#{name}\" \"#{title}\"" if name && title
  return "site:linkedin.com/ \"#{title}\" \"at #{company}\"" if title && company
  return "site:linkedin.com \"groups and association\" + \"#{group}\"" if group

  nil
end