从Google抓取/提取Skype ID

时间:2019-06-16 17:56:01

标签: python

基本上,一些网站会在页面上以下列两种格式之一显示Skype ID:“Skype ID: USERNAMEWOULDBEHERE”或“Skype: USERNAMEWOULDBEHERE”。

我只是想提取他们的用户名/ Skype ID。

我做错什么了吗?如何同时匹配这两种字符串(“Skype:”和“Skype ID:”)?

非常感谢您的帮助。我是Python初学者,还请多多包涵。

#!/usr/bin/env python2
# -*- coding: utf8 -*-

import sys
import time
import random
import argparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchFrameException
from selenium.webdriver.common.keys import Keys

# If this script no longer fetches any results check the XPath

def parse_args():
    """Parse the scraper's command-line options.

    Returns:
        argparse.Namespace with two attributes:
        ``search`` -- the search term (``None`` when ``-s`` is omitted);
        ``pages``  -- how many result pages to scrape, kept as a string
        (``'1'`` when ``-p`` is omitted).
    """
    arg_parser = argparse.ArgumentParser()
    # Each entry is (flag names, keyword settings) for one option.
    option_table = (
        (('-s', '--search'),
         {'help': 'Enter the search term'}),
        (('-p', '--pages'),
         {'default': '1',
          'help': 'Enter how many pages to scrape (1 page = 100 results)'}),
    )
    for flags, settings in option_table:
        arg_parser.add_argument(*flags, **settings)
    return arg_parser.parse_args()

def start_browser():
    """Launch a Firefox WebDriver session and return it.

    The driver is given an implicit wait of 10 (seconds, per Selenium's
    ``implicitly_wait``) before it is handed back to the caller, who is
    responsible for quitting it.
    """
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)
    return driver

def get_ua():
    """Return one User-Agent string picked at random from a fixed set.

    The candidates are Chrome, Safari and Firefox identifiers for Windows
    and Mac OS X.
    """
    candidates = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    )
    return random.choice(candidates)

def _extract_skype_ids(text):
    """Return the Skype IDs announced in *text*, in order, without duplicates.

    Both spellings are recognised, case-insensitively: ``Skype: NAME`` and
    ``Skype ID: NAME``.
    """
    import re  # local import: the file's import block does not provide ``re``

    # Optional " ID" after "Skype", then a colon, then the name itself.
    # The name may contain ':' so that "live:username" style IDs survive.
    pattern = re.compile(
        r'\bSkype(?:\s+ID)?\s*:\s*([A-Za-z0-9][A-Za-z0-9_.:\-]*)',
        re.IGNORECASE)
    found = []
    for match in pattern.finditer(text):
        # Drop sentence punctuation that directly follows the name.
        skype_id = match.group(1).rstrip('.:-')
        if skype_id and skype_id not in found:
            found.append(skype_id)
    return found


def scrape_results(br):
    """Collect the Skype IDs shown on the page currently loaded in *br*.

    Args:
        br: Selenium WebDriver positioned on a results page.

    Returns:
        A list of ``(skype_id, url)`` tuples, where ``skype_id`` is the
        UTF-8 encoded ID and ``url`` is the address of the page it was
        found on. The list is empty when the page shows no Skype ID.
    """
    # "Skype ID: " is plain text, not an XPath expression, so it cannot be
    # used as a selector. Read the rendered text of the whole page instead
    # and search it; this also copes with a snippet that may be split
    # across several inline elements.
    page_text = br.find_element_by_xpath('//body').text
    page_url = br.current_url
    return [(skype_id.encode('utf8'), page_url)
            for skype_id in _extract_skype_ids(page_text)]

def go_to_page(br, page_num, search_term):
    """Load one page of Google results for *search_term* in the browser.

    Args:
        br: Selenium WebDriver used to load the page.
        page_num: 1-based page number; every page holds 100 results, so
            page ``n`` starts at result ``(n - 1) * 100``.
        search_term: Raw search phrase. It is URL-encoded here, so it may
            contain spaces, ``&``, ``#`` and similar characters.
    """
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3

    start_results = str((page_num - 1) * 100)
    # Encode the term so reserved characters cannot break the query string.
    url = ('https://www.google.com/webhp?#num=100&start=' + start_results +
           '&q=' + quote_plus(search_term))
    print('[*] Fetching 100 results from page ' + str(page_num) + ' at ' + url)
    br.get(url)
    # Fixed pause after navigation; presumably lets the results finish loading.
    time.sleep(2)

def main():
    args = parse_args()
    br = start_browser()
    if not args.search:
        sys.exit("[!] Enter a term or phrase to search with the -s option: -s 'dan mcinerney'")
    search_term = args.search
    pages = args.pages

    all_results = []
    for page_num in xrange(int(pages)):
        page_num = page_num+1 # since it starts at 0
        go_to_page(br, page_num, search_term)
        titles_urls = scrape_results(br)
        for title in titles_urls:
            all_results.append(title)

    for result in all_results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url
    br.quit()

# Entry point: run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 个答案:

没有答案