python web-scraping ==> Google search

Date: 2019-03-18 19:49:54

Tags: python

I need help because I'm stuck. I'm working on a web-crawler project, and the problem is that I can't manage to get the "a" tags for the actual search results; I only get the ones from the www.google.com home page. The URL I use is:

   url_dorked ="https://www.google.com/search?q={}&sources=lnms&tbs=nws".format(dork_used)

When I try that URL in a browser, it shows me the correct page, but the Python script only receives the tags of the Google home page.
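
For reference, a minimal sketch of how the query could be cleaned and URL-encoded before being inserted into that URL (the dork value below is only a placeholder):

    from urllib.parse import quote_plus

    dork_used = "example search term\n"   # placeholder: one raw line read from a dork file
    # drop the trailing newline and encode spaces/special characters before formatting the URL
    query = quote_plus(dork_used.strip())
    url_dorked = "https://www.google.com/search?q={}&sources=lnms&tbs=nws".format(query)
    print(url_dorked)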

Here is the full script:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Coded by Naylor from Exploit-Zone
# Join us! ==> https://forum.exploit-zone.eu/
# Create a folder named ==> Dork

from urllib.request import *
import os
from bs4 import BeautifulSoup

"""
http ==> 80
https ==> 443
----------
200 ==> OK
400 ==> Bad request
403 ==> Forbidden
404 ==> Not found
"""

def checker():
    dork_used = ""
    url_dorked ="https://www.google.co.ve/?gws_rd=cr&ei=DgBqVpWJMoPA-gHy25fACg#q={}".format(dork_used)
    dorks = open("Dork/{}.txt".format(txtdork_path),"r")
    list_dorks = []
    dorks_lines = dorks.readlines()
    tot_dorks = len(dorks_lines)
    tot_dorks -= 1
    for line in dorks_lines:
      list_dorks.append(line)
      print("\t{}\n (--) Has been charged\n".format(line))
    print("\n(--) All {} dorks charged\n".format(tot_dorks))
    dorks.close()
    choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
    if choosen_dork >= 0 and choosen_dork <= tot_dorks:
        pass
    else:
        print("The choosen number is to big !")
        choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
    dork_used = str(list_dorks[choosen_dork])
    print("\n(--) Selected Dorks ==> {}".format(dork_used))

    req_fbypss = Request(url_dorked, headers = {'User-Agent': 'Mozilla/5.0'}) # user-agent header to get past the anti-crawl check
    init_google = urlopen(req_fbypss)#init connection
    print("(--) Google connection response ==> {}\n".format(init_google.code))
    html_google = init_google.read() #read response of init
    html_decoded = html_google.decode("utf-8")#like ISO-8859-1
    soup = BeautifulSoup(html_google, "html.parser") #start an html html_parser
    result_link = soup.findAll('a')
    for i in result_link:
        print(i,"\n")
    """
    with open("dork_{}.txt".format(choosen_dork),"a") as f:
        for result in result_1:
            f.write(result)
    """

print("\n\n\welcome\n\n")
print("here Are the available dork file :\n")
dork_list = str(os.listdir('Dork/.'))
print("=> {}\n".format(dork_list))

txtdork_path = str(input("Enter dork file's name (without '.txt'/'[]'/''')\n>"))

check_file = os.path.isfile("Dork/{}.txt".format(txtdork_path))

if check_file == True:
    print("\n(--) {} as been selected".format(txtdork_path))
else:
    print("\nWrong name!\n (write only the name of the .txt file like : Google dork 2019)\n the .txt file have to be on the Dork folder\n\n")
    exit()
checker()

The script isn't finished yet; it's just a development version.

What I've found so far is:

- either there is an anti-crawl mechanism and the user agent doesn't get past it (see the sketch after this list),

- or there is a problem with the URL and I have to modify it.
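
A quick way to test the first hypothesis is to send the same request with and without a fuller browser-style User-Agent header and compare the responses; a minimal sketch (the URL and header string are only examples):

    from urllib.request import Request, urlopen
    from urllib.error import HTTPError

    url = "https://www.google.com/search?q=test&tbs=nws"                 # example URL
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"  # example UA string
    try:
        with urlopen(Request(url, headers={"User-Agent": ua})) as resp:
            print(resp.code, len(resp.read()))   # status code and body size with the custom UA
    except HTTPError as err:
        print("Request rejected:", err.code)     # e.g. 403/429 would point at anti-crawl measures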

Thanks for your help ^^

2 answers:

Answer 0 (score: 0):

def checker():
    dork_used = ""
    url_dorked ="https://www.google.co.ve/?gws_rd=cr&ei=DgBqVpWJMoPA-gHy25fACg#q={}".format(dork_used)

When url_dorked is assigned, dork_used is still an empty string, so q= is empty. Did you intend that?

Also, I think it should be &q={}, not #q={}.
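
A minimal sketch of both points together (with placeholder values): assign url_dorked only after dork_used has been chosen, and pass q= as a real query parameter rather than a fragment:

    # placeholder values for illustration only
    list_dorks = ["site:example.com test\n"]
    choosen_dork = 0
    dork_used = str(list_dorks[choosen_dork])
    # build the URL only once dork_used has a value, and use &q= instead of #q=
    url_dorked = "https://www.google.co.ve/?gws_rd=cr&q={}".format(dork_used)
    print(url_dorked)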

Answer 1 (score: 0):

Following your suggestion, I replaced the code with this:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Coded by Naylor from Exploit-Zone
# Join us! ==> https://forum.exploit-zone.eu/
# Create a folder named ==> Dork

from urllib.request import *
import os
from bs4 import BeautifulSoup

"""
http ==> 80
https ==> 443
----------
200 ==> OK
400 ==> Bad request
403 ==> Forbidden
404 ==> Not found
"""

def checker():
    dorks = open("Dork/{}.txt".format(txtdork_path),"r")
    list_dorks = []
    dorks_lines = dorks.readlines()
    tot_dorks = len(dorks_lines)
    tot_dorks -= 1
    for line in dorks_lines:
      list_dorks.append(line)
      print("\t{}\n (--) Has been charged\n".format(line))
    print("\n(--) All {} dorks charged\n".format(tot_dorks))
    dorks.close()
    choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
    if choosen_dork >= 0 and choosen_dork <= tot_dorks:
        pass
    else:
        print("The choosen number is to big !")
        choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
    dork_used = str(list_dorks[choosen_dork])
    print("\n(--) Selected Dorks ==> {}".format(dork_used))

    url_dorked ="https://www.google.com/search?q={}&sources=lnms&tbs=nws".format(dork_used)

    req_fbypss = Request(url_dorked, headers = {'User-Agent': 'Mozilla/5.0'}) # user-agent header to get past the anti-crawl check
    init_google = urlopen(req_fbypss)#init connection
    print("(--) Google connection response ==> {}\n".format(init_google.code))
    html_google = init_google.read() #read response of init
    html_decoded = html_google.decode("ISO-8859-1")#like utf-8
    soup = BeautifulSoup(html_google, "html.parser") #start an html html_parser
    result_link = soup.findAll('a')
    for i in result_link:
        print(i,"\n")
    """
    with open("dork_{}.txt".format(choosen_dork),"a") as f:
        for result in result_1:
            f.write(result)
    """

print("\n\n\tWelcome\n\n")
print("here Are the available dork file :\n")
dork_list = str(os.listdir('Dork/.'))
print("=> {}\n".format(dork_list))

txtdork_path = str(input("Enter dork file's name (without '.txt'/'[]'/''')\n>"))

check_file = os.path.isfile("Dork/{}.txt".format(txtdork_path))

if check_file == True:
    print("\n(--) {} as been selected".format(txtdork_path))
else:
    print("\nWrong name!\n (write only the name of the .txt file like : Google dork 2019)\n the .txt file have to be on the Dork folder\n\n")
    exit()
checker()

So I replaced url_dorked, because the one I had posted was the wrong one and I had forgotten to swap it in ^^

I also moved url_dorked so that it is built after dork_used has been selected.

It then needed one more change, decoding as ISO-8859-1 instead of utf-8, but it still doesn't work :/
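
One more thing that may be worth checking (just a guess, not confirmed in this thread): on the plain-HTML results page that Google often serves to simple clients, the anchors for actual results tend to carry hrefs of the form /url?q=..., while home-page and navigation links do not, so filtering the parsed tags makes it easier to see whether any result links came back at all. A small self-contained sketch of that filtering (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    # made-up snippet standing in for the kind of markup the plain-HTML results page returns
    html = '<a href="/url?q=https://example.com/page">result</a><a href="/preferences">nav</a>'
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):   # only anchors that actually have an href
        if a["href"].startswith("/url?q="):   # keep what looks like a result link, skip navigation
            print(a["href"])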