Python data scraper and Wikipedia's protection policy

Date: 2018-05-04 07:50:19

Tags: python

When I run it I get the error below. I believe this is a wiki protection feature. How do I get around it? I'm basically trying to scrape a wiki page and search the code for links. I apologize for my terrible code and whatever terrible mistake I have made; I'm new to Python and a lot of this was chopped up, copied and pasted.

Traceback (most recent call last):
  File "C:\Users\MICHAEL\Desktop\Project X\dataprod.py", line 51, in <module>
    page = urlopen(pg)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 511, in open
    req = Request(fullurl, data)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 329, in __init__
    self.full_url = url
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 355, in full_url
    self._parse()
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 384, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '/wiki/Wikipedia:Protection_policy#semi'

The code is below:

##DataFile. Access info -> Store Info
import shelve

#Saving data in raw txt format
f = open("data.txt", 'w')
print("...")

from urllib.request import urlopen

###############
#Data Scraping#
###############

#Importing relevant librarys
from urllib.request import urlopen
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import warnings
import requests
import contextlib

#Specifying URL(s)

quote_page = 'https://en.wikipedia.org/wiki/Dog'

#
requests.packages.urllib3.disable_warnings()
response = requests.get(quote_page, verify=False)
response.status_code
#
http = httplib2.Http()
status, response = http.request(quote_page)

quotes = []
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        quotes.append(link['href'])
#        print(link['href'])

#for loop
info = []
for pg in quotes:

#querying the page and pulling html format
    page = urlopen(pg)

#store and convert using BeautifulSoup into 'soup'
    soup = BeautifulSoup(page, 'html.parser')

#Take out the <div> attribrute
    name_box = soup.find('html')

#Take data using by taking 'text'
    name = name_box.text.strip()

#data info Extra
    info.append((name))

#Displaying data grabbed
    print("PULLED DATA                                         .")

#Saving data as CSV
import csv
from datetime import datetime

# open a csv file with append, so old data will not be erased
with open("index.csv", 'a', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

#for loop
    for name in info:
        writer.writerow([name])
f.write(name)
print(f, name)


Exit=input("Press '1' to save and close: ")

if Exit == 1:
    f.close()
    exit()

1 Answer:

Answer 0 (score: 1)

You need to add a User-Agent to your request so that your script identifies itself as a bot (see https://meta.wikimedia.org/wiki/User-Agent_policy). Please change the request to:

response = requests.get(quote_page, verify=False, headers= {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
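
For reference, the linked policy actually asks for a descriptive User-Agent that names your tool and gives a way to contact you, rather than one that imitates a browser or another bot. Below is a minimal sketch of that, using a requests.Session so the header is sent on every request; the tool name, URL, and e-mail address are placeholders, not anything from the original post:

import requests

# Hypothetical descriptive User-Agent per the Wikimedia User-Agent policy;
# swap in your own tool name, version, and contact details.
session = requests.Session()
session.headers.update({
    'User-Agent': 'DogLinkScraper/0.1 (https://example.org/contact; scraper@example.org)'
})

response = session.get('https://en.wikipedia.org/wiki/Dog')
print(response.status_code)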

Try it this way:

##DataFile. Access info -> Store Info
import shelve

#Saving data in raw txt format
f = open("data.txt", 'w')
print("...")


###############
#Data Scraping#
###############

#Importing relevant librarys
from bs4 import BeautifulSoup
import warnings
import requests
import contextlib

#Specifying URL(s)

quote_page = 'https://en.wikipedia.org/wiki/Dog'

#
requests.packages.urllib3.disable_warnings()
response = requests.get(quote_page , verify=False, headers= {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
status = response.status_code
#


quotes = []
linkL = BeautifulSoup(response.content, 'html.parser')
for link in linkL.find_all("a"):
    if link.has_attr('href'):
        quotes.append(link['href'])
#        print(link['href'])

#for loop
info = []
for pg in quotes:

#querying the page and pulling html format
    page = requests.get(pg, verify=False, headers= {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})

#store and convert using BeautifulSoup into 'soup'
    soup = BeautifulSoup(page.content, 'html.parser')

#Take out the <div> attribrute
    name_box = soup.find('html')

#Take data using by taking 'text'
    name = name_box.text.strip()

#data info Extra
    info.append((name))

#Displaying data grabbed
    print("PULLED DATA                                         .")

#Saving data as CSV
import csv
from datetime import datetime

# open a csv file with append, so old data will not be erased
with open("index.csv", 'a', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

#for loop
    for name in info:
        writer.writerow([name])
f.write(name)
print(f, name)


Exit=input("Press '1' to save and close: ")

if Exit == '1':
    f.close()
    exit()
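
One last note: the original ValueError ("unknown url type: '/wiki/Wikipedia:Protection_policy#semi'") was raised because a site-relative href was passed straight to urlopen. Wikipedia's internal links come back as paths like /wiki/..., so even with the header in place you will want to turn them into absolute URLs before fetching them. A rough sketch of that step, working from the quotes list built above (skipping in-page anchors is my own addition, not part of the original code):

from urllib.parse import urljoin

base = 'https://en.wikipedia.org/wiki/Dog'
absolute = []
for href in quotes:
    # Ignore in-page anchors; urljoin expands '/wiki/...' against the base
    # and leaves links that are already absolute untouched.
    if href.startswith('#'):
        continue
    absolute.append(urljoin(base, href))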