When I run this I get the error below. I believe it is caused by Wikipedia's page-protection feature. How do I get past it? I'm basically trying to scrape a wiki page and search it for links in the code. I apologise for my terrible code and for whatever terrible mistake I've made; I'm new to Python and a lot of this was chopped up and copy-pasted.
Traceback (most recent call last):
  File "C:\Users\MICHAEL\Desktop\Project X\dataprod.py", line 51, in <module>
    page = urlopen(pg)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 511, in open
    req = Request(fullurl, data)
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 329, in __init__
    self.full_url = url
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 355, in full_url
    self._parse()
  File "C:\Program Files (x86)\Python36-32\lib\urllib\request.py", line 384, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '/wiki/Wikipedia:Protection_policy#semi'
The code is below:
##DataFile. Access info -> Store Info
import shelve
#Saving data in raw txt format
f = open("data.txt", 'w')
print("...")
from urllib.request import urlopen
###############
#Data Scraping#
###############
#Importing relevant librarys
from urllib.request import urlopen
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import warnings
import requests
import contextlib
#Specifying URL(s)
quote_page = 'https://en.wikipedia.org/wiki/Dog'
#
requests.packages.urllib3.disable_warnings()
response = requests.get(quote_page, verify=False)
response.status_code
#
http = httplib2.Http()
status, response = http.request(quote_page)
quotes = []
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        quotes.append(link['href'])
        # print(link['href'])
#for loop
info = []
for pg in quotes:
    #querying the page and pulling html format
    page = urlopen(pg)
    #store and convert using BeautifulSoup into 'soup'
    soup = BeautifulSoup(page, 'html.parser')
    #Take out the <div> attribrute
    name_box = soup.find('html')
    #Take data using by taking 'text'
    name = name_box.text.strip()
    #data info Extra
    info.append((name))
    #Displaying data grabbed
    print("PULLED DATA .")
#Saving data as CSV
import csv
from datetime import datetime
# open a csv file with append, so old data will not be erased
with open("index.csv", 'a', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    #for loop
    for name in info:
        writer.writerow([name])
        f.write(name)
        print(f, name)
Exit=input("Press '1' to save and close: ")
if Exit == 1:
    f.close()
    exit()
Answer (score: 1)
You need to add a user agent to the request that identifies your script as a bot; see https://meta.wikimedia.org/wiki/User-Agent_policy. Change your request to:
response = requests.get(quote_page, verify=False, headers= {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
Try it this way:
##DataFile. Access info -> Store Info
import shelve
#Saving data in raw txt format
f = open("data.txt", 'w')
print("...")
###############
#Data Scraping#
###############
#Importing relevant librarys
from bs4 import BeautifulSoup
import warnings
import requests
import contextlib
#Specifying URL(s)
quote_page = 'https://en.wikipedia.org/wiki/Dog'
#
requests.packages.urllib3.disable_warnings()
response = requests.get(quote_page , verify=False, headers= {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
status = response.status_code
#
quotes = []
linkL = BeautifulSoup(response.content, 'html.parser')
for link in linkL.find_all("a"):
    if link.has_attr('href'):
        quotes.append(link['href'])
        # print(link['href'])
#for loop
info = []
for pg in quotes:
    #querying the page and pulling html format
    page = requests.get(pg, verify=False, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
    #store and convert using BeautifulSoup into 'soup'
    soup = BeautifulSoup(page.content, 'html.parser')
    #grab the whole <html> element
    name_box = soup.find('html')
    #pull out its text
    name = name_box.text.strip()
    #data info extra
    info.append(name)
    #Displaying data grabbed
    print("PULLED DATA .")
#Saving data as CSV
import csv
from datetime import datetime
# open a csv file with append, so old data will not be erased
with open("index.csv", 'a', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    #for loop
    for name in info:
        writer.writerow([name])
        f.write(name)
        print(f, name)
Exit=input("Press '1' to save and close: ")
if Exit == '1':  # input() returns a string, so compare with '1'
    f.close()
    exit()
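One thing to watch out for: the traceback shows that many of the href values scraped from the page are relative links such as '/wiki/Wikipedia:Protection_policy#semi', and neither urlopen nor requests.get can fetch those on their own. Below is a minimal sketch of one way to deal with that, assuming you only want absolute URLs; the urljoin call, the startswith filters, and the BASE constant are my additions for illustration, not part of the code above.

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE = 'https://en.wikipedia.org'
HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}

response = requests.get(BASE + '/wiki/Dog', headers=HEADERS)
soup = BeautifulSoup(response.content, 'html.parser')

quotes = []
for link in soup.find_all('a', href=True):
    href = link['href']
    if href.startswith('#'):
        continue  # skip same-page fragment links
    # urljoin turns relative links like '/wiki/Cat' into absolute URLs
    # and leaves already-absolute links untouched
    full_url = urljoin(BASE, href)
    if full_url.startswith('http'):  # drop mailto:, javascript: and similar schemes
        quotes.append(full_url)

print(len(quotes), 'absolute links collected')

Once every URL handed to the request call is absolute, the "unknown url type" error from the traceback goes away; whether you also want to restrict the list to links under '/wiki/' is up to you.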