请理解我不擅长英语。
我创建了一个使用图片网址下载图片的抓取程序
获取imgs_urls数组中的图片网址。
然后使用urlretrieve函数使用这些URL下载它们。
在使用数组中的所有网址之前,我收到403禁用错误。
##### syncopation ######因为限制为30000个字符而被使用
如何解决错误?
def Remainder_All_ImagesURLs_Google(searchText):
def scroll_page():
for i in range(7):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(3)
def click_button():
more_imgs_button_xpath = "//*[@id='smb']"
element = driver.find_element_by_xpath(more_imgs_button_xpath)
element.click()
sleep(3)
def create_soup():
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')
return soup
driver = webdriver.Chrome('C:/Users/ajh46\Anaconda3/ChromeDriver/chromedriver.exe')
driver.maximize_window()
sleep(2)
searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
driver.get(searchUrl)
try:
scroll_page()
click_button()
scroll_page()
except:
click_button()
scroll_page()
imgs_urls = []
cnt = 0
for j in range(100):
element = driver.find_element_by_css_selector("div[data-ri = '" + str(cnt + j) + "'] img")
element.click()
sleep(1)
soup = create_soup()
for img in soup.find_all('img'):
try:
if img['src'].startswith('http') and img['src'].endswith('jpg'):
imgs_urls.append(img['src'])
except:
pass
print(str(cnt + j))
cnt += 2
driver.close()
return(imgs_urls)
def download_image(url,filename):
full_name = str(filename) + ".jpg"
urllib.request.urlretrieve(url, 'C:/Python/' + full_name)
print(full_name)
if __name__ == "__main__":
count2 = 0
searchText = 'sites:pinterest white dress'
filename = 'white dress'
for url2 in Remainder_All_ImagesURLs_Google(searchText):
download_image(url2, filename + str(count2))
count2 += 1
print(url2)
297
['https://s-media-cache-ak0.pinimg.com/736x/3f/1b/1d/3f1b1decd26c10e3ce0a14d270c4a3db.jpg' #####syncopation##### 'http://24myfashion.com/2016/wp-content/uploads/2016/04/Red-and-white-dress-2017-2018-15.jpg', 'https://s-media-cache-ak0.pinimg.com/736x/84/3d/c5/843dc5b9879801fce8ca33b569948143.jpg']
white dress0.jpg
https://s-media-cache-ak0.pinimg.com/736x/3f/1b/1d/3f1b1decd26c10e3ce0a14d270c4a3db.jpg
#####syncopation#####
white dress101.jpg
https://s-media-cache-ak0.pinimg.com/originals/4e/9e/83/4e9e83b4aaf3224b5b26482b4639004f.jpg**
Traceback (most recent call last):
File "C:/Users/ajh46/PycharmProjects/untitled3/Crawling.py", line 216, in <module>
download_image(url2, filename + str(count2))
File "C:/Users/ajh46/PycharmProjects/untitled3/Crawling.py", line 192, in download_image
urllib.request.urlretrieve(url, 'C:/Python/' + full_name)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\ajh46\Anaconda3\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
答案 0 :(得分:0)
.urlretrieve()
没有像在硒浏览器中那样的cookie或会话(为什么要获得403),并且还需要设置用户代理。
只需将您的download_image()
函数更改为以下
def download_image(url,filename):
browser = webdriver.Chrome()
browser.get(url)
userAgent = browser.execute_script("return navigator.userAgent;")
seleniumCookies= browser.get_cookies()
cookies = ''
for cookie in seleniumCookies:
cookies += '%s=%s;' % (cookie['name'], cookie['value'])
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', userAgent)]
opener.addheaders.append(('Cookie', cookies))
full_name = str(filename) + ".jpg"
urllib.request.urlretrieve(url, 'C:/Python/' + full_name)
print(full_name)
希望这会有所帮助:)