I'm trying to do some web scraping, but I run into an SSL problem when I make the requests.
Code:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


def lazy_reload(page_code='122054'):
    # Fetch a single review page from the site's API endpoint and collect the
    # fields of interest into a flat dictionary.
    review_dict = {}
    url_api = 'https://www.cruise.co.uk/API-TPL/v3/3600/Review/' + str(page_code) + '/'
    print(url_api)
    response_url = requests.get(url_api)
    response_url.raise_for_status()
    soup_url = BeautifulSoup(response_url.text, 'html.parser')
    # Review text, ship, author and date come from the schema.org microdata.
    for review in soup_url.findAll('div', {'itemprop': 'reviewBody'}):
        review_dict['review'] = review.getText()
    for ship in soup_url.findAll('a', {'itemprop': 'itemReviewed'}):
        review_dict['ship'] = ship.getText()
    for auth in soup_url.findAll('span', {'itemprop': 'author'}):
        review_dict['author'] = auth.getText()
    for date in soup_url.findAll('time'):
        if date.has_attr('datetime'):
            review_dict['date'] = date['datetime']
    # Header links, e.g. 'Destination: '.
    review_head_dict = {}
    for review_head in soup_url.findAll('a', {'class': 'review_head_links'}):
        ul_review_head = review_head.parent
        key_review_head = ul_review_head.find('strong').text
        val_review_head = ul_review_head.find('a').text
        review_head_dict[key_review_head] = val_review_head
    review_dict['destination'] = review_head_dict['Destination: ']
    # Sailing-history items such as 'Times cruised before', 'Sailed', 'Age', 'Occasion'.
    for rate_sailing_history in soup_url.findAll(lambda tag: tag.name == 'li' and tag.get('class') == ['margin-bottom-small']):
        key = rate_sailing_history.text.split(':')[0]
        if key:
            value = ''
            if rate_sailing_history.find('strong') is not None:
                value = rate_sailing_history.find('strong').text
            review_dict[key] = value
    # Individual category ratings ('Quality of Food', 'Entertainment', ...).
    for rate_sailing_history in soup_url.findAll('li', {'class': 'margin-bottom-medium fg-blue text margin-bottom-small'}):
        key = rate_sailing_history.find('strong').text
        if key:
            value = rate_sailing_history.text.replace(key, '').replace('\n', '').lstrip()
            review_dict[key] = value
    print("Data scraped")
    return review_dict
This function is used to lazy-reload a single review page from the site.
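For example, to check one page on its own (a quick sketch; '122054' is just the default code from the function signature, not necessarily a live review):

single_review = lazy_reload('122054')   # fetch and parse one review page
print(single_review.get('ship'), single_review.get('date'))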
def search_page_data(page_data, prop):
    # Return the scraped value for `prop`, or None if that page didn't have it.
    if prop in page_data:
        return page_data[prop]
    else:
        return None


def extract_number(dataframe):
    # Add a '<column> Rating' column holding just the digits from each rating string.
    df_with_rating = dataframe.copy(deep=True)
    list_of_columns = ['Quality of Food', 'Entertainment', 'Shore Excursions', 'Staff',
                       "Children's Facilities", "Onboard Activities", "Cabins"]
    for column in list_of_columns:
        dataframe[column + ' ' + 'Rating'] = dataframe[column].str.extract(r'(\d+)')
    df_with_rating = df_with_rating.append(dataframe)  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement there
    return df_with_rating
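For reference, this is roughly what the str.extract(r'(\d+)') call does (the '4 out of 5' strings below are made up, since the exact rating text from the page isn't shown here):

demo = pd.DataFrame({'Quality of Food': ['4 out of 5', '5 out of 5']})
demo['Quality of Food Rating'] = demo['Quality of Food'].str.extract(r'(\d+)')
# the new column now holds the strings '4' and '5'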
page_code = []
for i in range(1, 2):
    url = 'https://www.cruise.co.uk/royal-caribbean-cruises/reviews/?page=' + str(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews_link = soup.findAll('div', {'class': 'margin-top-10px alpha omega'})
    for review in reviews_link:
        # print(link.get('href'))
        link_list = review.findAll('a')
        for link in link_list:
            review_code = link.get('href')
            review_code_list = review_code.split("-")[-1]
            page_code.append(review_code_list)

dataframe_source_dict = {"review": [],
                         "ship": [],
                         "author": [],
                         "date": [],
                         "destination": [],
                         "Times cruised before": [],
                         "Sailed": [],
                         "Age": [],
                         "Occasion": [],
                         "Quality of Food": [],
                         "Entertainment": [],
                         "Shore Excursions": [],
                         "Staff": [],
                         "Children's Facilities": [],
                         "Onboard Activities": [],
                         "Cabins": [],
                         "Overall Rating": []
                         }

print(len(page_code))

i = 1
for code in page_code:
    time.sleep(1)
    page_data = lazy_reload(code)
    for k, v in dataframe_source_dict.items():
        dataframe_source_dict[k].append(search_page_data(page_data, k))
    i = i + 1
    print("Review ", i, " finished")

df = pd.DataFrame.from_dict(dataframe_source_dict)
reviews_cruise_co_uk = extract_number(df)
reviews_cruise_co_uk
The code walks through all of the scraped pages and adds each one to the dictionary, but I keep hitting this SSL error:

SSLError: HTTPSConnectionPool(host='www.cruise.co.uk', port=443): Max retries exceeded with url: /royal-caribbean-cruises/reviews/?page=37 (Caused by SSLError(SSLError("bad handshake: SysCallError(60, 'ETIMEDOUT')")))
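Would wrapping the requests in a Session with urllib3 retries and an explicit timeout be a sensible way to deal with this? Something like the rough sketch below (the retry/backoff/timeout numbers are arbitrary, and I'm not sure retries even apply to a handshake timeout):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry connection-level failures a few times with exponential backoff
retries = Retry(total=5, backoff_factor=2, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

response = session.get('https://www.cruise.co.uk/royal-caribbean-cruises/reviews/?page=37', timeout=30)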