I'm trying to do some web scraping, but I run into an SSL problem when I make the requests.
Code:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


def lazy_reload(page_code='122054'):
    # Fetch a single review page from the site's API endpoint and collect the
    # fields of interest into a flat dictionary.
    review_dict = {}
    url_api = 'https://www.cruise.co.uk/API-TPL/v3/3600/Review/' + str(page_code) + '/'
    print(url_api)
    response_url = requests.get(url_api)
    response_url.raise_for_status()
    soup_url = BeautifulSoup(response_url.text, 'html.parser')
    # Review text, ship, author and date come from the schema.org microdata.
    for review in soup_url.findAll('div', {'itemprop': 'reviewBody'}):
        review_dict['review'] = review.getText()
    for ship in soup_url.findAll('a', {'itemprop': 'itemReviewed'}):
        review_dict['ship'] = ship.getText()
    for auth in soup_url.findAll('span', {'itemprop': 'author'}):
        review_dict['author'] = auth.getText()
    for date in soup_url.findAll('time'):
        if date.has_attr('datetime'):
            review_dict['date'] = date['datetime']
    # Header links, e.g. 'Destination: '.
    review_head_dict = {}
    for review_head in soup_url.findAll('a', {'class': 'review_head_links'}):
        ul_review_head = review_head.parent
        key_review_head = ul_review_head.find('strong').text
        val_review_head = ul_review_head.find('a').text
        review_head_dict[key_review_head] = val_review_head
    review_dict['destination'] = review_head_dict['Destination: ']
    # Sailing-history items such as 'Times cruised before', 'Sailed', 'Age', 'Occasion'.
    for rate_sailing_history in soup_url.findAll(lambda tag: tag.name == 'li' and tag.get('class') == ['margin-bottom-small']):
        key = rate_sailing_history.text.split(':')[0]
        if key:
            value = ''
            if rate_sailing_history.find('strong') is not None:
                value = rate_sailing_history.find('strong').text
            review_dict[key] = value
    # Individual category ratings ('Quality of Food', 'Entertainment', ...).
    for rate_sailing_history in soup_url.findAll('li', {'class': 'margin-bottom-medium fg-blue text margin-bottom-small'}):
        key = rate_sailing_history.find('strong').text
        if key:
            value = rate_sailing_history.text.replace(key, '').replace('\n', '').lstrip()
            review_dict[key] = value
    print("Data scraped")
    return review_dict
This function is used to lazy-reload a single review page from the site.
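For example, to check one page on its own (a quick sketch; '122054' is just the default code from the function signature, not necessarily a live review):

single_review = lazy_reload('122054')   # fetch and parse one review page
print(single_review.get('ship'), single_review.get('date'))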
def search_page_data(page_data, prop):
    # Return the scraped value for `prop`, or None if that page didn't have it.
    if prop in page_data:
        return page_data[prop]
    else:
        return None


def extract_number(dataframe):
    # Add a '<column> Rating' column holding just the digits from each rating string.
    df_with_rating = dataframe.copy(deep=True)
    list_of_columns = ['Quality of Food', 'Entertainment', 'Shore Excursions', 'Staff',
                       "Children's Facilities", "Onboard Activities", "Cabins"]
    for column in list_of_columns:
        dataframe[column + ' ' + 'Rating'] = dataframe[column].str.extract(r'(\d+)')
    df_with_rating = df_with_rating.append(dataframe)  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement there
    return df_with_rating
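For reference, this is roughly what the str.extract(r'(\d+)') call does (the '4 out of 5' strings below are made up, since the exact rating text from the page isn't shown here):

demo = pd.DataFrame({'Quality of Food': ['4 out of 5', '5 out of 5']})
demo['Quality of Food Rating'] = demo['Quality of Food'].str.extract(r'(\d+)')
# the new column now holds the strings '4' and '5'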
page_code = []
for i in range(1, 2):
    url = 'https://www.cruise.co.uk/royal-caribbean-cruises/reviews/?page=' + str(i)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews_link = soup.findAll('div', {'class': 'margin-top-10px alpha omega'})
    for review in reviews_link:
        # print(link.get('href'))
        link_list = review.findAll('a')
        for link in link_list:
            review_code = link.get('href')
            review_code_list = review_code.split("-")[-1]
            page_code.append(review_code_list)

dataframe_source_dict = {"review": [],
                         "ship": [],
                         "author": [],
                         "date": [],
                         "destination": [],
                         "Times cruised before": [],
                         "Sailed": [],
                         "Age": [],
                         "Occasion": [],
                         "Quality of Food": [],
                         "Entertainment": [],
                         "Shore Excursions": [],
                         "Staff": [],
                         "Children's Facilities": [],
                         "Onboard Activities": [],
                         "Cabins": [],
                         "Overall Rating": []
                         }

print(len(page_code))

i = 1
for code in page_code:
    time.sleep(1)
    page_data = lazy_reload(code)
    for k, v in dataframe_source_dict.items():
        dataframe_source_dict[k].append(search_page_data(page_data, k))
    i = i + 1
    print("Review ", i, " finished")

df = pd.DataFrame.from_dict(dataframe_source_dict)
reviews_cruise_co_uk = extract_number(df)
reviews_cruise_co_uk
The code walks through all of the scraped pages and adds each one to the dictionary, but I keep hitting this SSL error:

SSLError: HTTPSConnectionPool(host='www.cruise.co.uk', port=443): Max retries exceeded with url: /royal-caribbean-cruises/reviews/?page=37 (Caused by SSLError(SSLError("bad handshake: SysCallError(60, 'ETIMEDOUT')")))
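Would wrapping the requests in a Session with urllib3 retries and an explicit timeout be a sensible way to deal with this? Something like the rough sketch below (the retry/backoff/timeout numbers are arbitrary, and I'm not sure retries even apply to a handshake timeout):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry connection-level failures a few times with exponential backoff
retries = Retry(total=5, backoff_factor=2, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

response = session.get('https://www.cruise.co.uk/royal-caribbean-cruises/reviews/?page=37', timeout=30)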