在运行python代码时,程序流陷入try块中

时间:2019-02-27 06:57:41

标签: python python-3.x python-requests

Python代码卡在try块中 `

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

#import urllib2
def url1_to_string(url1):
    html=""
    proxyDict = { 
             'http': 'http://username:pwd@proxyurl:8080',
             'https': 'https://username:pwd@proxyurl:8080'             
            }
    try:
        print('Before res in try')
        res = requests.get(url1,proxies=proxyDict)
        print('After res in try')
    except:
        pass

    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]


for index, row in df.iterrows():
    agg_url_list = []

    ini_url="http://www.google.com/search?q="+row['ANCHOR_NAME']+" AND "+row['ACCOUNT_NAME']
    r = requests.get(ini_url,proxies={"http":"http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))   
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text,"html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])

    aggregate_count1=0
    for x1 in sublist1[:3]:
        anchorcount1=0
        accountcount1=0
        print("aagg url",x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt: print('You cancelled the operation.')
        finally:
            pass
        ny_bb1=ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME']=row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME']=row['ACCOUNT_NAME'].upper()
        anchor_name=re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name=re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if(anchor_name==account_name):

            if(row['ANCHOR_NAME'] in ny_bb1.upper()): 
                anchorcount1 = anchorcount1 + 1 
            if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
               accountcount1 = accountcount1 + 1
        else:
            if (anchor_name in ny_bb1.upper()):               
                anchorcount1 = anchorcount1 + 1 
            if(account_name in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1

        if(anchorcount1 > 0 and accountcount1 > 0):

          aggregate_count1=aggregate_count1+1
          agg_url_list.append(x1[:])
          print("existance of both",aggregate_count1)

    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)

df['anc_cnt']=pd.Series(anchor_count)
df['acc_cnt']=pd.Series(account_count)
df['agg_cnt']=pd.Series(aggregate_page_count)
df['agg_url_list']=pd.Series(agg_url_count)

`

abc.csv文件的内容如下:::

ANCHOR_NAME,ACCOUNT_NAME

ABC,ABC

XYZ,ZYZ

以此类推

对于特定的URL,代码会卡在try块中,而控制不会进入到我想忽略异常并继续执行正常程序流程的块中,例如执行下一个URL等。

0 个答案:

没有答案