我厌倦了循环返回的URL列表

时间:2018-05-15 03:12:13

标签: python-3.x beautifulsoup

我的第一个python项目,我正在努力去餐厅检查。一个站点的摘要提供了我想要抓取的详细报告的密钥。我很难在循环键入的网址列表中获取详细信息。

import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv", 
                       usecols=[2,14,18,80,81])
except IOError:
    print("The file is not accessible.")
insp.columns = ["CountyName", "InspectDate", 
                "NumHighVio", "LicenseID", "VisitID"]
# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) & 
    (alachua['InspectDate'] < today)]
# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
    visitID = rows['VisitID']
    licID = rows['LicenseID']
    urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID= 
        %s &licid= %s" % (visitID, licID)
    urls = urls.replace(' ', '')
    print(urls)
## here's my problem:
for url in urls:
    def get_inspect_detail():   
        html = urlopen(url)
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
        details = soup.find_all('font', {'face':'verdana'})[10:]

        for detail in details:
            siteName = details[0].text
            licNum = details[2].text
            siteRank = details[4].text
            expDate = details[6].text
            primeStatus = details[8].text
            secStatus = details[10].text
            siteAddress = details[12].text
            inspectResult = details[20].text
            observed1 = details[34].get_text
            observed2 = details[36].text
            observed3 = details[38].text
            observed4 = details[40].text
            observed5 = details[42].text
            observed6 = details[44].text
            observed7 = details[46].text
            observed8 = details[48].text
            observed9 = details[50].text
            observed10 = details[52].text

            detailsLib = {
                'Restaurant': siteName,
                'License': licNum,
                'Rank': siteRank,
                'Expires': expDate,
                'Primary': primeStatus,
                'Secondary': secStatus,
                'Address': siteAddress,
                'Result': inspectResult,
                'Observed1': observed1,
                'Observed2': observed2,
                'Observed3': observed3,
                'Observed4': observed4,
                'Observed5': observed5,
                'Observed6': observed6,
                'Observed7': observed7,
                'Observed8': observed8,
                'Observed9': observed9,
                'Observed10': observed10                  
                }
repr(get_inspect_detail())

可能是一个明显的错误或缺乏知识,但我可以获得一个网址的未刷新数据,但不是全部。

1 个答案:

答案 0 :(得分:0)

我没有看到在循环中定义函数的原因。这样你最终会得到很多冗余的定义。其次,您可以定义结果列表并在其中累积detailsLib对象。

def get_inspect_detail(url):
    html = urlopen(url)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        expDate = details[6].text
        primeStatus = details[8].text
        secStatus = details[10].text
        siteAddress = details[12].text
        inspectResult = details[20].text
        observed1 = details[34].get_text
        observed2 = details[36].text
        observed3 = details[38].text
        observed4 = details[40].text
        observed5 = details[42].text
        observed6 = details[44].text
        observed7 = details[46].text
        observed8 = details[48].text
        observed9 = details[50].text
        observed10 = details[52].text

        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
            'Expires': expDate,
            'Primary': primeStatus,
            'Secondary': secStatus,
            'Address': siteAddress,
            'Result': inspectResult,
            'Observed1': observed1,
            'Observed2': observed2,
            'Observed3': observed3,
            'Observed4': observed4,
            'Observed5': observed5,
            'Observed6': observed6,
            'Observed7': observed7,
            'Observed8': observed8,
            'Observed9': observed9,
            'Observed10': observed10
        }
        result.append(detailsLib)

    return result


for url in urls:
    repr(get_inspect_detail(url))