Python 网页抓取:循环遍历选项卡

时间:2017-12-27 18:18:28

标签: python html web-scraping beautifulsoup

寻求帮助:如何遍历网站上的所有选项卡,以抓取全部相关信息。

在下面的网站中,有一些选项卡,例如 5x5、5x10、5x15、10x10 等。我不知道该如何组织代码,才能在脚本中编写循环来依次遍历这些选项卡。感谢您的帮助。

下面是python脚本;

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv

# Pages to scrape; only the "?size=5x5" view of a single store is listed.
urls = [
    'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5'
]

# Path of the CSV file that receives the scraped rows.
filename = 'life_storage.csv'

# 'a+' appends to an existing file, so the header row written below is
# added again on every run.
f = open(filename, 'a+')
csv_writer = csv.writer(f) 

headers = ['unit_size', 'unit_type', 'description', 'online_price', 'reg_price', 'store_address', 'store_city', 'store_state', 'store_postalcode' ]

## Intended column contents:
## unit_size = 5'x10' without the '
## unit_type = climate controlled or not (this could be blank if non-climate)
## description = the level it's on and type of access.
## online_price = $##/mo text
## reg_price = the scratched off $## text

csv_writer.writerow(headers)

for my_url in urls:
    # Download the page and parse it with the built-in HTML parser.
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')   


    # Every address block on the page, and every <ul id="spaceList"> element.
    store_locator = page_soup.findAll("div", {"itemprop": "address"})
    containers = page_soup.findAll("ul", {"id": "spaceList"})

    for container in containers:
        for store_location in store_locator:
            # Address parts come from the schema.org itemprop spans.
            store_address1 = store_location.find("span", {"itemprop": "streetAddress"})
            store_address = store_address1.text
            store_city1 = store_location.find("span", {"itemprop": "addressLocality"})
            store_city = store_city1.text
            store_state1 = store_location.find("span", {"itemprop": "addressRegion"})
            store_state = store_state1.text
            store_postalcode1 = store_location.find("span", {"itemprop": "postalCode"})
            store_postalcode = store_postalcode1.text
            # find() returns only the first match inside the container, so
            # each lookup below sees just the first unit of the whole list.
            title_container = container.find("div", {"class": "storesRow"})
            # The text of the whole storesRow div, not only the size.
            unit_size = title_container.text
            # Same lookup as title_container above.
            unit_container = container.find("div", {"class": "storesRow"})
            unit_type = unit_container.strong.text
            description_container = container.find("ul", {"class": "features"})
            description = description_container.text
            online_price_container = container.find("div", {"class": "priceBox"})
            online_price =  online_price_container.strong.text
            # Same lookup as online_price_container above; <i> holds the regular price.
            reg_price_container = container.find("div", {"class": "priceBox"})
            reg_price = reg_price_container.i.text

        # NOTE: this call sits outside the inner loop, so exactly one row is
        # written per container, using the values left over from the last
        # inner iteration (NameError if store_locator was empty).
        csv_writer.writerow([unit_size, unit_type, description, online_price, reg_price, store_address, store_city, store_state, store_postalcode])

f.close()

以下是html正文中与循环相关的片段;

//////////\\\\\\\Description BOX



<div class="storesRow">
    <strong>
<a href="/reservation/choose/?store=610&amp;type=1"> 5' x 5'<sup>*</sup> - Climate Controlled </a>
</strong>
    <ul class="features">
        <li>Indoor access</li>
        <li>Ground Level</li>
    </ul>
</div>



//////////\\\\\\\\\PRICE BOX

<div class="priceBox">
<strong>

                                        $25/mo





                                                <i> $27</i>
</strong>
<em class="pOnly ">Phone &amp; online only</em>
<div class="specialsMessage">
</div>
</div>


//////////\\\\\\\\\ADDRESS BOX


<div itemprop="address" itemscope="" itemtype="https://schema.org/PostalAddress">
<em>
<i class="fa fa-map-marker"></i>
<span itemprop="streetAddress">7244 Overland Rd </span>
<span itemprop="addressLocality">Orlando</span>,

        <span itemprop="addressRegion">FL</span>
<span itemprop="postalCode">32810</span>
</em>
</div>

当前输出 enter image description here

期望的输出 enter image description here

1 个答案:

答案 0 :(得分:0)

你的缩进有误——writerow() 应该放在内层 for 循环内。

不过,要从每个条目中提取出正确的文本,可能还需要做更多处理。请参见下面的代码。

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv

# Pages to scrape; each URL is one store page filtered by unit size.
urls = [
    'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5'
]

# Path of the CSV file that receives the scraped rows.
filename = 'life_storage.csv'

headers = ['unit_size', 'unit_type', 'description', 'online_price', 'reg_price', 'store_address', 'store_city', 'store_state', 'store_postalcode' ]

## Column contents:
## unit_size = e.g. 5' x 10', with the footnote '*' removed
## unit_type = climate controlled or not (blank if non-climate)
## description = the level it's on and type of access
## online_price = $##/mo text
## reg_price = the scratched off $## text


def _span_text(parent, itemprop):
    """Return the stripped text of ``<span itemprop=...>`` under *parent*.

    Returns '' when *parent* is None or the span is missing, so a page
    without a complete address block does not abort the whole run.
    """
    if parent is None:
        return ''
    node = parent.find("span", {"itemprop": itemprop})
    return node.text.strip() if node is not None else ''


# `with` closes the file even if a request or a parse step raises.
# newline='' is what the csv module documents for writer files; without it
# Windows output gets an extra blank line after every row.
# NOTE: 'a+' appends, so the header row is written again on every run.
with open(filename, 'a+', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(headers)

    for my_url in urls:
        # The response object is a context manager; it is closed on exit.
        with uReq(my_url) as uClient:
            page_html = uClient.read()
        page_soup = soup(page_html, 'html.parser')

        # The store address is the same for every unit on the page, so it
        # is read once, outside the per-unit loop.
        store_location = page_soup.find("div", {"itemprop": "address"})
        store_address = _span_text(store_location, "streetAddress")
        store_city = _span_text(store_location, "addressLocality")
        store_state = _span_text(store_location, "addressRegion")
        store_postalcode = _span_text(store_location, "postalCode")

        # Each unit is one <li> of <ul id="spaceList">; iterating the <ul>
        # itself would only ever expose the first unit.
        space_list = page_soup.find("ul", {"id": "spaceList"})
        if space_list is None:
            # No unit list on this page: nothing to write for this URL.
            continue
        containers = space_list.findAll('li')
        print('len(containers):', len(containers))

        for container in containers:
            title = container.find("div", {"class": "storesRow"})
            if title is None or title.strong is None:
                # Not a unit row (no title block): skip it.
                continue

            # Title looks like "5' x 5'* - Climate Controlled"; the part
            # after the first '-' is optional.
            parts = title.strong.text.strip().split('-')
            unit_size = parts[0].strip().replace('*', "")
            unit_type = parts[1].strip() if len(parts) > 1 else ''

            features = container.find("ul", {"class": "features"})
            if features is not None:
                description = features.text.strip().replace("\n", ',')
            else:
                description = ''

            price_box = container.find("div", {"class": "priceBox"})

            if price_box is not None and price_box.i is not None:
                reg_price = price_box.i.text.strip()
            else:
                reg_price = ''

            if price_box is not None and price_box.strong is not None:
                if price_box.i is not None:
                    # <i> (regular price) is nested in <strong>; remove it so
                    # only the online price text remains.
                    price_box.i.extract()
                online_price = price_box.strong.text.strip()
            else:
                online_price = ''

            csv_writer.writerow([unit_size, unit_type, description, online_price, reg_price, store_address, store_city, store_state, store_postalcode])

结果:

unit_size,unit_type,description,online_price,reg_price,store_address,store_city,store_state,store_postalcode
5' x 5',Climate Controlled,"Indoor access,Ground Level",$25/mo,$27,7244 Overland Rd,Orlando,FL,32810
5' x 5',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
5' x 10',,"Outdoor/Drive-up access,Ground Level",$46/mo,$50,7244 Overland Rd,Orlando,FL,32810
10' x 5',Climate Controlled,"Indoor access,Ground Level",$57/mo,$62,7244 Overland Rd,Orlando,FL,32810
5' x 10',Climate Controlled,"Indoor access,Ground Level",$67/mo,$73,7244 Overland Rd,Orlando,FL,32810
5' x 10',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
5' x 15',Climate Controlled,"Indoor access,Ground Level",$69/mo,$75,7244 Overland Rd,Orlando,FL,32810
10' x 10',,"Outdoor/Drive-up access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810
10' x 10',Climate Controlled,"Indoor access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810
10' x 10',Climate Controlled,"Indoor access,Ground Level",$124/mo,$136,7244 Overland Rd,Orlando,FL,32810
10' x 15',,"Outdoor/Drive-up access,Ground Level",$144/mo,$158,7244 Overland Rd,Orlando,FL,32810
10' x 16',,"Outdoor/Drive-up access,Ground Level",$145/mo,$159,7244 Overland Rd,Orlando,FL,32810
10' x 15',Climate Controlled,"Indoor access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810
10' x 18',,"Outdoor/Drive-up access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810
10' x 15',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
10' x 20',,"Outdoor/Drive-up access,Ground Level",$147/mo,$161,7244 Overland Rd,Orlando,FL,32810
10' x 25',Climate Controlled,"Indoor access,Ground Level",$175/mo,$192,7244 Overland Rd,Orlando,FL,32810
10' x 20',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
10' x 28',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
41' x 41',,"Outdoor/Drive-up access,Ground Level",$1400/mo,$1540,7244 Overland Rd,Orlando,FL,32810
22' x 25',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810
18' x 38',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810