在迭代以下内容时,我无法将数据附加到列表中:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import pandas
def make_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The User-Agent header is attached to the outgoing request itself.
    Assigning ``addheaders`` on the object returned by ``urlopen`` happens
    after the request has already been sent, so it never reaches the server.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(request) as thepage:
        soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata
# Fetch and parse the locations page.
soup = make_soup('https://www.wellstar.org/locations/pages/default.aspx')
# Rows of [name, loc_type, address]; turned into a DataFrame at the end.
locationdata = []
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the loops below cannot be read off the text. Judging by the output described
# in the question, the three inner loops presumably run one after another
# inside the table loop - confirm against the original script.
for table in soup.findAll('table', class_ = 's4-wpTopTable'):
for name in table.findAll('div', 'PurpleBackgroundHeading'):
name = name.get_text(strip = True)
for loc_type in table.findAll('h3', class_ = 'WebFont SpotBodyGreen'):
loc_type = loc_type.get_text()
# Both class spellings are matched; 'WS_Location_Adddress' (three d's)
# presumably mirrors a typo in the page markup - TODO confirm.
for address in table.findAll('div', class_ = ['WS_Location_Address', 'WS_Location_Adddress']):
address = address.get_text(strip = True, separator = ' ')
# NOTE(review): if this append sits only in the address loop, name and
# loc_type still hold the last value left over from their own loops, which
# would explain every row sharing the same name.
locationdata.append([name, loc_type, address])
df = pandas.DataFrame(columns = ['name', 'loc_type', 'address'], data = locationdata)
print(df)
生成的数据框包含了所有不同的地址,但 name 列中只有最后一个可能的名称文本。
例如,'WellStar Windy Hill Hospital' 只是医院这一类别/类型中的最后一家医院,但它却成了所有医院的名称。如果可能的话,我更希望得到基于 list.append 的解决方案,因为这个项目中我还有几个类似的步骤要完成。
答案 0 :(得分:1)
之所以发生这种情况,是因为您在附加到 locationdata 之前就已经循环遍历了所有的 name 和 loc_type,所以每次 append 时,这两个变量都只保留着各自循环中的最后一个值。
您可以改为:
import itertools as it
from pprint import pprint as pp
# Collect the three columns of each table separately, then pair them up
# row by row so every appended record carries its own name/type/address.
for table in soup.findAll('table', class_='s4-wpTopTable'):
    names = [name.get_text(strip=True) for
             name in table.findAll('div', 'PurpleBackgroundHeading')]
    loc_types = [loc_type.get_text() for
                 loc_type in table.findAll('h3', class_='WebFont SpotBodyGreen')]
    addresses = [address.get_text(strip=True, separator=' ') for
                 address in table.findAll('div', class_=['WS_Location_Address',
                                                         'WS_Location_Adddress'])]
    # zip_longest is the Python 3 name (izip_longest on Python 2). It pads
    # the shorter lists with None, so a table with fewer headings than
    # addresses still yields one row per address.
    for name, loc_type, address in it.zip_longest(names, loc_types, addresses):
        locationdata.append([name, loc_type, address])
结果:
>>> pp(locationdata)
[[u'WellStar Urgent Care in Acworth',
u'WellStar Urgent Care Centers',
u'4550 Cobb Parkway NW Suite 101 Acworth, GA 30101 770-917-8140'],
[u'WellStar Urgent Care in Kennesaw',
None,
u'3805 Cherokee Street Kennesaw, GA 30144 770-426-5665'],
[u'WellStar Urgent Care in Marietta - Delk Road',
None,
u'2890 Delk Road Marietta, GA 30067 770-955-8620'],
[u'WellStar Urgent Care in Marietta - East Cobb',
None,
u'3747 Roswell Road Ne Suite 107 Marietta, GA 30062 470-956-0150'],
[u'WellStar Urgent Care in Marietta - Kennestone',
None,
u'818 Church Street Suite 100 Marietta, GA 30060 770-590-4190'],
[u'WellStar Urgent Care in Marietta - Sandy Plains Road',
None,
u'3600 Sandy Plains Road Marietta, GA 30066 770-977-4547'],
[u'WellStar Urgent Care in Smyrna',
None,
u'4480 North Cooper Lake Road SE Suite 100 Smryna, GA 30082 770-333-1300'],
[u'WellStar Urgent Care in Woodstock',
None,
u'120 Stonebridge Parkway Suite 310 Woodstock, GA 30189 678-494-2500']]