I am new to both web scraping and Python. For my bachelor thesis I need data on river cruise ships. I managed to write the following script, which works against http://www.cruiseshipschedule.com.
I also wrote a second script that collects all the links to the ships I am interested in. I would like help combining the two, so that the same 3 tables are scraped from every link. Here is my scraper; it runs without any errors.
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
import csv

url1 = 'http://www.cruiseshipschedule.com/ama-waterways/ms-amabella-cruises/'
mech = Browser()
page1 = mech.open(url1)
html = page1.read()
soup1 = BeautifulSoup(html, "lxml")

ship_in = soup1.h1
ship_in = ship_in.text
ship = u' '.join(ship_in.split())
u' '.join(ship_in.split())
ship = [ship]

h21 = soup1.h2
h22 = h21.findNext('h2')
h23 = h22.findNext('h2')
h24 = h23.findNext('h2')
h25 = h24.findNext('h2')
h_y1 = h22.text
h_y2 = h23.text
h_y3 = h24.text
itinerary1_header = u' '.join(h_y1.split())
u' '.join(h_y1.split())
itinerary2_header = u' '.join(h_y2.split())
u' '.join(h_y2.split())
itinerary3_header = u' '.join(h_y3.split())
u' '.join(h_y3.split())

table_1 = soup1.findAll('table')[0]
table_2 = soup1.findAll('table')[1]
table_3 = soup1.findAll('table')[2]
rows_1 = table_1.findAll("tr")
rows_2 = table_2.findAll("tr")
rows_3 = table_3.findAll("tr")

for row_1 in rows_1:
    cells_1 = row_1.findAll('td')
    list_1 = table_1.findAll('li')
    decks = str(list_1[0].get_text()).split(':')
    cabin = str(list_1[1].get_text()).split(':')
    cabin_number = str(list_1[2].get_text()).split(':')
    list_key = ''.join(list(decks[0] + '|' + cabin[0] + '|' + cabin_number[0]))
    list_value = ''.join(list(decks[1] + '|' + cabin[1] + '|' + cabin_number[1]))
    list_key = list_key.split('|')
    list_value = list_value.split('|')
    try: # we use "try" because the table is not well formatted; this lets the program continue after an error
        col1_1 = str(cells_1[0].get_text()).split('\n') # isolates the item by its column in the table and converts it into a string
        col2_1 = str(cells_1[1].get_text()).split('\n')
        col3_1 = str(cells_1[2].get_text()).split('\n')
        col4_1 = str(cells_1[3].get_text()).split('\n')
    except:
        continue # move on to the next row after an error
    keys_1 = ['ship'] + col1_1 + col3_1 + list_key
    values_1 = ship + col2_1 + col4_1 + list_value
    dict_1 = dict(zip(keys_1, values_1))
    with open('Z:/Cruiseshipschedule/details/details_' + ship_in + '.csv', 'wb') as f: # just use 'w' mode in 3.x
        w = csv.DictWriter(f, dict_1.keys())
        w.writeheader()
        w.writerow(dict_1)
    f.close()

list_of_rows_2 = []
for row_2 in rows_2:
    cells_2 = row_2.find_all("td")
    list_of_cells_2 = [itinerary1_header]
    try: # we use "try" because the table is not well formatted; this lets the program continue after an error
        date1_2 = str(cells_2[0].get_text())
        itinerary2_2 = str(cells_2[1].get_text()).split('\n')
        price3_2 = str(cells_2[2].get_text()).split('\n')
        list_of_cells_2.append(date1_2)
        list_of_cells_2.append(itinerary2_2)
        list_of_cells_2.append(price3_2)
    except:
        continue # move on to the next row after an error
    list_of_cells_2.append(ship)
    list_of_rows_2.append(list_of_cells_2)
outfile_2 = open('Z:/Cruiseshipschedule/itinerary1/itinerary1_' + ship_in + '.csv', "wb")
writer = csv.writer(outfile_2, delimiter='|')
writer.writerows(list_of_rows_2)
outfile_2.close()

list_of_rows_3 = []
for row_3 in rows_3:
    cells_3 = row_3.find_all("td")
    list_of_cells_3 = [itinerary2_header]
    try: # we use "try" because the table is not well formatted; this lets the program continue after an error
        date1_3 = str(cells_3[0].get_text())
        itinerary2_3 = str(cells_3[1].get_text()).split('\n')
        price3_3 = str(cells_3[2].get_text()).split('\n')
        list_of_cells_3.append(date1_3)
        list_of_cells_3.append(itinerary2_3)
        list_of_cells_3.append(price3_3)
    except:
        continue # move on to the next row after an error
    list_of_cells_3.append(ship)
    list_of_rows_3.append(list_of_cells_3)
outfile_3 = open('Z:/Cruiseshipschedule/itinerary2/itinerary2_' + ship_in + '.csv', "wb")
writer = csv.writer(outfile_3, delimiter='|')
writer.writerows(list_of_rows_3)
outfile_3.close()

print "check out the data!"
Here is the second script. It also works: it collects all the links and stores them in the page_array variable.
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
from lxml import html
import csv
import requests

page_array = []
mech = Browser()
url = 'http://www.cruiseshipschedule.com/'
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html, "lxml")
table1 = soup.table # Ocean Cruise
table2 = table1.findNext('table') # River Cruise
pages = table2.findAll('a')
for page in pages:
    page_array.append(page.get('href').replace('http://www.cruiseshipschedule.com/', ''))
What is a proper, pythonic way to combine these scripts so that I get all the data I need? Thanks for your help.
Edit:
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
from lxml import html
import csv
import requests

page_array = []
mech = Browser()
url = 'http://www.cruiseshipschedule.com/'
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html, "lxml")
table1 = soup.table # Ocean Cruise
table2 = table1.findNext('table') # River Cruise
pages = table2.findAll('a')
for page in pages:
    page_array.append(page.get('href'))

for page in page_array:
    mech = Browser()
    page1 = mech.open(page)
    html = page1.read()
    soup1 = BeautifulSoup(html, "lxml")
    ship_in = soup1.h1
    ship_in = ship_in.text
    ship = u' '.join(ship_in.split())
    u' '.join(ship_in.split())
    ship = [ship]
    h21 = soup1.h2
    h22 = h21.findNext('h2')
    h23 = h22.findNext('h2')
    h24 = h23.findNext('h2')
    h25 = h24.findNext('h2')
    h_y1 = h22.text
    h_y2 = h23.text
    h_y3 = h24.text
    itinerary1_header = u' '.join(h_y1.split())
    u' '.join(h_y1.split())
    itinerary2_header = u' '.join(h_y2.split())
    u' '.join(h_y2.split())
    itinerary3_header = u' '.join(h_y3.split())
    u' '.join(h_y3.split())
    table_1 = soup1.findAll('table')[0]
    table_2 = soup1.findAll('table')[1]
    table_3 = soup1.findAll('table')[2]
    rows_1 = table_1.findAll("tr")
    rows_2 = table_2.findAll("tr")
    rows_3 = table_3.findAll("tr")
    for row_1 in rows_1:
        cells_1 = row_1.findAll('td')
        list_1 = table_1.findAll('li')
        decks = str(list_1[0].get_text()).split(':')
        cabin = str(list_1[1].get_text()).split(':')
        cabin_number = str(list_1[2].get_text()).split(':')
        list_key = ''.join(list(decks[0] + '|' + cabin[0] + '|' + cabin_number[0]))
        list_value = ''.join(list(decks[1] + '|' + cabin[1] + '|' + cabin_number[1]))
        list_key = list_key.split('|')
        list_value = list_value.split('|')
        try: # we use "try" because the table is not well formatted; this lets the program continue after an error
            col1_1 = str(cells_1[0].get_text()).split('\n') # isolates the item by its column in the table and converts it into a string
            col2_1 = str(cells_1[1].get_text()).split('\n')
            col3_1 = str(cells_1[2].get_text()).split('\n')
            col4_1 = str(cells_1[3].get_text()).split('\n')
        except:
            continue # move on to the next row after an error
        keys_1 = ['ship'] + col1_1 + col3_1 + list_key
        values_1 = ship + col2_1 + col4_1 + list_value
        dict_1 = dict(zip(keys_1, values_1))
        with open('Z:/Cruiseshipschedule/details/details_' + ship_in + '.csv', 'wb') as f: # just use 'w' mode in 3.x
            w = csv.DictWriter(f, dict_1.keys())
            w.writeheader()
            w.writerow(dict_1)
        f.close()

    list_of_rows_2 = []
    for row_2 in rows_2:
        cells_2 = row_2.find_all("td")
        list_of_cells_2 = [itinerary1_header]
        try: # we use "try" because the table is not well formatted; this lets the program continue after an error
            date1_2 = str(cells_2[0].get_text())
            itinerary2_2 = str(cells_2[1].get_text()).split('\n')
            price3_2 = str(cells_2[2].get_text()).split('\n')
            list_of_cells_2.append(date1_2)
            list_of_cells_2.append(itinerary2_2)
            list_of_cells_2.append(price3_2)
        except:
            continue # move on to the next row after an error
        list_of_cells_2.append(ship)
        list_of_rows_2.append(list_of_cells_2)
    outfile_2 = open('Z:/Cruiseshipschedule/itinerary1/itinerary1_' + ship_in + '.csv', "wb")
    writer = csv.writer(outfile_2, delimiter='|')
    writer.writerows(list_of_rows_2)
    outfile_2.close()

    list_of_rows_3 = []
    for row_3 in rows_3:
        cells_3 = row_3.find_all("td")
        list_of_cells_3 = [itinerary2_header]
        try: # we use "try" because the table is not well formatted; this lets the program continue after an error
            date1_3 = str(cells_3[0].get_text())
            itinerary2_3 = str(cells_3[1].get_text()).split('\n')
            price3_3 = str(cells_3[2].get_text()).split('\n')
            list_of_cells_3.append(date1_3)
            list_of_cells_3.append(itinerary2_3)
            list_of_cells_3.append(price3_3)
        except:
            continue # move on to the next row after an error
        list_of_cells_3.append(ship)
        list_of_rows_3.append(list_of_cells_3)
    outfile_3 = open('Z:/Cruiseshipschedule/itinerary2/itinerary2_' + ship_in + '.csv', "wb")
    writer = csv.writer(outfile_3, delimiter='|')
    writer.writerows(list_of_rows_3)
    outfile_3.close()

    print "check out the data!"
That is my edited version. It writes 3 .csv files and then throws an error:

check out the data!
Traceback (most recent call last):
  File "C:/Python27/ship scraper editedt.py", line 55, in <module>
    table_3 = soup1.findAll('table')[2]
IndexError: list index out of range

However, if I run only a single url (url1 = 'http://www.cruiseshipschedule.com/ama-waterways/ms-amabella-cruises/') I do not get the error.
Edit:
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
from lxml import html
import csv
import requests

base_url = 'http://www.cruiseshipschedule.com/'

def get_links():
    links_array = []
    mech = Browser()
    mech.set_handle_robots(False)
    page = mech.open(base_url)
    html = page.read()
    soup = BeautifulSoup(html, "lxml")
    tables = soup.findAll('table')
    for table in tables:
        links = tables[1].findAll('a')
        for link in links:
            links_array.append(link.get('href').replace('http://www.cruiseshipschedule.com/', ''))
    return links_array

def get_headings(url):
    mech = Browser()
    mech.set_handle_robots(False)
    page = mech.open(url)
    html = page.read()
    soup = BeautifulSoup(html, "lxml")
    headings = soup.findAll('h2')
    return headings

get_links()
urls = [base_url + link for link in get_links()]

for url in urls:
    mech = Browser()
    mech.set_handle_robots(False)
    try:
        page = mech.open(url)
    except:
        continue
    html = page.read()
    soup = BeautifulSoup(html, "lxml")
    tables = soup.findAll('table')
    ship_in = soup.h1
    ship_in = ship_in.text
    ship = u' '.join(ship_in.split())
    u' '.join(ship_in.split())
    ship = [ship]
    try:
        details = tables[0]
    except:
        continue
    rows_1 = details.findAll("tr")
    for row_1 in rows_1:
        cells_1 = row_1.findAll('td')
        try:
            list_1 = details.findAll('li')
            decks = list_1[0].text.encode('utf8').split(':')
            cabin = list_1[1].text.encode('utf8').split(':')
            cabin_number = list_1[2].text.encode('utf8').split(':')
            list_key = ''.join(list(decks[0] + '|' + cabin[0] + '|' + cabin_number[0]))
            list_value = ''.join(list(decks[1] + '|' + cabin[1] + '|' + cabin_number[1]))
            list_key = list_key.split('|')
            list_value = list_value.split('|')
            try:
                col1_1 = str(cells_1[0].get_text()).split('\n')
                col2_1 = str(cells_1[1].get_text()).split('\n')
                col3_1 = str(cells_1[2].get_text()).split('\n')
                col4_1 = str(cells_1[3].get_text()).split('\n')
            except:
                continue
            keys_1 = ['ship'] + col1_1 + col3_1 + list_key
            values_1 = ship + col2_1 + col4_1 + list_value
            dict_1 = dict(zip(keys_1, values_1))
            with open('Z:/Cruiseshipschedule/details/details_' + ship_in + '.csv', 'wb') as f:
                w = csv.DictWriter(f, dict_1.keys())
                w.writeheader()
                w.writerow(dict_1)
            f.close()
        except:
            if not list_1:
                list_of_rows_1 = []
                for row_1 in rows_1:
                    cells_1 = row_1.findAll('td')
                    try:
                        col1_1 = cells_1[0].text.encode('utf8').split(':')
                        col2_1 = cells_1[1].text.encode('utf8').split(':')
                        col3_1 = cells_1[2].text.encode('utf8').split(':')
                        col4_1 = cells_1[3].text.encode('utf8').split(':')
                        list_of_cells_1.append(col1_1)
                        list_of_cells_1.append(col2_1)
                        list_of_cells_1.append(col3_1)
                        list_of_cells_1.append(col4_1)
                    except:
                        continue
                    list_of_rows_1.append(list_of_cells_1)
                outfile_1 = open('Z:/Cruiseshipschedule/details/details_' + ship_in + '.csv', "wb")
                writer = csv.writer(outfile_1, delimiter='|')
                writer.writerows(list_of_rows_1)
                outfile_1.close()
            else:
                continue
    try:
        itineray1 = tables[1]
        rows_2 = itineray1.findAll("tr")
        list_of_rows_2 = []
        for row_2 in rows_2:
            cells_2 = row_2.find_all("td")
            list_of_cells_2 = [get_headings(url)[2].text]
            try:
                date1_2 = str(cells_2[0].get_text())
                itinerary2_2 = str(cells_2[1].get_text()).split('\n')
                price3_2 = str(cells_2[2].get_text()).split('\n')
                list_of_cells_2.append(date1_2)
                list_of_cells_2.append(itinerary2_2)
                list_of_cells_2.append(price3_2)
            except:
                continue
            list_of_cells_2.append(ship)
            list_of_rows_2.append(list_of_cells_2)
        outfile_2 = open('Z:/Cruiseshipschedule/itinerary1/itinerary1_' + ship_in + '.csv', "wb")
        writer = csv.writer(outfile_2, delimiter='|')
        writer.writerows(list_of_rows_2)
        outfile_2.close()
    except:
        continue
    try:
        itineray2 = tables[2]
        list_of_rows_3 = []
        for row_3 in rows_3:
            cells_3 = row_3.find_all("td")
            list_of_cells_3 = [get_headings(url)[3].text]
            try:
                date1_3 = str(cells_3[0].get_text())
                itinerary2_3 = str(cells_3[1].get_text()).split('\n')
                price3_3 = str(cells_3[2].get_text()).split('\n')
                list_of_cells_3.append(date1_3)
                list_of_cells_3.append(itinerary2_3)
                list_of_cells_3.append(price3_3)
            except:
                continue
            list_of_cells_3.append(ship)
            list_of_rows_3.append(list_of_cells_3)
        outfile_3 = open('Z:/Cruiseshipschedule/itinerary2/itinerary2_' + ship_in + '.csv', "wb")
        writer = csv.writer(outfile_3, delimiter='|')
        writer.writerows(list_of_rows_3)
        outfile_3.close()
    except:
        continue

print "check out the data!"
Thanks for all the support! The code works, meaning I get data. But oddly, for some ships it only picks up some of the itineraries. I cannot find the mistake in the code, and Python does not throw any errors.
While scraping I saw that some of the urls do not have exactly the same table structure; that is why I put in the try and except blocks, to keep the script from stopping.
I would really appreciate some ideas.
Answer 0 (score: 1)
#Second program here:
...
...
page_array = [....]

for page in page_array:
    mech = Browser()
    page1 = mech.open(page)
    #...The rest of the 1st program here
Another option is to turn the second program into a function:
from mechanize import Browser
from bs4 import BeautifulSoup
import lxml
from lxml import html
import csv
import requests

def get_links(url):
    links_array = []
    mech = Browser()
    #url = 'http://www.cruiseshipschedule.com/'
    page = mech.open(url)
    html = page.read()
    soup = BeautifulSoup(html, "lxml")
    table1 = soup.table #Ocean Cruise
    table2 = table1.findNext('table') #River Cruise
    links = table2.findAll('a')
    for link in links:
        links_array.append(link.get('href').replace('http://www.cruiseshipschedule.com/', ''))
    return links_array #<****HERE
Then in the first program you would import the file containing the second program:
import second_prog

url = 'http://www.cruiseshipschedule.com/ama-waterways/ms-amabella-cruises/'
mech = Browser()
for link in second_prog.get_links(url):
    page = mech.open(link)
    #Continue with first program here
And if you want to keep everything in the same file, you can move the function from the second program into your first program, which means you no longer need the import statement.
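To make that concrete, here is a rough single-file sketch of the combined structure. The scrape_ship helper and its body are only placeholders for the first program's table-parsing code, not code taken from either script:

from mechanize import Browser
from bs4 import BeautifulSoup
import csv

def get_links(url):
    # same function as above, just moved into the first file
    links_array = []
    mech = Browser()
    page = mech.open(url)
    soup = BeautifulSoup(page.read(), "lxml")
    table2 = soup.table.findNext('table')  # second table = River Cruise links
    for link in table2.findAll('a'):
        links_array.append(link.get('href'))
    return links_array

def scrape_ship(link):
    # placeholder: put the body of the first program here, i.e. open the
    # ship page, parse its three tables and write the csv files for that ship
    pass

for link in get_links('http://www.cruiseshipschedule.com/'):
    scrape_ship(link)

Wrapping the per-ship work in its own function also makes it easier to test the scraper on a single url before looping over all of them.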
Edit:
Does any of the following code have anything to do with your error?
ship_in = soup1.h1
ship_in = ship_in.text
ship = u' '.join(ship_in.split())
u' '.join(ship_in.split())
ship = [ship]
h21 = soup1.h2
h22 = h21.findNext('h2')
h23 = h22.findNext('h2')
h24 = h23.findNext('h2')
h25 = h24.findNext('h2')
h_y1 = h22.text
h_y2 = h23.text
h_y3 = h24.text
itinerary1_header = u' '.join(h_y1.split())
u' '.join(h_y1.split())
itinerary2_header = u' '.join(h_y2.split())
u' '.join(h_y2.split())
itinerary3_header = u' '.join(h_y3.split())
u' '.join(h_y3.split())
Let's see:
soup1 = BeautifulSoup(html, "lxml")
#Code above here
table_1 = soup1.findAll('table')[0]
table_2 = soup1.findAll('table')[1]
table_3 = soup1.findAll('table')[2]
Computing table_1 doesn't reference anything in the code above it. Computing table_2 doesn't reference anything in the code above it. Computing table_3 doesn't reference anything in the code above it.
So: copy your program and delete that whole section, then try to figure out what is going wrong. This process of deleting code to isolate a problem is called debugging your code.
Next, why make BS go to the trouble, and take the time, of searching the entire html page 3 times to gather all the tables?
table_1 = soup1.findAll('table')[0]
table_2 = soup1.findAll('table')[1]
table_3 = soup1.findAll('table')[2]
Every time you write soup1.findAll('table'), BS has to search the whole html page to find all the <table> tags.
Instead, you should search just once:
tables = soup1.findAll('table')
table1 = tables[0]
table2 = tables[1]
table3 = tables[2]
Having python retrieve an element from a list is very fast, much faster than having BS search the entire web page for all the <table> tags.
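As a rough illustration of that speed difference, a sketch using the standard timeit module (it assumes soup1 has already been parsed as above and that the page has at least one table; the exact numbers will vary):

import timeit

tables = soup1.findAll('table')   # search the page once, keep the result in a list

# time 100 repeated BS searches vs. 100 plain list lookups
bs_time = timeit.timeit(lambda: soup1.findAll('table')[0], number=100)
list_time = timeit.timeit(lambda: tables[0], number=100)
print "100 findAll() searches:", bs_time
print "100 list lookups:", list_time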
Next, whenever you find yourself writing variables with names like:
table1
table2
table3
and they differ only by a number, you need to stop what you are doing and use a list instead. In this case you already have a list: tables, and the elements of tables already have names, namely tables[0], tables[1], tables[2], so you don't need to create the variables table1, table2, table3. In fact, you don't even have to refer to the elements of tables by the names tables[0], tables[1], tables[2]; instead, you can use a for loop to step through all the tables:
for table in tables:
    #Do something with the table variable

This has two benefits:
1) You don't have to write out all the names tables[0], tables[1], tables[2] in your code. What if you had to examine 1,000 tables? Are you really going to write:

tables[0] = ...
tables[1] = ...
...
...
<an hour later>
tables[999] = ...

2) The second benefit of using a for loop is that you only have to write the code that handles a table ONCE, and the for loop applies that code to every table in the tables list.
If tables has more table tags than you want to examine, you can write:

first_three = tables[:3]

But note that if the page only has two tables, then first_three will only contain those two tables. That can be an advantage: the for loop doesn't need to know how many table tags are in the tables list; it will blindly process all the elements of whatever list you give it, no matter how many tables that is.
Finally, your error says:

table_3 = soup1.findAll('table')[2]
IndexError: list index out of range

That means findAll() found fewer than 3 tables on the page. You would get the same error if you did this:

data = ['a', 'b']
print(data[2])

How do you fix that? If a page doesn't contain three tables, then it doesn't contain three tables. You can only process however many tables (up to three) the page actually contains.
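One minimal way to do that, a sketch rather than a drop-in fix, reusing the soup1 variable from the question, is to slice the list of tables before looping so you never index past what the page provides:

tables = soup1.findAll('table')
for table in tables[:3]:          # at most three tables, fewer if the page has fewer
    rows = table.findAll('tr')
    # process the rows for this table here
    print len(rows)

If you still need to know which table you are looking at (details vs. the itinerary tables), enumerate(tables[:3]) gives you its position as well.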