Below is the code I use to read a table and write it to a CSV file. However, it only reads the header and the first row.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib2
import csv
import MySQLdb
import itertools
import time
wiki = "http://10.202.215.24:8081/nmediation/cdot_ces_status_xx.jsp?userName=RJCADMIN"
time.sleep(50)
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table" , { "border" : "1" })
rows=[]
headers = [header.text for header in table.find_all('th')]
for row in table.find_all('tr'):
    rows.append([val.text.encode('utf8') for val in row.find_all('td')])
with open('/home/hirdesh/cronrun/iop_status.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)
#to insert into db
# Establish a MySQL connection
database = MySQLdb.connect (host="localhost", user = "hfhgfh", passwd = "fghgfhfgh", db = "rghfghj")
cursor = database.cursor()
csv_data = csv.reader(file('/home/hirdesh/cronrun/iop_status.csv'))
query1='''truncate table iop_status'''
cursor.execute(query1)
file = open("/home/hirdesh/cronrun/iop_stauslog.txt", "wb")
#file.write("1.Old data deleted From Master Less table\n")
rows=len(list(csv.reader(open('/home/hirdesh/cronrun/iop_status.csv'))))
i=1
j=rows
query2='''INSERT INTO iop_status (CIRCLE, SSA, Switch, CES_NAME, CES_IP, IOP_A_STATUS, IOP_B_STATUS, IOP_TESTING_DATE, IOP_STATUS,CURNT_DATE) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
#query2='''INSERT INTO iop_status (CURRENT_DATE) VALUES (%s)'''
for row in itertools.islice(csv_data,i,j):
    CIRCLE = row[1]
    SSA = row[2]
    Switch = row[3]
    CES_NAME = row[4]
    CES_IP = row[5]
    IOP_A_STATUS = row[6]
    IOP_B_STATUS = row[7]
    IOP_TESTING_DATE = row[8]
    IOP_STATUS = row[9]
    CURNT_DATE = row[10]
    values = (CIRCLE, SSA, Switch, CES_NAME, CES_IP, IOP_A_STATUS, IOP_B_STATUS, IOP_TESTING_DATE, IOP_STATUS, CURNT_DATE)
    cursor.execute(query2, values)
cursor.close()
database.commit()
database.close()
# Print results
file.write("2.%d Rows Inserted\n" % j)
file.write("Current date & time " + time.strftime("%c"))
file.write("*****************\n")
file.close()
Below is part of the table I am trying to read with Beautiful Soup. It works fine on Ubuntu 14.04, but on Ubuntu 17.04 it only reads the header and the first row.
<table border=1 width=70 font size=2 >
<tr><th>S.No.</th><th>Circle</th><th>SSA</th><th>Switch</th><th>CES_NAME</th><th>CES_IP</th><th>IOP_A_STATUS</th><th>IOP_B_STATUS</th><th>IOP_TESTING_DATE</th><th>IOP_STATUS</th><th>CURRENT_DATE</th></tr>
<tr><td height=5px>
1
</td><td height=5px>
RJC
</td><td height=5px>
CTT
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=CTTCDOT_PARTA">CTTCDOT_PARTA</a>
</td><td height=5px>
RJ-CTG-PTG-CES
</td><td height=5px>
10.84.4.30
</td><td height=5px>
</th></tr>
</body>
Answer 0 (score: 1)
The table is not constructed correctly - the last element in each row is closed with </th> instead of </td>, and that is what causes the problem. But if I use "lxml" instead of "html.parser", it gets all the rows.
soup = BeautifulSoup(page, "lxml")
Working example
page = '''
<table border=1 width=70 font size=2 >
<tr><th>S.No.</th><th>Circle</th><th>SSA</th><th>Switch</th><th>CES_NAME</th><th>CES_IP</th><th>IOP_A_STATUS</th><th>IOP_B_STATUS</th><th>IOP_TESTING_DATE</th><th>IOP_STATUS</th><th>CURRENT_DATE</th></tr>
<tr><td height=5px>
1
</td><td height=5px>
RJC
</td><td height=5px>
CTT
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=CTTCDOT_PARTA">CTTCDOT_PARTA</a>
</td><td height=5px>
RJ-CTG-PTG-CES
</td><td height=5px>
10.84.4.30
</td><td height=5px>
INS_ACT
</td><td height=5px>
OOS
</td><td height=5px>
28/Dec/2017 08:03:30
</td><td height=5px>
<center><IMG SRC=images/Aredo.gif width=20 height=20>Disabled</center>
</td><td height=5px>
02/Jan/2018 10:44:29
</th></tr>
<tr><td height=5px>
2
</td><td height=5px>
RJC
</td><td height=5px>
JJN
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=JJNCDOT_CHIRA">JJNCDOT_CHIRA</a>
</td><td height=5px>
RJ-JJN-CHW-CES
</td><td height=5px>
10.84.7.250
</td><td height=5px>
INS_SBY
</td><td height=5px>
INS_ACT
</td><td height=5px>
2/Jan/2018 08:05:40
</td><td height=5px>
<center><IMG SRC=images/Aredo.gif width=20 height=20>Disabled</center>
</td><td height=5px>
02/Jan/2018 10:44:30
</th></tr>
<tr><td height=5px>
3
</td><td height=5px>
RJC
</td><td height=5px>
JJN
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=JJNCDOT_NAWAL">JJNCDOT_NAWAL</a>
</td><td height=5px>
RJ-JJN-NWG-CES
</td><td height=5px>
10.84.7.246
</td><td height=5px>
INS_ACT
</td><td height=5px>
OOS
</td><td height=5px>
1/Jan/2018 15:08:42
</td><td height=5px>
<center><IMG SRC=images/Aredo.gif width=20 height=20>Disabled</center>
</td><td height=5px>
02/Jan/2018 10:44:32
</th></tr>
<tr><td height=5px>
4
</td><td height=5px>
RJC
</td><td height=5px>
SRO
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=SROCDOT_BHINM">SROCDOT_BHINM</a>
</td><td height=5px>
RJ-SIR-BML-CES
</td><td height=5px>
10.84.6.206
</td><td height=5px>
NA
</td><td height=5px>
NA
</td><td height=5px>
NA
</td><td height=5px>
<center><IMG SRC=images/Aredo.gif width=20 height=20>Network Down</center>
</td><td height=5px>
02/Jan/2018 11:01:33
</th></tr>
<tr><td height=5px>
78
</td><td height=5px>
RJC
</td><td height=5px>
BAM
</td><td height=5px>
<a href="cdot_ces_status_switch.jsp?userName=BAMCDOT_BARME">BAMCDOT_BARME</a>
</td><td height=5px>
RJ-BMR-MAIN-CES
</td><td height=5px>
10.84.4.166
</td><td height=5px>
INS_ACT
</td><td height=5px>
INS_SBY
</td><td height=5px>
2/Jan/2018 10:12:02
</td><td height=5px>
<center><IMG SRC=images/greenb.gif width=20 height=20>Enabled</center>
</td><td height=5px>
02/Jan/2018 10:47:37
</th></tr>
</body>
'''
from bs4 import BeautifulSoup
import csv
soup = BeautifulSoup(page, "lxml") # "html.parser"
table = soup.find("table" , { "border" : "1" })
for row in table.find_all('tr')[1:]: # [1:] skip header
    data = [val.text.strip().encode('utf8') for val in row.find_all('td')]
    print(data)
Result
[b'1', b'RJC', b'CTT', b'CTTCDOT_PARTA', b'RJ-CTG-PTG-CES', b'10.84.4.30', b'INS_ACT', b'OOS', b'28/Dec/2017 08:03:30', b'Disabled', b'02/Jan/2018 10:44:29']
[b'2', b'RJC', b'JJN', b'JJNCDOT_CHIRA', b'RJ-JJN-CHW-CES', b'10.84.7.250', b'INS_SBY', b'INS_ACT', b'2/Jan/2018 08:05:40', b'Disabled', b'02/Jan/2018 10:44:30']
[b'3', b'RJC', b'JJN', b'JJNCDOT_NAWAL', b'RJ-JJN-NWG-CES', b'10.84.7.246', b'INS_ACT', b'OOS', b'1/Jan/2018 15:08:42', b'Disabled', b'02/Jan/2018 10:44:32']
[b'4', b'RJC', b'SRO', b'SROCDOT_BHINM', b'RJ-SIR-BML-CES', b'10.84.6.206', b'NA', b'NA', b'NA', b'Network Down', b'02/Jan/2018 11:01:33']
[b'78', b'RJC', b'BAM', b'BAMCDOT_BARME', b'RJ-BMR-MAIN-CES', b'10.84.4.166', b'INS_ACT', b'INS_SBY', b'2/Jan/2018 10:12:02', b'Enabled', b'02/Jan/2018 10:47:37']
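If you want to apply the same fix inside the original script, a minimal sketch of the fetch-and-write part could look like this (assuming Python 3, so urllib.request replaces urllib2, and that the lxml package is installed; the URL and output path are taken from the question):
# Sketch only: assumes Python 3 and that lxml is available.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import csv

wiki = "http://10.202.215.24:8081/nmediation/cdot_ces_status_xx.jsp?userName=RJCADMIN"
req = Request(wiki, headers={'User-Agent': 'Mozilla/5.0'})
page = urlopen(req)

soup = BeautifulSoup(page, "lxml")  # lenient parser repairs the stray </th>
table = soup.find("table", {"border": "1"})

headers = [th.text.strip() for th in table.find_all('th')]
rows = [[td.text.strip() for td in row.find_all('td')]
        for row in table.find_all('tr')[1:]]  # [1:] skips the header row

with open('/home/hirdesh/cronrun/iop_status.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)
The MySQL insertion part can keep reading the CSV exactly as before; the only functional change is the parser passed to BeautifulSoup.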