我想从html文件中提取表格。我编写了以下代码片段来提取第一个表:
import urllib2
import os
import time
import traceback
from bs4 import BeautifulSoup
#find('table',{'class':'tbl_with_brdr'})
outfile= open('D:/Dropbox/Python/apelec.txt','wb')  # output file: receives one tab-separated line per table row
rfile = open('D:/Dropbox/PRI/Data/AP/195778.html')  # saved HTML report that is parsed below
rsoup = BeautifulSoup(rfile)  # no parser is named here, so bs4 chooses one itself
nodes = rsoup.find('div',{'class':'frmtext'}).find('table').find('tr')  # NOTE(review): the HTML shown further down uses class "frmtxt", so the first find() returns None and this line raises the AttributeError quoted below; find('tr') also yields only the first row, not a list
for node in nodes[1:]:  # intends to skip the first row; presumably expects a list such as find_all('tr') returns - TODO confirm
x = node.find('th').find('b').get_text().encode("utf-8")  # text of the <b> inside the row's first <th>, UTF-8 encoded
print x
y = node.find('th').findNext('th').find('b').get_text().encode("utf-8")  # same for the next <th> after the first one
print y
outfile.write(str(x)+"\t"+str(y)+"\n")  # writes "x<TAB>y" followed by a newline
outfile.close()
这是错误:
9 rfile = open('D:/Dropbox/PRI/Data/AP/195778.html')
10 rsoup = BeautifulSoup(rfile)
---> 11 nodes = rsoup.find('div',{'class':'frmtext'}).find('table').find('tr')
12 for node in nodes[1:]:
13 x = node.find('th').find('b').get_text().encode("utf-8")
AttributeError: 'NoneType' object has no attribute 'find'
html文件是:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<link rel="icon" type="image/ico" href="images/favicon.ico"/>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<link rel="stylesheet" href="themes/panchayat_default.css" type="text/css"/>
<title>consolidated Election Report</title>
</head>
<body>
<!-- To blur the background while processing dwr -->
<div class="faded_div process"></div>
<div class="popup_block_div process" style="display: none;">
<img alt="" src="images/loading_animation.gif" style="margin-left: auto; margin-right: auto;">
</div>
<div id="maincontainer" class="resize">
<div id="headerwrap">
<!-- Header -->
<html>
<head>
<script type='text/javascript' src="/profilerdwr/engine.js"> </script>
<script type='text/javascript' src="/profilerdwr/util.js"> </script>
<script type="text/javascript" src="/profilerdwr/interface/lgdDao.js"></script>
<script type="text/javascript" src="js/common_util_js.js"></script>
<link rel="stylesheet" href="css/common_css.css" type="text/css"></link>
<meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />
</head>
<body >
<div class="clear"></div>
<div id="headerwrap">
<div id="header">
<div id="new_header">
<div id="logoleft">Area Profiler</div>
<div id="logoright"></div>
<div class="clear"></div>
</div>
<div class="clear"></div>
<div id="loginnav" align="right">
<table width="100%" class="tbl_no_brdr">
<tr>
<td class="tblclear" align="left">
<div id="mainnav"><a href="welcome.do?OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU">Home</a> </div>
</td>
</tr>
</table>
</div>
</div>
<div class="clear"></div>
<div id="topnav">
<table width="100%" class="tbl_no_brdr">
<tr>
<td width="85" class="tblclear">Choose Theme :</td>
<td width="200" class="tblclear">
<form id="themeForm" name="themeForm" method="get" action="welcome.do">
<input type="hidden" name='OWASP_CSRFTOKEN' value='CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU' />
<select name="theme" id="themeId" class="combofield" onchange="submitThemeForm()" style="width: 120px;">
<option value="default">Default Theme</option>
<option value="mustard">Mustard Theme</option>
<option value="peach">Peach Theme</option>
<option value="green">Green Theme</option>
<option value="blue">Blue Theme</option>
</select>
</form>
</td>
<td style="padding: 0px">
</td>
<td class="tblclear"> </td>
<td width="14" class="tblclear txticon"><a href="#" class="texttoggler " rel="smallview" title="small size"><img src="images/btnMinus.jpg" width="16" height="14" border="0" /></div></a></td>
<td width="14" class="tblclear txticon"><a href="#" class="texttoggler" rel="normalview" title="normal size"><img src="images/btnDefault.jpg" width="16" height="14" border="0" /></a> </td>
<td width="28" class="tblclear txticon"><a href="#" class="texttoggler" rel="largeview" title="large size"><img src="images/btnPlus.jpg" width="16" height="14" border="0" /></a></td>
<script type="text/javascript" >
//documenttextsizer.setup("shared_css_class_of_toggler_controls")
documenttextsizer.setup("texttoggler")
</script>
<td width="100" align="right" class="tblclear">Select Language :</td>
<td width="108" align="right" class="tblclear">
<form id="languageForm" name="languageForm" method="get" action="welcome.do">
<input type="hidden" name='OWASP_CSRFTOKEN' value='CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU' />
<select id="languageId" name="language" class="combofield" style="width: 120px;" onchange="submitLanguageForm()" >
<option value=""> Select Language </option>
</select>
</form>
</td>
</tr>
</table>
</div>
<div id="breadcrumbnav"> </div>
</div>
<script type="text/javascript">
function submitThemeForm()
{
var isOK = confirm("This will Refresh Your Page. Any Unsaved data will be Lost. Do You still want to Continue?");
if(isOK)
{
document.getElementById('themeForm').submit();
}
else
{
return;
}
}
function submitLanguageForm()
{
var isOK = confirm("This will Refresh Your Page. Any Unsaved data will be Lost. Do You still want to Continue?");
if(isOK)
{
document.getElementById('languageForm').submit();
}
else
{
return;
}
}
</script>
</body>
</html>
</div>
<div class="clear"></div>
<div id="content">
<div id="leftpnl">
<table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
<td width="100%" valign="top" class="tblclear">
<!-- content -->.
<script type="text/javascript" src="js/common_js.js"></script>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<script type="text/javascript">
var pathname;
$(document).ready(function() {pathname = window.location.pathname;});
function onBack(s) {
var position =pathname.indexOf("/", 2);
var newPath = "";
var val = s.indexOf("?", 1);
if(val>0)
{
newPath = s+"&redirect=true";
}
else
{
newPath = s+"?redirect=true";
}
window.location.replace(".."+pathname.substring(0,position)+"/"+newPath);
}
function downloadReport(repformat){
//window.location="downloadConsolidatedElectionReportPDF.do?OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU";
//document.forms["electionReportForm"].action="downloadConsolidatedElectionReportPDF.do?repformat="+repformat+"&OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU";
document.forms["electionReportForm"].action="downloadConsolidatedElectionReportPDF.do?reportformat="+repformat+"&OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU";
document.forms["electionReportForm"].method="POST";
document.getElementById('electionReportForm').target="_blank";
document.forms["electionReportForm"].submit();
}
</script>
<style type="text/css">
.data_link{
color:blue;
display: block;
text-decoration: none;
font-size: 1em;
font-weight: bolder;
}
.disable_link
{
cursor:default;
color:blue;
display: block;
text-decoration: none;
font-size: 1em;
font-weight: bolder;
}
.data_link:VISITED
{
color:blue;
display: block;
text-decoration: none;
font-size: 1em;
font-weight: bolder;
}
.data_link:HOVER{
text-decoration: underline;
}
</style>
</head>
<body>
<div id="frmcontent">
<div class="frmhd">
<table width="100%" class="tbl_no_brdr">
<tr>
<td align="left" width="90%">
Consolidated Election</td>
</tr>
</table>
</div>
<div class="clear"></div>
<div class="frmpnlbrdr">
<div class="frmpnlbg">
<div class="frmtxt">
<table width="100%" style="margin-bottom: 10px;" class="tbl_with_brdr">
<tr class="tblRowTitle tblclear" >
<th align="left" ><b>State Name</b></th>
<th align="left" ><b>Local Body Type</b></th>
<th align="left" ><b>Election Term</b></th>
<th align="left" ><b>Local Body Name</b></th>
</tr>
<tr class="tblRowB" style="color: blue;">
<th align="left" >ANDHRA PRADESH</th>
<th align="left" >Village Panchayat</th>
<th align="left" >
02-Aug-2013 To
01-Aug-2018
</th>
<th align="left" >KODIHALLI</th>
</tr>
</table>
<div class="frmhdtitle">Consolidated Election</div>
<table width="100%" class="tbl_with_brdr">
<thead>
<tr class="tblRowTitle tblclear">
<th align="center" width="5%" ><b>S.No.</b></th>
<th align="left" width="9%"><b>Name</b></th> 0
<th align="left" width="9%"><b>Age</b></th> 1
<th align="left" width="9%"><b>Caste Category</b></th> 2
<th align="left" width="9%"><b>Gender</b></th> 3
<th align="left" width="9%"><b>Qualification</b></th> 4
<th align="left" width="9%"><b>Occupation</b></th> 5
<th align="left" width="9%"><b>Email Address</b></th> 6
<th align="left" width="9%"><b>Ward Name</b></th> 7
<th align="left" width="9%"><b>Reservation</b></th> 8
</tr>
</thead>
<tbody>
<tr class="tblRowB">
<td align="center" >1</td>
<td>Kambanna</td>
<td>36</td>
<td>OBC</td>
<td>Male</td>
<td>Middle or Lower Secondary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>N/A</td>
<td >
Yes (OBC / Others)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >2</td>
<td>Ramesh</td>
<td>39</td>
<td>OBC</td>
<td>Male</td>
<td>Middle or Lower Secondary</td>
<td>Workers not reporting any occupations</td>
<td>
N/A
</td>
<td>Ward no 1</td>
<td >
Yes (OBC / Others)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >3</td>
<td>S.Manjunath</td>
<td>29</td>
<td>OBC</td>
<td>Male</td>
<td>Higher Secondary or Intermediate or Pre University or Senior Secondary</td>
<td>Workers not reporting any occupations</td>
<td>
N/A
</td>
<td>Ward no 2</td>
<td >
No (General / Others)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >4</td>
<td>Obuleshu</td>
<td>48</td>
<td>OBC</td>
<td>Male</td>
<td>Below Primary</td>
<td>Workers not reporting any occupations</td>
<td>
N/A
</td>
<td>Ward no 3</td>
<td >
No (General / Others)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >5</td>
<td>Mamatha</td>
<td>24</td>
<td>OBC</td>
<td>Female</td>
<td>Matriculation or Junior School Certificate or Secondary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 4</td>
<td >
Yes (General / Female)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >6</td>
<td>Shivamma</td>
<td>38</td>
<td>OBC</td>
<td>Female</td>
<td>Below Primary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 5</td>
<td >
Yes (General / Female)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >7</td>
<td>Hanumantappa</td>
<td>46</td>
<td>SC</td>
<td>Male</td>
<td>Illiterate</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 6</td>
<td >
No (General / Others)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >8</td>
<td>Malingappa</td>
<td>45</td>
<td>SC</td>
<td>Male</td>
<td>Illiterate</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 7</td>
<td >
No (General / Others)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >9</td>
<td>Kamalamma</td>
<td>52</td>
<td>OBC</td>
<td>Female</td>
<td>Illiterate</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 8</td>
<td >
Yes (OBC / Female)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >10</td>
<td>Muddamma</td>
<td>48</td>
<td>OBC</td>
<td>Female</td>
<td>Illiterate</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 9</td>
<td >
Yes (General / Female)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >11</td>
<td>Patta Tayamma</td>
<td>45</td>
<td>SC</td>
<td>Female</td>
<td>Middle or Lower Secondary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 10</td>
<td >
Yes (SC / Female)
</td>
</tr>
<tr class="tblRowA">
<td align="center" >12</td>
<td>Sujatha</td>
<td>35</td>
<td>OBC</td>
<td>Female</td>
<td>Middle or Lower Secondary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 11</td>
<td >
Yes (OBC / Female)
</td>
</tr>
<tr class="tblRowB">
<td align="center" >13</td>
<td>Kadurappa</td>
<td>35</td>
<td>SC</td>
<td>Male</td>
<td>Middle or Lower Secondary</td>
<td>N/A</td>
<td>
N/A
</td>
<td>Ward no 12</td>
<td >
Yes (SC / Others)
</td>
</tr>
</tbody>
</table>
<br />
<table width="100%" class="tbl_no_brdr">
<tr>
<td align="center">
<input type="button" class="btn" onclick="onClose('welcome.do?OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU')" value=Close />
<input type="button" class="btn" onclick="this.disabled=true; this.value='Please Wait .!';onBack('consolidatedElectionReport.do?OWASP_CSRFTOKEN=CN72-BGJW-G7FM-K1S3-P5FF-V1EN-IO4T-GHWU&electionTermId=35107&stateId=28')" value=Back />
</td>
</tr>
</table>
<form id="electionReportForm" name="electionReportForm" action="#" method="post">
<div align="center"><br/>
<input type="button" class="btn" onclick="downloadReport('pdf');" value="Export to PDF" size="5" />
<input type="button" class="btn" onclick="downloadReport('xls');" value="Export to Excel" size="5" />
</div>
</form>
</div>
<div class="myclass"
style="font-family: Times; text-align: center; font-size: 10.0pt; color: white; font-weight: bold; border: 1px solid gray">
Report generated through Area Profiler (http://areaprofiler.gov.in)Thu Oct 02 22:34:20 IST 2014
</div>
</div>
</div>
</div>
</body>
</html>
</td>
</tr>
</table>
</div>
</div>
<div class="clear"></div>
<div id="footer">
<!-- Footer -->
<html>
<head>
</head>
<body>
<table width="100%" class="tbl_no_brdr">
<tr>
<td colspan="3" class="fotbrdr"></td>
</tr>
<tr>
<td width="161" class="btmlogospace"><a href="http://www.negp.gov.in/" target= "_blank" ><img src="images/e_governance_logo.jpg" width="161" height="38" /></a></td>
<td width="93" class="btmlogospace"><a href="http://www.panchayat.gov.in/" target= "_blank" ><img src="images/panchayatilogo.jpg" width="93" height="38" /></a></td>
<td align="right" class="btmlogospace">Site is designed, hosted
and maintained by National Informatics Centre<br /> Contents on
this website is owned,updated and managed by the Ministry of
Panchayati Raj</td>
</tr>
</table>
</body>
</html>
</div>
</div>
</body>
</html>
答案 0 :(得分:1)
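我在这里贴出一种做法,它不算完整的解决方案,但可以作为参考。您需要遍历 DOM 树并提取所需的值。我把要查找的 div 的 class 从 frmtext 改成了 frmtxt:HTML 中实际的类名是 frmtxt,原来的写法使 find 返回 None,这正是出现 AttributeError 的原因。另外,在遍历过程中,每次调用 find 之后都必须先检查是否真的找到了元素,再继续向下查找。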
import urllib2
import os
import time
import traceback
from bs4 import BeautifulSoup
# Extract the bold text of the first two <th> cells of every row of the first
# table inside <div class="frmtxt"> and write it to out.txt, one
# "first<TAB>second" line per row.  A cell that is missing, or that has no
# <b> child, is written as the literal text "None".
#
# NOTE: this stays Python 2 code, like the imports above (urllib2); print is
# called with a single parenthesised argument so it reads the same either way.


def _bold_text(cell):
    """Return the UTF-8 encoded text of the <b> inside *cell*, or None.

    *cell* is a bs4 Tag, or None when the previous lookup found nothing;
    None is also returned when the cell has no <b> child.
    """
    if cell is None:
        return None
    bold = cell.find('b')
    if bold is None:
        return None
    return bold.get_text().encode("utf-8")


# The soup is built while the file is still open; the handle is not needed
# once parsing is done, so it is closed right away.
with open('195778.html') as rfile:
    rsoup = BeautifulSoup(rfile)

# find() returns None when nothing matches, so every step is checked before
# the next lookup is chained onto it.
nodes1 = rsoup.find('div', {'class': 'frmtxt'})
table = nodes1.find('table') if nodes1 is not None else None
if table is None:
    raise SystemExit("no <table> inside <div class=\"frmtxt\"> in 195778.html")

with open('out.txt', 'wb') as outfile:
    for node in table.find_all('tr'):
        # Look the cells up inside this row only: find_next('th') would keep
        # searching through the rest of the document and could return a cell
        # that belongs to a later row.
        cells = node.find_all('th')
        x = _bold_text(cells[0] if len(cells) > 0 else None)
        y = _bold_text(cells[1] if len(cells) > 1 else None)
        if x is not None:
            print(x)
        if y is not None:
            print(y)
        outfile.write(str(x) + "\t" + str(y) + "\n")