我正在尝试为一个研究项目抓取一堆HTML文件,但它无法正常工作。除了四个列名之外,csv文件最终为空,我不确定我做错了什么。我曾经做过一次类似的抓取工作,但是这个网站的html看起来更加混乱(或者另一个非常干净)。
我希望有人可以提供帮助,请原谅我写的代码 - 我没有多少经验。
from bs4 import BeautifulSoup
import csv
import urllib2
import os
def processData( pageFile ):
f = open(pageFile, "r")
page = f.read()
f.close()
soup = BeautifulSoup(page)
one = soup.findAll("td", attrs={"class":"t_user"})
two = soup.findAll("div", attrs={"class":"right t_number"})
three = soup.findAll("div", attrs={"style":"font-size: 9pt"})
four = soup.findAll("div", atrs={"style":"padding-top: 4px;"})
names = []
threads = []
posts = []
timestamps = []
for html in one:
names.append(BeautifulSoup(str(name)).get_text().encode("utf-8").strip())
for html in two:
threads = BeautifulSoup(str(thread).strip()).get_text().encode("utf-8").replace("\n", "")
for html in three:
posts = BeautifulSoup(str(post).strip()).get_text().encode("utf-8").replace("\n", "")
for html in four:
text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")
timestamps.append(text.split("Post at:")[1].strip())
csvfile = open('S141test.csv', 'ab')
writer = csv.writer(csvfile)
for name, thread, post, timestamp in zip(names, threads, posts, timestamps):
writer.writerow([name, thread, post, timestamp])
csvfile.close()
dir = "S141test"
csvfile = "S141test.csv"
csvfile = open(csvfile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Name", "thread", "post", "timestamp"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
path = os.path.join(dir, htmlFile) # get the file path
processData(path) # process the data in the file
print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
count = count + 1 # incriment counter
我正在试图抓取的HTML :(我要抓的东西用大写字母突出显示,查找AUTHOR NAME,THREAD TITLE,TIMESTAMP,POST CONTENT)。
HTML S141
<div class="maintable">
<div class="spaceborder" style="width: 100%; border-bottom: none">
<table cellspacing="1" cellpadding="4" width="100%" align="center">
<tr class="header"><td colspan="2" style="color: #000000">
<div class="right" style="font-weight: normal">
<a href="misc.php?action=emailfriend&tid=50510">Email to Friend</a> |
<a href="my.php?item=subscriptions&subadd=50510">Subscription</a> |
<a href="my.php?item=favorites&favadd=50510&type=thread">Favorites</a>
</div>
Subject:
Ayuki - TST HG</td></tr>
</table></div>
<form method="post" name="delpost" action="topicadmin.php?action=delpost&fid=28&tid=50510&page=1">
<input type="hidden" name="formhash" value="66b6dc45">
<div class="spaceborder" style="width: 100%; margin-bottom: 4px;
border-top: none
">
<table cellspacing="1" cellpadding="4" width="100%" align="center" class="t_row">
<tr style="height: 100%">
<td width="18%" valign="top" class="t_user"> <a name="lastpost"></a><a href="viewpro.php?uid=7455" target="_blank" class="bold">asta (AUTHOR NAME)</a>
<br><div class="smalltxt">
Carnal Conqueror<br>
<img src="images/website141.COM/star_level2.gif" alt="Rank: 3" /><img src="images/website141.COM/star_level1.gif" alt="Rank: 3" /><br>
<br>
<br><br>
UID 7455<br>
Digest Posts
0<br>
Credits 328<br>
Posts 28<br>
Karma 327 <br>
Money 156 <br>
Acceptance 25 <br>
Reading Access 30<br>
Registered 20-1-2008<br>
Status Offline
</div>
</td>
<td width="82%" valign="top" style="padding: 0px">
<table border="0" cellspacing="0" cellpadding="4" class="t_msg">
<tr><td>
<div>
<div class="right t_number"><a href="###" class="bold" onclick="window.clipboardData.setData('text','http://forum.website141.com/eforum/viewthread.php?tid=50510&page=1#pid502926')">#1</a></div>
<div style="padding-top: 4px;">
Post at 26-6-2014 00:20 (TIMESTAMP)
<a href="viewpro.php?uid=7455">Profile</a>
<a href="pm.php?action=send&uid=7455" target="_blank">P.M.</a>
</div></div>
</td></tr>
<tr><td valign="top" class="line" height="100%" style="padding-top: 10px;">
<a name="pid502926" href="misc.php?action=viewratings&tid=50510&pid=502926" title="[Rating] 29"><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /></a>
<div class="right t_number">Font size:
<a style="cursor:hand" onclick="text502926.style.fontSize='9pt';">S</a>
<a style="cursor:hand" onclick="text502926.style.fontSize='12pt';">M</a>
<a style="cursor:hand" onclick="text502926.style.fontSize='15pt';">L</a></div>
<span class="bold" style="color: #999999">Ayuki - TST HG (THREAD TITLE) </span><br><br>
<div style="font-size: 9pt" id=text502926><a href="http://go141.com/en/27120-Tsim%20Sha%20Tsui%20-%20Honey%20Girls%20-%20Ayuki.html" target="_blank">http://go141.com/en/27120-Tsim%2 ... ls%20-%20Ayuki.html</a><br />
<br />
Do I recommend her ? It depends what you are looking for.
<br />
I asked to take a photo but she didn't allow. (POST CONTENT) </div>
<br><fieldset><legend><a href="misc.php?action=viewratings&tid=50510&pid=502926" title="View Rating Log">Recent Ratings</a></legend><br>
<table border="0" cellspacing="0" cellpadding="0">
<tr><td><a href="viewpro.php?uid=71695" target="_blank">banter141a</a></td>
<td> 2-7-2014 13:31</td><td> Karma</td><td> <b>+3</b></td>
<td> </td></tr>
<tr><td><a href="viewpro.php?uid=97825" target="_blank">hlau</a></td>
<td> 28-6-2014 00:36</td><td> Karma</td><td> <b>+6</b></td>
答案 0 :(得分:1)
我自己想通了。我不确定是要删除问题还是用我自己的解决方案回答问题,但如果其他人有类似的问题,我会选择后者。一个关键错误是我把csvfile与csvFile混淆了(我需要以不同的方式使用它们,如下面的代码所示)。
from bs4 import BeautifulSoup
import csv
import urllib2
import os
def processData( pageFile ):
f = open(pageFile, "r")
page = f.read()
f.close()
soup = BeautifulSoup(page)
one = soup.findAll('td', attrs={"width": "18%"}) #close
two = soup.findAll("span", attrs={"style":"color: #999999"}) #close
three = soup.findAll("div", attrs={"style":"font-size: 9pt"}) #works
four = soup.findAll("div", attrs={"style":"padding-top: 4px;"})
names = []
threads = []
posts = []
timestamps = []
print four
for html in one:
text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", " ")
names.append(text.split('UID')[0].strip()) #works, at the moment
for color in two:
text2 = BeautifulSoup(str(color).strip()).get_text().encode("utf-8").replace("\n", "")
threads.append(text2.strip())
for html in three:
text3 = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "").replace(" ", "")
posts.append(text3.strip())
for html in four:
text4 = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "").replace(" ", "")
timestamps.append(text4.split("Post at")[1].split('Profile')[0].strip())
csvfile = open('S141test.csv', 'ab')
writer = csv.writer(csvfile)
for name, thread, post, timestamp in zip(names, threads, posts, timestamps):
writer.writerow([name, thread, post, timestamp])
csvfile.close()
dir = "S141test"
csvFile = "S141test.csv"
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Name", "thread", "post", "timestamp"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
path = os.path.join(dir, htmlFile) # get the file path
processData(path) # process the data in the file
print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
count = count + 1 # incriment counter