Question

我正在尝试为一个研究项目抓取一堆HTML文件，但它无法正常工作。除了四个列名之外，csv文件最终为空，我不确定我做错了什么。我曾经做过一次类似的抓取工作，但是这个网站的html看起来更加混乱（或者另一个非常干净）。

我希望有人可以提供帮助，请原谅我写的代码 - 我没有多少经验。

from bs4 import BeautifulSoup
import csv
import urllib2
import os

def processData( pageFile ): 
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)

    one = soup.findAll("td", attrs={"class":"t_user"})
    two = soup.findAll("div", attrs={"class":"right t_number"})
    three = soup.findAll("div", attrs={"style":"font-size: 9pt"})
    four = soup.findAll("div", atrs={"style":"padding-top: 4px;"})

    names = []
    threads = []
    posts = []
    timestamps = []

    for html in one:
        names.append(BeautifulSoup(str(name)).get_text().encode("utf-8").strip())

    for html in two:
        threads = BeautifulSoup(str(thread).strip()).get_text().encode("utf-8").replace("\n", "")

    for html in three:
        posts = BeautifulSoup(str(post).strip()).get_text().encode("utf-8").replace("\n", "")

    for html in four:
        text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")
        timestamps.append(text.split("Post at:")[1].strip())

    csvfile = open('S141test.csv', 'ab')    
    writer = csv.writer(csvfile)

    for name, thread, post, timestamp in zip(names, threads, posts, timestamps):
            writer.writerow([name, thread, post, timestamp])

    csvfile.close()

dir = "S141test"
csvfile = "S141test.csv"
csvfile = open(csvfile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Name", "thread", "post", "timestamp"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile) # get the file path
    processData(path) # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
    count = count + 1 # incriment counter

我正在试图抓取的HTML :(我要抓的东西用大写字母突出显示，查找AUTHOR NAME，THREAD TITLE，TIMESTAMP，POST CONTENT）。

HTML S141
<div class="maintable">
<div class="spaceborder" style="width: 100%; border-bottom: none">
<table cellspacing="1" cellpadding="4" width="100%" align="center">
<tr class="header"><td colspan="2" style="color: #000000">
<div class="right" style="font-weight: normal">
<a href="misc.php?action=emailfriend&amp;tid=50510">Email to Friend</a> |
<a href="my.php?item=subscriptions&amp;subadd=50510">Subscription</a> |
<a href="my.php?item=favorites&amp;favadd=50510&amp;type=thread">Favorites</a>
</div>
Subject:
Ayuki - TST HG</td></tr>
</table></div>
<form method="post" name="delpost" action="topicadmin.php?action=delpost&amp;fid=28&amp;tid=50510&amp;page=1">
<input type="hidden" name="formhash" value="66b6dc45">
<div class="spaceborder" style="width: 100%; margin-bottom: 4px;
 border-top: none
">
<table cellspacing="1" cellpadding="4" width="100%" align="center" class="t_row">
<tr style="height: 100%">
<td width="18%" valign="top" class="t_user"> <a name="lastpost"></a><a href="viewpro.php?uid=7455" target="_blank" class="bold">asta (AUTHOR NAME)</a>
<br><div class="smalltxt">
Carnal Conqueror<br>
<img src="images/website141.COM/star_level2.gif" alt="Rank: 3" /><img src="images/website141.COM/star_level1.gif" alt="Rank: 3" /><br>
<br>
<br><br>
UID 7455<br>
Digest Posts
0<br>
Credits 328<br>
Posts 28<br>
Karma 327 <br>
Money 156 <br>
Acceptance 25 <br>
Reading Access 30<br>
Registered 20-1-2008<br>
Status Offline
</div>
</td>
<td width="82%" valign="top" style="padding: 0px">
<table border="0" cellspacing="0" cellpadding="4" class="t_msg">
<tr><td>
<div>
<div class="right t_number"><a href="###" class="bold" onclick="window.clipboardData.setData('text','http://forum.website141.com/eforum/viewthread.php?tid=50510&amp;page=1#pid502926')">#1</a></div>
<div style="padding-top: 4px;">
Post at 26-6-2014 00:20 (TIMESTAMP)&nbsp;
<a href="viewpro.php?uid=7455">Profile</a>
<a href="pm.php?action=send&amp;uid=7455" target="_blank">P.M.</a>&nbsp;
</div></div>
</td></tr>
<tr><td valign="top" class="line" height="100%" style="padding-top: 10px;">
<a name="pid502926" href="misc.php?action=viewratings&amp;tid=50510&amp;pid=502926" title="[Rating] 29"><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /><img src="images/website141.COM/agree.gif" border="0" align="right" alt="" /></a>
<div class="right t_number">Font size:
<a style="cursor:hand" onclick="text502926.style.fontSize='9pt';">S</a>
<a style="cursor:hand" onclick="text502926.style.fontSize='12pt';">M</a>
<a style="cursor:hand" onclick="text502926.style.fontSize='15pt';">L</a></div>
<span class="bold" style="color: #999999">Ayuki - TST HG (THREAD TITLE) </span><br><br>
<div style="font-size: 9pt" id=text502926><a href="http://go141.com/en/27120-Tsim%20Sha%20Tsui%20-%20Honey%20Girls%20-%20Ayuki.html" target="_blank">http://go141.com/en/27120-Tsim%2 ... ls%20-%20Ayuki.html</a><br />
<br />
Do I recommend her ? It depends what you are looking for.
<br />
I asked to take a photo but she didn't allow. (POST CONTENT) </div>
<br><fieldset><legend><a href="misc.php?action=viewratings&amp;tid=50510&amp;pid=502926" title="View Rating Log">Recent Ratings</a></legend><br>
<table border="0" cellspacing="0" cellpadding="0">
<tr><td><a href="viewpro.php?uid=71695" target="_blank">banter141a</a></td>
<td>&nbsp;&nbsp;2-7-2014 13:31</td><td>&nbsp;&nbsp;Karma</td><td>&nbsp;&nbsp;<b>+3</b></td>
<td>&nbsp;&nbsp;</td></tr>
<tr><td><a href="viewpro.php?uid=97825" target="_blank">hlau</a></td>
<td>&nbsp;&nbsp;28-6-2014 00:36</td><td>&nbsp;&nbsp;Karma</td><td>&nbsp;&nbsp;<b>+6</b></td>

Answer 1

我自己想通了。我不确定是要删除问题还是用我自己的解决方案回答问题，但如果其他人有类似的问题，我会选择后者。一个关键错误是我把csvfile与csvFile混淆了（我需要以不同的方式使用它们，如下面的代码所示）。

from bs4 import BeautifulSoup
import csv
import urllib2
import os

def processData( pageFile ): 
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)

    one = soup.findAll('td', attrs={"width": "18%"}) #close
    two = soup.findAll("span", attrs={"style":"color: #999999"})  #close
    three = soup.findAll("div", attrs={"style":"font-size: 9pt"}) #works
    four = soup.findAll("div", attrs={"style":"padding-top: 4px;"}) 


    names = []
    threads = []
    posts = []
    timestamps = []

    print four

    for html in one:
            text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", " ")
            names.append(text.split('UID')[0].strip()) #works, at the moment

    for color in two:
            text2 = BeautifulSoup(str(color).strip()).get_text().encode("utf-8").replace("\n", "")
            threads.append(text2.strip())

    for html in three:
            text3 = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "").replace(" ", "")
            posts.append(text3.strip())

    for html in four:
            text4 = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "").replace(" ", "")
            timestamps.append(text4.split("Post at")[1].split('Profile')[0].strip())

    csvfile = open('S141test.csv', 'ab')    
    writer = csv.writer(csvfile)

    for name, thread, post, timestamp in zip(names, threads, posts, timestamps):
            writer.writerow([name, thread, post, timestamp])

    csvfile.close()


dir = "S141test"
csvFile = "S141test.csv"
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Name", "thread", "post", "timestamp"])
csvfile.close()

# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile) # get the file path
    processData(path) # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
    count = count + 1 # incriment counter

使用Beautiful Soup 4扫描论坛页面 - csv文件最终为空

1 个答案: