我正在尝试解析(抓取)计算机上某个文件夹中的一批HTML文件。我想要的数据存放在一个表格中,目前我能从每个文件中取到表格的最后一行,但其他行都被忽略了!
我已将部分HTML复制到Pastebin,此处为:http://pastebin.com/hajr8SFi
这是我目前的代码。同样,它只对最后一行有效,对其他行无效。所以我猜是循环有问题?我试着找出原因,但到目前为止还没有结果 :(
def _cell_text(tag):
    """Return the text of *tag* as UTF-8 bytes, newlines removed and stripped."""
    # Work on the unicode text and encode exactly once at the end; encoding
    # an already-encoded byte string fails on non-ASCII input under Python 2.
    return tag.get_text().replace("\n", "").strip().encode("utf-8")


def _inner_html(tag):
    """Return the markup inside *tag* as UTF-8 bytes, newlines removed and stripped."""
    # decode_contents() yields everything between the opening and closing
    # tag, so no string splitting on a literal '<p class="single">' is needed.
    return tag.decode_contents().replace("\n", "").strip().encode("utf-8")


def processData( pageFile ):
    """Extract every table row from one HTML file and append it to the CSV.

    Three parallel lists are built from the parsed page:

    * the text of every ``<td class="date">``,
    * the text of every ``<td class="user">``,
    * the inner markup of every ``<p class="single">``.

    They are then zipped into rows and appended to ``today + ' evo.csv'``.

    Args:
        pageFile: Path of the HTML file to read.

    Returns:
        None. The only effect is the rows appended to the CSV file.

    Note:
        Reads the module-level name ``today`` to build the output filename.
    """
    with open(pageFile, "r") as f:
        page = f.read()
    soup = BeautifulSoup(page)

    # One entry per matching element, so every row of the table is kept.
    fishs = [_cell_text(td) for td in soup.find_all("td", {"class": "date"})]
    dogs = [_cell_text(td) for td in soup.find_all("td", {"class": "user"})]
    rats = [_inner_html(p) for p in soup.find_all("p", {"class": "single"})]

    # NOTE(review): the header row is written elsewhere in this file to
    # today + " file.csv", but rows are appended to today + ' evo.csv' --
    # confirm which filename is intended.
    # 'ab' = append in binary mode, as the Python 2 csv module expects.
    with open(today + ' evo.csv', 'ab') as csvfile:
        writer = csv.writer(csvfile)
        # zip() stops at the shortest list, so a page with unequal numbers of
        # date/user/single elements silently loses the unmatched tail.
        for fish, dog, rat in zip(fishs, dogs, rats):
            writer.writerow([fish, dog, rat])
# Driver script: write the CSV header, then run processData() on every entry
# of the input directory, reporting progress after each one.

# Date stamp used in the output filename; processData() also reads this
# module-level name, so it must be assigned before the loop below runs.
today = datetime.datetime.now().strftime('%Y-%m-%d')
# NOTE(review): `dir` shadows the builtin of the same name; kept unchanged in
# case other code refers to it.
dir = "files/"
csvFile = today + " file.csv"

# Create (or truncate) the header file. 'wb' = binary mode, as the Python 2
# csv module expects; `with` closes the handle even if the write fails.
# NOTE(review): processData() appends its rows to today + ' evo.csv', not to
# this file -- confirm which filename is intended.
with open(csvFile, 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["F", "I", "V"])

# os.listdir returns every entry in the directory, in arbitrary order.
fileList = os.listdir(dir)
totalLen = len(fileList)
for count, htmlFile in enumerate(fileList, 1):
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)  # process the data in the file
    # Parenthesised single expression: prints identically under the Python 2
    # print statement and also parses as a Python 3 function call.
    print("Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")...")  # display status