I'm new to Python and wrote the code below to parse multiple XML files in a directory and write their contents to a central CSV file.
I have a folder containing about 30 XML files. My problem is that it only grabs the content from the first XML file in the folder, not from the rest. I think I have a problem with my loop? I'm using BeautifulSoup and would like to stick with it, since I already know it a little.
#open beautifulsoup library AND csv function
from bs4 import BeautifulSoup
import csv
import glob

#Open and read files in folder ending with .xml
for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')

#open and write csv file
csv_file = open('scrape.csv', 'a')
post_line = ['postid', 'subreddit', 'post title', 'author', 'post url', 'post date', 'post time', 'post score', 'submission text']
csv_writer = csv.writer(csv_file)
csv_writer.writerow(post_line)

#grab content from xml from following textblocks
#postid
for postid in soup.find('textblock', tid='7').text:
    pid = postid.split(':')[1]
    print(pid)

#subreddit
for subreddit in soup.find('textblock', tid='15').text:
    subred = subreddit.split(':')[1]
    print(subred)

#post title
for posttitle in soup.find('textblock', tid='12').text:
    ptitle = posttitle.split(':')[1]
    print(ptitle)

#author
for username in soup.find('textblock', tid='0').text:
    author = username.split(':')[1]
    print(author)

#post url
for posturl in soup.find('textblock', tid='13').text:
    url = posturl.split(':')[2]
    purl = f'https:{url}'
    print(purl)

#post date
for postdate in soup.find('textblock', tid='3').text:
    pdate = postdate.split()[1]
    print(pdate)

#post time
for posttime in soup.find('textblock', tid='3').text:
    ptime = posttime.split()[2]
    print(ptime)

#post score
for postscore in soup.find('textblock', tid='10').text:
    pscore = postscore.split(':')[1]
    print(pscore)

#submission text
for submission in soup.find('textblock', tid='20').text:
    print(submission)

#blank space
print()

csv_writer.writerow([pid, subred, ptitle, author, purl, pdate, ptime, pscore, submission])
csv_file.close()
Answer 0 (score: 0)
You are only using the soup of the last file.
...
for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')  # overwrites soup with the soup of the current file
...
# handle soup
You could turn the handling into a handle_soup function and call it on each soup:
import glob
import csv

from bs4 import BeautifulSoup

csv_file = open('scrape.csv', 'a')
post_line = ['postid', 'subreddit', 'post title', 'author', 'post url', 'post date', 'post time', 'post score', 'submission text']
csv_writer = csv.writer(csv_file)
csv_writer.writerow(post_line)

def handle_soup(soup, csv_writer):
    pid = soup.find('textblock', tid='7').text.split(":")[1]
    print(pid)

    subred = soup.find('textblock', tid='15').text.split(':')[1]
    print(subred)

    ...  # replace ... with other items

    csv_writer.writerow([pid, subred, ...])  # replace ... with other items

for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')
        handle_soup(soup, csv_writer)

csv_file.close()
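For completeness, here is one way the finished script could look with the remaining fields filled in. This is a minimal sketch, not the answer's exact code: the textblock tids and split indices are copied from the question, the with-statement around the CSV file and the newline='' argument are my additions, and it assumes every tid exists in every file (soup.find returning None would raise an AttributeError).

import glob
import csv

from bs4 import BeautifulSoup

def handle_soup(soup, csv_writer):
    # tids and split indices copied from the question's code
    pid = soup.find('textblock', tid='7').text.split(':')[1]
    subred = soup.find('textblock', tid='15').text.split(':')[1]
    ptitle = soup.find('textblock', tid='12').text.split(':')[1]
    author = soup.find('textblock', tid='0').text.split(':')[1]
    purl = f"https:{soup.find('textblock', tid='13').text.split(':')[2]}"
    datetime_parts = soup.find('textblock', tid='3').text.split()
    pdate = datetime_parts[1]
    ptime = datetime_parts[2]
    pscore = soup.find('textblock', tid='10').text.split(':')[1]
    submission = soup.find('textblock', tid='20').text
    csv_writer.writerow([pid, subred, ptitle, author, purl, pdate, ptime, pscore, submission])

# newline='' keeps the csv module from inserting blank lines on Windows
with open('scrape.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['postid', 'subreddit', 'post title', 'author', 'post url',
                         'post date', 'post time', 'post score', 'submission text'])
    for filename in glob.glob('*.xml'):
        with open(filename) as open_file:
            soup = BeautifulSoup(open_file.read(), 'lxml')
            handle_soup(soup, csv_writer)

Note that because the file is opened in append mode ('a'), the header row is written again on every run; open with 'w' instead if you want a fresh file each time.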