Parsing multiple XML files in a folder, then writing to a central CSV

Date: 2019-10-14 02:07:01

Tags: python xml parsing beautifulsoup

I'm new to Python and wrote the following code to parse multiple XML files in a directory and write their contents to a central CSV file.

I have a folder containing about 30 XML files. My problem is that the script only grabs content from the first XML file in the folder, not from the rest. I suspect there is a problem with my loop? I am using BeautifulSoup and would like to stick with it, since I am already somewhat familiar with it.

#open beautifulsoup library AND csv function

from bs4 import BeautifulSoup
import csv
import glob

#Open and read files in folder ending with .xml
for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')

#open and write csv file

csv_file = open('scrape.csv', 'a')
post_line = ['postid', 'subreddit', 'post title', 'author', 'post url', 'post date', 'post time', 'post score', 'submission text']
csv_writer = csv.writer(csv_file)
csv_writer.writerow(post_line)

#grab content from xml from following textblocks
#postid

for postid in soup.find('textblock', tid='7').text:
    pid = postid.split(':')[1]
    print(pid)

#subreddit
for subreddit in soup.find('textblock', tid='15').text:
    subred = subreddit.split(':')[1]
    print(subred)

#post title
for posttitle in soup.find('textblock', tid='12').text:
    ptitle = posttitle.split(':')[1]
    print(ptitle)

#author
for username in soup.find('textblock', tid='0').text:
    author = username.split(':')[1]
    print(author)

#post url
for posturl in soup.find('textblock', tid='13').text:
    url = posturl.split(':')[2]
    purl = f'https:{url}'
    print(purl)

#post date
for postdate in soup.find('textblock', tid='3').text:
    pdate = postdate.split()[1]
    print(pdate)

#post time
for posttime in soup.find('textblock', tid='3').text:
    ptime = posttime.split()[2]
    print(ptime)

#post score
for postscore in soup.find('textblock', tid='10').text:
    pscore = postscore.split(':')[1]
    print(pscore)

#submission text
for submission in soup.find('textblock', tid='20').text:
    print(submission)

#blank space
print()

csv_writer.writerow([pid, subred, ptitle, author, purl, pdate, ptime, pscore, submission])

csv_file.close()

1 Answer:

Answer 0 (score: 0)

You are only using the soup of the last file.

...
for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')    # soup is overwritten with the soup of the current file
...
# handle soup

You can make handle_soup a function and call it on each soup.

import glob
import csv

from bs4 import BeautifulSoup

csv_file = open('scrape.csv', 'a')
post_line = ['postid', 'subreddit', 'post title', 'author', 'post url', 'post date', 'post time', 'post score', 'submission text']
csv_writer = csv.writer(csv_file)
csv_writer.writerow(post_line)

def handle_soup(soup, csv_writer):
    pid = soup.find('textblock', tid='7').text.split(":")[1]
    print(pid)

    subred = soup.find('textblock', tid='15').text.split(':')[1]
    print(subred)
    ...     # replace ... with other items
    csv_writer.writerow([pid, subred, ...]) # replace ... with other items


for filename in glob.glob("*.xml"):
    with open(filename) as open_file:
        content = open_file.read()
        soup = BeautifulSoup(content, 'lxml')
        handle_soup(soup, csv_writer)

csv_file.close()
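
For completeness, here is a minimal sketch of how handle_soup could be filled in with the remaining fields from the question, assuming the same textblock/tid layout the question's XML uses. It also opens the CSV with a with block and newline='' so the file is closed automatically and rows are written cleanly.

import glob
import csv

from bs4 import BeautifulSoup

def handle_soup(soup, csv_writer):
    # Assumes the same textblock/tid layout as in the question's code.
    pid = soup.find('textblock', tid='7').text.split(':')[1]
    subred = soup.find('textblock', tid='15').text.split(':')[1]
    ptitle = soup.find('textblock', tid='12').text.split(':')[1]
    author = soup.find('textblock', tid='0').text.split(':')[1]
    purl = 'https:' + soup.find('textblock', tid='13').text.split(':')[2]
    # The question splits the same tid='3' block once for the date and once for the time.
    date_parts = soup.find('textblock', tid='3').text.split()
    pdate, ptime = date_parts[1], date_parts[2]
    pscore = soup.find('textblock', tid='10').text.split(':')[1]
    submission = soup.find('textblock', tid='20').text
    csv_writer.writerow([pid, subred, ptitle, author, purl, pdate, ptime, pscore, submission])

with open('scrape.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['postid', 'subreddit', 'post title', 'author', 'post url',
                         'post date', 'post time', 'post score', 'submission text'])
    # Parse each XML file and write one row per file.
    for filename in glob.glob("*.xml"):
        with open(filename) as open_file:
            soup = BeautifulSoup(open_file.read(), 'lxml')
            handle_soup(soup, csv_writer)

The key design change is that everything that depends on soup now runs inside the loop over the files, so each file's content is extracted and written before the next soup replaces it.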