我正在尝试抓取数据并将其存储在 csv 或 xlsx 文件中,但是当我运行代码时,生成的文件是空的。
当我使用 break 在第一次迭代之后停止循环时,我发现代码保存了我想要的一行数据。最终目标是将这些数据逐行写入文件中。补充一点:我使用的是 beautifulsoup4。
代码如下:
from bs4 import BeautifulSoup
import requests
import xlsxwriter
# Scrape the course pages linked from the rwaq.org course listing and write
# the extracted fields to an .xlsx file with xlsxwriter.
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so which statements belong to the `for` body is inferred, not shown.
url = 'https://www.rwaq.org/courses'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
base = 'https://www.rwaq.org'
# Collect every <div class="course-info"> and turn the href of its <h3><a>
# into an absolute URL by prefixing `base`.
course_div = soup.find_all('div', attrs={'class': 'course-info'})
course_links = [base + item.h3.a['href'] for item in course_div]
# Index of the worksheet row to fill next.
row = 0
for link in course_links:
# Download and parse the individual course page.
inner_page = requests.get(link)
inner_soup = BeautifulSoup(inner_page.content, 'html.parser')
course_name = inner_soup.find('div', attrs={'class': 'page-title'}).h2.text
# course_lecturer_name is assigned here but never written to the sheet below.
course_lecturer_name = inner_soup.find('div', attrs={'class': 'instructor-details'}).a.text.strip()
course_desc = inner_soup.find('div', attrs={'class': 'lecture_desc'}).p.text.strip()
# 4th child div of the course content: take the <ul> text if present,
# otherwise the <p> text, otherwise an empty string.
if inner_soup.select_one('#organization div.course-content div:nth-child(4) div.row-fluid ul'):
course_manhag = inner_soup.select_one('#organization div.course-content div:nth-child(4) div.row-fluid ul').text
elif inner_soup.select_one('#organization div.course-content div:nth-child(4) div.row-fluid p'):
course_manhag = inner_soup.select_one('#organization div.course-content div:nth-child(4) div.row-fluid p').text
else:
course_manhag = ''
# Same <ul> / <p> / '' fallback for the 5th child div.
if inner_soup.select_one('#organization div.course-content div:nth-child(5) div.row-fluid ul'):
course_require = inner_soup.select_one(
'#organization div.course-content div:nth-child(5) div.row-fluid ul').text
elif inner_soup.select_one('#organization div.course-content div:nth-child(5) div.row-fluid p'):
course_require = inner_soup.select_one('#organization div.course-content div:nth-child(5) div.row-fluid p').text
else:
course_require = ''
# Same <ul> / <p> / '' fallback for the 6th child div.
if inner_soup.select_one('#organization div.course-content div:nth-child(6) div.row-fluid ul'):
course_out = inner_soup.select_one('#organization div.course-content div:nth-child(6) div.row-fluid ul').text
elif inner_soup.select_one('#organization div.course-content div:nth-child(6) div.row-fluid p'):
course_out = inner_soup.select_one('#organization div.course-content div:nth-child(6) div.row-fluid p').text
else:
course_out = ''
# course_company is assigned here but never written to the sheet below.
course_company = inner_soup.select_one(
'body div.container-fluid div div.subject-cover div.cover-info div div.subject-organization p a').text
# Start date: characters 3..15 of the stripped <p class="subject-date"> text.
course_date_from = inner_soup.select_one('p.subject-date').text.strip()[3:16]
# NOTE(review): select_one() yields a Tag or None, never the literal True, so
# this identity test is always False and course_date_to always ends up ''.
if inner_soup.select_one('p.subject-date') is True:
course_date_to = inner_soup.select_one('p.subject-date').text.strip()[31:]
else:
course_date_to = ''
course_status = inner_soup.select_one('p.subject-date span').text
# Absolute URL of the first link in every <div class="instructor-details">.
course_lecturer_link = [base + li.a['href'] for li in
inner_soup.find_all("div", attrs={'class': 'instructor-details'})]
# Rebuild a 'watch?v=' link from the first <iframe> src: keep the first 24
# characters, skip the next 6, and cut at the first '?'.
# NOTE(review): presumably the src is a YouTube embed URL -- confirm.
course_iframe = inner_soup.select_one('iframe').attrs["src"]
course_promo_link = course_iframe[:24] + 'watch?v=' + course_iframe[30:course_iframe.find('?')]
# NOTE(review): if these two lines sit inside the loop (as the trailing
# `break` suggests), a brand-new workbook for the same file name is created
# for every course -- TODO confirm against the original indentation.
wb = xlsxwriter.Workbook('file001.xlsx')
sheet = wb.add_worksheet()
sheet.write(row, 0, course_promo_link)
# write_row() spreads the list over consecutive cells starting at column 1;
# columns 2 and up are written again by the calls that follow.
sheet.write_row(row, 1, course_lecturer_link)
sheet.write(row, 2, course_desc)
sheet.write(row, 3, course_out)
sheet.write(row, 4, course_status)
sheet.write(row, 5, course_name)
sheet.write(row, 6, course_date_from)
sheet.write(row, 7, course_date_to)
sheet.write(row, 8, course_manhag)
sheet.write(row, 9, course_require)
row += 1
wb.close()
# Stops the loop; the question text says this was added to end the run after
# a single iteration.
break
答案 0 :(得分:0)
我在运行代码时遇到了错误,因此无法进行测试。但是我首先会尝试把 wb = xlsxwriter.Workbook('file001.xlsx') 和 wb.close() 从 for 循环中移出来。我的初步判断是,您在每次迭代时都重新创建并写入了文件。像这样:
from bs4 import BeautifulSoup
import requests
import xlsxwriter
# Scrape every course page linked from the rwaq.org course listing and store
# one worksheet row per course in a single .xlsx workbook.
url = 'https://www.rwaq.org/courses'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
base = 'https://www.rwaq.org'
# Each <div class="course-info"> carries the relative course URL in the href
# of its <h3><a>; prefix `base` to make it absolute.
course_div = soup.find_all('div', attrs={'class': 'course-info'})
course_links = [base + item.h3.a['href'] for item in course_div]


def _section_text(page, child_index):
    """Return the text of one course-content section, or '' if it is absent.

    The section is the ``child_index``-th child div of the course content.
    Its ``<ul>`` is preferred; a ``<p>`` is used as the fallback.

    Args:
        page: Parsed course page (a BeautifulSoup object).
        child_index: 1-based position used in the ``div:nth-child(...)``
            part of the selector.

    Returns:
        The ``.text`` of the first matching element, or an empty string when
        neither a ``<ul>`` nor a ``<p>`` matches.
    """
    prefix = ('#organization div.course-content '
              'div:nth-child({}) div.row-fluid '.format(child_index))
    for tag_name in ('ul', 'p'):
        node = page.select_one(prefix + tag_name)
        if node is not None:
            return node.text
    return ''


# Initialize the workbook once, before the loop. The context manager closes
# (and thereby saves) it once all rows are written, and also when a page
# fails to parse part-way through, so rows gathered so far are not lost.
with xlsxwriter.Workbook('C:/file001.xlsx') as wb:
    sheet = wb.add_worksheet()
    row = 0
    for link in course_links:
        inner_page = requests.get(link)
        inner_soup = BeautifulSoup(inner_page.content, 'html.parser')

        course_name = inner_soup.find(
            'div', attrs={'class': 'page-title'}).h2.text
        # Collected but not written to the sheet (kept from the original).
        course_lecturer_name = inner_soup.find(
            'div', attrs={'class': 'instructor-details'}).a.text.strip()
        course_desc = inner_soup.find(
            'div', attrs={'class': 'lecture_desc'}).p.text.strip()

        course_manhag = _section_text(inner_soup, 4)
        course_require = _section_text(inner_soup, 5)
        course_out = _section_text(inner_soup, 6)

        # Collected but not written to the sheet (kept from the original).
        course_company = inner_soup.select_one(
            'body div.container-fluid div div.subject-cover div.cover-info '
            'div div.subject-organization p a').text

        # The original guarded the end date with `select_one(...) is True`,
        # which can never hold (select_one returns a Tag or None), so the end
        # date was always ''. Slice the same text directly instead: a slice
        # past the end of a short string already yields ''.
        course_date_text = inner_soup.select_one(
            'p.subject-date').text.strip()
        course_date_from = course_date_text[3:16]
        course_date_to = course_date_text[31:]
        course_status = inner_soup.select_one('p.subject-date span').text

        course_lecturer_link = [
            base + li.a['href']
            for li in inner_soup.find_all(
                'div', attrs={'class': 'instructor-details'})
        ]

        # Rebuild a 'watch?v=' link from the first <iframe> src: keep the
        # first 24 characters, skip the next 6, and stop at the query string.
        # NOTE(review): presumably the src is a YouTube embed URL -- confirm.
        course_iframe = inner_soup.select_one('iframe').attrs["src"]
        query_start = course_iframe.find('?')
        if query_start == -1:
            # No query string: keep the whole tail instead of letting the
            # -1 index silently drop the last character.
            query_start = len(course_iframe)
        course_promo_link = (course_iframe[:24] + 'watch?v='
                             + course_iframe[30:query_start])

        sheet.write(row, 0, course_promo_link)
        # write() handles scalar values, not lists, so join the lecturer
        # links into a single cell value.
        sheet.write(row, 1, ', '.join(course_lecturer_link))
        sheet.write(row, 2, course_desc)
        sheet.write(row, 3, course_out)
        sheet.write(row, 4, course_status)
        sheet.write(row, 5, course_name)
        sheet.write(row, 6, course_date_from)
        sheet.write(row, 7, course_date_to)
        sheet.write(row, 8, course_manhag)
        sheet.write(row, 9, course_require)
        row += 1