I am scraping the web with BeautifulSoup and Selenium, and I want to write the data to an Excel file.
# coding: utf-8
import os
import time
from datetime import datetime

import bs4
import requests
from selenium import webdriver
initialpage = 'https://www.boxofficemojo.com/yearly/chart/?yr=2017&p=.htm'
res = requests.get(initialpage, timeout=None)
soup = bs4.BeautifulSoup(res.text, 'html.parser')

# Build absolute URLs for the remaining yearly-chart pages.
pages = []
pagelinks = soup.select('a[href^="/yearly/chart/?page"]')
for i in range(int(len(pagelinks) / 2)):
    pages.append(str(pagelinks[i])[9:-14])
    pages[i] = pages[i].replace("amp;", "")
    pages[i] = "https://www.boxofficemojo.com" + pages[i]
    pages[i] = pages[i][:-1]
pages.insert(0, initialpage)
# Accumulators for the scraped fields.
date_dic = {}
movie_links = []
titles = []
Domestic_Gross_Arr = []
Genre_Arr = []
Release_Date_Arr = []
Theaters_Arr = []
Budget_Arr = []
Views_Arr = []
Edits_Arr = []
Editors_Arr = []
for i in range(int(len(pagelinks) / 2 + 1)):
    movie_count = 0
    res1 = requests.get(pages[i])
    souppage = bs4.BeautifulSoup(res1.text, 'html.parser')
    # Collect the per-movie links from the chart table.
    for j in souppage.select('tr > td > b > font > a'):
        link = j.get("href")[7:].split("&")
        str1 = "".join(link)
        final = "https://www.boxofficemojo.com/movies" + str1
        if "/?id" in final:
            movie_links.append(final)
            movie_count += 1
    # Walk the sibling table rows to read the theater counts.
    number_of_theaters = souppage.find("tr", bgcolor="#dcdcdc")
    for k in range(movie_count):
        #print(number_of_theaters.next_sibling.contents[4].text)
        Theaters_Arr.append(number_of_theaters.next_sibling.contents[4].text)
        number_of_theaters = number_of_theaters.next_sibling
# Scrape details and the poster image for the first two movies.
k = 0
path = os.path.join(os.getcwd(), 'movie_pictures')
os.makedirs(path)
os.chdir(path)
while k < 2:
    j = movie_links[k]
    try:
        res1 = requests.get(j)
        soup1 = bs4.BeautifulSoup(res1.text, 'html.parser')
        c = soup1.select('td[width="35%"]')
        d = soup1.select('div[class="mp_box_content"]')
        genre = soup1.select('td[valign="top"]')[5].select('b')
        image = soup1.select('img')[6].get('src')
        budget = soup1.select('tr > td > b')
        domestic = str(c[0].select('b'))[4:-5]
        release = soup1.nobr.a
        title = soup1.select('title')[0].getText()[:-25]
        print("-----------------------------------------")
        print("Title: " + title)
        titles.append(title)
        print("Domestic Gross: " + domestic)
        Domestic_Gross_Arr.append(domestic)
        print("Genre: " + genre[0].getText())
        Genre_Arr.append(genre[0].getText())
        print("Release Date: " + release.contents[0])
        Release_Date_Arr.append(release.contents[0])
        print("Production Budget: " + budget[5].getText())
        Budget_Arr.append(budget[5].getText())
        # Parse the release date (e.g. "June 9, 2017") into a datetime.
        year1 = str(release.contents[0])[-4:]
        a, b = str(release.contents[0]).split(",")
        month1, day1 = a.split(" ")
        datez = year1 + month1 + day1
        new_date = datetime.strptime(datez, "%Y%B%d")
        date_dic[title] = new_date
        # Download the poster image in 1 KB chunks.
        with open('pic' + str(k) + '.png', 'wb') as handle:
            response = requests.get(image, stream=True)
            if not response.ok:
                print(response)
            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)
    except:
        print("Error Occurred, Page Or Data Not Available")
    k += 1
def subtract_one_month(t):
    """Return the date one month before t, formatted as YYYY-MM-DD."""
    import datetime
    one_day = datetime.timedelta(days=1)
    one_month_earlier = t - one_day
    while one_month_earlier.month == t.month or one_month_earlier.day > t.day:
        one_month_earlier -= one_day
    return one_month_earlier.strftime("%Y-%m-%d")
number_of_errors = 0
browser = webdriver.Chrome("/Users/Gokce/Downloads/chromedriver")
browser.maximize_window()
browser.implicitly_wait(20)
for i in titles:
    try:
        release_date = date_dic[i]
        # Build the Wikipedia article name and the one-month date window.
        i = i.replace(' ', '_')
        i = i.replace("2017", "2017_film")
        #end = datetime.strptime(release_date, '%B %d, %Y')
        end_date = release_date.strftime('%Y-%m-%d')
        start_date = subtract_one_month(release_date)
        url = ("https://tools.wmflabs.org/pageviews/?project=en.wikipedia.org"
               "&platform=all-access&agent=user&start=" + start_date +
               "&end=" + end_date + "&pages=" + i)
        browser.get(url)
        page_views_count = browser.find_element_by_css_selector(
            ".summary-column--container .legend-block--pageviews .linear-legend--counts:first-child span.pull-right")
        page_edits_count = browser.find_element_by_css_selector(
            ".summary-column--container .legend-block--revisions .linear-legend--counts:first-child span.pull-right")
        page_editors_count = browser.find_element_by_css_selector(
            ".summary-column--container .legend-block--revisions .legend-block--body .linear-legend--counts:nth-child(2) span.pull-right")
        print(i)
        print("Number of Page Views: " + page_views_count.text)
        Views_Arr.append(page_views_count.text)
        print("Number of Edits: " + page_edits_count.text)
        Edits_Arr.append(page_edits_count.text)
        print("Number of Editors: " + page_editors_count.text)
        Editors_Arr.append(page_editors_count.text)
    except:
        print("Error Occurred for this page: " + str(i))
        number_of_errors += 1
        # Record -1 on failure so the arrays stay aligned with titles.
        Views_Arr.append(-1)
        Edits_Arr.append(-1)
        Editors_Arr.append(-1)
    time.sleep(5)
browser.quit()
import xlsxwriter
os.chdir("/home")
workbook = xlsxwriter.Workbook('WebScraping.xlsx')
worksheet = workbook.add_worksheet()
# Header row; the columns match the data writes below.
worksheet.write(0, 0, 'Title')
worksheet.write(0, 1, 'Genre')
worksheet.write(0, 2, 'Production Budget')
worksheet.write(0, 3, 'Domestic Gross')
worksheet.write(0, 4, 'Release Date')
worksheet.write(0, 5, 'Number of Theaters')
worksheet.write(0, 6, 'Number of Wikipedia Page Views')
worksheet.write(0, 7, 'Number of Wikipedia Edits')
worksheet.write(0, 8, 'Number of Wikipedia Editors')
row = 1
for i in range(len(titles)):
    worksheet.write(row, 0, titles[i])
    worksheet.write(row, 1, Genre_Arr[i])
    worksheet.write(row, 2, Budget_Arr[i])
    worksheet.write(row, 3, Domestic_Gross_Arr[i])
    worksheet.write(row, 4, Release_Date_Arr[i])
    worksheet.write(row, 5, Theaters_Arr[i])
    worksheet.write(row, 6, Views_Arr[i])
    worksheet.write(row, 7, Edits_Arr[i])
    worksheet.write(row, 8, Editors_Arr[i])
    row += 1
workbook.close()
The code works fine up to the xlsxwriter part at the end; then I get this error:
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-9-c99eea52d475> in <module>()
27
28
---> 29 workbook.close()
/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in close(self)
309 if not self.fileclosed:
310 self.fileclosed = 1
--> 311 self._store_workbook()
312
313 def set_size(self, width, height):
/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in _store_workbook(self)
638
639 xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
--> 640 allowZip64=self.allow_zip64)
641
642 # Add XML sub-files to the Zip file with their Excel filename.
/Users/Gokce/anaconda2/lib/python2.7/zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
754 modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
755 try:
--> 756 self.fp = open(file, modeDict[mode])
757 except IOError:
758 if mode == 'a':
IOError: [Errno 45] Operation not supported: 'WebScraping.xlsx'
What could be the problem? If I cut off the last part and run it in a new IDLE session with fake data, it works, but it does not work in the main session. So I believe the problem must be somewhere in the earlier part.
Answer 0 (score: 1)
The error is triggered when the code tries to write the file. Confirm that you have write permission for that directory and that the file does not already exist. You are unlikely to be able to write to /home: on macOS it is an automount point, which is consistent with IOError: [Errno 45] Operation not supported.
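As a minimal sketch of the fix (assuming the header and data writes stay exactly as in the question), drop the os.chdir("/home") call and give the workbook a full path in a directory you can write to, such as your home directory:

import os
import xlsxwriter

# A writable location: the user's home directory. This is one possible
# choice; any directory you have write permission for works.
output_path = os.path.join(os.path.expanduser("~"), 'WebScraping.xlsx')

workbook = xlsxwriter.Workbook(output_path)
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'Title')  # ...same header/data writes as before...
workbook.close()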
Answer 1 (score: 1)
A few years ago I used openpyxl to automate some Excel spreadsheets. You could use code like this:
import openpyxl

# You could create a template spreadsheet that already contains the
# header row written by the worksheet.write() calls in the question.
wb = openpyxl.load_workbook('Template.xlsx')
ws = wb.create_sheet('my_sheet')  # create_sheet() returns the new worksheet
row = 1
for i in range(len(titles)):
    # openpyxl rows and columns are 1-based: cell(row=1, column=1) is A1.
    ws.cell(row=row, column=1, value=titles[i])
    ws.cell(row=row, column=2, value=Genre_Arr[i])
    ws.cell(row=row, column=3, value=Budget_Arr[i])
    ws.cell(row=row, column=4, value=Domestic_Gross_Arr[i])
    ws.cell(row=row, column=5, value=Release_Date_Arr[i])
    ws.cell(row=row, column=6, value=Theaters_Arr[i])
    ws.cell(row=row, column=7, value=Views_Arr[i])
    ws.cell(row=row, column=8, value=Edits_Arr[i])
    ws.cell(row=row, column=9, value=Editors_Arr[i])
    row += 1
wb.save('my_file.xlsx')
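One difference to keep in mind: openpyxl's cell() uses 1-based indices, while xlsxwriter's write() is 0-based, so the column numbers here are shifted by one relative to the xlsxwriter version in the question.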