我无法使用我在github上找到的这段代码来吸引财富500强公司。该代码适用于Fortune global500,但是一旦将URL地址更改为http://fortune.com/fortune500/“ + str(i)+” /“)以获得Fortune500,它就会给我以下错误。
Traceback (most recent call last):
File "<input>", line 78, in <module>
File "<input>", line 47, in grab
TypeError: 'NoneType' object has no attribute '__getitem__'
我对python非常陌生,到目前为止,我的Google搜索没有帮助。
代码:
# -*- coding: utf-8 -*-
# Fortune 1000 grabber. Download and parse companies information
# Import libraries
from bs4 import BeautifulSoup
import urllib2
import json
import re
from urlparse import urljoin
import unicodecsv
# Define class for companies
class Company:
rank = 0
title = ""
ticker = ""
industry = ""
sector = ""
hq_location = ""
website = ""
years_on_list = 0
ceo = ""
eps = 0
employees = 0
revenues = 0
profits = 0
img_url = ""
state_geo_code = ""
# Fortune 1000 graber function
def grab(i):
# Obtaining post id
data = urllib2.urlopen("http://fortune.com/fortune500/" + str(i) + "/")#("http://fortune.com/fortune500/")
soup = BeautifulSoup(data,"html.parser")
postid = next(attr for attr in soup.body['class'] if attr.startswith('postid'))
postid = re.match(r'postid-(\d+)', postid).group(1)
companies = []
# Fetch for pages with data and process JSONs
for i in range(1,51):
page_url = "http://fortune.com/data/franchise-list/{postid}/{index}/".format(postid=postid,index=str(i))
page_data = json.load(urllib2.urlopen(page_url), encoding='utf-8')
# Process JSON data
for item in page_data["articles"]:
company = Company()
company.rank = item["rank"]
company.title = item["title"]
company.ticker = item["ticker_text"].upper()
#company.industry = item["tables"]["Company Info"]["data"]["Industry"][0]
#company.sector = item["tables"]["Company Info"]["data"]["Sector"][0]
#company.hq_location = item["tables"]["Company Info"]["data"]["HQ Location"][0]
#company.years_on_list = item["tables"]["Company Info"]["data"]["Years on List"][0]
#company.ceo = item["tables"]["Company Info"]["data"]["CEO"][0]
#company.eps = item["tables"]["Earnings Per Share"]["data"]["Earnings Per Share ($)"][0]
#company.employees = item["tables"]["Key Financials (last fiscal year)"]["data"]["Employees"][0]
#company.revenues = item["tables"]["Key Financials (last fiscal year)"]["data"]["Revenues ($M)"][0]
#company.profits = item["tables"]["Key Financials (last fiscal year)"]["data"]["Profits ($M)"][0]
#company.state_geo_code = company.hq_location.split(", ")[1]
if item["featured_image"] != "":
company.img_url = item["featured_image"]["src"]
# Parsing website link
#s = BeautifulSoup(item["tables"]["Company Info"]["data"]["Website"][0],"html.parser")
#company.website = s.a.get('href')
print company.rank + ". " + company.title
companies.append(company)
return companies
# Obtain companies
for i in range(1995, 2017):
companies = grab(i)
# Saving to CSV
f = open("C:\Users\kourosh.amirkhani\Documents\Reputations and Conservatism\output/fortune" + str(i) + ".csv",
"wt")
try:
writer = unicodecsv.writer(f, encoding='utf-8')
writer.writerow(("Rank", "Title"))
for company in companies:
writer.writerow((company.rank, company.title))
finally:
f.close()
非常感谢您的帮助或建议。