Project: https://github.com/yoyolin/DataScienceJobs
Question: what is ['jd'], and how do I fix this error?
I am trying to run this author's Python code to scrape Indeed.com and generate my own results.
This is where I get the error:
Traceback (most recent call last):
  File "webCrawl.py", line 100, in <module>
    dataJobs[['jd']]= cleaned_list
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2414, in __setitem__
    self._setitem_array(key, value)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2442, in _setitem_array
    indexer = self.ix._convert_to_indexer(key, axis=1)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1230, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
The error is raised by line 100 of webCrawl.py:
dataJobs[['jd']]= cleaned_list
['jd'] pops up only this once in the entire script, and I am not sure what the author intended it to stand for ("job description", maybe?). The name jd also appears on two lines of dataJobs.py (see the small sketch after that file below), but that should have no effect on webCrawl.py, right?
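To make the question concrete, here is a minimal toy sketch (a hypothetical two-row DataFrame, not the author's data) of the two indexing forms involved; the double-bracket form is the one that raises the KeyError in the traceback above, since no 'jd' column exists yet, while the single-bracket form would simply create a new column:

import pandas as pd

# hypothetical stand-in for dataJobs: two fake postings
df = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})
cleaned_list = ["cleaned text a", "cleaned text b"]

# df[['jd']] = cleaned_list    # list-of-labels assignment: presumably hits the same
                               # KeyError "['jd'] not in index" on my pandas version,
                               # because no 'jd' column exists yet
df['jd'] = cleaned_list        # single label: creates a new 'jd' column,
                               # as long as the list has one entry per row
print(df)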
webCrawl.py:
# -*- coding: utf-8 -*-
__author__ = 'Yiyou'

import sys
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib2
import nltk

reload(sys)
sys.setdefaultencoding('utf-8')


def webCrawl(url):
    """Given an Indeed job url, return the whole page text excluding script and style
    Input:
        url: String
    Output:
        content: String
    """
    try:
        html = urllib2.urlopen(url).read()  # Connect to the job posting
    except:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    # Reference for this step: https://jessesw.com/Data-Science-Skills/
    for script in soup(["script", "style"]):
        script.extract()  # Remove these two elements from the BS4 object to get clean text
    content = soup.getText().lower()
    return content


def extractUseful(content):
    if type(content) == float:  # a float here means NaN, i.e. missing content
        return "notok"
    else:
        content = content.replace("\r", " ").replace("\n", " ")
        startwords = ["qualification", "responsibility", "require", "skill", "role", "experience", "demonstrate"]
        start = set([content.find(i) for i in startwords])
        if (-1 in start):  # find() returns -1 when a word is not found
            start.remove(-1)
        if (len(start) != 0):  # if at least one of the words is found
            start_pos = min(start)
            end_pos = content.find("days ago") - 3  # -3 so the posting-age number is dropped as well, if possible
            return content[start_pos:end_pos]
        else:
            return "notok"


def process(text, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Normalizes case and handles punctuation
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
            (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        str: the lemmatized tokens, minus the filtered words, joined back into one string
    """
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    word_list = nltk.word_tokenize(text)
    lemma_list = []
    for i in word_list:
        if i not in filters:
            try:
                lemma = lemmatizer.lemmatize(i)
                lemma_list.append(str(lemma))
            except:
                pass
    return " ".join(lemma_list)


if __name__ == '__main__':
    # construct the filter for the processor
    file = open("accountant.txt").read().lower()
    filters = set(nltk.word_tokenize(file))
    filters.update(nltk.corpus.stopwords.words('english'))
    filters = list(filters)

    # webcrawling
    dataJobs = pd.read_csv("dataJobs.csv")
    webContent = []
    for i in dataJobs["url"]:
        content = webCrawl(i)
        webContent.append(content)

    # clean the crawled text
    cleaned_list = []
    for j in webContent:
        cleaned = extractUseful(j)
        processed = process(cleaned, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer())
        cleaned_list.append(processed)

    # save to csv
    contents = pd.DataFrame({"Content": webContent, "Cleaned": cleaned_list})
    contents.to_csv("webcrawled.csv")
    dataJobs[['jd']]= cleaned_list
    dataJobs.to_csv("dataJobs_v2_crawled.csv")
dataJobs.py (with the Indeed API publisher key removed):
__author__ = 'Yiyou'

import sys
import urllib2
import pandas as pd
import xml
import xml.etree.ElementTree as ET
import numpy as np

reload(sys)
sys.setdefaultencoding('utf-8')


def getTotalResults(query):
    """Obtain the total number of jobs for a given query
    Inputs:
        string: query, terms separated by +
    Outputs:
        int: the total number of jobs for the query
    """
    # form the url
    query = "\"" + query + "\""  # double quotes mean the exact title is queried
    url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q=" + query + "&l=&sort=&radius=&st=&jt=fulltime&start=0&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
    #url = 'http://api.indeed.com/ads/apisearch?publisher=8710117352111766&v=2&limit=100000&format=json
    # read the website
    response = urllib2.urlopen(url)
    content = response.read()
    # parse the XML
    root = ET.fromstring(content)
    num = int(root.find('totalresults').text)
    return num


def indeedrequest(query, start):
    """Form the url from the query and start number
    Input:
        query: String, job title; double quotes mean exact wording in the title
        start: int, for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    Output:
        content: String, the XML read from the constructed API url
    """
    query = "\"" + query + "\""
    url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q=" + query + "&l=&sort=&radius=&st=&jt=fulltime&start=" + str(start) + "&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
    response = urllib2.urlopen(url)
    content = response.read()
    return(content)


def parseXMLtoDF(query, startNo):
    """Parse the XML and return a dataframe of the 25 job results on the page
    Input:
        query: String, job title; double quotes mean exact wording in the title
        startNo: int, for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    Output:
        positionDB: a dataframe containing all job details from the XML page
    """
    # Read and parse the XML
    content = indeedrequest(query, startNo)
    root = ET.fromstring(content)
    # Iterate through the result nodes and store them in a dataframe
    position_nodes = root.iter('result')  # an iterator over the 25 XML-formatted job records
    positionDB = pd.DataFrame()
    for position_node in position_nodes:  # iterate through the 25 XML-formatted jobs
        position = position_node.getchildren()  # all tags and their content for one particular job
        # construct a row for the dataframe
        row = dict()
        for jd in position:  # iterate through all tags
            row[jd.tag] = jd.text
        # append the row to positionDB
        positionDB = positionDB.append(row, ignore_index=True)
    return(positionDB)


def queryJobs(query):
    """Given a query, obtain as many job results as the API can return
    Input:
        query: String, job title; double quotes mean exact wording in the title
    Output:
        dataframe containing all the job details and the query
    """
    total = min(1025, getTotalResults(query))  # the API returns at most 1025 records
    start = 0  # for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    jobs = []
    while (start <= total):
        jobs.append(parseXMLtoDF(query, start))  # append the dataframe for each page to jobs
        start += 25  # "turn the page"
    allDf = pd.concat(jobs)  # concatenate all the dataframes into one
    allDf['query'] = query  # record the query
    return allDf


def queryAllJobs(queries):
    """Given a list of queries, obtain as many job results as the API can return
    Input:
        queries: List of String, job titles; double quotes mean exact wording in the title
    Output:
        dataframe containing all the job details and queries
    """
    dataJobs = []
    for i in queries:
        dataJobs.append(queryJobs(i))
    dataJobs = pd.concat(dataJobs)
    # drop duplicated records from the dataframe, using the unique jobkey
    dataJobs = dataJobs.drop_duplicates(subset="jobkey", keep="first")
    return dataJobs


if __name__ == '__main__':
    publisher_key = " "
    data = ["data+scientist", "data+engineer", "data+analyst", "business+analyst", "marketing+analyst", "machine+learning", "mechanical+engineer"]
    queryAllJobs(data).to_csv("dataJobs.csv")
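For reference on the dataJobs.py side, the two lines that mention jd use it only as a local loop variable over the XML tags of one job result; a tiny sketch of that parsing pattern (with a made-up XML snippet, not real API output) would be:

import xml.etree.ElementTree as ET

# hypothetical, simplified stand-in for one <result> element from the API
snippet = "<result><jobtitle>Data Scientist</jobtitle><company>Acme</company></result>"
result = ET.fromstring(snippet)

row = dict()
for jd in result:           # 'jd' here is just a loop variable over the child tags
    row[jd.tag] = jd.text   # -> {'jobtitle': 'Data Scientist', 'company': 'Acme'}
print(row)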