Project: https://github.com/yoyolin/DataScienceJobs
Question: what is ['jd'], and how do I fix this error?
I am trying to run this author's Python code to scrape Indeed.com and generate my own results.
This is where I get the error:
Traceback (most recent call last):
  File "webCrawl.py", line 100, in <module>
    dataJobs[['jd']]= cleaned_list
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2414, in __setitem__
    self._setitem_array(key, value)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2442, in _setitem_array
    indexer = self.ix._convert_to_indexer(key, axis=1)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1230, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
The error is raised by line 100 of webCrawl.py:
dataJobs[['jd']]= cleaned_list
['jd'] pops up only this once in the entire script, and I am not sure what the author intended it to stand for ("job description", maybe?). The name jd also appears on two lines of dataJobs.py (see the small sketch after that file below), but that should have no effect on webCrawl.py, right?
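To make the question concrete, here is a minimal toy sketch (a hypothetical two-row DataFrame, not the author's data) of the two indexing forms involved; the double-bracket form is the one that raises the KeyError in the traceback above, since no 'jd' column exists yet, while the single-bracket form would simply create a new column:

import pandas as pd

# hypothetical stand-in for dataJobs: two fake postings
df = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})
cleaned_list = ["cleaned text a", "cleaned text b"]

# df[['jd']] = cleaned_list    # list-of-labels assignment: presumably hits the same
                               # KeyError "['jd'] not in index" on my pandas version,
                               # because no 'jd' column exists yet
df['jd'] = cleaned_list        # single label: creates a new 'jd' column,
                               # as long as the list has one entry per row
print(df)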
webCrawl.py:
# -*- coding: utf-8 -*-
__author__ = 'Yiyou'

import sys
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib2
import nltk

reload(sys)
sys.setdefaultencoding('utf-8')


def webCrawl(url):
    """Given an Indeed job url, return the whole page text excluding script and style
    Input:
        url: String
    Output:
        content: String
    """
    try:
        html = urllib2.urlopen(url).read()  # Connect to the job posting
    except:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    # Reference for this step: https://jessesw.com/Data-Science-Skills/
    for script in soup(["script", "style"]):
        script.extract()  # Remove these two elements from the BS4 object to get clean text
    content = soup.getText().lower()
    return content


def extractUseful(content):
    if type(content) == float:  # a float here means NaN, i.e. missing content
        return "notok"
    else:
        content = content.replace("\r", " ").replace("\n", " ")
        startwords = ["qualification", "responsibility", "require", "skill", "role", "experience", "demonstrate"]
        start = set([content.find(i) for i in startwords])
        if (-1 in start):  # find() returns -1 when a word is not found
            start.remove(-1)
        if (len(start) != 0):  # if at least one of the words is found
            start_pos = min(start)
            end_pos = content.find("days ago") - 3  # -3 so the posting-age number is dropped as well, if possible
            return content[start_pos:end_pos]
        else:
            return "notok"


def process(text, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Normalizes case and handles punctuation
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
            (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        str: the lemmatized tokens, minus the filtered words, joined back into one string
    """
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    word_list = nltk.word_tokenize(text)
    lemma_list = []
    for i in word_list:
        if i not in filters:
            try:
                lemma = lemmatizer.lemmatize(i)
                lemma_list.append(str(lemma))
            except:
                pass
    return " ".join(lemma_list)


if __name__ == '__main__':
    # construct the filter for the processor
    file = open("accountant.txt").read().lower()
    filters = set(nltk.word_tokenize(file))
    filters.update(nltk.corpus.stopwords.words('english'))
    filters = list(filters)

    # webcrawling
    dataJobs = pd.read_csv("dataJobs.csv")
    webContent = []
    for i in dataJobs["url"]:
        content = webCrawl(i)
        webContent.append(content)

    # clean the crawled text
    cleaned_list = []
    for j in webContent:
        cleaned = extractUseful(j)
        processed = process(cleaned, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer())
        cleaned_list.append(processed)

    # save to csv
    contents = pd.DataFrame({"Content": webContent, "Cleaned": cleaned_list})
    contents.to_csv("webcrawled.csv")
    dataJobs[['jd']]= cleaned_list
    dataJobs.to_csv("dataJobs_v2_crawled.csv")
dataJobs.py (with the Indeed API publisher key removed):
__author__ = 'Yiyou'

import sys
import urllib2
import pandas as pd
import xml
import xml.etree.ElementTree as ET
import numpy as np

reload(sys)
sys.setdefaultencoding('utf-8')


def getTotalResults(query):
    """Obtain the total number of jobs for a given query
    Inputs:
        string: query, terms separated by +
    Outputs:
        int: the total number of jobs for the query
    """
    # form the url
    query = "\"" + query + "\""  # double quotes mean the exact title is queried
    url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q=" + query + "&l=&sort=&radius=&st=&jt=fulltime&start=0&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
    #url = 'http://api.indeed.com/ads/apisearch?publisher=8710117352111766&v=2&limit=100000&format=json
    # read the website
    response = urllib2.urlopen(url)
    content = response.read()
    # parse the XML
    root = ET.fromstring(content)
    num = int(root.find('totalresults').text)
    return num


def indeedrequest(query, start):
    """Form the url from the query and start number
    Input:
        query: String, job title; double quotes mean exact wording in the title
        start: int, for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    Output:
        content: String, the XML read from the constructed API url
    """
    query = "\"" + query + "\""
    url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q=" + query + "&l=&sort=&radius=&st=&jt=fulltime&start=" + str(start) + "&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
    response = urllib2.urlopen(url)
    content = response.read()
    return(content)


def parseXMLtoDF(query, startNo):
    """Parse the XML and return a dataframe of the 25 job results on the page
    Input:
        query: String, job title; double quotes mean exact wording in the title
        startNo: int, for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    Output:
        positionDB: a dataframe containing all job details from the XML page
    """
    # Read and parse the XML
    content = indeedrequest(query, startNo)
    root = ET.fromstring(content)
    # Iterate through the result nodes and store them in a dataframe
    position_nodes = root.iter('result')  # an iterator over the 25 XML-formatted job records
    positionDB = pd.DataFrame()
    for position_node in position_nodes:  # iterate through the 25 XML-formatted jobs
        position = position_node.getchildren()  # all tags and their content for one particular job
        # construct a row for the dataframe
        row = dict()
        for jd in position:  # iterate through all tags
            row[jd.tag] = jd.text
        # append the row to positionDB
        positionDB = positionDB.append(row, ignore_index=True)
    return(positionDB)


def queryJobs(query):
    """Given a query, obtain as many job results as the API can return
    Input:
        query: String, job title; double quotes mean exact wording in the title
    Output:
        dataframe containing all the job details and the query
    """
    total = min(1025, getTotalResults(query))  # the API returns at most 1025 records
    start = 0  # for manually "turning the page", as the Indeed API returns at most 25 jobs per request
    jobs = []
    while (start <= total):
        jobs.append(parseXMLtoDF(query, start))  # append the dataframe for each page to jobs
        start += 25  # "turn the page"
    allDf = pd.concat(jobs)  # concatenate all the dataframes into one
    allDf['query'] = query  # record the query
    return allDf


def queryAllJobs(queries):
    """Given a list of queries, obtain as many job results as the API can return
    Input:
        queries: List of String, job titles; double quotes mean exact wording in the title
    Output:
        dataframe containing all the job details and queries
    """
    dataJobs = []
    for i in queries:
        dataJobs.append(queryJobs(i))
    dataJobs = pd.concat(dataJobs)
    # drop duplicated records from the dataframe, using the unique jobkey
    dataJobs = dataJobs.drop_duplicates(subset="jobkey", keep="first")
    return dataJobs


if __name__ == '__main__':
    publisher_key = " "
    data = ["data+scientist", "data+engineer", "data+analyst", "business+analyst", "marketing+analyst", "machine+learning", "mechanical+engineer"]
    queryAllJobs(data).to_csv("dataJobs.csv")
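For reference on the dataJobs.py side, the two lines that mention jd use it only as a local loop variable over the XML tags of one job result; a tiny sketch of that parsing pattern (with a made-up XML snippet, not real API output) would be:

import xml.etree.ElementTree as ET

# hypothetical, simplified stand-in for one <result> element from the API
snippet = "<result><jobtitle>Data Scientist</jobtitle><company>Acme</company></result>"
result = ET.fromstring(snippet)

row = dict()
for jd in result:           # 'jd' here is just a loop variable over the child tags
    row[jd.tag] = jd.text   # -> {'jobtitle': 'Data Scientist', 'company': 'Acme'}
print(row)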