KeyError: "['jd'] not in index"

Asked: 2017-01-08 19:26:49

Tags: python python-2.7

Project: https://github.com/yoyolin/DataScienceJobs

Question: What is ['jd'] and how do I fix the error?

I am trying to run this author's Python code to scrape Indeed.com and generate my own results.

  1. I ran dataJobs.py. It ran successfully. Output: dataJobs.csv.
  2. I ran webCrawl.py. It errors on the last 2 lines of code. Output: webcrawler.csv; no dataJobs_v2_crawled.csv was produced.
  3. This is where I get the error:

    Traceback (most recent call last):
      File "webCrawl.py", line 100, in <module>
        dataJobs[['jd']]= cleaned_list
      File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2414, in __setitem__
        self._setitem_array(key, value)
      File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2442, in _setitem_array
        indexer = self.ix._convert_to_indexer(key, axis=1)
      File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1230, in _convert_to_indexer
        raise KeyError('%s not in index' % objarr[mask])
    KeyError: "['jd'] not in index"
    

The error is generated by line 100 in webCrawler.py:

      dataJobs[['jd']]= cleaned_list
    

['jd'] only appears once in the whole file. Also, I am not sure what the author intended it to represent. It also shows up in 2 lines of dataJobs.py, but that has no effect on webCrawler.py, right?
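
From what I understand, the double brackets in dataJobs[['jd']] tell pandas to select a list of existing columns, and the KeyError seems to mean there is no column named 'jd' in dataJobs.csv. Below is a minimal sketch that reproduces the error for me; the toy data and the single-bracket assignment at the end are only my guess at what the author intended, not something taken from the repo:

    import pandas as pd

    # Toy stand-in for dataJobs.csv -- note there is no 'jd' column yet.
    dataJobs = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})
    cleaned_list = ["some cleaned text", "more cleaned text"]

    try:
        # Double brackets treat ['jd'] as a list of existing columns to assign to,
        # which is why my pandas raises KeyError: "['jd'] not in index".
        dataJobs[['jd']] = cleaned_list
    except KeyError as e:
        print(e)

    # Single brackets create the new column from the list; this is my guess at
    # what the author meant by 'jd' (short for "job description", maybe?).
    dataJobs['jd'] = cleaned_list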

webcrawler.py:

    # -*- coding: utf-8 -*-
    __author__ = 'Yiyou'
    
    import sys
    from bs4 import BeautifulSoup
    import re
    import pandas as pd
    import urllib2
    import nltk
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    def webCrawl(url):
        """Given an indeed job url, return the whole text excluding script and style
        Input:
            url: String
        Output:
            content: String
        """
        try:
            html = urllib2.urlopen(url).read() # Connect to the job posting
        except:
            return ""

        soup = BeautifulSoup(html, "html.parser")

        # Reference for this step: https://jessesw.com/Data-Science-Skills/
        for script in soup(["script", "style"]):
            script.extract() # Remove these two elements from the BS4 object to get clean text
        content = soup.getText().lower()
        return content
    
    def extractUseful (content):
        if type(content) == float: #i
            return "notok"
        else:
            content = content.replace("\r"," ").replace("\n", " ")
            startwords = ["qualification", "responsibility", "require", "skill", "role", "experience", "demonstrate"]
            start = set([content.find(i) for i in startwords])
            if (-1 in start): #if doesn't find then it will be -1
                start.remove(-1)
            if (len(start) != 0): #if at least one of words is found
                start_pos = min(start)
                end_pos = content.find("days ago")-3 #end pos -3 is because we want to eliminate number if possible
                return content[start_pos:end_pos]
            else:
                return "notok"
    
    def process(text, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
        """ Normalizes case and handles punctuation
        Inputs:
            text: str: raw text
            lemmatizer: an instance of a class implementing the lemmatize() method
                        (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
        Outputs:
            list(str): tokenized text
        """
        lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
        word_list = nltk.word_tokenize(text);

        lemma_list = [];
        for i in word_list:
            if i not in filters:
                try:
                    lemma = lemmatizer.lemmatize(i);
                    lemma_list.append(str(lemma));
                except:
                    pass
        return " ".join(lemma_list)
    
    
    if __name__ == '__main__':
        #construct filter for processor
        file = open("accountant.txt").read().lower()
        filters = set(nltk.word_tokenize(file))
        filters.update(nltk.corpus.stopwords.words('english'))
        filters = list(filters)

        #webcrawling
        webContent = []
        dataJobs = pd.read_csv("dataJobs.csv");
        webContent = []
        for i in dataJobs["url"]:
            content = webCrawl(i);
            webContent.append(content);

        #clean the crawled text
        cleaned_list = []
        for j in webContent:
            cleaned = extractUseful(j);
            processed = process(cleaned, filters=nltk.corpus.stopwords.words('english'), lemmatizer=nltk.stem.wordnet.WordNetLemmatizer());
            cleaned_list.append(processed)

        #save to csv
        contents = pd.DataFrame({ "Content":webContent, "Cleaned": cleaned_list})
        contents.to_csv("webcrawled.csv")

        dataJobs[['jd']]= cleaned_list
        dataJobs.to_csv("dataJobs_v2_crawled.csv")
    
(The Indeed API key has been removed from the code.)

dataJobs.py:

    __author__ = 'Yiyou'
    import sys
    import urllib2
    import pandas as pd
    import xml
    import xml.etree.ElementTree as ET
    import numpy as np
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    def getTotalResults(query):
        """Obtain total number of jobs given a query
        Inputs:
            string: query, seperated by +
        Outputs:
            int: indicating no. of total jobs of the query
        """

        #form url
        query = "\"" + query + "\""   #double quotes mean it's querying exact title
        url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q="+query +"&l=&sort=&radius=&st=&jt=fulltime&start=0&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
        #url = 'http://api.indeed.com/ads/apisearch?publisher=8710117352111766&v=2&limit=100000&format=json
        #read website
        response = urllib2.urlopen(url)
        content = response.read()

        #parse XML
        root = ET.fromstring(content)
        num = int(root.find('totalresults').text)
        return num
    
    def indeedrequest(query, start):
        """form the url using query and startNo
        Input:
            query: String, job title, using double quotes means exact wording in the title
            startNo : int, for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25
        Output:
            content: String, the XML file read from constructed API url
        """
        query = "\"" + query + "\""
        url = "http://api.indeed.com/ads/apisearch?publisher=" + publisher_key + "&v=2&q="+query +"&l=&sort=&radius=&st=&jt=fulltime&start="+str(start)+"&limit=26&fromage=365&highlight=0&filter=&latlong=1&co=us&chnl=&userip=45.56.94.21&useragent=&v=2"
        response = urllib2.urlopen(url)
        content = response.read()
        return(content)
    
    def parseXMLtoDF(query, startNo):
        """parse xml file and then return a dataFrame of the 25 job results on the page
        Input:
            query: String, job title, using double quotes means exact wording in the title
            startNo : int, for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25
        Output:
            positionDB: a dataframe containing all job details from the XML page
        """

        #Read and parse XML file
        content = indeedrequest(query, startNo)
        root = ET.fromstring(content)

        #Iter through node result and store in dataframe
        position_nodes = root.iter('result') #obtain all 25 XML formated Job files as an iterator
        positionDB  = pd.DataFrame()

        for position_node in position_nodes: #iterate through 25 XML formatted jobs
            position = position_node.getchildren()  #obtain all tags and its content for one particular job

            #construct a row in the dataframe
            row = dict()
            for jd in position: #iterate through all tags
                row[jd.tag] = jd.text

            #append the row into positionDB
            positionDB = positionDB.append(row, ignore_index=True)

        return(positionDB)
    
    def queryJobs(query):
        """Given a query, obtain all the job results as much as the API could return
        Input:
            query: String, job title, using double quotes means exact wording in the title
        Output:
            dataframe, containing all the job details and query
        """
        total = min(1025,getTotalResults(query))  #as the API has a constrain at 1025 records to return at maximum
        start = 0 # for mannually "turning page" as the indeed API has a restriction on number of jobs returned per query at 25

        jobs = []
        while(start <= total):
            jobs.append(parseXMLtoDF(query, start)) #append dataframe on each page to jobs
            start += 25 #"turn the page"
        allDf =  pd.concat(jobs) #concate all the dataframe to one
        allDf['query'] = query #record the query

        return allDf
    
    def queryAllJobs(queries):
        """Given a list of queries, obtain all the job results as much as the API could return
        Input:
            queries: List of String, job title, using double quotes means exact wording in the title
        Output:
            dataframe, containing all the job details and query
        """
        dataJobs = []
        for i in queries:
            dataJobs.append(queryJobs(i));
        dataJobs = pd.concat(dataJobs)

        #drop duplicated record from the dataframe, given unique jobkey
        dataJobs = dataJobs.drop_duplicates(subset = "jobkey", keep = "first")
        return dataJobs
    
    if __name__ == '__main__':
        publisher_key = " "
        data = ["data+scientist", "data+engineer","data+analyst", "business+analyst","marketing+analyst", "machine+learning", "mechanical+engineer"]
        queryAllJobs(data).to_csv("dataJobs.csv")
    

0 Answers:

No answers yet.