加速 Python 抓取数据并写入 sqlite3

时间:2014-12-06 18:27:00

标签: python sqlite python-requests

我想从 NBA.com 的 JSON feed 中抓取 NBA 比赛的技术统计(box score)。我想知道是否有办法加速我当前的代码。

def _rowSet(resultSet):
    """Return the data rows held by one entry of ``boxObj['resultSets']``."""
    # NOTE(review): presumably every entry is a dict shaped like
    # {'name': ..., 'headers': [...], 'rowSet': [...]} -- confirm against a
    # live response.  Looking the rows up by key avoids relying on dict
    # ordering, which is what the positional ``.values()[1]`` depended on.
    if 'rowSet' in resultSet:
        return resultSet['rowSet']
    # Fall back to the original positional lookup; list() keeps it working
    # where dict.values() returns a view instead of a list.
    return list(resultSet.values())[1]


def ParsePlayerBox(boxObj):
    """Flatten the player rows of one decoded box-score feed.

    Args:
        boxObj: decoded JSON object.  Only ``boxObj['resultSets'][1]`` (first
            two rows; column 0 is parsed as a date, column 3 is used as the
            team id) and ``boxObj['resultSets'][4]`` (one 28-column row per
            player) are read.

    Returns:
        dict mapping ``"<gameId>_<plyrId>"`` to a 28-item list:
        [gameId, dateObj, teamId, plyrId, plyrName, oppId, isHome, position,
        comment, mp, fgm, fga, fg2m, fg2a, fg3m, fg3a, ftm, fta, orb, drb,
        trb, ast, stl, blk, tov, pf, pts, plusMinus].  ``teamId``, ``plyrId``
        and ``oppId`` are strings.  ``mp``, ``fg2m`` and ``fg2a`` are None for
        a player whose minutes string is None.  ``isHome`` and ``oppId`` are
        None when the row's team id matches neither team.

    Raises:
        ValueError: if a player row does not have exactly 28 columns.
    """
    statsDict = {}
    lineRows = _rowSet(boxObj['resultSets'][1])
    # The first row is treated as the home team, the second as the away team.
    homeId, awayId = lineRows[0][3], lineRows[1][3]
    # Keep only the part before 'T' so just the calendar date is parsed.
    dateObj = parse(lineRows[0][0].split('T')[0]).date()

    for row in _rowSet(boxObj['resultSets'][4]):
        # Unpacking every column documents the row layout and rejects rows
        # of unexpected width; several columns are intentionally unused.
        (gameId, teamId, teamAbbr, teamCity, plyrId, plyrName, posStr,
         comment, mpStr, fgm, fga, fgPerc, fg3m, fg3a, pgPerc, ftm, fta,
         ftPerc, orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus) = row
        plyrGameId = gameId + '_' + str(plyrId)

        if mpStr is not None:  # player played in the game
            fg2m, fg2a = fgm - fg3m, fga - fg3a
            minutes, seconds = [int(part) for part in mpStr.split(':')]
            # 60.0 forces true division; with integer division the seconds
            # were silently dropped on Python 2.
            mp = minutes + seconds / 60.0
        else:  # player did not play
            fg2m, fg2a, mp = None, None, None

        position = np.nan if posStr == '' else posStr

        if teamId == homeId:
            isHome, oppId = True, awayId
        elif teamId == awayId:
            isHome, oppId = False, homeId
        else:
            # Previously these names were left unbound (first row) or kept
            # the previous player's values; record "unknown" explicitly.
            isHome, oppId = None, None

        statsDict[plyrGameId] = [
            gameId, dateObj, str(teamId), str(plyrId), plyrName,
            None if oppId is None else str(oppId), isHome, position,
            comment.strip(), mp, fgm, fga, fg2m, fg2a, fg3m, fg3a, ftm, fta,
            orb, drb, trb, ast, stl, blk, tov, pf, pts, plusMinus,
        ]
    return statsDict



def ImportBoxScores(start, end):
    """Download box scores and store their player rows in ``nba.db``.

    For every game number in ``range(start, end)`` the box-score URL is
    fetched, decoded as JSON, flattened with ``ParsePlayerBox`` and inserted
    into the ``NBAPLYRBOXTEST`` table (created if missing).  One progress
    line (game number and current time) is printed per game.

    Args:
        start: first game number to import (inclusive).
        end: game number to stop at (exclusive).
    """
    stmt = "INSERT INTO  NBAPLYRBOXTEST VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    con = sqlite3.connect("nba.db")
    try:
        con.execute("CREATE TABLE IF NOT EXISTS NBAPLYRBOXTEST(PlyrGameId, GameId, DateObj, TeamId, PlyrId, PlyrName, OppId, IsHome, Position, Comment, MP, FGM, FGA, FG2M, FG2A, FG3M, FG3A, FTM, FTA, ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS, PLUS_MINUS)")
        # A single Session lets consecutive requests to the same host reuse
        # one connection instead of reconnecting for every game.
        with requests.Session() as session:
            # The loop variable is not named gameId so that it cannot be
            # confused with the per-row game id returned by ParsePlayerBox.
            for gameNum in range(start, end):
                urlBox = 'http://stats.nba.com/stats/boxscore?GameID=00' + str(gameNum) + '&RangeType=0&StartPeriod=0&EndPeriod=0&StartRange=0&EndRange=0'
                rBox = session.get(urlBox)
                boxObj = json.loads(rBox.content)
                playerBoxScore = ParsePlayerBox(boxObj)
                # Each row is the dict key (PlyrGameId) followed by the 28
                # stat values, matching the 29 table columns.
                rows = [[plyrGameId] + list(stats)
                        for plyrGameId, stats in playerBoxScore.items()]
                # Insert the whole game in one executemany call and commit
                # once per game: committing after every player row paid the
                # transaction cost once per row.
                con.executemany(stmt, rows)
                con.commit()
                # Formatted as a single string so the statement is valid
                # and prints the same text on Python 2 and Python 3.
                print('%s %s' % (gameNum, datetime.datetime.now().time()))
    finally:
        # Close inside the function; the module-level close referenced a
        # name that only exists here.
        con.close()

导入一场比赛的全部统计数据大约需要 11 秒。我还想抓取每场比赛的逐回合(play-by-play)数据,每场比赛大约有 400 个回合。我不太了解 SQL,所以希望能有更好的方法来导入数据。

谢谢

0 个答案:

没有答案