Naver 爬虫:在 Python 中合并每次循环得到的 DataFrame

时间:2017-12-03 15:29:19

标签: python join dataframe merge web-crawler

我正在开发一个 Naver(相当于韩国的谷歌 :P)爬虫。这段代码我已经写了一个星期,现在只剩最后一个问题要解决!下面的代码通过 Naver API 抓取数据,并在每次循环中把返回的数据存入 "js"。我需要做的是把每次循环得到的数据帧(dfdfdf)在最后合并到一起,但我的结果总是只包含最后一次循环的数据。归根结底,我想把每次循环得到的 DataFrame 都累加起来。我试过 merge 和 join,但似乎都不起作用。如果下面的代码有不合理(或太乱)的地方,也请告诉我!

import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time

# Load the keyword hierarchy from the first worksheet of the workbook.
workbook = pd.ExcelFile('mat_hierarchy.xlsx')
ex = workbook.parse('Sheet1')

# Candidate keywords: every value of the fourth column followed by every
# value of the third column (positional columns 3 and 2).
DNA1 = list(ex.iloc[:, 3]) + list(ex.iloc[:, 2])

# Drop duplicates while keeping the first occurrence of each keyword in
# its original position (dict keys preserve insertion order).
DNA = list(dict.fromkeys(DNA1))

# len(DNA)

# Weekly observation dates: every Monday from 2016-01-01 up to two days ago.
# (Omitting ``freq`` would give a daily range over the same span instead.)
dd = pd.date_range(
    start='2016-01-01',
    end=datetime.now().date() - timedelta(days=2),
    freq='W-MON',
)

# The query window ends yesterday; the API body takes it as YYYY-MM-DD text.
setendDate = datetime.now().date() - timedelta(days=1)
endDate = setendDate.strftime('%Y-%m-%d')

# Column-less frame that carries only the weekly index; each response is
# joined onto it so every result shares the same rows.
Data = pd.DataFrame(index=dd)

# Naver API connection settings ("ID" / "PW" are presumably placeholders
# for real credentials).
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Fragments of the JSON request body. A full body is assembled as
#   body_intro + endDate + body_endDate
#   + <name> + body_keywords + <name>      (one keyword group)
#   [+ body_groupName + <name> + body_keywords + <name>] ...
#   + body_last
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
body_last = '"]}],"ages":["1","2","3","4","5","6","7","8","9","10","11"]}'

# Collects one joined DataFrame per API request.
df_list = []

# Query the API in batches of at most five keywords, starting at the
# hard-coded offset 2270.
# NOTE(review): 2270 looks like a resume point from an earlier run - confirm.
for i in range(2270, len(DNA), 5):
    # Slicing yields five keywords for every full batch and the remaining
    # len(DNA) % 5 keywords for the final one, which is exactly what the
    # five hand-written branches used to select.
    chunk = DNA[i:i + 5]

    # One '<name>","keywords":["<name>' group per keyword, separated by
    # body_groupName - byte-for-byte the body the branches built by hand.
    # NOTE: keywords are inserted verbatim (no JSON escaping).
    body = (body_intro + endDate + body_endDate
            + body_groupName.join(kw + body_keywords + kw for kw in chunk)
            + body_last)
    print(str(len(chunk)))

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        js = response.read().decode('utf-8') if rescode == 200 else None

    if js is None:
        # rescode is an int, so it must be converted before concatenation.
        # The batch is skipped instead of parsing a stale or undefined payload.
        print("Error Code:" + str(rescode))
    else:
        d = json.loads(js)
        # One single-column frame per keyword group. A group with no data
        # gets a lone NaN row at the request's start date so that its
        # column still appears in the output.
        lst = [
            pd.DataFrame(r['data']).set_index('period').rename(columns={'ratio': r['title']})
            if len(r['data']) > 0
            else pd.DataFrame([np.nan], columns=[r['title']], index=[d['startDate']])
            for r in d['results']
        ]
        # axis must be passed by keyword; newer pandas rejects it positionally.
        df = pd.concat(lst, axis=1)
        # The API's period labels are strings; convert them so the join
        # onto Data's DatetimeIndex does not rely on implicit coercion.
        df.index = pd.to_datetime(df.index)
        dfdfdf = Data.join(df)
        df_list.append(dfdfdf)

    print("end")
    time.sleep(.5)

# Combining all Data: every frame shares Data's index, so place them side
# by side (one column per keyword).
Final = pd.concat(df_list, axis=1)
Final.to_csv("Naver123.csv")

1 个答案:

答案 0 :(得分:1)

可以考虑把每次循环得到的数据帧收集到一个列表中,然后在 for 循环之外一次性连接。每次循环内部做的是水平合并(按列),而最后的总合并做的是垂直追加(按行)。

此外,为了让代码更符合 DRY 原则,可以定义一个函数把响应转换为数据帧,并把 body 变量作为参数传入——它是各个 if 分支之间唯一的区别。

...
def response_to_df(body):
    """POST one request body to the API and return the result as a DataFrame.

    Relies on the module-level ``url``, ``client_id``, ``client_secret``
    and ``Data`` defined earlier in the script.

    Args:
        body: JSON request body as a ``str``; it is UTF-8 encoded before
            being sent.

    Returns:
        ``Data`` left-joined with one column per entry of the response's
        ``results`` list, each column named after that entry's ``title``.

    Raises:
        RuntimeError: if the server answers with a status other than 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int, so it must be converted before concatenation.
            # Fail loudly: there is no payload to parse on this path.
            raise RuntimeError("Error Code:" + str(rescode))
        js = response.read().decode('utf-8')

    d = json.loads(js)
    # One single-column frame per keyword group. A group with no data gets
    # a lone NaN row at the request's start date, because an empty frame
    # has no 'period' column to index on.
    lst = [
        pd.DataFrame(r['data']).set_index('period').rename(columns={'ratio': r['title']})
        if len(r['data']) > 0
        else pd.DataFrame([np.nan], columns=[r['title']], index=[d['startDate']])
        for r in d['results']
    ]

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    # The API's period labels are strings; convert them so the join onto
    # Data's index does not rely on implicit coercion.
    df.index = pd.to_datetime(df.index)
    return Data.join(df)


df_list = []
# Walk DNA in steps of five. The start must be given explicitly:
# range(len(DNA), 5) would begin at len(DNA) and yield nothing for any
# list of five or more keywords.
for i in range(0, len(DNA), 5):
    # Five keywords for every full batch, the remaining len(DNA) % 5 for
    # the last one - the same selection the if/elif chain made by hand.
    chunk = DNA[i:i + 5]

    # One '<name>","keywords":["<name>' group per keyword, separated by
    # body_groupName; identical to the bodies the five branches spelled out.
    body = (body_intro + endDate + body_endDate
            + body_groupName.join(kw + body_keywords + kw for kw in chunk)
            + body_last)
    print(str(len(chunk)))

    tmp = response_to_df(body)
    df_list.append(tmp)


# Combining all Data (VERTICAL APPEND)
Naver = pd.concat(df_list, axis=0)
print("ddd")
Naver