我正在开发我的 Naver 爬虫(Naver 相当于韩国的谷歌 :P)。这段代码我已经写了一个星期,只剩最后一个问题没有解决!下面的代码通过 Naver API 抓取数据,并在每次循环中把返回的数据存入 `js`。我需要做的是把每次循环得到的数据帧(dfdfdf)都保存下来,并在最后把它们合并在一起,但我的结果总是只包含最后一次循环的数据。总之,我想把每次循环得到的 DataFrame 都追加进最终结果。我试过 merge 和 join,但似乎都不起作用。如果下面的代码难以理解(或者太乱),请告诉我!
import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time
# Read the first worksheet of the material hierarchy workbook.
ex = pd.ExcelFile('mat_hierarchy.xlsx').parse('Sheet1')

# Raw keyword candidates: every value of the column at position 3,
# followed by every value of the column at position 2.
DNA1 = [*ex.iloc[:, 3], *ex.iloc[:, 2]]

# Keep only the first occurrence of each keyword, preserving order.
seen = set()
DNA = []
for keyword in DNA1:
    if keyword in seen:
        continue
    seen.add(keyword)
    DNA.append(keyword)
# Date axis: one entry per Monday from 2016-01-01 up to two days ago.
# Daily alternative:
#   pd.date_range('2016-01-01', datetime.now().date() - timedelta(2))
dd = pd.date_range(
    start='2016-01-01',
    end=datetime.now().date() - timedelta(days=2),
    freq='W-MON',
)

# The request end date is yesterday, formatted as YYYY-MM-DD.
setendDate = datetime.now().date() - timedelta(days=1)
endDate = setendDate.isoformat()

# Empty frame carrying only the date axis; result columns are joined onto it.
Data = pd.DataFrame(index=dd)

# Naver API connection settings.
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Fragments of the JSON request body. A full body is assembled as
#   body_intro + <end date> + body_endDate
#   + <kw> + body_keywords + <kw> [+ body_groupName + <kw> + body_keywords + <kw>]...
#   + body_last
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
body_last = '"]}],"ages":["1","2","3","4","5","6","7","8","9","10","11"]}'
# One frame per API request; merged side by side after the loop.
df_list = []

# NOTE(review): iteration begins at keyword 2270 rather than 0, exactly as in
# the original loop -- presumably to resume an interrupted crawl; confirm.
START_INDEX = 2270
# Keywords sent per request; the last request carries whatever remains.
BATCH_SIZE = 5


def _build_body(keywords):
    """Return the JSON request body for one batch of keywords.

    Each keyword becomes its own group whose name is the keyword itself.
    Serialising with ``json.dumps`` keeps the body valid even when a keyword
    contains quotes or backslashes.
    """
    return json.dumps(
        {
            "startDate": "2016-01-01",
            "endDate": endDate,
            "timeUnit": "date",
            "keywordGroups": [
                {"groupName": keyword, "keywords": [keyword]}
                for keyword in keywords
            ],
            "ages": [str(age) for age in range(1, 12)],
        },
        ensure_ascii=False,
    )


def _fetch_results(body):
    """POST *body* to the DataLab endpoint and return the decoded JSON reply.

    Raises:
        RuntimeError: if the reply status is not 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # Previously this concatenated str + int (TypeError) and then
            # parsed a stale or undefined response; fail loudly instead.
            raise RuntimeError("Error Code:" + str(rescode))
        return json.loads(response.read().decode("utf-8"))


def _results_to_frame(d):
    """Build a frame with one column per keyword group from a decoded reply."""
    lst = []
    for r in d['results']:
        if r['data']:
            frame = (
                pd.DataFrame(r['data'])
                .set_index('period')
                .rename(columns={'ratio': r['title']})
            )
        else:
            # No data points for this keyword: keep its column, filled with NaN.
            frame = pd.DataFrame(
                [np.nan], columns=[r['title']], index=[d['startDate']]
            )
        lst.append(frame)
    # HORIZONTAL MERGE of the per-keyword columns (axis must be a keyword).
    df = pd.concat(lst, axis=1)
    # 'period' arrives as text while Data carries a DatetimeIndex; convert so
    # the join aligns on real dates.
    df.index = pd.to_datetime(df.index)
    return df


for i in range(START_INDEX, len(DNA), BATCH_SIZE):
    # Slicing naturally yields a shorter final batch, replacing the five
    # hand-written branches for 5/4/3/2/1 keywords.
    batch = DNA[i:i + BATCH_SIZE]
    print(len(batch))
    d = _fetch_results(_build_body(batch))
    df_list.append(Data.join(_results_to_frame(d)))
    print("end")
    # Short pause between consecutive requests.
    time.sleep(.5)

# Combining all Data: every frame shares Data's index, so place them side by side.
Final = pd.concat(df_list, axis=1)
Final.to_csv("Naver123.csv")
答案 0 :(得分:1)
考虑先在 for 循环中把各个数据帧收集到一个列表里,然后在循环之外一次性连接。每次循环内部执行的是水平合并,而最后的总合并执行的是垂直追加。
此外,为了更符合 DRY 原则,可以定义一个函数来把响应转换为数据帧,并把 body 变量作为参数传入——它是各个 if 分支之间唯一的区别。
...
def response_to_df(body):
    """Send one DataLab search request and return its results as a DataFrame.

    Uses the module-level ``url``, ``client_id``, ``client_secret`` and
    ``Data`` objects.

    Args:
        body: JSON request payload as a string.

    Returns:
        ``Data`` joined with one column per entry of the reply's ``results``
        list; each column is named after that entry's ``title``.

    Raises:
        RuntimeError: if the reply status is not 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # Previously this concatenated str + int (TypeError) and left
            # ``js`` unbound; fail loudly instead.
            raise RuntimeError("Error Code:" + str(rescode))
        js = response.read().decode('utf-8')

    d = json.loads(js)
    lst = []
    for r in d['results']:
        if r['data']:
            frame = (
                pd.DataFrame(r['data'])
                .set_index('period')
                .rename(columns={'ratio': r['title']})
            )
        else:
            # An empty 'data' list has no 'period' column to index on; keep
            # the keyword as a single NaN column instead of raising KeyError.
            frame = pd.DataFrame(
                {r['title']: [float('nan')]}, index=[d['startDate']]
            )
        lst.append(frame)

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    # 'period' arrives as text while Data carries a DatetimeIndex; convert so
    # the join aligns on real dates.
    df.index = pd.to_datetime(df.index)
    df = Data.join(df)
    return df
# One frame per API request, combined after the loop.
df_list = []

# range(len(DNA), 5) started at len(DNA) and never iterated for any list of
# five or more keywords; step through the list from the beginning instead.
for i in range(0, len(DNA), 5):
    # Slicing yields a shorter final batch, which replaces the separate
    # branches for 5/4/3/2/1 remaining keywords.
    batch = DNA[i:i + 5]
    # Each keyword contributes '<kw>","keywords":["<kw>'; consecutive groups
    # are separated by body_groupName, giving the same bytes as before.
    body = (
        body_intro + endDate + body_endDate
        + body_groupName.join(
            keyword + body_keywords + keyword for keyword in batch
        )
        + body_last
    )
    print(len(batch))
    tmp = response_to_df(body)
    df_list.append(tmp)

# Combining all Data (VERTICAL APPEND)
# NOTE(review): every batch frame shares Data's index but has different
# keyword columns, so axis=0 repeats the dates once per batch; use axis=1 if
# one row per date is wanted -- confirm the intended layout.
Naver = pd.concat(df_list, axis=0)
print("ddd")
Naver