我正在使用循环为每个国家/地区构建图表
import requests
import pandas as pd
import json
from pandas import read_html
from pandas.io.json import json_normalize
import urllib2, json
import html5lib
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Draw one figure per country: vaccination rate on the x axis against measles
# cases on the y axis, with an ordinary-least-squares fit drawn over the points.
for country in WHOData.columns:
    # Pivot the innermost row-index level into the columns and replace
    # missing values with 0.
    country_df = WHOData[[country]].unstack().fillna(0)

    # The second column level names the measure; pick columns by that name.
    measure = country_df.columns.get_level_values(1)
    rate = country_df.iloc[:, measure == 'Rate']
    cases = country_df.iloc[:, measure == 'Cases']
    x = np.array(rate)
    y = np.array(cases)

    plt.scatter(x, y)
    # add_constant extends the regressors so the model can include an intercept.
    results = sm.OLS(y, sm.add_constant(x)).fit()
    plt.plot(x, results.fittedvalues)

    color = 'red'
    plt.ylabel('Measles Cases', color=color)
    color = 'blue'
    plt.xlabel('Vaccination Rate', color=color)
    plt.title(country)
    plt.show()
我想删除 Cases 或 Rate 列中包含 0 的所有行。
我尝试了下面这段代码(用打印数据框代替绘制图表,以便查看发生了什么):
# Failed attempt, kept exactly as written: it raises the ValueError quoted
# after this snippet. The argument handed to drop() is a boolean DataFrame
# (the two "== 0" comparisons combined with |), whereas drop() looks its
# argument up as axis labels; the error message shows the mask's column labels
# being searched for and not found. Selecting rows with a boolean row mask is
# the usual way to discard such rows.
for country in WHOData.columns: # Iterate over countries
country_df = WHOData[[country]].unstack(); # Unstack deaths and rates
country_df = country_df.drop((country_df.iloc[:, country_df.columns.get_level_values(1)=='Rate']==0)|(country_df.iloc[:, country_df.columns.get_level_values(1)=='Cases']==0))
# Python 2 print statement; the break below stops after the first country.
print country_df
break
我收到了这个错误:
ValueError: labels [(u'Afghanistan', 'Cases') (u'Afghanistan', 'Rate')] not contained in axis
这实际上让我觉得我很接近 - 但也让我感到难过
感谢任何帮助
这是创建此数据框的代码:
# Sample of the data for five countries. Each key is a (country, measure)
# tuple, where measure is 'Cases', 'Pop' or 'Rate'; each value maps the years
# 1980-1985 to that measure's value for the country.
df = pd.DataFrame({(u'Afghanistan', 'Cases'): {1980: 32455.0,
1981: 31107.0,
1982: 20320.0,
1983: 18808.0,
1984: 16199.0,
1985: 14457.0},
(u'Afghanistan', 'Pop'): {1980: 13211000.0,
1981: 12894800.0,
1982: 12578600.0,
1983: 12262400.0,
1984: 11946200.0,
1985: 11630000.0},
(u'Afghanistan', 'Rate'): {1980: 11.0,
1981: 0.0,
1982: 8.0,
1983: 9.0,
1984: 14.0,
1985: 14.0},
(u'Albania', 'Cases'): {1980: 0.0,
1981: 0.0,
1982: 3.0,
1983: 17.0,
1984: 0.0,
1985: 0.0},
(u'Albania', 'Pop'): {1980: 2681000.0,
1981: 2738200.0,
1982: 2795400.0,
1983: 2852600.0,
1984: 2909800.0,
1985: 2967000.0},
(u'Albania', 'Rate'): {1980: 90.0,
1981: 90.0,
1982: 93.0,
1983: 96.0,
1984: 96.0,
1985: 96.0},
(u'Algeria', 'Cases'): {1980: 15527.0,
1981: 20849.0,
1982: 29584.0,
1983: 22126.0,
1984: 22553.0,
1985: 20114.0},
(u'Algeria', 'Pop'): {1980: 19338000.0,
1981: 19983600.0,
1982: 20629200.0,
1983: 21274800.0,
1984: 21920400.0,
1985: 22566000.0},
(u'Algeria', 'Rate'): {1980: 0.0,
1981: 0.0,
1982: 0.0,
1983: 0.0,
1984: 0.0,
1985: 68.0},
(u'Andorra', 'Cases'): {1980: 0.0,
1981: 0.0,
1982: 0.0,
1983: 0.0,
1984: 0.0,
1985: 0.0},
(u'Andorra', 'Pop'): {1980: 36000.0,
1981: 37800.0,
1982: 39600.0,
1983: 41400.0,
1984: 43200.0,
1985: 45000.0},
(u'Andorra', 'Rate'): {1980: 0.0,
1981: 0.0,
1982: 0.0,
1983: 0.0,
1984: 0.0,
1985: 0.0},
(u'Angola', 'Cases'): {1980: 29656.0,
1981: 19714.0,
1982: 30067.0,
1983: 22589.0,
1984: 22685.0,
1985: 22822.0},
(u'Angola', 'Pop'): {1980: 8212000.0,
1981: 8518600.0,
1982: 8825200.0,
1983: 9131800.0,
1984: 9438400.0,
1985: 9745000.0},
(u'Angola', 'Rate'): {1980: 0.0,
1981: 0.0,
1982: 0.0,
1983: 26.0,
1984: 35.0,
1985: 44.0}})
这是我实际运行的代码 - 它用于更大的数据框 - 大约200个国家/地区:
# Plot vaccination rate against measles cases for every country, using only
# the rows in which both values are greater than zero, and overlay an
# ordinary-least-squares fit.
df = WHOData.unstack().fillna(0)
for country in df.columns.get_level_values(0).unique():
    # Columns are (country, measure) tuples; keep this country's columns only.
    country_df = df[[c for c in df.columns if c[0] == country]]
    # Drop every row whose 'Cases' or 'Rate' value is not positive.
    for c in [c for c in country_df.columns if c[1] in ['Cases', 'Rate']]:
        country_df = country_df[country_df[c] > 0]

    if country_df.empty:
        # Filtering can remove every row (a country whose values are all
        # zero). There is then nothing to plot, and the regression cannot be
        # fitted on zero observations, so move on to the next country.
        continue

    measure = country_df.columns.get_level_values(1)
    cases = country_df.iloc[:, measure == 'Cases']
    rate = country_df.iloc[:, measure == 'Rate']
    x = np.array(rate)
    y = np.array(cases)

    plt.scatter(x, y)
    # add_constant extends the regressors so the model can include an intercept.
    results = sm.OLS(y, sm.add_constant(x)).fit()
    plt.plot(x, results.fittedvalues)
    plt.ylabel('Measles Cases', color='red')
    plt.xlabel('Vaccination Rate', color='blue')
    plt.title(country)
    plt.show()
答案 0 :(得分:1)
您可以遍历各个国家/地区。对于每个国家/地区,以元组的形式遍历其各列;对于元组中第二个元素为 Cases 或 Rate 的列,去掉该列中值不大于 0 的行:
# For each country, print its sub-frame restricted to the rows in which both
# the 'Cases' and the 'Rate' column are greater than zero.
for country in df.columns.get_level_values(0).unique():
    # Columns are (country, measure) tuples; select this country's columns.
    own_columns = [col for col in df.columns if col[0] == country]
    country_df = df[own_columns]
    for col in country_df.columns:
        if col[1] not in ['Cases', 'Rate']:
            continue
        # Narrow the frame one column at a time to the strictly positive rows.
        positive = country_df[col] > 0
        country_df = country_df[positive]
    print(country_df)
这给出了以下输出:
Afghanistan
Cases Pop Rate
1980 32455.0 13211000.0 11.0
1982 20320.0 12578600.0 8.0
1983 18808.0 12262400.0 9.0
1984 16199.0 11946200.0 14.0
1985 14457.0 11630000.0 14.0
Albania
Cases Pop Rate
1982 3.0 2795400.0 93.0
1983 17.0 2852600.0 96.0
Algeria
Cases Pop Rate
1985 20114.0 22566000.0 68.0
Empty DataFrame
Columns: [(Andorra, Cases), (Andorra, Pop), (Andorra, Rate)]
Index: []
Angola
Cases Pop Rate
1983 22589.0 9131800.0 26.0
1984 22685.0 9438400.0 35.0
1985 22822.0 9745000.0 44.0