def visit_table_links():
links = grab_initial_links()
df_final = None
for obi in links:
resp = requests.get(obi[1])
tree = html.fromstring(resp.content)
dflist = []
for attr in tree.xpath('//th[contains(normalize-space(text()), "sometext")]/ancestor::table/tbody/tr'):
population = attr.xpath('normalize-space(string(.//td[2]))')
try:
population = population.replace(',', '')
population = int(population)
year = attr.xpath('normalize-space(string(.//td[1]))')
year = re.findall(r'\d+', year)
year = ''.join(year)
year = int(year)
#appending a to a list, 3 values first two integer last is string
dflist.append([year, population, obi[0]])
except Exception as e:
pass
#creating a dataframe which works fine
df = pd.DataFrame(dflist, columns = ['Year', 'Population', 'Municipality'])
#first time df_final is none so just make first df = df_final
#next time df_final is previous dataframe so concat with the new one
if df_final != None:
df_final = pd.concat(df_final, df)
else:
df_final = df
visit_table_links()
这是即将到来的数据帧
第一个数据框
Year Population Municipality
0 1970 10193 Cape Coral
1 1980 32103 Cape Coral
2 1990 74991 Cape Coral
3 2000 102286 Cape Coral
4 2010 154305 Cape Coral
5 2018 189343 Cape Coral
第二个数据帧
Year Population Municipality
0 1900 383 Clearwater
1 1910 1171 Clearwater
2 1920 2427 Clearwater
3 1930 7607 Clearwater
4 1940 10136 Clearwater
5 1950 15581 Clearwater
6 1960 34653 Clearwater
7 1970 52074 Clearwater
8 1980 85170 Clearwater
9 1990 98669 Clearwater
10 2000 108787 Clearwater
11 2010 107685 Clearwater
12 2018 116478 Clearwater
尝试连接它们会导致此错误
ValueError Traceback (most recent call last)
<ipython-input-93-429ad4d9bce8> in <module>
75
76
---> 77 visit_table_links()
78
79
<ipython-input-93-429ad4d9bce8> in visit_table_links()
62 print(df)
63
---> 64 if df_final != None:
65 df_final = pd.concat(df_final, df)
66 else:
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __nonzero__(self)
1476 raise ValueError("The truth value of a {0} is ambiguous. "
1477 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
-> 1478 .format(self.__class__.__name__))
1479
1480 __bool__ = __nonzero__
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
我搜索了很多线程并用尽了我的资源,我是熊猫的新手,不了解为什么会发生这种情况,
首先我认为这是因为索引重复,然后我将uuid.uuid4.int()用作索引
使用df.set_index('ID', drop=True, inplace=True)
仍然是相同的错误。
任何指导都会非常有帮助,谢谢。
编辑:1
很抱歉,不清楚 错误是由
生成的df_final = pd.concat(df_final, df)
当我尝试将当前数据框与上一个数据框合并时
编辑2:
将参数作为列表传递
df_final = pd.concat([df_final, df])
仍然相同的错误
答案 0 :(得分:1)
尝试使用import numpy as np
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from matplotlib import pyplot as plt
# =============================================================================
# An example costum model
# =============================================================================
class mean_stacker():
def __init__(self, desc = 'Simple Mean Stacker'):
self.description = desc
def predict(self, X):
check_is_fitted(self, ['coef_'])
X = check_array(X)
return np.mean(X, axis = 1)
def fit(self, X, y):
X, y = check_X_y(X, y)
self.coef_ = 'mean'
return self
# =============================================================================
# test data
# =============================================================================
yhats = np.array([[11.64543231, 11.49851957, 11.89059499, 11.77613068],
[12.0166365 , 12.18640595, 11.89059499, 12.03356647],
[11.91435714, 12.00392321, 11.89059499, 12.00279713],
[11.74216858, 11.57740889, 11.89059499, 11.57306004],
[11.9827991 , 12.09409814, 11.89059499, 12.14146709],
[11.64009661, 11.55337117, 11.89059499, 11.539958 ],
[11.8658174 , 11.93479133, 11.89059499, 11.88695717],
[11.53478821, 11.24788878, 11.89059499, 11.47217846],
[12.03600978, 12.16789499, 11.89059499, 12.09874916],
[12.07294432, 12.20473012, 11.89059499, 12.20545864],
[11.62189652, 11.34157305, 11.89059499, 11.4359684 ],
[11.5167136 , 11.66579694, 11.89059499, 11.58799839]])
ytrue = np.array(
[[11.6524265 ],
[11.89470562],
[12.12673719],
[12.7966 ],
[11.86452555],
[11.85743673],
[11.45650325],
[11.96433224],
[12.33647352],
[11.96876678],
[11.24377724],
[11.2209676 ]])
# =============================================================================
# Define set of stacker models, wrap them into pipelines and fit
# =============================================================================
stacker_models =[
Lasso(alpha = 0.0005, random_state = 4),
mean_stacker()
]
stacker = []
for model in stacker_models:
stkr = make_pipeline(StandardScaler(), model)
stkr.fit(yhats, ytrue)
stacker.append(stkr)
# =============================================================================
# plot predicts from models
# the costum model does not rescale the predicts correctly
# =============================================================================
plt.plot(ytrue, label = 'ytrue')
plt.plot(stacker[0].predict(yhats), label = 'yLasso')
plt.plot(stacker[1].predict(yhats), label = 'yMean')
plt.legend()
而不是df_final != None
。
此外,在len(df_final) == 0
命令中,尝试将参数作为列表传递,即pd.concat
答案 1 :(得分:0)
来自Sajan对len(df_final) == 0
的建议
我有一个想法,如果我最初将df_final值设置为None或具有相同列的空数据框,这会有所不同吗?
结果是
这是新代码
def visit_table_links():
links = grab_initial_links()
df_final = pd.DataFrame(columns=['Year', 'Population', 'Municipality'])
for obi in links:
resp = requests.get(obi[1])
tree = html.fromstring(resp.content)
dflist = []
for attr in tree.xpath('//th[contains(normalize-space(text()), "sometext")]/ancestor::table/tbody/tr'):
population = attr.xpath('normalize-space(string(.//td[2]))')
try:
population = population.replace(',', '')
population = int(population)
year = attr.xpath('normalize-space(string(.//td[1]))')
year = re.findall(r'\d+', year)
year = ''.join(year)
year = int(year)
dflist.append([year, population, obi[0]])
except Exception as e:
pass
df = pd.DataFrame(dflist, columns = ['Year', 'Population', 'Municipality'])
df_final = pd.concat([df_final, df])
visit_table_links()
由于某些原因,设置df_final = None
会使大熊猫抛出该错误
即使在第一次迭代中,我在df_final = df
不存在时也分配了df_final
因此,在下一次迭代中,最初df_final
的内容是什么
由于某些原因确实很重要
因此,在这行df_final = pd.DataFrame(columns=['Year', 'Population', 'Municipality'])
中插入的这一行df_final = None
解决了该问题。
这是合并的数据框
Year Population Municipality
0 1970 10193 Cape Coral
1 1980 32103 Cape Coral
2 1990 74991 Cape Coral
3 2000 102286 Cape Coral
4 2010 154305 Cape Coral
5 2018 189343 Cape Coral
0 1900 383 Clearwater
1 1910 1171 Clearwater
2 1920 2427 Clearwater
3 1930 7607 Clearwater
4 1940 10136 Clearwater
5 1950 15581 Clearwater
6 1960 34653 Clearwater
7 1970 52074 Clearwater
8 1980 85170 Clearwater
9 1990 98669 Clearwater
10 2000 108787 Clearwater
11 2010 107685 Clearwater
12 2018 116478 Clearwater
0 1970 1489 Coral Springs
1 1980 37349 Coral Springs
2 1990 79443 Coral Springs
3 2000 117549 Coral Springs
4 2010 121096 Coral Springs
5 2018 133507 Coral Springs