I can't get pandas to export some web-scraped data in the format I want. I want to visit each URL in URLs, grab various elements from that page, and put them into an Excel spreadsheet with specified column names. Then I want to move on to the next URL in URLs and put its data on the next row of the sheet, so that I end up with an Excel sheet with 6 columns and three rows of data, one per plant (each plant is on a separate URL).
At the moment I get the error ValueError: Length mismatch: Expected axis has 18 elements, new values have 6 elements, because the new records are being laid out side by side horizontally rather than on new rows, which is not what pandas expects.
Can anyone help? Thanks
import csv
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
from urllib2 import urlopen
import bs4
from bs4 import BeautifulSoup

URLs = ["http://adbioresources.org/map/ajax-single/27881",
        "http://adbioresources.org/map/ajax-single/27967",
        "http://adbioresources.org/map/ajax-single/27880"]

mylist = []
for plant in URLs:
    soup = BeautifulSoup(urlopen(plant),'lxml')
    table = soup.find_all('td')
    for td in table:
        mylist.append(td.text)
    heading2 = soup.find_all('h2')
    for h2 in heading2:
        mylist.append(h2.text)
    para = soup.find_all('p')
    for p in para:
        mylist.append(p.text)

df = pd.DataFrame(mylist)
transposed_df = df.T
transposed_df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']

writer = ExcelWriter('Pandas-Example.xlsx')
transposed_df.to_excel(writer,'Sheet1',index=False)
writer.save()
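To make the mismatch concrete: mylist is one flat list, so three plants times six fields gives 18 items in a single column, which transposes to one row of 18 columns, and the six column names no longer fit. A minimal sketch with placeholder values standing in for the scraped text:

import pandas as pd

# placeholder values standing in for the six fields scraped from each of three pages
flat = ['v%d' % n for n in range(18)]             # one flat list of 18 items
nested = [flat[i:i+6] for i in range(0, 18, 6)]   # three sub-lists of 6 items each

print(pd.DataFrame(flat).T.shape)   # (1, 18) -> assigning 6 column names raises the ValueError
print(pd.DataFrame(nested).shape)   # (3, 6)  -> 6 column names fit, one row per plant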
Answer 0 (score: 1)
masterlist = []
i = 0
for plant in URLs:
    sublist = []    # one sub-list of fields per plant
    soup = BeautifulSoup(urlopen(plant),'lxml')
    table = soup.find_all('td')
    for td in table:
        sublist.append(td.text)
    heading2 = soup.find_all('h2')
    for h2 in heading2:
        sublist.append(h2.text)
    para = soup.find_all('p')
    for p in para:
        sublist.append(p.text)
    masterlist.append(sublist)   # append the whole row, not the individual fields
    i = i + 1
    print i

df = pd.DataFrame(masterlist)    # one row per sub-list, i.e. one row per plant
df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']
writer = ExcelWriter('Pandas-Example.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
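The imports in the question target Python 2 (urllib2, print i). On Python 3 the same fix would look roughly like the sketch below, using urllib.request instead; it assumes each page still yields exactly six fields in this order:

from urllib.request import urlopen      # Python 3 replacement for urllib2.urlopen
from bs4 import BeautifulSoup
import pandas as pd

masterlist = []
for plant in URLs:                       # URLs as defined in the question
    soup = BeautifulSoup(urlopen(plant), 'lxml')
    sublist = [td.text for td in soup.find_all('td')]
    sublist += [h2.text for h2 in soup.find_all('h2')]
    sublist += [p.text for p in soup.find_all('p')]
    masterlist.append(sublist)           # one row of six fields per plant

df = pd.DataFrame(masterlist,
                  columns=['Status','Type','Capacity','Feedstock','Address1','Address2'])
df.to_excel('Pandas-Example.xlsx', sheet_name='Sheet1', index=False)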
Answer 1 (score: 1)
I think what you are really trying to achieve is to extract tables from web pages at several URLs and then turn each table into a separate Excel sheet.
The code below might be the recipe for success! I ran it on Google Colaboratory. Hope it helps!
# for writing to excel(xlsx) we will be needing XlsxWriter, please install it first if you don't have it!
try:
    import xlsxwriter
except ModuleNotFoundError:
    print("XlsxWriter is not installed!!")
    get_ipython().system("pip install XlsxWriter")

# to scrape tables from a webpage
from urllib.parse import urlparse, urlsplit
import requests
import pandas as pd
import os

urls=["https://www.macrotrends.net/countries/IND/india/gdp-growth-rate",
      "http://www.inwea.org/wind-energy-in-india/wind-power-potential",
      "https://en.wikipedia.org/wiki/List_of_districts_in_India",
      "https://en.wikipedia.org/wiki/List_of_Indian_people_by_net_worth",
      "https://en.wikipedia.org/wiki/States_and_union_territories_of_India",
      "https://en.wikipedia.org/wiki/List_of_governors-general_of_India",
      "https://en.wikipedia.org/wiki/List_of_Indian_independence_activists",
      "https://en.wikipedia.org/wiki/List_of_Indian_Grammy_Award_winners_and_nominees",
      "https://en.wikipedia.org/wiki/List_of_Indian_Academy_Award_winners_and_nominees",
      "https://en.wikipedia.org/wiki/List_of_highest-grossing_Indian_films"
     ]
print(len(urls), "Urls Found")

# build the sheet name: remove _ and -, put in title case and remove spaces
def modify_name(my_str):
    replaced = my_str.replace("_", " ").replace("-", " ")
    return replaced.title().replace(" ", "")

# get all tables from a url
def get_dataframes(url):
    html = requests.get(url).content
    df_list = pd.read_html(html)
    # print(len(df_list), " Dataframes Returned")
    return df_list

# if a df is too small then don't add it
def filter_dfs(dfs_list, min_rows=10):
    new_dfs_list = []
    for each_df in dfs_list:
        if len(each_df) > min_rows:
            new_dfs_list.append(each_df)
    return new_dfs_list

# to avoid InvalidWorksheetName: Excel worksheet name 'StatesAndUnionTerritoriesOfIndia1' must be <= 31 chars.
def crop_name(name, thres=29):
    if len(name) < thres:
        return name
    else:
        return name[:thres]

# to get only the first n elements from a list
def crop_list(lst, thres=29):
    if len(lst) < thres:
        return lst
    else:
        return lst[:thres]

# converts urls to dataframes to excel sheets
# get_max = the maximum number of tables to keep from each url
# min_rows = the minimum number of rows a table must have to be saved to the excel sheet
# crop_name_thres = some sheets can end up with very long names, which blows up the code,
#                   so crop the sheet name to keep Excel happy
def urls_to_excel(urls, excel_path=None, get_max=10, min_rows=0, crop_name_thres=29):
    excel_path = os.path.join(os.getcwd(), "Excel_Multiple_Sheets_Output.xlsx") if excel_path is None else excel_path
    writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
    i = 0
    for url in urls:
        parsed = urlsplit(url)
        sheet_name = parsed.path.split('/')[-1]
        mod_sheet_name = crop_name(modify_name(sheet_name), thres=crop_name_thres)
        dfs_list = get_dataframes(url)
        filtered_dfs_list = filter_dfs(dfs_list, min_rows=min_rows)
        filtered_dfs_list = crop_list(filtered_dfs_list, thres=get_max)
        for each_df in filtered_dfs_list:
            print("Parsing Excel Sheet ", " : ", str(i) + mod_sheet_name)
            i += 1
            each_df.to_excel(writer, sheet_name=str(i) + mod_sheet_name, index=True)
    writer.save()

urls_to_excel(urls, get_max=1, min_rows=10)
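One caveat: ExcelWriter.save() was deprecated and then removed in recent pandas releases (2.0 and later), so the final writer.save() above would fail on a current environment. A minimal sketch of the adjustment, using a with-block so the file is written automatically; here sheets is a hypothetical mapping of sheet names to DataFrames:

# Assumes pandas >= 2.0, where ExcelWriter.save() no longer exists.
with pd.ExcelWriter("Excel_Multiple_Sheets_Output.xlsx", engine="xlsxwriter") as writer:
    for sheet_name, each_df in sheets.items():  # 'sheets' is a hypothetical {name: DataFrame} dict
        each_df.to_excel(writer, sheet_name=sheet_name, index=True)
# the workbook is saved when the with-block exits; no explicit save() call is needed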