Exporting a web-scraped table to Excel

Posted: 2019-01-24 15:03:01

Tags: python excel pandas

I can't get pandas to export some web-scraped data in the format I want.

I want to visit each URL in URLs, pull various elements out of that page, and put them into an Excel spreadsheet with specified column names. Then I want to move on to the next URL in URLs and put its data on the next row of the sheet, so that I end up with a spreadsheet of 6 columns and three rows of data, one row per plant (each plant is on a separate URL).

At the moment I get an error saying ValueError: Length mismatch: Expected axis has 18 elements, new values have 6 elements, because every scraped value is appended side by side into one flat list (3 plants × 6 fields = 18 values) instead of starting a new row, which is not the shape pandas expects when the six column names are assigned.

Can anyone help? Thanks.

import csv
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
from urllib2 import urlopen
import bs4
from bs4 import BeautifulSoup


URLs = ["http://adbioresources.org/map/ajax-single/27881",
"http://adbioresources.org/map/ajax-single/27967",
"http://adbioresources.org/map/ajax-single/27880"]

mylist = []

for plant in URLs:
    soup = BeautifulSoup(urlopen(plant),'lxml')

    table = soup.find_all('td')
    for td in table:
        mylist.append(td.text)

    heading2 = soup.find_all('h2')
    for h2 in heading2:
        mylist.append(h2.text)

    para = soup.find_all('p')   
    for p in para:
        mylist.append(p.text)

df = pd.DataFrame(mylist)
transposed_df = df.T
transposed_df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']
writer = ExcelWriter('Pandas-Example.xlsx')
transposed_df.to_excel(writer,'Sheet1',index=False)
writer.save()
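
A minimal sketch of one way to get one row per plant straight from the flat mylist, assuming every page really does yield exactly the same six values in the same order (the reshape call below is an illustration, not part of the original post):

import numpy as np
import pandas as pd

# mylist holds 3 plants x 6 fields = 18 values in one flat list;
# reshape it so that each plant becomes its own row before naming the columns
rows = np.array(mylist).reshape(-1, 6)
df = pd.DataFrame(rows, columns=['Status', 'Type', 'Capacity',
                                 'Feedstock', 'Address1', 'Address2'])
df.to_excel('Pandas-Example.xlsx', sheet_name='Sheet1', index=False)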

2 Answers:

Answer 0 (score: 1)

masterlist = []   # one entry per plant; each entry becomes one row in the sheet
i = 0
for plant in URLs:
    sublist = []  # the six scraped values for this plant

    soup = BeautifulSoup(urlopen(plant),'lxml')

    table = soup.find_all('td')
    for td in table:
        sublist.append(td.text)

    heading2 = soup.find_all('h2')
    for h2 in heading2:
        sublist.append(h2.text)

    para = soup.find_all('p')   
    for p in para:
        sublist.append(p.text)
    masterlist.append(sublist)

    i = i + 1
    print i 

df = pd.DataFrame(masterlist)
df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']
writer = ExcelWriter('Pandas-Example.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
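
Why this works: each plant's six values now go into their own sublist, so masterlist is a list of three six-element rows, and pd.DataFrame(masterlist) comes out as 3 rows by 6 columns, which matches the six column names. A hypothetical sanity check (not part of the original answer) you could run before writing the file:

# each plant should have contributed one 6-element row
assert len(masterlist) == len(URLs)
assert all(len(row) == 6 for row in masterlist)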

Answer 1 (score: 1)

I think what you are really trying to achieve is to extract the tables from web pages at multiple URLs and then turn each table into a separate Excel sheet.

The following code might be the recipe for success! I ran it on Google Colaboratory. Hope it helps!

#for writing to excel(xlsx) we will be needing XlsxWriter, please install it first if you don't have it!
try:
  import xlsxwriter   # the module itself is imported in lowercase
except ModuleNotFoundError:
  print("XlsxWriter is not installed!!")
  get_ipython().system("pip install XlsxWriter")

#to scrape a table from a webpage
from urllib.parse import urlparse,urlsplit
import requests
import pandas as pd
import os


urls=["https://www.macrotrends.net/countries/IND/india/gdp-growth-rate",
      "http://www.inwea.org/wind-energy-in-india/wind-power-potential",
      "https://en.wikipedia.org/wiki/List_of_districts_in_India",
      "https://en.wikipedia.org/wiki/List_of_Indian_people_by_net_worth",
      "https://en.wikipedia.org/wiki/States_and_union_territories_of_India",
      "https://en.wikipedia.org/wiki/List_of_governors-general_of_India",
      "https://en.wikipedia.org/wiki/List_of_Indian_independence_activists",
      "https://en.wikipedia.org/wiki/List_of_Indian_Grammy_Award_winners_and_nominees",
      "https://en.wikipedia.org/wiki/List_of_Indian_Academy_Award_winners_and_nominees",
      "https://en.wikipedia.org/wiki/List_of_highest-grossing_Indian_films"
      ]


print(len(urls),"Urls Found")

#convert the sheetname- remove _ and - , put title case and remove spaces
def modify_name(my_str):
  replaced=my_str.replace("_", " ").replace("-", " ")
  return replaced.title().replace(" ","")


#get all tables from a url
def get_dataframes(url):
  html = requests.get(url).content
  df_list = pd.read_html(html)
  #print(len(df_list)," Dataframes Returned")
  return df_list

#if df is too small then don't add it
def filter_dfs(dfs_list,min_rows=10):
  new_dfs_list=[]
  for each_df in dfs_list:
    if(len(each_df)>min_rows):
      new_dfs_list.append(each_df)
  return new_dfs_list

#to avoid InvalidWorksheetName: Excel worksheet name 'StatesAndUnionTerritoriesOfIndia1' must be <= 31 chars.
def crop_name(name,thres=29):
  if len(name)<thres:
    return name
  else:
    return name[:thres]

#to get first n elements from list only
def crop_list(lst,thres=29):
  if len(lst)<thres:
    return lst
  else:
    return lst[:thres]

#converts urls to dataframes to excel sheets
#get_max= get the maximum number of tables from each url
#min_rows= the minimum number of rows in each table to save it to the excel sheet
#crop_name_thres= some excel sheets can get quite huge sheet names which blows up the code
#so crop the sheet name for the better purpose

def urls_to_excel(urls,excel_path=None,get_max=10,min_rows=0,crop_name_thres=29):
  excel_path=os.path.join(os.getcwd(),"Excel_Multiple_Sheets_Output.xlsx") if excel_path==None else excel_path
  writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
  i=0
  for url in urls:
    parsed=urlsplit(url)
    sheet_name=parsed.path.split('/')[-1]
    mod_sheet_name=crop_name(modify_name(sheet_name),thres=crop_name_thres)

    dfs_list=get_dataframes(url)
    filtered_dfs_list=filter_dfs(dfs_list,min_rows=min_rows)
    filtered_dfs_list=crop_list(filtered_dfs_list,thres=get_max)
    for each_df in filtered_dfs_list:
      print("Parsing Excel Sheet "," : ",str(i)+mod_sheet_name)
      i+=1
      each_df.to_excel(writer, sheet_name=str(i)+mod_sheet_name, index=True)
  writer.save()
urls_to_excel(urls,get_max=1,min_rows=10)
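
For reference, a hypothetical variant of that final call (the file name and thresholds below are assumptions, not part of the original answer) that keeps up to three tables per page, skips tables shorter than five rows, and writes to an explicitly chosen file:

# keep at most 3 tables per URL, drop tables with fewer than 5 rows,
# and write to a chosen path instead of the default in the working directory
urls_to_excel(urls,
              excel_path="india_tables.xlsx",
              get_max=3,
              min_rows=5)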