如何将此网页抓取(数据)的输出放入pandas DataFrame?
下面的代码只显示一行“数据”,它是垂直排列的。如何使DataFrame看起来像print(数据) - 即。水平?
而且,我是一个新手 - 这是'非pythonic'。请建议如何将其转换为函数(?)?
我尝试了所有类型的追加和连接函数,但嵌套循环会将所有内容都抛入错误状态。请帮忙。
先感谢您。
from bs4 import BeautifulSoup as bs
import urllib.request
import re
import pandas as pd
Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27§ionId=0&refMonth='
Url2 = '&refYr='
Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'
regexQ = re.compile('Date1 Qty')
regexC = re.compile('Footnote')
regexV = re.compile('Date1 Val')
import csv
import collections
for year in range(2016, 2018):
for month in range(0, 13):
sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
soup = bs(sauce, 'lxml')
select = soup.find_all('option', selected=True)
for i in select:
Month=select[4]
Year=select[3]
month=Month.text
year=Year.text
#print(year)
#print(month, year)
value = [x.text.strip() for x in soup.find_all("td", {"headers": regexV})]
country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
quantity = [x.text.strip() for x in soup.find_all("td", {"headers": regexQ})]
total_list = [list(x) for x in zip(country,quantity,value)]
for item in total_list:
h=[year , month]
data = h + item
print(data)
df = pd.DataFrame(data=data)
df.head
答案 0 :(得分:0)
您可以使用:
from bs4 import BeautifulSoup as bs
import urllib.request
import re
import pandas as pd
Url1 = 'http://www5.statcan.gc.ca/cimt-cicm/topNCountryCommodities-marchandises?lang=eng&chapterId=27§ionId=0&refMonth='
Url2 = '&refYr='
Url3 = '&freq=6&countryId=999&usaState=0&provId=1&arrayId=9900000&commodityId=271111&commodityName=Natural+gas%2C+liquefied&topNDefault=10&tradeType=3'
regexQ = re.compile('Date1 Qty')
regexC = re.compile('Footnote')
regexV = re.compile('Date1 Val')
import csv
import collections
#create list for all rows
data = []
for year in range(2016, 2018):
for month in range(0, 13):
sauce = urllib.request.urlopen(Url1+str(month)+Url2+str(year)+Url3).read().decode('utf-8')
soup = bs(sauce, 'lxml')
select = soup.find_all('option', selected=True)
print (select)
for i in select:
month=select[4].text
#convert year to integer
year=int(select[3].text)
#print(year)
#print(month, year)
#replace , from numeric values, convert to integer
value = [int(x.text.strip().replace(',','')) for x in soup.find_all("td", {"headers": regexV})]
country = [x.text.strip() for x in soup.find_all("a", {"href": regexC})]
quantity = [int(x.text.strip().replace(',','')) for x in soup.find_all("td", {"headers": regexQ})]
for item in zip(country,quantity,value):
#append each row to list data
data.append([year , month] + list(item))
#create Dataframe
cols = ['year','month','country','quantity','value']
df = pd.DataFrame(data=data, columns=cols)
print (df.head())
year month country quantity value
0 2016 January World 279605300 69598782
1 2016 January Trinidad and Tobago 140185912 36503857
2 2016 January Norway 137835921 32414865
3 2016 January United States 1583467 680060
4 2016 February World 1255156 532059