from bs4 import BeautifulSoup
import urllib
import json
import os
# Scraper for Dutch parliamentary election results on nlverkiezingen.com.
#
# For every year in ``jaren`` the page TK<year>.html is fetched, the party
# abbreviation (afk), vote count (aantal) and seat count (zetels) are read
# from its result table, and everything is written to one JSON file shaped as
# ``[{<page h1 text>: [{"afk": ..., "aantal": ..., "zetels": ...}, ...]}]``.
#
# NOTE: this file targets Python 2 (``raw_input``, ``urllib.urlopen``).

# Election years to scrape, newest first.
jaren = ["2012", "2010", "2006", "2003", "2002", "1998", "1994", "1989",
         "1986", "1982", "1981", "1977", "1972", "1971", "1967", "1963",
         "1959", "1956"]
DESIRED_COLUMNS = {1, 2, 5}  # scrapes only afk, aantal & zetels
# JSON key for each desired column, paired in ascending column order.
_COLUMN_KEYS = tuple(zip(sorted(DESIRED_COLUMNS), ("afk", "aantal", "zetels")))
# Upper bound on the rows inspected per table (the header row is excluded).
# The longest table ends at row 22; shorter ones now end at a blank row.
MAX_DATA_ROWS = 21


def records_from_rows(rows):
    """Convert table rows into afk/aantal/zetels records.

    Args:
        rows: iterable of rows, each a list of stripped cell texts, with the
            table's header row already removed.

    Returns:
        A list of dicts with the keys ``afk``, ``aantal`` and ``zetels``.

    Blank rows (no non-empty cell) before the first record are skipped; the
    first blank row after a record ends the table, so trailing totals or
    footnotes are not scraped. Rows too short to hold every desired column
    are skipped instead of shifting the columns of the rows that follow.
    """
    records = []
    last_column = max(DESIRED_COLUMNS)
    for cells in rows:
        if not any(cells):
            if records:
                break  # blank separator after the data: end of the results
            continue  # leading spacer row
        if len(cells) <= last_column:
            continue
        records.append({key: cells[index] for index, key in _COLUMN_KEYS})
    return records


def _table_rows(table):
    """Return the stripped <td> texts of each data row of a parsed table."""
    return [
        [cell.getText().strip() for cell in tr.find_all("td")]
        for tr in table.find_all("tr")[1:1 + MAX_DATA_ROWS]
    ]


def _fetch(url):
    """Download ``url`` and return the raw body, always closing the handle."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _scrape_year(jaargetal):
    """Return ``{page header: records}`` for one election year.

    The result is empty when no table on the page yields a record. When
    several tables yield records the last one wins, which matches what
    reloading the previously hand-written JSON (duplicate keys) produced.
    """
    html = _fetch("http://www.nlverkiezingen.com/TK" + jaargetal + ".html")
    soup = BeautifulSoup(html, "html.parser")
    page_records = None
    for table in soup.find_all("table"):
        records = records_from_rows(_table_rows(table))
        if records:
            page_records = records
    if page_records is None:
        return {}
    # The first <h1> of the page is the JSON key for this year's results.
    header = soup.find_all("h1")[0].getText()
    return {header: page_records}


def main():
    """Ask for an output filename, scrape every year, and write the JSON."""
    filename = raw_input('Enter a filename: ') or 'data.json'
    results = {}
    for jaargetal in jaren:
        results.update(_scrape_year(jaargetal))
    # Serialise once from memory; the json module handles escaping and
    # commas, so the output is valid even when a page yields no records.
    with open(filename, 'w') as outfile:
        json.dump([results], outfile, sort_keys=True, indent=4,
                  separators=(',', ': '))


if __name__ == "__main__":
    main()
我写了一个爬虫,从 nlverkiezingen.com 抓取数据。
它抓取 Aantal / Afk / Zetels 这三列。
它使用一个年份列表,依次抓取多个年份的数据。
jaren = [str("2012"), str("2010"), str("2006"), str("2003"),str("2002"), str("1998"), str("1994"), str("1989")]
抓取需要从第 1 行开始:
for tr in table.find_all("tr")[1:22]: #22 columns top till bottom
但是它应该在遇到空白行时结束,而不是固定在第 22 行结束(每个年份的数据在不同的行结束)。我该如何实现呢?
或者,是否可以在代码中的某处(例如在年份列表旁边)为每个年份指定应在哪一行停止抓取?例如 2010 年在第 22 行停止,1959 年在第 10 行停止。
答案 0 :(得分:0)
我在此假设您是想在 tr 元素为空时跳出 for 循环。
for tr in table.find_all("tr")[1:22]:
    # Stop at the first blank row. strip() makes rows that contain only
    # whitespace (spaces, newlines, &nbsp;) count as blank too, which a
    # plain comparison with "" would miss.
    if not tr.getText().strip():
        break
答案 1 :(得分:0)
如果您需要在某个特定列(例如 'aantal')为空时停止迭代,可以尝试类似下面的做法:
"NOT /"
但是,我相信您应该尝试以不同方式构建代码,以便您可以更好地控制流程。