from bs4 import BeautifulSoup
import urllib
import json
import os
jaren = [str("2012"), str("2010"), str("2006"), str("2003"),str("2002"), str("1998"), str("1994"), str("1989"), str("1986"), str("1982"), str("1981"), str("1977"), str("1972"), str("1971"), str("1967"), str("1963"), str("1959"), str("1956")]
DESIRED_COLUMNS = {1, 2, 5} #scrapes only afk, aantal & zetels
verkiezingsData = []
filename = raw_input('Enter a filename: ') or 'data.json'
#open file and open json array
with open(filename, "w") as file:
file.write("[{")
for Jaargetal in jaren:
#url source
r = urllib.urlopen("http://www.nlverkiezingen.com/TK" + Jaargetal +".html").read()
soup = BeautifulSoup(r, "html.parser")
tables = soup.find_all("table")
for table in tables:
header = soup.find_all("h1")[0].getText()
#print header
with open(filename, "a+") as file:
file.write("\"%s\": [" % header) #header as beginning json
trs = table.find_all("tr")[0].getText()
del verkiezingsData[:] #clear list before adding new data
#add the 3 columns to a list
for tr in table.find_all("tr")[:22]: #22 aantal columns van top till bottom
for index, val in enumerate(tr.find_all('td')):
if index in DESIRED_COLUMNS: #linkt naar desired columns bovenin
verkiezingsData.append(val.getText().strip())
#json array van de 3 vallues
for a, b, c in zip(verkiezingsData[::3], verkiezingsData[1::3], verkiezingsData[2::3]): #link naar desired columns 1,2,5
data2 = {'afk':a,"aantal":b, "zetels":c}
#file writing
with open(filename, 'a') as outfile:
json.dump(data2, outfile)
outfile.write(",")
#open file, delete last comma and close array
with open(filename, 'ab+') as file:
file.seek(-1, os.SEEK_END)
file.truncate()
file.write("],")
#open file, delete last comma, and close array
with open(filename, 'r+b') as file:
file.seek(-1, os.SEEK_END)
file.truncate()
file.write("}]")
#open file and pretty print json data
with open(filename, 'r') as file:
prettydata = json.load(file)
with open(filename, 'w') as file:
json.dump(prettydata, file, sort_keys=True, indent=4, separators=(',', ': '))
我从nlverkiezingen.com上刮了一个刮刀
当它保存为json文件时:
"Tweede-Kamerverkiezingen - 12 september 2012": [
{
"aantal": "Aantal",
"afk": "Afk.",
"zetels": "Zetels"
},
{
"aantal": "2504948",
"afk": "VVD",
"zetels": "41"
},
第一行是:Aantal / Afk / Zetels。 我不希望这个被刮掉。
我该如何更改?刮取从第二行开始
第二件事是,最后一行到处都是不同的。有时是第20排,第15排。
我该如何更改?当刮刮时看到白色/空行时刮削结束了吗?
答案 0 :(得分:0)
不知道是否正确,这只是一个猜测,但可能像
for tr in table.find_all("tr")[1:22]
跳过第一行?
答案 1 :(得分:0)
第一行是:Aantal / Afk / Zetels。我不希望这件事被刮掉。
替换
for tr in table.find_all("tr")[1:22]:
1
Python具有从零开始的索引,因此
指的是表中的第二行。
我该怎么改变?当刮刮时看到白色/空行时刮削结束了吗?
空表单元格中的u"\xa0"
将由BeautifulSoup解析为 @Override
public View getView(int position, View convertView, ViewGroup parent) {
// Get the data item for this position
LockListDataModel locks = getItem(position);
// Check if an existing view is being reused, otherwise inflate the view
ViewHolder viewHolder; // view lookup cache stored in tag
if (convertView == null) {
viewHolder = new ViewHolder();
LayoutInflater inflater = LayoutInflater.from(getContext());
convertView = inflater.inflate(R.layout.list_row, parent, false);
viewHolder.lockName = (TextView) convertView.findViewById(R.id.lockName);
viewHolder.colour = (ImageView) convertView.findViewById(R.id.list_image);
convertView.setTag(viewHolder);
} else {
viewHolder = (ViewHolder) convertView.getTag();
}// Populate the data into the template view using the data object
viewHolder.lockName.setText(locks.lockName);
viewHolder.colour.setVisibility(View.VISIBLE);
viewHolder.colour.setBackground(ResourcesCompat.getDrawable(getContext().getResources(), R.drawable.time_profile_shape, null));
LayerDrawable shape = (LayerDrawable) ContextCompat.getDrawable(getContext(),R.drawable.time_profile_shape);
GradientDrawable color = (GradientDrawable)(shape.findDrawableByLayerId(R.id.time_profile_lock_colour));
color.setColor(Color.parseColor(locks.color));
// Return the completed view to render on screen
return convertView;
}
Python字符串。检查每行上第一个标记的内容,并将其与该值进行比较,并使用它来打破循环。