I am unable to scrape some data from the Partywise Result web page. I want to scrape the party-wise {vote%, vote count} from that page.
The code I have tried so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

#chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata = edata + "," + data.text
            edatas = edatas + "\n" + edata[1:] + "," + code

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
The result I expect is the % vote share.
I want the output in CSV format, like this:
INC,43.0%,6144192
and so on, exactly as on the first page, and for both years.
Answer 0: (score: 0)
The data is loaded into the div directly by JavaScript:
if(document.getElementById('piecharts26')!=null)
So you either have to use a browser that actually executes the JavaScript, such as selenium
(link here), or pull the rows out with a regular expression; both are shown below.
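A minimal, untested sketch of the Selenium route (webdriver.Chrome, the fixed delay, and reading the rendered div's text are assumptions of mine, not details given in the original answer):

from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()   # assumes Chrome and a matching chromedriver are installed
driver.get("http://eciresults.nic.in/PartyWiseResultS26.htm")
time.sleep(5)                 # crude wait for the chart script to run; an explicit wait would be cleaner
rendered = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

# the div is only populated once the JavaScript has executed
for div in rendered.findAll("div", {"id": "piecharts26"}):
    print(div.get_text())

And the regular-expression approach: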
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

#chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    print(result)

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
Output:
[['INC {43.0%,6144192}', 6144192],
['BJP {33.0%,4707141}', 4707141],
['JCCJ {7.6%,1086581}', 1086581],
['IND {5.9%,839053}', 839053],
['BSP {3.9%,552313}', 552313],
['GGP {1.7%,247459}', 247459],
['AAAP {0.9%,123526}', 123526],
['CPI {0.3%,48255}', 48255],
['APoI {0.3%,42013}', 42013],
['SHS {0.2%,34678}', 34678],
['NCP {0.2%,28983}', 28983],
['SP {0.2%,21969}', 21969],
['BYPP {0.1%,8425}', 8425],
['CPM {0.1%,8348}', 8348],
['JD(U) {0.1%,8285}', 8285],
['CSM {0.1%,7783}', 7783],
['BMUP {0.1%,7419}', 7419],
['BSCP {0.0%,5546}', 5546],
['BTP {0.0%,5498}', 5498],
['RJsbhP {0.0%,5141}', 5141],
['RGOP {0.0%,5040}', 5040],
['IPBP {0.0%,4982}', 4982],
['NINSHAD {0.0%,4586}', 4586],
['PSPU {0.0%,4309}', 4309],
['BHBHP {0.0%,3780}', 3780],
['RPI(A) {0.0%,3257}', 3257],
['JAC {0.0%,3034}', 3034],
['CPIM {0.0%,3017}', 3017],
['NDPF {0.0%,2912}', 2912],
['AASPP {0.0%,2474}', 2474],
['BBC {0.0%,2089}', 2089],
['SWAP {0.0%,2023}', 2023],
['cvgrp {0.0%,1582}', 1582],
['bhmm {0.0%,1474}', 1474],
['AVVP {0.0%,1407}', 1407],
['LSWP {0.0%,1399}', 1399],
['CSP {0.0%,1232}', 1232],
['BPSGKD {0.0%,1093}', 1093],
['BKNP {0.0%,1085}', 1085],
['CGVP {0.0%,1053}', 1053],
['SUCI {0.0%,1048}', 1048],
['SUSP {0.0%,988}', 988],
['DPI {0.0%,970}', 970],
['RJBP {0.0%,717}', 717],
['ASSP {0.0%,701}', 701],
['BLRP {0.0%,570}', 570],
['BSHSP {0.0%,562}', 562],
['ABHM {0.0%,549}', 549],
['SSBD {0.0%,468}', 468],
['ABSSP {0.0%,436}', 436],
['BRSP {0.0%,429}', 429],
['ABSKP {0.0%,389}', 389],
['BSSP {0.0%,279}', 279],
['BNIP {0.0%,267}', 267],
['RMGP {0.0%,258}', 258],
['KMSP {0.0%,241}', 241],
['BHBP {0.0%,224}', 224],
['RP(K) {0.0%,202}', 202],
['CMM {0.0%,192}', 192],
['CHSJP {0.0%,183}', 183],
['RSSM {0.0%,72}', 72],
['AnAP {0.0%,66}', 66],
['NOTA {2.0%,282744}', 282744]]
You can then loop over the result and save it to a csv file.
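For example, a single entry from the list above splits into the desired CSV fields like this (a small illustration of that string format, not part of the original answer):

row = ['INC {43.0%,6144192}', 6144192]
party = row[0].split(' {')[0]                    # 'INC'
percent = row[0].split(' {')[1].split(',')[0]    # '43.0%'
count = row[1]                                   # 6144192
print(",".join([party, percent, str(count)]))    # INC,43.0%,6144192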
EDIT:
See this modification to save it to a csv file:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    header = ["Party", "Vote%", "Count", "State code"]
    results_export = []
    results_export.append(header)
    for r in result:
        export = []
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        export.append(str(party))
        export.append(str(percent))
        export.append(str(count))
        export.append(code)
        results_export.append(export)

    file = open(os.path.expanduser("per2014_result.csv"), "w")  # 2018
    writer = csv.writer(file)
    writer.writerows(results_export)
EDIT2: a revised get_data that scans every data.addRows(...) block on the page and keeps the last non-empty match (this helps when the page contains more than one chart, some of them empty):
def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    ms = r.findall(s)
    result = '[]'
    if ms:
        for m in ms:
            if m != '[]':
                result = m
    return json.loads(result.replace("'", '"'))
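A quick way to see what this revision changes is to run it on a made-up fragment that contains both an empty and a populated addRows call (the string below is purely illustrative, not the real page source):

sample = "data.addRows([]); ... data.addRows([['INC {43.0%,6144192}', 6144192]]);"
print(get_data(sample))   # -> [['INC {43.0%,6144192}', 6144192]]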