How to scrape data that cannot be inspected and sits inside an <svg> tag

Time: 2019-05-09 12:42:39

Tags: python-3.x web-scraping beautifulsoup

I am not able to scrape some data from the Partywise Result web page. I want to scrape the party-wise {vote%, vote count} from that page.

The code I have tried so far:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage,"html.parser")
    return soupdata
#chhattisgarh
edatas = ""
edata1=""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult"+code+".htm"#2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div",{"id":"piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata= edata+","+data.text
            edatas= edatas + "\n" + edata[1:]+","+code

header ="Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"),"wb")#2018
#file = open(os.path.expanduser("per2014_result.csv"),"wb")#2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))

The result I expect is the % vote share:

[screenshot: output page 1]

I want the output in CSV format, like this:

INC,43.0%,6144192

and so on, for everything on page 1,

[screenshot: output for page 2]

and likewise for page 2.

1 Answer:

Answer 0: (score: 0)

The data is loaded into that div directly by JavaScript:

if(document.getElementById('piecharts26')!=null)

So you have to use a browser that actually executes the JavaScript, such as Selenium, or pull the data out of the script with a regular expression.
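If you go the Selenium route, a minimal sketch could look like this (hypothetical: it assumes Chrome with a matching chromedriver on the PATH; the id piecharts26 comes from the page source above):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # hypothetical setup: chromedriver must be installed
driver.get("http://eciresults.nic.in/PartyWiseResultS26.htm")
html = driver.page_source  # the HTML after the page's JavaScript has run
driver.quit()

soup = BeautifulSoup(html, "html.parser")
charts = soup.find("div", {"id": "piecharts26"})
print(charts.table)  # populated once the scripts have executed

The regular-expression route below needs no browser: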

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    # the chart rows are embedded in the page's JavaScript as data.addRows([...]);
    s = str(html_page)
    r = re.compile(r"data\.addRows\((.*?)\);")
    m = r.search(s)
    result = "[]"  # fall back to an empty list when no match is found
    if m:
        result = m.group(1)

    # the JavaScript uses single quotes, but JSON requires double quotes
    return json.loads(result.replace("'", '"'))


def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage,"html.parser")
    return soupdata


#chhattisgarh
edatas = ""
edata1=""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult"+code+".htm"#2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)

    result = get_data(soup2)

    print(result)

header ="Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"),"wb")#2018
#file = open(os.path.expanduser("per2014_result.csv"),"wb")#2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))

Output:

[['INC {43.0%,6144192}', 6144192],
 ['BJP {33.0%,4707141}', 4707141],
 ['JCCJ {7.6%,1086581}', 1086581],
 ['IND {5.9%,839053}', 839053],
 ['BSP {3.9%,552313}', 552313],
 ['GGP {1.7%,247459}', 247459],
 ['AAAP {0.9%,123526}', 123526],
 ['CPI {0.3%,48255}', 48255],
 ['APoI {0.3%,42013}', 42013],
 ['SHS {0.2%,34678}', 34678],
 ['NCP {0.2%,28983}', 28983],
 ['SP {0.2%,21969}', 21969],
 ['BYPP {0.1%,8425}', 8425],
 ['CPM {0.1%,8348}', 8348],
 ['JD(U) {0.1%,8285}', 8285],
 ['CSM {0.1%,7783}', 7783],
 ['BMUP {0.1%,7419}', 7419],
 ['BSCP {0.0%,5546}', 5546],
 ['BTP {0.0%,5498}', 5498],
 ['RJsbhP {0.0%,5141}', 5141],
 ['RGOP {0.0%,5040}', 5040],
 ['IPBP {0.0%,4982}', 4982],
 ['NINSHAD {0.0%,4586}', 4586],
 ['PSPU {0.0%,4309}', 4309],
 ['BHBHP {0.0%,3780}', 3780],
 ['RPI(A) {0.0%,3257}', 3257],
 ['JAC {0.0%,3034}', 3034],
 ['CPIM {0.0%,3017}', 3017],
 ['NDPF {0.0%,2912}', 2912],
 ['AASPP {0.0%,2474}', 2474],
 ['BBC {0.0%,2089}', 2089],
 ['SWAP {0.0%,2023}', 2023],
 ['cvgrp {0.0%,1582}', 1582],
 ['bhmm {0.0%,1474}', 1474],
 ['AVVP {0.0%,1407}', 1407],
 ['LSWP {0.0%,1399}', 1399],
 ['CSP {0.0%,1232}', 1232],
 ['BPSGKD {0.0%,1093}', 1093],
 ['BKNP {0.0%,1085}', 1085],
 ['CGVP {0.0%,1053}', 1053],
 ['SUCI {0.0%,1048}', 1048],
 ['SUSP {0.0%,988}', 988],
 ['DPI {0.0%,970}', 970],
 ['RJBP {0.0%,717}', 717],
 ['ASSP {0.0%,701}', 701],
 ['BLRP {0.0%,570}', 570],
 ['BSHSP {0.0%,562}', 562],
 ['ABHM {0.0%,549}', 549],
 ['SSBD {0.0%,468}', 468],
 ['ABSSP {0.0%,436}', 436],
 ['BRSP {0.0%,429}', 429],
 ['ABSKP {0.0%,389}', 389],
 ['BSSP {0.0%,279}', 279],
 ['BNIP {0.0%,267}', 267],
 ['RMGP {0.0%,258}', 258],
 ['KMSP {0.0%,241}', 241],
 ['BHBP {0.0%,224}', 224],
 ['RP(K) {0.0%,202}', 202],
 ['CMM {0.0%,192}', 192],
 ['CHSJP {0.0%,183}', 183],
 ['RSSM {0.0%,72}', 72],
 ['AnAP {0.0%,66}', 66],
 ['NOTA {2.0%,282744}', 282744]]

Then you can loop over the result and save it to a csv file.

EDIT:

See this modification to save it to a csv file:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv
def get_data(html_page):
    s = str(html_page)
    r = re.compile(r"data\.addRows\((.*?)\);")
    m = r.search(s)
    result = "[]"  # avoids a NameError when the pattern is absent
    if m:
        result = m.group(1)

    return json.loads(result.replace("'", '"'))


def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage,"html.parser")
    return soupdata


codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult"+code+".htm"#2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)

    result = get_data(soup2)
    header = ["Party","Vote%","Count","State code"]

    results_export = []
    results_export.append(header)
    for r in result:
        export = []
        # r[0] looks like "INC {43.0%,6144192}", r[1] is the raw vote count
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        export.append(str(party))
        export.append(str(percent))
        export.append(str(count))
        export.append(code)
        results_export.append(export)

    # newline="" avoids blank rows on Windows; "with" closes the file automatically
    with open(os.path.expanduser("per2014_result.csv"), "w", newline="") as file:  # 2018
        writer = csv.writer(file)
        writer.writerows(results_export)

EDIT2:

The page can contain several data.addRows(...) calls, some of them empty ([]), so this version keeps the last non-empty match:

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r"data\.addRows\((.*?)\);")
    ms = r.findall(s)  # every addRows call on the page
    result = '[]'
    for m in ms:
        if m != '[]':  # skip the empty charts
            result = m
    return json.loads(result.replace("'", '"'))
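A quick sanity check with a hypothetical string that mimics a page containing an empty call followed by a populated one:

page = "data.addRows([]); data.addRows([['INC {43.0%,6144192}', 6144192]]);"
print(get_data(page))  # [['INC {43.0%,6144192}', 6144192]]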