Extracting a large amount of data from a website

Date: 2019-06-15 21:54:26

Tags: python

I am trying to extract a large amount of data from a website (https://www.cs.cmu.edu/~mleone/gdead/setlists.html) for a data visualization project that displays information about Grateful Dead concerts. I have successfully extracted and formatted the data I need, but the process is very slow. I am using the urllib library to open and read each URL (the URLs are a jumbled list). Is there a better way? (One possible approach is sketched after the code below.)

class Song:
    """A single song title together with the two-digit year it was played."""
    def __init__(self, name, year):
        self.name = name
        self.year = year

    def printName(self):
        print(self.name)

    def getName(self):
        return self.name

class Year:
    """Maps song names to the number of times they were played in a given year."""
    def __init__(self, year, dct):
        self.year = year
        self.dct = dct

    def addSong(self, song):
        if song in self.dct:
            self.dct[song] += 1
        else:
            self.dct[song] = 1

    def printDict(self):
        print(self.dct)

    def printYear(self):
        print(self.year)

    def getYear(self):
        return self.year




from bs4 import BeautifulSoup
from urllib.request import urlopen

songlist = []

def hasNumbers(inputString):
    # True if the string contains at least one digit (used to spot date lines)
    return any(char.isdigit() for char in inputString)

def remove_values_from_list(the_list, val):
    # return a copy of the_list with every occurrence of val removed
    return [value for value in the_list if value != val]

# read the list of URLs from concert_list.txt, one per line
with open("concert_list.txt") as list_open:
    line_in_list = list_open.read().split("\n")

#empty dictionary
yeardict = {}

#populate dictionary with Year objects from 1972-1995
for i in range(0,24):
    yeardict[i+72] = Year(i+72, {})

# for each website located in concert_list.txt
for url in line_in_list:
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    newurl = []
    # find each link extension and build the full setlist URL
    for link in soup.find_all('a'):
        newurl.append('https://www.cs.cmu.edu/~mleone/gdead/' + link.get('href'))

        # parse through each full url gathered so far
        # (this inner loop runs once per link, so earlier URLs get opened again and again)
        for url in newurl:
            soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
            # pull every <p> block out of the setlist page
            paragraphs = soup.find_all('p')
            x = []
            # populate x by splitting the paragraph text into lines
            # (x is overwritten on each pass, so only the last <p> survives)
            for p in paragraphs:
                x = p.getText().split('\n')

        # remove blanks from song list
        x = remove_values_from_list(x, '')

        # for each line in the song list
        for song in x:
            # lines containing digits are date lines; take the two characters
            # after the last "/" as the two-digit year
            if hasNumbers(song):
                year = song[song.rfind("/")+1:song.rfind("/")+3]
            else:
                cursong = Song(song, year)
                #yeardict[int(year)].printYear()
                yeardict[int(year)].addSong(cursong.getName())


print(yeardict[72].dct["Truckin'"])  # how many times "Truckin'" was played in '72
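
For reference, a rough sketch of one way the downloads could be sped up, assuming the slowness comes from opening many pages one after another with urlopen: fetch the pages concurrently with the standard-library concurrent.futures and parse them afterwards. The names fetch_page and MAX_WORKERS are illustrative and not part of the code above.

from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

from bs4 import BeautifulSoup

MAX_WORKERS = 10  # how many pages to download in parallel

def fetch_page(url):
    # download one page and return its raw HTML
    with urlopen(url) as response:
        return response.read()

with open("concert_list.txt") as f:
    urls = [line for line in f.read().split("\n") if line]

# download every page concurrently instead of one at a time
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    pages = list(pool.map(fetch_page, urls))

# parsing stays sequential; it is cheap next to the network round trips
for url, html in zip(urls, pages):
    soup = BeautifulSoup(html, 'html.parser')
    # ... extract the setlist links and songs exactly as in the code above ...

Building the full list of setlist URLs before looping over it (rather than looping inside the link loop, as the code above does) would also keep each setlist page from being opened more than once.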

0 Answers:

No answers