I'm trying to pull a large amount of data from a website (https://www.cs.cmu.edu/~mleone/gdead/setlists.html) for a data visualization project that displays information about Grateful Dead concerts. I've successfully extracted and formatted the data I need, but the process is very slow. I'm using the urllib library to open and read from each URL, and the URLs are scattered across many pages. Is there a better way? (One idea I've been considering is sketched after the code below.)

from urllib.request import urlopen

from bs4 import BeautifulSoup

class Song:
    def __init__(self, name, year):
        self.name = name
        self.year = year

    def printName(self):
        print(self.name)

    def getName(self):
        return self.name

class Year:
    def __init__(self, year, dct):
        self.year = year
        self.dct = dct  # use the dictionary passed in rather than always starting empty

    def addSong(self, song):
        if song in self.dct:
            self.dct[song] += 1
        else:
            self.dct[song] = 1

    def printDict(self):
        print(self.dct)  # self.dct, not the bare name dct

    def printYear(self):
        print(self.year)

    def getYear(self):
        return self.year

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]

# read the index-page urls, skipping blank lines so urlopen('') can't happen
with open("concert_list.txt") as list_open:
    line_in_list = [line.strip() for line in list_open if line.strip()]

# populate a dictionary with Year objects for 1972-1995, keyed 72-95
yeardict = {}
for y in range(72, 96):
    yeardict[y] = Year(y, {})

year = None  # set once the first dated line has been seen

# for each index page listed in concert_list.txt
for url in line_in_list:
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # build the full url for each linked setlist page
    newurl = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:  # anchors without an href would break the concatenation
            newurl.append('https://www.cs.cmu.edu/~mleone/gdead/' + href)
    # parse each full url (renamed so it no longer shadows the outer loop's url)
    for page_url in newurl:
        soup = BeautifulSoup(urlopen(page_url).read(), 'html.parser')  # 'html.parser' is the built-in parser name
        paragraphs = soup.find_all('p')
        # populate x with every line of every <p>; extend rather than assign,
        # so lines from all paragraphs are kept
        x = []
        for p in paragraphs:
            x.extend(p.getText().split('\n'))
        # remove blanks from the song list
        x = remove_values_from_list(x, '')
        # for each line in the setlist
        for song in x:
            if hasNumbers(song):
                # a dated venue line: grab the two-digit year after the last "/"
                year = song[song.rfind("/") + 1:song.rfind("/") + 3]
            elif year is not None:
                # a song line: count it under the most recent year seen
                cursong = Song(song, year)
                yeardict[int(year)].addSong(cursong.getName())

print(yeardict[72].dct["Truckin'"])  # how many times the song was played in 1972
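
The idea I mentioned above: nearly all of the time goes to waiting on the network, one urlopen call at a time, so fetching the pages concurrently seems like the obvious fix. Here is a minimal sketch using ThreadPoolExecutor from the standard library's concurrent.futures; the fetch helper, the 16-worker count, and the two-phase shape (collect all the links first, then download the setlist pages in parallel) are my own illustrative choices, not anything the site requires. Is this the right direction?

from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

from bs4 import BeautifulSoup

BASE = 'https://www.cs.cmu.edu/~mleone/gdead/'

def fetch(url):
    # download one page and return (url, html) so each result stays paired with its source
    with urlopen(url) as resp:
        return url, resp.read()

# phase 1: collect the per-concert links from the index pages, as before
with open("concert_list.txt") as f:
    index_urls = [line.strip() for line in f if line.strip()]

setlist_urls = []
for _, html in map(fetch, index_urls):
    soup = BeautifulSoup(html, 'html.parser')
    setlist_urls += [BASE + a.get('href') for a in soup.find_all('a') if a.get('href')]
setlist_urls = list(dict.fromkeys(setlist_urls))  # keep order, drop duplicate links

# phase 2: download up to 16 setlist pages at a time instead of one after another
with ThreadPoolExecutor(max_workers=16) as pool:
    pages = dict(pool.map(fetch, setlist_urls))

# pages maps each url to its html, so the parsing/counting loop above
# could then iterate over pages.items() unchanged

My understanding is that threads help here despite the GIL because the work is I/O-bound: each thread mostly sits blocked on a socket rather than running Python code, while the parsing stays single-threaded. I'd also keep max_workers modest to avoid hammering the CMU server.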