我想知道是否有更简单的方法来做到这一点。
我正在尝试抓取此网站的歌词:
http://www.mldb.org/aza-A.html
以下是目前的代码:
from bs4 import BeautifulSoup
import requests,sys
# Base address of the lyrics site; scraped hrefs are appended to it.
ROOT = "http://www.mldb.org/"
# Index page the crawl starts from.
seed = ROOT + "aza-A.html"
def gather_links(seed, already_visisted):
    """Collect the not-yet-visited links listed on one index page.

    Downloads ``seed``, looks up the ``<table id="thelist">`` element and
    builds an absolute URL (``ROOT`` + href) for every anchor inside it.

    Args:
        seed: URL of the page to download and scan.
        already_visisted: list of URLs that were already crawled. ``seed``
            is appended to it as a side effect, and any link already present
            in it is left out of the result.

    Returns:
        A list of unique absolute URLs in page order. The list is empty
        when the page has no ``thelist`` table.
    """
    response = requests.get(seed)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # Record the page as visited even when it yields no links.
    already_visisted.append(seed)

    table = soup.find('table', {'id': 'thelist'})
    if table is None:
        # Pages without the listing table have nothing to follow.
        return []

    to_visit = []
    for anchor in table.findAll("a"):
        # Read the attribute instead of slicing str(anchor): the slice kept
        # the closing quote of the href and broke on extra attributes.
        href = anchor.get("href")
        if not href:
            continue
        cleaned = ROOT + href
        # Both conditions must hold; with "or" nothing was ever filtered.
        if cleaned not in already_visisted and cleaned not in to_visit:
            to_visit.append(cleaned)
    return to_visit
#visted.append(seed)
# Breadth-first crawl starting from the seed page.
# global_to_visit is the ordered work queue, already_visited is filled in by
# gather_links(), and queued remembers every URL ever put on the queue so a
# page is never scheduled twice (otherwise cyclic links keep the crawl going
# forever).
global_to_visit = [seed]
already_visited = []
queued = {seed}
position = 0
# Walk the queue by index rather than extending the list being iterated.
while position < len(global_to_visit):
    link = global_to_visit[position]
    position += 1
    for found in gather_links(link, already_visited):
        if found not in queued:
            queued.add(found)
            global_to_visit.append(found)
    # Progress line: queue size, pages visited, page just processed.
    # Formatted as one string so it prints the same on Python 2 and 3.
    print("%d %d %s" % (len(global_to_visit), len(already_visited), link))
我认为歌词页面已经被访问到了,现在我只需要抓取这些页面以获取所需的文本,但我不确定这是否是达到目的的“最好”方式。
答案 0 :(得分:1)
这针对的是另一个网站(LyricWikia),不过我用 Python 写了一个简单的命令行工具(command line tool)来从那里下载歌词。虽然他们没有世界上所有的歌词,但大多数歌曲都能在那里找到。
我已将其写为一个 class,因此在需要时您可以将其导入自己的代码,而不必从命令行运行它。
以下是相关部分:
import urllib
import re
import lxml.html
import unicodedata
import os
class Song(object):
    """A song identified by artist and title whose lyrics can be fetched.

    The artist and title are normalized on assignment (trimmed, accents
    removed, title-cased) and combined into a ``lyrics.wikia.com`` URL when
    the lyrics are requested.
    """

    def __init__(self, artist, title):
        """Store the normalized artist and title; nothing is fetched yet."""
        self.artist = self.__format_str(artist)
        self.title = self.__format_str(title)
        self.url = None    # page URL, built by lyricwikia()
        self.lyric = None  # lyric text, filled in by lyricwikia()

    def __format_str(self, s):
        """Return ``s`` trimmed, without combining accents, in title case."""
        s = s.strip()
        try:
            # Decompose accented characters, then drop the combining marks
            # (category "Mn") so that e.g. "é" becomes "e".
            s = ''.join(c for c in unicodedata.normalize('NFD', s)
                        if unicodedata.category(c) != 'Mn')
        except TypeError:
            # normalize() only accepts text strings; any other input is
            # kept with its accents rather than failing.
            pass
        return s.title()

    def __quote(self, s):
        """Return ``s`` as a URL path piece: spaces become ``_``, then quoted."""
        # Imported here because a bare "import urllib" does not guarantee
        # that the urllib.parse submodule has been loaded.
        from urllib.parse import quote
        return quote(s.replace(' ', '_'))

    def __make_url(self):
        """Build ``self.url`` as ``<site>/<Artist>:<Title>`` from current values."""
        artist = self.__quote(self.artist)
        title = self.__quote(self.title)
        artist_title = '%s:%s' % (artist, title)
        self.url = 'http://lyrics.wikia.com/' + artist_title

    def update(self, artist=None, title=None):
        """Replace the artist and/or title; falsy arguments are ignored.

        ``self.url`` and ``self.lyric`` are not refreshed until
        ``lyricwikia()`` is called again.
        """
        if artist:
            self.artist = self.__format_str(artist)
        if title:
            self.title = self.__format_str(title)

    def lyricwikia(self):
        """Download the song page and extract the text of its lyric box.

        Returns:
            The lyric text (also stored in ``self.lyric``). When the page
            cannot be loaded, or holds no ``.lyricbox`` element,
            ``self.lyric`` is set to ``''`` and ``None`` is returned.
        """
        self.__make_url()
        try:
            doc = lxml.html.parse(self.url)
            lyricbox = doc.getroot().cssselect('.lyricbox')[0]
        except (IOError, IndexError):
            # IOError: the page could not be loaded.
            # IndexError: the page loaded but has no lyric box.
            self.lyric = ''
            return None

        lyrics = []
        # Text before the first child element lives in .text, not in any
        # child's .tail; without this the first line would be lost.
        if lyricbox.text is not None:
            lyrics.append(lyricbox.text)
        for node in lyricbox:
            if node.tag == 'br':
                lyrics.append('\n')
            # The text following each child element is that child's tail.
            if node.tail is not None:
                lyrics.append(node.tail)
        self.lyric = "".join(lyrics).strip()
        return self.lyric
你可以像下面这样使用它:
song = Song(artist='Bob Dylan', title='Blowing in the wind')
# Fetch the lyrics; lyricwikia() returns None when loading the page raises
# IOError, so the print below may show "None".
lyr = song.lyricwikia()
print(lyr)