I'm trying to extract links from a blog using this Python code:
#!/usr/bin/env python

"""
Extract all links from a web page
=================================
Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
GitHub: https://github.com/jabbalaci/Bash-Utils

Given a webpage, extract all links.

Usage:
------
./get_links.py <URL>
"""

import sys
import urllib
import urlparse

from BeautifulSoup import BeautifulSoup


class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15'


def process(url):
    myopener = MyOpener()
    #page = urllib.urlopen(url)
    page = myopener.open(url)

    text = page.read()
    page.close()

    soup = BeautifulSoup(text)
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        print tag['href']
# process(url)


def main():
    if len(sys.argv) == 1:
        print "Jabba's Link Extractor v0.1"
        print "Usage: %s URL [URL]..." % sys.argv[0]
        sys.exit(1)
    # else, if at least one parameter was passed
    for url in sys.argv[1:]:
        process(url)
# main()

#############################################################################

if __name__ == "__main__":
    main()
The links come from a blog whose main category is blog.xx/Music/. The code pulls the links from the blog.xx/this_album_name/ category pages, but what I want are the links with the class named "quote" on the sub-pages under that category. How can I parse the links in the Music category and have BS follow each title link to pull the links on the next page using the quote class?

i.e.

blog.xx/category
blog.xx/post1.html
blog.xx/post2.html

On each of the post pages above there is a quote block containing the links I want to grab.

I'm new to Python and BS and have tried a few variations, but at this point I need some help. Thanks.
Answer (score: 1)
If I understand correctly, you want to follow the links on a page through to the next page, and scrape the links from that page? The following should do it for you:
#!/usr/bin/env python

"""
Extract all links from a web page
=================================
Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
GitHub: https://github.com/jabbalaci/Bash-Utils

Given a webpage, extract all links.

Usage:
------
./get_links.py <URL>
"""

import sys
import urllib
import urlparse
import re

from BeautifulSoup import BeautifulSoup


class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15'


def process(url):
    myopener = MyOpener()
    #page = urllib.urlopen(url)
    page = myopener.open(url)

    text = page.read()
    page.close()

    soup = BeautifulSoup(text)
    urls = []
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        urls.append(tag['href'])
    return urls
# process(url)


def main():
    # Store the urls we were given
    urls_queue = sys.argv[1:]
    urls_found = []
    urls_done = []
    site_roots = []

    # Get the domains to keep us on the same domain (don't follow external links)
    for url in urls_queue:
        mre = re.match('^https?://[^/]*', url, re.IGNORECASE)
        if mre:
            # If we've found a match, add the entire matched string to site_roots
            site_roots.append(mre.group(0))

    while len(urls_queue) > 0:
        # Get url off the top of the queue
        url = urls_queue.pop()
        urls_done.append(url)

        found = process(url)
        for uf in found:
            # I'd suggest checking to make sure it's on the same domain here
            # any() returns true if any of the elements in the list passed are True
            # In this case, if uf starts with any of the site_root strings.
            # 'not any()' is equivalent to saying 'none'
            if not any([uf.startswith(site_root) for site_root in site_roots]):
                continue  # Next url, this is off site
            if uf not in urls_found:
                urls_found.append(uf)
            # If we don't have it in the queue, queue it up
            if uf not in urls_queue and uf not in urls_done:
                urls_queue.append(uf)

    print "Done %d; Queued %d; Found %d" % (len(urls_done), len(urls_queue), len(urls_found))
    print urls_found
# main()

#############################################################################

if __name__ == "__main__":
    main()
I've added a URL queue and a check to make sure you don't "wander off" following links that point somewhere else. It prints out everything it found at the end.
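If you only want the links sitting inside the quote block on each post page, rather than every link, one option is to swap in a narrower version of process() that only looks at anchors inside elements carrying that class. This is just a minimal sketch, assuming the block is marked up with class="quote" (the exact markup isn't shown in the question):

def process_quotes(url):
    """Return only the links found inside elements with class="quote"."""
    myopener = MyOpener()
    page = myopener.open(url)
    text = page.read()
    page.close()

    soup = BeautifulSoup(text)
    urls = []
    # Find every element carrying the (assumed) "quote" class, then collect
    # the anchors inside each one, resolved against the page URL.
    for quote in soup.findAll(attrs={'class': 'quote'}):
        for tag in quote.findAll('a', href=True):
            urls.append(urlparse.urljoin(url, tag['href']))
    return urls

You could then call process_quotes() on the post pages while still using process() on the category page to find the posts.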
Note that the crawler will also follow links on the secondary pages, so it may end up indexing the whole site. You can get around this by commenting out the urls_queue.append bit in the main while loop (to stop anything more being added). Then, just before the while loop, add:
urls_queue = [url for inurl in sys.argv[1:] for url in process(inurl) if any([url.startswith(sr) for sr in site_roots])]
urls_queue = list( set(urls_queue) ) # Get rid of duplicates
This builds the initial queue by adding the links found on the pages you supplied. So, for your example, the links on the "category" page would be added, but the links on the subsequent pages would not.
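Put together, a rough sketch of how main() might look with that change applied (the append inside the loop commented out, and the queue seeded from the pages passed on the command line); this is only an illustration of the modification described above, not a drop-in replacement:

def main():
    urls_found = []
    urls_done = []
    site_roots = []

    # Same domain check as before, built from the command-line URLs
    for url in sys.argv[1:]:
        mre = re.match('^https?://[^/]*', url, re.IGNORECASE)
        if mre:
            site_roots.append(mre.group(0))

    # Seed the queue with the links found on the pages we were given
    urls_queue = [url for inurl in sys.argv[1:] for url in process(inurl)
                  if any([url.startswith(sr) for sr in site_roots])]
    urls_queue = list(set(urls_queue))  # Get rid of duplicates

    while len(urls_queue) > 0:
        url = urls_queue.pop()
        urls_done.append(url)

        for uf in process(url):
            if not any([uf.startswith(site_root) for site_root in site_roots]):
                continue  # Next url, this is off site
            if uf not in urls_found:
                urls_found.append(uf)
            # urls_queue.append(uf)  # commented out: nothing deeper than the post pages gets queued

    print "Done %d; Found %d" % (len(urls_done), len(urls_found))
    print urls_found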