我有网页编写了一个URL,我应该在网页中获取所有链接。我正在使用Beautiful Soup。
from bottle import route, run
import urllib2
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse
import json
import sys
import csv
import re
@route('/hello')
def hello():
text=list();
link=list();
req = urllib2.Request("http://www.amazon.com",
headers={"Content-Type": "application/json"})
html=urllib2.urlopen(req).read()
soup = BeautifulSoup(html)
last_page = soup.find('div', id="nav_subcats")
for elm in last_page.findAll('a'):
texts = elm.text
links = elm.get('href')
links = links.partition("&node=")[2]
text.append(texts)
link.append(links)
alltext=[]
for i,j in zip(text,link):
alltext.append({"name":i,"id":j})
return alltext
run(host='localhost', port=8080, debug=True)
但是当它返回text
时,我会将其AAABBBCCCDDD
,AAA
,BBB
,CCC
和DDD
不同的项目。为什么我不把它放在这样的括号中?
["AAA", "BBB" "CCC","DDD","EEE","FFF"]