Okay, so here's my program (in Python). First, it downloads the HTML of a page on locationary.com, then it gets all of the business pages from that page and finds the yellowpages.com links for those businesses. After it finds them, it inserts them into the website using the webbrowser module. (If you read through the code, this will probably make a lot more sense.)
First of all, I want a way to submit the yellowpages.com links without the webbrowser module, because as it stands Firefox has to stay logged in to locationary.com, and I have to keep finding ways to close Firefox so it doesn't get overloaded with tabs. I tried urllib2 and urlopen, but nothing happened when I ran it. Now I'm starting to think that I need to send some kind of cookie or HTTP header with my request. How do you do that?
Please ask me to clarify if something doesn't make sense!
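From what I've read so far, I think the missing piece looks something like the sketch below, using urllib2 with cookielib. The login URL and the form field names are placeholders, since I don't know what locationary.com actually expects:

import urllib
import urllib2
import cookielib

# An opener with a cookie jar keeps the session cookie between requests.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # extra HTTP headers go here

# Log in once; the URL and field names are placeholders, not the site's real form.
login_data = urllib.urlencode({'username': 'me@example.com', 'password': 'secret'})
opener.open('http://www.locationary.com/login', login_data)

# Later requests through the same opener send the login cookie automatically.
response = opener.open(finalURL)
print response.read()

Is that roughly the right approach?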
Here's my code:
from urllib import urlopen
from gzip import GzipFile
from cStringIO import StringIO
import re
import urllib
import urllib2
import webbrowser
import mechanize
import time
from difflib import SequenceMatcher
import os
def download(url):
    s = urlopen(url).read()
    if s[:2] == '\x1f\x8b':  # assume it's gzipped data
        with GzipFile(mode='rb', fileobj=StringIO(s)) as ifh:
            s = ifh.read()
    return s
for t in range(0, 1):
    s = download('http://www.locationary.com/place/en/US/Arizona/Phoenix-page7/?ACTION_TOKEN=NumericAction')

# Pull the page title and every business-page URL out of the listing page.
findTitle = re.compile('<title>(.*)</title>')
getTitle = re.findall(findTitle, s)
findLoc = re.compile('http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
findLocL = re.findall(findLoc, s)

W, X, XA, Y, YA, Z, ZA = [], [], [], [], [], [], []
for i in range(2, 25):
    print i
    b = download(findLocL[i])
    findYP = re.compile('http://www\.yellowpages\.com/')
    findYPL = re.findall(findYP, b)
    findTitle = re.compile('<title>(.*) \(\d{1,10}.{1,100}\)</title>')
    getTitle = re.findall(findTitle, b)
    findAddress = re.compile('<title>.{1,100}\((.*), .{4,14}, United States\)</title>')
    getAddress = re.findall(findAddress, b)
    # Keep only businesses that have no yellowpages.com link yet, recording
    # the page URL (W), the name (Y) and the address (X) of each.
    if not findYPL:
        if getTitle:
            W.append(findLocL[i])
            Y.append(getTitle)
        if getAddress:
            X.append(getAddress)
sizeWXY = len(W)
def XReplace(text, d):
    # Apply every replacement in d to text, then store the result in XA.
    for (k, v) in d.iteritems():
        text = text.replace(k, v)
    XA.append(text)

def YReplace(text, d):
    # Same as XReplace, but the result goes into YA.
    for (k, v) in d.iteritems():
        text = text.replace(k, v)
    YA.append(text)
# Build URL-friendly copies of the addresses (XA) and names (YA).
for d in range(0, sizeWXY):
    old = str(X[d])
    reps = {' ': '-', ',': '', '\'': '', '[': '', ']': ''}
    XReplace(old, reps)
    old2 = str(Y[d])
    YReplace(old2, reps)
count = 0

# Search yellowpages.com for each business and grab the first result's link.
for e in range(0, sizeWXY):
    newYPL = "http://www.yellowpages.com/" + XA[e] + "/" + YA[e] + "?order=distance"
    v = download(newYPL)
    abc = '<h3 class="business-name fn org">\n<a href="'
    dfe = '" class="no-tracks url "'
    findFinal = re.compile(abc + '(.*)' + dfe)
    getFinal = re.findall(findFinal, v)
    if not getFinal:
        # No search result: drop this business and shift later indexes down.
        W.remove(W[e - count])
        X.remove(X[e - count])
        count = count + 1
    else:
        Z.append(getFinal[0])
# Rebuild XA with just the part of each address before the final comma.
XA = []
for c in range(0, len(X)):
    aGd = re.compile('(.*), .{1,50}')
    bGd = re.findall(aGd, str(X[c]))
    XA.append(bGd)

LenZ = len(Z)
V = []
# Download each yellowpages.com result and compare its street address with
# the expected one; drop anything that doesn't match well enough.
countTwo = 0
for i in range(0, len(W)):
    gda = download(Z[i - countTwo])
    ab = '"street-address">\n'
    cd = '\n</span>'
    ZAddress = re.compile(ab + '(.*)' + cd)
    ZAddress2 = re.findall(ZAddress, gda)
    for b in range(0, len(ZAddress2)):
        if ZAddress2[b]:
            V.append(str(ZAddress2[b]))
    a = str(W[i - countTwo])
    n = str(Z[i - countTwo])
    c = str(XA[i])
    d = str(V[i])
    m = SequenceMatcher(None, c, d)
    if m.ratio() < 0.50:
        Z.remove(Z[i - countTwo])
        W.remove(W[i - countTwo])
        countTwo = countTwo + 1
def ZReplace(text3, dic3):
    # Apply every replacement in dic3 to text3, then store the result in ZA.
    for p, q in dic3.iteritems():
        text3 = text3.replace(p, q)
    ZA.append(text3)

# Percent-encode each yellowpages.com link so it can ride in a query string.
for y in range(0, len(Z)):
    old3 = str(Z[y])
    reps2 = {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'}
    ZReplace(old3, reps2)
# Submit each link to locationary.com. This is the part I want to replace:
# webbrowser opens a Firefox tab per link, and then Firefox has to be killed.
for z in range(0, len(ZA)):
    findPID = re.compile('\d{5,20}')
    getPID = re.findall(findPID, str(W[z]))
    newPID = re.sub("\D", "", str(getPID))
    finalURL = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=" + str(newPID) + "&xxx_c_1_f_987=" + str(ZA[z])
    webbrowser.open(finalURL)
    time.sleep(5)
    os.system("taskkill /F /IM firefox.exe")
Answer 0 (score 0):
You could try pycurl, which lets you interact with web pages without using a browser.
pycurl supports cookie storage (and reuse), form submission (including logging in), and page navigation, all from Python. Your code above already retrieves the resources you want via urllib, so you should be able to adapt it fairly easily with the pycurl tutorial on the pycurl site.
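As a rough sketch of the idea (the login URL and the form field names below are placeholders; check the site's actual login form), a logged-in fetch with pycurl might look like this:

import pycurl
from cStringIO import StringIO

buf = StringIO()
c = pycurl.Curl()
c.setopt(pycurl.COOKIEFILE, 'cookies.txt')  # turns on the cookie engine and reloads saved cookies
c.setopt(pycurl.COOKIEJAR, 'cookies.txt')   # cookies get written back here on close
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.WRITEFUNCTION, buf.write)

# Log in once (placeholder URL and field names).
c.setopt(pycurl.URL, 'http://www.locationary.com/login')
c.setopt(pycurl.POSTFIELDS, 'username=me&password=secret')
c.perform()

# The handle now carries the session cookie, so this request is authenticated.
c.setopt(pycurl.HTTPGET, 1)  # switch back from POST to GET
c.setopt(pycurl.URL, finalURL)
c.perform()
c.close()

Because the same handle is reused for every request, you never have to open a browser tab or kill Firefox afterwards.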
Regarding your comment that you need to handle javascript, please elaborate on the context.
A different but related post here at Stackoverflow works around a javascript issue using pycurl, and it shows the code needed to initiate a pycurl connection.