I am trying to generate traffic on the network by opening a large number of websites listed in a text file.
For each site I want to fetch the page and all of its href links, visit each of those links, and then move on to the next site in the text document.
The problem I keep noticing is that these statements take a while to execute, more than 5 seconds per curl request. Is this because I am overusing try/except blocks? I just want to understand where the problem lies. Here are the timestamps the script prints between consecutive requests:
2018-03-14 16:30:32.590135
2018-03-14 16:30:37.653522
2018-03-14 16:30:42.716842
2018-03-14 16:30:47.762127
2018-03-14 16:30:52.809792
2018-03-14 16:30:57.876936
2018-03-14 16:31:02.947123
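To narrow down where each request spends its time, one option is to time a single transfer in isolation using pycurl's per-phase timers. This is a minimal standalone sketch, not part of my script: the URL is only a placeholder, and the response body is written into a BytesIO buffer so nothing gets dumped to the terminal.

import pycurl
from io import BytesIO

buf = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, "http://example.com")   # placeholder URL, not one of the real domains
c.setopt(c.TIMEOUT, 3)
c.setopt(c.FOLLOWLOCATION, True)
c.setopt(c.MAXREDIRS, 5)
c.setopt(c.WRITEDATA, buf)              # keep the response body off stdout
c.perform()
# libcurl records how long each phase of the transfer took
print "DNS lookup: %.3f s" % c.getinfo(pycurl.NAMELOOKUP_TIME)
print "Connect:    %.3f s" % c.getinfo(pycurl.CONNECT_TIME)
print "Total:      %.3f s" % c.getinfo(pycurl.TOTAL_TIME)
c.close()

The full script is below.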
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib2
import pycurl
from io import BytesIO
import os
import re
import sys
import random
from datetime import datetime

links = []
while True:
    with open("topdomains3.txt", "r") as f:
        domains = list(f)
    # pick a random starting point in the domain list on every pass
    joker = random.randint(1, len(domains))
    for i in domains[joker:len(domains)]:
        i = i.replace("\n", "")
        i = i.replace("None", "")
        i = i.rstrip()
        print i
        try:
            c = pycurl.Curl()
            c.setopt(c.URL, i)
            c.setopt(pycurl.TIMEOUT, 3)
            c.setopt(c.FOLLOWLOCATION, True)
            c.setopt(c.MAXREDIRS, 5)
            try:
                # fetch the page with urllib2 and parse it for links
                i = 'http://' + i
                html_page = urllib2.urlopen(i)
                soup = BeautifulSoup(html_page, 'html5lib')
            except Exception as e:
                print e
                continue
            for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
                links.append(link.get('href').replace("u", ""))
            # request every collected link with its own curl handle
            for a in links:
                try:
                    print "----------------------------------------------------------"
                    print str(datetime.now())
                    print a
                    d = pycurl.Curl()
                    #c.setopt(c.VERBOSE, True)
                    d.setopt(d.URL, str(a))
                    #c.setopt(c.WRITEDATA, buffer)
                    d.setopt(d.TIMEOUT, 3)
                    d.setopt(d.FOLLOWLOCATION, True)
                    d.setopt(d.MAXREDIRS, 5)
                    #d.setopt(pycurl.WRITEFUNCTION, lambda x: None)
                    d.perform()
                    d.close()
                except pycurl.error:
                    continue
            c.perform()
            c.close()
        except pycurl.error:
            continue
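As for the try/except question: in CPython, entering a try block when no exception is raised costs almost nothing, so it should not account for seconds per iteration. A quick sanity check with timeit could look like this (the loop bodies are purely illustrative):

import timeit

plain = '''
total = 0
for n in range(1000):
    total += n
'''

wrapped = '''
total = 0
for n in range(1000):
    try:
        total += n
    except Exception:
        pass
'''

# The difference between the two is tiny compared with the
# multi-second gaps in the timestamps above.
print timeit.timeit(plain, number=1000)
print timeit.timeit(wrapped, number=1000)

If that holds, the time is presumably going into the network transfers themselves (DNS lookups, slow hosts running into the 3-second TIMEOUT, and following up to 5 redirects) rather than into the exception handling.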
Any help would be appreciated.