My assignment is due soon, and I have been trying to fix this code for almost a week. I have done my best so far, but it is clearly not enough.
What the code is supposed to do is crawl a list of URLs looking for a specific product string (in this case, whether "nike" appears on each site). If it does, the URL should be saved via open("NikeShoes.txt", "a"); if not, it should do nothing and move on to the next site in the list as quickly as possible.
My problem is that no matter what I do, I cannot get it to save the URL. Instead it saves the string "nike" (which is what I searched for, not what I want as output). The output should be the website the string was found on. Below is the complete code.
import urllib2
import re
import sys
import cookielib
from threading import Timer
from multiprocessing import Process, Queue
class GetResults(Process):
    def __init__(self, rezqueue):
        Process.__init__(self)
        self.rezqueue = rezqueue

    def run(self):
        while True:
            shoe = self.rezqueue.get()
            # None is the shutdown sentinel for this writer process
            if shoe is None: return False
            with open("NikeShoes.txt", "a") as Product:
                Product.write(shoe.rstrip() + "\n")
            print shoe
class Crawler(Process):
    def __init__(self, queue, rezqueue):
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue

    def run(self):
        while True:
            site = self.queue.get()
            # None is the shutdown sentinel put on the queue by __main__
            if site is None: return False
            self.crawl(site)

    def crawl(self, site):
        try:
            WatchIt = Timer(15.0, self.WatchDog)
            WatchIt.start()
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            # one assignment only: assigning addheaders twice overwrites
            # the first list, so the User-Agent was being thrown away
            opener.addheaders = [("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
                                 ("Accept", "*/*"),
                                 ("Accept-Encoding", "")]
            resp = opener.open(site, timeout=10)
            WatchIt.cancel()
            self.getem(resp.read())
        except Exception:
            # ignore timeouts and dead hosts, move on to the next site
            pass

    def getem(self, resp):
        try:
            shoes = re.findall('nike', str(resp))
            CleanProducts = set(shoes)
            for em in CleanProducts:
                # this queues the matched text itself ("nike"); the site
                # URL is never passed into this method
                self.rezqueue.put(em.lower())
        except Exception:
            return False

    def WatchDog(self):
        # Timer callback; note it cannot actually abort opener.open()
        return False
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "\tExample: ", sys.argv[0], "30 dom.txt"
        sys.exit()
    queue = Queue(maxsize=3000)
    rezqueue = Queue()
    ThreadNumber = int(sys.argv[1])
    ThreadList = []
    for i in range(ThreadNumber):
        t = Crawler(queue, rezqueue)
        t.daemon = True
        t.start()
        ThreadList.append(t)
    GR = GetResults(rezqueue)
    GR.daemon = True
    GR.start()
    with open(sys.argv[2], "rU") as urls:
        for url in urls:
            try:
                url = url.rstrip()
                # prepend a scheme if the list entry lacks one
                if not url.startswith('http://'):
                    url = 'http://' + url
                queue.put(url)
            except Exception, e:
                print e
    # one sentinel per crawler so every run() loop can exit
    for i in range(ThreadNumber):
        queue.put(None)
    for Worker in ThreadList:
        Worker.join()
    # the writer needs its own sentinel, otherwise GR.join() blocks forever
    rezqueue.put(None)
    GR.join()
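
To make the intended behavior concrete, here is a minimal, self-contained sketch of what I am after: when the product string occurs in a page, the result queue should receive the site, not the match. (report_if_match is just a stand-in name I made up, the page text is hard-coded, and this assumes crawl would pass site through to the matching step; it is a sketch, not a confirmed fix.)

import re
from multiprocessing import Queue

def report_if_match(site, page_text, rezqueue):
    # queue the SITE when the product string is found, not the match:
    # the URL is what should end up in NikeShoes.txt
    if re.search('nike', page_text):
        rezqueue.put(site)

if __name__ == "__main__":
    rezqueue = Queue()
    report_if_match('http://example.com', '<html>buy nike shoes</html>', rezqueue)
    print rezqueue.get()   # prints: http://example.com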