Question

#!/usr/bin/env python

import threading
import urllib, sys,os
import Queue


concurrent = 200
queue = Queue.Queue(concurrent*2)

try:
    aim = sys.argv[1].lower()
    dic = open(sys.argv[2],'r')

except:
    print "Usage: %s url wordlist" % sys.argv[0]
    sys.exit(1)

class Scanner(threading.Thread):
    def __init__(self,queue):
        threading.Thread.__init__(self)
        self.queue=queue

    def run(self):

        while True:

            self.path = self.queue.get()
            self.geturl = urllib.urlopen(aim+'/'+self.path)
            self.status =  self.geturl.getcode()
            self.url = aim+self.path
            self.result = self.url+'=>'+str(self.status)
            print self.result
            self.writeresult(self.result)
            self.queue.task_done()



    def writeresult(self,result):

        fp = open('result.txt','a+')
        fp.write(result+'\n')
        fp.close()  


def main():         

    for i in range(concurrent):
        t = Scanner(queue)
        t.setDaemon(True)
        t.start()

    for path in dic.readlines():
        queue.put(path.strip())

    queue.join()

if __name__ == '__main__':
    main()

这是一个扫描网站目录的python程序，扫描完成后，它甚至没有退出ctrl + c 我想知道它何时完成扫描如何自动退出程序。

当它正在进行时，它也会出现这样的问题：

Exception in thread Thread-130:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "tt.py", line 28, in run
    self.geturl = urllib.urlopen(aim+'/'+self.path)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 86, in urlopen
    return opener.open(url)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 207, in open
    return getattr(self, name)(url)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 344, in open_http
    h.endheaders(data)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 954, in endheaders
    self._send_output(message_body)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 814, in _send_output
    self.send(msg)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 776, in send
    self.connect()
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 757, in connect
    self.timeout, self.source_address)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known

Answer 1

程序原样，它将在所有线程完成后关闭。但是为了轻松摆脱所有这些错误，在你的函数运行中，从类开始，在执行True：claus之后，将try中的所有内容放在try：except：这样的子句

try:
     code
except:
    pass

它并不是最干净的方式，但考虑到你的目标，它会完成这项工作，并且会让你摆脱这些例外，这意味着某些URL已经超时了。

Answer 2

我想要一些练习，所以我试了一下并改变了很多。它能为您提供全套结果吗？您需要使用原始参数读取替换路径。

使用这些主题，您可能会得到未处理的异常导致结果丢失？我添加了一种机制来捕获读取过程中的任何错误并将其传递给结果编写器。
我想从多个线程追加到文件是可以的，但我添加了一个编写器线程来更干净地管理文件
对自己的大部分作业都是不必要的
如果仍然出现套接字错误，请检查结果文件中的路径，看看如何处理这些结果
我不是专家，所以不要把它作为最佳实践

import threading
import urllib
import Queue

concurrent = 5

aim = 'http://edition.cnn.com'
paths = ['2013/10/12/opinion/kazin-tea-party/index.html?hpt=hp_t5',
         '2013/10/11/opinion/opinion-hay-nobel-opcw/index.html?hpt=hp_t5',
         '2013/10/11/opinion/rosin-women-in-charge/index.html?hpt=hp_t5',
         'some invalid path',
         '2013']  # also an invalid path


def main():
    work_q = Queue.Queue()
    result_q = Queue.Queue()

    # start the scanners and the result writer
    scanners = [Scanner(work_q, result_q) for i in range(concurrent)]
    for s in scanners:
        s.start()
    results_file_path = 'results.txt'
    result_writer = ResultWriter(result_q, 'results.txt')
    result_writer.start()
    # send all the work and wait for it to be completed
    for path in paths:
        work_q.put(path.strip())
    work_q.join()
    # tell everyone to stop
    # you could just kill the threads but you writer needs to close the file
    for s in scanners:
        work_q.put(Scanner.STOP_TOKEN)
    result_q.put(ResultWriter.STOP_TOKEN)  # make sure file gets closed
    # wait for everyone to actually stop
    for s in scanners:
        s.join()
    result_writer.join()
    print 'the scan has finished and results are in {}'.format(results_file_path)


class Scanner(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, work_q, result_q):
        threading.Thread.__init__(self)
        self.work_q = work_q
        self.result_q = result_q

    def run(self):
        while True:
            path = status = None  # reset in case of error
            try:
                try:
                    path = self.work_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if path == self.STOP_TOKEN:
                    break  # stop looking for work
                get_url = urllib.urlopen(aim + '/' + path)
                status = get_url.getcode()
            except Exception as e:
                status = 'unhandled error ({})'.format(e)
            self.result_q.put((path, status))
            self.work_q.task_done()


class ResultWriter(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, result_q, results_file_path):
        threading.Thread.__init__(self)
        self.result_q = result_q
        self.results_file_path = results_file_path

    def run(self):
        with open(self.results_file_path, 'w') as results_file:
            while True:
                try:
                    result = self.result_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if result == self.STOP_TOKEN:
                    break  # stop looking for results
                path, status = result
                results_file.write('{}=>{}\n'.format(path, status))


if __name__ == '__main__':
    main()

所有线程完成后如何退出程序？

2 个答案: