Find a specific word on a web page and save the URL

Asked: 2019-07-14 08:02:43

Tags: python

My assignment is due soon and I have been trying to fix this code for almost a week. I have done my best so far, but it is clearly not enough.

What the code is supposed to do is work through a list of URLs and search each site for a specific product string (in this case, whether "nike" appears on the page). If it does, the URL should be appended to the output file opened with open("NikeShoes.txt", "a"); if not, the script should do nothing and move on to the next site in the list as quickly as possible.

My problem is that no matter what I do, it never saves the URL. Instead it saves the string "nike", which is what I searched for, not what I want as output; the output should be the websites on which that string was found. A minimal sketch of the behaviour I am after is below, followed by my full code.
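To make the goal concrete, here is a minimal single-threaded sketch of what I am trying to achieve (Python 2, like the rest of the post; check_site is just an illustrative name, not part of my program):

import urllib2

def check_site(url):
    # Fetch the page; skip the site entirely if it cannot be reached.
    try:
        html = urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return
    # Save the URL (not the keyword) when the keyword appears in the body.
    if 'nike' in html.lower():
        with open("NikeShoes.txt", "a") as out:
            out.write(url + "\n")

And here is the full code: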

import urllib2
import re
import sys
import cookielib
from threading import Timer
from multiprocessing import Process, Queue

class GetResults(Process):
    """Consumer process: appends every result from rezqueue to the output file."""
    def __init__(self, rezqueue):
        Process.__init__(self)
        self.rezqueue = rezqueue

    def run(self):
        while True:
            shoe = self.rezqueue.get()
            if shoe is None:  # sentinel: no more results
                return False
            with open("NikeShoes.txt", "a") as Product:
                Product.write(shoe.rstrip() + "\n")
            print shoe

class Crawler(Process):
    """Worker process: pulls a site from the queue and scans it for the keyword."""
    def __init__(self, queue, rezqueue):
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue

    def run(self):
        while True:
            site = self.queue.get()
            if site is None:  # sentinel: no more URLs
                return False
            self.crawl(site)


    def crawl(self, site):
        try:
            # Watchdog: fires after 15 seconds if the request hangs.
            WatchIt = Timer(15.0, self.WatchDog)
            WatchIt.start()

            # Fetch the page with cookie handling and a browser User-Agent.
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            opener.addheaders = [("Accept", "*"),
                                 ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
                                 ("Content-Type", "text/html; charset=utf-8"),
                                 ("Accept-Encoding", "")]
            resp = opener.open(site, timeout=10)
            WatchIt.cancel()
            self.getem(resp.read())

        except Exception, e:
            # print e
            pass  # ignore sites that fail or time out

    def getem(self, resp):
        try:
            # Collect every occurrence of the keyword in the page body...
            shoes = re.findall('nike', str(resp))
            CleanProducts = set(shoes)  # ...and de-duplicate the matches
            for em in CleanProducts:
                self.rezqueue.put(em.lower())
        except Exception, e:
            return False

    def WatchDog(self):
        # Called by the Timer when a request takes longer than 15 seconds.
        return False

if __name__ == "__main__":

    if len(sys.argv) < 3:
        # expects: <number of workers> <file with URLs>
        print "\tExample: ", sys.argv[0], "30 dom.txt"
        sys.exit()

    queue = Queue(maxsize=3000)
    rezqueue = Queue()
    ThreadNumber = int(sys.argv[1])
    ThreadList = []

    # Despite the "Thread" names, these workers are separate processes.
    for i in range(ThreadNumber):
        t = Crawler(queue,rezqueue)
        t.daemon = True
        t.start()
        ThreadList.append(t)

    GR = GetResults(rezqueue)
    GR.daemon = True
    GR.start()

    with open(sys.argv[2], "rU") as urls:
        for url in urls:
            try:
                url = url.rstrip()
                # Ensure every entry has a scheme before it is queued.
                if not url.startswith('http://'):
                    url = 'http://' + url
                queue.put(url)
            except Exception, e:
                print e

    # One sentinel per worker so each Crawler's run() loop can exit.
    for i in range(ThreadNumber):
        queue.put(None)

    for Worker in ThreadList:
        Worker.join()

    # All crawlers are done; tell the results writer to stop as well.
    rezqueue.put(None)
    GR.join()
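If I read my own code right, the URL is already gone by the time getem runs: crawl only hands it resp.read(), so the only thing that can ever reach rezqueue is the matched text itself. My current guess at a fix (untested, so I may be wrong) is to pass site through and queue the URL instead of the match, roughly like this:

    def crawl(self, site):
        # ... unchanged up to the last line, which also hands over the URL:
        self.getem(site, resp.read())

    def getem(self, site, resp):
        try:
            # Queue the URL once if the keyword occurs anywhere in the page.
            if re.search('nike', str(resp), re.IGNORECASE):
                self.rezqueue.put(site)
        except Exception, e:
            return False

If that is right, GetResults should then write URLs into NikeShoes.txt without any further changes. Is that the correct way to do it, or am I missing something else?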

0 Answers:

No answers yet.