"See also" web crawler

Time: 2019-08-09 16:35:35

Tags: python json web-crawler

I'm having problems writing a web crawler for Wikipedia. The crawler needs to display the links from the "See also" section of a given page, and then, for every link it sees for the first time, it must also display that page's "See also" section. For example: the "See also" section of https://en.wikipedia.org/wiki/Internet contains https://en.wikipedia.org/wiki/Crowdfunding, and the Crowdfunding page in turn contains https://en.wikipedia.org/wiki/Angel_investor.

That example follows a single link, but a "See also" section has more than 10 links, and that is what I need to handle. I also have to be careful about how I do this. This is what my draft looks like, but it throws errors and doesn't work (it isn't even recursive) :D (A rough sketch of the recursion I'm aiming for is at the end of the post.)

    #Import Libraries
    import time     #For Delay
    import urllib.request    #Extracting web pages
    import re

    #Defining pages
    starting_page = "https://en.wikipedia.org/wiki/Spacetime"
    seed_page = "https://en.wikipedia.org"  #Crawling the English Wikipedia

    #Downloading entire Web Document (Raw Page Content)
    def download_page(url):
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            req = urllib.request.Request(url, headers = headers)
            resp = urllib.request.urlopen(req)
            respData = resp.read().decode('utf-8', errors='ignore')    #Decode the bytes instead of wrapping them in str()
            return respData
        except Exception as e:
            print(str(e))
            return ""       #Return an empty string on failure so callers never get None
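
    #Example usage: raw_html = download_page("https://en.wikipedia.org/wiki/Internet")
    #returns the page HTML as one string, or "" if the request fails.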

    #Extract the "See also" section elements
    def extract_see_also(page):
        if 'id="See_also">' in page:
            start_see_also = page.find('id="See_also">')
            start_list_items = page.find('<li>', start_see_also + 1)
            end_see_also = page.find('<h2>', start_list_items + 1)
            see_also_section = page[start_list_items: end_see_also]
            pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',')
            pure_item_raw2 = pure_item_raw.replace(',,', ',')
            pure_item = pure_item_raw2.replace(',,', ',')
            flag = 0
        else:
            pure_item = "No Related Links"
            flag = 1
        return pure_item, flag
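
    #Example: extract_see_also(download_page(starting_page)) should give back the
    #"See also" item titles as one comma-separated string and flag = 0;
    #flag = 1 means the page has no "See also" section.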

    #Getting all links with the help of 'get_next_link' (sketched below)
    def get_all_links(page):
        links = []
        while True:
            link, end_link = get_next_link(page)
            if link == "no_links":
                break
            else:
                links.append(link)      #Append all the links in the list named 'Links'
                #time.sleep(0.1)
                page = page[end_link:]
        return links 
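
    #get_next_link is referenced above but not defined in my draft; a minimal sketch of
    #what I assume it needs to do (find the next href in the page, or report "no_links"):
    def get_next_link(page):
        start_link = page.find('<a href="')
        if start_link == -1:                    #No more anchors left in this page
            return "no_links", 0
        start_quote = start_link + len('<a href="')
        end_quote = page.find('"', start_quote)
        link = page[start_quote:end_quote]      #The raw href value (may be relative)
        return link, end_quote                  #end_quote tells the caller where to resume scanning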

    #Crawl Initiation
    #Check for file type in URL so crawler does not crawl images and text files
    def extension_scan(url):
        a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
        j = 0
        while j < (len(a)):
            if a[j] in url:
                #print("There!")
                flag2 = 1
                break
            else:
                #print("Not There!")
                flag2 = 0
                j = j+1
        #print(flag2)
        return flag2
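
    #Example: extension_scan("https://upload.wikimedia.org/foo.png") returns 1 (skip it),
    #while a normal article URL such as starting_page returns 0.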

    #URL parsing for incomplete or duplicate URLs
    def url_parse(url):
        try:
            from urllib.parse import urlparse
        except ImportError:
            from urlparse import urlparse
        url = url  #.lower()    #Make it lower case
        s = urlparse(url)       #parse the given url
        seed_page_n = seed_page #.lower()       #Make it lower case
        #t = urlparse(seed_page_n)     #parse the seed page (reference page)
        i = 0
        flag = 0
        while i<=9:
            if url == "/":
                url = seed_page_n
                flag = 0  
            elif not s.scheme:
                url = "http://" + url
                flag = 0
            elif "#" in url:
                url = url[:url.find("#")]
                flag = 0
            elif "?" in url:
                url = url[:url.find("?")]
                flag = 0
            elif s.netloc == "":
                url = seed_page + s.path
                flag = 0
            #elif "www" not in url:
            #    url = "www."[:7] + url[7:]
            #    flag = 0

            elif url[len(url)-1] == "/":
                url = url[:-1]
                flag = 0
            #elif s.netloc != t.netloc:
            #    url = url
            #    flag = 1
            #    break        
            else:
                url = url
                flag = 0
                break

            i = i+1
            s = urlparse(url)   #Parse after every loop to update the values of url parameters
        return(url, flag)
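
    #Example: a relative link such as "/wiki/Crowdfunding" should come back as
    #("https://en.wikipedia.org/wiki/Crowdfunding", 0) once it has been joined onto seed_page.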



    t0 = time.time()
    database = {}   #Create a dictionary

    #Main Crawl function that calls all the above function and crawls the entire site sequentially
    def web_crawl():  
        to_crawl = [starting_page]      #URLs still waiting to be crawled, seeded with the starting page
        #print(to_crawl)
        crawled=[]      #URLs that have already been crawled
        #database = {}   #Create a dictionary
        #k = 0;
        for k in range(0, 3):
            i=0        #Initiate Variable to count No. of Iterations
            while i<3:     #Crawl at most 3 pages per pass
                if len(to_crawl) == 0:      #Stop early: pop(0) on an empty list raises IndexError
                    break
                urll = to_crawl.pop(0)      #Pop out the first URL waiting to be crawled
                urll,flag = url_parse(urll)
                #print(urll)
                flag2 = extension_scan(urll)
                time.sleep(3)

                #If flag = 1, then the URL is outside the seed domain URL
                if flag == 1 or flag2 == 1:
                    pass        #Do Nothing

                else:       
                    if urll in crawled:     #Else check if the URL is already crawled
                        pass        #Do Nothing
                    else:       #If the URL is not already crawled, then crawl it and extract all the links from it
                        print("Link = " + urll)

                        raw_html = download_page(urll)
                        #print(raw_html)


                        see_also,flag2 = extract_see_also(raw_html)
                        print("Related Links = " + see_also)


                        crawled.append(urll)                  

                        #Remove duplicates from to_crawl
                        n = 1
                        j = 0
                        #k = 0
                        while j < (len(to_crawl)-n):
                            if to_crawl[j] in to_crawl[j+1:]:       #Compare against everything after position j
                                to_crawl.pop(j)
                                n = n+1
                            else:
                                pass     #Do Nothing
                            j = j+1
                    i=i+1

                    #print(to_crawl)
                    #print("Iteration No. = " + str(i))
                    #print("To Crawl = " + str(len(to_crawl)))
                    #print("Crawled = " + str(len(crawled)))
        return ""

    print (web_crawl())

    t1 = time.time()
    total_time = t1-t0
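
This is roughly the recursive behaviour I'm trying to get to. It is only a sketch that reuses download_page from the draft above; the depth limit, the 1-second delay and the see_also_links helper (which pulls the actual /wiki/... hrefs out of the "See also" section rather than just the item titles) are my own guesses, not working code I already have:

    #Sketch of the recursion I'm aiming for (depth limit and delay are guesses)
    import re
    import time

    def see_also_links(raw_html):
        #Keep only the hrefs between the "See also" heading and the next <h2>
        start = raw_html.find('id="See_also"')
        if start == -1:
            return []
        end = raw_html.find('<h2>', start)
        section = raw_html[start:end] if end != -1 else raw_html[start:]
        hrefs = re.findall(r'href="(/wiki/[^":#]+)"', section)     #Skips File:, Category: and anchor links
        return ["https://en.wikipedia.org" + h for h in hrefs]

    def crawl_see_also(url, depth, visited):
        if depth == 0 or url in visited:
            return
        visited.add(url)
        for link in see_also_links(download_page(url)):
            print(link)
            time.sleep(1)                   #Be polite to Wikipedia
            crawl_see_also(link, depth - 1, visited)

    #crawl_see_also("https://en.wikipedia.org/wiki/Internet", 2, set())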

0 Answers:

There are no answers