Beautiful Soup - Python - scraper stops in the middle of the job - no error shown

Time: 2019-01-16 15:42:35

Tags: python html web-scraping beautifulsoup web-crawler

I'm new to both Python and web crawlers. This is an assignment I have to complete for a university seminar, but I've run into trouble because I need to scrape a large number of pages and my crawler keeps stopping. It retrieves roughly half of the pages (it gets to about 1,000 out of roughly 2,800 in total) and then stops without showing any error. I don't know how to fix this. Could you help?
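
A likely cause (an assumption on my part, since no traceback is shown) is that `urlopen` is called without a timeout, so a single stalled connection can block forever and the script simply stops making progress. Below is a minimal sketch of a fetch helper with a timeout and retries, using only the standard library; `fetch` and its parameters are illustrative names, not part of the assignment code that follows:

    import socket
    import time
    from urllib.request import Request, urlopen
    from urllib.error import URLError

    def fetch(url, headers, retries=3, timeout=30):
        #hypothetical helper: try the request a few times and raise only after every attempt fails
        for attempt in range(1, retries + 1):
            try:
                req = Request(url, data=None, headers=headers)
                with urlopen(req, timeout=timeout) as response:
                    return response.read()
            except (URLError, socket.timeout) as exc:
                print('attempt', attempt, 'failed for', url, '->', exc)
                time.sleep(5 * attempt) #back off a little longer each time
        raise RuntimeError('giving up on ' + url)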

My code is as follows:

    #import the libraries
    from bs4 import BeautifulSoup
    from urllib.request import Request, urlopen
    import time
    import pandas as pd
    #create the lists to put information into
    name = []
    txhash = []
    age = []
    fr = []
    to = []
    quantity = []
    transaction_code = []
    token_name = []
    last_page = []
    url = []
    #monitor the time when crawl the pages
    start1 = time.time()

    #crawl the main page to get the contract name and the name of the token
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
    #number of pages in the main page is 18, run the loop through all 18 pages to get a list of token codes
    for i_main in range(1, 2): #use range(1, 19) to crawl all 18 pages
        req_main = Request('https://etherscan.io/tokens?p=' + str(i_main), data=None, headers=headers)
        #open one response with a timeout, read it, then close it; a stalled connection now raises instead of hanging silently
        response_main = urlopen(req_main, timeout=30)
        webpage_main = response_main.read()
        response_main.close()
        page_soup_main = BeautifulSoup(webpage_main, 'html.parser')
        table_main = page_soup_main.find_all("div", {"class": "table-responsive"})
        table_link_main = table_main[0].find_all("a")

        #create the loop for each token in a page (about 50 tokens/a page, total: 18 pages, last page only has 2 tokens)
        for i in range(5, 9, 3): #use range(5, len(table_link_main) - 1, 3) to cover every token on the page
            transaction_code.append(table_link_main[i]['href'][7:])
            token_name.append(table_link_main[i].text)


    print('transaction code:', len(transaction_code)) #852
    print('token_name:', len(token_name)) #852

    for i_url in range(0, len(transaction_code)):

        url.append('https://etherscan.io/token/generic-tokentxns2?contractAddress=' + str(transaction_code[i_url]) +'&a=')

    #print(url) 
    print('url:', len(url)) #852
    for i_last_page in range(0, len(url)):
        print(i_last_page)
        req = Request(url[i_last_page], data=None, headers=headers)
        response = urlopen(req, timeout=30)
        webpage = response.read()
        response.close()
        page_soup = BeautifulSoup(webpage, 'html.parser')
        #find results within a table
        table = page_soup.find_all("div", {"class": "table-responsive"})
        #token = token_name[i]
        #find number of the pages
        paging_panel = table[0].find_all("b")
        no_page = paging_panel[1].text
        last_page.append(no_page)   
        time.sleep(2) #pause to avoid getting blocked
    print('last page:', len(last_page)) #852

    end1 = time.time()
    print('time to create all lists (seconds):', end1 - start1)

    #monitor the main crawl
    start2 = time.time()

    for i_url_2 in range(0, len(url)): #start at index 0 so the first token is not skipped
        print(i_url_2)
        token = token_name[i_url_2]
        page_no = int(last_page[i_url_2])
        for i_page_no in range(1, page_no + 1):

            #print the page number to keep track of progress
            print(i_page_no)

            #request and parse each page
            req_loop = Request(url[i_url_2] + '&mode=&p=' + str(i_page_no), data=None, headers=headers)
            response_loop = urlopen(req_loop, timeout=30)
            webpage_loop = response_loop.read()
            response_loop.close()
            page_soup_loop = BeautifulSoup(webpage_loop, 'html.parser')

            #find results within a table in each page
            table_loop = page_soup_loop.find_all("div", {"class": "table-responsive"})
            table_link_loop = table_loop[0].find_all("td")

            #TxHash
            for i_txhash in range(0, len(table_link_loop), 7):
                txhash.append(table_link_loop[i_txhash].text)

            #Age
            for i_age in range(1, len(table_link_loop), 7):
                age.append(table_link_loop[i_age].span['title'])

            #From
            for i_from in range(2, len(table_link_loop), 7):
                fr.append(table_link_loop[i_from].text)

            #To
            for i_to in range(4, len(table_link_loop), 7):
                to.append(table_link_loop[i_to].text)

            #Quantity
            for i_quantity in range(5, len(table_link_loop), 7):
                quantity.append(table_link_loop[i_quantity].text)

            #Name
            for i_name in range(1, len(table_link_loop), 7):
                name.append(token)

            #pause to avoid getting blocked
            time.sleep(2)
        time.sleep(10)

    end2 = time.time()
    print('time to run the main crawl (seconds):', end2 - start2)

    #create dataframe to store the information
    transactions = pd.DataFrame({'Name': name,
                                'TxHash': txhash,
                                'Age': age,
                                'From': fr,
                                'To': to,
                                'Quantity': quantity})
    print(transactions.info())

    #turn the dataframe into a csv file
    transactions.to_csv('transactions.csv')
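
Since the crawl runs for hours, it may also help to append each parsed page to the CSV as soon as it is scraped, so that a stall partway through does not throw away everything collected so far. A rough sketch follows; the file name and the per-page lists are placeholders, not variables from the script above:

    import csv

    def append_rows(path, rows):
        #append one batch of rows so partial progress survives a crash or stall
        with open(path, 'a', newline='') as f:
            csv.writer(f).writerows(rows)

    #example call, placed after parsing one page:
    #append_rows('transactions_partial.csv',
    #            zip(page_names, page_txhash, page_age, page_fr, page_to, page_quantity))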

0 Answers:

No answers