I'm new to both Python and web crawlers. This is an assignment I have to complete for a uni seminar, but I've run into trouble: I need to scrape a large number of pages, and my crawler keeps stopping. It retrieves roughly half of the pages (it reaches about 1,000, but the total is around 2,800) and then stops without showing any error, and I have no idea how to fix it. Can anyone help?
My code is below:
#import the libraries
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time
import pandas as pd
#create the lists to put information into
name = []
txhash = []
age = []
fr = []
to = []
quantity = []
transaction_code = []
token_name = []
last_page = []
url = []
#record the time when the crawl starts
start1 = time.time()
#crawl the main token listing pages to get the contract address and the name of each token
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
#the main listing has 18 pages; loop through all of them to get a list of token codes
for i_main in range(1, 2):  # use range(1, 19) for all 18 pages
    req_main = Request('https://etherscan.io/tokens?p=' + str(i_main), data=None, headers=headers)
    with urlopen(req_main) as response_main:
        webpage_main = response_main.read()
    page_soup_main = BeautifulSoup(webpage_main, 'html.parser')
    table_main = page_soup_main.find_all("div", {"class": "table-responsive"})
    table_link_main = table_main[0].find_all("a")
    #loop over each token on the page (about 50 tokens per page over 18 pages; the last page only has 2 tokens)
    for i in range(5, 9, 3):  # use range(5, len(table_link_main) - 1, 3) for all tokens
        transaction_code.append(table_link_main[i]['href'][7:])
        token_name.append(table_link_main[i].text)
print('transaction code:', len(transaction_code)) #852
print('token_name:', len(token_name)) #852
for i_url in range(0, len(transaction_code)):
    url.append('https://etherscan.io/token/generic-tokentxns2?contractAddress=' + str(transaction_code[i_url]) + '&a=')
#print(url)
print('url:', len(url)) #852
for i_last_page in range(0, len(url)):
    print(i_last_page)
    req = Request(url[i_last_page], data=None, headers=headers)
    with urlopen(req) as response:
        webpage = response.read()
    page_soup = BeautifulSoup(webpage, 'html.parser')
    #find results within a table
    table = page_soup.find_all("div", {"class": "table-responsive"})
    #token = token_name[i]
    #find the number of transaction pages for this token
    paging_panel = table[0].find_all("b")
    no_page = paging_panel[1].text
    last_page.append(no_page)
    time.sleep(2)  #pause to avoid getting blocked
print('last page:', len(last_page)) #852
end1 = time.time()
print('time to create all lists (seconds):', end1 - start1)
#monitor the main crawl
start2 = time.time()
for i_url_2 in range(1, len(url)):
    print(i_url_2)
    token = token_name[i_url_2]
    page_no = int(last_page[i_url_2])
    for i_page_no in range(1, page_no + 1):
        #print the page number to keep track of where we are
        print(i_page_no)
        #parse each page
        req_loop = Request(url[i_url_2] + '&mode=&p=' + str(i_page_no), data=None, headers=headers)
        with urlopen(req_loop) as response_loop:
            webpage_loop = response_loop.read()
        page_soup_loop = BeautifulSoup(webpage_loop, 'html.parser')
        #find results within a table on each page
        table_loop = page_soup_loop.find_all("div", {"class": "table-responsive"})
        table_link_loop = table_loop[0].find_all("td")
        #each transaction row has 7 <td> cells, so step through them in strides of 7
        #TxHash
        for i_txhash in range(0, len(table_link_loop), 7):
            txhash.append(table_link_loop[i_txhash].text)
        #Age
        for i_age in range(1, len(table_link_loop), 7):
            age.append(table_link_loop[i_age].span['title'])
        #From
        for i_from in range(2, len(table_link_loop), 7):
            fr.append(table_link_loop[i_from].text)
        #To
        for i_to in range(4, len(table_link_loop), 7):
            to.append(table_link_loop[i_to].text)
        #Quantity
        for i_quantity in range(5, len(table_link_loop), 7):
            quantity.append(table_link_loop[i_quantity].text)
        #Name (append the token name once per row)
        for i_name in range(1, len(table_link_loop), 7):
            name.append(token)
        #pause to avoid getting blocked
        time.sleep(2)
    time.sleep(10)
end2 = time.time()
print('time to run the main crawl (seconds):', end2 - start2)
#create a dataframe to store the information
transactions = pd.DataFrame({'Name': name,
                             'TxHash': txhash,
                             'Age': age,
                             'From': fr,
                             'To': to,
                             'Quantity': quantity})
print(transactions.info())
#turn the dataframe into a csv file
transactions.to_csv('transactions.csv')
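For what it's worth, here is a rough sketch of how I thought I might wrap the page download in a helper with a timeout and a couple of retries, so a stalled connection would at least print something instead of the script just sitting there (fetch_page, MAX_RETRIES and TIMEOUT are names and values I made up, not part of the assignment). I haven't wired it into the loops above yet, so I don't know if it actually helps:

#a minimal sketch of a fetch helper with a timeout and simple retries
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import time

MAX_RETRIES = 3   #how many times to retry a page before giving up (my guess)
TIMEOUT = 30      #seconds; without a timeout, urlopen can wait indefinitely on a stalled connection

def fetch_page(page_url, headers):
    #try the request a few times, backing off a bit longer after each failure
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            req = Request(page_url, data=None, headers=headers)
            with urlopen(req, timeout=TIMEOUT) as response:
                return response.read()
        except (HTTPError, URLError) as err:
            print('attempt', attempt, 'failed for', page_url, ':', err)
            time.sleep(5 * attempt)
    return None  #the caller would have to skip the page if this returns None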