I built a scraper for Pinterest. It works, but it runs very slowly even on well-specced hardware.

The problem
The script runs correctly, but it is very slow and sometimes freezes.

Code - imports and globals
import time, argparse, pandas, datetime
from selenium import webdriver
# global lists for data storage
image_links = []
image_titles = []
image_descriptions = []
pin_links = []
pin_Board_Names = []
pin_Author_Names = []
scrape_timestamps = []
Code - makeCsv
def makeCsv(filename):
    # csv column headers mapped to the data list for each
    data = {"Time Stamps": scrape_timestamps, "Author Names": pin_Author_Names,
            "Pins Board": pin_Board_Names, "Pin Links": pin_links,
            "Image URLs": image_links, "Image Titles": image_titles,
            "Image Descriptions": image_descriptions}
    # build the frame row-wise, then transpose, so lists of unequal
    # length are padded with NaN instead of raising a ValueError
    dataframe = pandas.DataFrame.from_dict(data, orient='index')
    dataframe = dataframe.transpose()
    # save csv file
    dataframe.to_csv(filename, sep=',')
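For reference, a minimal standalone sketch of why the transpose step is there (the list contents are made up for the demo):

import pandas

# lists of unequal length, as happens when some pins lack a description
urls = ["u1", "u2", "u3"]
descriptions = ["d1", "d2"]

# pandas.DataFrame({"URLs": urls, "Descriptions": descriptions}) would raise
# ValueError here; building row-wise and transposing pads with NaN instead
frame = pandas.DataFrame.from_dict(
    {"URLs": urls, "Descriptions": descriptions}, orient='index').transpose()
print(frame)
# roughly:
#   URLs Descriptions
# 0   u1           d1
# 1   u2           d2
# 2   u3          NaN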
Code - grab
def grab(url, filename):
    driver = webdriver.PhantomJS(executable_path='phantomjs.exe')
    print("\n\nGhost Driver Invoked")
    # implicit wait: every element lookup that fails blocks for up to 10 seconds
    driver.implicitly_wait(10)
    driver.get(url)  # load the url
    time.sleep(3)  # seconds
    # scroll to the end of the page (capped at two scrolls by the i == 2 check)
    print("Started Scrolling ...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    i = 0
    while True:
        if i == 2:
            break
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # seconds
        new_height = driver.execute_script("return document.body.scrollHeight")
        print("Scrolled: " + str(new_height))
        if new_height == last_height:
            break
        last_height = new_height
        i += 1
    time.sleep(3)  # seconds
    # collect all pins and count them
    total_pins = []
    try:
        total_pins = driver.find_elements_by_class_name("Grid__Item")
    except:
        print("Unable to load pins")
    print("Total Pins: " + str(len(total_pins)))
    # count the 'see more' toggle buttons - for error checking
    moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
    print("Dynamic Elements: " + str(len(moreButtons)))
    print("Display: Dynamic Elements ...")
    # click every 'See More' button to expose the full description text
    for i, moreButton in enumerate(moreButtons):
        moreButton.click()
        print("Clicked: " + str(i + 1))
    # pin image URLs - one absolute-XPath lookup per pin
    i = 1
    while i <= len(total_pins):  # XPath child indices are 1-based
        try:
            temp_xpath = ("/html/body/div/div[1]/div[1]/div/div/div/div/div[1]"
                          "/div/div/div/div[" + str(i) + "]/div/div/div[1]/a/img")
            temp = driver.find_element_by_xpath(temp_xpath).get_attribute("src")
            image_links.append(temp)
            scrape_timestamps.append(str(datetime.datetime.now()))  # timestamp of scrape
        except:
            # with the 10-second implicit wait, every miss here blocks for 10 s
            pass
        i += 1
    print("Total Images: " + str(len(image_links)))
    # image titles
    imageTitle_Elements = driver.find_elements_by_class_name("PinAttributionTitle__title")
    for imageTitle_Element in imageTitle_Elements:
        image_titles.append(imageTitle_Element.text)
    print("Total Titles: " + str(len(image_titles)))
    # image descriptions
    imageDesc_Elements = driver.find_elements_by_class_name("PinDescription__desc")
    for imageDesc_Element in imageDesc_Elements:
        image_descriptions.append(imageDesc_Element.text)
    print("Total Descriptions: " + str(len(image_descriptions)))
    # pin page links
    i = 1
    while i <= len(total_pins):  # XPath child indices are 1-based
        temp_xpath = ("/html/body/div/div[1]/div[1]/div/div/div/div/div[1]"
                      "/div/div/div/div[" + str(i) + "]/div/div/div[1]/a")
        temp = driver.find_element_by_xpath(temp_xpath).get_attribute("href")
        print("Pin Link: " + str(i))
        pin_links.append(temp)
        i += 1
    print("Total PinLinks: " + str(len(pin_links)))
    # pin board names
    print("Extracting Board Names ... ")
    i = 1
    while i <= len(total_pins):
        successful = False  # reset per pin; set once outside the loop, it masked later failures
        try:
            # layout variant 1: board link sits under a second div block
            temp_xpath = ("/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]"
                          "/div/div/div/div[" + str(i) + "]/div/div/div[2]/div[2]/h4/a[1]")
            temp = driver.find_element_by_xpath(temp_xpath)
            pin_Board_Names.append(temp.text)
            print("Board_No: " + str(i) + " > " + temp.text)
            successful = True
        except:
            try:
                # layout variant 2: single div block
                temp_xpath = ("/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]"
                              "/div/div/div/div[" + str(i) + "]/div/div/div[2]/div/h4/a[1]")
                temp = driver.find_element_by_xpath(temp_xpath)
                pin_Board_Names.append(temp.text)
                print("Board_No: " + str(i) + " > " + temp.text)
                successful = True
            except:
                pass  # an uncaught failure here used to crash the whole run
        if not successful:
            print("Board_No: " + str(i) + " not found!")
        i += 1
    # pin author names
    print("Extracting Author Names ... ")
    i = 1
    while i <= len(total_pins):
        successful = False  # reset per pin, as above
        try:
            # layout variant 1
            temp_xpath = ("/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]"
                          "/div/div/div/div[" + str(i) + "]/div/div/div[2]/div[2]/h4/a[2]")
            temp = driver.find_element_by_xpath(temp_xpath)
            pin_Author_Names.append(temp.text)
            print("Author_No: " + str(i) + " > " + temp.text)
            successful = True
        except:
            try:
                # layout variant 2
                temp_xpath = ("/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]"
                              "/div/div/div/div[" + str(i) + "]/div/div/div[2]/div/h4/a[2]")
                temp = driver.find_element_by_xpath(temp_xpath)
                pin_Author_Names.append(temp.text)
                print("Author_No: " + str(i) + " > " + temp.text)
                successful = True
            except:
                pass
        if not successful:
            print("Author_No: " + str(i) + " not found!")
        i += 1
    # generate csv from collected items
    makeCsv(filename)
    # quit driver
    driver.quit()
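For context on where I think the time goes: every find_element_by_xpath call is a separate round trip to the driver process, and with implicitly_wait(10) each element that turns out to be missing blocks for ten seconds before the except fires. A sketch of the batching I am considering instead, collecting all pin links and image URLs in one execute_script call (the Grid__Item class is the same one used above, but the inner anchor/img structure is a guess and untested against Pinterest's current markup):

def grab_links_batched(driver):
    # one JavaScript round trip instead of one driver call per pin
    rows = driver.execute_script("""
        var out = [];
        var items = document.getElementsByClassName('Grid__Item');
        for (var k = 0; k < items.length; k++) {
            var a = items[k].querySelector('a');
            var img = items[k].querySelector('a img');
            out.push([a ? a.href : null, img ? img.src : null]);
        }
        return out;
    """)
    for href, src in rows:
        pin_links.append(href)
        image_links.append(src)
        scrape_timestamps.append(str(datetime.datetime.now()))

Dropping the implicit wait to something small before the per-pin loops, e.g. driver.implicitly_wait(1), should also cut the worst-case stall for every missing element.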
Code - main
def main():
    # start argument parser
    parser = argparse.ArgumentParser()
    # custom arguments
    parser.add_argument("-url", "--url", help="Enter the URL")
    parser.add_argument("-fname", "--fname", help="Enter Filename")
    # parse arguments
    args = parser.parse_args()
    # all web driver work happens here
    grab(args.url, args.fname)

if __name__ == "__main__":
    main()
Output (slow)
>>> C:\Users\da74\Desktop\sites\pinscraper\ps>python -W ignore scrape.py --url https://www.pinterest.com/vijayakumargana/classic-kuala-lumpur/ --fname classic_kl.csv
Ghost Driver Invoked
Started Scrolling ...
Scrolled: 11124
Scrolled: 18388
Total Pins: 126
Dynamic Elements: 125
Display: Dynamic Elements ...
Clicked: 1
Clicked: 2
Clicked: 3
... (Clicked: 4 through Clicked: 124 omitted) ...
Clicked: 125
After all the 'See More' buttons have been clicked, the script freezes. It also slows down a lot while clicking the buttons. Environment: Python 3.6, Windows 10 x64 Pro, 8 GB RAM, 2 TB HDD.

Can anyone suggest a way to optimize this, and to stop it freezing when there are too many items or a larger page? Thanks.