我正在尝试从sharechat.com抓取帖子数据(喜欢,共享,图像等),但是问题是我无法使用Selenium找到帖子的图像URL,因为我怀疑它使用Javascript来填充它。
我尝试用Selenium获取最外层的HTML(即页面实际显示的HTML),从中可以得到帖子的其他信息,例如点赞、分享、评论的数量等,但由于找不到图像的URL,我无法保存图像。
我正在这样做,目的是进行社交网络研究,以进行情感分析和推荐趋势,因此我希望将帖子数据以及标签和喜欢,分享的数量以及其他信息一起抓取。我只是在抓取标签和图像的URL失败
这里是运行所需的geckodriver文件。
这是我的代码:
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
files = "dataset_link_1.txt"
# Raw string so the backslashes in the Windows path are taken literally.
GECKODRIVER_PATH = r'D:\CHIT CHAT\Scrapper\geckodriver'
# Seconds to pause after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 0.5
# Absolute XPath of the i-th post card; each field XPath below is appended to it.
POST_XPATH = "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div"
# (label written to the output file, XPath suffix relative to POST_XPATH)
POST_FIELDS = (
    ("Total No of views", "/a/div[3]/div[1]"),
    ("Title", "/a/div[1]/div[1]/span"),
    ("Writer's Bio", "/div[1]/a/div[2]/div/div[2]"),
    ("Writer's Name", "/div[1]/a/div[2]/div/div[1]/strong"),
    ("Total Comments", "/div[2]/div/button[2]/div/span"),
    ("Whatsapp Share", "/div[2]/div/button[1]/div/span"),
)


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the page down until its height stops growing.

    Args:
        driver: object exposing ``execute_script(script)``.
        pause: seconds to sleep after each scroll before re-measuring.

    Returns:
        The final ``document.body.scrollHeight`` reported by the page.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return new_height
        last_height = new_height


def main():
    """Scrape the first 19 trending posts and write their fields to ``files``.

    Each field is printed and written as ``<label>:\\n<value>\\n``. The output
    file is opened as UTF-8 text and the element text is written as ``str``,
    so non-ASCII (e.g. Hindi) content is stored as text rather than as a
    ``b'...'`` repr. The browser and the file are released even when an
    element lookup raises.
    """
    with open(files, 'w', encoding='utf-8') as enter:
        driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)
        try:
            driver.maximize_window()
            driver.get(serviceurl)
            # Wait up to 3 seconds for an element to appear before failing.
            driver.implicitly_wait(3)
            # Sleep between polls instead of spinning on the CPU.
            while driver.execute_script("return document.readyState") != 'complete':
                time.sleep(0.1)
            for i in range(1, 20):
                # Scrolling makes the page append further posts to the feed.
                scroll_to_bottom(driver)
                post_xpath = POST_XPATH % i
                for label, suffix in POST_FIELDS:
                    value = driver.find_element_by_xpath(post_xpath + suffix).text
                    print(value)
                    enter.write("%s:\n%s\n" % (label, value))
                print()
                # The script never navigates away from the feed, so there is
                # no page to go back to between posts.
        finally:
            driver.quit()


if __name__ == "__main__":
    main()
答案 0 :(得分:2)
这是重构后的代码。最后添加了标签和图像的抓取逻辑。
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
files = "dataset_link_1.txt"
GECKODRIVER_PATH = r'D:\CHIT CHAT\Scrapper\geckodriver'
# Seconds to pause after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 0.5
# Matches every post card in the feed.
CARD_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"
# (label written to the output file, XPath relative to the card,
#  attribute to read, or None to read the element's visible text)
CARD_FIELDS = (
    ("Total No of views", ".//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title", ".//span[contains(@class,'darkText')]", None),
    ("Writer's Bio", ".//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]", None),
    ("Writer Name", ".//strong", None),
    ("Number of comments", ".//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share", ".//button[@aria-label='Click to share']//span", None),
    ("Tags", ".//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link", ".//img", "src"),
    ("post image link", ".//a[@class='D(b)']", "href"),
)


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the page down until its height stops growing.

    Args:
        driver: object exposing ``execute_script(script)``.
        pause: seconds to sleep after each scroll before re-measuring.

    Returns:
        The final ``document.body.scrollHeight`` reported by the page.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return new_height
        last_height = new_height


def read_field(card, xpath, attribute=None):
    """Return one field of a post card.

    Args:
        card: element exposing ``find_element_by_xpath``.
        xpath: XPath evaluated relative to ``card``.
        attribute: attribute name to read; ``None`` reads the visible text.
    """
    element = card.find_element_by_xpath(xpath)
    if attribute is None:
        return element.text
    return element.get_attribute(attribute)


def main():
    """Scrape every loaded feed card and write its fields to ``files``.

    Each field is printed and written as ``<label>:\\n<value>\\n``. The output
    file is opened as UTF-8 text and values are written as ``str``, so
    non-ASCII content is stored as text rather than as a ``b'...'`` repr.
    The browser and the file are released even when a lookup raises.
    """
    with open(files, 'w', encoding='utf-8') as enter:
        driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)
        try:
            driver.maximize_window()
            driver.get(serviceurl)
            # Wait up to 3 seconds for an element to appear before failing.
            driver.implicitly_wait(3)
            # Sleep between polls instead of spinning on the CPU.
            while driver.execute_script("return document.readyState") != 'complete':
                time.sleep(0.1)
            # Repeated scroll rounds give the feed a chance to load more posts.
            for _ in range(1, 20):
                scroll_to_bottom(driver)
            # Query each field relative to its card instead of re-running an
            # indexed document-wide XPath for every field of every card.
            for card in driver.find_elements_by_xpath(CARD_XPATH):
                for label, xpath, attribute in CARD_FIELDS:
                    value = read_field(card, xpath, attribute)
                    print(value)
                    enter.write("%s:\n%s\n" % (label, value))
        finally:
            driver.quit()


if __name__ == "__main__":
    main()
如果您尝试将文件下载到其他文件夹。使用以下代码。
# Firefox profile whose preferences control where and how downloads are saved.
profile = webdriver.FirefoxProfile()
download_preferences = (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    # Placeholder: replace with the target download folder.
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
)
for preference_name, preference_value in download_preferences:
    profile.set_preference(preference_name, preference_value)
下载文件后,只需使用以下命令将文件重命名为所需名称即可。
os.rename(download_file_name,desired_name) # both arguments may be bare file names or include a path
答案 1 :(得分:0)
我更改了Web驱动程序路径和range变量。如果您创建文件夹C:\Py,则下面的代码将输出一个名为PageSource_StackOverflowQ2.txt的文本文件,其中包含图像的src路径。
我在HTML中遇到了很多与二进制字符有关的问题,因此可能有更好的方法来完成此操作,但希望这可以帮助您达到目的。
如果图像路径后面连续出现这9个字符(" title="),我的代码将在该处截断图像路径。
import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
serviceurl = 'https://sharechat.com/trending/Hindi'
DATASET_PATH = 'C:\\Py\\dataset_link_1.txt'
# Raw string with single backslashes; doubling them inside r'' would put two
# backslashes between every path component.
GECKODRIVER_PATH = r'C:\Py\geckodriver.exe'
RAW_SOURCE_PATH = 'C:\\Py\\PageSource_StackOverflowQ.txt'
ASCII_SOURCE_PATH = 'C:\\Py\\PageSource_StackOverflowQ1.txt'
IMAGE_LIST_PATH = 'C:\\Py\\PageSource_StackOverflowQ2.txt'
# Seconds to pause after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 0.5
# Absolute XPath of the i-th post card; each field XPath below is appended to it.
POST_XPATH = "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div"
# (label written to the output file, XPath suffix relative to POST_XPATH)
POST_FIELDS = (
    ("Total No of views", "/a/div[3]/div[1]"),
    ("Title", "/a/div[1]/div[1]/span"),
    ("Writer's Bio", "/div[1]/a/div[2]/div/div[2]"),
    ("Writer's Name", "/div[1]/a/div[2]/div/div[1]/strong"),
    ("Total Comments", "/div[2]/div/button[2]/div/span"),
    ("Whatsapp Share", "/div[2]/div/button[1]/div/span"),
)
IMG_PREFIX = 'img src="'
TITLE_MARKER = '" title="'


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the page down until its height stops growing.

    Args:
        driver: object exposing ``execute_script(script)``.
        pause: seconds to sleep after each scroll before re-measuring.

    Returns:
        The final ``document.body.scrollHeight`` reported by the page.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return new_height
        last_height = new_height


def extract_image_sources(lines):
    """Yield the ``src`` value of every titled ``<img>`` tag in ``lines``.

    Each line is split on ``"><"`` so that a ``title`` attribute belonging to
    a later tag is never paired with an earlier image. Within a fragment, the
    text between ``img src="`` and the following ``" title="`` is yielded.
    Images without a ``title`` attribute right after ``src`` are skipped.

    Args:
        lines: iterable of HTML text lines.

    Yields:
        Image source strings, in document order.
    """
    for line in lines:
        for fragment in line.split("><"):
            start = fragment.find(IMG_PREFIX)
            while start != -1:
                # Locate the prefix instead of assuming it sits at offset 0.
                begin = start + len(IMG_PREFIX)
                end = fragment.find(TITLE_MARKER, begin)
                if end == -1:
                    break
                yield fragment[begin:end]
                start = fragment.find(IMG_PREFIX, end)


def dump_image_sources(page_source):
    """Write the page source and the image URLs found in it to disk.

    The UTF-8 page source is appended to ``RAW_SOURCE_PATH``; an ASCII-only
    copy of that whole file replaces ``ASCII_SOURCE_PATH``; the image sources
    extracted from the ASCII copy are printed and written, one per line, to
    ``IMAGE_LIST_PATH``.
    """
    with open(RAW_SOURCE_PATH, 'ab') as raw:
        raw.write(page_source.encode("utf-8"))
    with open(RAW_SOURCE_PATH, 'rb') as raw, open(ASCII_SOURCE_PATH, 'w') as ascii_copy:
        for line in raw:
            # Drop non-ASCII bytes so the copy can be read with any encoding.
            ascii_copy.write(line.decode('ascii', errors='ignore'))
    with open(ASCII_SOURCE_PATH) as ascii_copy, open(IMAGE_LIST_PATH, 'w') as image_list:
        for source in extract_image_sources(ascii_copy):
            image_list.write(source + '\n')
            print(source)


def main():
    """Scrape the first trending post, then dump every image URL on the page.

    Post fields are printed and written to ``DATASET_PATH`` as
    ``<label>:\\n<value>\\n`` in UTF-8 text. The browser and the file are
    released even when an element lookup raises.
    """
    with open(DATASET_PATH, 'w+', encoding='utf-8') as enter:
        driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)
        try:
            driver.maximize_window()
            driver.get(serviceurl)
            # Wait up to 3 seconds for an element to appear before failing.
            driver.implicitly_wait(3)
            # Sleep between polls instead of spinning on the CPU.
            while driver.execute_script("return document.readyState") != 'complete':
                time.sleep(0.1)
            # Only the first post is scraped; widen the range for more.
            for i in range(1, 2):
                scroll_to_bottom(driver)
                post_xpath = POST_XPATH % i
                for label, suffix in POST_FIELDS:
                    value = driver.find_element_by_xpath(post_xpath + suffix).text
                    print(value)
                    enter.write("%s:\n%s\n" % (label, value))
                print()
            # One dump after scraping: the page source holds every loaded post.
            dump_image_sources(driver.page_source)
        finally:
            driver.quit()


if __name__ == "__main__":
    main()