我正在练习网络爬虫。为此,我给自己一个挑战:获取在 Instagram 上给某个帖子点赞的所有用户的列表。我的问题是,我只能获取到前 11 个点赞者的用户名。在获取点赞列表时,我找不到让页面自动滚动加载的正确方法。
这是我在Jupyter Notebook中的过程(它还不能作为脚本运行):
# Open an Instagram post and collect the usernames of the users who liked it.
# NOTE(review): only the liker links Instagram has already rendered are found;
# the "liked by" dialog lazy-loads, so without scrolling only the first batch
# (~11 users) appears — which is exactly the problem described above.
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')

# Open the "liked by" dialog. click() returns None, so there is no point in
# keeping its result in a variable (the original stored it as userid_element).
driver.find_elements_by_xpath(
    '//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a'
)[0].click()

# Each rendered liker is an <a> whose title attribute is the username.
elems = driver.find_elements_by_xpath("//*[@id]/div/a")
users = [elem.get_attribute('title') for elem in elems]
print(users)
你们有什么主意吗?
非常感谢
答案 0 :(得分:1)
请尝试以下代码,并让我知道是否可行。
# Answer 0: select the liker links directly by their (obfuscated) CSS class
# and print each username as it is collected.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')

# Instagram renders each liker as <a class="FPmhX notranslate TlrDj" title="...">.
# NOTE(review): these class names are build artifacts and change over time.
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")
users = []
for elem in elems:
    # Fetch the attribute once — get_attribute() is a WebDriver round-trip,
    # and the original called it twice per element.
    title = elem.get_attribute('title')
    users.append(title)
    print('Title : ' + title)
print(users)
输出:-
Title : kyliejenner
Title : saturdayshade28
Title : worldmeetzboy
Title : mrokon
Title : addieisaac
Title : addieisaac
Title : amber_doerksen
Title : amber_doerksen
Title : addieisaac
Title : zayn6117
Title : amber_doerksen
Title : amber_doerksen
Title : worldmeetzboy
Title : worldmeetzboy
Title : razvanpopic1301
Title : johanna.trmn
Title : johanna.trmn
Title : johanna.trmn
Title : americ.av
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : worldmeetzboy
Title : enactusepi
Title : enactusepi
[u'kyliejenner', u'saturdayshade28', u'worldmeetzboy', u'mrokon', u'addieisaac', u'addieisaac', u'amber_doerksen', u'amber_doerksen', u'addieisaac', u'zayn6117', u'amber_doerksen', u'amber_doerksen', u'worldmeetzboy', u'worldmeetzboy', u'razvanpopic1301', u'johanna.trmn', u'johanna.trmn', u'johanna.trmn', u'americ.av', u'gabriellcostta1.0', u'gabriellcostta1.0', u'gabriellcostta1.0', u'worldmeetzboy', u'enactusepi', u'enactusepi']
答案 1 :(得分:1)
我猜 Instagram 网站一次最多只渲染 17 个点赞用户的元素。
所以,这是一个循环
import time  # needed for the sleeps below; missing from the original snippet

# Assumes `driver` (a selenium WebDriver) was created earlier, as in the question.
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')

# Open the "liked by" dialog. click() returns None, so don't store its result.
driver.find_elements_by_xpath(
    '//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a'
)[0].click()
time.sleep(2)

# The dialog lazy-loads likers from the server. Scroll the last rendered row
# into view repeatedly; stop when the container's padding-top (which grows as
# rows are appended) no longer changes between iterations.
users = []
container_xpath = "/html/body/div[3]/div/div[2]/div/div"
height = driver.find_element_by_xpath(container_xpath).value_of_css_property("padding-top")
while True:
    last_height = height
    # step 1: grab every liker link currently rendered
    elements = driver.find_elements_by_xpath("//*[@id]/div/a")
    # step 2: record usernames we have not seen yet (fetch the attribute once)
    for element in elements:
        title = element.get_attribute('title')
        if title not in users:
            users.append(title)
    if not elements:
        break  # nothing rendered — avoid IndexError on elements[-1]
    # step 3: scroll the last row into view to trigger loading the next batch
    driver.execute_script("return arguments[0].scrollIntoView();", elements[-1])
    time.sleep(1)
    # step 4: stop once the container stopped growing
    height = driver.find_element_by_xpath(container_xpath).value_of_css_property("padding-top")
    if last_height == height:
        break

print(users)
print(len(users))
driver.quit()
我在近100个喜欢的帖子中进行了测试,并且成功了。
答案 2 :(得分:0)
我无法让前面答案中的代码正常工作。因此,我在下面对其进行了改编,现在每个帖子可以获取约 500 个点赞者。
def get_post_likers(shortcode, max_users=1500):
    """Collect unique usernames of people who liked an Instagram post.

    Opens the post, clicks its "liked by" link, then scrolls the dialog until
    it stops growing or ``max_users`` usernames were collected.

    Args:
        shortcode: the post shortcode (the part after ``/p/`` in the URL).
        max_users: stop once this many usernames were collected; defaults to
            1500, the cap hard-coded in the original version.

    Returns:
        List of username strings in the order they were first seen.
    """
    # NOTE(review): `ch` is a project-local helper that builds the WebDriver —
    # presumably wrapping webdriver.Chrome(); confirm against the caller.
    chrome = ch.initialize()
    chrome.get('https://www.instagram.com/p/' + shortcode + '/')
    chrome.execute_script("window.scrollTo(0, 1080)")
    url = "/p/" + shortcode + "/liked_by/"
    time.sleep(2)
    # Open the likers dialog.
    like_link = chrome.find_element_by_xpath('//a[@href="' + url + '"]')
    like_link.click()
    time.sleep(2)

    users = []
    # padding-bottom of the scrollable container grows as rows are appended;
    # when it stops changing, no new likers were loaded.
    dialog_xpath = "//div[@role = 'dialog']/div[2]/div[1]/div[1]"
    pb = chrome.find_element_by_xpath(dialog_xpath).value_of_css_property("padding-bottom")
    while True:
        last_height = pb
        # step 1: every liker link currently rendered
        elements = chrome.find_elements_by_xpath("//*[@id]/div/a")
        # step 2: record unseen usernames (attribute fetched once per element)
        for element in elements:
            title = element.get_attribute('title')
            if title not in users:
                users.append(title)
        if not elements:
            break  # nothing rendered — avoid IndexError on elements[-1]
        # step 3: scroll the last row into view to trigger the next batch
        chrome.execute_script("return arguments[0].scrollIntoView();", elements[-1])
        time.sleep(1)
        # step 4: stop when the dialog stopped growing or the cap was reached
        pb = chrome.find_element_by_xpath(dialog_xpath).value_of_css_property("padding-bottom")
        if last_height == pb or len(users) >= max_users:
            break
    return users
答案 3 :(得分:0)
这对我有用:
# Answer 3: open the likers dialog, then read rows by their absolute XPath
# position, pressing SPACE every 10th row to scroll the dialog.
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
time.sleep(2)
# Open the liked-by dialog.
# NOTE(review): assigns click()'s return value (None); the variable is unused.
userid_element = driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/div[1]/article/div[3]/section[2]/div/div[2]/button').click()
time.sleep(2)
# NOTE(review): `elems` is collected but never used below.
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")
users = []
for i in range(10):
    # Shifts i to 1..10 so the row index in the XPath below is 1-based.
    i += 1
    if(i%10) == 9 :
        # Focus the dialog and press SPACE to scroll it down.
        # NOTE(review): `actionChain` and `Keys` are not defined in this
        # snippet — they require webdriver.ActionChains(driver) and
        # selenium.webdriver.common.keys.Keys; as posted this line raises
        # NameError when reached.
        driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div').click()
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
    # Debug print of the absolute XPath of the row being read.
    print('/html/body/div[4]/div/div/div[2]/div/div/div['+str(i)+']/div[2]/div[1]/div/a')
    # Row i's liker link; its title attribute holds the username.
    Title = driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div/div/div['+str(i)+']/div[2]/div[1]/div/a').get_attribute('title')
    users.append(Title)
    print('Title : ' + Title)
print(users)
答案 4 :(得分:0)
我尝试了上面所有的解决方案,但没有一个起作用。我认为它们已经过时了。
相反,我自己写了一个。它在 2020 年可以完美运行。
此代码转到“用户名”地址,并在个人资料中获取最新帖子并获得喜欢的用户。
def getPosts():
    """Return the href of every post link currently rendered on the page.

    A "post link" is any anchor whose href contains ``.com/p/``. Relies on a
    module-level ``driver`` (selenium WebDriver) already pointing at a profile.

    Returns:
        List of URL strings in document order.
    """
    # Fetch each href exactly once: get_attribute() is a WebDriver round-trip,
    # and the original evaluated it twice per anchor (filter + keep).
    hrefs = [a.get_attribute('href') for a in driver.find_elements_by_tag_name('a')]
    return [href for href in hrefs if '.com/p/' in href]
def getLikers(username, limit, post=1):
    """Collect profile URLs of users who liked one of ``username``'s posts.

    Visits the profile, opens the post selected by ``post`` (an index into the
    links returned by :func:`getPosts`), opens its likers dialog and pages
    through it with SPACE key presses.

    Args:
        username: Instagram handle whose profile is visited.
        limit: stop once at least this many liker rows were read.
        post: index of the post link to open (default 1).

    Returns:
        List of ``https://www.instagram.com/<name>`` strings. Rows already
        read are re-read after each scroll, so the list may contain duplicates
        (same as the original implementation).
    """
    # Local imports so the function is self-contained; the original relied on
    # module-scope names not shown in this snippet.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.keys import Keys

    driver.get('https://www.instagram.com/' + username)
    time.sleep(1)
    users = []
    # Open the selected post.
    driver.get(getPosts()[post])
    time.sleep(2)
    # Open the likers dialog (button located by its obfuscated CSS class).
    followersLinkX = driver.find_element_by_xpath('//button[@class="sqdOP yWX7d _8A5w5 "]')
    followersLinkX.click()
    time.sleep(1)
    # Focus the scrollable container inside the dialog so SPACE scrolls it.
    xxx = driver.find_element_by_xpath('//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]')
    xxx.click()
    actionChain = webdriver.ActionChains(driver)
    count = 0
    while count < limit:
        # Read rows by 1-based index until one is missing (end of the batch).
        for i in range(1, 1000):
            try:
                row = driver.find_element_by_xpath(
                    '//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]/div[' + str(i) + ']/div[2]/div[1]/div[1]')
            except NoSuchElementException:
                # Narrowed from the original bare `except:`, which would also
                # have swallowed KeyboardInterrupt and real WebDriver errors.
                break
            users.append("https://www.instagram.com/" + row.text)
            count += 1
        # Press SPACE on the focused dialog to load the next batch.
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
        time.sleep(0.5)
    return users
运行示例:likers = getLikers("deirvlon",100,1)