我想抓取 Facebook 搜索结果中通过 ajax(页面滚动时)加载的帖子。为此我使用了 Grab 和 requests 这两个库。
使用 Grab 登录授权是正常的;随后我尝试模拟浏览器在页面滚动时生成、并通过 POST 发送的请求,以加载下一批结果。但响应的 HTML 中出现错误:"抱歉,您的请求无法加载"
下面是代码(代码很长,因为 Facebook 的请求数据非常庞大)。
import json

import requests
from grab import Grab
# from bs4 import BeautifulSoup
class FacebookScraper(object):
    """Replay the POST request a browser sends when a search page is scrolled.

    ``self.data`` holds the request body captured from a browser session.
    ``scrape_posts`` serialises it to JSON and POSTs it once per page,
    printing each raw response body.
    """

    # Search page that every pagination request is sent to.
    _SEARCH_URL = 'https://www.facebook.com/search/top/?init=quick&q=%D0%BB%D0%B0%D0%B2%D0%B0%D0%BB%D1%8C%20%D0%BF%D0%BE%D1%82%D0%BE%D0%BA&tas=0.715880302462869'

    def __init__(self):
        """Build the captured request body used for every page request."""
        # ``encoded_query`` is itself JSON text nested three levels deep
        # (query -> intent_data -> annotated_string).  It must stay valid
        # JSON, so it uses null/true/false rather than the Python spellings
        # None/True/False, and every escape sequence is kept in one piece.
        encoded_query = (
            "{\"bqf\":\"keywords_top(\\u0025D0\\u0025BB\\u0025D0\\u0025B0\\u0025D0\\u0025B2\\u0025D0\\u0025B0\\u0025D0\\u0025BB\\u0025D1\\u00258C+\\u0025D0\\u0025BF\\u0025D0\\u0025BE\\u0025D1\\u002582\\u0025D0\\u0025BE\\u0025D0\\u0025BA)\","
            "\"vertical\":\"content\",\"post_search_vertical\":null,"
            "\"intent_data\":\"{\\\"intent\\\":\\\"posts\\\",\\\"entity_id\\\":null,\\\"sub_intents\\\":{\\\"user\\\":true},"
            "\\\"user_confidence\\\":0.2384902536869,\\\"typeahead_user_confidence\\\":0.69785696268082,\\\"quel_topics\\\":[],"
            "\\\"multi_label_intents\\\":["
            "{\\\"value\\\":true,\\\"confidence\\\":0.0113326292485},"
            "{\\\"value\\\":true,\\\"confidence\\\":6.7988303271704e-6},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0008767499239184},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.025693353265524},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.010204718448222},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0020656401757151},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.01333892159164},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0087564773857594},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0016323747113347},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0084763253107667},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.015831520780921},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.004807879216969},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.058732055127621},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.033020552247763},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0066938307136297},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0071515664458275},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.57516884803772},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.21925939619541},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0026327867526561},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0046313949860632},"
            "{\\\"value\\\":true,\\\"confidence\\\":0.0015605170046911}"
            "],"
            "\\\"annotated_string\\\":\\\"{\\\\\\\"entities\\\\\\\":[],\\\\\\\"segments\\\\\\\":[{\\\\\\\"type\\\\\\\":\\\\\\\"\\\\\\\\u003Cusername>\\\\\\\","
            "\\\\\\\"tokens\\\\\\\":\\\\\\\"\\\\\\\\u043b\\\\\\\\u0430\\\\\\\\u0432\\\\\\\\u0430\\\\\\\\u043b\\\\\\\\u044c \\\\\\\\u043f\\\\\\\\u043e\\\\\\\\u0442\\\\\\\\u043e\\\\\\\\u043a\\\\\\\"}]}\\\"}\","
            "\"filters\":[],\"has_chrono_sort\":false,\"query_analysis\":\"\",\"subrequest_disabled\":false}"
        )
        self.data = {
            "view": "list",
            "encoded_query": encoded_query,
            "encoded_title": "WyJcdTAwMjVEMFx1MDAyNUJCXHUwMDI1RDBcdTAwMjVCMFx1MDAyNUQwXHUwMDI1QjJcdTAwMjVEMFx1MDAyNUIwXHUwMDI1RDBcdTAwMjVCQlx1MDAyNUQxXHUwMDI1OEMrXHUwMDI1RDBcdTAwMjVCRlx1MDAyNUQwXHUwMDI1QkVcdTAwMjVEMVx1MDAyNTgyXHUwMDI1RDBcdTAwMjVCRVx1MDAyNUQwXHUwMDI1QkEiXQ",
            "ref": "unknown",
            "logger_source": "www_main",
            "typeahead_sid": "",
            "tl_log": False,
            "impression_id": "ed9cde39",
            "filter_ids": {
                "1597273353820386:1674330276114693:0": "1597273353820386:1674330276114693:0",
            },
            "experience_type": "grammar",
            "exclude_ids": None,
            "browse_location": "",
            "trending_source": None,
            "reaction_surface": None,
            "reaction_session_id": None,
            "ref_path": "/search/top/",
            "is_trending": False,
            "topic_id": None,
            "place_id": None,
            "story_id": None,
            "callsite": "browse_ui:init_result_set",
            "has_top_pagelet": True,
            "display_params": {
                "mrss": True,
            },
            "cursor": "Abp0rWq8oGjiUWYmIeN9rZCuAXTubQ9beY7hyGlZpau3ekEF9BSelQe85TthKUvpoNEU65kQorg_Fya3xI47JPWGoJ-lNMS4JiOobTZ0Q3tP0HD1Z5JZiCLbT10PwrRNibnv-TKdTBhwtfMvqR816Hd9vIHPRxCBmT5lPfrVZC7f0ohVLeSCKFEYmP-47IiDsWW1YynB8Yqr_54b7iIQWB4uixrp5Zm5AHrCilxpqGtp9ye5Y2nKCyK8UtMkzzQ11CTfidmPmDMPQgU3rFroTvpUc96QGvfX1pNjNw3sn-CFyn5TLq_0mX_jOVP8BzfRP7qhnmcmjs3Rf2_l22Q9C-gQGqSwDUqE63XszHUYqEW5e-KtU_3Hcpb3OT0MJyZ7EQI",
            "page_number": 2,
            "em": False,
            "mr": False,
            "tr": None,
        }

    def scrape(self):
        """Run ``scrape_posts`` and print every post it returns."""
        posts = self.scrape_posts()
        for post in posts:
            print ("Фейсбучный пост ", post)

    def scrape_posts(self, max_pages=4):
        """Request result pages 2 .. ``max_pages - 1`` and print each response.

        Args:
            max_pages: exclusive upper bound on the page number requested.

        Returns:
            The list of collected posts.  Nothing is parsed out of the
            responses yet, so the list is currently always empty.
        """
        posts = []
        pageno = 2
        self.data['page_number'] = pageno
        while pageno < max_pages:
            # NOTE: only ``page_number`` changes between iterations; the
            # ``cursor`` in ``self.data`` stays at its captured value.
            payload = {
                'data': json.dumps(self.data),
                '__dpr': 1,
                '__user': 100012747062538,
                '__a': 1,
                '__dyn': '7AmajEzUGByA5Q9UoGya4A5EWq2WiWF3oyfirWo8popyUWdwIhEoyUnwgUat0Hx24UJi28y2GAUW49XDG4XzFE8ouwh9VobrxCFEW2PxOcxu5pUaE88C9z9oybx24oqyUsx-u6vU',
                '__req': 8,
                '__be': -1,
                '__pc': 'PHASED:DEFAULT',
                '__rev': 2467498,
            }
            # Cookies come from the file written by the login step.
            g = Grab(url=self._SEARCH_URL, cookiefile='cookies.html', post=payload)
            g.go(url=self._SEARCH_URL)
            r = g.response.unicode_body()
            print ("---------------------- ↓r ", pageno , "↓ ------------------")
            print(r)
            print ("---------------------- ↑r ", pageno , "↑ ------------------")
            # Next page
            pageno += 1
            self.data['page_number'] = pageno
        return posts
def grab_demo(request, email="your_email_here", password="your_password_here"):
    """Log in to Facebook with Grab, save the cookies, then run the scraper.

    Args:
        request: not used by this function.
            NOTE(review): presumably a web-framework request object —
            confirm against the caller.
        email: value submitted in the login form's ``email`` field.
        password: value submitted in the login form's ``pass`` field.
    """
    # Log in to Facebook
    grab = Grab(log_file='out.html')
    grab.go('https://www.facebook.com/')
    grab.set_input("email", email)
    grab.set_input("pass", password)
    grab.submit()
    login_body = grab.response.unicode_body()
    # Persist the session cookies to the file the scraper loads them from.
    grab.dump_cookies('cookies.html')
    print ("---------------------- Login done ------------------")
    print(login_body)
    # Login done
    scraper = FacebookScraper()
    scraper.scrape()
我认为问题出在下面这一部分:
g = Grab(url=url, cookiefile='cookies.html', post=payload)
g.go(url=url)
r = g.response.unicode_body()
谁能帮帮我?另外,我不想使用 Selenium,因为它需要启动一个真正的浏览器。