我希望在CSV文件中写入所有作者的列表,其中包含CSV文件的URL,该文件将自己归类为Google学术搜索中的特定标记。例如,如果我们采用'security',我会想要这个输出:
author url
Howon Kim https://scholar.google.pl/citations?user=YUoJP-oAAAAJ&hl=pl
Adrian Perrig https://scholar.google.pl/citations?user=n-Oret4AAAAJ&hl=pl
... ...
我写了这段代码,打印每个作者的姓名
# -*- coding: utf-8 -*-
import urllib.request
import csv
from bs4 import BeautifulSoup
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
mydivs = soup.findAll("h3", { "class" : "gsc_1usr_name"})
outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)
for each in mydivs:
for anchor in each.find_all('a'):
print (anchor.text)
然而,这仅适用于第一页。相反,我想浏览每一页。我怎么能这样做?
答案 0 :(得分:2)
我不会为你编写代码..但我会给你一个大纲,告诉你如何。
查看页面底部。看下一个按钮?搜索包含div的id
gsc_authors_bottom_pag
应该很容易找到。我用selenium做这个,找到下一个按钮(右)并单击它。等待页面加载,刮擦重复。处理边缘情况(页面外等)。
如果网址中的after_author=*
位没有改变,你可以只增加url
开始......但除非您想尝试破解该代码(不太可能),然后只需点击下一步按钮
答案 1 :(得分:1)
此页面使用<button>
代替<a>
来链接到下一页/上一页。
下一页的按钮有aria-label="Następna"
。
下一页有两个按钮,但您可以使用其中任何一个。
按钮有JavaScript
代码可重定向到新页面
window.location=url_to_next_page
但它是简单的文字,所以你可以使用切片来获取网址
import urllib.request
from bs4 import BeautifulSoup
url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
while True:
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
# ... do something on page ...
# find buttons to next page
buttons = soup.findAll("button", {"aria-label": "Następna"})
# exit if no buttons
if not buttons:
break
on_click = buttons[0].get('onclick')
print('javascript:', on_click)
#add `domain` and remove `window.location='` and `'` at the end
url = 'http://scholar.google.pl' + on_click[17:-1]
# converting some codes to chars
url = url.encode('utf-8').decode('unicode_escape')
print('url:', url)
顺便说一句:如果你说波兰语,那么你可以访问Facebook:Python Poland或Python: pierwsze kroki
答案 2 :(得分:0)
由于 furas 已经回答了如何遍历所有页面,因此这是对他的回答的补充回答。下面的脚本抓取的内容比您提出的问题和抓取到 .csv
文件要多得多。
from bs4 import BeautifulSoup
import requests, lxml, os, csv
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
proxies = {
'http': os.getenv('HTTP_PROXY')
}
def get_profiles_to_csv():
html = requests.get('http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security', headers=headers, proxies=proxies).text
soup = BeautifulSoup(html, 'lxml')
# creating CSV File
with open('awesome_file.csv', mode='w') as csv_file:
# defining column names
fieldnames = ['Author', 'URL']
# defining .csv writer
# https://docs.python.org/3/library/csv.html#csv.DictWriter
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
# writing (creating) columns
writer.writeheader()
# collecting scraped data
author_data = []
# Selecting container where all data located
for result in soup.select('.gs_ai_chpr'):
name = result.select_one('.gs_ai_name a').text
link = result.select_one('.gs_ai_name a')['href']
# https://stackoverflow.com/a/6633693/15164646
# id = link
# id_identifer = 'user='
# before_keyword, keyword, after_keyword = id.partition(id_identifer)
# author_id = after_keyword
# affiliations = result.select_one('.gs_ai_aff').text
# email = result.select_one('.gs_ai_eml').text
# try:
# interests = result.select_one('.gs_ai_one_int').text
# except:
# interests = None
# "Cited by 107390" = getting text string -> splitting by a space -> ['Cited', 'by', '21180'] and taking [2] index which is the number.
# cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]
# because we have a csv.DictWriter() we converting to the required format
# dict() keys should be exactly the same as fieldnames, otherwise it will throw an error
author_data.append({
'Author': name,
'URL': f'https://scholar.google.com{link}',
})
# iterating over celebrity data list() that became dict() and writing it to the .csv
for data in author_data:
writer.writerow(data)
# print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')
# output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=pl&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=pl&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=pl&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=pl&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=pl&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=pl&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=pl&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=pl&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=pl&user=ty7wIXoAAAAJ
'''
或者,您可以使用来自 SerpApi 的 Google Scholar Profiles API 来做同样的事情。这是一个付费 API,可免费试用 5,000 次搜索。
要集成的代码:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import csv, os
def get_profiles_to_csv():
with open('awesome_serpapi_file_pagination.csv', mode='w') as csv_file:
fieldnames = ['Author', 'URL']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google_scholar_profiles",
"mauthors": "label:security"
}
search = GoogleSearch(params)
while True:
results = search.get_dict()
try:
for result in results['profiles']:
name = result['name']
link = result['link']
writer.writerow({'Author': name, 'URL': link})
except:
print('Done')
break
if (not 'pagination' in results) and (not 'next' in results['pagination']):
break
search.params_dict.update(dict(parse_qsl(urlsplit(results["pagination"]["next"]).query)))
get_profiles_to_csv()
# part of the output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=en&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=en&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=en&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=en&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=en&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=en&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=en&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=en&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=en&user=ty7wIXoAAAAJ
'''
<块引用>
免责声明,我为 SerpApi 工作。