The code below writes 20 rows of data to a CSV file. It gets the data from the first page of the table at this web address: https://www.finviz.com/screener.ashx?v=161
I am looking for a way to change the code so that it automatically keeps extracting the remaining pages of the table, shown below, in order to add the next 40 rows.
https://www.finviz.com/screener.ashx?v=161&r=21
https://www.finviz.com/screener.ashx?v=161&r=42
Does anyone know how to achieve this? Thanks.
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.finviz.com/screener.ashx?v=161'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")

# the screener results live in the div with id "screener-content";
# the rows alternate between "light" and "dark" CSS classes
main_div = soup.find('div', attrs={'id': 'screener-content'})
light_rows = main_div.find_all('tr', class_="table-light-row-cp")
dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")

data = []
for rows_set in (light_rows, dark_rows):
    for row in rows_set:
        row_data = []
        for cell in row.find_all('td'):
            val = cell.a.get_text()
            row_data.append(val)
        data.append(row_data)

# sort rows to maintain original order
data.sort(key=lambda x: int(x[0]))

import pandas
pandas.DataFrame(data).to_csv("AAA.csv", header=False)
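To make the goal concrete, the pages to be fetched in turn are just the base URL plus an increasing &r offset (illustration only, offsets taken from the addresses above):

# Illustration only: the page URLs named in the question, fetched in turn
base_url = 'https://www.finviz.com/screener.ashx?v=161'
for offset in (0, 21, 42):
    url = base_url if offset == 0 else '{}&r={}'.format(base_url, offset)
    print(url)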
Answer 0 (score: 0)
First, refactor the scraping into a function that can fetch the data for any one of those page URLs:
import requests
from bs4 import BeautifulSoup

def get_rows(r=0):
    base_url = 'https://www.finviz.com/screener.ashx?v=161'
    if r > 0:
        base_url += "&r=" + str(r)
    html = requests.get(base_url)
    soup = BeautifulSoup(html.content, "html.parser")
    main_div = soup.find('div', attrs={'id': 'screener-content'})
    light_rows = main_div.find_all('tr', class_="table-light-row-cp")
    dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
    return light_rows, dark_rows
Then loop over some values of r:
data = []
for r in range(0, 43, 21):
    print("getting r={0}".format(r))
    light_rows, dark_rows = get_rows(r)
    for rows_set in (light_rows, dark_rows):
        for row in rows_set:
            row_data = [cell.a.get_text() for cell in row.find_all('td')]
            data.append(row_data)

# sort rows to maintain original order
data.sort(key=lambda x: int(x[0]))
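The combined rows can then be written to a CSV in the same way as in the question's original code:

import pandas
pandas.DataFrame(data).to_csv("AAA.csv", header=False)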
Answer 1 (score: 0)
To scrape all the pages, observe that the trailing parameter in the URL increases by 2 rather than by 1 from one page to the next. The code below therefore finds the highest page number in the listing and uses it as the range; each page index is doubled and a trailing 1 appended to build the &r offset, giving r=21, r=41, r=61, and so on:
import requests, re, contextlib
from bs4 import BeautifulSoup as soup
import csv

@contextlib.contextmanager
def scrape_table(url):
    d = soup(requests.get(url).text, 'html.parser')
    # column headers and table cells, matched by their CSS classes
    headers = [i.text for i in d.find_all('td', {'class': re.compile('table-top')})]
    full_table = [i.text for i in d.find_all('td', {'class': re.compile('screener-body-table-nw')})]
    # regroup the flat list of cells into one row per ticker
    grouped_table = [full_table[i:i + len(headers)] for i in range(0, len(full_table), len(headers))]
    yield [dict(zip(headers, i)) for i in grouped_table]

start = 'https://www.finviz.com/screener.ashx?v=161'
# highest page number shown in the pager links
max_link = int(max(soup(requests.get(start).text, 'html.parser').find_all('a', {'class': 'screener-pages'}),
                   key=lambda x: int(x.text)).text)
headers = [i.text for i in soup(requests.get(start).text, 'html.parser').find_all('td', {'class': re.compile('table-top')})]

with open('filename.csv', 'a') as f:
    write = csv.writer(f)
    write.writerow(headers)
    # first page
    with scrape_table(start) as r1:
        write.writerows(list(filter(None, [[i[b] for b in headers] for i in r1])))
    # remaining pages: r=21, 41, 61, ...
    for i in range(1, max_link):
        url = 'https://www.finviz.com/screener.ashx?v=161&r={}1'.format(i * 2)
        with scrape_table(url) as result:
            _r = list(filter(None, [[i[b] for b in headers] for i in result]))
            if _r:
                write.writerows(_r)
Result (first page):
{'Profit M': '4.60%', 'Ticker': 'ABAC', 'Price': '2.17', 'ROI': '1.10%', 'Quick R': '17.40', 'Market Cap': '17.60M', 'Curr R': '19.00', 'Gross M': '16.30%', 'ROA': '1.40%', 'Dividend': '-', 'Earnings': 'Apr 02/a', 'LTDebt/Eq': '0.00', 'No.': '21', 'Volume': '1,744', 'ROE': '1.40%', 'Debt/Eq': '0.02', 'Change': '-1.36%', 'Oper M': '4.20%'}, {'Profit M': '11.10%', 'Ticker': 'ABAX', 'Price': '82.74', 'ROI': '8.90%', 'Quick R': '5.00', 'Market Cap': '1.88B', 'Curr R': '6.00', 'Gross M': '54.60%', 'ROA': '8.30%', 'Dividend': '0.87%', 'Earnings': 'Apr 26/a', 'LTDebt/Eq': '0.00', 'No.': '22', 'Volume': '253,661', 'ROE': '9.70%', 'Debt/Eq': '0.00', 'Change': '-0.07%', 'Oper M': '15.80%'}, {'Profit M': '5.90%', 'Ticker': 'ABB', 'Price': '23.23', 'ROI': '11.50%', 'Quick R': '0.90', 'Market Cap': '48.87B', 'Curr R': '1.20', 'Gross M': '30.40%', 'ROA': '4.90%', 'Dividend': '3.57%', 'Earnings': 'Apr 19/b', 'LTDebt/Eq': '0.39', 'No.': '23', 'Volume': '1,371,355', 'ROE': '14.80%', 'Debt/Eq': '0.58', 'Change': '2.15%', 'Oper M': '9.40%'}, {'Profit M': '21.50%', 'Ticker': 'ABBV', 'Price': '98.05', 'ROI': '25.40%', 'Quick R': '1.10', 'Market Cap': '157.01B', 'Curr R': '1.20', 'Gross M': '75.80%', 'ROA': '9.20%', 'Dividend': '3.92%', 'Earnings': 'Apr 26/b', 'LTDebt/Eq': '8.70', 'No.': '24', 'Volume': '14,832,723', 'ROE': '119.30%', 'Debt/Eq': '10.49', 'Change': '-0.90%', 'Oper M': '34.00%'}, {'Profit M': '0.50%', 'Ticker': 'ABC', 'Price': '83.34', 'ROI': '8.70%', 'Quick R': '0.50', 'Market Cap': '18.05B', 'Curr R': '0.90', 'Gross M': '2.90%', 'ROA': '2.40%', 'Dividend': '1.82%', 'Earnings': 'May 02/b', 'LTDebt/Eq': '1.46', 'No.': '25', 'Volume': '1,020,497', 'ROE': '32.00%', 'Debt/Eq': '1.53', 'Change': '1.46%', 'Oper M': '0.60%'}
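For reference, a small illustration (assuming, say, five pages were found in the pager) of the URLs the loop above ends up requesting:

# Illustration only: the URLs requested by the loop above if max_link were 5
start = 'https://www.finviz.com/screener.ashx?v=161'
max_link = 5
urls = [start] + ['https://www.finviz.com/screener.ashx?v=161&r={}1'.format(i * 2)
                  for i in range(1, max_link)]
for u in urls:
    print(u)
# https://www.finviz.com/screener.ashx?v=161
# https://www.finviz.com/screener.ashx?v=161&r=21
# https://www.finviz.com/screener.ashx?v=161&r=41
# https://www.finviz.com/screener.ashx?v=161&r=61
# https://www.finviz.com/screener.ashx?v=161&r=81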
Answer 2 (score: -2)
Here is the code I think you gave me, though I may have made a mistake.
import requests
from bs4 import BeautifulSoup

def get_rows(r=0):
    base_url = 'https://www.finviz.com/screener.ashx?v=161'
    if r > 0:
        base_url += "&r=" + str(r)
    html = requests.get(base_url)
    soup = BeautifulSoup(html.content, "html.parser")
    main_div = soup.find('div', attrs={'id': 'screener-content'})
    light_rows = main_div.find_all('tr', class_="table-light-row-cp")
    dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
    return light_rows, dark_rows

data = []
for r in range(0, 43, 21):
    print("getting r={0}".format(r))
    light_rows, dark_rows = get_rows(r)
    for rows_set in (light_rows, dark_rows):
        pass
        for row in rows_set:
            row_data = []
            for cell in row.find_all('td'):
                val = cell.a.get_text()
                row_data.append(val)
            data.append(row_data)

# sort rows to maintain original order
data.sort(key=lambda x: int(x[0]))

import pandas
pandas.DataFrame(data).to_csv("AAA.csv", header=False)
These are the error messages I received.
>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> def get_rows(r=0):
...     base_url = 'https://www.finviz.com/screener.ashx?v=161'
...     if r > 0:
...         base_url+="&r=" + str(r)
...     html = requests.get(base_url)
...     soup = BeautifulSoup(html.content, "html.parser")
...     main_div = soup.find('div', attrs = {'id':'screener-content'})
...
>>>     light_rows = main_div.find_all('tr', class_="table-light-row-cp")
  File "<stdin>", line 1
    light_rows = main_div.find_all('tr', class_="table-light-row-cp")
    ^
IndentationError: unexpected indent
>>>     dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
  File "<stdin>", line 1
    dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
    ^
IndentationError: unexpected indent
>>>     return light_rows, dark_rows
  File "<stdin>", line 1
    return light_rows, dark_rows
    ^
IndentationError: unexpected indent
>>> data = []
>>> for r in range(0, 43, 21):
...     print("getting r={0}".format(r))
...     light_rows, dark_rows=get_rows(r)
...     for rows_set in (light_rows, dark_rows):
...         pass
...         for row in rows_set:
...             row_data = []
...             for cell in row.find_all('td'):
...                 val = cell.a.get_text()
...                 row_data.append(val)
...             data.append(row_data)
...
getting r=0
Traceback (most recent call last):
  File "<stdin>", line 3, in <module>
TypeError: 'NoneType' object is not iterable
>>> # sort rows to maintain original order
... data.sort(key=lambda x: int(x[0]))
>>>
>>> import pandas
>>> pandas.DataFrame(data).to_csv("AAA.csv", header=False)
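A note on these errors, for reference: they are consistent with the code having been pasted into the interactive interpreter, where the blank line inside the function body ended the def early, before its return statement. get_rows() then implicitly returns None, so unpacking it on line 3 of the loop raises the TypeError above. A minimal check (assuming get_rows is defined as in the answers above), plus the usual fix of running the code as a script rather than pasting it at the >>> prompt:

# Quick check in the interpreter: if this prints None, the pasted
# definition of get_rows was cut short before its return statement.
print(get_rows(0))

# The usual fix is to save the complete script (imports, the full
# get_rows definition, and the loop) to a file, e.g. scrape_finviz.py
# (the file name is just an example), and run it with:
#     python scrape_finviz.py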