Can someone help me figure out how to iterate to the next page? I've tried all the solutions on here, but I can't seem to get it working.
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'; suppress "false positive" warnings
import datetime as dt
import requests
from bs4 import BeautifulSoup
import time
import sys

def getPage(url):
    attempt = 1
    while True:
        response = requests.get(url)
        if response.status_code == requests.codes.ok:
            return response.content
        else:
            time.sleep(0.5)
            attempt += 1
            if attempt > 3:
                print("Data could not be requested for url:", url, "after", attempt - 1, "attempts")
                return None

if __name__ == '__main__':
    url = "https://www.opic.com/upphandlingar/"
    data_df = pd.DataFrame()  # all data from the websites is saved to this data frame

    # get data
    content = getPage(url)
    markup = content.decode('utf-8') if content is not None else None
    if markup is None:
        print("Nothing was found. Value of 'markup' is 'None'.")
        sys.exit()

    soup = BeautifulSoup(markup, 'lxml')
    containers = soup.find_all("a", {"class": "ListItem"})
    for container in containers:
        upplagtdatum = container.div.p.text.strip()
        titel = container.h3.text.strip()
        stad_kommun = container.span.text.strip()

        # Save data to data frame
        df = pd.DataFrame(data={'Upplagtdatum': [upplagtdatum], 'Titel': [titel], 'Stad Kommun': [stad_kommun]})
        data_df = pd.concat([data_df, df], sort=False)

    # Save data frame to csv-file
    filePathName = "data_" + dt.datetime.now().strftime('%Y-%m-%d') + ".csv"
    data_df.to_csv(filePathName, sep=';', index=False, encoding='utf-8')
    print(data_df)
Answer 0 (score: 1)
I made a few changes to your code. Pagination can be implemented with this pattern.
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'; suppress "false positive" warnings
import datetime as dt
import math
import requests
from bs4 import BeautifulSoup
import time
import sys

def getPage(url):
    attempt = 1
    while True:
        response = requests.get(url)
        if response.status_code == requests.codes.ok:
            return response.content
        else:
            time.sleep(0.5)
            attempt += 1
            if attempt > 3:
                print("Data could not be requested for url:", url, "after", attempt - 1, "attempts")
                return None

def getData(markup):
    # parse one results page and return its rows as a data frame
    data_df = pd.DataFrame()
    soup = BeautifulSoup(markup, 'lxml')
    containers = soup.find_all("a", {"class": "ListItem"})
    for container in containers:
        upplagtdatum = container.div.p.text.strip()
        titel = container.h3.text.strip()
        stad_kommun = container.span.text.strip()

        # Save data to data frame
        df = pd.DataFrame(data={'Upplagtdatum': [upplagtdatum], 'Titel': [titel], 'Stad Kommun': [stad_kommun]})
        data_df = pd.concat([data_df, df], sort=False)
    return data_df

if __name__ == '__main__':
    results = 2871
    per_page = 20
    url = "https://www.opic.com/upphandlingar/?p={}"
    no_of_pages = math.ceil(results / per_page)  # round up so the last partial page is included

    data_df = pd.DataFrame()  # rows from all pages are collected here
    for page_no in range(1, no_of_pages + 1):
        content = getPage(url.format(page_no))
        markup = content.decode('utf-8') if content is not None else None
        if markup is None:
            print("Nothing was found. Value of 'markup' is 'None'.")
            sys.exit()
        data_df = pd.concat([data_df, getData(markup)], sort=False)

    # Save data frame to csv-file once, after all pages are collected,
    # so earlier pages are not overwritten
    filePathName = "data_" + dt.datetime.now().strftime('%Y-%m-%d') + ".csv"
    data_df.to_csv(filePathName, sep=';', index=False, encoding='utf-8')
    print(data_df)
Explanation
Take a look at the code and update it if needed.
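If you'd prefer not to hard-code the total result count (2871), another option is to keep requesting pages until one comes back empty. Here's a minimal sketch reusing getPage and getData from above; the stop condition (pages past the last one containing no ListItem anchors) and the max_pages safety cap are assumptions about how the site behaves, not something confirmed by the question:

def iterPages(base_url, max_pages=500):
    # yield the markup of each results page until an empty page is reached
    # (assumption: pages past the last one render with no "ListItem" anchors)
    for page_no in range(1, max_pages + 1):
        content = getPage(base_url.format(page_no))
        if content is None:
            break
        markup = content.decode('utf-8')
        soup = BeautifulSoup(markup, 'lxml')
        if soup.find("a", {"class": "ListItem"}) is None:
            break  # no more results; stop paginating
        yield markup

# usage, with url = "https://www.opic.com/upphandlingar/?p={}":
# data_df = pd.concat([getData(m) for m in iterPages(url)], sort=False)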
Answer 1 (score: 0)
Looking at the website and the code you provided, I assume you want to extract the href attribute from each ListItem (container) you collect. You can get the href simply like this (assuming you have BeautifulSoup4):
for container in containers:
    upplagtdatum = container.div.p.text.strip()
    titel = container.h3.text.strip()
    stad_kommun = container.span.text.strip()
    href = container.get('href')
You can then use that href right away, or save it in the DataFrame and iterate over the links later.
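If you do follow the links, keep in mind that the href values may be relative, so join them with the site root before requesting them. A minimal sketch of fetching each linked detail page, reusing getPage and containers from the code above; the base URL and the parsing step are assumptions, since the detail-page markup is not shown in the question:

from urllib.parse import urljoin

base = "https://www.opic.com"  # assumed site root for resolving relative hrefs
for container in containers:
    href = container.get('href')
    if not href:
        continue
    detail_url = urljoin(base, href)  # handles relative and absolute hrefs alike
    content = getPage(detail_url)
    if content is None:
        continue
    detail_soup = BeautifulSoup(content.decode('utf-8'), 'lxml')
    # extract the fields you need from detail_soup here; the detail-page
    # structure is not shown in the question, so this part is a placeholder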