I am very new to Python and BeautifulSoup, and I am trying to use them to scrape multiple URLs in one run using a loop. For each website, the loop should locate the banner slides on the home page, count how many banners that website has, and write that count into an Excel file next to the corresponding URL. I have a list of URLs saved in a CSV file; basically, I want to take each of those URLs, run the loop to get the number of banners, and put that number next to the URL in a separate column in Excel.
This is the code I have so far; all it does is write the URLs back into a CSV file and give me the number of banners for only the last URL.
from bs4 import BeautifulSoup
import requests
# Count the banner slides on each site listed in urls.csv and write one
# "url,count" row per site to DDC_number_of_banners.csv.
import csv  # imported here so this snippet stays self-contained

# Input layout: one record per line, fields separated by ';', URL in field 0.
separator = ';'
url_row_index = 0
filename = "DDC_number_of_banners.csv"
headers = "url, Number_of_Banners\n"

# A single `with` owns both files, so the output is flushed and closed even
# if a request raises part-way through the list.
with open("urls.csv", "r") as f_urls, open(filename, "w") as f_output:
    f_output.write(headers)
    # lineterminator="\n" keeps the rows on the same line ending as the
    # header written above; the writer also quotes URLs containing commas.
    rows = csv.writer(f_output, lineterminator="\n")
    for each in f_urls:
        each = each.strip()
        # Skip blank lines instead of relying on list.remove(''), which
        # raises ValueError when there is none and removes only the first.
        if not each:
            continue
        url = each.split(separator)[url_row_index]
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # A list for 'class' matches divs carrying any of the listed values.
        # find_all is the current name of the legacy findAll alias.
        banner_info = soup.find_all(
            'div',
            {'class': ['slide', 'slide has-link',
                       'html-slide slide has-link']},
        )
        Number_of_banners = len(banner_info)
        # Write this site's own URL; the original wrote the entire input
        # file contents here, plus a trailing comma that added a third,
        # header-less column.
        rows.writerow([url, Number_of_banners])
答案 0 :(得分:0)
利用Python的CSV库可以使这更简单:
from bs4 import BeautifulSoup
import requests
import csv
# Read one URL per line from urls.csv and write a "url, banner count" row
# for each site to DDC_number_of_banners.csv.
# newline="" on the output file is what the csv module expects, so the
# writer controls the line endings itself.
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['url', 'Number_of_banners'])

    for line in f_urls:
        url = line.strip()
        # A blank line would otherwise be passed to requests.get('') and
        # abort the whole run on an invalid URL.
        if not url:
            continue

        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # A list for 'class' matches divs carrying any of the listed values.
        # find_all is the current name of the legacy findAll alias.
        banner_info = soup.find_all(
            'div',
            {'class': ['slide', 'slide has-link', 'html-slide slide has-link']},
        )
        csv_output.writerow([url, len(banner_info)])
包含每个横幅的 `data-label`:
from bs4 import BeautifulSoup
import requests
import csv
# Read one URL per line from urls.csv and write, for each site, its URL,
# its banner count, and the data-label attribute of every banner found.
# newline="" on the output file is what the csv module expects, so the
# writer controls the line endings itself.
with open("urls.csv", "r") as f_urls, open("DDC_number_of_banners.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['url', 'Number_of_banners', 'data_labels'])

    for line in f_urls:
        url = line.strip()
        # A blank line would otherwise be passed to requests.get('') and
        # abort the whole run on an invalid URL.
        if not url:
            continue

        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # A list for 'class' matches divs carrying any of the listed values.
        # find_all is the current name of the legacy findAll alias.
        banner_info = soup.find_all(
            'div',
            {'class': ['slide', 'slide has-link', 'html-slide slide has-link']},
        )
        # .get() yields None for a banner without the attribute; csv.writer
        # renders None as an empty cell.
        data_labels = [banner.get('data-label') for banner in banner_info]
        # Labels are appended as extra cells, so row width varies with the
        # number of banners on the site.
        csv_output.writerow([url, len(banner_info)] + data_labels)