import requests
import re
from bs4 import BeautifulSoup
#The website I like to get, converts the contents of the web page to lxml format
base_url = "https://festivalfans.nl/event/dominator-festival"
url = requests.get(base_url)
soup = BeautifulSoup(url.content, "lxml")
#Modifies the given string to look visually good. Like this:
#['21 / JulZaterdag2018'] becomes 21 Jul 2018
def remove_char(string):
#All blacklisted characters and words
blacklist = ["/", "[", "]", "'", "Maandag", "Dinsdag", "Woensdag",
"Donderdag", "Vrijdag", "Zaterdag", "Zondag"]
#Replace every blacklisted character with white space
for char in blacklist:
string = string.replace(char,' ')
#Replace more than 2 consecutive white spaces
string = re.sub("\s\s+", " ", string)
#Gets the date of the festival I'm interested in
def get_date_info():
#Makes a list for the data
raw_info = []
#Adds every "div" with a certain name to list, and converts it to text
for link in soup.find_all("div", {"class": "event-single-data"}):
raw_info.append(link.text)
#Converts list into string, because remove_char() only accepts strings
raw_info = str(raw_info)
#Modifies the string as explained above
final_date = remove_char(raw_info)
#Prints the date in this format: 21 Jul 2018(example)
print(final_date)
get_date_info()
你好!因此,我目前正在从事一个小型网络抓取项目。我以为我有个好主意,我想对Python有更多的经验。它的基本作用是获取节日信息,如日期,时间和价格,并将其放入一个小的文本文件中。我正在使用BeautifulSoup导航和编辑网页。链接在那儿!
但是现在我有点遇到问题了。我不知道怎么了。也许我完全在看它。因此,当我运行此程序时,应该给我:2018年7月21日。但是,它返回“ None”。由于某种原因,字符串中的每个字符都会被删除。
我尝试自己运行remove_char(),并使用相同的列表(首先将其转换为字符串)作为输入。这工作得很好。它按原计划返回了“ 2018年7月21日”。因此,我很确定该函数中没有错误。
所以我不知所措。也许与BeautifulSoup及其处理方式有关?
希望有人可以帮助我!
BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
答案 0 :(得分:2)
您忘记在remove_char()
函数中返回值。
就是这样!
答案 1 :(得分:0)
这两个函数都没有return
语句,因此默认情况下返回None
。 remove_char()
应该以{{1}}结尾。
答案 2 :(得分:0)
import requests
from bs4 import BeautifulSoup
base_url = "https://festivalfans.nl/event/dominator-festival"
url = requests.get(base_url)
soup = BeautifulSoup(url.content , "html.parser")
def get_date_info():
for link in soup.find_all("div", {"class": "event-single-data"}):
day = link.find('div', {"class":"event-single-day"}).text.replace(" ", '')
month = link.find('div', {"class": "event-single-month"}).text.replace('/', "").replace(' ', '')
year = link.find('div', {"class": "event-single-year"}).text.replace(" ", '')
print(day, month, year)
get_date_info()
这是一个更简单的代码,无需重新输入