我正在尝试在列表的任何索引中找到关键字并获取该索引。我使用BeautifulSoup4创建了一个小型网页抓取工具,以抓取同人小说的数据。
由于并非所有同人小说都列出了流派或人物,甚至更新日期(如果是新发布的,则没有更新日期),因此各项信息所在的索引并不固定。
因此,我需要搜索某个关键字(例如 'Words:'),找到包含它的整个字符串所在的索引,例如 'Words: 1,854' == list[3],并将其保存为变量(如 words = list[3]),以便稍后写入 Excel 文件中的正确列。这是我当前的抓取工具,目前仅设置为抓取一页;只需减小“u”的初始值即可抓取更多页面。
import requests
from bs4 import BeautifulSoup
# import time
# from random import randint
# import xlsxwriter
# import urllib3
# from tinydb import TinyDB, Query
total = 0
u = 1127
# Walk the listing pages u+1 .. 2000 and print the parsed stats of every
# entry found on each page.
while u < 2000:
    u += 1
    url = 'https://www.fanfiction.net/Naruto-Crossovers/1402/0/?&srt=1&lan=1&r=10&p=' + str(u)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    raw = soup.find_all('div', class_='z-indent z-padtop')
    for stats in raw:
        info = stats.div
        if info is None:
            # No nested <div> in this entry: nothing to parse, and
            # ``info.text`` would raise AttributeError.
            continue
        text = info.text
        # The stats line is a single string whose fields are joined by ' - '.
        formatted = text.split(' - ')
        # The first field is skipped, exactly as in the original slice.
        print(formatted[1:])
答案 0 :(得分:0)
解决方案可能是这样的(参见函数 find_keyword):
import requests
from bs4 import BeautifulSoup
# import time
# from random import randint
# import xlsxwriter
# import urllib3
# from tinydb import TinyDB, Query
total = 0
u = 1127
results = []
# Walk the listing pages u+1 .. 1130 and collect the split stats line of
# every entry into ``results`` (one list of field strings per entry).
while u < 1130:  # decreased u due to testing time
    u += 1
    url = 'https://www.fanfiction.net/Naruto-Crossovers/1402/0/?&srt=1&lan=1&r=10&p=' + str(u)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    raw = soup.find_all('div', class_='z-indent z-padtop')
    for stats in raw:
        info = stats.div
        if info is None:
            # No nested <div> in this entry: nothing to parse, and
            # ``info.text`` would raise AttributeError.
            continue
        text = info.text
        # The stats line is a single string whose fields are joined by ' - '.
        formatted = text.split(' - ')
        if formatted:
            results.append(formatted)
# Printed once, after all pages have been fetched.
print(results)
# function to search for a keyword
def find_keyword(list, keyword):
    """Return, for every entry, the field string that contains *keyword*.

    Args:
        list: Iterable of entries, each an iterable of field strings.
            The name shadows the builtin; it is kept so that existing
            callers passing it by keyword keep working.
        keyword: Substring to look for in each field.

    Returns:
        A list with one item per entry: the last field containing
        *keyword*, or ``''`` when no field of that entry contains it.
    """
    results = []
    for element in list:
        value = ''
        for tag in element:
            if keyword in tag:
                # No break: a later matching field overrides an earlier one.
                value = tag
        results.append(value)
    return results
# Example usage: search every collected entry in ``results`` for the field
# containing 'Words' and print the resulting list (one item per entry).
words_list = find_keyword(results, 'Words')
print(words_list)
答案 1 :(得分:0)
This is the code I came up with; it works wonderfully. The find function was essential.
# For later use, searches for keywords and adds them to the specified list
def assign_stats(keyword, stat_list, fields=None, default=''):
    """Append the field containing *keyword* to *stat_list*.

    Exactly one item is appended per call, so lists filled by repeated
    calls stay aligned entry by entry. The previous fixed 13-step scan
    with wrap-around could append the same match several times for short
    field lists, skipped fields past the 13th, and raised ``IndexError``
    on an empty field list.

    Args:
        keyword: Substring to look for in each field.
        stat_list: List that receives the result (mutated in place).
        fields: Field strings to search. When ``None``, the module-level
            ``formatted`` list is used, as before.
        default: Value appended when no field contains *keyword*.
    """
    if fields is None:
        fields = formatted
    # First field that contains the keyword, or the default if none does.
    match = next((field for field in fields if keyword in field), default)
    stat_list.append(match)
# For later use, searches for keywords and adds them to the specified list
def assign_stats_status(keyword, stat_list, fields=None):
    """Append the field containing *keyword*, or ``'In-Progress'`` if absent.

    Exactly one item is appended per call, so lists filled by repeated
    calls stay aligned entry by entry. The previous fixed 13-step scan
    with wrap-around could append the same match several times for short
    field lists, skipped fields past the 13th, and raised ``IndexError``
    on an empty field list.

    Args:
        keyword: Substring to look for in each field.
        stat_list: List that receives the result (mutated in place).
        fields: Field strings to search. When ``None``, the module-level
            ``formatted`` list is used, as before.
    """
    if fields is None:
        fields = formatted
    # First field that contains the keyword; otherwise the fallback status.
    match = next((field for field in fields if keyword in field), 'In-Progress')
    stat_list.append(match)
# For later use, searches for specified indexes of story data lists and adds them to specified list
def assign_stats_concrete(index, stat_list, fields=None):
    """Append the field at position *index* to *stat_list*.

    Args:
        index: Position of the wanted field.
        stat_list: List that receives the field (mutated in place).
        fields: Field strings to read from. When ``None``, the
            module-level ``formatted`` list is used, as before.

    Raises:
        IndexError: If *index* is out of range for the field list.
    """
    if fields is None:
        fields = formatted
    stat_list.append(fields[index])
# Fill the per-stat lists from the current ``formatted`` field list: the
# keyword helpers look the field up by substring, the "concrete" helper
# takes it from a fixed index (2 for the rating, 1 for the crossover).
# NOTE(review): the target lists (words, rating, ...) are not defined in
# this snippet; presumably they are created before this point, and this
# section runs once per scraped entry. Confirm against the full script.
assign_stats('Words', words)
assign_stats_concrete(2, rating)
assign_stats('English', language)
assign_stats('Chapters', chapters)
assign_stats('Reviews', reviews)
assign_stats('Favs', favorites)
assign_stats('Follows', follows)
assign_stats('Updated', updated)
assign_stats_status('Complete', status)
assign_stats('Published', published)
assign_stats_concrete(1, crossover)