下面的脚本可以很好地从维基百科页面获取类别名称。如何在10分钟后或在获得100个类别后停止?
下面的代码需要加上上述限制：
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape category-member titles from a Wikipedia category page.

Collects titles from the #mw-pages section and appends them to a CSV,
stopping after MAX_CATEGORIES titles or MAX_RUNTIME_SECONDS of wall-clock
time, whichever comes first.
"""
from bs4 import BeautifulSoup
import requests
import csv
import time

# Stop conditions: at most 100 categories OR 10 minutes of runtime.
MAX_CATEGORIES = 100
MAX_RUNTIME_SECONDS = 10 * 60
# Courtesy delay between items (same 15 s pause as before, hoisted out of the loop).
WAIT_TIME = 15

# Getting all the contents of the url.
url = 'https://en.wikipedia.org/wiki/Category:Free software'
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')

# Showing the category-pages summary.
catPageSummaryTag = soup.find(id='mw-pages')
catPageSummary = catPageSummaryTag.find('p')
print(catPageSummary.text)

# Getting the category pages: the <li> items under the #mw-pages section.
catpages = soup.find(id='mw-pages')
whatlinksherelist = catpages.find_all('li')

things_to_write = []
# monotonic() is immune to system-clock changes, so the deadline is reliable.
deadline = time.monotonic() + MAX_RUNTIME_SECONDS
for titles in whatlinksherelist:
    # Stop once either limit is reached.
    if len(things_to_write) >= MAX_CATEGORIES or time.monotonic() >= deadline:
        break
    things_to_write.append(titles.find('a').get('title'))
    print(titles.text)
    time.sleep(WAIT_TIME)

# Writing the category pages to the output file (one title per line,
# since the csv delimiter is a newline).
with open('001-catPages.csv', 'a') as csvfile:
    writer = csv.writer(csvfile, delimiter="\n")
    writer.writerow(things_to_write)
答案 0（得分：1）
在您的程序中添加此内容：
# Showing the category-pages only.
catPageSummaryTag = soup.find(id='mw-pages')
tag = soup.find(id='mw-pages')
links = tag.findAll('a')

# Getting the category pages.
catpages = soup.find(id='mw-pages')
whatlinksherelist = catpages.find_all('li')

# Stop conditions: at most 100 categories OR 10 minutes of runtime.
MAX_CATEGORIES = 100
MAX_RUNTIME_SECONDS = 10 * 60
# monotonic() is immune to system-clock adjustments, so the deadline is reliable.
deadline = time.monotonic() + MAX_RUNTIME_SECONDS

things_to_write = []
count = 0  # number of titles collected so far, capped at MAX_CATEGORIES
for titles in whatlinksherelist:
    # NOTE: the original `if count <= 100` collected 101 items (off-by-one)
    # and never broke out, so the loop still walked every <li>; it also had
    # no time limit. `break` fixes both and enforces the 10-minute cutoff.
    if count >= MAX_CATEGORIES or time.monotonic() >= deadline:
        break
    things_to_write.append(titles.find('a').get('title'))
    count += 1
    print(titles.text)

# Writing the category pages to the output file (one title per line,
# since the csv delimiter is a newline).
with open('001-catPages.csv', 'a') as csvfile:
    writer = csv.writer(csvfile, delimiter="\n")
    writer.writerow(things_to_write)