I am trying to parse more than 200 links, but BS4 just gets stuck. I have seen "Beautifulsoup findall get stuck without processing", but that is a different problem: mine hangs at random places.
It eventually fails with a TimeoutError:
import os
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    # Download the raw page; urlopen blocks until the server answers
    response = urllib.request.urlopen(url)
    return response.read()

def parse(html, url):
    soup = BeautifulSoup(html, "html.parser")
    # Rows whose title marks applicants admitted to the competition
    table = soup.find_all('tr', title="Допущено до конкурсу")
    if os.path.exists('base/%s.txt' % url[27:]):
        pass  # file already saved (but note: the page was downloaded anyway)
    else:
        abitbase = open('base/%s.txt' % url[27:], 'w')
        for unit in table:
            collection = unit.find_all('td')
            position = collection[0].text
            name = collection[1].text
            priority = collection[2].text
            score = collection[3].text
            abitbase.write('%s %s %s %s \n' % (position, name, priority, score))
        abitbase.close()

def main():
    global applicants
    url_list = open('clist.txt', 'r')
    for count in range(1, 241):
        url_s = url_list.readline()
        # Strip the trailing newline, if any
        if url_s[-1] == '\n':
            url = url_s[:-1]
        else:
            url = url_s
        parse(get_html(url), url)
        # Rough progress percentage across the link list
        print('base [%s] saved | %s%s' % (url[27:], round(count / 2.41, 2), '%'))

if __name__ == '__main__':
    applicants = {}
    main()
Answer 0 (score: 1)
BS4 works fine; this was my fault. I simply moved the os.path.exists check in front of parse(get_html(url), url), and now it works well, presumably because pages that are already saved are no longer downloaded at all. Sorry.
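For reference, here is a minimal sketch of the reordered loop the answer describes (it assumes the same get_html and parse helpers and the url[27:] slice from the question; the with block and enumerate are idiomatic additions, not the poster's exact code):

import os

def main():
    with open('clist.txt', 'r') as url_list:
        for count, url_s in enumerate(url_list, start=1):
            url = url_s.rstrip('\n')
            # Check for an existing file before any network I/O,
            # so already-saved pages are never downloaded again
            if os.path.exists('base/%s.txt' % url[27:]):
                continue
            parse(get_html(url), url)
            print('base [%s] saved | %s%%' % (url[27:], round(count / 2.41, 2)))

With this ordering, a rerun after a crash skips straight to the unfinished links. Passing a timeout to urllib.request.urlopen (e.g. urlopen(url, timeout=30)) would additionally turn a silent hang into a quick, catchable exception.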