Beautifulsoup卡住了

时间:2016-07-20 19:17:01

标签: python-3.x parsing beautifulsoup html-parsing

我正在尝试解析 200 多个链接，但 BS4 在处理过程中会卡住。我看过“Beautifulsoup findall get stuck without processing”这个问题，但那和我的情况不同。我的程序会卡在随机的位置。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Delimiter set handed to strtok(): name parts are separated by spaces. */
const char space[2] = " ";

/*
 * Read one line from stdin, split it into up to three space-separated
 * parts and print an abbreviated form:
 *
 *   three parts "A B C"  ->  "C B. A."   (third part, initials of 2nd and 1st)
 *   two parts   "A B"    ->  "B A."
 *   one part    "A"      ->  "A"
 *
 * Returns 0 on success, EXIT_FAILURE when no name part could be read.
 */
int main(){
    char input[50], firstName[20], middleName[20], lastName[20], *token;

    /* Width-limited so an over-long line cannot overflow input[]; the
     * return value is checked so an empty line never leaves input[]
     * uninitialised. */
    if (scanf("%49[^\n]", input) != 1){
        return EXIT_FAILURE;
    }

    /* Every strtok() result is checked BEFORE it is copied: passing NULL
     * to a string function is undefined behaviour. */
    token = strtok(input, space);
    if (token == NULL){
        return EXIT_FAILURE;
    }
    /* snprintf truncates instead of overflowing the 20-byte buffers. */
    snprintf(firstName, sizeof firstName, "%s", token);

    token = strtok(NULL, space);
    if (token == NULL){
        /* Only a single part was entered: nothing to abbreviate. */
        printf("%s\n", firstName);
        return 0;
    }
    snprintf(middleName, sizeof middleName, "%s", token);

    token = strtok(NULL, space);
    if (token != NULL){
        snprintf(lastName, sizeof lastName, "%s", token);
        printf("%s %c. %c.\n", lastName, middleName[0], firstName[0]);
    }
    else {
        /* Two parts only: the second part is printed in full. */
        printf("%s %c.\n", middleName, firstName[0]);
    }
    return 0;
}

TimeoutError:

import os
import urllib.request
from bs4 import BeautifulSoup
def get_html(url, timeout=None):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        timeout: Optional timeout in seconds for blocking network
            operations. ``None`` (the default) passes no timeout to
            ``urlopen``, which is what the function did before this
            parameter existed.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    extra = {} if timeout is None else {'timeout': timeout}
    # The context manager closes the connection even when read() raises,
    # so calling this in a loop over many URLs does not leak sockets.
    with urllib.request.urlopen(url, **extra) as response:
        return response.read()

def parse(html, url):
    """Extract the matching table rows from *html* and save them to a file.

    The output path is ``base/<suffix>.txt`` where ``<suffix>`` is *url*
    with its first 27 characters removed. If that file already exists the
    function returns immediately without parsing anything.

    Otherwise every ``<tr>`` whose ``title`` is "Допущено до конкурсу" is
    collected, and the text of its first four ``<td>`` cells is written as
    one space-separated line.

    Args:
        html: Markup of the page (whatever ``BeautifulSoup`` accepts).
        url: Address the page came from; used only to derive the file name.

    Raises:
        IndexError: If a matching row has fewer than four ``<td>`` cells.
            No file is created in that case.
    """
    path = 'base/%s.txt' % url[27:]
    # Check the cache first: building the soup is the expensive step and is
    # pointless when the result is already on disk.
    if os.path.exists(path):
        return

    soup = BeautifulSoup(html, "html.parser")
    lines = []
    for unit in soup.find_all('tr', title="Допущено до конкурсу"):
        cells = unit.find_all('td')
        position = cells[0].text
        name = cells[1].text
        priority = cells[2].text
        score = cells[3].text
        lines.append('%s %s %s %s \n' % (position, name, priority, score))

    # All rows are formatted before the file is opened, so a malformed row
    # cannot leave a half-written file that later runs would mistake for a
    # finished one. `with` guarantees the handle is closed.
    # NOTE(review): the file uses the platform default encoding, as before;
    # consider an explicit encoding if the files are shared across systems.
    with open(path, 'w') as abitbase:
        abitbase.writelines(lines)

def main():
    """Download and save every page listed in ``clist.txt``.

    ``clist.txt`` is read from the current directory, one URL per line;
    empty lines are ignored. For each URL the page is fetched with
    ``get_html`` and stored with ``parse`` unless ``base/<suffix>.txt``
    (``<suffix>`` being the URL without its first 27 characters) already
    exists, in which case the download is skipped entirely. A progress line
    is printed after every URL.
    """
    # Read the whole list up front: the file is closed promptly and the
    # real number of URLs is known for the progress percentage, instead of
    # assuming the file has exactly 240 lines.
    with open('clist.txt', 'r') as url_list:
        urls = [line.rstrip('\n') for line in url_list]
    urls = [url for url in urls if url]

    total = len(urls)
    for count, url in enumerate(urls, 1):
        name = url[27:]
        # Skip the network request as well as the parsing when the result
        # is already on disk; re-downloading cached pages is what made
        # repeated runs appear to hang.
        if not os.path.exists('base/%s.txt' % name):
            parse(get_html(url), url)
        print('base [%s] saved | %s%s' % (name, round(count * 100 / total, 2), '%'))

# Script entry point: the body runs only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    # Module-level dictionary, created empty before main() starts.
    applicants = dict()
    main()

1 个答案:

答案 0 :(得分:1)

BS4工作正常,这是我的错。

我只是把 os.path.exists 的检查移到了 parse(get_html(url), url) 之前，现在运行正常。

对不起。