如何从URL链接获取某些文本

时间:2016-06-10 22:09:28

标签: python-3.x web-scraping beautifulsoup

我想从每个战队页面的统计信息框中抓取所有统计数据。页面的样子可以参见下面的链接。我希望按如下格式打印出来:

月份:胜率% 月份:胜率% 所有时间:胜率%

但是我不确定如何编写代码,因为我在main中编写的最后一段代码给了我一个错误。

http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners

    import time
    import requests
    from bs4 import BeautifulSoup


    def get_all(url, base):
        """Yield the absolute URL of every team with at least five members.

        Starts at the listing page ``url`` and keeps following the pager's
        "Next" entry until a page has none.

        Args:
            url: Address of the first team listing page.
            base: Site root that is prepended to the relative ``href`` values
                found on the page (team links and pager links alike).

        Yields:
            ``base + href`` for each team link whose "Members:" cell starts
            with a number of 5 or more.
        """
        page_url = url
        while True:
            # Same explicit parser for every page, so results do not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')

            for team_link in soup.select('div.details h3 a'):
                members_cell = team_link.find_next('th', text='Members:').find_next_sibling('td')
                # The cell text begins with the member count; keep that number only.
                members = int(members_cell.text.strip().split()[0])
                if members < 5:
                    continue
                yield base + team_link['href']

            # A listing without a pager has no 'pages' div at all.
            pager = soup.find('div', {'class': 'pages'})
            next_page = pager.find('span', text='Next') if pager is not None else None
            if next_page is None:
                break

            # Gives the server a break
            time.sleep(0.2)

            # Build the next address from the ``base`` argument rather than a
            # module-level global, so the function works wherever it is called from.
            page_url = base + next_page.find_previous('a')['href']


    # Script entry point: visit every team page produced by get_all and try to
    # print the win rate found in its statistics box.
    if __name__ == '__main__':

        # Site root (passed to get_all as ``base``) and the team listing page
        # the crawl starts from.
        BASE_URL = 'http://www.gosugamers.net'
        URL = 'http://www.gosugamers.net/counterstrike/teams'

        for links in get_all(URL, BASE_URL): # Each item is one team page URL yielded by get_all
           r = requests.get(links)
           page = r.content
           # NOTE(review): no parser is named here, unlike the 'html.parser'
           # call at the top of get_all.
           soup = BeautifulSoup(page)

           # Walk every table row inside the statistics box.
           for statistics in soup.select('div.statistics tr'):
               # NOTE(review): int() is applied to the object returned by
               # find_next_sibling('td') itself, not to its text, and find()
               # returns None on rows without a 'Winrate:' header cell, so the
               # chained call fails there. This is presumably the error the
               # question refers to.
               win_rate = int(statistics.find('th', text='Winrate:').find_next_sibling('td'))
               print(win_rate)

1 个答案:

答案 0 :(得分:1)

不确定您想要什么,但这将获得所有球队统计数据:

from bs4 import BeautifulSoup, Tag
import requests

# Team page whose statistics table is scraped.
url = "http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners"

response = requests.get(url)
# Stop on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# is installed, which can change the resulting tree and emits a warning.
soup = BeautifulSoup(response.content, "html.parser")

table = soup.select_one("table.stats-table")

# Column titles from the header row; cells with no text are skipped.
head1 = [th.text.strip() for th in table.select("tr.header th") if th.text]

# "tr + tr" matches the first row that directly follows another row, and
# "tr + tr + tr" the first row that follows two. Iterating a row yields its
# child nodes, including bare strings, so only real tags are kept.
head2 = [th.text.strip() for th in table.select_one("tr + tr") if isinstance(th, Tag)]
scores = [th.text.strip() for th in table.select_one("tr + tr + tr") if isinstance(th, Tag)]

print(head1, head2, scores)

输出:

([u'Jun', u'May', u'All time'], [u'Winrate:', u'0%', u'0%', u'0%'], [u'Matches played:', u'0 / 0 / 0', u'0 / 0 / 0', u'0 / 0 / 0'])