无法抓取所有数据

时间:2019-11-27 10:40:29

标签: python-3.x beautifulsoup

from bs4 import BeautifulSoup
import requests , sys ,os
import pandas as pd

# Base address of the yearly ranking pages; the four-digit year is appended.
URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']

# Value of the class attribute on the <section> that wraps the ranking entries.
SECTION_CLASS = 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'

# One dict per scraped ranking entry, accumulated across every year.
rows = []

print('\n>>Process started please wait\n\n')

for Page in My_list:
    url = URL + Page
    print('\nData fetching from  : ', url)

    # A failed request must not abort the whole run: report it and move on.
    try:
        Res = requests.get(url, timeout=30)
        Res.raise_for_status()
    except requests.RequestException as err:
        print("Please Check url :", url, err)
        continue

    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': SECTION_CLASS})

    # soup.find() returns None when the section is missing; calling
    # find_all() on it would raise AttributeError.
    if data is None:
        print("Please Check url :", url)
        continue
    print("\n>>Getting page source for :", url)

    # NOTE(review): only the entries present in the static HTML are visible
    # here; if the site loads further entries dynamically they will be
    # missing from the output -- confirm against the live page.
    for item in data.find_all("div", {"class": "RankItem"}):
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        score = item.find("div", {"class": "score"}).get_text().strip()

        rows.append({
            # The year comes from the page being scraped, not from the markup.
            "Year": Page,
            "CompanyName": title,
            # Drop the leading '#' so only the number remains.
            "Rank": rank.lstrip("#"),
            # Keep only the text after the last space (drops a leading label).
            "Score": score.rsplit(" ", 1)[-1],
        })

# Build the table and write the file once, after all pages were processed,
# instead of re-creating both for every single scraped entry.
Data = pd.DataFrame(rows, columns=["Year", "CompanyName", "Rank", "Score"])
Data.to_csv('Vault_scrap.csv', index=False)

每个 URL 的预期输出是 100 行数据(包含年份、排名、标题和得分),但我只得到了 10 行。

1 个答案:

答案 0 :(得分:1)

您可以像下面这样,按年份和页码循环请求数据。

import requests
import pandas as pd

# JSON endpoint that page_loop() queries, one result page per request.
url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    """Collect every result page of the ranking for one year.

    Requests pages 1..100 from ``url`` and stops early at the first page
    whose JSON payload is empty.

    Args:
        year: Ranking year, sent as the ``year`` query parameter.
        url: Endpoint that returns one page of ranking data as JSON.

    Returns:
        A DataFrame with the rows of all fetched pages (fresh 0..n-1 index),
        or an empty DataFrame when the first page is already empty.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    frames = []
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page,
        }

        response = requests.get(url, params=payload, timeout=30)
        response.raise_for_status()
        jsonData = response.json()

        # An empty payload marks the end of the available pages.
        if not jsonData:
            break

        print ('page: %s' %page)
        frames.append(pd.DataFrame(jsonData))

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated table for every page.
    return pd.concat(frames, sort=True).reset_index(drop=True)




# Fetch each year's table, then combine them into `results` in a single step.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
yearly_frames = []
for year in range(2007,2021):
    print ("\n>>Getting page source for :" , year)

    # page_loop already returns a DataFrame, so no re-wrapping is needed.
    yearly_frames.append(page_loop(year, url))

results = pd.concat(yearly_frames, sort=True).reset_index(drop=True)