使用Python遍历列表列表并追加到数据框

时间:2020-05-24 08:19:39

标签: python

我试图遍历列表列表并抓取所有链接,将它们作为一个表追加到数据框,但没有成功。如能提供帮助,将不胜感激。

import pandas as pd

import requests
from bs4 import BeautifulSoup

# Fetch the Rediff listing page for companies starting with 'A' and parse
# it with the stdlib HTML parser.
page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')
# Accumulators meant to be filled by the scraping loop below.
company_name = []
company_link = []
company_link_edit=[]

# Each table of companies on the page carries the CSS class 'dataTable'.
company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    """Return *url* as an absolute URL.

    Hrefs on this page are protocol-relative ('//money.rediff.com/...'),
    so prefixing 'http:' makes them fetchable.  URLs that already carry
    an http:// or https:// scheme are returned unchanged (the original
    check only recognised http:// and would mangle https:// links into
    'http:https://...').
    """
    if url.startswith(('http://', 'https://')):
        return url
    return 'http:' + url

# Collect one (name, link) pair per anchor across every dataTable.
all_names = []
all_links = []

for table in company_A_subpg1:
    # Call find_all on the table itself: it walks all descendants.
    # Iterating the table's direct children (the original `for tab in
    # sub_tab`) also yields NavigableString nodes, which have no
    # .find_all and raise AttributeError.
    for anchor in table.find_all('a', href=True):
        all_names.append(anchor.text.strip())
        all_links.append(convert(anchor.get('href')))

# Build the frame once from the accumulated columns.  The original
# rebound company_name/company_link_edit on every loop iteration, so the
# DataFrame built after the loop only ever saw the last table's links.
data_df = pd.DataFrame({'Name': all_names, 'Link': all_links})

data_df.to_csv('results_3.csv')

1 个答案:

答案 0 :(得分:1)


import pandas as pd

import requests
from bs4 import BeautifulSoup

# Fetch the Rediff listing page for companies starting with 'A' and parse
# it with the stdlib HTML parser.
page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')
# Accumulators filled by the scraping loop below.
# NOTE(review): company_link is declared but never appended to here.
company_name = []
company_link = []
company_link_edit=[]

# Each table of companies on the page carries the CSS class 'dataTable'.
company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    """Prefix 'http:' onto *url* unless it already begins with 'http://'.

    Turns the page's protocol-relative hrefs ('//host/path') into
    absolute URLs; anything already starting with 'http://' is passed
    through untouched.
    """
    return url if url.startswith('http://') else 'http:' + url

# Walk each table body row and collect the first link's text and href.
for sub_tab in company_A_subpg1:
    body = sub_tab.find('tbody')
    for row in body.find_all('tr'):
        a_tag = row.find('a', href=True)
        if a_tag is None:
            # A row without any link (header/spacer rows) would crash on
            # .text below; skip it instead of raising AttributeError.
            continue
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))


print(len(company_name), len(company_link_edit))

# A single constructor call suffices; the original concatenated the new
# frame onto a freshly-created empty DataFrame, which added nothing.
data_df = pd.DataFrame(
        {'Name': company_name,
         'Link': company_link_edit
         })

print(data_df.shape)

data_df.to_csv('results_3.csv')

您可以检查csv文件中的值,我提取了页面中提到的所有200个名称和链接。