我试图遍历一个嵌套列表(列表的列表),抓取其中的所有链接,并将它们作为一张表追加到数据框中,但没有成功。希望能得到帮助,不胜感激。
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the listing page. A timeout keeps the script from hanging forever
# on an unresponsive server.
page = requests.get('https://money.rediff.com/companies/groups/A', timeout=30)
# Stop on an HTTP error (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Accumulators for the scraped values.
company_name = []
company_link = []
company_link_edit = []

# Every element carrying the CSS class "dataTable".
company_A_subpg1 = soup.find_all(class_='dataTable')
def convert(url):
    """Return *url* with an explicit scheme.

    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged. Anything else is prefixed with ``http:``, which turns a
    protocol-relative link such as ``//host/path`` into ``http://host/path``.
    """
    # Recognise both schemes: checking only 'http://' would mangle an
    # 'https://...' link into 'http:https://...'.
    if url.startswith(('http://', 'https://')):
        return url
    return 'http:' + url
# Collect the text and href of every anchor inside each matched table.
#
# find_all() is called on the table itself: iterating over a Tag yields its
# direct children, which include plain text nodes that cannot be searched.
# The lists are appended to (not reassigned) so that results from every table
# are kept rather than only those of the last one visited.
company_name = []
company_link = []
company_link_edit = []
for sub_tab in company_A_subpg1:
    for anchor in sub_tab.find_all('a', href=True):
        href = anchor.get('href')
        company_name.append(anchor.text.strip())
        company_link.append(href)
        company_link_edit.append(convert(href))

# One row per link: the visible text and the link with an explicit scheme.
data_df = pd.DataFrame({
    'Name': company_name,
    'Link': company_link_edit,
})
data_df.to_csv('results_3.csv')
答案 0 :(得分:1)
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the listing page. A timeout keeps the script from hanging forever
# on an unresponsive server.
page = requests.get('https://money.rediff.com/companies/groups/A', timeout=30)
# Stop on an HTTP error (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Accumulators filled by the row loop further down.
company_name = []
company_link = []
company_link_edit = []

# Every element carrying the CSS class "dataTable".
company_A_subpg1 = soup.find_all(class_='dataTable')
def convert(url):
    """Return *url* with an explicit scheme.

    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged. Anything else is prefixed with ``http:``, which turns a
    protocol-relative link such as ``//host/path`` into ``http://host/path``.
    """
    # Recognise both schemes: checking only 'http://' would mangle an
    # 'https://...' link into 'http:https://...'.
    if url.startswith(('http://', 'https://')):
        return url
    return 'http:' + url
# Walk every row of every matched table and record the first link in the row.
for sub_tab in company_A_subpg1:
    # find() returns None when the markup has no explicit <tbody>; in that
    # case search the table element itself instead of failing.
    body = sub_tab.find('tbody')
    if body is None:
        body = sub_tab
    for row in body.find_all('tr'):
        a_tag = row.find('a', href=True)
        # Rows without a link (e.g. header or spacer rows) are skipped.
        if a_tag is None:
            continue
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))

# Sanity check: both lists must have the same length to form a table.
print(len(company_name), len(company_link_edit))

df = pd.DataFrame({
    'Name': company_name,
    'Link': company_link_edit,
})
# The frame is built in one step, so there is nothing to concatenate it with.
data_df = df
print(df.shape)
data_df.to_csv('results_3.csv')
您可以检查csv文件中的值,我提取了页面中提到的所有200个名称和链接。