Since the link does not change, I am trying to scrape information about all of the companies here. Please help me sort out this situation.
Below is my code:
import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.mpcci.com/members_list.php')
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')[1:2]]

for company in companies:
    r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company})
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table', {"id": "task-table"})
    for member in table.find_all('tr')[1:128]:
        td = member.find_all('td')
        data = td[1]
        members = [data['data-member'] for data in data.find_all('a', class_='get_detailed')]
        ## print(members)
        for member in members:
            r = requests.post('http://www.mpcci.com/get_detailed_members.php', data={'member': member})
            soup = BeautifulSoup(r.text, 'lxml')
            table = soup.find('div', class_='modal-content')
            print(table)
Answer 0 (score: 2)
The HTML returned for a member does not contain the <div> you are looking for. It only returns a <table>, so the following will give you that table, from which you can extract the information you need.
It is also safer to use different variable names for the different requests.
If you want to write the data out in CSV format, you can do that with Python's csv library:
import requests
import csv
import time
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

r = requests.get('http://www.mpcci.com/members_list.php', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')][1:]

with open('names.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Group ID', 'Serial No', 'Firm Name', 'Representative Name', 'Address', 'City', 'Mobile', 'Office Phone', 'Residence Phone', 'Fax', 'Email'])

    for company_number, company in enumerate(companies, start=1):
        r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company}, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        table = soup.find('table', {"id": "task-table"})
        print(f"{company_number} - {soup.h3.text}")  # Show group header

        for member_number, row in enumerate(table.find_all('tr')):
            time.sleep(3.0)  # Slow down the request rate
            tds = row.find_all('td')

            if tds:
                member = tds[1].find('a', class_='get_detailed')['data-member']

                try:
                    r2 = requests.post('http://www.mpcci.com/get_detailed_members.php', data={'member': member}, headers=headers)
                except requests.exceptions.RequestException:
                    print(f"  {member_number} - Failed to get {member}")
                    r2 = None

                if r2:
                    soup2 = BeautifulSoup(r2.text, 'lxml')
                    row = [tr.find_all('td')[1].text for tr in soup2.table.find_all('tr')]
                    print(f"  {member_number} - {row[2]}")  # Show firm name as progress
                    csv_output.writerow(row)
This will give you a names.csv file starting:
Group ID,Serial No,Firm Name,Representative Name,Address,City,Mobile,Office Phone,Residence Phone,Fax,Email
1,1,M/s Premdayal Garg & Sons,Shri Mayur Garg,"Gayatri Sadan, Mahadji Park, Palace Road, Lashkar",Gwalior,9826214555,2435504,2370288,0751-2435504,mayurhello@gmail.com
1,2,"M/s Gwalior Land, Deals & Finance",Shri Ashok Goyal,"Khatke Saheb Ka Bada, Dal Bazar, Lashkar",Gwalior,"7770845000,8889173876",4071308,2452920,-,-
1,3,M/s Yogendra Parakh,Shri Yogendra Parakh,"18/21, Cocher Compound, Jawahar Colony, Kampoo, Lashkar",Gwalior,9425109515,3253395,2444552,-,yparakh@rediffmail.com; k.p.mkt.gwl@gmail.com
Tested with Python 3.6.7
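Since every request here goes to the same host, a requests.Session would also let you set the headers once and reuse the underlying connection. A minimal sketch of that variation, using the same URLs as above (the '1' group value is only a placeholder):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})

# All requests made through the session reuse the headers and connection
r = session.get('http://www.mpcci.com/members_list.php')
r2 = session.post('http://www.mpcci.com/get_members.php', data={'select': '1'})  # '1' is a placeholder group value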
Answer 1 (score: 1)
I like to use pandas to grab <table> tags; it does the hard work for you. I also added a sleep/delay, because the site will block you if you try to hit it too many times in a short period.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

r = requests.get('http://www.mpcci.com/members_list.php', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')]

tables = []
for company in companies:
    r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company}, headers=headers)
    time.sleep(6)  # Slow down so the site doesn't block us
    try:
        table = pd.read_html(r.text)[0]  # First <table> in the response
        tables.append(table)
        print('Collected: %s' % company)
    except ValueError:  # pd.read_html raises ValueError when no tables are found
        print('No tables found')

results = pd.concat(tables).reset_index(drop=True)
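If you also want the combined table on disk, a one-line follow-up (the members.csv filename is just an example) would be:

results.to_csv('members.csv', index=False)  # Write the combined table without the index column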
Answer 2 (score: 1)
@MartinEvans and @chitown88 have both added great answers; however, the solution below also demonstrates how to use selenium to scrape the dynamic web page:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time, collections

d = webdriver.Chrome('/Users/path/to/chromedriver')
results = []
d.get('http://www.mpcci.com/members_list.php')
for i in d.find_element_by_id('select_gp_id').find_elements_by_tag_name('option')[1:]:
    _current = collections.defaultdict(list)
    _current['categories'].append(i.text)
    i.click()
    time.sleep(2)
    _current['members'].append({'companies': [[c.text for c in b.find_all('td')][:-1] for b in soup(d.page_source, 'html.parser').find('table', {'id': 'task-table'}).find_all('tr')], 'people': []})
    for h in d.find_elements_by_class_name('get_detailed'):
        if 'Click to View' in h.text:
            h.send_keys('\n')
            time.sleep(3)
            _current['members'][-1]['people'].append([[j.text for j in k.find_all('td')] for k in soup(d.page_source, 'html.parser').find('div', {'id': 'put_detailed_info'}).find('table').find_all('tr')])
            for _h in d.find_elements_by_class_name('close'):  # Dismiss the detail modal
                try:
                    _h.send_keys('\n')
                except:
                    pass
    results.append(_current)
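If the fixed time.sleep delays prove flaky, selenium's explicit waits block only until the element actually appears. A minimal sketch, assuming the same driver d and the put_detailed_info modal from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the modal body to become visible, instead of sleeping a fixed 3 seconds
wait = WebDriverWait(d, 10)
wait.until(EC.visibility_of_element_located((By.ID, 'put_detailed_info')))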