Since the link does not change, I am trying to scrape information about all of the companies here. Please help me sort out this situation.
Below is my code:
import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.mpcci.com/members_list.php')
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')[1:2]]

for company in companies:
    r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company})
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table', {"id": "task-table"})
    for member in table.find_all('tr')[1:128]:
        td = member.find_all('td')
        data = td[1]
        members = [data['data-member'] for data in data.find_all('a', class_='get_detailed')]
        ## print(members)
        for member in members:
            r = requests.post('http://www.mpcci.com/get_detailed_members.php', data={'member': member})
            soup = BeautifulSoup(r.text, 'lxml')
            table = soup.find('div', class_='modal-content')
            print(table)
Answer 0 (score: 2)
The HTML returned for a member does not contain the <div> you are looking for. It only returns a <table>, so the following will give you that table, from which you can extract the information you need.
It is also safer to use different variable names for the different requests.
If you want to write the data out in CSV format, you can do that with Python's csv library:
import requests
import csv
import time
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

r = requests.get('http://www.mpcci.com/members_list.php', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')][1:]

with open('names.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Group ID', 'Serial No', 'Firm Name', 'Representative Name', 'Address', 'City', 'Mobile', 'Office Phone', 'Residence Phone', 'Fax', 'Email'])

    for company_number, company in enumerate(companies, start=1):
        r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company}, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        table = soup.find('table', {"id": "task-table"})
        print(f"{company_number} - {soup.h3.text}")  # Show group header

        for member_number, row in enumerate(table.find_all('tr')):
            time.sleep(3.0)  # Slow down the request rate
            tds = row.find_all('td')

            if tds:
                member = tds[1].find('a', class_='get_detailed')['data-member']

                try:
                    r2 = requests.post('http://www.mpcci.com/get_detailed_members.php', data={'member': member}, headers=headers)
                except requests.exceptions.RequestException:
                    print(f"  {member_number} - Failed to get {member}")
                    r2 = None

                if r2:
                    soup2 = BeautifulSoup(r2.text, 'lxml')
                    row = [tr.find_all('td')[1].text for tr in soup2.table.find_all('tr')]
                    print(f"  {member_number} - {row[2]}")  # Show firm name as progress
                    csv_output.writerow(row)
This will give you a names.csv file starting:
Group ID,Serial No,Firm Name,Representative Name,Address,City,Mobile,Office Phone,Residence Phone,Fax,Email
1,1,M/s Premdayal Garg & Sons,Shri Mayur Garg,"Gayatri Sadan, Mahadji Park, Palace Road, Lashkar",Gwalior,9826214555,2435504,2370288,0751-2435504,mayurhello@gmail.com
1,2,"M/s Gwalior Land, Deals & Finance",Shri Ashok Goyal,"Khatke Saheb Ka Bada, Dal Bazar, Lashkar",Gwalior,"7770845000,8889173876",4071308,2452920,-,-
1,3,M/s Yogendra Parakh,Shri Yogendra Parakh,"18/21, Cocher Compound, Jawahar Colony, Kampoo, Lashkar",Gwalior,9425109515,3253395,2444552,-,yparakh@rediffmail.com; k.p.mkt.gwl@gmail.com
Tested with Python 3.6.7
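Since every request here goes to the same host, a requests.Session would also let you set the headers once and reuse the underlying connection. A minimal sketch of that variation, using the same URLs as above (the '1' group value is only a placeholder):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})

# All requests made through the session reuse the headers and connection
r = session.get('http://www.mpcci.com/members_list.php')
r2 = session.post('http://www.mpcci.com/get_members.php', data={'select': '1'})  # '1' is a placeholder group value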
Answer 1 (score: 1)
I like to use pandas to grab <table> tags; it does the hard work for you. I also added a sleep/delay, because the site will block you if you try to hit it too many times in a short period.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

r = requests.get('http://www.mpcci.com/members_list.php', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
companies = [option['value'] for option in soup.select('#select_gp_id option')]

tables = []
for company in companies:
    r = requests.post('http://www.mpcci.com/get_members.php', data={'select': company}, headers=headers)
    time.sleep(6)  # Slow down so the site doesn't block us
    try:
        table = pd.read_html(r.text)[0]  # First <table> in the response
        tables.append(table)
        print('Collected: %s' % company)
    except ValueError:  # pd.read_html raises ValueError when no tables are found
        print('No tables found')

results = pd.concat(tables).reset_index(drop=True)
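If you also want the combined table on disk, a one-line follow-up (the members.csv filename is just an example) would be:

results.to_csv('members.csv', index=False)  # Write the combined table without the index column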
Answer 2 (score: 1)
@MartinEvans and @chitown88 have both added great answers; however, the solution below also demonstrates how to use selenium to scrape the dynamic web page:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time, collections

d = webdriver.Chrome('/Users/path/to/chromedriver')
results = []
d.get('http://www.mpcci.com/members_list.php')
for i in d.find_element_by_id('select_gp_id').find_elements_by_tag_name('option')[1:]:
    _current = collections.defaultdict(list)
    _current['categories'].append(i.text)
    i.click()
    time.sleep(2)
    _current['members'].append({'companies': [[c.text for c in b.find_all('td')][:-1] for b in soup(d.page_source, 'html.parser').find('table', {'id': 'task-table'}).find_all('tr')], 'people': []})
    for h in d.find_elements_by_class_name('get_detailed'):
        if 'Click to View' in h.text:
            h.send_keys('\n')
            time.sleep(3)
            _current['members'][-1]['people'].append([[j.text for j in k.find_all('td')] for k in soup(d.page_source, 'html.parser').find('div', {'id': 'put_detailed_info'}).find('table').find_all('tr')])
            for _h in d.find_elements_by_class_name('close'):  # Dismiss the detail modal
                try:
                    _h.send_keys('\n')
                except:
                    pass
    results.append(_current)
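If the fixed time.sleep delays prove flaky, selenium's explicit waits block only until the element actually appears. A minimal sketch, assuming the same driver d and the put_detailed_info modal from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the modal body to become visible, instead of sleeping a fixed 3 seconds
wait = WebDriverWait(d, 10)
wait.until(EC.visibility_of_element_located((By.ID, 'put_detailed_info')))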