如何从嵌套标签的页面中提取文本

时间:2020-10-10 09:09:16

标签: python beautifulsoup

我有一个链接,需要从该页面中提取以下内容:

  • 名称、联系人、"联系我们"(Contact Us)信息,以及使用正则表达式从"联系我们"中提取的电话号码

以下代码可以成功提取名称,但其余字段位于嵌套的标签中,应该如何提取?

    import re
    import urllib.request

    from bs4 import BeautifulSoup

    # Collect name / contact details from every "grey_container" block
    # found on each page listed in `l`.
    l = ['https://www.aaaindia.org/author/aapl/']
    result = []
    for link in l:
        parser = 'html.parser'
        # The context manager closes the HTTP response once the page is parsed.
        with urllib.request.urlopen(link) as resp:
            soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
        for individual in soup.find_all("div", {"class": "grey_container"}):
            name = individual.find('h4').text
            # NOTE: both fields below read the *first* <p> of the container, so
            # they always hold the same text; picking the right nested tag for
            # each field is the open question of this post.
            contact = individual.find("p").text
            contactus = individual.find("p").text
            # Use a raw string with a backslash: '/d+' matches a literal slash
            # followed by the letter "d", never a run of digits.
            phonenu = re.findall(r'\d+', contactus)
            # Append inside the loop so every container yields a row and an
            # empty page does not hit undefined names.
            result.append({"name": name, 'contact': contact, 'contactus': contactus, 'phone': phonenu})
        

3 个答案:

答案 0 :(得分:2)

import requests
from bs4 import BeautifulSoup


# Fetch one author page and print the company name, contact person,
# "Contact Us" text, phone number and e-mail address found on it.
url = 'https://www.aaaindia.org/author/aapl/'
# Browser-like User-Agent (presumably the site rejects the default one - verify).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}

# A timeout keeps a stalled connection from hanging forever, and
# raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

name = soup.select_one('h4').text
# Locate the "Contact Person:" label, then take the first text node of the
# following <p>.
contact = soup.find(string=lambda t: 'Contact Person:' in t).find_next('p').find_next(string=True).strip()
# When a tag name is given, the filter receives tag.string, which is None for
# tags with more than one child; guard so such <h6> tags are skipped instead
# of raising TypeError.
contact_us = soup.find('h6', string=lambda t: t and 'Contact Us' in t).find_next('p').text.strip()
# Phone and e-mail are the text nodes that follow their respective icons.
phone = soup.select_one('.fa-phone').find_next(string=True)
email = soup.select_one('.fa-envelope').find_next(string=True)

print(name)
print(contact)
print(contact_us)
print(phone)
print(email)

打印:

ASSOCIATED ADVERTISING PVT LTD
Mr Hemant Agarwal – Managing Director
550/A1, 1st Floor, Road No.92,  
Jubilee Hills,  
HYDERABAD 500 096
(040) 2354 2429 / 2355 1095 
hemant@associated.co.in

答案 1 :(得分:1)

尝试:

import re
from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen as uReq

# Collect name, contact person, phone and e-mail from every
# "grey_container" block found on each page listed in `l`.
l = ['https://www.aaaindia.org/author/aapl/']
result = []
for link in l:
    parser = 'html.parser'
    # The context manager closes the HTTP response once the page is parsed.
    with uReq(link) as resp:
        soup = bs4(resp, parser, from_encoding=resp.info().get_param('charset'))
    for individual in soup.find_all("div", {"class": "grey_container"}):
        all_p = individual.find_all("p")
        # all_p[1] is presumably the address paragraph; it is not part of the
        # result, so it is not extracted here.
        # The third <p> is expected to hold exactly three lines:
        # contact person, phone number, e-mail (raises ValueError otherwise).
        contact, phonenu, contactus = all_p[2].text.strip().split('\n')
        name = individual.find('h4').text

        # Append per container: doing it after the loop keeps only the last
        # container and fails with NameError when none is found.
        result.append({"name": name, 'contact': contact.strip(), 'contactus': contactus, 'phone': phonenu})

结果:

[{'name': 'ASSOCIATED ADVERTISING PVT LTD',
  'contact': 'Mr Hemant Agarwal – Managing Director',
  'contactus': 'hemant@associated.co.in',
  'phone': '(040) 2354 2429 / 2355 1095 '}]

答案 2 :(得分:1)

另一种方法。

const Component = () => {
  const [items, setItems] = useState([]);
  // Mirror of `items` that the memoized fetcher can read without
  // listing `items` as a dependency.
  const itemsRef = useRef(items);

  // Created once (empty dependency list): reads the current items through
  // the ref and appends whatever the fetch resolves with.
  const fetchItems = useCallback(() => {
    const first = itemsRef.current[0];
    const since = first || 0;
    fetchNewItemsSince(since).then((newItems) =>
      setItems((oldItems) => [...oldItems, ...newItems])
    );
  }, []);

  // Keep the ref in sync with the latest `items` state.
  useEffect(() => {
    itemsRef.current = items;
  }, [items]);

  // Initial fetch on mount.
  useEffect(() => {
    fetchItems();
  }, [fetchItems]);

  // Re-fetch every ONE_MINUTE; the cleanup stops the timer.
  useEffect(() => {
    const timerId = setInterval(fetchItems, ONE_MINUTE);
    return () => clearInterval(timerId);
  }, [fetchItems]);
};

使用 simplified_scrapy 的 Python 实现:

from simplified_scrapy import SimplifiedDoc, utils, req

# Extract the company name, "Contact Us" text, contact person, phone and
# e-mail from the "grey_container" block of each page listed in `l`.
l = ['https://www.aaaindia.org/author/aapl/']
result = []
for url in l:
    doc = SimplifiedDoc(req.get(url))
    box = doc.getElement('div', value='grey_container')

    name = box.h4.text
    contactus = box.getElement('p', start='Contact Us').text

    # Variant 1: read each field from its own element.
    person = box.getElement('p', start='Contact Person').firstText()
    contact = box.getElement('i', value='fa fa-envelope').nextText()
    phone = box.getElement('i', value='fa fa-phone').nextText()

    # Variant 2: split the whole "Contact Person" paragraph on '|'.
    # Both variants run, so these values override the ones from variant 1.
    contact_person_text = box.getElement('p', start='Contact Person').getText('|')
    person, phone, contact = contact_person_text.split('|')

    row = {
        'name': name,
        'contactus': contactus,
        'contact': contact,
        'phone': phone,
        'person': doc.unescape(person),
    }
    result.append(row)

print(result)