我有一个链接,需要从该页面中提取数据。
以下代码可以成功提取名称,但其余字段位于嵌套标签中,应该如何提取?
import re
import urllib.request

from bs4 import BeautifulSoup

# Scrape every "grey_container" block on each listed page and collect one
# dict per block in `result`.
l = ['https://www.aaaindia.org/author/aapl/']
result = []
parser = 'html.parser'

for link in l:
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(link) as resp:
        soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

    for individual in soup.find_all("div", {"class": "grey_container"}):
        name = individual.find('h4').text
        # Both fields read the first <p> of the container, so they always
        # carry the same text.
        first_paragraph = individual.find("p").text
        contact = first_paragraph
        contactus = first_paragraph
        # r'\d+' collects runs of digits; the previous pattern '/d+' matched
        # a literal slash followed by the letter "d" and never found numbers.
        phonenu = re.findall(r'\d+', contactus)
        result.append({"name":name,'contact': contact, 'contactus':contactus, 'phone':phonenu})
答案 0 :(得分:2)
import requests
from bs4 import BeautifulSoup
# Target page and the browser-like User-Agent header sent with the request.
url = 'https://www.aaaindia.org/author/aapl/'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0',
}

# Download the page and parse the raw response bytes.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Name: text of the first <h4> in the document.
name = soup.select_one('h4').text

# Contact person: first text node of the <p> that follows the
# "Contact Person:" label.
person_label = soup.find(text=lambda t: 'Contact Person:' in t)
contact = person_label.find_next('p').find_next(text=True).strip()

# Address: the <p> that follows the <h6> whose text contains "Contact Us".
contact_heading = soup.find('h6', text=lambda t: 'Contact Us' in t)
contact_us = contact_heading.find_next('p').text.strip()

# Phone and e-mail: the text node right after the matching icon element.
phone = soup.select_one('.fa-phone').find_next(text=True)
email = soup.select_one('.fa-envelope').find_next(text=True)

# One value per line, in the same order as before.
print(name, contact, contact_us, phone, email, sep='\n')
打印:
ASSOCIATED ADVERTISING PVT LTD
Mr Hemant Agarwal – Managing Director
550/A1, 1st Floor, Road No.92,
Jubilee Hills,
HYDERABAD 500 096
(040) 2354 2429 / 2355 1095
hemant@associated.co.in
答案 1 :(得分:1)
尝试:
import re
from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen as uReq
# Scrape every "grey_container" block on each listed page and collect one
# dict per block in `result`.
l = ['https://www.aaaindia.org/author/aapl/']
result = []
parser = 'html.parser'

for link in l:
    # Close the HTTP response as soon as the document has been parsed.
    with uReq(link) as resp:
        soup = bs4(resp, parser, from_encoding=resp.info().get_param('charset'))

    for individual in soup.find_all("div", {"class": "grey_container"}):
        all_p = individual.find_all("p")
        # The third <p> is split on newlines into exactly three parts:
        # contact person, phone, and the last line (stored as 'contactus').
        # The unpacking raises ValueError if the split yields any other count.
        contact, phonenu, contactus = all_p[2].text.strip().split('\n')
        name = individual.find('h4').text
        result.append({"name":name,'contact': contact.strip(), 'contactus':contactus, 'phone':phonenu})
结果:
[{'name': 'ASSOCIATED ADVERTISING PVT LTD',
'contact': 'Mr Hemant Agarwal – Managing Director',
'contactus': 'hemant@associated.co.in',
'phone': '(040) 2354 2429 / 2355 1095 '}]
答案 2 :(得分:1)
另一种方法。
/**
 * Component that keeps a list of items in state, fetches once after mount,
 * and fetches again every ONE_MINUTE, appending whatever
 * `fetchNewItemsSince` resolves with.
 */
const Component = () => {
  const [items, setItems] = useState([]);
  // Mirrors the latest `items` so `fetchItems` can read the current list
  // without listing `items` as a dependency.
  const itemsRef = useRef(items);

  // Created once (empty dependency list); reads the list through the ref.
  const fetchItems = useCallback(() => {
    const first = itemsRef.current[0];
    fetchNewItemsSince(first || 0).then((fetched) => {
      setItems((previous) => [...previous, ...fetched]);
    });
  }, []);

  // Keep the ref in sync whenever `items` changes.
  useEffect(() => {
    itemsRef.current = items;
  }, [items]);

  // Initial fetch after mount.
  useEffect(() => {
    fetchItems();
  }, [fetchItems]);

  // Periodic fetch; the cleanup cancels the timer.
  useEffect(() => {
    const timerId = setInterval(fetchItems, ONE_MINUTE);
    return () => clearInterval(timerId);
  }, [fetchItems]);
};
结果:
from simplified_scrapy import SimplifiedDoc, utils, req
# Pages to scrape; one record is appended to `result` for each page.
l = ['https://www.aaaindia.org/author/aapl/']
result = []

for link in l:
    html = req.get(link)
    doc = SimplifiedDoc(html)
    grey_container = doc.getElement('div', value='grey_container')

    name = grey_container.h4.text
    contactus = grey_container.getElement('p', start='Contact Us').text

    # Option 1: look each value up individually.
    person = grey_container.getElement('p', start='Contact Person').firstText()
    contact = grey_container.getElement('i', value='fa fa-envelope').nextText()
    phone = grey_container.getElement('i', value='fa fa-phone').nextText()

    # Option 2: take the "Contact Person" paragraph text joined with '|' and
    # split it into three parts. This runs after option 1 and overwrites the
    # three values assigned above.
    person, phone, contact = grey_container.getElement('p', start='Contact Person').getText('|').split('|')

    record = {
        'name': name,
        'contactus': contactus,
        'contact': contact,
        'phone': phone,
        'person': doc.unescape(person),
    }
    result.append(record)

print(result)