答案 0 :(得分:0)
这可能有点棘手,因为每个网站的结构可能都不同。不过,您可以先尝试一些常见的标识符,例如执行 soup.select("a[href*=mailto]") 来获取电子邮件,
或执行 soup.select("a[href*=callto]") 来获取电话号码。
您还可以使用正则表达式,从 HTML 文本中提取符合电话号码和/或电子邮件地址格式的字符串。当然,这会提取出它找到的所有电子邮件或电话,因此我不得不做一些假设(例如只取第一个或最后一个匹配项)。
这并不完美,但希望能帮助您前进:
import pandas as pd
import requests
import bs4
import re
# Load the input table; the scraping loop further down reads each row's
# 'website' column and writes 'Phone' / 'Email' back into this frame.
src_df = pd.read_csv(filepath_or_buffer='C:/src_file.csv')
def get_phone(soup):
    """Return a best-guess phone number for a parsed page, or '' if none.

    Strategies are tried in order and the first one that yields a result wins:

    1. the text of the first ``<a>`` whose ``href`` contains ``callto``;
    2. the first dash-separated ``NNN-NNN-NNNN`` number in the markup;
    3. the last number in the markup whose groups are separated by an
       optional ``-``, ``.`` or space.

    The markup is obtained from ``str(soup)`` so that the function depends
    only on its argument rather than on a module-level ``response`` object.

    Args:
        soup: Parsed document exposing ``select(css_selector)`` (for example
            a ``bs4.BeautifulSoup``) and whose ``str()`` is the page markup.

    Returns:
        The matched phone string. If nothing matches, prints
        ``'Phone number not found'`` and returns ``''``.
    """
    # An explicit callto link is the most reliable signal, so check it first.
    callto_links = soup.select("a[href*=callto]")
    if callto_links:
        return callto_links[0].text

    html = str(soup)

    # Strict form: dashes required between the three digit groups.
    dashed = re.findall(
        r'\(?\b[2-9][0-9]{2}\)?[-][2-9][0-9]{2}[-][0-9]{4}\b', html)
    if dashed:
        return dashed[0]

    # Looser form: separators optional; the LAST hit is taken here
    # (the strict pass above takes the first).
    loose = re.findall(
        r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b', html)
    if loose:
        return loose[-1]

    print('Phone number not found')
    return ''
def get_email(soup):
    """Return a best-guess e-mail address for a parsed page, or '' if none.

    Strategies are tried in order:

    1. the last address-shaped string (``local@domain.tld``) found anywhere
       in the markup;
    2. the text of the last ``<a>`` whose ``href`` contains ``mailto``.

    The markup is obtained from ``str(soup)`` so that the function depends
    only on its argument rather than on a module-level ``response`` object.

    Args:
        soup: Parsed document exposing ``select(css_selector)`` (for example
            a ``bs4.BeautifulSoup``) and whose ``str()`` is the page markup.

    Returns:
        The matched string. If nothing matches, prints ``'Email not found'``
        and returns ``''``.
    """
    addresses = re.findall(
        r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', str(soup))
    if addresses:
        # The last occurrence on the page is the one taken.
        return addresses[-1]

    # Fall back to the visible text of a mailto link, which may be a label
    # rather than an address.
    mailto_links = soup.select("a[href*=mailto]")
    if mailto_links:
        return mailto_links[-1].text

    print('Email not found')
    return ''
# Visit each organization's site and record the phone/email found there.
for i, row in src_df.iterrows():
    url = 'http://www.' + row['website']
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report the URL and the error itself: when the request fails,
        # `response` is either unbound or left over from the previous row.
        print('Unsuccessful: %s (%s)' % (url, exc))
        continue

    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    phone = get_phone(soup)
    email = get_email(soup)

    src_df.loc[i, 'Phone'] = phone
    src_df.loc[i, 'Email'] = email
    print('website:%s\nphone: %s\nemail: %s\n' % (url, phone, email))

# Write the augmented table once, after every row has been processed.
src_df.to_csv('output.csv', index=False)
输出:
print (src_df)
Organization ... Email
0 California Community College ... tcalhoon@ccctechcenter.org
1 Colorado Community College ... cccs.communications@cccs.edu
2 Raritan Valley Community College ... Cheryl.Wallace@RaritanVal.edu
[3 rows x 5 columns]