如何从此代码中删除重复的电子邮件?
它可以与import io
from unittest.mock import patch, MagicMock
class MyObject:
    """Load a text file and keep its stripped lines in ``file_list``."""

    def __init__(self, path):
        """Read *path* line by line into ``self.file_list``.

        The handle is closed in a ``finally`` so it no longer leaks if
        iteration raises (the original skipped ``close()`` on error).
        """
        fp = open(path)
        try:
            self.file_list = [line.strip() for line in fp]
        finally:
            fp.close()
def testsimpleFile():
    """MyObject reads every line via open() and closes the handle once."""
    expected = ["data.csv", "data2.csv"]
    fake_file = MagicMock()
    fake_file.__iter__.return_value = ["data.csv", "data2.csv"]
    with patch("builtins.open", return_value=fake_file, create=True) as mock_open:
        obj = MyObject("/path/to/open/test.f")
        # open() must be used exactly once, with the path we supplied.
        mock_open.assert_called_once_with("/path/to/open/test.f")
        assert obj.file_list == expected
        fake_file.close.assert_called_once()
一起使用吗?我做了一些尝试,但是没有运气。是否可以用 set() 让每个电子邮件地址只出现一次?
答案 0(得分:0)
您的代码有些难以阅读。下面是一个从单个页面获取电子邮件的扩展示例:
import requests
from lxml.html import fromstring
def start(link):
    """Fetch *link* and return the unique e-mail addresses found in
    ``<a>`` ``title`` attributes.

    Returns an empty list when the page does not answer with HTTP 200
    (the original fell through and implicitly returned ``None``, which
    crashed callers that iterate over the result).
    """
    response = requests.get(link)
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    titles = tree.xpath('//a/@title')
    # A set comprehension deduplicates in one pass.
    return list({title for title in titles if '@' in title})
if __name__ == "__main__":
    # Demo run against a single school page.
    page = "http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"
    for email in start(page):
        print(email)
或者,您可以使用正则表达式从链接地址中提取电子邮件,如下所示:
import requests
import re
from lxml.html import fromstring
def start(link):
    """Fetch *link* and return unique e-mails extracted from the
    ``email=`` query parameter of ``<a>`` ``href`` attributes.

    Returns an empty list when the page does not answer with HTTP 200
    (the original implicitly returned ``None`` on that path, which
    crashed callers that iterate over the result).
    """
    response = requests.get(link)
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    # Compile once, outside the loop; raw string for the regex.
    pattern = re.compile(r'(?<=email=).+@.+?(?=(&|$))')
    emails = set()
    # NOTE: the original loop variable shadowed the `link` parameter.
    for href in tree.xpath('//a/@href'):
        match = pattern.search(href)
        if match:
            emails.add(match.group())
    return list(emails)
if __name__ == "__main__":
    # Demo run: print every e-mail found on one school page.
    found = start("http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/")
    for email in found:
        print(email)
以及抓取所有学校数据的完整代码:
import requests
import random
from time import sleep
from lxml.html import fromstring
def get_all_schools_urls(url='http://www.schulliste.eu/type/gymnasien/', paginate_by=20, ping=None):
    """Walk the paginated school listing and return unique school-page URLs.

    Parameters:
        url: listing page to paginate through via its ``start`` offset.
        paginate_by: page size used to advance the offset.
        ping: optional ``(low, high)`` range of seconds; when given, sleep
            a random amount between requests to avoid hammering the server.

    Raises:
        ConnectionError: when a listing page does not answer with HTTP 200
            (the original raised a bare ConnectionError with no context).
    """
    school_urls = []
    offset = 0
    while True:
        if ping:
            sleep(random.randrange(*ping))
        school_list_url = '{0}?start={1}'.format(url, offset)
        print('\tCollecting urls from {0}'.format(school_list_url))
        response = requests.get(school_list_url)
        if response.status_code != 200:
            raise ConnectionError(
                'GET {0} returned {1}'.format(school_list_url, response.status_code))
        tree = fromstring(response.text)
        urls = tree.xpath('//div[@class="school_name"]/a/@href')
        if not urls:
            # An empty page means we walked past the last one.
            break
        school_urls += urls
        print('\t\tFound urls {0}'.format(len(urls)))
        offset += paginate_by
    # Deduplicate before returning.
    return list(set(school_urls))
def get_emails(urls):
    """Collect unique e-mail addresses from the given school pages.

    An address is any ``<a>`` ``title`` attribute containing ``'@'``.

    Raises:
        ConnectionError: when a page does not answer with HTTP 200
            (the original raised a bare ConnectionError with no context).
    """
    # Accumulate into a set directly so duplicates never pile up.
    emails = set()
    for url in urls:
        print('\tCollecting e-mails from {0}'.format(url))
        response = requests.get(url)
        if response.status_code != 200:
            raise ConnectionError(
                'GET {0} returned {1}'.format(url, response.status_code))
        tree = fromstring(response.text)
        for title in tree.xpath('//a/@title'):
            if '@' in title:
                emails.add(title)
    return list(emails)
def start(output_urls, output_emails):
    """Scrape all school URLs and their e-mails, saving each list to a file.

    Parameters:
        output_urls: path of the file receiving one school URL per line.
        output_emails: path of the file receiving one e-mail per line.
    """
    print("Starting collection of school urls")
    schools_urls = get_all_schools_urls()
    print("Collected {0} schools urls".format(len(schools_urls)))
    with open(output_urls, 'w') as file:
        file.write("\n".join(schools_urls))
    print("Schools urls saved: {0}".format(output_urls))
    print("Starting collection of school emails")
    schools_emails = get_emails(schools_urls)
    print("Collected {0} schools emails".format(len(schools_emails)))
    with open(output_emails, 'w') as file:
        file.write("\n".join(schools_emails))
    # Bug fix: the original formatted output_urls into this message.
    print("Schools e-mails saved: {0}".format(output_emails))
if __name__ == "__main__":
    # Output destinations for the collected data.
    urls_file = 'schools_urls.txt'
    emails_file = 'schools_emails.txt'
    start(urls_file, emails_file)