Python网络抓取删除重复的电子邮件

时间:2018-10-10 10:47:29

标签: python email web-scraping duplicates

如何从此代码中删除重复的电子邮件?可以用 set() 来实现吗?也就是说,能否让每个电子邮件地址只出现一次?我做了一些尝试,但是没有成功。

1 个答案:

答案 0 :(得分:0)

您的代码可读性较差。下面是一个经过改写的扩展示例,用于从一个页面获取电子邮件:

import requests
from lxml.html import fromstring


def start(link):
    """Fetch *link* and return a de-duplicated list of e-mail addresses.

    E-mails are taken from the ``title`` attribute of every ``<a>`` tag
    that contains an ``@`` character.

    :param link: URL of the page to scrape.
    :return: list of unique e-mail strings; empty list if the request
        does not return HTTP 200.
    """
    response = requests.get(link)
    # BUG FIX: the original returned None on a non-200 response, which
    # made the caller's "for i in start(...)" loop raise TypeError.
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    # A set de-duplicates the addresses in a single pass.
    emails = {title for title in tree.xpath('//a/@title') if '@' in title}
    return list(emails)


# Script entry point: scrape one school page and print each unique e-mail.
if __name__ == "__main__":
    for i in start("http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"):
        print(i)

或者您可以使用正则表达式从网址中提取电子邮件,如下所示:

import requests
import re
from lxml.html import fromstring


def start(link):
    """Fetch *link* and return a de-duplicated list of e-mail addresses.

    E-mails are extracted with a regular expression from the ``href``
    attribute of every ``<a>`` tag, matching the value of an ``email=``
    query parameter up to the next ``&`` or end of string.

    :param link: URL of the page to scrape.
    :return: list of unique e-mail strings; empty list if the request
        does not return HTTP 200.
    """
    response = requests.get(link)
    # BUG FIX: the original returned None on a non-200 response, which
    # made the caller's "for i in start(...)" loop raise TypeError.
    if response.status_code != 200:
        return []
    tree = fromstring(response.text)
    # Compile once (raw string) instead of re-searching with a plain
    # string literal on every iteration.
    email_pattern = re.compile(r'(?<=email=).+@.+?(?=(&|$))')
    emails = set()
    # Renamed loop variable: the original shadowed the `link` parameter.
    for href in tree.xpath('//a/@href'):
        match = email_pattern.search(href)
        if match:
            emails.add(match.group())
    return list(emails)


# Script entry point: scrape one school page and print each unique e-mail.
if __name__ == "__main__":
    for i in start("http://www.schulliste.eu/schule/33601-elsterschloss-gymnasium/"):
        print(i)

以及抓取所有学校信息的完整代码:

import requests
import random
from time import sleep
from lxml.html import fromstring


def get_all_schools_urls(url='http://www.schulliste.eu/type/gymnasien/', paginate_by=20, ping=None):
    """Walk the paginated school listing and collect every school URL.

    Pages are requested with an increasing ``?start=<offset>`` parameter
    until a page yields no school links.

    :param url: base listing URL to paginate over.
    :param paginate_by: how many entries each listing page holds.
    :param ping: optional (low, high) range passed to random.randrange;
        when given, sleep a random number of seconds before each request
        to throttle the crawl.
    :raises ConnectionError: if any page request is not HTTP 200.
    :return: de-duplicated list of school URLs.
    """
    collected = []
    offset = 0
    while True:
        if ping:
            sleep(random.randrange(*ping))

        page_url = '{0}?start={1}'.format(url, offset)
        print('\tCollecting urls from {0}'.format(page_url))

        response = requests.get(page_url)
        if response.status_code != 200:
            raise ConnectionError

        tree = fromstring(response.text)
        found = tree.xpath('//div[@class="school_name"]/a/@href')
        if not found:
            # An empty page means we walked past the last listing page.
            break
        collected.extend(found)
        print('\t\tFound urls {0}'.format(len(found)))

        offset += paginate_by

    return list(set(collected))


def get_emails(urls):
    """Visit each URL and gather unique e-mail addresses.

    E-mails are taken from the ``title`` attribute of every ``<a>`` tag
    that contains an ``@`` character.

    :param urls: iterable of page URLs to scrape.
    :raises ConnectionError: if any page request is not HTTP 200.
    :return: de-duplicated list of e-mail strings.
    """
    found = []
    for page_url in urls:
        print('\tCollecting e-mails from {0}'.format(page_url))
        response = requests.get(page_url)
        if response.status_code != 200:
            raise ConnectionError
        tree = fromstring(response.text)
        found.extend(
            title for title in tree.xpath('//a/@title') if '@' in title
        )
    # De-duplicate once at the end.
    return list(set(found))


def start(output_urls, output_emails):
    """Crawl all school pages and save their URLs and e-mails to disk.

    :param output_urls: path of the file to write the school URLs to,
        one per line.
    :param output_emails: path of the file to write the e-mails to,
        one per line.
    """
    print("Starting collection of school urls")
    schools_urls = get_all_schools_urls()
    print("Collected {0} schools urls".format(len(schools_urls)))
    with open(output_urls, 'w') as file:
        file.write("\n".join(schools_urls))
    print("Schools urls saved: {0}".format(output_urls))

    print("Starting collection of school emails")
    schools_emails = get_emails(schools_urls)
    print("Collected {0} schools emails".format(len(schools_emails)))
    with open(output_emails, 'w') as file:
        file.write("\n".join(schools_emails))
    # BUG FIX: the original printed output_urls here, reporting the wrong
    # filename for the e-mail output.
    print("Schools e-mails saved: {0}".format(output_emails))


# Script entry point: crawl every school and write the results to two files.
if __name__ == "__main__":
    start('schools_urls.txt', 'schools_emails.txt')