Question

我有一个用来清理网址的脚本，目的是把 example.com/example1 和 example.com/example2 这样的网址还原为基本域 example.com。我的问题是：处理 URL 文件时会得到重复的基本域。我想在把 URL 写入文件时删除这些重复项。下面是我目前的代码。

from Tkinter import *
import tkFileDialog
import re

# Optional scheme (http/https, or the defanged hxxp/hxxps) followed by an
# optional "www.".  Anchored at the start so only a leading prefix is removed,
# never a "www." that merely appears somewhere inside the host name.
_URL_PREFIX = re.compile(r'^(?:h(?:tt|xx)ps?://)?(?:www\.)?')


def _base_domain(line):
    """Return the base domain of one (possibly defanged) URL line, or ''."""
    # strip() drops the trailing newline so identical domains compare equal.
    url = line.strip().replace('[.]', '.').replace('[dot]', '.')
    url = _URL_PREFIX.sub('', url)
    # Everything from the first "/" onwards is path, not domain.
    return url.split('/', 1)[0]


def main():
    """Ask for a text file of URLs and write its unique base domains.

    A file-open dialog lets the user pick a ``.txt`` file containing one URL
    per line.  Each line is reduced to its base domain and every distinct
    domain is written once, one per line and in first-seen order, to
    ``URL Cleaned.txt`` in the current working directory.

    Returns without creating the output file when the dialog is cancelled.
    """
    fileOpen = Tk()
    fileOpen.withdraw()  # hiding tkinter window

    file_path = tkFileDialog.askopenfilename(
        title="Open file", filetypes=[("txt file", ".txt")])

    # A cancelled dialog yields an empty value; there is nothing to open.
    if not file_path:
        print("you didn't open anything!")
        return
    print("you chose file with path: " + file_path)

    seen = set()
    # `with` closes both files even if reading or writing fails part-way.
    with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
        for line in fin:
            domain = _base_domain(line)
            # Skip blank lines and domains that were already written.
            if domain and domain not in seen:
                seen.add(domain)
                fout.write(domain + "\n")


if __name__ == '__main__':
    main()

感谢任何帮助。我仔细检查了这些帖子并尝试了所有关于我的问题的建议，但没有找到一个有用的建议。

Answer 1

您可以使用正则表达式查找基本域。

如果您的文件中每行有一个网址：

import re

def main():
    """Print the set of unique base domains found in ``url.txt``.

    ``url.txt`` is read from the current working directory and is expected to
    hold one URL per line.  A leading scheme (``http://``, ``https://`` or a
    defanged ``hxxp://`` / ``hxxps://``) and a leading ``www.`` are dropped,
    as is everything from the first ``/`` after the host.  Blank lines are
    ignored.
    """
    # The scheme and "www." are matched OUTSIDE the named group so that
    # "www.example.com" and "example.com" collapse into a single entry.
    matcher = re.compile(r"(h..ps?://)?(www\.)?(?P<domain>[^/]*)")

    domains = set()
    with open("url.txt", 'r') as url_file:
        for line in url_file:
            # make here any replace you need with obfuscated urls like:
            # line = line.replace('[.]', '.')
            # Every part of the pattern is optional, so match() always
            # succeeds; an empty domain means the line held no URL.
            domain = matcher.match(line.strip()).group('domain')
            if domain:
                domains.add(domain)

    print(domains)


if __name__ == '__main__':
    main()

例如，使用此文件，它将打印：

set(['platinum-shakers.net', 'wmi.ns01.us', 'adservice.no-ip.org', 'samczeruno.pl', 'java.ns1.name', 'microsoft.dhcp.biz', 'ids.us01.us', 'devsite.quostar.com', 'orlandmart.com'])

Answer 2

也许你可以使用正则表达式：

import re

# Captures whatever follows ".com/" on a line, e.g. 'example1' or 'example2'.
p = re.compile(r".*\.com/(.*)")

# `with` closes both files automatically once the block finishes.
with open(file_path) as fin, open("URL Cleaned.txt", "wt") as fout:
    bases = set()
    for line in fin:
        # strip() removes the newline and stray spaces so that
        # "example4 " and "example4" count as the same entry.
        match = p.search(line.strip())
        # Blank lines and lines without ".com/" produce no match; skip them
        # instead of failing on a None result.
        if match and match.group(1):
            bases.add(match.group(1))
    # Sets are unordered: sort for stable output, one entry per line.
    for b in sorted(bases):
        fout.write(b + "\n")

使用 with open(..) 时，文件会在代码块执行完毕后自动关闭。

输出：

使用带有以下内容的文本文件

www.example.com/example1
www.example.com/example2
# blank lines are accounted for
www.example.com/example3

www.example.com/example4 
www.example.com/example4 # as are duplicates

以这些行作为输入，我得到的输出是：

example1
example2
example3
example4

更改项目后删除重复项

2 个答案: