从大文本文件中打印多个URL

时间:2014-04-16 23:37:08

标签: python regex url

我试图使用正则表达式从一个大文本文件中查找并打印所有 .com 网址。由于大约有 40 个不同的网址,我想知道是否有办法一次性把它们全部搜索出来,而不是逐个进行搜索。

我使用的代码可以获取 xxxx.com,但缺少开头的 https://www. 部分。谁能告诉我如何获得完整的结果?提前谢谢!

# Asker's original attempt (Python 2 syntax): print every match of a
# ".com" pattern found on each line of report.txt.
import re 
# len(".com") evaluates to 4. This integer is passed below as the *flags*
# argument of re.compile; it is not part of the pattern.
url = len(".com") 
# NOTE: this rebinds the name `re` from the module to the compiled pattern.
# \w* matches only word characters, so a match cannot reach back across
# "://" or "." -- which is why the scheme and "www." are missing from the
# output. The "." before "com" is unescaped and matches any character.
re = re.compile(r'\w*.com\b', url) 
# The file object is never explicitly closed.
for line in open("report.txt"): 
    # `re` is the compiled pattern here; `url` is reused as the loop variable.
    for url in re.findall(line): 
        print url

2 个答案:

答案 0 :(得分:0)

这似乎有效:

#!/usr/local/cpython-2.7/bin/python

import re

def main(path='logs.txt'):
    """Print every http(s) URL ending in ``.com`` found in a text file.

    The file is scanned line by line, so large files are never loaded into
    memory in one piece. Each match is printed on its own line, in the order
    it appears in the file.

    A match starts at ``http://`` or ``https://`` and runs to the last
    ``.com`` in the same whitespace-delimited token, so anything after that
    ``.com`` (for example a trailing path) is not included.

    Args:
        path: Name of the text file to scan. Defaults to ``'logs.txt'``.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # The dot is escaped so that only a literal ".com" matches, and \S* stops
    # at any whitespace so a match can never run across a line break.
    regex = re.compile(r'https?://\S*\.com\b')

    with open(path, 'r') as file_:
        for line in file_:
            for url in regex.findall(line):
                print(url)


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == '__main__':
    main()

HTH

答案 1 :(得分:0)

#!/usr/bin/python

import urllib
import urlparse
import re
import requests

#
# A class for dealing with links 
#

class linkGrabber:
  """Extract the ``href`` targets of ``<a>`` tags and classify them.

  A link counts as absolute when ``urlparse`` finds a network location
  (host) in it, and as relative otherwise.
  """

  # Captures the quoted href value of an anchor tag whose href attribute
  # directly follows the tag name. Only quote characters are accepted as
  # the opening delimiter.
  linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

  def clean(self, link):
    """Return *link* with every space and ``#`` character removed."""
    return link.replace(' ', '').replace('#', '')

  def depth(self, link):
    """Return the number of ``/`` separators in the path part of *link*."""
    return urlparse.urlparse(link).path.count("/")

  def isAbsolute(self, link):
    """Return True if *link* contains a network location (host)."""
    return bool(urlparse.urlparse(link).netloc)

  def isRelative(self, link):
    """Return True if *link* has no network location (host)."""
    return not self.isAbsolute(link)

  def grab(self, markup, *args):
    """Return the href values found in *markup*.

    Args:
      markup: Text to scan for anchor tags.
      *args: Optional selector. With no extra arguments, all links are
        returned, relative ones first and then absolute ones. If ``"abs"``
        is among the arguments only absolute links are returned; any other
        argument selects only relative links.

    Returns:
      A list of href strings, each group in order of appearance.
    """
    links = self.linkregex.findall(markup)
    absolute = [link for link in links if self.isAbsolute(link)]
    relative = [link for link in links if not self.isAbsolute(link)]
    if not args:
      return relative + absolute
    if "abs" in args:
      return absolute
    return relative