我试图使用正则表达式从一个大文本文件中查找并打印所有 .com 网址。由于大约有 40 个不同的网址,我想知道是否有办法一次性搜索出所有网址,而不是一个接一个地搜索。
我使用的代码可以获取 xxxx.com,但缺少开头的 https://www. 部分。谁能告诉我如何获得完整的结果?提前谢谢!
import re

# Matches a whole ".com" address: an optional http/https scheme followed by
# the host name (which may include "www." and sub-domains). The dot before
# "com" is escaped so it only matches a literal ".", and no bogus integer is
# passed as the flags argument. The compiled pattern gets its own name so
# the `re` module is not shadowed.
url_pattern = re.compile(r'(?:https?://)?[\w.-]+\.com\b')

# Scan line by line so the whole report never has to sit in memory; the
# `with` block closes the file when the loop finishes.
with open("report.txt") as report:
    for line in report:
        for url in url_pattern.findall(line):
            print(url)
答案 0 :(得分:0)
这似乎有效:
#!/usr/local/cpython-2.7/bin/python
import re
def main():
    """Print every http(s) URL ending in ".com" found in ``logs.txt``.

    Reads ``logs.txt`` from the current working directory and prints each
    match on its own line, in order of appearance.
    """
    # `\S*` cannot cross whitespace, so a match never spans a line break,
    # and `\.` only matches a literal dot (an unescaped `.` would accept
    # e.g. "http://telecom"). No flags are needed for this pattern.
    regex = re.compile(r'https?://\S*\.com\b')
    with open('logs.txt', 'r') as file_:
        # Matches are confined to a single line, so scanning line by line
        # gives the same result as scanning the whole text at once.
        for line in file_:
            for url in regex.findall(line):
                print(url)
main()
HTH
答案 1 :(得分:0)
#!/usr/bin/python
import urllib
import urlparse
import re
import requests
#
# A class for dealing with links
#
class linkGrabber:
    """Extract the ``href`` values of ``<a>`` tags and classify them.

    A link counts as absolute when ``urlparse.urlparse`` reports a
    non-empty network location (``netloc``) for it, and as relative
    otherwise.
    """

    # Captures the quoted href value of an anchor tag. Only tags where
    # ``href`` is the first attribute after ``<a`` are matched. The opening
    # quote class is ['"] only: a "|" inside a character class is a literal
    # pipe, not an alternation, and must not count as a quote.
    linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

    def clean(self, link):
        """Return *link* with every space character and ``#`` removed."""
        return link.replace(' ', '').replace('#', '')

    def depth(self, link):
        """Return the number of ``/`` separators in the path of *link*.

        ``http://host/a/b`` has depth 2; a URL with an empty path has
        depth 0.
        """
        return len(urlparse.urlparse(link).path.split("/")) - 1

    def isAbsolute(self, link):
        """Return True when *link* has a network location (host part)."""
        return len(urlparse.urlparse(link).netloc) > 0

    def isRelative(self, link):
        """Return True when *link* has no network location."""
        return not self.isAbsolute(link)

    def grab(self, markup, *args):
        """Return the links found in *markup*.

        With no extra arguments, all links are returned with the relative
        ones first, followed by the absolute ones. If ``"abs"`` is among
        the extra arguments only absolute links are returned; with any
        other extra argument only relative links are returned.
        """
        links = self.linkregex.findall(markup)
        absolute = [link for link in links if self.isAbsolute(link)]
        relative = [link for link in links if not self.isAbsolute(link)]
        if not args:
            return relative + absolute
        if "abs" in args:
            return absolute
        return relative